Upload 8 files
- app.py +180 -0
- dockerfile +53 -0
- pdf_excel.py +737 -0
- pdf_html.py +636 -0
- pdf_json.py +513 -0
- pdf_word.py +559 -0
- requirements.txt +12 -0
- static/index.html +896 -0
app.py
ADDED
@@ -0,0 +1,180 @@
from flask import Flask, request, jsonify, send_file, send_from_directory
from flask_cors import CORS
from werkzeug.utils import secure_filename
import os
import traceback
from pdf_html import PDFToHTMLConverter
from pdf_word import PDFToWordConverter
from pdf_json import PDFToJSONConverter
from pdf_excel import PDFToExcelConverter

app = Flask(__name__, static_folder='static')
CORS(app)

# Configure file size limits and folders
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024  # 100 MB limit
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['OUTPUT_FOLDER'] = 'outputs'
app.config['SECRET_KEY'] = 'your-secret-key-here'  # IMPORTANT: Change this in production!

# Create necessary directories if they don't exist
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)

# Placeholder for Hugging Face API Token
HF_TOKEN = "Api_token"  # Replace with your actual token

# Define allowed file extensions for uploads
ALLOWED_EXTENSIONS = {'pdf'}

def allowed_file(filename):
    """Checks if the uploaded file has an allowed extension."""
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

@app.route('/')
def serve_index():
    """Serves the main index.html file."""
    return send_from_directory('static', 'index.html')

@app.route('/<path:filename>')
def serve_static(filename):
    """Serves other static files (CSS, JS, etc.)."""
    return send_from_directory('static', filename)

@app.route('/convert', methods=['POST'])
def convert_pdf():
    """
    Handles PDF conversion requests.
    Expects a 'file' (PDF), 'format' (html, word, json, excel), and 'output_name'.
    """
    try:
        # Check if a file was included in the request
        if 'file' not in request.files:
            return jsonify({'success': False, 'error': 'No file uploaded.'}), 400

        file = request.files['file']
        format_type = request.form.get('format')
        output_name = request.form.get('output_name', 'converted_file')

        # Validate file and format
        if file.filename == '':
            return jsonify({'success': False, 'error': 'No file selected.'}), 400

        if not format_type or format_type not in ['html', 'word', 'json', 'excel']:
            return jsonify({'success': False, 'error': 'Invalid format specified. Must be html, word, json, or excel.'}), 400

        if not allowed_file(file.filename):
            return jsonify({'success': False, 'error': 'Only PDF files are allowed.'}), 400

        # Securely save the uploaded file
        filename_secured = secure_filename(file.filename)
        input_path = os.path.join(app.config['UPLOAD_FOLDER'], filename_secured)
        file.save(input_path)

        # Define output file extensions based on format
        extensions = {
            'html': '.html',
            'word': '.docx',
            'json': '.json',
            'excel': '.xlsx'
        }
        output_filename = f"{output_name.replace('.', '')}{extensions.get(format_type, '.out')}"
        output_path = os.path.join(app.config['OUTPUT_FOLDER'], output_filename)

        success_message = ""

        try:
            # Perform conversion based on the requested format
            if format_type == 'html':
                converter = PDFToHTMLConverter(huggingface_token=HF_TOKEN)
                try:
                    # First try with HF models
                    converter.process_pdf(pdf_path=input_path, output_path=output_path, use_hf_models=True)
                except AttributeError as ae:
                    if '_group_overlapping_text' in str(ae):
                        # Fall back to non-HF mode if the method is missing
                        converter.process_pdf(pdf_path=input_path, output_path=output_path, use_hf_models=False)
                    else:
                        raise
                success_message = "Successfully converted to HTML!"
            elif format_type == 'word':
                converter = PDFToWordConverter(huggingface_token=HF_TOKEN)
                converter.process_pdf_to_word(pdf_path=input_path, output_path=output_path, use_hf_models=False)
                success_message = "Successfully converted to Word!"
            elif format_type == 'json':
                converter = PDFToJSONConverter(huggingface_token=HF_TOKEN)
                converter.process_pdf_to_json(pdf_path=input_path, output_path=output_path, use_hf_models=False)
                success_message = "Successfully converted to JSON!"
            elif format_type == 'excel':
                # PDFToExcelConverter.__init__ takes no token, and its
                # process_pdf_to_excel has no use_hf_models flag (see pdf_excel.py)
                converter = PDFToExcelConverter()
                converter.process_pdf_to_excel(pdf_path=input_path, output_path=output_path)
                success_message = "Successfully converted to Excel!"
        except Exception as conv_e:
            # Clean up the output file if conversion failed
            if os.path.exists(output_path):
                try:
                    os.remove(output_path)
                except Exception as e:
                    print(f"Warning: Could not remove output file {output_path}: {e}")
            raise conv_e

        # Clean up the uploaded input file
        try:
            os.remove(input_path)
        except Exception as e:
            print(f"Warning: Could not remove input file {input_path}: {e}")

        # Return success response with download URL
        return jsonify({
            'success': True,
            'message': success_message,
            'download_url': f'/download/{output_filename}'
        }), 200

    except Exception as e:
        # Clean up input file in case of error
        if 'input_path' in locals() and os.path.exists(input_path):
            try:
                os.remove(input_path)
            except Exception as cleanup_e:
                print(f"Error during error cleanup for {input_path}: {cleanup_e}")

        traceback.print_exc()
        error_msg = str(e)
        if '_group_overlapping_text' in error_msg:
            error_msg = "HTML conversion failed due to incompatible converter version. Please try another format."
        return jsonify({
            'success': False,
            'error': f'Conversion failed: {error_msg}'
        }), 500

@app.route('/download/<filename>')
def download_file(filename):
    """Allows downloading of converted files."""
    try:
        file_path = os.path.join(app.config['OUTPUT_FOLDER'], filename)
        if os.path.exists(file_path):
            return send_from_directory(app.config['OUTPUT_FOLDER'], filename, as_attachment=True)
        return jsonify({'error': 'File not found.'}), 404
    except Exception as e:
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500

@app.route('/health')
def health_check():
    """Simple health check endpoint."""
    return jsonify({'status': 'healthy', 'message': 'PDF Converter API is running.'}), 200

@app.errorhandler(413)
def too_large(e):
    """Handles file too large errors."""
    return jsonify({'success': False, 'error': 'File too large. Maximum size is 100MB.'}), 413

@app.errorhandler(500)
def internal_error(e):
    """Handles general internal server errors."""
    traceback.print_exc()
    return jsonify({'success': False, 'error': 'Internal server error occurred.'}), 500

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=5000)
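For reference, a minimal client sketch for the /convert endpoint above, assuming the dev server started by `python app.py` on port 5000 (the Docker image below serves on 7860 instead). The form fields ('file', 'format', 'output_name') and the 'download_url' key in the JSON response come from the handler; the file names used here are placeholders:

import requests

BASE = "http://localhost:5000"  # assumption: local run via `python app.py`; use 7860 for the Docker image

# Upload a PDF and request Excel output
with open("sample.pdf", "rb") as f:  # hypothetical input file
    resp = requests.post(
        f"{BASE}/convert",
        files={"file": ("sample.pdf", f, "application/pdf")},
        data={"format": "excel", "output_name": "sample_converted"},
    )

payload = resp.json()
if payload.get("success"):
    # The handler responds with a relative URL like /download/<filename>
    download = requests.get(BASE + payload["download_url"])
    with open("sample_converted.xlsx", "wb") as out:
        out.write(download.content)
else:
    print("Conversion failed:", payload.get("error"))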
dockerfile
ADDED
@@ -0,0 +1,53 @@
# Use a minimal Python base image
FROM python:3.10-slim

# Install system dependencies including libcrypt and additional libraries for PyMuPDF
RUN apt-get update && \
    apt-get install -y \
    libcrypt1 \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgomp1 \
    poppler-utils \
    build-essential \
    libfontconfig1 \
    libxrender1 \
    libxtst6 \
    libxi6 \
    libfreetype6-dev \
    libjpeg-dev \
    libopenjp2-7-dev \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy requirements first for better caching
COPY requirements.txt .

# Upgrade pip before installing Python dependencies
RUN pip install --upgrade pip

# Install PyMuPDF first to check for issues early
RUN pip install --no-cache-dir PyMuPDF==1.23.0

# Test PyMuPDF import
RUN python -c "import fitz; print('PyMuPDF imported successfully')"

# Install remaining dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Verify python-docx installation
RUN python -c "from docx import Document; print('python-docx installed successfully')"

# Copy source code to container
COPY . .

# Expose the port the app will run on (important for Hugging Face)
EXPOSE 7860

# Run the Flask app under gunicorn
CMD ["gunicorn", "--bind", "0.0.0.0:7860", "app:app"]
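To try the image outside of Spaces, the usual `docker build -t pdf-converter .` followed by `docker run -p 7860:7860 pdf-converter` should serve the app on http://localhost:7860; on Hugging Face Spaces the Docker runtime builds and starts the container automatically. Note that the CMD line assumes gunicorn is listed in requirements.txt (not shown in this section).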
pdf_excel.py
ADDED
@@ -0,0 +1,737 @@
import os
import pandas as pd
import fitz  # PyMuPDF
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
from dataclasses import dataclass
from typing import List, Dict, Any, Tuple, Optional
import re
from pathlib import Path
import logging
from datetime import datetime
import numpy as np

# Optional imports with graceful fallback
try:
    import camelot  # For advanced table extraction
    CAMELOT_AVAILABLE = True
except ImportError:
    CAMELOT_AVAILABLE = False
    print("⚠️ Camelot not installed. Run: pip install camelot-py[cv]")

try:
    import tabula  # Alternative table extraction
    TABULA_AVAILABLE = True
except ImportError:
    TABULA_AVAILABLE = False
    print("⚠️ Tabula not installed. Run: pip install tabula-py")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

@dataclass
class TextBlock:
    text: str
    x: float
    y: float
    width: float
    height: float
    font_size: float
    font_name: str
    is_bold: bool = False
    is_italic: bool = False
    page_num: int = 1
    block_id: str = ""

@dataclass
class TableData:
    data: List[List[str]]
    bbox: Tuple[float, float, float, float]
    page_num: int
    confidence: float = 0.0
    has_header: bool = True

class PDFToExcelConverter:
    """
    Enhanced PDF to Excel converter with multiple extraction methods
    for better accuracy and handling of complex documents.
    """

    def __init__(self):
        # Check available extraction methods
        available_methods = ['pymupdf']  # Always available
        if CAMELOT_AVAILABLE:
            available_methods.append('camelot')
        if TABULA_AVAILABLE:
            available_methods.append('tabula')

        self.extraction_methods = available_methods
        self.output_formats = {
            'separate_sheets': 'Each table and text section on separate sheets',
            'combined': 'All content combined logically',
            'structured': 'Maintain document structure with proper formatting'
        }

        # Log available methods
        logger.info(f"Available extraction methods: {', '.join(available_methods)}")

    def extract_text_blocks_advanced(self, page, page_num: int) -> List[TextBlock]:
        """
        Advanced text extraction with better formatting detection
        """
        text_blocks = []

        try:
            # Method 1: Dictionary-based extraction (most detailed)
            page_dict = page.get_text("dict")

            for block_idx, block in enumerate(page_dict.get("blocks", [])):
                if block.get("type", 1) != 0:  # Skip non-text blocks
                    continue

                for line_idx, line in enumerate(block.get("lines", [])):
                    for span_idx, span in enumerate(line.get("spans", [])):
                        text_content = span.get("text", "").strip()
                        if not text_content:
                            continue

                        bbox = span["bbox"]
                        flags = span.get("flags", 0)

                        # Enhanced font detection
                        font_name = span.get("font", "Arial")
                        font_size = span.get("size", 12)
                        is_bold = bool(flags & 16) or "bold" in font_name.lower()
                        is_italic = bool(flags & 2) or "italic" in font_name.lower()

                        text_block = TextBlock(
                            text=text_content,
                            x=bbox[0], y=bbox[1],
                            width=bbox[2] - bbox[0],
                            height=bbox[3] - bbox[1],
                            font_size=font_size,
                            font_name=font_name,
                            is_bold=is_bold,
                            is_italic=is_italic,
                            page_num=page_num,
                            block_id=f"p{page_num}_b{block_idx}_l{line_idx}_s{span_idx}"
                        )
                        text_blocks.append(text_block)

        except Exception as e:
            logger.warning(f"Advanced text extraction failed for page {page_num}: {e}")
            # Fallback to simple extraction
            text_blocks = self._extract_text_simple_fallback(page, page_num)

        return text_blocks

    def _extract_text_simple_fallback(self, page, page_num: int) -> List[TextBlock]:
        """
        Fallback text extraction method
        """
        text_blocks = []
        try:
            text = page.get_text()
            if text.strip():
                # Create a single text block for the entire page content
                rect = page.rect
                text_block = TextBlock(
                    text=text.strip(),
                    x=0, y=0,
                    width=rect.width,
                    height=rect.height,
                    font_size=12,
                    font_name="Arial",
                    page_num=page_num,
                    block_id=f"p{page_num}_fallback"
                )
                text_blocks.append(text_block)
        except Exception as e:
            logger.error(f"Fallback text extraction failed for page {page_num}: {e}")

        return text_blocks

    def extract_tables_multiple_methods(self, pdf_path: str, page_num: int) -> List[TableData]:
        """
        Extract tables using multiple methods and combine results
        """
        all_tables = []

        # Method 1: PyMuPDF built-in table detection
        tables_pymupdf = self._extract_tables_pymupdf(pdf_path, page_num)
        all_tables.extend(tables_pymupdf)

        # Method 2: Camelot (if available)
        if CAMELOT_AVAILABLE:
            try:
                tables_camelot = self._extract_tables_camelot(pdf_path, page_num)
                all_tables.extend(tables_camelot)
            except Exception as e:
                logger.warning(f"Camelot extraction failed: {e}")

        # Method 3: Tabula (if available)
        if TABULA_AVAILABLE:
            try:
                tables_tabula = self._extract_tables_tabula(pdf_path, page_num)
                all_tables.extend(tables_tabula)
            except Exception as e:
                logger.warning(f"Tabula extraction failed: {e}")

        # Remove duplicates and return best tables
        return self._deduplicate_tables(all_tables)

    def _extract_tables_pymupdf(self, pdf_path: str, page_num: int) -> List[TableData]:
        """
        Extract tables using PyMuPDF
        """
        tables = []
        try:
            doc = fitz.open(pdf_path)
            page = doc[page_num - 1]  # Convert to 0-based index

            detected_tables = page.find_tables()
            for i, table in enumerate(detected_tables):
                try:
                    table_data = table.extract()
                    if table_data and len(table_data) > 0:
                        # Clean the table data
                        cleaned_data = []
                        for row in table_data:
                            cleaned_row = []
                            for cell in row:
                                cell_text = str(cell).strip() if cell else ""
                                cleaned_row.append(cell_text)
                            if any(cleaned_row):  # Only add non-empty rows
                                cleaned_data.append(cleaned_row)

                        if cleaned_data:
                            tables.append(TableData(
                                data=cleaned_data,
                                bbox=table.bbox,
                                page_num=page_num,
                                confidence=0.8,  # PyMuPDF generally reliable
                                has_header=True
                            ))
                except Exception as e:
                    logger.warning(f"Error extracting PyMuPDF table {i}: {e}")

            doc.close()
        except Exception as e:
            logger.error(f"PyMuPDF table extraction failed: {e}")

        return tables

    def _extract_tables_camelot(self, pdf_path: str, page_num: int) -> List[TableData]:
        """
        Extract tables using Camelot (only if available)
        """
        if not CAMELOT_AVAILABLE:
            return []

        tables = []
        try:
            # Camelot works with page numbers (1-based)
            camelot_tables = camelot.read_pdf(pdf_path, pages=str(page_num), flavor='lattice')

            for i, table in enumerate(camelot_tables):
                df = table.df
                if not df.empty:
                    # Convert DataFrame to list of lists
                    table_data = df.values.tolist()
                    # Add headers if they exist
                    if not df.columns.empty:
                        headers = df.columns.tolist()
                        table_data.insert(0, headers)

                    tables.append(TableData(
                        data=table_data,
                        bbox=(0, 0, 100, 100),  # Camelot doesn't provide bbox
                        page_num=page_num,
                        confidence=table.accuracy / 100.0 if hasattr(table, 'accuracy') else 0.7,
                        has_header=True
                    ))

        except Exception as e:
            logger.warning(f"Camelot extraction failed: {e}")

        return tables

    def _extract_tables_tabula(self, pdf_path: str, page_num: int) -> List[TableData]:
        """
        Extract tables using Tabula (only if available)
        """
        if not TABULA_AVAILABLE:
            return []

        tables = []
        try:
            # Tabula works with page numbers (1-based)
            tabula_tables = tabula.read_pdf(pdf_path, pages=page_num, multiple_tables=True)

            for i, df in enumerate(tabula_tables):
                if not df.empty:
                    # Convert DataFrame to list of lists
                    table_data = df.fillna('').values.tolist()
                    # Add headers
                    headers = df.columns.tolist()
                    table_data.insert(0, headers)

                    tables.append(TableData(
                        data=table_data,
                        bbox=(0, 0, 100, 100),  # Tabula doesn't provide bbox
                        page_num=page_num,
                        confidence=0.7,
                        has_header=True
                    ))

        except Exception as e:
            logger.warning(f"Tabula extraction failed: {e}")

        return tables

    def _deduplicate_tables(self, tables: List[TableData]) -> List[TableData]:
        """
        Remove duplicate tables by comparing content
        """
        if not tables:
            return tables

        unique_tables = []
        for table in tables:
            is_duplicate = False
            for existing_table in unique_tables:
                if self._tables_are_similar(table, existing_table):
                    # Keep the one with higher confidence
                    if table.confidence > existing_table.confidence:
                        unique_tables.remove(existing_table)
                        unique_tables.append(table)
                    is_duplicate = True
                    break

            if not is_duplicate:
                unique_tables.append(table)

        return unique_tables

    def _tables_are_similar(self, table1: TableData, table2: TableData, threshold: float = 0.8) -> bool:
        """
        Check if two tables are similar (likely duplicates)
        """
        if len(table1.data) != len(table2.data):
            return False

        if not table1.data or not table2.data:
            return False

        # Compare dimensions
        if len(table1.data[0]) != len(table2.data[0]):
            return False

        # Compare content similarity
        matching_cells = 0
        total_cells = len(table1.data) * len(table1.data[0])

        for i, (row1, row2) in enumerate(zip(table1.data, table2.data)):
            for j, (cell1, cell2) in enumerate(zip(row1, row2)):
                if str(cell1).strip().lower() == str(cell2).strip().lower():
                    matching_cells += 1

        similarity = matching_cells / total_cells if total_cells > 0 else 0
        return similarity >= threshold

    def process_pdf_to_excel(self, pdf_path: str, output_path: str, format_type: str = 'structured') -> str:
        """
        Convert PDF to Excel with enhanced processing
        """
        logger.info(f"Starting PDF to Excel conversion: {pdf_path}")

        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # Extract content from PDF
        pdf_content = self._extract_comprehensive_content(pdf_path)

        # Create Excel workbook
        output_path = self._create_excel_workbook(pdf_content, output_path, format_type)

        logger.info(f"Successfully converted PDF to Excel: {output_path}")
        return output_path

    def _extract_comprehensive_content(self, pdf_path: str) -> Dict[str, Any]:
        """
        Extract all content from PDF using multiple methods
        """
        content = {
            'pages': [],
            'total_pages': 0,
            'metadata': {}
        }

        try:
            doc = fitz.open(pdf_path)
            content['total_pages'] = doc.page_count
            content['metadata'] = doc.metadata

            logger.info(f"Processing {doc.page_count} pages...")

            for page_num in range(doc.page_count):
                page = doc[page_num]
                logger.info(f"Processing page {page_num + 1}/{doc.page_count}")

                # Extract text blocks
                text_blocks = self.extract_text_blocks_advanced(page, page_num + 1)

                # Extract tables using multiple methods
                tables = self.extract_tables_multiple_methods(pdf_path, page_num + 1)

                # Extract images (basic)
                images = self._extract_images_basic(page, page_num + 1)

                page_content = {
                    'page_number': page_num + 1,
                    'text_blocks': text_blocks,
                    'tables': tables,
                    'images': images,
                    'page_width': page.rect.width,
                    'page_height': page.rect.height
                }

                content['pages'].append(page_content)

            doc.close()

        except Exception as e:
            logger.error(f"Error extracting PDF content: {e}")
            raise

        return content

    def _extract_images_basic(self, page, page_num: int) -> List[Dict]:
        """
        Basic image extraction for reference
        """
        images = []
        try:
            image_list = page.get_images()
            for i, img in enumerate(image_list):
                images.append({
                    'index': i,
                    'page': page_num,
                    'bbox': img  # Simplified
                })
        except Exception as e:
            logger.warning(f"Image extraction failed for page {page_num}: {e}")

        return images

    def _create_excel_workbook(self, content: Dict[str, Any], output_path: str, format_type: str) -> str:
        """
        Create Excel workbook with proper formatting
        """
        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:

            if format_type == 'structured':
                self._create_structured_workbook(content, writer)
            elif format_type == 'combined':
                self._create_combined_workbook(content, writer)
            else:  # separate_sheets
                self._create_separate_sheets_workbook(content, writer)

            # Add summary sheet
            self._add_summary_sheet(content, writer)

        # Apply formatting (after the writer has closed and saved the file)
        self._apply_excel_formatting(output_path)

        return output_path

    def _create_structured_workbook(self, content: Dict[str, Any], writer):
        """
        Create structured workbook maintaining document flow
        """
        for page_data in content['pages']:
            page_num = page_data['page_number']

            # Process tables first
            table_count = 0
            for table in page_data['tables']:
                if table.data:
                    df = pd.DataFrame(table.data[1:], columns=table.data[0] if table.has_header else None)
                    sheet_name = f"P{page_num}_Table{table_count + 1}"[:31]
                    df.to_excel(writer, sheet_name=sheet_name, index=False)
                    table_count += 1

            # Process text content
            if page_data['text_blocks']:
                # Group text blocks by proximity and formatting
                text_groups = self._group_text_blocks(page_data['text_blocks'])

                for i, group in enumerate(text_groups):
                    if group['content'].strip():
                        text_df = pd.DataFrame([{
                            'Content': group['content'],
                            'Font_Size': group.get('font_size', 12),
                            'Is_Bold': group.get('is_bold', False),
                            'Position_X': group.get('x', 0),
                            'Position_Y': group.get('y', 0)
                        }])
                        sheet_name = f"P{page_num}_Text{i + 1}"[:31]
                        text_df.to_excel(writer, sheet_name=sheet_name, index=False)

    def _create_combined_workbook(self, content: Dict[str, Any], writer):
        """
        Create combined workbook with all tables and text together
        """
        all_tables = []
        all_text = []

        for page_data in content['pages']:
            page_num = page_data['page_number']

            # Collect all tables
            for i, table in enumerate(page_data['tables']):
                if table.data:
                    df = pd.DataFrame(table.data[1:], columns=table.data[0] if table.has_header else None)
                    df['Source_Page'] = page_num
                    df['Table_Index'] = i + 1
                    all_tables.append(df)

            # Collect all text
            text_content = '\n'.join([block.text for block in page_data['text_blocks']])
            if text_content.strip():
                all_text.append({
                    'Page': page_num,
                    'Content': text_content.strip()
                })

        # Write combined tables
        if all_tables:
            combined_tables = pd.concat(all_tables, ignore_index=True)
            combined_tables.to_excel(writer, sheet_name='All_Tables', index=False)

        # Write combined text
        if all_text:
            text_df = pd.DataFrame(all_text)
            text_df.to_excel(writer, sheet_name='All_Text', index=False)

    def _create_separate_sheets_workbook(self, content: Dict[str, Any], writer):
        """
        Create workbook with each element on separate sheets
        """
        table_counter = 1
        text_counter = 1

        for page_data in content['pages']:
            page_num = page_data['page_number']

            # Each table gets its own sheet
            for table in page_data['tables']:
                if table.data:
                    df = pd.DataFrame(table.data[1:], columns=table.data[0] if table.has_header else None)
                    sheet_name = f"Table_{table_counter}"[:31]
                    df.to_excel(writer, sheet_name=sheet_name, index=False)
                    table_counter += 1

            # Page text gets its own sheet
            if page_data['text_blocks']:
                text_content = '\n'.join([block.text for block in page_data['text_blocks']])
                if text_content.strip():
                    text_df = pd.DataFrame([{'Page': page_num, 'Content': text_content}])
                    sheet_name = f"Text_{text_counter}"[:31]
                    text_df.to_excel(writer, sheet_name=sheet_name, index=False)
                    text_counter += 1

    def _group_text_blocks(self, text_blocks: List[TextBlock]) -> List[Dict]:
        """
        Group text blocks by proximity and formatting
        """
        if not text_blocks:
            return []

        # Sort by position (top to bottom, left to right)
        sorted_blocks = sorted(text_blocks, key=lambda b: (b.y, b.x))

        groups = []
        current_group = {
            'content': '',
            'font_size': sorted_blocks[0].font_size,
            'is_bold': sorted_blocks[0].is_bold,
            'x': sorted_blocks[0].x,
            'y': sorted_blocks[0].y
        }

        for block in sorted_blocks:
            # Check if block should be in current group (similar formatting and position)
            if (abs(current_group['font_size'] - block.font_size) < 2 and
                    current_group['is_bold'] == block.is_bold):
                current_group['content'] += ' ' + block.text
            else:
                # Start new group
                if current_group['content'].strip():
                    groups.append(current_group)
                current_group = {
                    'content': block.text,
                    'font_size': block.font_size,
                    'is_bold': block.is_bold,
                    'x': block.x,
                    'y': block.y
                }

        # Add last group
        if current_group['content'].strip():
            groups.append(current_group)

        return groups

    def _add_summary_sheet(self, content: Dict[str, Any], writer):
        """
        Add summary sheet with document statistics
        """
        total_tables = sum(len(page['tables']) for page in content['pages'])
        total_text_blocks = sum(len(page['text_blocks']) for page in content['pages'])

        summary_data = {
            'Statistic': [
                'Total Pages',
                'Total Tables',
                'Total Text Blocks',
                'Processing Date',
                'Document Title'
            ],
            'Value': [
                content['total_pages'],
                total_tables,
                total_text_blocks,
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                content['metadata'].get('title', 'Unknown')
            ]
        }

        summary_df = pd.DataFrame(summary_data)
        summary_df.to_excel(writer, sheet_name='Summary', index=False)

    def _apply_excel_formatting(self, file_path: str):
        """
        Apply formatting to the Excel file
        """
        try:
            wb = openpyxl.load_workbook(file_path)

            # Define styles
            header_font = Font(bold=True, color="FFFFFF")
            header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
            border = Border(
                left=Side(style='thin'),
                right=Side(style='thin'),
                top=Side(style='thin'),
                bottom=Side(style='thin')
            )

            for sheet_name in wb.sheetnames:
                ws = wb[sheet_name]

                # Format headers
                if ws.max_row > 0:
                    for cell in ws[1]:
                        cell.font = header_font
                        cell.fill = header_fill
                        cell.alignment = Alignment(horizontal='center', vertical='center')
                        cell.border = border

                # Auto-adjust column widths
                for column in ws.columns:
                    max_length = 0
                    column_letter = column[0].column_letter

                    for cell in column:
                        try:
                            if len(str(cell.value)) > max_length:
                                max_length = len(str(cell.value))
                        except Exception:
                            pass

                    adjusted_width = min(max_length + 2, 50)
                    ws.column_dimensions[column_letter].width = adjusted_width

            wb.save(file_path)

        except Exception as e:
            logger.warning(f"Could not apply formatting: {e}")

# Usage example and main function
def install_dependencies():
    """
    Print installation instructions for missing dependencies
    """
    print("📦 INSTALLATION INSTRUCTIONS:")
    print("=" * 50)

    required_packages = [
        ("PyMuPDF", "pip install PyMuPDF", True),
        ("pandas", "pip install pandas", True),
        ("openpyxl", "pip install openpyxl", True),
        ("numpy", "pip install numpy", True),
        ("camelot-py", "pip install camelot-py[cv]", CAMELOT_AVAILABLE),
        ("tabula-py", "pip install tabula-py", TABULA_AVAILABLE)
    ]

    print("\n✅ CORE PACKAGES (Required):")
    for name, cmd, available in required_packages[:4]:
        status = "✅ Installed" if available else "❌ Missing"
        print(f"  {name}: {status}")
        if not available:
            print(f"    Install: {cmd}")

    print("\n🔧 OPTIONAL PACKAGES (For better table extraction):")
    for name, cmd, available in required_packages[4:]:
        status = "✅ Installed" if available else "❌ Missing"
        print(f"  {name}: {status}")
        if not available:
            print(f"    Install: {cmd}")

    print("\n💡 INSTALL ALL AT ONCE:")
    print("pip install PyMuPDF pandas openpyxl numpy camelot-py[cv] tabula-py")
    print("\n" + "=" * 50)

def main():
    """
    Main function to demonstrate usage
    """
    print("Enhanced PDF to Excel Converter")
    print("=" * 40)

    # Show installation status
    install_dependencies()

    converter = PDFToExcelConverter()

    # Example usage
    pdf_path = "input.pdf"  # Replace with your PDF path
    output_path = "output.xlsx"  # Replace with desired output path

    try:
        # Check if PDF file exists
        if not os.path.exists(pdf_path):
            print(f"\n❌ PDF file not found: {pdf_path}")
            print("Please update the 'pdf_path' variable with your actual PDF file path.")
            return

        print(f"\nConverting: {pdf_path}")
        result = converter.process_pdf_to_excel(
            pdf_path=pdf_path,
            output_path=output_path,
            format_type='structured'  # Options: 'structured', 'combined', 'separate_sheets'
        )
        print(f"✅ Conversion completed successfully: {result}")

    except Exception as e:
        print(f"❌ Conversion failed: {e}")
        print("\n🛠️ TROUBLESHOOTING:")
        print("1. Make sure all required packages are installed")
        print("2. Check that your PDF file exists and is readable")
        print("3. Ensure you have write permissions for the output directory")

if __name__ == "__main__":
    main()
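One caveat on the optional extractors above: camelot-py additionally needs Ghostscript (and OpenCV for the [cv] extra) at the system level, and tabula-py wraps the Java-based Tabula engine, so it requires a Java runtime. The dockerfile above installs neither. If the Python packages themselves are missing, the module degrades gracefully via the CAMELOT_AVAILABLE / TABULA_AVAILABLE flags; if the packages import but their system dependencies are absent, the per-method try/except blocks in extract_tables_multiple_methods catch the failure and PyMuPDF extraction is still used.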
pdf_html.py
ADDED
@@ -0,0 +1,636 @@
import os
import base64
import json
import requests
from typing import Dict, List, Any
import fitz  # PyMuPDF
from PIL import Image
import io
import re
from dataclasses import dataclass
from pathlib import Path
from datetime import datetime

@dataclass
class TextBlock:
    text: str
    x: float
    y: float
    width: float
    height: float
    font_size: float
    font_name: str
    is_bold: bool = False
    is_italic: bool = False
    block_id: str = ""

class PDFToHTMLConverter:
    def __init__(self, huggingface_token: str = None):
        self.hf_token = huggingface_token
        self.hf_headers = {
            "Authorization": f"Bearer {huggingface_token}" if huggingface_token else None
        }
        self.models = {
            "document_layout": "microsoft/layoutlm-base-uncased",
            "table_detection": "microsoft/table-transformer-detection",
            "ocr": "microsoft/trocr-base-printed",
            "math_detection": "facebook/detr-resnet-50"
        }
        self.hf_inference_url = "https://api-inference.huggingface.co/models"

    def pdf_to_base64(self, pdf_path: str) -> str:
        try:
            with open(pdf_path, "rb") as pdf_file:
                return base64.b64encode(pdf_file.read()).decode('utf-8')
        except Exception as e:
            raise Exception(f"Error converting PDF to base64: {str(e)}")

    def extract_pdf_content(self, pdf_path: str) -> Dict[str, Any]:
        doc = None
        try:
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")

            doc = fitz.open(pdf_path)

            if doc is None:
                raise RuntimeError("Failed to open PDF document")

            if doc.page_count == 0:
                raise ValueError("PDF document has no pages")

            print(f"PDF opened successfully: {doc.page_count} pages")

            pages_content = []

            for page_num in range(doc.page_count):
                try:
                    page = doc[page_num]
                    print(f"Processing page {page_num + 1}/{doc.page_count}")

                    text_blocks = []
                    try:
                        page_dict = page.get_text("dict")
                        text_blocks = self._extract_text_blocks_from_dict(page_dict, page_num)
                    except Exception as e:
                        print(f"⚠️ Dict method failed for page {page_num + 1}, falling back to simple text extraction: {e}")
                        text_blocks = self._extract_text_blocks_simple(page, page_num)

                    images = self._extract_images_safely(page, doc, page_num)
                    tables = self._detect_tables_safely(page)

                    page_rect = page.rect

                    pages_content.append({
                        "page_number": page_num + 1,
                        "text_blocks": text_blocks,
                        "images": images,
                        "tables": tables,
                        "page_width": page_rect.width,
                        "page_height": page_rect.height
                    })

                except Exception as e:
                    print(f"❌ Error processing page {page_num + 1}: {e}")
                    pages_content.append({
                        "page_number": page_num + 1,
                        "text_blocks": [],
                        "images": [],
                        "tables": [],
                        "page_width": 595,
                        "page_height": 842
                    })

            result = {
                "pages": pages_content,
                "total_pages": doc.page_count
            }
            return result

        except Exception as e:
            raise Exception(f"Error extracting PDF content: {str(e)}")
        finally:
            if doc is not None:
                try:
                    doc.close()
                    print("✅ PDF document closed successfully")
                except Exception as e:
                    print(f"⚠️ Error closing PDF document: {e}")

    def _extract_text_blocks_from_dict(self, page_dict: dict, page_num: int) -> List[TextBlock]:
        text_blocks = []

        for block_idx, block in enumerate(page_dict.get("blocks", [])):
            if "lines" not in block:
                continue

            for line_idx, line in enumerate(block["lines"]):
                for span_idx, span in enumerate(line["spans"]):
                    text_content = span.get("text", "").strip()
                    if text_content:
                        bbox = span["bbox"]
                        font_info = {
                            "size": span.get("size", 12),
                            "font": span.get("font", "Arial"),
                            "is_bold": "bold" in span.get("font", "").lower() or span.get("flags", 0) & 16,
                            "is_italic": "italic" in span.get("font", "").lower() or span.get("flags", 0) & 2
                        }

                        text_block = TextBlock(
                            text=text_content,
                            x=bbox[0],
                            y=bbox[1],
                            width=bbox[2] - bbox[0],
                            height=bbox[3] - bbox[1],
                            font_size=font_info["size"],
                            font_name=font_info["font"],
                            is_bold=font_info["is_bold"],
                            is_italic=font_info["is_italic"],
                            block_id=f"p{page_num}-b{block_idx}-l{line_idx}-s{span_idx}"
                        )
                        text_blocks.append(text_block)

        return text_blocks

    def _extract_text_blocks_simple(self, page, page_num: int) -> List[TextBlock]:
        text_blocks = []
        try:
            blocks_data = page.get_text("blocks")
            for block_idx, block in enumerate(blocks_data):
                if block[6] == 0:
                    text = block[4].strip()
                    if text:
                        x0, y0, x1, y1 = block[0], block[1], block[2], block[3]

                        lines = text.split('\n')
                        line_height = (y1 - y0) / max(len(lines), 1)

                        for line_idx, line in enumerate(lines):
                            if line.strip():
                                text_block = TextBlock(
                                    text=line.strip(),
                                    x=x0,
                                    y=y0 + (line_idx * line_height),
                                    width=x1 - x0,
                                    height=line_height,
                                    font_size=12,
                                    font_name="Arial",
                                    is_bold=False,
                                    is_italic=False,
                                    block_id=f"p{page_num}-simple-b{block_idx}-l{line_idx}"
                                )
                                text_blocks.append(text_block)
        except Exception as e:
            print(f"⚠️ Simple text block extraction failed: {e}")

        return text_blocks

    def _extract_images_safely(self, page, doc, page_num) -> List[Dict]:
        images = []
        try:
            image_list = page.get_images(full=True)
            for img_index, img_info in enumerate(image_list):
                try:
                    xref = img_info[0]

                    img_rects = [r for r in page.get_image_rects(xref)]
                    if not img_rects:
                        continue

                    bbox = img_rects[0]

                    pix = fitz.Pixmap(doc, xref)
                    if pix.n - pix.alpha < 4:
                        img_data = pix.tobytes("png")
                        img_base64 = base64.b64encode(img_data).decode()

                        images.append({
                            "index": img_index,
                            "data": img_base64,
                            "bbox": (bbox.x0, bbox.y0, bbox.x1, bbox.y1)
                        })
                    pix = None
                except Exception as e:
                    print(f"⚠️ Error extracting image {img_index} on page {page_num+1}: {e}")
                    continue
        except Exception as e:
            print(f"⚠️ General error in image extraction for page {page_num+1}: {e}")
        return images

    def _detect_tables_safely(self, page) -> List[Dict]:
        tables = []
        try:
            tabs = page.find_tables()
            for tab_index, tab in enumerate(tabs):
                try:
                    table_data = tab.extract()
                    if table_data:
                        cleaned_data = []
                        for row in table_data:
                            cleaned_row = [str(cell).strip() if cell else "" for cell in row]
                            if any(cleaned_row):
                                cleaned_data.append(cleaned_row)

                        if cleaned_data:
                            tables.append({
                                "bbox": (tab.bbox.x0, tab.bbox.y0, tab.bbox.x1, tab.bbox.y1),
                                "data": cleaned_data
                            })
                except Exception as e:
                    print(f"⚠️ Error extracting table {tab_index}: {e}")
                    continue
        except Exception as e:
            print(f"⚠️ General error in table detection: {e}")
        return tables

    def enhance_math_symbols(self, text: str) -> str:
        # Map math and Greek characters to their HTML entity equivalents
        math_replacements = {
            '±': '&plusmn;', '×': '&times;', '÷': '&divide;', '∑': '&sum;',
            '∏': '&prod;', '√': '&radic;', '∞': '&infin;', '∫': '&int;',
            '∂': '&part;', '∆': '&Delta;', '∇': '&nabla;', '∈': '&isin;',
            '∉': '&notin;', '⊂': '&sub;', '⊃': '&sup;', '⊆': '&sube;',
            '⊇': '&supe;', '∪': '&cup;', '∩': '&cap;', '≤': '&le;',
            '≥': '&ge;', '≠': '&ne;', '≡': '&equiv;', '≈': '&asymp;',
            '∝': '&prop;', '∴': '&there4;',
            'α': '&alpha;', 'β': '&beta;', 'γ': '&gamma;', 'δ': '&delta;',
            'ε': '&epsilon;', 'ζ': '&zeta;', 'η': '&eta;', 'θ': '&theta;',
            'ι': '&iota;', 'κ': '&kappa;', 'λ': '&lambda;', 'μ': '&mu;',
            'ν': '&nu;', 'ξ': '&xi;', 'π': '&pi;', 'ρ': '&rho;', 'σ': '&sigma;',
            'τ': '&tau;', 'υ': '&upsilon;', 'φ': '&phi;', 'χ': '&chi;',
            'ψ': '&psi;', 'ω': '&omega;',
            '½': '&frac12;', '⅓': '&frac13;', '¼': '&frac14;', '⅔': '&frac23;',
            '¾': '&frac34;', '⅛': '&frac18;', '²': '&sup2;', '³': '&sup3;',
            '¹': '&sup1;', '°': '&deg;'
        }

        for symbol, html_entity in math_replacements.items():
            text = text.replace(symbol, html_entity)

        return text

    def convert_to_html(self, pdf_content: Dict[str, Any], output_path: str = None) -> str:
        html_content = []
        html_content.append("""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>PDF Document</title>
    <style>
        * {
            box-sizing: border-box;
            margin: 0;
            padding: 0;
        }

        body {
            font-family: 'Times New Roman', Times, serif;
            background-color: #f5f5f5;
            padding: 20px;
            line-height: 1.2;
            color: #000000;
        }

        .document-container {
            max-width: 1200px;
            margin: 0 auto;
            background-color: white;
            box-shadow: 0 4px 12px rgba(0,0,0,0.1);
            border: 1px solid #ddd;
        }

        .page-wrapper {
            background-color: white;
            margin: 0;
            padding: 40px;
            border-bottom: 2px solid #000;
            position: relative;
            min-height: 800px;
            page-break-after: always;
            overflow: visible;
        }

        .page-header {
            background-color: #f8f8f8;
            padding: 10px 15px;
            margin: -40px -40px 30px -40px;
            border-bottom: 2px solid #000;
            font-weight: bold;
            color: #000;
            font-size: 14px;
            text-align: center;
        }

        .content-layer {
            position: relative;
            width: 100%;
            height: 100%;
        }

        .text-content {
            position: relative;
            z-index: 10;
            line-height: 1.4;
        }

        .text-block {
            margin-bottom: 8px;
            font-family: 'Times New Roman', Times, serif;
            color: #000;
            word-wrap: break-word;
            overflow-wrap: break-word;
        }

        .text-block.inline {
            display: inline;
            margin-bottom: 0;
            margin-right: 5px;
        }

        .text-group {
            margin-bottom: 12px;
            line-height: 1.3;
        }

        .bold {
            font-weight: bold;
        }

        .italic {
            font-style: italic;
        }

        .table-container {
            margin: 20px 0;
            background-color: white;
            overflow: auto;
            z-index: 20;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }

        .table {
            width: 100%;
            border-collapse: collapse;
            border: 2px solid #000;
            font-family: 'Times New Roman', Times, serif;
            font-size: 12px;
            color: #000;
            background-color: white;
            margin: 0;
        }

        .table td, .table th {
            border: 1px solid #000;
            padding: 8px 12px;
            text-align: left;
            vertical-align: top;
            background-color: white;
            font-family: 'Times New Roman', Times, serif;
            word-wrap: break-word;
            min-width: 60px;
        }

        .table th {
            background-color: #f0f0f0;
            font-weight: bold;
            text-align: center;
        }

        .table tr:nth-child(even) td {
            background-color: #f9f9f9;
        }

        .table tr:hover td {
            background-color: #f0f0f0;
        }

        .image-container {
            margin: 15px 0;
            border: 1px solid #ccc;
            background-color: white;
            text-align: center;
            overflow: hidden;
            z-index: 5;
        }

        .image {
            max-width: 100%;
            height: auto;
            display: block;
            margin: 0 auto;
        }

        .math-symbol {
            font-family: 'Times New Roman', serif;
        }

        .document-info {
            background-color: #f8f8f8;
            padding: 15px;
            border: 1px solid #ccc;
            margin-bottom: 20px;
            text-align: center;
            font-family: 'Times New Roman', Times, serif;
        }

        @media print {
            body {
                background-color: white;
                padding: 0;
            }
            .page-wrapper {
                border: none;
                box-shadow: none;
                margin: 0;
                page-break-after: always;
            }
            .document-info {
                display: none;
            }
            .table {
                border: 2px solid #000 !important;
            }
            .table td, .table th {
                border: 1px solid #000 !important;
            }
        }
    </style>
</head>
<body>
    <div class="document-container">""")

        html_content.append(f"""
        <div class="document-info">
            <h1>PDF Document Conversion</h1>
            <p><strong>Total Pages:</strong> {pdf_content.get('total_pages', 'Unknown')}</p>
            <p><strong>Converted on:</strong> {self._get_current_timestamp()}</p>
        </div>""")

        for page in pdf_content["pages"]:
            page_width = max(page["page_width"], 595)
            page_height = max(page["page_height"], 842)

            html_content.append(f"""
        <div class="page-wrapper">
            <div class="page-header">
                Page {page["page_number"]} ({page_width:.0f}×{page_height:.0f}px) - Tables: {len(page["tables"])}, Images: {len(page["images"])}, Text Blocks: {len(page["text_blocks"])}
            </div>
            <div class="content-layer">""")

            # Add images first
            for img in page["images"]:
                html_content.append(f"""
                <div class="image-container">
                    <img class="image" src="data:image/png;base64,{img['data']}"
                         alt="Page {page['page_number']} Image {img['index']}">
                </div>""")

            # Add tables with improved generation
            for table_idx, table in enumerate(page["tables"]):
                print(f"Generating HTML for table {table_idx} (source: {table.get('source', 'unknown')})")
                html_content.append(self._generate_html_table(
                    table["data"],
                    header_rows=table.get("header_rows", 1)
                ))

            # Add text content (non-overlapping groups)
            text_groups = self._group_overlapping_text(page["text_blocks"])

            html_content.append('            <div class="text-content">')

            for group in text_groups:
                if len(group) == 1:
                    block = group[0]
                    if block.text.strip():
                        enhanced_text = self.enhance_math_symbols(block.text)
                        enhanced_text = enhanced_text.replace('<', '&lt;').replace('>', '&gt;')

                        css_classes = ["text-block"]
                        if block.is_bold:
                            css_classes.append("bold")
                        if block.is_italic:
                            css_classes.append("italic")
                        if any(s in enhanced_text for s in ['&alpha;', '&beta;', '&gamma;', '&sum;', '&int;']):
                            css_classes.append("math-symbol")

                        font_family = "'Times New Roman', Times, serif"
                        if 'arial' in block.font_name.lower():
                            font_family = "Arial, sans-serif"
                        elif 'helvetica' in block.font_name.lower():
                            font_family = "Helvetica, Arial, sans-serif"
                        elif 'courier' in block.font_name.lower():
                            font_family = "'Courier New', monospace"

                        font_size = max(block.font_size * 0.9, 10)

                        html_content.append(f"""
                <div class="{' '.join(css_classes)}" style="font-size: {font_size}px; font-family: {font_family};">
                    {enhanced_text}
                </div>""")
                else:
                    group.sort(key=lambda b: b.x)
                    html_content.append('            <div class="text-group">')

                    for block in group:
                        if block.text.strip():
                            enhanced_text = self.enhance_math_symbols(block.text)
                            enhanced_text = enhanced_text.replace('<', '&lt;').replace('>', '&gt;')
538 |
+
|
539 |
+
css_classes = ["text-block", "inline"]
|
540 |
+
if block.is_bold:
|
541 |
+
css_classes.append("bold")
|
542 |
+
if block.is_italic:
|
543 |
+
css_classes.append("italic")
|
544 |
+
if any(s in enhanced_text for s in ['α', 'β', 'γ', '∑', '∫']):
|
545 |
+
css_classes.append("math-symbol")
|
546 |
+
|
547 |
+
font_family = "'Times New Roman', Times, serif"
|
548 |
+
if 'arial' in block.font_name.lower():
|
549 |
+
font_family = "Arial, sans-serif"
|
550 |
+
elif 'helvetica' in block.font_name.lower():
|
551 |
+
font_family = "Helvetica, Arial, sans-serif"
|
552 |
+
elif 'courier' in block.font_name.lower():
|
553 |
+
font_family = "'Courier New', monospace"
|
554 |
+
|
555 |
+
font_size = max(block.font_size * 0.9, 10)
|
556 |
+
|
557 |
+
html_content.append(f"""
|
558 |
+
<span class="{' '.join(css_classes)}" style="font-size: {font_size}px; font-family: {font_family};">
|
559 |
+
{enhanced_text}
|
560 |
+
</span>""")
|
561 |
+
|
562 |
+
html_content.append(' </div>')
|
563 |
+
|
564 |
+
html_content.append(""" </div>
|
565 |
+
</div>
|
566 |
+
</div>""")
|
567 |
+
|
568 |
+
html_content.append(" </div>")
|
569 |
+
html_content.append("""
|
570 |
+
</body>
|
571 |
+
</html>""")
|
572 |
+
final_html = "\n".join(html_content)
|
573 |
+
|
574 |
+
if output_path:
|
575 |
+
try:
|
576 |
+
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
|
577 |
+
with open(output_path, 'w', encoding='utf-8') as f:
|
578 |
+
f.write(final_html)
|
579 |
+
print(f"β
HTML saved to: {output_path}")
|
580 |
+
except Exception as e:
|
581 |
+
print(f"β οΈ Error saving HTML to {output_path}: {e}")
|
582 |
+
|
583 |
+
return final_html
|
584 |
+
|
585 |
+
def _get_current_timestamp(self) -> str:
|
586 |
+
return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
587 |
+
|
588 |
+
def process_pdf(self, pdf_path: str, output_path: str = None, use_hf_models: bool = False) -> str:
|
589 |
+
print(f"π Processing PDF: {pdf_path}")
|
590 |
+
|
591 |
+
if not os.path.exists(pdf_path):
|
592 |
+
raise FileNotFoundError(f"PDF file not found: {pdf_path}")
|
593 |
+
|
594 |
+
print("π Extracting PDF content...")
|
595 |
+
pdf_content = self.extract_pdf_content(pdf_path)
|
596 |
+
|
597 |
+
if use_hf_models and self.hf_token:
|
598 |
+
print("π€ Attempting to enhance with Hugging Face models...")
|
599 |
+
try:
|
600 |
+
print("Note: Hugging Face model integration requires further implementation.")
|
601 |
+
except Exception as e:
|
602 |
+
print(f"β οΈ Hugging Face enhancement failed: {e}")
|
603 |
+
|
604 |
+
print("π Converting to HTML...")
|
605 |
+
html_content = self.convert_to_html(pdf_content, output_path)
|
606 |
+
|
607 |
+
print("β
Processing complete!")
|
608 |
+
return html_content
|
609 |
+
|
610 |
+
def main():
|
611 |
+
HF_TOKEN = os.getenv("HF_API_TOKEN")
|
612 |
+
|
613 |
+
converter = PDFToHTMLConverter(huggingface_token=HF_TOKEN)
|
614 |
+
pdf_path = "new-pdf.pdf"
|
615 |
+
output_path = "sample_converted.html"
|
616 |
+
|
617 |
+
try:
|
618 |
+
html_content = converter.process_pdf(
|
619 |
+
pdf_path=pdf_path,
|
620 |
+
output_path=output_path,
|
621 |
+
use_hf_models=False
|
622 |
+
)
|
623 |
+
|
624 |
+
print(f"β
Successfully converted '{pdf_path}' to '{output_path}'")
|
625 |
+
print(f"π Open '{output_path}' in your web browser to view the result!")
|
626 |
+
|
627 |
+
except FileNotFoundError as e:
|
628 |
+
print(f"β Error: {e}")
|
629 |
+
print("Please ensure the PDF file exists at the specified path.")
|
630 |
+
except Exception as e:
|
631 |
+
print(f"β An unexpected error occurred: {str(e)}")
|
632 |
+
import traceback
|
633 |
+
traceback.print_exc()
|
634 |
+
|
635 |
+
if __name__ == "__main__":
|
636 |
+
main()
|
pdf_json.py
ADDED
@@ -0,0 +1,513 @@
import os
import base64
import json
import requests
from typing import Dict, List, Any, Optional
import fitz  # PyMuPDF
from PIL import Image
import io
import re
from dataclasses import dataclass, asdict
from pathlib import Path
from datetime import datetime


@dataclass
class TextBlock:
    text: str
    x: float
    y: float
    width: float
    height: float
    font_size: float
    font_name: str
    is_bold: bool = False
    is_italic: bool = False
    block_id: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Convert TextBlock to dictionary"""
        return asdict(self)


@dataclass
class ImageData:
    index: int
    base64_data: str
    bbox: tuple
    width: float
    height: float
    format: str = "PNG"

    def to_dict(self) -> Dict[str, Any]:
        """Convert ImageData to dictionary"""
        return asdict(self)


@dataclass
class TableData:
    bbox: tuple
    data: List[List[str]]
    rows: int
    columns: int

    def to_dict(self) -> Dict[str, Any]:
        """Convert TableData to dictionary"""
        return asdict(self)


@dataclass
class PageData:
    page_number: int
    text_blocks: List[TextBlock]
    images: List[ImageData]
    tables: List[TableData]
    page_width: float
    page_height: float
    word_count: int = 0
    character_count: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Convert PageData to dictionary"""
        return {
            "page_number": self.page_number,
            "text_blocks": [block.to_dict() for block in self.text_blocks],
            "images": [img.to_dict() for img in self.images],
            "tables": [table.to_dict() for table in self.tables],
            "page_width": self.page_width,
            "page_height": self.page_height,
            "word_count": self.word_count,
            "character_count": self.character_count
        }


class PDFToJSONConverter:
    def __init__(self, huggingface_token: str = None):
        self.hf_token = huggingface_token
        self.hf_headers = {
            "Authorization": f"Bearer {huggingface_token}" if huggingface_token else None
        }
        self.models = {
            "document_layout": "microsoft/layoutlm-base-uncased",
            "table_detection": "microsoft/table-transformer-detection",
            "ocr": "microsoft/trocr-base-printed",
            "math_detection": "facebook/detr-resnet-50"
        }
        self.hf_inference_url = "https://api-inference.huggingface.co/models"

    def pdf_to_base64(self, pdf_path: str) -> str:
        """Convert PDF file to base64 string"""
        try:
            with open(pdf_path, "rb") as pdf_file:
                return base64.b64encode(pdf_file.read()).decode('utf-8')
        except Exception as e:
            raise Exception(f"Error converting PDF to base64: {str(e)}")

    def extract_pdf_content(self, pdf_path: str) -> Dict[str, Any]:
        """Extract all content from PDF and return structured data"""
        doc = None
        try:
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")

            doc = fitz.open(pdf_path)

            if doc is None:
                raise RuntimeError("Failed to open PDF document")

            if doc.page_count == 0:
                raise ValueError("PDF document has no pages")

            print(f"PDF opened successfully: {doc.page_count} pages")

            pages_data = []
            document_stats = {
                "total_pages": doc.page_count,
                "total_words": 0,
                "total_characters": 0,
                "total_images": 0,
                "total_tables": 0
            }

            for page_num in range(doc.page_count):
                try:
                    page = doc[page_num]
                    print(f"Processing page {page_num + 1}/{doc.page_count}")

                    # Extract text blocks
                    text_blocks = []
                    try:
                        page_dict = page.get_text("dict")
                        text_blocks = self._extract_text_blocks_from_dict(page_dict, page_num)
                    except Exception as e:
                        print(f"⚠️ Dict method failed for page {page_num + 1}, falling back to simple text extraction: {e}")
                        text_blocks = self._extract_text_blocks_simple(page, page_num)

                    # Extract images
                    images = self._extract_images_safely(page, doc, page_num)

                    # Extract tables
                    tables = self._detect_tables_safely(page)

                    # Get page dimensions
                    page_rect = page.rect

                    # Calculate statistics
                    page_text = " ".join([block.text for block in text_blocks])
                    word_count = len(page_text.split())
                    char_count = len(page_text)

                    # Create page data
                    page_data = PageData(
                        page_number=page_num + 1,
                        text_blocks=text_blocks,
                        images=images,
                        tables=tables,
                        page_width=page_rect.width,
                        page_height=page_rect.height,
                        word_count=word_count,
                        character_count=char_count
                    )

                    pages_data.append(page_data)

                    # Update document statistics
                    document_stats["total_words"] += word_count
                    document_stats["total_characters"] += char_count
                    document_stats["total_images"] += len(images)
                    document_stats["total_tables"] += len(tables)

                except Exception as e:
                    print(f"❌ Error processing page {page_num + 1}: {e}")
                    # Create empty page data for failed pages
                    empty_page = PageData(
                        page_number=page_num + 1,
                        text_blocks=[],
                        images=[],
                        tables=[],
                        page_width=595,
                        page_height=842,
                        word_count=0,
                        character_count=0
                    )
                    pages_data.append(empty_page)

            result = {
                "document_info": {
                    "filename": os.path.basename(pdf_path),
                    "file_size": os.path.getsize(pdf_path),
                    "conversion_timestamp": self._get_current_timestamp(),
                    "converter_version": "1.0.0"
                },
                "document_statistics": document_stats,
                "pages": [page.to_dict() for page in pages_data]
            }

            return result

        except Exception as e:
            raise Exception(f"Error extracting PDF content: {str(e)}")
        finally:
            if doc is not None:
                try:
                    doc.close()
                    print("✅ PDF document closed successfully")
                except Exception as e:
                    print(f"⚠️ Error closing PDF document: {e}")

    def _extract_text_blocks_from_dict(self, page_dict: dict, page_num: int) -> List[TextBlock]:
        """Extract text blocks from page dictionary with detailed formatting"""
        text_blocks = []

        for block_idx, block in enumerate(page_dict.get("blocks", [])):
            if "lines" not in block:
                continue

            for line_idx, line in enumerate(block["lines"]):
                for span_idx, span in enumerate(line["spans"]):
                    text_content = span.get("text", "").strip()
                    if text_content:
                        bbox = span["bbox"]
                        font_info = {
                            "size": span.get("size", 12),
                            "font": span.get("font", "Arial"),
                            "is_bold": "bold" in span.get("font", "").lower() or span.get("flags", 0) & 16,
                            "is_italic": "italic" in span.get("font", "").lower() or span.get("flags", 0) & 2
                        }

                        text_block = TextBlock(
                            text=text_content,
                            x=round(bbox[0], 2),
                            y=round(bbox[1], 2),
                            width=round(bbox[2] - bbox[0], 2),
                            height=round(bbox[3] - bbox[1], 2),
                            font_size=round(font_info["size"], 2),
                            font_name=font_info["font"],
                            is_bold=font_info["is_bold"],
                            is_italic=font_info["is_italic"],
                            block_id=f"p{page_num}-b{block_idx}-l{line_idx}-s{span_idx}"
                        )
                        text_blocks.append(text_block)

        return text_blocks

    def _extract_text_blocks_simple(self, page, page_num: int) -> List[TextBlock]:
        """Fallback method for text extraction"""
        text_blocks = []
        try:
            blocks_data = page.get_text("blocks")
            for block_idx, block in enumerate(blocks_data):
                if block[6] == 0:  # Text block
                    text = block[4].strip()
                    if text:
                        x0, y0, x1, y1 = block[0], block[1], block[2], block[3]

                        lines = text.split('\n')
                        line_height = (y1 - y0) / max(len(lines), 1)

                        for line_idx, line in enumerate(lines):
                            if line.strip():
                                text_block = TextBlock(
                                    text=line.strip(),
                                    x=round(x0, 2),
                                    y=round(y0 + (line_idx * line_height), 2),
                                    width=round(x1 - x0, 2),
                                    height=round(line_height, 2),
                                    font_size=12.0,
                                    font_name="Arial",
                                    is_bold=False,
                                    is_italic=False,
                                    block_id=f"p{page_num}-simple-b{block_idx}-l{line_idx}"
                                )
                                text_blocks.append(text_block)
        except Exception as e:
            print(f"⚠️ Simple text block extraction failed: {e}")

        return text_blocks

    def _extract_images_safely(self, page, doc, page_num) -> List[ImageData]:
        """Extract images from page and return structured data"""
        images = []
        try:
            image_list = page.get_images(full=True)
            for img_index, img_info in enumerate(image_list):
                try:
                    xref = img_info[0]

                    # Get image rectangles
                    img_rects = [r for r in page.get_image_rects(xref)]
                    if not img_rects:
                        continue

                    bbox = img_rects[0]

                    # Extract image data
                    pix = fitz.Pixmap(doc, xref)
                    if pix.n - pix.alpha < 4:  # Valid image
                        img_data = pix.tobytes("png")
                        img_base64 = base64.b64encode(img_data).decode()

                        image_data = ImageData(
                            index=img_index,
                            base64_data=img_base64,
                            bbox=(round(bbox.x0, 2), round(bbox.y0, 2),
                                  round(bbox.x1, 2), round(bbox.y1, 2)),
                            width=round(bbox.x1 - bbox.x0, 2),
                            height=round(bbox.y1 - bbox.y0, 2),
                            format="PNG"
                        )
                        images.append(image_data)
                    pix = None
                except Exception as e:
                    print(f"⚠️ Error extracting image {img_index} on page {page_num+1}: {e}")
                    continue
        except Exception as e:
            print(f"⚠️ General error in image extraction for page {page_num+1}: {e}")
        return images

    def _detect_tables_safely(self, page) -> List[TableData]:
        """Extract tables from page and return structured data"""
        tables = []
        try:
            tabs = page.find_tables()
            for tab_index, tab in enumerate(tabs):
                try:
                    table_data = tab.extract()
                    if table_data:
                        # Clean table data
                        cleaned_data = []
                        for row in table_data:
                            cleaned_row = [str(cell).strip() if cell else "" for cell in row]
                            if any(cleaned_row):  # Only add non-empty rows
                                cleaned_data.append(cleaned_row)

                        if cleaned_data:
                            table_obj = TableData(
                                bbox=(round(tab.bbox.x0, 2), round(tab.bbox.y0, 2),
                                      round(tab.bbox.x1, 2), round(tab.bbox.y1, 2)),
                                data=cleaned_data,
                                rows=len(cleaned_data),
                                columns=max(len(row) for row in cleaned_data) if cleaned_data else 0
                            )
                            tables.append(table_obj)
                except Exception as e:
                    print(f"⚠️ Error extracting table {tab_index}: {e}")
                    continue
        except Exception as e:
            print(f"⚠️ General error in table detection: {e}")
        return tables

    def convert_to_json(self, pdf_content: Dict[str, Any], output_path: str = None,
                        pretty_print: bool = True, include_base64_images: bool = True) -> str:
        """Convert PDF content to JSON format"""
        print("Converting to JSON format...")

        try:
            # Create a copy of the content for modification
            json_content = pdf_content.copy()

            # Add metadata
            json_content["conversion_options"] = {
                "pretty_print": pretty_print,
                "include_base64_images": include_base64_images,
                "json_schema_version": "1.0"
            }

            # Optionally remove base64 image data to reduce file size
            if not include_base64_images:
                for page in json_content["pages"]:
                    for image in page["images"]:
                        image["base64_data"] = "[Base64 data removed - set include_base64_images=True to include]"

            # Convert to JSON string
            if pretty_print:
                json_string = json.dumps(json_content, indent=2, ensure_ascii=False)
            else:
                json_string = json.dumps(json_content, ensure_ascii=False)

            # Save to file if output path provided
            if output_path:
                try:
                    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
                    with open(output_path, 'w', encoding='utf-8') as f:
                        f.write(json_string)
                    print(f"✅ JSON saved to: {output_path}")
                    print(f"File size: {len(json_string):,} characters")
                except Exception as e:
                    print(f"⚠️ Error saving JSON to {output_path}: {e}")

            return json_string

        except Exception as e:
            raise Exception(f"Error converting to JSON: {str(e)}")

    def create_json_summary(self, pdf_content: Dict[str, Any]) -> Dict[str, Any]:
        """Create a summary of the PDF content without full data"""
        summary = {
            "document_info": pdf_content.get("document_info", {}),
            "document_statistics": pdf_content.get("document_statistics", {}),
            "page_summaries": []
        }

        for page in pdf_content.get("pages", []):
            page_summary = {
                "page_number": page["page_number"],
                "text_blocks_count": len(page["text_blocks"]),
                "images_count": len(page["images"]),
                "tables_count": len(page["tables"]),
                "word_count": page["word_count"],
                "character_count": page["character_count"],
                "page_dimensions": {
                    "width": page["page_width"],
                    "height": page["page_height"]
                },
                "sample_text": " ".join([block["text"] for block in page["text_blocks"][:3]])[:200] + "..." if page["text_blocks"] else ""
            }
            summary["page_summaries"].append(page_summary)

        return summary

    def _get_current_timestamp(self) -> str:
        """Get current timestamp as string"""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def process_pdf_to_json(self, pdf_path: str, output_path: str = None,
                            pretty_print: bool = True, include_base64_images: bool = True,
                            create_summary: bool = False, use_hf_models: bool = False) -> str:
        """Main method to process PDF and convert to JSON"""
        print(f"Processing PDF to JSON: {pdf_path}")

        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        print("Extracting PDF content...")
        pdf_content = self.extract_pdf_content(pdf_path)

        if use_hf_models and self.hf_token:
            print("Attempting to enhance with Hugging Face models...")
            try:
                print("Note: Hugging Face model integration requires further implementation.")
            except Exception as e:
                print(f"⚠️ Hugging Face enhancement failed: {e}")

        print("Converting to JSON...")
        json_content = self.convert_to_json(
            pdf_content,
            output_path,
            pretty_print,
            include_base64_images
        )

        # Create summary file if requested
        if create_summary and output_path:
            summary_path = output_path.replace('.json', '_summary.json')
            summary_data = self.create_json_summary(pdf_content)
            summary_json = json.dumps(summary_data, indent=2, ensure_ascii=False)

            try:
                with open(summary_path, 'w', encoding='utf-8') as f:
                    f.write(summary_json)
                print(f"✅ Summary JSON saved to: {summary_path}")
            except Exception as e:
                print(f"⚠️ Error saving summary: {e}")

        print("✅ Processing complete!")
        return json_content


def main():
    """Main function to demonstrate PDF to JSON conversion"""
    # Set your Hugging Face token if needed
    HF_TOKEN = os.getenv("HF_API_TOKEN")

    # Initialize converter
    converter = PDFToJSONConverter(huggingface_token=HF_TOKEN)

    # Define paths
    pdf_path = "new-pdf.pdf"  # Change this to your PDF file path
    output_path = "converted_document.json"  # Output JSON file path

    try:
        # Convert PDF to JSON
        json_content = converter.process_pdf_to_json(
            pdf_path=pdf_path,
            output_path=output_path,
            pretty_print=True,            # Format JSON with indentation
            include_base64_images=True,   # Include image data (set False to reduce file size)
            create_summary=True,          # Create additional summary file
            use_hf_models=False           # Set to True if you want to use HuggingFace models
        )

        print(f"✅ Successfully converted '{pdf_path}' to '{output_path}'")
        print(f"JSON length: {len(json_content):,} characters")
        print(f"Open '{output_path}' to view the structured JSON data!")

        # Optional: Print first 500 characters of JSON as preview
        print("\nJSON Preview (first 500 characters):")
        print("-" * 50)
        print(json_content[:500] + "..." if len(json_content) > 500 else json_content)

    except FileNotFoundError as e:
        print(f"❌ Error: {e}")
        print("Please ensure the PDF file exists at the specified path.")
    except Exception as e:
        print(f"❌ An unexpected error occurred: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
pdf_word.py
ADDED
@@ -0,0 +1,559 @@
import os
import base64
import json
from typing import Dict, List, Any
import fitz
from PIL import Image
import io
import re
from dataclasses import dataclass
from pathlib import Path
from datetime import datetime
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.oxml.shared import OxmlElement, qn
from docx.oxml.ns import nsdecls
from docx.oxml import parse_xml
import unicodedata
import docx
import camelot


@dataclass
class TextBlock:
    text: str
    x: float
    y: float
    width: float
    height: float
    font_size: float
    font_name: str
    is_bold: bool = False
    is_italic: bool = False
    block_id: str = ""
    is_math: bool = False


class PDFToWordConverter:
    def __init__(self, huggingface_token: str = None):
        self.hf_token = huggingface_token
        self.hf_headers = {
            "Authorization": f"Bearer {huggingface_token}" if huggingface_token else None
        }
        self.models = {
            "document_layout": "microsoft/layoutlm-base-uncased",
            "table_detection": "microsoft/table-transformer-detection",
            "ocr": "microsoft/trocr-base-printed",
            "math_detection": "facebook/detr-resnet-50"
        }
        self.hf_inference_url = "https://api-inference.huggingface.co/models"
        # Identity map of math symbols so they survive text cleaning
        self.math_symbols = {
            '∑': '∑', '∏': '∏', '√': '√', '∫': '∫', '∞': '∞', '≤': '≤', '≥': '≥', '≠': '≠', '±': '±',
            '×': '×', '÷': '÷', 'α': 'α', 'β': 'β', 'γ': 'γ', 'δ': 'δ', 'θ': 'θ', 'λ': 'λ', 'μ': 'μ',
            'π': 'π', 'σ': 'σ', 'φ': 'φ', 'ω': 'ω'
        }

    def detect_mathematical_content(self, text: str) -> bool:
        math_patterns = [
            r'\d+\s*[+\-*/=]\s*\d+', r'[a-zA-Z]\s*=\s*\d+', r'\b(?:sin|cos|tan|log|ln|exp)\s*\(',
            r'\d+\s*\^\s*\d+', r'√\d+', r'\d+/\d+', r'[∑∏∫]', r'[≤≥≠±×÷]', r'[αβγδθλμπσφω]',
            r'\bEquation\s+\d+', r'\d+\.\d+', r'\$\d+,?\d*', r'NORMSINV', r'using Equation'
        ]
        for pattern in math_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return True
        return False

    def preserve_mathematical_formatting(self, text: str) -> str:
        if not text:
            return ""
        text = text.replace('×', '×')
        text = text.replace('÷', '÷')
        text = text.replace('±', '±')
        text = text.replace('≤', '≤')
        text = text.replace('≥', '≥')
        text = text.replace('≠', '≠')
        text = text.replace('√', '√')
        text = text.replace('∑', '∑')
        text = text.replace('∏', '∏')
        text = text.replace('∫', '∫')
        text = text.replace('∞', '∞')
        text = re.sub(r'(\d+)\s*\^\s*(\d+)', r'\1^\2', text)
        text = re.sub(r'(\w+)\s*\(\s*([^)]+)\s*\)', r'\1(\2)', text)
        return text

    def clean_text_for_xml(self, text: str) -> str:
        if not text:
            return ""
        try:
            text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)
            text = text.replace('\ufeff', '')
            text = text.replace('\u0000', '')
            text = unicodedata.normalize('NFKC', text)
            printable_chars = []
            for char in text:
                if char.isprintable() or char.isspace() or char in self.math_symbols:
                    printable_chars.append(char)
                else:
                    printable_chars.append(' ')
            text = ''.join(printable_chars)
            text = re.sub(r'\s+', ' ', text).strip()
            text = text.encode('utf-8', errors='ignore').decode('utf-8')
            return self.preserve_mathematical_formatting(text)
        except Exception:
            return ''.join(char for char in str(text) if ord(char) < 128).strip()

    def clean_font_name(self, font_name: str) -> str:
        if not font_name:
            return "Calibri"
        try:
            cleaned = self.clean_text_for_xml(font_name)
            cleaned = re.sub(r'[^\w\s-]', '', cleaned)
            if not cleaned.strip():
                return "Calibri"
            return cleaned.strip()
        except Exception:
            return "Calibri"

    def pdf_to_base64(self, pdf_path: str) -> str:
        try:
            with open(pdf_path, "rb") as pdf_file:
                return base64.b64encode(pdf_file.read()).decode('utf-8')
        except Exception as e:
            raise Exception(f"Error converting PDF to base64: {str(e)}")

    def extract_pdf_content(self, pdf_path: str) -> Dict[str, Any]:
        doc = None
        try:
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")
            doc = fitz.open(pdf_path)
            if doc is None:
                raise RuntimeError("Failed to open PDF document")
            if doc.page_count == 0:
                raise ValueError("PDF document has no pages")
            print(f"PDF opened successfully: {doc.page_count} pages")
            pages_content = []
            for page_num in range(doc.page_count):
                try:
                    page = doc[page_num]
                    print(f"Processing page {page_num + 1}/{doc.page_count}")
                    text_blocks = []
                    try:
                        page_dict = page.get_text("dict")
                        text_blocks = self._extract_text_blocks_from_dict(page_dict, page_num)
                    except Exception as e:
                        print(f"Dict method failed for page {page_num + 1}, using fallback: {e}")
                        text_blocks = self._extract_text_blocks_simple(page, page_num)
                    images = self._extract_images_safely(page, doc, page_num)
                    tables = self._detect_tables_with_camelot(pdf_path, page_num)
                    page_rect = page.rect
                    pages_content.append({
                        "page_number": page_num + 1,
                        "text_blocks": text_blocks,
                        "images": images,
                        "tables": tables,
                        "page_width": page_rect.width,
                        "page_height": page_rect.height
                    })
                except Exception as e:
                    print(f"Error processing page {page_num + 1}: {e}")
                    pages_content.append({
                        "page_number": page_num + 1,
                        "text_blocks": [],
                        "images": [],
                        "tables": [],
                        "page_width": 595,
                        "page_height": 842
                    })
            result = {
                "pages": pages_content,
                "total_pages": doc.page_count
            }
            return result
        except Exception as e:
            raise Exception(f"Error extracting PDF content: {str(e)}")
        finally:
            if doc is not None:
                try:
                    doc.close()
                    print("PDF document closed successfully")
                except Exception as e:
                    print(f"Error closing PDF document: {e}")

    def _extract_text_blocks_from_dict(self, page_dict: dict, page_num: int) -> List[TextBlock]:
        text_blocks = []
        for block_idx, block in enumerate(page_dict.get("blocks", [])):
            if "lines" not in block:
                continue
            for line_idx, line in enumerate(block["lines"]):
                for span_idx, span in enumerate(line["spans"]):
                    text_content = span.get("text", "").strip()
                    if text_content:
                        cleaned_text = self.clean_text_for_xml(text_content)
                        if not cleaned_text:
                            continue
                        bbox = span["bbox"]
                        font_name = self.clean_font_name(span.get("font", "Arial"))
                        font_info = {
                            "size": max(span.get("size", 12), 6),
                            "font": font_name,
                            "is_bold": "bold" in font_name.lower() or bool(span.get("flags", 0) & 16),
                            "is_italic": "italic" in font_name.lower() or bool(span.get("flags", 0) & 2)
                        }
                        is_math = self.detect_mathematical_content(cleaned_text)
                        text_block = TextBlock(
                            text=cleaned_text,
                            x=bbox[0], y=bbox[1],
                            width=bbox[2] - bbox[0], height=bbox[3] - bbox[1],
                            font_size=font_info["size"], font_name=font_info["font"],
                            is_bold=font_info["is_bold"], is_italic=font_info["is_italic"],
                            block_id=f"p{page_num}-b{block_idx}-l{line_idx}-s{span_idx}",
                            is_math=is_math
                        )
                        text_blocks.append(text_block)
        return text_blocks

    def _extract_text_blocks_simple(self, page, page_num: int) -> List[TextBlock]:
        text_blocks = []
        try:
            blocks_data = page.get_text("blocks")
            for block_idx, block in enumerate(blocks_data):
                if block[6] == 0:
                    text = block[4].strip()
                    if text:
                        cleaned_text = self.clean_text_for_xml(text)
                        if not cleaned_text:
                            continue
                        x0, y0, x1, y1 = block[0], block[1], block[2], block[3]
                        lines = cleaned_text.split('\n')
                        line_height = (y1 - y0) / max(len(lines), 1)
                        for line_idx, line in enumerate(lines):
                            line_text = self.clean_text_for_xml(line)
                            if line_text:
                                is_math = self.detect_mathematical_content(line_text)
                                text_block = TextBlock(
                                    text=line_text,
                                    x=x0, y=y0 + (line_idx * line_height),
                                    width=x1 - x0, height=line_height,
                                    font_size=12, font_name="Arial",
                                    is_bold=False, is_italic=False,
                                    block_id=f"p{page_num}-simple-b{block_idx}-l{line_idx}",
                                    is_math=is_math
                                )
                                text_blocks.append(text_block)
        except Exception as e:
            print(f"Simple text block extraction failed: {e}")
        return text_blocks

    def _extract_images_safely(self, page, doc, page_num) -> List[Dict]:
        images = []
        try:
            image_list = page.get_images(full=True)
            for img_index, img_info in enumerate(image_list):
                try:
                    xref = img_info[0]
                    img_rects = [r for r in page.get_image_rects(xref)]
                    if not img_rects:
                        continue
                    bbox = img_rects[0]
                    pix = fitz.Pixmap(doc, xref)
                    if pix.n - pix.alpha < 4:
                        img_data = pix.tobytes("png")
                        img_base64 = base64.b64encode(img_data).decode()
                        images.append({
                            "index": img_index,
                            "data": img_data,
                            "base64": img_base64,
                            "bbox": (bbox.x0, bbox.y0, bbox.x1, bbox.y1)
                        })
                    pix = None
                except Exception as e:
                    print(f"Error extracting image {img_index} on page {page_num+1}: {e}")
                    continue
        except Exception as e:
            print(f"General error in image extraction for page {page_num+1}: {e}")
        return images

    def _detect_tables_with_camelot(self, pdf_path: str, page_num: int) -> List[Dict]:
        tables = []
        try:
            try:
                camelot_tables = camelot.read_pdf(
                    pdf_path,
                    pages=str(page_num + 1),
                    flavor='lattice',
                    suppress_stdout=True
                )
                if len(camelot_tables) == 0:
                    camelot_tables = camelot.read_pdf(
                        pdf_path,
                        pages=str(page_num + 1),
                        flavor='stream',
                        suppress_stdout=True
                    )
            except Exception:
                camelot_tables = camelot.read_pdf(
                    pdf_path,
                    pages=str(page_num + 1),
                    flavor='stream',
                    suppress_stdout=True
                )

            for table in camelot_tables:
                table_data = table.df.values.tolist()
                if table_data and any(any(str(cell).strip() for cell in row) for row in table_data):
                    cleaned_data = []
                    for row in table_data:
                        cleaned_row = []
                        for cell in row:
                            cell_text = str(cell).strip() if cell is not None else ""
                            cleaned_cell = self.clean_text_for_xml(cell_text)
                            cleaned_row.append(cleaned_cell)
                        cleaned_data.append(cleaned_row)

                    tables.append({
                        "bbox": table.bbox,
                        "data": cleaned_data,
                        "accuracy": getattr(table, 'accuracy', 0)
                    })
                    print(f"Found table with {len(cleaned_data)} rows and {len(cleaned_data[0]) if cleaned_data else 0} columns on page {page_num + 1}")
        except Exception as e:
            print(f"Error detecting tables with Camelot on page {page_num + 1}: {e}")
        return tables

    def _add_page_break(self, doc):
        try:
            paragraph = doc.add_paragraph()
            run = paragraph.runs[0] if paragraph.runs else paragraph.add_run()
            run.add_break(WD_BREAK.PAGE)
        except Exception:
            doc.add_page_break()

    def _set_font_properties(self, run, text_block: TextBlock):
        try:
            font_name = self.clean_font_name(text_block.font_name)
            if 'Times' in font_name or 'Roman' in font_name:
                run.font.name = 'Times New Roman'
            elif 'Arial' in font_name:
                run.font.name = 'Arial'
            elif 'Courier' in font_name:
                run.font.name = 'Courier New'
            else:
                run.font.name = 'Calibri'
            try:
                font_size_val = float(text_block.font_size)
                font_size = max(min(int(font_size_val), 72), 6)
                run.font.size = Pt(font_size)
            except (ValueError, TypeError):
                print(f"Warning: Invalid font_size '{text_block.font_size}'. Using default 11pt.")
                run.font.size = Pt(11)
            run.font.bold = bool(text_block.is_bold)
            run.font.italic = bool(text_block.is_italic)
            if text_block.is_math:
                run.font.name = 'Cambria Math'
        except Exception as e:
            print(f"Error setting font properties for text_block: {e}")
            run.font.name = 'Calibri'
            run.font.size = Pt(11)
            run.font.bold = False
            run.font.italic = False

    def _group_text_blocks_by_lines(self, text_blocks: List[TextBlock]) -> List[List[TextBlock]]:
        if not text_blocks:
            return []
        sorted_blocks = sorted(text_blocks, key=lambda b: (round(b.y, 1), b.x))
        lines = []
        current_line = []
        current_y = None
        for block in sorted_blocks:
            if current_y is None or abs(block.y - current_y) <= 5:
                current_line.append(block)
                current_y = block.y if current_y is None else current_y
            else:
                if current_line:
                    lines.append(current_line)
                current_line = [block]
                current_y = block.y
        if current_line:
            lines.append(current_line)
        return lines

    def _set_table_borders(self, table):
        tbl = table._tbl
        for row in tbl.tr_lst:
            for cell in row.tc_lst:
                tcPr = cell.tcPr
                tcBorders = OxmlElement('w:tcBorders')

                for border_name in ['top', 'left', 'bottom', 'right']:
                    border = OxmlElement(f'w:{border_name}')
                    border.set(qn('w:val'), 'single')
                    border.set(qn('w:sz'), '4')
                    border.set(qn('w:space'), '0')
                    border.set(qn('w:color'), '000000')
                    tcBorders.append(border)

                tcPr.append(tcBorders)

    def _create_enhanced_table(self, doc, table_data):
        try:
            table_rows = table_data["data"]
            if not table_rows or not any(any(str(cell).strip() for cell in row) for row in table_rows):
                return None

            max_cols = max(len(row) for row in table_rows) if table_rows else 0
            if max_cols == 0:
                return None

            word_table = doc.add_table(rows=len(table_rows), cols=max_cols)

            self._set_table_borders(word_table)
            word_table.alignment = WD_TABLE_ALIGNMENT.CENTER
            word_table.autofit = False

            for row_idx, row_data in enumerate(table_rows):
                for col_idx in range(max_cols):
                    cell = word_table.cell(row_idx, col_idx)
                    cell_data = row_data[col_idx] if col_idx < len(row_data) else ""
                    clean_cell_data = self.clean_text_for_xml(str(cell_data) if cell_data else "")

                    paragraph = cell.paragraphs[0]
                    paragraph.clear()
                    run = paragraph.add_run(clean_cell_data)

                    if self.detect_mathematical_content(clean_cell_data):
                        run.font.name = 'Cambria Math'
                    else:
                        run.font.name = 'Calibri'
                    run.font.size = Pt(9)

                    if row_idx == 0:
                        run.font.bold = True
                        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

                    cell.vertical_alignment = docx.enum.table.WD_ALIGN_VERTICAL.CENTER

            print(f"Created table with {len(table_rows)} rows and {max_cols} columns")
            return word_table
        except Exception as e:
            print(f"Error creating enhanced table: {e}")
            return None

    def convert_to_word(self, pdf_content: Dict[str, Any], output_path: str = None) -> Document:
        print("Creating Word document...")
        doc = Document()
        doc.core_properties.title = "PDF to Word Conversion"
        doc.core_properties.author = "PDF Converter"
        doc.core_properties.created = datetime.now()
        header_para = doc.add_paragraph()
        header_run = header_para.add_run("PDF Document Conversion")
        header_run.font.size = Pt(16)
        header_run.font.bold = True
        header_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        info_para = doc.add_paragraph()
        info_run = info_para.add_run(f"Total Pages: {pdf_content.get('total_pages', 'Unknown')} | Converted on: {self._get_current_timestamp()}")
        info_run.font.size = Pt(10)
        info_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        doc.add_paragraph()

        for page_idx, page in enumerate(pdf_content["pages"]):
            print(f"Converting page {page['page_number']}/{pdf_content.get('total_pages', '?')}")
            page_header = doc.add_paragraph()
            page_header_run = page_header.add_run(f"--- Page {page['page_number']} ---")
            page_header_run.font.bold = True
            page_header_run.font.size = Pt(12)
            page_header.alignment = WD_ALIGN_PARAGRAPH.CENTER

            for img in page["images"]:
                try:
                    img_para = doc.add_paragraph()
                    img_run = img_para.add_run()
                    img_stream = io.BytesIO(img['data'])
                    img_bbox = img['bbox']
                    img_width_px = img_bbox[2] - img_bbox[0]
                    page_width_px = page.get('page_width', 595)
                    img_width = min(Inches(img_width_px / 72), Inches(6.5))
                    img_run.add_picture(img_stream, width=img_width)
                    img_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
                except Exception as e:
                    print(f"Error adding image to Word document: {e}")
                    img_para = doc.add_paragraph()
                    img_run = img_para.add_run(f"[Image {img['index']} - Could not be inserted]")
                    img_run.font.italic = True

            if page["tables"]:
                for table_data in page["tables"]:
                    try:
                        enhanced_table = self._create_enhanced_table(doc, table_data)
                        if enhanced_table:
                            doc.add_paragraph()
                    except Exception as e:
                        print(f"Error adding table to Word document: {e}")

            text_lines = self._group_text_blocks_by_lines(page["text_blocks"])
            for line_blocks in text_lines:
                if not line_blocks:
                    continue
                para = doc.add_paragraph()
                line_blocks.sort(key=lambda b: b.x)
                for block in line_blocks:
                    cleaned_text = self.clean_text_for_xml(block.text)
                    if cleaned_text:
                        run = para.add_run(cleaned_text + " ")
                        self._set_font_properties(run, block)
            if page_idx < len(pdf_content["pages"]) - 1:
                self._add_page_break(doc)
        if output_path:
            try:
                Path(output_path).parent.mkdir(parents=True, exist_ok=True)
                doc.save(output_path)
                print(f"Word document saved to: {output_path}")
            except Exception as e:
                print(f"Error saving Word document to {output_path}: {e}")
        return doc

    def _get_current_timestamp(self) -> str:
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def process_pdf_to_word(self, pdf_path: str, output_path: str = None, use_hf_models: bool = False) -> Document:
        print(f"Processing PDF to Word: {pdf_path}")
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
        print("Extracting PDF content...")
        pdf_content = self.extract_pdf_content(pdf_path)
        if use_hf_models and self.hf_token:
            print("Attempting to enhance with Hugging Face models...")
            try:
                print("Note: Hugging Face model integration requires further implementation.")
            except Exception as e:
                print(f"Hugging Face enhancement failed: {e}")
        print("Converting to Word document...")
        word_doc = self.convert_to_word(pdf_content, output_path)
        print("Processing complete!")
        return word_doc


def main():
    HF_TOKEN = os.getenv("HF_API_TOKEN")

    converter = PDFToWordConverter(huggingface_token=HF_TOKEN)
    pdf_path = "supplychain (1).pdf"
    output_path = "converted_document_enhanced.docx"

    try:
        word_document = converter.process_pdf_to_word(
            pdf_path=pdf_path,
            output_path=output_path,
            use_hf_models=False
        )
        print(f"Successfully converted '{pdf_path}' to '{output_path}'")
        print(f"Open '{output_path}' in Microsoft Word to view the result!")
    except FileNotFoundError as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
requirements.txt
ADDED
@@ -0,0 +1,12 @@
PyMuPDF==1.23.26
Pillow==10.0.0
requests==2.31.0
transformers==4.35.0
torch==2.1.0
numpy==1.24.0
flask==2.3.3
flask-cors==4.0.0
werkzeug==2.3.7
camelot-py[cv]==0.11.0
gunicorn==21.2.0
python-docx==1.1.0
static/index.html
ADDED
@@ -0,0 +1,896 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>PDF Converter Tool</title>
    <style>
      * {
        margin: 0;
        padding: 0;
        box-sizing: border-box;
      }

      body {
        font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        min-height: 100vh;
        display: flex;
        align-items: center;
        justify-content: center;
        padding: 20px;
      }

      .container {
        background: rgba(255, 255, 255, 0.95);
        backdrop-filter: blur(10px);
        padding: 40px;
        border-radius: 20px;
        box-shadow: 0 20px 40px rgba(0, 0, 0, 0.1);
        max-width: 600px;
        width: 100%;
        animation: slideIn 0.6s ease-out;
      }

      @keyframes slideIn {
        from {
          opacity: 0;
          transform: translateY(30px);
        }
        to {
          opacity: 1;
          transform: translateY(0);
        }
      }

      .header {
        text-align: center;
        margin-bottom: 40px;
      }

      .header h1 {
        color: #333;
        font-size: 2.5em;
        margin-bottom: 10px;
        background: linear-gradient(45deg, #667eea, #764ba2);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        background-clip: text;
      }

      .header p {
        color: #666;
        font-size: 1.1em;
      }

      .status-indicator {
        position: absolute;
        top: 20px;
        right: 20px;
        padding: 8px 16px;
        border-radius: 20px;
        font-size: 0.8em;
        font-weight: 600;
        text-transform: uppercase;
        letter-spacing: 0.5px;
      }

      .status-online {
        background: #d4edda;
        color: #155724;
        border: 1px solid #c3e6cb;
      }

      .status-offline {
        background: #f8d7da;
        color: #721c24;
        border: 1px solid #f5c6cb;
      }

      .conversion-options {
        display: grid;
        gap: 20px;
        margin-bottom: 30px;
      }

      .option-card {
        background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
        border: none;
        border-radius: 15px;
        padding: 25px;
        cursor: pointer;
        transition: all 0.3s ease;
        color: white;
        text-align: left;
        position: relative;
        overflow: hidden;
      }

      .option-card:hover {
        transform: translateY(-5px);
        box-shadow: 0 15px 30px rgba(0, 0, 0, 0.2);
      }

      .option-card.html {
        background: linear-gradient(135deg, #fa709a 0%, #fee140 100%);
      }

      .option-card.word {
        background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
      }

      .option-card.json {
        background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%);
      }

      /* New style for Excel option card */
      .option-card.excel {
        background: linear-gradient(135deg, #2ecc71 0%, #27ae60 100%); /* Green shades for Excel */
      }

      .option-card::before {
        content: "";
        position: absolute;
        top: 0;
        left: -100%;
        width: 100%;
        height: 100%;
        background: linear-gradient(
          90deg,
          transparent,
          rgba(255, 255, 255, 0.2),
          transparent
        );
        transition: left 0.5s;
      }

      .option-card:hover::before {
        left: 100%;
      }

      .option-icon {
        font-size: 2em;
        margin-bottom: 10px;
      }

      .option-title {
        font-size: 1.3em;
        font-weight: bold;
        margin-bottom: 5px;
      }

      .option-desc {
        font-size: 0.9em;
        opacity: 0.9;
      }

      .upload-section {
        display: none;
        background: #f8f9fa;
        border-radius: 15px;
        padding: 30px;
        margin-top: 20px;
        border: 2px dashed #ddd;
        transition: all 0.3s ease;
      }

      .upload-section.active {
        display: block;
        animation: fadeIn 0.5s ease-out;
      }

      @keyframes fadeIn {
        from {
          opacity: 0;
        }
        to {
          opacity: 1;
        }
      }

      .file-input-wrapper {
        position: relative;
        display: inline-block;
        width: 100%;
        margin-bottom: 20px;
      }

      .file-input {
        display: none;
      }

      .file-input-label {
        display: block;
        padding: 15px 25px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        border-radius: 10px;
        cursor: pointer;
        text-align: center;
        transition: all 0.3s ease;
        font-weight: 500;
      }

      .file-input-label:hover {
        transform: translateY(-2px);
        box-shadow: 0 10px 20px rgba(0, 0, 0, 0.2);
      }

      .file-name {
        margin-top: 10px;
        padding: 10px;
        background: #e9ecef;
        border-radius: 8px;
        font-size: 0.9em;
        color: #495057;
        display: none;
      }

      .output-name {
        width: 100%;
        padding: 15px;
        border: 2px solid #e9ecef;
        border-radius: 10px;
        font-size: 1em;
        margin-bottom: 20px;
        transition: border-color 0.3s ease;
      }

      .output-name:focus {
        outline: none;
        border-color: #667eea;
      }

      .convert-btn {
        width: 100%;
        padding: 15px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        border: none;
        border-radius: 10px;
        font-size: 1.1em;
        font-weight: 600;
        cursor: pointer;
        transition: all 0.3s ease;
        position: relative;
        overflow: hidden;
      }

      .convert-btn:hover {
        transform: translateY(-2px);
        box-shadow: 0 10px 20px rgba(0, 0, 0, 0.2);
      }

      .convert-btn:disabled {
        opacity: 0.7;
        cursor: not-allowed;
        transform: none;
      }

      .back-btn {
        background: #6c757d;
        color: white;
        border: none;
        padding: 10px 20px;
        border-radius: 8px;
        cursor: pointer;
        margin-bottom: 20px;
        transition: all 0.3s ease;
      }

      .back-btn:hover {
        background: #5a6268;
        transform: translateY(-1px);
      }

      .progress-bar {
        width: 100%;
        height: 6px;
        background: #e9ecef;
        border-radius: 3px;
        margin: 20px 0;
        overflow: hidden;
        display: none;
      }

      .progress-fill {
        height: 100%;
        background: linear-gradient(90deg, #667eea, #764ba2);
        width: 0%;
        transition: width 0.3s ease;
        border-radius: 3px;
      }

      .result-section {
        margin-top: 20px;
        padding: 20px;
        border-radius: 12px;
        display: none;
      }

      .result-success {
        background: linear-gradient(135deg, #d4edda 0%, #c3e6cb 100%);
        border: 1px solid #c3e6cb;
        color: #155724;
      }

      .result-error {
        background: linear-gradient(135deg, #f8d7da 0%, #f5c6cb 100%);
        border: 1px solid #f5c6cb;
        color: #721c24;
      }

      .loading {
        display: none;
        text-align: center;
        margin: 20px 0;
      }

      .spinner {
        border: 4px solid #f3f3f3;
        border-top: 4px solid #667eea;
        border-radius: 50%;
        width: 40px;
        height: 40px;
        animation: spin 1s linear infinite;
        margin: 0 auto 10px;
      }

      @keyframes spin {
        0% {
          transform: rotate(0deg);
        }
        100% {
          transform: rotate(360deg);
        }
      }

      .drag-over {
        border-color: #667eea !important;
        background: rgba(102, 126, 234, 0.1) !important;
      }

      .debug-info {
        margin-top: 20px;
        padding: 15px;
        background: #f8f9fa;
        border-radius: 8px;
        font-size: 0.9em;
        color: #6c757d;
        border-left: 4px solid #007bff;
      }

      .error-details {
        margin-top: 10px;
        padding: 10px;
        background: #fff3cd;
        border: 1px solid #ffeaa7;
        border-radius: 6px;
        font-size: 0.85em;
        color: #856404;
        max-height: 200px;
        overflow-y: auto;
      }

      @media (max-width: 768px) {
        .container {
          padding: 25px;
          margin: 10px;
        }

        .header h1 {
          font-size: 2em;
        }

        .option-card {
          padding: 20px;
        }

        .status-indicator {
          position: relative;
          top: auto;
          right: auto;
          margin-bottom: 20px;
          display: inline-block;
        }
      }
    </style>
  </head>
  <body>
    <div class="container">
      <div id="status-indicator" class="status-indicator status-offline">
        Server Offline
      </div>

      <div class="header">
        <h1>🔧 PDF Converter</h1>
        <p>Convert your PDF files to HTML, Word, JSON, or Excel format</p>
      </div>

      <div id="main-menu">
        <div class="conversion-options">
          <button class="option-card html" onclick="showUploadSection('html')">
            <div class="option-icon">📄</div>
            <div class="option-title">Convert to HTML</div>
            <div class="option-desc">
              Transform PDF into web-ready HTML format
            </div>
          </button>

          <button class="option-card word" onclick="showUploadSection('word')">
            <div class="option-icon">📝</div>
            <div class="option-title">Convert to Word</div>
            <div class="option-desc">
              Create editable Word documents from PDF
            </div>
          </button>

          <button class="option-card json" onclick="showUploadSection('json')">
            <div class="option-icon">📋</div>
            <div class="option-title">Convert to JSON</div>
            <div class="option-desc">
              Extract structured data in JSON format
            </div>
          </button>

          <button class="option-card excel" onclick="showUploadSection('excel')">
            <div class="option-icon">📊</div>
            <div class="option-title">Convert to Excel</div>
            <div class="option-desc">
              Organize PDF tables into an Excel spreadsheet
            </div>
          </button>
        </div>
      </div>

      <div id="upload-section" class="upload-section">
        <button class="back-btn" onclick="showMainMenu()">
          ← Back to Menu
        </button>

        <div class="file-input-wrapper">
          <input
            type="file"
            id="pdf-file"
            class="file-input"
            accept=".pdf"
            onchange="handleFileSelect(event)"
          />
          <label for="pdf-file" class="file-input-label" id="file-label">
            📁 Choose PDF File or Drag & Drop Here
          </label>
          <div id="file-name" class="file-name"></div>
        </div>

        <div class="loading" id="loading">
          <div class="spinner"></div>
          <p>Converting your PDF file...</p>
        </div>

        <div class="progress-bar" id="progress-bar">
          <div class="progress-fill" id="progress-fill"></div>
        </div>

        <button
          class="convert-btn"
          id="convert-btn"
          onclick="convertFile()"
          disabled
        >
          🚀 Start Conversion
        </button>

        <div id="result-section" class="result-section">
          <div id="result-message"></div>
        </div>

        <div id="debug-info" class="debug-info" style="display: none">
          <strong>Debug Information:</strong>
          <div id="debug-content"></div>
        </div>
      </div>
    </div>
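
    <!--
      Wiring overview: the script below polls /health every 30 seconds to toggle
      the status badge, validates the selected PDF, POSTs it to /convert as
      multipart form data, and auto-downloads the result on success.
    -->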
    <script>
      let currentFormat = "";
      let selectedFile = null;
      let serverOnline = false;

      // Check server status on page load
      document.addEventListener("DOMContentLoaded", function () {
        checkServerStatus();
        // Check server status every 30 seconds
        setInterval(checkServerStatus, 30000);
      });

      async function checkServerStatus() {
        try {
          const response = await fetch("/health", {
            method: "GET",
            mode: "cors",
            headers: {
              Accept: "application/json",
            },
            signal: AbortSignal.timeout(5000), // 5 second timeout
          });

          if (response.ok) {
            const data = await response.json();
            updateServerStatus(true, data.message || "Server is online");
          } else {
            updateServerStatus(false, `Server returned ${response.status}`);
          }
        } catch (error) {
          updateServerStatus(false, error.message);
        }
      }

      function updateServerStatus(online, message) {
        serverOnline = online;
        const indicator = document.getElementById("status-indicator");

        if (online) {
          indicator.className = "status-indicator status-online";
          indicator.textContent = "Server Online";
          indicator.title = message;
        } else {
          indicator.className = "status-indicator status-offline";
          indicator.textContent = "Server Offline";
          indicator.title = `Error: ${message}`;
        }
      }

      function showUploadSection(format) {
        if (!serverOnline) {
          alert("Server is offline. Please start the Flask server first.");
          return;
        }

        currentFormat = format;
        document.getElementById("main-menu").style.display = "none";
        document.getElementById("upload-section").classList.add("active");

        resetForm(); // ✅ Always reset when entering the upload section

        // There is no output-name input in the markup; the output filename is
        // derived from the uploaded file's name in convertFile(), and the chosen
        // format maps to .html, .docx, .json, or .xlsx.
      }

      function showMainMenu() {
        document.getElementById("main-menu").style.display = "block";
        document.getElementById("upload-section").classList.remove("active");
        resetForm();
      }

      function resetForm() {
        selectedFile = null;

        const pdfInput = document.getElementById("pdf-file");
        const fileName = document.getElementById("file-name");
        const fileLabel = document.getElementById("file-label");

        // Clear inputs
        pdfInput.value = "";

        // Hide filename display
        fileName.style.display = "none";
        fileName.textContent = "";

        // Reset label text
        fileLabel.textContent = "📁 Choose PDF File or Drag & Drop Here";

        // Reset buttons and sections
        document.getElementById("convert-btn").disabled = true;
        document.getElementById("result-section").style.display = "none";
        document.getElementById("loading").style.display = "none";
        document.getElementById("progress-bar").style.display = "none";
        document.getElementById("debug-info").style.display = "none";

        // Also reset drag-over styling if stuck
        document.getElementById("upload-section").classList.remove("drag-over");
      }

      function handleFileSelect(event) {
        const file = event.target.files[0];
        if (file && file.type === "application/pdf") {
          selectedFile = file;
          document.getElementById("file-name").textContent = `Selected: ${
            file.name
          } (${(file.size / 1024 / 1024).toFixed(2)} MB)`;
          document.getElementById("file-name").style.display = "block";
          document.getElementById(
            "file-label"
          ).textContent = `✅ ${file.name} selected`;
          checkFormValidity();
        } else {
          alert("Please select a valid PDF file.");
          resetFileInput();
        }
      }

      function resetFileInput() {
        selectedFile = null;
        document.getElementById("pdf-file").value = "";
        document.getElementById("file-name").style.display = "none";
        document.getElementById("file-label").textContent =
          "📁 Choose PDF File or Drag & Drop Here";
        checkFormValidity();
      }

      function checkFormValidity() {
        const convertBtn = document.getElementById("convert-btn");
        if (selectedFile && serverOnline) {
          convertBtn.disabled = false;
          convertBtn.textContent = "🚀 Start Conversion";
        } else {
          convertBtn.disabled = true;
          convertBtn.textContent = serverOnline
            ? "🚀 Start Conversion"
            : "❌ Server Offline";
        }
      }

      // Drag and drop functionality
      const uploadSection = document.getElementById("upload-section");

      ["dragenter", "dragover", "dragleave", "drop"].forEach((eventName) => {
        uploadSection.addEventListener(eventName, preventDefaults, false);
      });

      function preventDefaults(e) {
        e.preventDefault();
        e.stopPropagation();
      }

      ["dragenter", "dragover"].forEach((eventName) => {
        uploadSection.addEventListener(eventName, highlight, false);
      });

      ["dragleave", "drop"].forEach((eventName) => {
        uploadSection.addEventListener(eventName, unhighlight, false);
      });

      function highlight() {
        uploadSection.classList.add("drag-over");
      }

      function unhighlight() {
        uploadSection.classList.remove("drag-over");
      }

      uploadSection.addEventListener("drop", handleDrop, false);

      function handleDrop(e) {
        const dt = e.dataTransfer;
        const files = dt.files;

        if (files.length > 0) {
          const file = files[0];
          if (file.type === "application/pdf") {
            selectedFile = file;
            document.getElementById("file-name").textContent = `Selected: ${
              file.name
            } (${(file.size / 1024 / 1024).toFixed(2)} MB)`;
            document.getElementById("file-name").style.display = "block";
            document.getElementById(
              "file-label"
            ).textContent = `✅ ${file.name} selected`;
            checkFormValidity();
          } else {
            alert("Please drop a valid PDF file.");
          }
        }
      }

      async function convertFile() {
        if (!selectedFile || !currentFormat) {
          alert("Please select a file and format.");
          return;
        }

        if (!serverOnline) {
          alert("Server is offline. Please start the Flask server first.");
          return;
        }

        // Derive the output filename from the uploaded file's name
        const outputName = selectedFile.name.replace(/\.[^/.]+$/, "");

        document.getElementById("loading").style.display = "block";
        document.getElementById("progress-bar").style.display = "block";
        document.getElementById("convert-btn").disabled = true;
        document.getElementById("result-section").style.display = "none";
        document.getElementById("debug-info").style.display = "none";

        simulateProgress();

        const formData = new FormData();
        formData.append("file", selectedFile);
        formData.append("format", currentFormat);
        formData.append("output_name", outputName);

        const debugInfo = {
          fileName: selectedFile.name,
          fileSize: selectedFile.size,
          format: currentFormat,
          outputName: outputName,
          timestamp: new Date().toISOString(),
        };

        try {
          console.log("🚀 Starting conversion...", debugInfo);

          const controller = new AbortController();
          const timeoutId = setTimeout(() => controller.abort(), 420000); // 7-minute (420,000 ms) timeout

          const response = await fetch("/convert", {
            method: "POST",
            body: formData,
            headers: {
              Accept: "application/json",
            },
            mode: "cors",
            signal: controller.signal,
          });

          clearTimeout(timeoutId);
          console.log("📡 Response status:", response.status);

          if (!response.ok) {
            const errorText = await response.text();
            throw new Error(`Server returned ${response.status}: ${errorText}`);
          }

          const result = await response.json();
          console.log("✅ Conversion result:", result);

          // Hide loading
          document.getElementById("loading").style.display = "none";
          document.getElementById("progress-bar").style.display = "none";

          // Show result
          const resultSection = document.getElementById("result-section");
          const resultMessage = document.getElementById("result-message");

          if (result.success) {
            resultSection.className = "result-section result-success";
            resultMessage.innerHTML = `<h3>✅ Conversion Successful!</h3>
              <p>Your PDF has been converted to ${currentFormat.toUpperCase()} format.</p>
              <p><strong>Output file:</strong> ${
                result.output_path || "Generated successfully"
              }</p>`;

            if (result.download_url) {
              const downloadUrl = `${window.location.origin}${result.download_url}`;

              // Add link for user
              resultMessage.innerHTML += `<p><a href="${downloadUrl}" target="_blank" style="color: #155724; text-decoration: none; font-weight: bold;">📥 Download File</a></p>`;

              // ⬇️ Auto-download
              const a = document.createElement("a");
              a.href = downloadUrl;
              a.download = result.output_path || "converted_file";
              document.body.appendChild(a);
              a.click();
              document.body.removeChild(a);
            }
          } else {
            resultSection.className = "result-section result-error";
            resultMessage.innerHTML = `
              <h3>❌ Conversion Failed</h3>
              <p>${
                result.error || "An unexpected error occurred."
              }</p>
            `;
          }

          resultSection.style.display = "block";
        } catch (error) {
          console.error("❌ Error during conversion:", error);

          // Hide loading
          document.getElementById("loading").style.display = "none";
          document.getElementById("progress-bar").style.display = "none";

          // Show error
          const resultSection = document.getElementById("result-section");
          const resultMessage = document.getElementById("result-message");

          resultSection.className = "result-section result-error";

          let errorMessage = "An unexpected error occurred.";
          if (error.name === "AbortError") {
            errorMessage =
              "Request timed out. The file might be too large or the server is taking too long to respond.";
          } else if (error.message.includes("Failed to fetch")) {
            errorMessage =
              "Cannot connect to server. Please ensure the Flask server is running on http://localhost:5000";
          } else {
            errorMessage = error.message;
          }

          resultMessage.innerHTML = `
            <h3>❌ Conversion Error</h3>
            <p>${errorMessage}</p>
          `;

          resultSection.style.display = "block";

          // Show debug information
          const debugElement = document.getElementById("debug-info");
          const debugContent = document.getElementById("debug-content");
          debugContent.innerHTML = `
            <div class="error-details">
              <strong>Error Details:</strong><br>
              Type: ${error.name}<br>
              Message: ${error.message}<br>
              <br>
              <strong>Request Details:</strong><br>
              ${JSON.stringify(debugInfo, null, 2)}
              <br>
              <strong>Troubleshooting:</strong><br>
              1. Ensure Flask server is running: python app.py<br>
              2. Check server logs for errors<br>
              3. Verify file size is under 100MB<br>
              4. Check browser console for additional errors
            </div>
          `;
          debugElement.style.display = "block";
        }

        document.getElementById("convert-btn").disabled = false;
        checkFormValidity(); // Update button state
      }
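
      // Note: the progress bar below is cosmetic; it advances in random steps
      // and parks at 90% until the /convert request actually resolves.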
      function simulateProgress() {
        const progressFill = document.getElementById("progress-fill");
        let progress = 0;

        const interval = setInterval(() => {
          progress += Math.random() * 15;
          if (progress > 90) progress = 90;

          progressFill.style.width = progress + "%";

          if (progress >= 90) {
            clearInterval(interval);
          }
        }, 200);

        // Reset progress after animation
        setTimeout(() => {
          progressFill.style.width = "0%";
        }, 5000);
      }
    </script>
  </body>
</html>
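
Two notes on wiring, since the app.py excerpt above ends before the remaining routes. First, the page polls a /health endpoint every 30 seconds to drive the status badge; a minimal sketch of such a route, assuming the full app.py defines something equivalent (names here are illustrative):

```python
# Sketch only: assumes the Flask `app` object defined in app.py.
from flask import jsonify

@app.route('/health', methods=['GET'])
def health():
    """Answer the frontend's 30-second poll so the badge flips to 'Server Online'."""
    return jsonify({'success': True, 'message': 'PDF converter server is running'})
```

Second, a hedged example of exercising /convert from Python rather than the browser; `requests` is already pinned in requirements.txt, the field names match the FormData the page builds, and `sample.pdf` is a placeholder path:

```python
import requests

with open('sample.pdf', 'rb') as f:  # placeholder input file
    resp = requests.post(
        'http://localhost:5000/convert',
        files={'file': ('sample.pdf', f, 'application/pdf')},
        data={'format': 'excel', 'output_name': 'sample'},
        timeout=420,  # mirrors the page's 7-minute client-side abort
    )
print(resp.status_code, resp.json())
```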