import os import base64 import json import requests from typing import Dict, List, Any import fitz # PyMuPDF from PIL import Image import io import re from dataclasses import dataclass from pathlib import Path from datetime import datetime @dataclass class TextBlock: text: str x: float y: float width: float height: float font_size: float font_name: str is_bold: bool = False is_italic: bool = False block_id: str = "" class PDFToHTMLConverter: def __init__(self, huggingface_token: str = None): self.hf_token = huggingface_token self.hf_headers = { "Authorization": f"Bearer {huggingface_token}" if huggingface_token else None } self.models = { "document_layout": "microsoft/layoutlm-base-uncased", "table_detection": "microsoft/table-transformer-detection", "ocr": "microsoft/trocr-base-printed", "math_detection": "facebook/detr-resnet-50" } self.hf_inference_url = "https://api-inference.huggingface.co/models" def pdf_to_base64(self, pdf_path: str) -> str: try: with open(pdf_path, "rb") as pdf_file: return base64.b64encode(pdf_file.read()).decode('utf-8') except Exception as e: raise Exception(f"Error converting PDF to base64: {str(e)}") def extract_pdf_content(self, pdf_path: str) -> Dict[str, Any]: doc = None try: if not os.path.exists(pdf_path): raise FileNotFoundError(f"PDF file not found: {pdf_path}") doc = fitz.open(pdf_path) if doc is None: raise RuntimeError("Failed to open PDF document") if doc.page_count == 0: raise ValueError("PDF document has no pages") print(f"📄 PDF opened successfully: {doc.page_count} pages") pages_content = [] for page_num in range(doc.page_count): try: page = doc[page_num] print(f"🔄 Processing page {page_num + 1}/{doc.page_count}") text_blocks = [] try: page_dict = page.get_text("dict") text_blocks = self._extract_text_blocks_from_dict(page_dict, page_num) except Exception as e: print(f"⚠️ Dict method failed for page {page_num + 1}, falling back to simple text extraction: {e}") text_blocks = self._extract_text_blocks_simple(page, page_num) images = self._extract_images_safely(page, doc, page_num) tables = self._detect_tables_safely(page) page_rect = page.rect pages_content.append({ "page_number": page_num + 1, "text_blocks": text_blocks, "images": images, "tables": tables, "page_width": page_rect.width, "page_height": page_rect.height }) except Exception as e: print(f"❌ Error processing page {page_num + 1}: {e}") pages_content.append({ "page_number": page_num + 1, "text_blocks": [], "images": [], "tables": [], "page_width": 595, "page_height": 842 }) result = { "pages": pages_content, "total_pages": doc.page_count } return result except Exception as e: raise Exception(f"Error extracting PDF content: {str(e)}") finally: if doc is not None: try: doc.close() print("✅ PDF document closed successfully") except Exception as e: print(f"⚠️ Error closing PDF document: {e}") def _extract_text_blocks_from_dict(self, page_dict: dict, page_num: int) -> List[TextBlock]: text_blocks = [] for block_idx, block in enumerate(page_dict.get("blocks", [])): if "lines" not in block: continue for line_idx, line in enumerate(block["lines"]): for span_idx, span in enumerate(line["spans"]): text_content = span.get("text", "").strip() if text_content: bbox = span["bbox"] font_info = { "size": span.get("size", 12), "font": span.get("font", "Arial"), "is_bold": "bold" in span.get("font", "").lower() or span.get("flags", 0) & 16, "is_italic": "italic" in span.get("font", "").lower() or span.get("flags", 0) & 2 } text_block = TextBlock( text=text_content, x=bbox[0], y=bbox[1], width=bbox[2] - bbox[0], height=bbox[3] - bbox[1], font_size=font_info["size"], font_name=font_info["font"], is_bold=font_info["is_bold"], is_italic=font_info["is_italic"], block_id=f"p{page_num}-b{block_idx}-l{line_idx}-s{span_idx}" ) text_blocks.append(text_block) return text_blocks def _extract_text_blocks_simple(self, page, page_num: int) -> List[TextBlock]: text_blocks = [] try: blocks_data = page.get_text("blocks") for block_idx, block in enumerate(blocks_data): if block[6] == 0: text = block[4].strip() if text: x0, y0, x1, y1 = block[0], block[1], block[2], block[3] lines = text.split('\n') line_height = (y1 - y0) / max(len(lines), 1) for line_idx, line in enumerate(lines): if line.strip(): text_block = TextBlock( text=line.strip(), x=x0, y=y0 + (line_idx * line_height), width=x1 - x0, height=line_height, font_size=12, font_name="Arial", is_bold=False, is_italic=False, block_id=f"p{page_num}-simple-b{block_idx}-l{line_idx}" ) text_blocks.append(text_block) except Exception as e: print(f"⚠️ Simple text block extraction failed: {e}") return text_blocks def _extract_images_safely(self, page, doc, page_num) -> List[Dict]: images = [] try: image_list = page.get_images(full=True) for img_index, img_info in enumerate(image_list): try: xref = img_info[0] img_rects = [r for r in page.get_image_rects(xref)] if not img_rects: continue bbox = img_rects[0] pix = fitz.Pixmap(doc, xref) if pix.n - pix.alpha < 4: img_data = pix.tobytes("png") img_base64 = base64.b64encode(img_data).decode() images.append({ "index": img_index, "data": img_base64, "bbox": (bbox.x0, bbox.y0, bbox.x1, bbox.y1) }) pix = None except Exception as e: print(f"⚠️ Error extracting image {img_index} on page {page_num+1}: {e}") continue except Exception as e: print(f"⚠️ General error in image extraction for page {page_num+1}: {e}") return images def _detect_tables_safely(self, page) -> List[Dict]: tables = [] try: tabs = page.find_tables() for tab_index, tab in enumerate(tabs): try: table_data = tab.extract() if table_data: cleaned_data = [] for row in table_data: cleaned_row = [str(cell).strip() if cell else "" for cell in row] if any(cleaned_row): cleaned_data.append(cleaned_row) if cleaned_data: tables.append({ "bbox": (tab.bbox.x0, tab.bbox.y0, tab.bbox.x1, tab.bbox.y1), "data": cleaned_data }) except Exception as e: print(f"⚠️ Error extracting table {tab_index}: {e}") continue except Exception as e: print(f"⚠️ General error in table detection: {e}") return tables def enhance_math_symbols(self, text: str) -> str: math_replacements = { '±': '±', '×': '×', '÷': '÷', '∑': '∑', '∏': '∏', '√': '√', '∞': '∞', '∫': '∫', '∂': '∂', '∆': 'Δ', '∇': '∇', '∈': '∈', '∉': '∉', '⊂': '⊂', '⊃': '⊃', '⊆': '⊆', '⊇': '⊇', '∪': '∪', '∩': '∩', '≤': '≤', '≥': '≥', '≠': '≠', '≡': '≡', '≈': '≈', '∝': '∝', '∴': '∴', 'α': 'α', 'β': 'β', 'γ': 'γ', 'δ': 'δ', 'ε': 'ε', 'ζ': 'ζ', 'η': 'η', 'θ': 'θ', 'ι': 'ι', 'κ': 'κ', 'λ': 'λ', 'μ': 'μ', 'ν': 'ν', 'ξ': 'ξ', 'π': 'π', 'ρ': 'ρ', 'σ': 'σ', 'τ': 'τ', 'υ': 'υ', 'φ': 'φ', 'χ': 'χ', 'ψ': 'ψ', 'ω': 'ω', '½': '½', '⅓': '⅓', '¼': '¼', '⅔': '⅔', '¾': '¾', '⅛': '⅛', '²': '²', '³': '³', '¹': '¹', '°': '°' } for symbol, html_entity in math_replacements.items(): text = text.replace(symbol, html_entity) return text def convert_to_html(self, pdf_content: Dict[str, Any], output_path: str = None) -> str: html_content = [] html_content.append("""
Total Pages: {pdf_content.get('total_pages', 'Unknown')}
Converted on: {self._get_current_timestamp()}