amit01Xindus commited on
Commit
96c003e
·
verified ·
1 Parent(s): d1e9a85

Upload 8 files

Browse files
Files changed (8) hide show
  1. app.py +180 -0
  2. dockerfile +53 -0
  3. pdf_excel.py +737 -0
  4. pdf_html.py +636 -0
  5. pdf_json.py +513 -0
  6. pdf_word.py +559 -0
  7. requirements.txt +12 -0
  8. static/index.html +896 -0
app.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify, send_file, send_from_directory
2
+ from flask_cors import CORS
3
+ from werkzeug.utils import secure_filename
4
+ import os
5
+ import traceback
6
+ from pdf_html import PDFToHTMLConverter
7
+ from pdf_word import PDFToWordConverter
8
+ from pdf_json import PDFToJSONConverter
9
+ from pdf_excel import PDFToExcelConverter
10
+
11
# Flask application; static assets are served from the ./static folder.
app = Flask(__name__, static_folder='static')
CORS(app)

# Configure file size limits and folders
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024  # 100 MB limit
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['OUTPUT_FOLDER'] = 'outputs'
# Never ship a hard-coded secret: read it from the environment and fall back
# to a random per-process key when SECRET_KEY is not set.
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY') or os.urandom(32).hex()

# Create necessary directories if they don't exist
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)

# Hugging Face API token, taken from the HF_TOKEN environment variable.
# None (instead of a placeholder string) means "no token configured", so the
# converters never send a bogus bearer token.
HF_TOKEN = os.environ.get('HF_TOKEN') or None
26
+
27
# Upload whitelist: only files with one of these extensions are accepted.
ALLOWED_EXTENSIONS = {'pdf'}


def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The comparison is case-insensitive and only the text after the last dot
    is considered; a name without any dot is rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS
33
+
34
@app.route('/')
def serve_index():
    """Return the front-end entry page (index.html) from the static directory."""
    index_page = 'index.html'
    return send_from_directory('static', index_page)
38
+
39
@app.route('/<path:filename>')
def serve_static(filename):
    """Return any other requested asset (CSS, JS, etc.) from the static directory."""
    static_dir = 'static'
    return send_from_directory(static_dir, filename)
43
+
44
@app.route('/convert', methods=['POST'])
def convert_pdf():
    """
    Handles PDF conversion requests.

    Expects multipart form data with a 'file' (PDF), a 'format'
    (html, word, json, excel) and an optional 'output_name' (base name of the
    converted file, default 'converted_file').

    Returns a JSON body containing 'success' plus either 'message' and
    'download_url' (HTTP 200) or 'error' (HTTP 400 for invalid input, HTTP 500
    when the conversion itself fails). The uploaded file is always removed
    before the response is returned.
    """
    import uuid  # local import: only needed to build collision-free upload names

    # Output file extension for every supported target format
    extensions = {
        'html': '.html',
        'word': '.docx',
        'json': '.json',
        'excel': '.xlsx'
    }
    input_path = None

    try:
        # Check if a file was included in the request
        if 'file' not in request.files:
            return jsonify({'success': False, 'error': 'No file uploaded.'}), 400

        file = request.files['file']
        format_type = request.form.get('format')
        output_name = request.form.get('output_name', 'converted_file')

        # Validate file and format
        if file.filename == '':
            return jsonify({'success': False, 'error': 'No file selected.'}), 400

        if format_type not in extensions:
            return jsonify({'success': False, 'error': 'Invalid format specified. Must be html, word, json, or excel.'}), 400

        if not allowed_file(file.filename):
            return jsonify({'success': False, 'error': 'Only PDF files are allowed.'}), 400

        # secure_filename() can reduce a name to an empty string (for example a
        # purely non-ASCII name), and two clients may upload files with the
        # same name at the same time, so store the upload under a unique name.
        filename_secured = secure_filename(file.filename) or 'upload.pdf'
        input_path = os.path.join(
            app.config['UPLOAD_FOLDER'],
            f"{uuid.uuid4().hex}_{filename_secured}"
        )
        file.save(input_path)

        # 'output_name' is client-controlled. Sanitize it so it cannot carry
        # path separators or an absolute path that would make os.path.join()
        # write outside OUTPUT_FOLDER.
        safe_output_name = secure_filename(output_name.replace('.', '')) or 'converted_file'
        output_filename = f"{safe_output_name}{extensions[format_type]}"
        output_path = os.path.join(app.config['OUTPUT_FOLDER'], output_filename)

        success_message = ""

        try:
            # Perform conversion based on the requested format
            if format_type == 'html':
                converter = PDFToHTMLConverter(huggingface_token=HF_TOKEN)
                try:
                    # First try with HF models
                    converter.process_pdf(pdf_path=input_path, output_path=output_path, use_hf_models=True)
                except AttributeError as ae:
                    if '_group_overlapping_text' not in str(ae):
                        raise
                    # Fall back to non-HF mode if the method is missing
                    converter.process_pdf(pdf_path=input_path, output_path=output_path, use_hf_models=False)
                success_message = "Successfully converted to HTML!"
            elif format_type == 'word':
                converter = PDFToWordConverter(huggingface_token=HF_TOKEN)
                converter.process_pdf_to_word(pdf_path=input_path, output_path=output_path, use_hf_models=False)
                success_message = "Successfully converted to Word!"
            elif format_type == 'json':
                converter = PDFToJSONConverter(huggingface_token=HF_TOKEN)
                converter.process_pdf_to_json(pdf_path=input_path, output_path=output_path, use_hf_models=False)
                success_message = "Successfully converted to JSON!"
            else:  # 'excel'
                # The Excel converter uses no Hugging Face models, so it is
                # built and called without the token / use_hf_models arguments.
                converter = PDFToExcelConverter()
                converter.process_pdf_to_excel(pdf_path=input_path, output_path=output_path)
                success_message = "Successfully converted to Excel!"
        except Exception:
            # Clean up a partially written output file if conversion failed
            if os.path.exists(output_path):
                try:
                    os.remove(output_path)
                except OSError as e:
                    print(f"Warning: Could not remove output file {output_path}: {e}")
            raise  # bare raise keeps the original traceback

        # Return success response with download URL
        return jsonify({
            'success': True,
            'message': success_message,
            'download_url': f'/download/{output_filename}'
        }), 200

    except Exception as e:
        traceback.print_exc()
        error_msg = str(e)
        if '_group_overlapping_text' in error_msg:
            error_msg = "HTML conversion failed due to incompatible converter version. Please try another format."
        return jsonify({
            'success': False,
            'error': f'Conversion failed: {error_msg}'
        }), 500

    finally:
        # Remove the uploaded input file on every exit path (success or error)
        if input_path and os.path.exists(input_path):
            try:
                os.remove(input_path)
            except OSError as cleanup_e:
                print(f"Warning: Could not remove input file {input_path}: {cleanup_e}")
150
+
151
@app.route('/download/<filename>')
def download_file(filename):
    """
    Send a converted file from OUTPUT_FOLDER as an attachment.

    Returns the file on success, a JSON 404 when no such file exists, and a
    JSON 500 for unexpected errors. HTTP errors raised by Flask/werkzeug
    (for example a rejected unsafe path) keep their own status code.
    """
    # Local import; werkzeug is already a dependency of this module.
    from werkzeug.exceptions import HTTPException

    # Resolve the folder against the working directory once, so the existence
    # check and send_from_directory() look at the same absolute location
    # (send_from_directory resolves relative paths against the app root).
    output_dir = os.path.abspath(app.config['OUTPUT_FOLDER'])
    try:
        if not os.path.isfile(os.path.join(output_dir, filename)):
            return jsonify({'error': 'File not found.'}), 404
        return send_from_directory(output_dir, filename, as_attachment=True)
    except HTTPException:
        # Do not turn a 404/403 from werkzeug into a generic 500.
        raise
    except Exception as e:
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
162
+
163
@app.route('/health')
def health_check():
    """Liveness probe: always answers HTTP 200 with a fixed JSON payload."""
    payload = {'status': 'healthy', 'message': 'PDF Converter API is running.'}
    return jsonify(payload), 200
167
+
168
@app.errorhandler(413)
def too_large(e):
    """Answer HTTP 413 (request body over MAX_CONTENT_LENGTH) with a JSON error."""
    body = {'success': False, 'error': 'File too large. Maximum size is 100MB.'}
    return jsonify(body), 413
172
+
173
@app.errorhandler(500)
def internal_error(e):
    """Print the current traceback and answer HTTP 500 with a generic JSON error."""
    traceback.print_exc()
    body = {'success': False, 'error': 'Internal server error occurred.'}
    return jsonify(body), 500
178
+
179
if __name__ == '__main__':
    # Development entry point. The Werkzeug debugger allows arbitrary code
    # execution, so it must not be enabled by default while the server
    # listens on all interfaces; opt in explicitly with FLASK_DEBUG=1.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
    app.run(debug=debug_enabled, host='0.0.0.0', port=5000)
dockerfile ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use a minimal Python base image
FROM python:3.10-slim

# Install system dependencies including libcrypt and additional libraries for PyMuPDF.
# libgl1 is used instead of libgl1-mesa-glx: the latter was only a transitional
# package and has no installation candidate on the newer Debian releases that
# current python:3.10-slim images are based on, which makes the build fail.
RUN apt-get update && \
    apt-get install -y \
        libcrypt1 \
        libgl1 \
        libglib2.0-0 \
        libsm6 \
        libxext6 \
        libxrender-dev \
        libgomp1 \
        poppler-utils \
        build-essential \
        libfontconfig1 \
        libxrender1 \
        libxtst6 \
        libxi6 \
        libfreetype6-dev \
        libjpeg-dev \
        libopenjp2-7-dev \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy requirements first for better caching
COPY requirements.txt .

# Upgrade pip before installing the Python dependencies
RUN pip install --upgrade pip

# Install PyMuPDF first to check for issues early
RUN pip install --no-cache-dir PyMuPDF==1.23.0

# Test PyMuPDF import
RUN python -c "import fitz; print('PyMuPDF imported successfully')"

# Install remaining dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Verify python-docx installation
RUN python -c "from docx import Document; print('python-docx installed successfully')"

# Copy source code to container
COPY . .

# Expose the port the server listens on (important for Hugging Face)
EXPOSE 7860

# Run the Flask app under gunicorn
CMD ["gunicorn", "--bind", "0.0.0.0:7860", "app:app"]
pdf_excel.py ADDED
@@ -0,0 +1,737 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import fitz # PyMuPDF
4
+ import openpyxl
5
+ from openpyxl.utils.dataframe import dataframe_to_rows
6
+ from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
7
+ from dataclasses import dataclass
8
+ from typing import List, Dict, Any, Tuple, Optional
9
+ import re
10
+ from pathlib import Path
11
+ import logging
12
+ from datetime import datetime
13
+ import numpy as np
14
+
15
# Optional table-extraction back ends. Each one is probed once at import time;
# the *_AVAILABLE flags record the outcome so the rest of the module can
# degrade gracefully when a package is missing.
try:
    import camelot  # For advanced table extraction
except ImportError:
    CAMELOT_AVAILABLE = False
    print("⚠️ Camelot not installed. Run: pip install camelot-py[cv]")
else:
    CAMELOT_AVAILABLE = True

try:
    import tabula  # Alternative table extraction
except ImportError:
    TABULA_AVAILABLE = False
    print("⚠️ Tabula not installed. Run: pip install tabula-py")
else:
    TABULA_AVAILABLE = True

# Module logger; INFO and above are emitted with a timestamped format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
33
+
34
@dataclass
class TextBlock:
    """A piece of text extracted from a PDF page, with its position and font details."""

    text: str                # extracted text content
    x: float                 # left edge of the bounding box
    y: float                 # top edge of the bounding box
    width: float             # bounding-box width
    height: float            # bounding-box height
    font_size: float         # font size reported for the text
    font_name: str           # font name reported for the text
    is_bold: bool = False    # True when the text is detected as bold
    is_italic: bool = False  # True when the text is detected as italic
    page_num: int = 1        # 1-based page number the text was found on
    block_id: str = ""       # identifier assigned by the extractor
47
+
48
@dataclass
class TableData:
    """A table extracted from a PDF page, stored as rows of cell values."""

    data: List[List[str]]                    # table rows, each a list of cells
    bbox: Tuple[float, float, float, float]  # bounding box of the table on the page
    page_num: int                            # 1-based page number of the table
    confidence: float = 0.0                  # extractor-specific quality score
    has_header: bool = True                  # True when the first row holds column names
55
+
56
class PDFToExcelConverter:
    """
    Enhanced PDF to Excel converter with multiple extraction methods
    for better accuracy and handling of complex documents.

    Text is read with PyMuPDF. Tables are collected with PyMuPDF and, when the
    optional packages could be imported, with Camelot and Tabula; the results
    are de-duplicated and written to a workbook with pandas/openpyxl.
    """

    def __init__(self, huggingface_token: Optional[str] = None):
        """
        Set up the converter and record which extraction back ends are usable.

        Args:
            huggingface_token: Accepted so this converter can be constructed
                with the same keyword as the other converters. It is stored
                but not used by any extraction step in this class.
        """
        self.hf_token = huggingface_token

        # Check available extraction methods
        available_methods = ['pymupdf']  # Always available
        if CAMELOT_AVAILABLE:
            available_methods.append('camelot')
        if TABULA_AVAILABLE:
            available_methods.append('tabula')

        self.extraction_methods = available_methods
        self.output_formats = {
            'separate_sheets': 'Each table and text section on separate sheets',
            'combined': 'All content combined logically',
            'structured': 'Maintain document structure with proper formatting'
        }

        # Log available methods
        logger.info(f"Available extraction methods: {', '.join(available_methods)}")

    def extract_text_blocks_advanced(self, page, page_num: int) -> List["TextBlock"]:
        """
        Extract one TextBlock per non-empty text span of *page*.

        Uses the page's "dict" text representation to obtain position and font
        details. If that fails for any reason, falls back to a single block
        holding the page's plain text.
        """
        text_blocks = []

        try:
            # Dictionary-based extraction (most detailed)
            page_dict = page.get_text("dict")

            for block_idx, block in enumerate(page_dict.get("blocks", [])):
                if block.get("type", 1) != 0:  # Skip non-text blocks
                    continue

                for line_idx, line in enumerate(block.get("lines", [])):
                    for span_idx, span in enumerate(line.get("spans", [])):
                        text_content = span.get("text", "").strip()
                        if not text_content:
                            continue

                        bbox = span["bbox"]
                        flags = span.get("flags", 0)

                        # Style comes from the span flags, with the font name
                        # as a second hint.
                        font_name = span.get("font", "Arial")
                        font_size = span.get("size", 12)
                        is_bold = bool(flags & 16) or "bold" in font_name.lower()
                        is_italic = bool(flags & 2) or "italic" in font_name.lower()

                        text_blocks.append(TextBlock(
                            text=text_content,
                            x=bbox[0], y=bbox[1],
                            width=bbox[2] - bbox[0],
                            height=bbox[3] - bbox[1],
                            font_size=font_size,
                            font_name=font_name,
                            is_bold=is_bold,
                            is_italic=is_italic,
                            page_num=page_num,
                            block_id=f"p{page_num}_b{block_idx}_l{line_idx}_s{span_idx}"
                        ))

        except Exception as e:
            logger.warning(f"Advanced text extraction failed for page {page_num}: {e}")
            # Fallback to simple extraction
            text_blocks = self._extract_text_simple_fallback(page, page_num)

        return text_blocks

    def _extract_text_simple_fallback(self, page, page_num: int) -> List["TextBlock"]:
        """
        Fallback extraction: return the page's plain text as one page-sized
        TextBlock, or an empty list when the page has no text or reading fails.
        """
        text_blocks = []
        try:
            text = page.get_text()
            if text.strip():
                # Create a single text block for the entire page content
                rect = page.rect
                text_blocks.append(TextBlock(
                    text=text.strip(),
                    x=0, y=0,
                    width=rect.width,
                    height=rect.height,
                    font_size=12,
                    font_name="Arial",
                    page_num=page_num,
                    block_id=f"p{page_num}_fallback"
                ))
        except Exception as e:
            logger.error(f"Fallback text extraction failed for page {page_num}: {e}")

        return text_blocks

    def extract_tables_multiple_methods(self, pdf_path: str, page_num: int) -> List["TableData"]:
        """
        Extract the tables of one page (1-based *page_num*) with every
        available back end and return the de-duplicated result.
        """
        all_tables = []

        # Method 1: PyMuPDF built-in table detection
        all_tables.extend(self._extract_tables_pymupdf(pdf_path, page_num))

        # Method 2: Camelot (if available)
        if CAMELOT_AVAILABLE:
            try:
                all_tables.extend(self._extract_tables_camelot(pdf_path, page_num))
            except Exception as e:
                logger.warning(f"Camelot extraction failed: {e}")

        # Method 3: Tabula (if available)
        if TABULA_AVAILABLE:
            try:
                all_tables.extend(self._extract_tables_tabula(pdf_path, page_num))
            except Exception as e:
                logger.warning(f"Tabula extraction failed: {e}")

        # Remove duplicates and return best tables
        return self._deduplicate_tables(all_tables)

    def _extract_tables_pymupdf(self, pdf_path: str, page_num: int) -> List["TableData"]:
        """
        Extract tables from one page using PyMuPDF.

        Errors are logged and result in fewer (or no) tables rather than an
        exception. The document is always closed again.
        """
        tables = []
        try:
            doc = fitz.open(pdf_path)
            try:
                page = doc[page_num - 1]  # Convert to 0-based index

                detected_tables = page.find_tables()
                for i, table in enumerate(detected_tables):
                    try:
                        table_data = table.extract()
                        if not table_data:
                            continue

                        # Normalize cells to stripped strings and drop rows
                        # that are completely empty.
                        cleaned_data = []
                        for row in table_data:
                            cleaned_row = [str(cell).strip() if cell else "" for cell in row]
                            if any(cleaned_row):
                                cleaned_data.append(cleaned_row)

                        if cleaned_data:
                            tables.append(TableData(
                                data=cleaned_data,
                                bbox=table.bbox,
                                page_num=page_num,
                                confidence=0.8,  # PyMuPDF generally reliable
                                has_header=True
                            ))
                    except Exception as e:
                        logger.warning(f"Error extracting PyMuPDF table {i}: {e}")
            finally:
                # Close even when page access or table detection raised.
                doc.close()
        except Exception as e:
            logger.error(f"PyMuPDF table extraction failed: {e}")

        return tables

    def _extract_tables_camelot(self, pdf_path: str, page_num: int) -> List["TableData"]:
        """
        Extract tables from one page using Camelot (only if available).
        """
        if not CAMELOT_AVAILABLE:
            return []

        tables = []
        try:
            # Camelot works with page numbers (1-based)
            camelot_tables = camelot.read_pdf(pdf_path, pages=str(page_num), flavor='lattice')

            for table in camelot_tables:
                df = table.df
                if df.empty:
                    continue

                # Convert DataFrame to list of lists
                table_data = df.values.tolist()
                # Default positional column labels (0, 1, 2, ...) are not a
                # header row; prepending them would add a bogus first row and
                # defeat de-duplication against the other back ends. Only
                # prepend labels that are real names.
                # NOTE(review): Camelot frames are expected to carry a default
                # RangeIndex here - confirm against the installed version.
                if not isinstance(df.columns, pd.RangeIndex):
                    table_data.insert(0, df.columns.tolist())

                tables.append(TableData(
                    data=table_data,
                    bbox=(0, 0, 100, 100),  # placeholder, no real bbox is read here
                    page_num=page_num,
                    confidence=table.accuracy / 100.0 if hasattr(table, 'accuracy') else 0.7,
                    has_header=True
                ))

        except Exception as e:
            logger.warning(f"Camelot extraction failed: {e}")

        return tables

    def _extract_tables_tabula(self, pdf_path: str, page_num: int) -> List["TableData"]:
        """
        Extract tables from one page using Tabula (only if available).
        """
        if not TABULA_AVAILABLE:
            return []

        tables = []
        try:
            # Tabula works with page numbers (1-based)
            tabula_tables = tabula.read_pdf(pdf_path, pages=page_num, multiple_tables=True)

            for df in tabula_tables:
                if df.empty:
                    continue

                # Convert DataFrame to list of lists, header row first
                table_data = df.fillna('').values.tolist()
                table_data.insert(0, df.columns.tolist())

                tables.append(TableData(
                    data=table_data,
                    bbox=(0, 0, 100, 100),  # placeholder, no real bbox is read here
                    page_num=page_num,
                    confidence=0.7,
                    has_header=True
                ))

        except Exception as e:
            logger.warning(f"Tabula extraction failed: {e}")

        return tables

    def _deduplicate_tables(self, tables: List["TableData"]) -> List["TableData"]:
        """
        Remove duplicate tables by comparing content.

        Among similar tables the one with the highest confidence is kept, in
        the position of the first occurrence so the original order survives.
        """
        unique_tables = []
        for table in tables:
            for idx, existing_table in enumerate(unique_tables):
                if self._tables_are_similar(table, existing_table):
                    # Keep the one with higher confidence
                    if table.confidence > existing_table.confidence:
                        unique_tables[idx] = table
                    break
            else:
                unique_tables.append(table)

        return unique_tables

    def _tables_are_similar(self, table1: "TableData", table2: "TableData", threshold: float = 0.8) -> bool:
        """
        Check if two tables are similar (likely duplicates).

        Tables must have the same number of rows and the same first-row width;
        they count as similar when at least *threshold* of the cells match
        after stripping whitespace and ignoring case.
        """
        rows1, rows2 = table1.data, table2.data
        if not rows1 or not rows2:
            return False
        if len(rows1) != len(rows2) or len(rows1[0]) != len(rows2[0]):
            return False

        total_cells = len(rows1) * len(rows1[0])
        matching_cells = sum(
            1
            for row1, row2 in zip(rows1, rows2)
            for cell1, cell2 in zip(row1, row2)
            if str(cell1).strip().lower() == str(cell2).strip().lower()
        )

        similarity = matching_cells / total_cells if total_cells > 0 else 0
        return similarity >= threshold

    def process_pdf_to_excel(self, pdf_path: str, output_path: str,
                             format_type: str = 'structured',
                             use_hf_models: bool = False) -> str:
        """
        Convert a PDF file to an Excel workbook.

        Args:
            pdf_path: Path of the PDF to read.
            output_path: Path of the .xlsx file to write.
            format_type: 'structured', 'combined' or 'separate_sheets'.
            use_hf_models: Accepted so this method can be called with the same
                keyword as the other converters; it has no effect here.

        Returns:
            The path of the written workbook.

        Raises:
            FileNotFoundError: If *pdf_path* does not exist.
        """
        logger.info(f"Starting PDF to Excel conversion: {pdf_path}")

        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # Extract content from PDF
        pdf_content = self._extract_comprehensive_content(pdf_path)

        # Create Excel workbook
        output_path = self._create_excel_workbook(pdf_content, output_path, format_type)

        logger.info(f"Successfully converted PDF to Excel: {output_path}")
        return output_path

    def _extract_comprehensive_content(self, pdf_path: str) -> Dict[str, Any]:
        """
        Extract text blocks, tables and image references for every page.

        Returns a dict with 'pages' (one dict per page), 'total_pages' and
        'metadata'. Extraction errors are logged and re-raised; the document
        is closed in either case.
        """
        content = {
            'pages': [],
            'total_pages': 0,
            'metadata': {}
        }

        try:
            doc = fitz.open(pdf_path)
            try:
                content['total_pages'] = doc.page_count
                # Keep 'metadata' a dict even when the document reports none.
                content['metadata'] = doc.metadata or {}

                logger.info(f"Processing {doc.page_count} pages...")

                for page_index in range(doc.page_count):
                    page = doc[page_index]
                    page_number = page_index + 1  # 1-based for helpers and output
                    logger.info(f"Processing page {page_number}/{doc.page_count}")

                    content['pages'].append({
                        'page_number': page_number,
                        'text_blocks': self.extract_text_blocks_advanced(page, page_number),
                        'tables': self.extract_tables_multiple_methods(pdf_path, page_number),
                        'images': self._extract_images_basic(page, page_number),
                        'page_width': page.rect.width,
                        'page_height': page.rect.height
                    })
            finally:
                # Close even when a page fails to process.
                doc.close()

        except Exception as e:
            logger.error(f"Error extracting PDF content: {e}")
            raise

        return content

    def _extract_images_basic(self, page, page_num: int) -> List[Dict]:
        """
        List the images referenced by *page* (for reference only).

        Each entry stores the raw item returned by page.get_images() under the
        'bbox' key; no pixel data is extracted.
        """
        images = []
        try:
            for i, img in enumerate(page.get_images()):
                images.append({
                    'index': i,
                    'page': page_num,
                    'bbox': img  # Simplified
                })
        except Exception as e:
            logger.warning(f"Image extraction failed for page {page_num}: {e}")

        return images

    def _create_excel_workbook(self, content: Dict[str, Any], output_path: str, format_type: str) -> str:
        """
        Write *content* to *output_path* in the requested layout, add the
        summary sheet and apply formatting. Any format_type other than
        'structured' or 'combined' is treated as 'separate_sheets'.
        """
        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:

            if format_type == 'structured':
                self._create_structured_workbook(content, writer)
            elif format_type == 'combined':
                self._create_combined_workbook(content, writer)
            else:  # separate_sheets
                self._create_separate_sheets_workbook(content, writer)

            # Add summary sheet
            self._add_summary_sheet(content, writer)

        # Apply formatting (re-opens the saved file)
        self._apply_excel_formatting(output_path)

        return output_path

    @staticmethod
    def _table_to_dataframe(table: "TableData") -> pd.DataFrame:
        """
        Build a DataFrame from a table, tolerating ragged rows and
        blank or repeated header cells.

        Short rows are padded with empty strings. When the table has a header,
        blank header cells become 'Column_<n>' and repeated names get a
        numeric suffix, so every column label is unique. Without a header all
        rows are kept as data.
        """
        rows = [list(row) for row in table.data]
        width = max((len(row) for row in rows), default=0)
        rows = [row + [''] * (width - len(row)) for row in rows]

        if not table.has_header:
            return pd.DataFrame(rows)

        header, body = rows[0], rows[1:]
        columns = []
        used = set()
        for idx, name in enumerate(header):
            base = str(name).strip() if name is not None else ''
            if not base:
                base = f"Column_{idx + 1}"
            label = base
            suffix = 2
            while label in used:
                label = f"{base}_{suffix}"
                suffix += 1
            used.add(label)
            columns.append(label)

        return pd.DataFrame(body, columns=columns)

    def _create_structured_workbook(self, content: Dict[str, Any], writer):
        """
        Create structured workbook maintaining document flow: per page, one
        sheet per table followed by one sheet per text group.
        """
        for page_data in content['pages']:
            page_num = page_data['page_number']

            # Process tables first
            table_count = 0
            for table in page_data['tables']:
                if table.data:
                    df = self._table_to_dataframe(table)
                    sheet_name = f"P{page_num}_Table{table_count + 1}"[:31]
                    df.to_excel(writer, sheet_name=sheet_name, index=False)
                    table_count += 1

            # Process text content
            if page_data['text_blocks']:
                # Group text blocks by formatting
                text_groups = self._group_text_blocks(page_data['text_blocks'])

                for i, group in enumerate(text_groups):
                    if group['content'].strip():
                        text_df = pd.DataFrame([{
                            'Content': group['content'],
                            'Font_Size': group.get('font_size', 12),
                            'Is_Bold': group.get('is_bold', False),
                            'Position_X': group.get('x', 0),
                            'Position_Y': group.get('y', 0)
                        }])
                        sheet_name = f"P{page_num}_Text{i + 1}"[:31]
                        text_df.to_excel(writer, sheet_name=sheet_name, index=False)

    def _create_combined_workbook(self, content: Dict[str, Any], writer):
        """
        Create combined workbook: all tables stacked on 'All_Tables' and the
        text of every page on 'All_Text'.
        """
        all_tables = []
        all_text = []

        for page_data in content['pages']:
            page_num = page_data['page_number']

            # Collect all tables
            for i, table in enumerate(page_data['tables']):
                if table.data:
                    df = self._table_to_dataframe(table)
                    df['Source_Page'] = page_num
                    df['Table_Index'] = i + 1
                    all_tables.append(df)

            # Collect all text
            text_content = '\n'.join(block.text for block in page_data['text_blocks'])
            if text_content.strip():
                all_text.append({
                    'Page': page_num,
                    'Content': text_content.strip()
                })

        # Write combined tables
        if all_tables:
            combined_tables = pd.concat(all_tables, ignore_index=True)
            combined_tables.to_excel(writer, sheet_name='All_Tables', index=False)

        # Write combined text
        if all_text:
            text_df = pd.DataFrame(all_text)
            text_df.to_excel(writer, sheet_name='All_Text', index=False)

    def _create_separate_sheets_workbook(self, content: Dict[str, Any], writer):
        """
        Create workbook with each table and each page's text on its own sheet,
        numbered consecutively across the document.
        """
        table_counter = 1
        text_counter = 1

        for page_data in content['pages']:
            page_num = page_data['page_number']

            # Each table gets its own sheet
            for table in page_data['tables']:
                if table.data:
                    df = self._table_to_dataframe(table)
                    sheet_name = f"Table_{table_counter}"[:31]
                    df.to_excel(writer, sheet_name=sheet_name, index=False)
                    table_counter += 1

            # Page text gets its own sheet
            if page_data['text_blocks']:
                text_content = '\n'.join(block.text for block in page_data['text_blocks'])
                if text_content.strip():
                    text_df = pd.DataFrame([{'Page': page_num, 'Content': text_content}])
                    sheet_name = f"Text_{text_counter}"[:31]
                    text_df.to_excel(writer, sheet_name=sheet_name, index=False)
                    text_counter += 1

    def _group_text_blocks(self, text_blocks: List["TextBlock"]) -> List[Dict]:
        """
        Merge consecutive text blocks that share formatting.

        Blocks are sorted top-to-bottom, left-to-right. A block joins the
        current group when its font size differs by less than 2 and its bold
        flag matches; otherwise it starts a new group. Each group is a dict
        with 'content', 'font_size', 'is_bold', 'x' and 'y' (taken from the
        group's first block).
        """
        if not text_blocks:
            return []

        # Sort by position (top to bottom, left to right)
        sorted_blocks = sorted(text_blocks, key=lambda b: (b.y, b.x))

        groups = []
        current_group = None

        for block in sorted_blocks:
            if (current_group is not None and
                    abs(current_group['font_size'] - block.font_size) < 2 and
                    current_group['is_bold'] == block.is_bold):
                current_group['content'] += ' ' + block.text
                continue

            # Formatting changed (or first block): close the running group
            if current_group is not None and current_group['content'].strip():
                groups.append(current_group)
            # Seeding the group with the block's own text avoids a leading
            # space on the first group's content.
            current_group = {
                'content': block.text,
                'font_size': block.font_size,
                'is_bold': block.is_bold,
                'x': block.x,
                'y': block.y
            }

        # Add last group
        if current_group is not None and current_group['content'].strip():
            groups.append(current_group)

        return groups

    def _add_summary_sheet(self, content: Dict[str, Any], writer):
        """
        Add a 'Summary' sheet with page/table/text-block counts, the
        processing time and the document title ('Unknown' when missing).
        """
        total_tables = sum(len(page['tables']) for page in content['pages'])
        total_text_blocks = sum(len(page['text_blocks']) for page in content['pages'])

        # Metadata may be missing, and a present title may be empty or None.
        metadata = content.get('metadata') or {}
        title = metadata.get('title') or 'Unknown'

        summary_data = {
            'Statistic': [
                'Total Pages',
                'Total Tables',
                'Total Text Blocks',
                'Processing Date',
                'Document Title'
            ],
            'Value': [
                content['total_pages'],
                total_tables,
                total_text_blocks,
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                title
            ]
        }

        summary_df = pd.DataFrame(summary_data)
        summary_df.to_excel(writer, sheet_name='Summary', index=False)

    def _apply_excel_formatting(self, file_path: str):
        """
        Style the header row of every sheet and size columns to their content
        (capped at width 50). Formatting is best-effort: failures are logged
        as a warning and the workbook is left as written.
        """
        try:
            wb = openpyxl.load_workbook(file_path)

            # Define styles
            header_font = Font(bold=True, color="FFFFFF")
            header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
            border = Border(
                left=Side(style='thin'),
                right=Side(style='thin'),
                top=Side(style='thin'),
                bottom=Side(style='thin')
            )

            for sheet_name in wb.sheetnames:
                ws = wb[sheet_name]

                # Format headers
                if ws.max_row > 0:
                    for cell in ws[1]:
                        cell.font = header_font
                        cell.fill = header_fill
                        cell.alignment = Alignment(horizontal='center', vertical='center')
                        cell.border = border

                # Auto-adjust column widths
                for column in ws.columns:
                    column_letter = column[0].column_letter
                    # Empty cells count as length 0 rather than len("None").
                    max_length = max(
                        (len(str(cell.value)) for cell in column if cell.value is not None),
                        default=0
                    )
                    ws.column_dimensions[column_letter].width = min(max_length + 2, 50)

            wb.save(file_path)

        except Exception as e:
            logger.warning(f"Could not apply formatting: {e}")
662
+
663
+ # Usage example and main function
664
def install_dependencies():
    """
    Print installation instructions for missing dependencies.

    Core packages are reported as installed (this module could not have been
    imported without them); the optional table-extraction packages are
    reported according to CAMELOT_AVAILABLE and TABULA_AVAILABLE.
    """
    print("📦 INSTALLATION INSTRUCTIONS:")
    print("=" * 50)

    # (display name, install command, currently available?)
    core_packages = [
        ("PyMuPDF", "pip install PyMuPDF", True),
        ("pandas", "pip install pandas", True),
        ("openpyxl", "pip install openpyxl", True),
        ("numpy", "pip install numpy", True),
    ]
    optional_packages = [
        ("camelot-py", "pip install camelot-py[cv]", CAMELOT_AVAILABLE),
        ("tabula-py", "pip install tabula-py", TABULA_AVAILABLE),
    ]

    sections = (
        ("\n✅ CORE PACKAGES (Required):", core_packages),
        ("\n🔧 OPTIONAL PACKAGES (For better table extraction):", optional_packages),
    )
    for heading, packages in sections:
        print(heading)
        for name, cmd, available in packages:
            status = "✅ Installed" if available else "❌ Missing"
            print(f" {name}: {status}")
            if not available:
                print(f" Install: {cmd}")

    print("\n💡 INSTALL ALL AT ONCE:")
    print("pip install PyMuPDF pandas openpyxl numpy camelot-py[cv] tabula-py")
    print("\n" + "=" * 50)
697
+
698
def main():
    """
    Demonstrate usage: print the dependency status, then convert 'input.pdf'
    to 'output.xlsx' in structured mode and report the outcome.
    """
    print("🚀 Enhanced PDF to Excel Converter")
    print("=" * 40)

    # Show installation status
    install_dependencies()

    converter = PDFToExcelConverter()

    # Example usage
    pdf_path = "input.pdf"  # Replace with your PDF path
    output_path = "output.xlsx"  # Replace with desired output path

    try:
        # Check if PDF file exists
        if not os.path.exists(pdf_path):
            print(f"\n❌ PDF file not found: {pdf_path}")
            print("Please update the 'pdf_path' variable with your actual PDF file path.")
            return

        print(f"\n🔄 Converting: {pdf_path}")
        result = converter.process_pdf_to_excel(
            pdf_path=pdf_path,
            output_path=output_path,
            format_type='structured'  # Options: 'structured', 'combined', 'separate_sheets'
        )
        print(f"✅ Conversion completed successfully: {result}")

    except Exception as e:
        print(f"❌ Conversion failed: {e}")
        print("\n🛠️ TROUBLESHOOTING:")
        print("1. Make sure all required packages are installed")
        print("2. Check that your PDF file exists and is readable")
        print("3. Ensure you have write permissions for the output directory")


if __name__ == "__main__":
    main()
pdf_html.py ADDED
@@ -0,0 +1,636 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ import json
4
+ import requests
5
+ from typing import Dict, List, Any
6
+ import fitz # PyMuPDF
7
+ from PIL import Image
8
+ import io
9
+ import re
10
+ from dataclasses import dataclass
11
+ from pathlib import Path
12
+ from datetime import datetime
13
+
14
@dataclass
class TextBlock:
    """One piece of page text together with its position and font details."""

    # The text itself.
    text: str
    # Left edge and top edge of the text's bounding box.
    x: float
    y: float
    # Extent of the bounding box.
    width: float
    height: float
    # Font size and font name reported for the text.
    font_size: float
    font_name: str
    # Style markers; both default to "not set".
    is_bold: bool = False
    is_italic: bool = False
    # Identifier assigned by the extractor; empty when none was given.
    block_id: str = ""
26
+
27
class PDFToHTMLConverter:
    """Convert a PDF file into one self-contained HTML document.

    Text spans, embedded images and tables are read from each page with
    PyMuPDF (``fitz``) and rendered into a styled HTML page.  The Hugging
    Face token, model names and inference URL are only stored on the
    instance; nothing in this class sends a request to them.
    """

    # Symbol -> HTML entity table used by enhance_math_symbols().
    _MATH_ENTITIES = {
        '±': '&plusmn;', '×': '&times;', '÷': '&divide;', '∑': '&sum;',
        '∏': '&prod;', '√': '&radic;', '∞': '&infin;', '∫': '&int;',
        '∂': '&part;', '∆': '&Delta;', '∇': '&nabla;', '∈': '&isin;',
        '∉': '&notin;', '⊂': '&sub;', '⊃': '&sup;', '⊆': '&sube;',
        '⊇': '&supe;', '∪': '&cup;', '∩': '&cap;', '≤': '&le;',
        '≥': '&ge;', '≠': '&ne;', '≡': '&equiv;', '≈': '&asymp;',
        '∝': '&prop;', '∴': '&there4;',
        'α': '&alpha;', 'β': '&beta;', 'γ': '&gamma;', 'δ': '&delta;',
        'ε': '&epsilon;', 'ζ': '&zeta;', 'η': '&eta;', 'θ': '&theta;',
        'ι': '&iota;', 'κ': '&kappa;', 'λ': '&lambda;', 'μ': '&mu;',
        'ν': '&nu;', 'ξ': '&xi;', 'π': '&pi;', 'ρ': '&rho;', 'σ': '&sigma;',
        'τ': '&tau;', 'υ': '&upsilon;', 'φ': '&phi;', 'χ': '&chi;',
        'ψ': '&psi;', 'ω': '&omega;',
        '½': '&frac12;', '⅓': '&frac13;', '¼': '&frac14;', '⅔': '&frac23;',
        '¾': '&frac34;', '⅛': '&frac18;', '²': '&sup2;', '³': '&sup3;',
        '¹': '&sup1;', '°': '&deg;'
    }

    # Entities whose presence marks a rendered text block as "math-symbol".
    _MATH_MARKERS = ('&alpha;', '&beta;', '&gamma;', '&sum;', '&int;')

    # Everything emitted before the per-document info box.
    _DOCUMENT_HEAD = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>PDF Document</title>
    <style>
        * {
            box-sizing: border-box;
            margin: 0;
            padding: 0;
        }

        body {
            font-family: 'Times New Roman', Times, serif;
            background-color: #f5f5f5;
            padding: 20px;
            line-height: 1.2;
            color: #000000;
        }

        .document-container {
            max-width: 1200px;
            margin: 0 auto;
            background-color: white;
            box-shadow: 0 4px 12px rgba(0,0,0,0.1);
            border: 1px solid #ddd;
        }

        .page-wrapper {
            background-color: white;
            margin: 0;
            padding: 40px;
            border-bottom: 2px solid #000;
            position: relative;
            min-height: 800px;
            page-break-after: always;
            overflow: visible;
        }

        .page-header {
            background-color: #f8f8f8;
            padding: 10px 15px;
            margin: -40px -40px 30px -40px;
            border-bottom: 2px solid #000;
            font-weight: bold;
            color: #000;
            font-size: 14px;
            text-align: center;
        }

        .content-layer {
            position: relative;
            width: 100%;
            height: 100%;
        }

        .text-content {
            position: relative;
            z-index: 10;
            line-height: 1.4;
        }

        .text-block {
            margin-bottom: 8px;
            font-family: 'Times New Roman', Times, serif;
            color: #000;
            word-wrap: break-word;
            overflow-wrap: break-word;
        }

        .text-block.inline {
            display: inline;
            margin-bottom: 0;
            margin-right: 5px;
        }

        .text-group {
            margin-bottom: 12px;
            line-height: 1.3;
        }

        .bold {
            font-weight: bold;
        }

        .italic {
            font-style: italic;
        }

        .table-container {
            margin: 20px 0;
            background-color: white;
            overflow: auto;
            z-index: 20;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }

        .table {
            width: 100%;
            border-collapse: collapse;
            border: 2px solid #000;
            font-family: 'Times New Roman', Times, serif;
            font-size: 12px;
            color: #000;
            background-color: white;
            margin: 0;
        }

        .table td, .table th {
            border: 1px solid #000;
            padding: 8px 12px;
            text-align: left;
            vertical-align: top;
            background-color: white;
            font-family: 'Times New Roman', Times, serif;
            word-wrap: break-word;
            min-width: 60px;
        }

        .table th {
            background-color: #f0f0f0;
            font-weight: bold;
            text-align: center;
        }

        .table tr:nth-child(even) td {
            background-color: #f9f9f9;
        }

        .table tr:hover td {
            background-color: #f0f0f0;
        }

        .image-container {
            margin: 15px 0;
            border: 1px solid #ccc;
            background-color: white;
            text-align: center;
            overflow: hidden;
            z-index: 5;
        }

        .image {
            max-width: 100%;
            height: auto;
            display: block;
            margin: 0 auto;
        }

        .math-symbol {
            font-family: 'Times New Roman', serif;
        }

        .document-info {
            background-color: #f8f8f8;
            padding: 15px;
            border: 1px solid #ccc;
            margin-bottom: 20px;
            text-align: center;
            font-family: 'Times New Roman', Times, serif;
        }

        @media print {
            body {
                background-color: white;
                padding: 0;
            }
            .page-wrapper {
                border: none;
                box-shadow: none;
                margin: 0;
                page-break-after: always;
            }
            .document-info {
                display: none;
            }
            .table {
                border: 2px solid #000 !important;
            }
            .table td, .table th {
                border: 1px solid #000 !important;
            }
        }
    </style>
</head>
<body>
    <div class="document-container">"""

    def __init__(self, huggingface_token: str = None):
        """Store the optional Hugging Face token and the model configuration."""
        self.hf_token = huggingface_token
        self.hf_headers = {
            "Authorization": f"Bearer {huggingface_token}" if huggingface_token else None
        }
        self.models = {
            "document_layout": "microsoft/layoutlm-base-uncased",
            "table_detection": "microsoft/table-transformer-detection",
            "ocr": "microsoft/trocr-base-printed",
            "math_detection": "facebook/detr-resnet-50"
        }
        self.hf_inference_url = "https://api-inference.huggingface.co/models"

    def pdf_to_base64(self, pdf_path: str) -> str:
        """Return the raw bytes of ``pdf_path`` encoded as a base64 string.

        Raises:
            Exception: if the file cannot be read; the original error is
                chained as the cause.
        """
        try:
            with open(pdf_path, "rb") as pdf_file:
                return base64.b64encode(pdf_file.read()).decode('utf-8')
        except Exception as e:
            raise Exception(f"Error converting PDF to base64: {str(e)}") from e

    def extract_pdf_content(self, pdf_path: str) -> Dict[str, Any]:
        """Extract text blocks, images and tables from every page.

        Returns a dict with ``"pages"`` (one entry per page, in order) and
        ``"total_pages"``.  A page whose processing fails is logged and
        represented by an empty entry sized 595x842, so the page count
        always matches the document.

        Raises:
            Exception: wrapping any failure to open or read the document
                (missing file, zero pages, ...), with the original error
                chained as the cause.
        """
        doc = None
        try:
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")

            doc = fitz.open(pdf_path)

            if doc is None:
                raise RuntimeError("Failed to open PDF document")

            if doc.page_count == 0:
                raise ValueError("PDF document has no pages")

            print(f"πŸ“„ PDF opened successfully: {doc.page_count} pages")

            pages_content = []

            for page_num in range(doc.page_count):
                try:
                    page = doc[page_num]
                    print(f"πŸ”„ Processing page {page_num + 1}/{doc.page_count}")

                    # Prefer the span-level "dict" output (keeps font data);
                    # fall back to coarse block text when it fails.
                    text_blocks = []
                    try:
                        page_dict = page.get_text("dict")
                        text_blocks = self._extract_text_blocks_from_dict(page_dict, page_num)
                    except Exception as e:
                        print(f"⚠️ Dict method failed for page {page_num + 1}, falling back to simple text extraction: {e}")
                        text_blocks = self._extract_text_blocks_simple(page, page_num)

                    images = self._extract_images_safely(page, doc, page_num)
                    tables = self._detect_tables_safely(page)

                    page_rect = page.rect

                    pages_content.append({
                        "page_number": page_num + 1,
                        "text_blocks": text_blocks,
                        "images": images,
                        "tables": tables,
                        "page_width": page_rect.width,
                        "page_height": page_rect.height
                    })

                except Exception as e:
                    print(f"❌ Error processing page {page_num + 1}: {e}")
                    pages_content.append({
                        "page_number": page_num + 1,
                        "text_blocks": [],
                        "images": [],
                        "tables": [],
                        "page_width": 595,
                        "page_height": 842
                    })

            return {
                "pages": pages_content,
                "total_pages": doc.page_count
            }

        except Exception as e:
            raise Exception(f"Error extracting PDF content: {str(e)}") from e
        finally:
            if doc is not None:
                try:
                    doc.close()
                    print("βœ… PDF document closed successfully")
                except Exception as e:
                    print(f"⚠️ Error closing PDF document: {e}")

    def _extract_text_blocks_from_dict(self, page_dict: dict, page_num: int) -> List["TextBlock"]:
        """Build one TextBlock per non-blank span of a ``get_text("dict")`` result.

        Blocks without a ``"lines"`` key are skipped.  A span counts as
        bold/italic when its font name contains "bold"/"italic" or when
        bit 16 / bit 2 of its ``flags`` value is set.
        """
        text_blocks = []

        for block_idx, block in enumerate(page_dict.get("blocks", [])):
            if "lines" not in block:
                continue

            for line_idx, line in enumerate(block["lines"]):
                for span_idx, span in enumerate(line["spans"]):
                    text_content = span.get("text", "").strip()
                    if not text_content:
                        continue

                    bbox = span["bbox"]
                    font_name = span.get("font", "Arial")
                    flags = span.get("flags", 0)
                    lowered_font = span.get("font", "").lower()

                    text_blocks.append(TextBlock(
                        text=text_content,
                        x=bbox[0],
                        y=bbox[1],
                        width=bbox[2] - bbox[0],
                        height=bbox[3] - bbox[1],
                        font_size=span.get("size", 12),
                        font_name=font_name,
                        # bool() so the fields never hold the raw ints 16 / 2.
                        is_bold=bool("bold" in lowered_font or flags & 16),
                        is_italic=bool("italic" in lowered_font or flags & 2),
                        block_id=f"p{page_num}-b{block_idx}-l{line_idx}-s{span_idx}"
                    ))

        return text_blocks

    def _extract_text_blocks_simple(self, page, page_num: int) -> List["TextBlock"]:
        """Fallback extraction from ``page.get_text("blocks")``.

        Each text block (type field 0) is split on newlines and its height
        divided evenly among the lines; font data is unavailable here, so
        every block gets 12 / "Arial" / not bold / not italic.  Any error is
        logged and whatever was collected so far is returned.
        """
        text_blocks = []
        try:
            blocks_data = page.get_text("blocks")
            for block_idx, block in enumerate(blocks_data):
                if block[6] != 0:
                    continue
                text = block[4].strip()
                if not text:
                    continue

                x0, y0, x1, y1 = block[0], block[1], block[2], block[3]
                lines = text.split('\n')
                line_height = (y1 - y0) / max(len(lines), 1)

                for line_idx, line in enumerate(lines):
                    if line.strip():
                        text_blocks.append(TextBlock(
                            text=line.strip(),
                            x=x0,
                            y=y0 + (line_idx * line_height),
                            width=x1 - x0,
                            height=line_height,
                            font_size=12,
                            font_name="Arial",
                            is_bold=False,
                            is_italic=False,
                            block_id=f"p{page_num}-simple-b{block_idx}-l{line_idx}"
                        ))
        except Exception as e:
            print(f"⚠️ Simple text block extraction failed: {e}")

        return text_blocks

    def _extract_images_safely(self, page, doc, page_num) -> List[Dict]:
        """Return the page's images as base64 PNG data plus placement box.

        Images with no placement rectangle on the page are skipped.  A
        failure on one image is logged and does not stop the others.
        """
        images = []
        try:
            image_list = page.get_images(full=True)
            for img_index, img_info in enumerate(image_list):
                try:
                    xref = img_info[0]

                    img_rects = list(page.get_image_rects(xref))
                    if not img_rects:
                        continue

                    bbox = img_rects[0]

                    pix = fitz.Pixmap(doc, xref)
                    if pix.n - pix.alpha >= 4:
                        # Four or more colour components (e.g. CMYK) cannot be
                        # written as PNG; convert to RGB instead of dropping
                        # the image.
                        pix = fitz.Pixmap(fitz.csRGB, pix)

                    img_data = pix.tobytes("png")
                    img_base64 = base64.b64encode(img_data).decode()

                    images.append({
                        "index": img_index,
                        "data": img_base64,
                        "bbox": (bbox.x0, bbox.y0, bbox.x1, bbox.y1)
                    })
                    pix = None  # release the pixmap promptly
                except Exception as e:
                    print(f"⚠️ Error extracting image {img_index} on page {page_num+1}: {e}")
                    continue
        except Exception as e:
            print(f"⚠️ General error in image extraction for page {page_num+1}: {e}")
        return images

    def _detect_tables_safely(self, page) -> List[Dict]:
        """Return the page's tables as ``{"bbox": ..., "data": rows}`` dicts.

        Cells are stringified and stripped (empty/None cells become "");
        rows that end up entirely empty and tables with no remaining rows
        are dropped.  Errors are logged, never raised.
        """
        tables = []
        try:
            tabs = page.find_tables()
            for tab_index, tab in enumerate(tabs):
                try:
                    table_data = tab.extract()
                    if not table_data:
                        continue

                    cleaned_data = []
                    for row in table_data:
                        cleaned_row = [str(cell).strip() if cell else "" for cell in row]
                        if any(cleaned_row):
                            cleaned_data.append(cleaned_row)

                    if cleaned_data:
                        # Unpack as a 4-sequence: works whether bbox is a
                        # plain tuple or a rectangle object, unlike
                        # attribute access (.x0), which fails on a tuple.
                        x0, y0, x1, y1 = tab.bbox
                        tables.append({
                            "bbox": (x0, y0, x1, y1),
                            "data": cleaned_data
                        })
                except Exception as e:
                    print(f"⚠️ Error extracting table {tab_index}: {e}")
                    continue
        except Exception as e:
            print(f"⚠️ General error in table detection: {e}")
        return tables

    def enhance_math_symbols(self, text: str) -> str:
        """Replace math, Greek, fraction and superscript symbols with HTML entities."""
        for symbol, html_entity in self._MATH_ENTITIES.items():
            text = text.replace(symbol, html_entity)

        return text

    def _escape_text(self, text) -> str:
        """Return ``text`` safe for HTML body content, with math entities applied."""
        raw = "" if text is None else str(text)
        # '&' must be escaped first, and before the entity substitution,
        # so the entities added afterwards are not themselves escaped.
        escaped = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
        return self.enhance_math_symbols(escaped)

    def _render_text_block(self, block, inline: bool) -> str:
        """Render one text block as a ``<span>`` (inline) or a ``<div>``."""
        enhanced_text = self._escape_text(block.text)

        css_classes = ["text-block", "inline"] if inline else ["text-block"]
        if block.is_bold:
            css_classes.append("bold")
        if block.is_italic:
            css_classes.append("italic")
        if any(marker in enhanced_text for marker in self._MATH_MARKERS):
            css_classes.append("math-symbol")

        lowered_font = block.font_name.lower()
        font_family = "'Times New Roman', Times, serif"
        if 'arial' in lowered_font:
            font_family = "Arial, sans-serif"
        elif 'helvetica' in lowered_font:
            font_family = "Helvetica, Arial, sans-serif"
        elif 'courier' in lowered_font:
            font_family = "'Courier New', monospace"

        # Scale the font down slightly, but never below 10px.
        font_size = max(block.font_size * 0.9, 10)
        tag = "span" if inline else "div"

        return f"""
                <{tag} class="{' '.join(css_classes)}" style="font-size: {font_size}px; font-family: {font_family};">
                    {enhanced_text}
                </{tag}>"""

    def _group_overlapping_text(self, text_blocks: List["TextBlock"]) -> List[List["TextBlock"]]:
        """Group text blocks that sit on the same visual line.

        Blocks are visited top-to-bottom.  A block joins the current group
        when its vertical extent overlaps the group's first block by more
        than half the height of the shorter of the two; otherwise it opens a
        new group.  Each group is returned sorted left-to-right.
        """
        groups: List[list] = []
        line_top = line_bottom = 0.0

        for block in sorted(text_blocks, key=lambda b: (b.y, b.x)):
            top, bottom = block.y, block.y + block.height
            if groups:
                overlap = min(line_bottom, bottom) - max(line_top, top)
                shortest = min(line_bottom - line_top, bottom - top)
                if shortest > 0 and overlap > 0.5 * shortest:
                    groups[-1].append(block)
                    continue
            groups.append([block])
            # The first block of a group fixes the line's extent so a run of
            # slightly offset blocks cannot drift into the next line.
            line_top, line_bottom = top, bottom

        for group in groups:
            group.sort(key=lambda b: b.x)
        return groups

    def _generate_html_table(self, table_data: List[List[Any]], header_rows: int = 1) -> str:
        """Render table rows as an HTML ``<table>`` inside a table container.

        The first ``header_rows`` rows use ``<th>`` cells, the rest ``<td>``.
        Short rows are padded with empty cells and all cell text is
        HTML-escaped.  Returns "" for an empty table.
        """
        if not table_data:
            return ""

        column_count = max(len(row) for row in table_data)
        lines = [
            '            <div class="table-container">',
            '                <table class="table">',
        ]
        for row_idx, row in enumerate(table_data):
            tag = "th" if row_idx < header_rows else "td"
            padded = list(row) + [""] * (column_count - len(row))
            cells = "".join(f"<{tag}>{self._escape_text(cell)}</{tag}>" for cell in padded)
            lines.append(f"                    <tr>{cells}</tr>")
        lines.append('                </table>')
        lines.append('            </div>')
        return "\n".join(lines)

    def convert_to_html(self, pdf_content: Dict[str, Any], output_path: str = None) -> str:
        """Render extracted PDF content as a complete HTML document.

        Args:
            pdf_content: dict with ``"pages"`` (as produced by
                extract_pdf_content) and optionally ``"total_pages"``.
            output_path: when given, the HTML is also written there (parent
                directories are created); a write failure is logged only.

        Returns:
            The full HTML document as a string.
        """
        html_content = [self._DOCUMENT_HEAD]

        html_content.append(f"""
        <div class="document-info">
            <h1>PDF Document Conversion</h1>
            <p><strong>Total Pages:</strong> {pdf_content.get('total_pages', 'Unknown')}</p>
            <p><strong>Converted on:</strong> {self._get_current_timestamp()}</p>
        </div>""")

        for page in pdf_content["pages"]:
            # The header never reports a page smaller than 595x842.
            page_width = max(page["page_width"], 595)
            page_height = max(page["page_height"], 842)

            html_content.append(f"""
        <div class="page-wrapper">
            <div class="page-header">
                Page {page["page_number"]} ({page_width:.0f}Γ—{page_height:.0f}px) - Tables: {len(page["tables"])}, Images: {len(page["images"])}, Text Blocks: {len(page["text_blocks"])}
            </div>
            <div class="content-layer">""")

            # Images first
            for img in page["images"]:
                html_content.append(f"""
                <div class="image-container">
                    <img class="image" src="data:image/png;base64,{img['data']}"
                         alt="Page {page['page_number']} Image {img['index']}">
                </div>""")

            # Then tables
            for table_idx, table in enumerate(page["tables"]):
                print(f"πŸ”„ Generating HTML for table {table_idx} (source: {table.get('source', 'unknown')})")
                html_content.append(self._generate_html_table(
                    table["data"],
                    header_rows=table.get("header_rows", 1)
                ))

            # Then text, one group per visual line
            text_groups = self._group_overlapping_text(page["text_blocks"])

            html_content.append(' <div class="text-content">')

            for group in text_groups:
                if len(group) == 1:
                    block = group[0]
                    if block.text.strip():
                        html_content.append(self._render_text_block(block, inline=False))
                else:
                    group.sort(key=lambda b: b.x)
                    html_content.append(' <div class="text-group">')
                    for block in group:
                        if block.text.strip():
                            html_content.append(self._render_text_block(block, inline=True))
                    html_content.append(' </div>')

            # Close text-content, content-layer and page-wrapper.
            html_content.append(""" </div>
            </div>
        </div>""")

        html_content.append(" </div>")
        html_content.append("""
</body>
</html>""")
        final_html = "\n".join(html_content)

        if output_path:
            try:
                Path(output_path).parent.mkdir(parents=True, exist_ok=True)
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(final_html)
                print(f"βœ… HTML saved to: {output_path}")
            except Exception as e:
                print(f"⚠️ Error saving HTML to {output_path}: {e}")

        return final_html

    def _get_current_timestamp(self) -> str:
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def process_pdf(self, pdf_path: str, output_path: str = None, use_hf_models: bool = False) -> str:
        """Extract ``pdf_path`` and convert it to HTML in one call.

        Args:
            pdf_path: PDF file to convert.
            output_path: optional file to write the HTML to.
            use_hf_models: accepted for API compatibility; currently only
                prints a notice when a token is configured.

        Returns:
            The generated HTML document.

        Raises:
            FileNotFoundError: if ``pdf_path`` does not exist.
        """
        print(f"πŸš€ Processing PDF: {pdf_path}")

        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        print("πŸ“„ Extracting PDF content...")
        pdf_content = self.extract_pdf_content(pdf_path)

        if use_hf_models and self.hf_token:
            print("πŸ€– Attempting to enhance with Hugging Face models...")
            try:
                print("Note: Hugging Face model integration requires further implementation.")
            except Exception as e:
                print(f"⚠️ Hugging Face enhancement failed: {e}")

        print("πŸ”„ Converting to HTML...")
        html_content = self.convert_to_html(pdf_content, output_path)

        print("βœ… Processing complete!")
        return html_content
609
+
610
def main():
    """Convert the sample PDF to HTML and report the outcome on stdout.

    The Hugging Face token is read from the ``HF_API_TOKEN`` environment
    variable; the sample run itself leaves model enhancement switched off.
    """
    HF_TOKEN = os.environ.get("HF_API_TOKEN")
    converter = PDFToHTMLConverter(huggingface_token=HF_TOKEN)

    source_pdf = "new-pdf.pdf"
    target_html = "sample_converted.html"

    try:
        converter.process_pdf(
            pdf_path=source_pdf,
            output_path=target_html,
            use_hf_models=False,
        )
    except FileNotFoundError as e:
        print(f"❌ Error: {e}")
        print("Please ensure the PDF file exists at the specified path.")
    except Exception as e:
        print(f"❌ An unexpected error occurred: {str(e)}")
        # Imported here because the module does not import it at the top.
        import traceback
        traceback.print_exc()
    else:
        print(f"βœ… Successfully converted '{source_pdf}' to '{target_html}'")
        print(f"🌐 Open '{target_html}' in your web browser to view the result!")


if __name__ == "__main__":
    main()
pdf_json.py ADDED
@@ -0,0 +1,513 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ import json
4
+ import requests
5
+ from typing import Dict, List, Any, Optional
6
+ import fitz # PyMuPDF
7
+ from PIL import Image
8
+ import io
9
+ import re
10
+ from dataclasses import dataclass, asdict
11
+ from pathlib import Path
12
+ from datetime import datetime
13
+
14
@dataclass
class TextBlock:
    """One piece of page text together with its position and font details."""

    # The text itself.
    text: str
    # Left edge and top edge of the text's bounding box.
    x: float
    y: float
    # Extent of the bounding box.
    width: float
    height: float
    # Font size and font name reported for the text.
    font_size: float
    font_name: str
    # Style markers; both default to "not set".
    is_bold: bool = False
    is_italic: bool = False
    # Identifier assigned by the extractor; empty when none was given.
    block_id: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dictionary keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping
30
+
31
@dataclass
class ImageData:
    """An extracted page image: encoded pixels plus where it was placed."""

    # Position of the image in the page's image list.
    index: int
    # Image bytes encoded as a base64 string.
    base64_data: str
    # Placement rectangle as (x0, y0, x1, y1).
    bbox: tuple
    # Size of the placement rectangle.
    width: float
    height: float
    # Encoding of ``base64_data``.
    format: str = "PNG"

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dictionary keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping
43
+
44
@dataclass
class TableData:
    """An extracted table: its bounding box, cell text and dimensions."""

    # Bounding box as (x0, y0, x1, y1).
    bbox: tuple
    # Cell text, one inner list per row.
    data: List[List[str]]
    # Number of rows and columns in ``data``.
    rows: int
    columns: int

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dictionary keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping
54
+
55
@dataclass
class PageData:
    """Everything extracted from a single page, plus simple text statistics."""

    # 1-based page number.
    page_number: int
    # Extracted content, each item exposing ``to_dict()``.
    text_blocks: List[TextBlock]
    images: List[ImageData]
    tables: List[TableData]
    # Page dimensions.
    page_width: float
    page_height: float
    # Text statistics for the page.
    word_count: int = 0
    character_count: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Return the page as a dictionary, converting each nested item via its ``to_dict()``."""
        def convert_all(items):
            return [item.to_dict() for item in items]

        page_mapping = {"page_number": self.page_number}
        page_mapping["text_blocks"] = convert_all(self.text_blocks)
        page_mapping["images"] = convert_all(self.images)
        page_mapping["tables"] = convert_all(self.tables)
        page_mapping["page_width"] = self.page_width
        page_mapping["page_height"] = self.page_height
        page_mapping["word_count"] = self.word_count
        page_mapping["character_count"] = self.character_count
        return page_mapping
78
+
79
+ class PDFToJSONConverter:
80
+ def __init__(self, huggingface_token: str = None):
81
+ self.hf_token = huggingface_token
82
+ self.hf_headers = {
83
+ "Authorization": f"Bearer {huggingface_token}" if huggingface_token else None
84
+ }
85
+ self.models = {
86
+ "document_layout": "microsoft/layoutlm-base-uncased",
87
+ "table_detection": "microsoft/table-transformer-detection",
88
+ "ocr": "microsoft/trocr-base-printed",
89
+ "math_detection": "facebook/detr-resnet-50"
90
+ }
91
+ self.hf_inference_url = "https://api-inference.huggingface.co/models"
92
+
93
+ def pdf_to_base64(self, pdf_path: str) -> str:
94
+ """Convert PDF file to base64 string"""
95
+ try:
96
+ with open(pdf_path, "rb") as pdf_file:
97
+ return base64.b64encode(pdf_file.read()).decode('utf-8')
98
+ except Exception as e:
99
+ raise Exception(f"Error converting PDF to base64: {str(e)}")
100
+
101
+ def extract_pdf_content(self, pdf_path: str) -> Dict[str, Any]:
102
+ """Extract all content from PDF and return structured data"""
103
+ doc = None
104
+ try:
105
+ if not os.path.exists(pdf_path):
106
+ raise FileNotFoundError(f"PDF file not found: {pdf_path}")
107
+
108
+ doc = fitz.open(pdf_path)
109
+
110
+ if doc is None:
111
+ raise RuntimeError("Failed to open PDF document")
112
+
113
+ if doc.page_count == 0:
114
+ raise ValueError("PDF document has no pages")
115
+
116
+ print(f"πŸ“„ PDF opened successfully: {doc.page_count} pages")
117
+
118
+ pages_data = []
119
+ document_stats = {
120
+ "total_pages": doc.page_count,
121
+ "total_words": 0,
122
+ "total_characters": 0,
123
+ "total_images": 0,
124
+ "total_tables": 0
125
+ }
126
+
127
+ for page_num in range(doc.page_count):
128
+ try:
129
+ page = doc[page_num]
130
+ print(f"πŸ”„ Processing page {page_num + 1}/{doc.page_count}")
131
+
132
+ # Extract text blocks
133
+ text_blocks = []
134
+ try:
135
+ page_dict = page.get_text("dict")
136
+ text_blocks = self._extract_text_blocks_from_dict(page_dict, page_num)
137
+ except Exception as e:
138
+ print(f"⚠️ Dict method failed for page {page_num + 1}, falling back to simple text extraction: {e}")
139
+ text_blocks = self._extract_text_blocks_simple(page, page_num)
140
+
141
+ # Extract images
142
+ images = self._extract_images_safely(page, doc, page_num)
143
+
144
+ # Extract tables
145
+ tables = self._detect_tables_safely(page)
146
+
147
+ # Get page dimensions
148
+ page_rect = page.rect
149
+
150
+ # Calculate statistics
151
+ page_text = " ".join([block.text for block in text_blocks])
152
+ word_count = len(page_text.split())
153
+ char_count = len(page_text)
154
+
155
+ # Create page data
156
+ page_data = PageData(
157
+ page_number=page_num + 1,
158
+ text_blocks=text_blocks,
159
+ images=images,
160
+ tables=tables,
161
+ page_width=page_rect.width,
162
+ page_height=page_rect.height,
163
+ word_count=word_count,
164
+ character_count=char_count
165
+ )
166
+
167
+ pages_data.append(page_data)
168
+
169
+ # Update document statistics
170
+ document_stats["total_words"] += word_count
171
+ document_stats["total_characters"] += char_count
172
+ document_stats["total_images"] += len(images)
173
+ document_stats["total_tables"] += len(tables)
174
+
175
+ except Exception as e:
176
+ print(f"❌ Error processing page {page_num + 1}: {e}")
177
+ # Create empty page data for failed pages
178
+ empty_page = PageData(
179
+ page_number=page_num + 1,
180
+ text_blocks=[],
181
+ images=[],
182
+ tables=[],
183
+ page_width=595,
184
+ page_height=842,
185
+ word_count=0,
186
+ character_count=0
187
+ )
188
+ pages_data.append(empty_page)
189
+
190
+ result = {
191
+ "document_info": {
192
+ "filename": os.path.basename(pdf_path),
193
+ "file_size": os.path.getsize(pdf_path),
194
+ "conversion_timestamp": self._get_current_timestamp(),
195
+ "converter_version": "1.0.0"
196
+ },
197
+ "document_statistics": document_stats,
198
+ "pages": [page.to_dict() for page in pages_data]
199
+ }
200
+
201
+ return result
202
+
203
+ except Exception as e:
204
+ raise Exception(f"Error extracting PDF content: {str(e)}")
205
+ finally:
206
+ if doc is not None:
207
+ try:
208
+ doc.close()
209
+ print("βœ… PDF document closed successfully")
210
+ except Exception as e:
211
+ print(f"⚠️ Error closing PDF document: {e}")
212
+
213
+ def _extract_text_blocks_from_dict(self, page_dict: dict, page_num: int) -> List[TextBlock]:
214
+ """Extract text blocks from page dictionary with detailed formatting"""
215
+ text_blocks = []
216
+
217
+ for block_idx, block in enumerate(page_dict.get("blocks", [])):
218
+ if "lines" not in block:
219
+ continue
220
+
221
+ for line_idx, line in enumerate(block["lines"]):
222
+ for span_idx, span in enumerate(line["spans"]):
223
+ text_content = span.get("text", "").strip()
224
+ if text_content:
225
+ bbox = span["bbox"]
226
+ font_info = {
227
+ "size": span.get("size", 12),
228
+ "font": span.get("font", "Arial"),
229
+ "is_bold": "bold" in span.get("font", "").lower() or span.get("flags", 0) & 16,
230
+ "is_italic": "italic" in span.get("font", "").lower() or span.get("flags", 0) & 2
231
+ }
232
+
233
+ text_block = TextBlock(
234
+ text=text_content,
235
+ x=round(bbox[0], 2),
236
+ y=round(bbox[1], 2),
237
+ width=round(bbox[2] - bbox[0], 2),
238
+ height=round(bbox[3] - bbox[1], 2),
239
+ font_size=round(font_info["size"], 2),
240
+ font_name=font_info["font"],
241
+ is_bold=font_info["is_bold"],
242
+ is_italic=font_info["is_italic"],
243
+ block_id=f"p{page_num}-b{block_idx}-l{line_idx}-s{span_idx}"
244
+ )
245
+ text_blocks.append(text_block)
246
+
247
+ return text_blocks
248
+
249
+ def _extract_text_blocks_simple(self, page, page_num: int) -> List[TextBlock]:
250
+ """Fallback method for text extraction"""
251
+ text_blocks = []
252
+ try:
253
+ blocks_data = page.get_text("blocks")
254
+ for block_idx, block in enumerate(blocks_data):
255
+ if block[6] == 0: # Text block
256
+ text = block[4].strip()
257
+ if text:
258
+ x0, y0, x1, y1 = block[0], block[1], block[2], block[3]
259
+
260
+ lines = text.split('\n')
261
+ line_height = (y1 - y0) / max(len(lines), 1)
262
+
263
+ for line_idx, line in enumerate(lines):
264
+ if line.strip():
265
+ text_block = TextBlock(
266
+ text=line.strip(),
267
+ x=round(x0, 2),
268
+ y=round(y0 + (line_idx * line_height), 2),
269
+ width=round(x1 - x0, 2),
270
+ height=round(line_height, 2),
271
+ font_size=12.0,
272
+ font_name="Arial",
273
+ is_bold=False,
274
+ is_italic=False,
275
+ block_id=f"p{page_num}-simple-b{block_idx}-l{line_idx}"
276
+ )
277
+ text_blocks.append(text_block)
278
+ except Exception as e:
279
+ print(f"⚠️ Simple text block extraction failed: {e}")
280
+
281
+ return text_blocks
282
+
283
+ def _extract_images_safely(self, page, doc, page_num) -> List[ImageData]:
284
+ """Extract images from page and return structured data"""
285
+ images = []
286
+ try:
287
+ image_list = page.get_images(full=True)
288
+ for img_index, img_info in enumerate(image_list):
289
+ try:
290
+ xref = img_info[0]
291
+
292
+ # Get image rectangles
293
+ img_rects = [r for r in page.get_image_rects(xref)]
294
+ if not img_rects:
295
+ continue
296
+
297
+ bbox = img_rects[0]
298
+
299
+ # Extract image data
300
+ pix = fitz.Pixmap(doc, xref)
301
+ if pix.n - pix.alpha < 4: # Valid image
302
+ img_data = pix.tobytes("png")
303
+ img_base64 = base64.b64encode(img_data).decode()
304
+
305
+ image_data = ImageData(
306
+ index=img_index,
307
+ base64_data=img_base64,
308
+ bbox=(round(bbox.x0, 2), round(bbox.y0, 2),
309
+ round(bbox.x1, 2), round(bbox.y1, 2)),
310
+ width=round(bbox.x1 - bbox.x0, 2),
311
+ height=round(bbox.y1 - bbox.y0, 2),
312
+ format="PNG"
313
+ )
314
+ images.append(image_data)
315
+ pix = None
316
+ except Exception as e:
317
+ print(f"⚠️ Error extracting image {img_index} on page {page_num+1}: {e}")
318
+ continue
319
+ except Exception as e:
320
+ print(f"⚠️ General error in image extraction for page {page_num+1}: {e}")
321
+ return images
322
+
323
def _detect_tables_safely(self, page) -> List[TableData]:
    """Detect tables on a page and return them as ``TableData`` records.

    Cell values are converted to stripped strings (empty cells become
    ``""``) and rows that are entirely empty are discarded.  Tables with no
    remaining rows are skipped.

    Args:
        page: PyMuPDF page to search for tables.

    Returns:
        A list of ``TableData``; empty when no table is found or when
        detection fails.  A failure on one table never aborts the others.
    """
    tables = []
    try:
        tabs = page.find_tables()
        for tab_index, tab in enumerate(tabs.tables):
            try:
                table_data = tab.extract()
                if not table_data:
                    continue

                cleaned_data = []
                for row in table_data:
                    cleaned_row = [str(cell).strip() if cell else "" for cell in row]
                    if any(cleaned_row):  # keep only rows with some content
                        cleaned_data.append(cleaned_row)
                if not cleaned_data:
                    continue

                # Table.bbox is a plain (x0, y0, x1, y1) tuple, not a Rect,
                # so attribute access (.x0 ...) raises AttributeError and
                # used to make every table be skipped.  Unpacking works for
                # both tuples and Rect-like sequences.
                x0, y0, x1, y1 = tuple(tab.bbox)[:4]
                tables.append(TableData(
                    bbox=(round(x0, 2), round(y0, 2), round(x1, 2), round(y1, 2)),
                    data=cleaned_data,
                    rows=len(cleaned_data),
                    columns=max(len(row) for row in cleaned_data)
                ))
            except Exception as e:
                print(f"⚠️ Error extracting table {tab_index}: {e}")
                continue
    except Exception as e:
        print(f"⚠️ General error in table detection: {e}")
    return tables
354
+
355
def convert_to_json(self, pdf_content: Dict[str, Any], output_path: str = None,
                    pretty_print: bool = True, include_base64_images: bool = True) -> str:
    """Serialize extracted PDF content to a JSON string.

    The caller's ``pdf_content`` is never modified: conversion options are
    added to a copy, and when image data is stripped the affected page and
    image dictionaries are rebuilt instead of being overwritten in place
    (the previous shallow copy destroyed the caller's base64 data).

    Args:
        pdf_content: Extracted content; must contain a ``"pages"`` list when
            ``include_base64_images`` is False.
        output_path: Optional file to write the JSON to.  Parent directories
            are created.  A write failure is reported but not raised.
        pretty_print: Indent the JSON by two spaces when True.
        include_base64_images: When False, replace every image's
            ``base64_data`` with a short placeholder to reduce size.

    Returns:
        The JSON document as a string.

    Raises:
        Exception: If the content cannot be converted (original error is
            chained as the cause).
    """
    print("🔄 Converting to JSON format...")

    try:
        # Top-level copy only; nested structures are shared but left untouched.
        json_content = dict(pdf_content)
        json_content["conversion_options"] = {
            "pretty_print": pretty_print,
            "include_base64_images": include_base64_images,
            "json_schema_version": "1.0"
        }

        if not include_base64_images:
            placeholder = "[Base64 data removed - set include_base64_images=True to include]"
            json_content["pages"] = [
                {
                    **page,
                    "images": [
                        {**image, "base64_data": placeholder}
                        for image in page["images"]
                    ],
                }
                for page in pdf_content["pages"]
            ]

        json_string = json.dumps(
            json_content,
            indent=2 if pretty_print else None,
            ensure_ascii=False,
        )

        if output_path:
            try:
                Path(output_path).parent.mkdir(parents=True, exist_ok=True)
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(json_string)
                print(f"✅ JSON saved to: {output_path}")
                print(f"📊 File size: {len(json_string):,} characters")
            except Exception as e:
                # Best effort: the JSON string is still returned to the caller.
                print(f"⚠️ Error saving JSON to {output_path}: {e}")

        return json_string

    except Exception as e:
        raise Exception(f"Error converting to JSON: {str(e)}") from e
398
+
399
def create_json_summary(self, pdf_content: Dict[str, Any]) -> Dict[str, Any]:
    """Build a lightweight summary of extracted PDF content.

    The summary carries the document-level info and statistics unchanged
    and, per page, only counts, dimensions and a short text sample taken
    from the first three text blocks.

    Args:
        pdf_content: Extracted content.  Each entry of ``"pages"`` must
            provide ``page_number``, ``text_blocks`` (dicts with ``"text"``),
            ``images``, ``tables``, ``word_count``, ``character_count``,
            ``page_width`` and ``page_height``.

    Returns:
        A dict with ``document_info``, ``document_statistics`` and
        ``page_summaries``.
    """
    summary = {
        "document_info": pdf_content.get("document_info", {}),
        "document_statistics": pdf_content.get("document_statistics", {}),
        "page_summaries": []
    }

    for page in pdf_content.get("pages", []):
        text_blocks = page["text_blocks"]

        # Sample from the first three blocks; mark truncation with an
        # ellipsis only when text was actually cut off.
        sample_text = " ".join(block["text"] for block in text_blocks[:3])
        if len(sample_text) > 200:
            sample_text = sample_text[:200] + "..."

        summary["page_summaries"].append({
            "page_number": page["page_number"],
            "text_blocks_count": len(text_blocks),
            "images_count": len(page["images"]),
            "tables_count": len(page["tables"]),
            "word_count": page["word_count"],
            "character_count": page["character_count"],
            "page_dimensions": {
                "width": page["page_width"],
                "height": page["page_height"]
            },
            "sample_text": sample_text
        })

    return summary
424
+
425
+ def _get_current_timestamp(self) -> str:
426
+ """Get current timestamp as string"""
427
+ return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
428
+
429
def process_pdf_to_json(self, pdf_path: str, output_path: str = None,
                        pretty_print: bool = True, include_base64_images: bool = True,
                        create_summary: bool = False, use_hf_models: bool = False) -> str:
    """Extract a PDF and convert it to JSON, optionally writing a summary.

    Args:
        pdf_path: Path of the PDF to process.
        output_path: Optional path for the JSON output file.
        pretty_print: Indent the JSON output when True.
        include_base64_images: Keep base64 image payloads in the output.
        create_summary: When True and ``output_path`` is given, also write
            ``<output stem>_summary.json`` next to the main output.
        use_hf_models: Reserved; Hugging Face enhancement is not implemented
            and only a notice is printed when a token is configured.

    Returns:
        The full JSON document as a string.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    print(f"🚀 Processing PDF to JSON: {pdf_path}")

    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    print("📄 Extracting PDF content...")
    pdf_content = self.extract_pdf_content(pdf_path)

    if use_hf_models and self.hf_token:
        print("🤖 Attempting to enhance with Hugging Face models...")
        print("Note: Hugging Face model integration requires further implementation.")

    print("🔄 Converting to JSON...")
    json_content = self.convert_to_json(
        pdf_content,
        output_path,
        pretty_print,
        include_base64_images
    )

    if create_summary and output_path:
        # Derive the summary name from the file stem.  The former
        # str.replace('.json', ...) returned the unchanged path whenever the
        # output did not contain a lowercase ".json", which made the summary
        # overwrite the main JSON file.
        target = Path(output_path)
        summary_path = target.with_name(f"{target.stem}_summary.json")
        summary_data = self.create_json_summary(pdf_content)
        summary_json = json.dumps(summary_data, indent=2, ensure_ascii=False)

        try:
            with open(summary_path, 'w', encoding='utf-8') as f:
                f.write(summary_json)
            print(f"✅ Summary JSON saved to: {summary_path}")
        except Exception as e:
            # Best effort: a failed summary must not lose the main result.
            print(f"⚠️ Error saving summary: {e}")

    print("✅ Processing complete!")
    return json_content
471
+
472
def main():
    """Demonstrate PDF-to-JSON conversion on a sample file.

    Reads the optional Hugging Face token from the ``HF_API_TOKEN``
    environment variable, converts ``new-pdf.pdf`` to
    ``converted_document.json`` (plus a summary file) and prints a short
    preview.  Errors are reported on stdout instead of being raised.
    """
    hf_token = os.getenv("HF_API_TOKEN")
    converter = PDFToJSONConverter(huggingface_token=hf_token)

    pdf_path = "new-pdf.pdf"  # Change this to your PDF file path
    output_path = "converted_document.json"  # Output JSON file path

    try:
        json_content = converter.process_pdf_to_json(
            pdf_path=pdf_path,
            output_path=output_path,
            pretty_print=True,           # Format JSON with indentation
            include_base64_images=True,  # Set False to reduce file size
            create_summary=True,         # Create additional summary file
            use_hf_models=False          # Hugging Face models are optional
        )

        print(f"✅ Successfully converted '{pdf_path}' to '{output_path}'")
        print(f"📊 JSON length: {len(json_content):,} characters")
        print(f"📄 Open '{output_path}' to view the structured JSON data!")

        # Preview: first 500 characters, with an ellipsis only if truncated.
        print("\n📋 JSON Preview (first 500 characters):")
        print("-" * 50)
        print(json_content[:500] + "..." if len(json_content) > 500 else json_content)

    except FileNotFoundError as e:
        print(f"❌ Error: {e}")
        print("Please ensure the PDF file exists at the specified path.")
    except Exception as e:
        print(f"❌ An unexpected error occurred: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
pdf_word.py ADDED
@@ -0,0 +1,559 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ import json
4
+ from typing import Dict, List, Any
5
+ import fitz
6
+ from PIL import Image
7
+ import io
8
+ import re
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ from datetime import datetime
12
+ from docx import Document
13
+ from docx.shared import Inches, Pt
14
+ from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK
15
+ from docx.enum.table import WD_TABLE_ALIGNMENT
16
+ from docx.oxml.shared import OxmlElement, qn
17
+ from docx.oxml.ns import nsdecls
18
+ from docx.oxml import parse_xml
19
+ import unicodedata
20
+ import docx
21
+ import camelot
22
+
23
@dataclass
class TextBlock:
    """One positioned run of text extracted from a PDF page.

    Field order is part of the constructor interface and must not change.
    """

    # Cleaned text content of the run.
    text: str
    # Left and top coordinates of the bounding box on the page.
    x: float
    y: float
    # Extent of the bounding box.
    width: float
    height: float
    # Typography as reported by the extractor (or a fallback default).
    font_size: float
    font_name: str
    is_bold: bool = False
    is_italic: bool = False
    # Identifier built by the extractor from page/block/line/span indices.
    block_id: str = ""
    # True when the text was classified as mathematical content.
    is_math: bool = False
36
+
37
class PDFToWordConverter:
    """Convert a PDF file into a Word (.docx) document.

    Text spans and images are read with PyMuPDF (``fitz``), tables with
    Camelot, and the output document is assembled with python-docx.
    """

    # Regexes whose match marks a piece of text as mathematical content.
    # Compiled once here instead of on every call.  The symbol classes hold
    # the real Unicode characters; they had been stored as mis-decoded byte
    # sequences that never matched the intended symbols.
    _MATH_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'\d+\s*[+\-*/=]\s*\d+', r'[a-zA-Z]\s*=\s*\d+', r'\b(?:sin|cos|tan|log|ln|exp)\s*\(',
            r'\d+\s*\^\s*\d+', r'√\d+', r'\d+/\d+', r'[∑∏∫]', r'[≤≥≠±×÷]', r'[αβγδθλμπσφω]',
            r'\bEquation\s+\d+', r'\d+\.\d+', r'\$\d+,?\d*', r'NORMSINV', r'using Equation'
        )
    )

    def __init__(self, huggingface_token: str = None):
        """Store the optional Hugging Face token and static configuration.

        Args:
            huggingface_token: API token; when omitted the ``Authorization``
                header value is ``None``.
        """
        self.hf_token = huggingface_token
        self.hf_headers = {
            "Authorization": f"Bearer {huggingface_token}" if huggingface_token else None
        }
        self.models = {
            "document_layout": "microsoft/layoutlm-base-uncased",
            "table_detection": "microsoft/table-transformer-detection",
            "ocr": "microsoft/trocr-base-printed",
            "math_detection": "facebook/detr-resnet-50"
        }
        self.hf_inference_url = "https://api-inference.huggingface.co/models"
        # Identity mapping of symbols that must survive text cleaning.
        self.math_symbols = {symbol: symbol for symbol in "√∑∏∫∞≤≥≠±×÷αβγδθλμπσφω"}

    def detect_mathematical_content(self, text: str) -> bool:
        """Return True when ``text`` matches any of the math heuristics."""
        if not text:
            return False
        return any(pattern.search(text) for pattern in self._MATH_PATTERNS)

    def preserve_mathematical_formatting(self, text: str) -> str:
        """Tighten spacing around ``^`` exponents and call-style parentheses.

        The former chain of ``str.replace`` calls mapped each symbol to
        itself and therefore had no effect; it has been removed.
        """
        if not text:
            return ""
        text = re.sub(r'(\d+)\s*\^\s*(\d+)', r'\1^\2', text)
        text = re.sub(r'(\w+)\s*\(\s*([^)]+)\s*\)', r'\1(\2)', text)
        return text

    def clean_text_for_xml(self, text: str) -> str:
        """Return ``text`` made safe for insertion into a docx XML part.

        Control characters and BOMs are removed, the text is NFKC
        normalized, remaining non-printable characters become spaces and
        all whitespace runs (including newlines) collapse to one space.
        If anything fails, an ASCII-only version of the input is returned.
        """
        if not text:
            return ""
        try:
            text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)
            text = text.replace('\ufeff', '')
            text = text.replace('\u0000', '')
            text = unicodedata.normalize('NFKC', text)
            text = ''.join(
                char if (char.isprintable() or char.isspace() or char in self.math_symbols) else ' '
                for char in text
            )
            text = re.sub(r'\s+', ' ', text).strip()
            # Round-trip through UTF-8 to drop unencodable code points.
            text = text.encode('utf-8', errors='ignore').decode('utf-8')
            return self.preserve_mathematical_formatting(text)
        except Exception:
            return ''.join(char for char in str(text) if ord(char) < 128).strip()

    def clean_font_name(self, font_name: str) -> str:
        """Return a sanitized font name, or ``"Calibri"`` when none remains."""
        if not font_name:
            return "Calibri"
        try:
            cleaned = self.clean_text_for_xml(font_name)
            cleaned = re.sub(r'[^\w\s-]', '', cleaned).strip()
            return cleaned or "Calibri"
        except Exception:
            return "Calibri"

    def pdf_to_base64(self, pdf_path: str) -> str:
        """Return the base64-encoded contents of the file at ``pdf_path``.

        Raises:
            Exception: If the file cannot be read (cause is chained).
        """
        try:
            with open(pdf_path, "rb") as pdf_file:
                return base64.b64encode(pdf_file.read()).decode('utf-8')
        except Exception as e:
            raise Exception(f"Error converting PDF to base64: {str(e)}") from e

    def extract_pdf_content(self, pdf_path: str) -> Dict[str, Any]:
        """Extract text blocks, images and tables from every page.

        A page that fails to process is recorded with empty content and
        default A4 dimensions (595 x 842) so page numbering stays intact.

        Returns:
            ``{"pages": [...], "total_pages": int}`` where each page dict has
            ``page_number``, ``text_blocks``, ``images``, ``tables``,
            ``page_width`` and ``page_height``.

        Raises:
            Exception: On any document-level failure, including a missing
                file or a document without pages (cause is chained).
        """
        doc = None
        try:
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")
            doc = fitz.open(pdf_path)
            if doc.page_count == 0:
                raise ValueError("PDF document has no pages")
            print(f"PDF opened successfully: {doc.page_count} pages")

            pages_content = []
            for page_num in range(doc.page_count):
                try:
                    page = doc[page_num]
                    print(f"Processing page {page_num + 1}/{doc.page_count}")
                    try:
                        page_dict = page.get_text("dict")
                        text_blocks = self._extract_text_blocks_from_dict(page_dict, page_num)
                    except Exception as e:
                        print(f"Dict method failed for page {page_num + 1}, using fallback: {e}")
                        text_blocks = self._extract_text_blocks_simple(page, page_num)
                    images = self._extract_images_safely(page, doc, page_num)
                    tables = self._detect_tables_with_camelot(pdf_path, page_num)
                    page_rect = page.rect
                    pages_content.append({
                        "page_number": page_num + 1,
                        "text_blocks": text_blocks,
                        "images": images,
                        "tables": tables,
                        "page_width": page_rect.width,
                        "page_height": page_rect.height
                    })
                except Exception as e:
                    print(f"Error processing page {page_num + 1}: {e}")
                    pages_content.append({
                        "page_number": page_num + 1,
                        "text_blocks": [],
                        "images": [],
                        "tables": [],
                        "page_width": 595,
                        "page_height": 842
                    })
            return {
                "pages": pages_content,
                "total_pages": doc.page_count
            }
        except Exception as e:
            raise Exception(f"Error extracting PDF content: {str(e)}") from e
        finally:
            if doc is not None:
                try:
                    doc.close()
                    print("PDF document closed successfully")
                except Exception as e:
                    print(f"Error closing PDF document: {e}")

    def _extract_text_blocks_from_dict(self, page_dict: dict, page_num: int) -> List["TextBlock"]:
        """Build one ``TextBlock`` per non-empty span of a ``get_text("dict")`` result."""
        text_blocks = []
        for block_idx, block in enumerate(page_dict.get("blocks", [])):
            if "lines" not in block:  # image blocks carry no lines
                continue
            for line_idx, line in enumerate(block["lines"]):
                for span_idx, span in enumerate(line["spans"]):
                    text_content = span.get("text", "").strip()
                    if not text_content:
                        continue
                    cleaned_text = self.clean_text_for_xml(text_content)
                    if not cleaned_text:
                        continue
                    bbox = span["bbox"]
                    font_name = self.clean_font_name(span.get("font", "Arial"))
                    flags = span.get("flags", 0)
                    lowered = font_name.lower()
                    text_blocks.append(TextBlock(
                        text=cleaned_text,
                        x=bbox[0], y=bbox[1],
                        width=bbox[2] - bbox[0], height=bbox[3] - bbox[1],
                        font_size=max(span.get("size", 12), 6),
                        font_name=font_name,
                        # Span flag bits: 16 = bold, 2 = italic.
                        is_bold="bold" in lowered or bool(flags & 16),
                        is_italic="italic" in lowered or bool(flags & 2),
                        block_id=f"p{page_num}-b{block_idx}-l{line_idx}-s{span_idx}",
                        is_math=self.detect_mathematical_content(cleaned_text)
                    ))
        return text_blocks

    def _extract_text_blocks_simple(self, page, page_num: int) -> List["TextBlock"]:
        """Fallback extraction from ``get_text("blocks")``, one block per line.

        The raw block text is split into lines before cleaning.  Cleaning
        collapses newlines, so splitting afterwards (as was done before)
        could never yield more than one line per block.
        """
        text_blocks = []
        try:
            for block_idx, block in enumerate(page.get_text("blocks")):
                if block[6] != 0:  # only text blocks
                    continue
                raw_text = block[4].strip()
                if not raw_text:
                    continue
                x0, y0, x1, y1 = block[0], block[1], block[2], block[3]
                raw_lines = raw_text.split('\n')
                # Line positions are estimated by dividing the block evenly.
                line_height = (y1 - y0) / max(len(raw_lines), 1)
                for line_idx, raw_line in enumerate(raw_lines):
                    line_text = self.clean_text_for_xml(raw_line)
                    if not line_text:
                        continue
                    text_blocks.append(TextBlock(
                        text=line_text,
                        x=x0, y=y0 + (line_idx * line_height),
                        width=x1 - x0, height=line_height,
                        font_size=12, font_name="Arial",
                        is_bold=False, is_italic=False,
                        block_id=f"p{page_num}-simple-b{block_idx}-l{line_idx}",
                        is_math=self.detect_mathematical_content(line_text)
                    ))
        except Exception as e:
            print(f"Simple text block extraction failed: {e}")
        return text_blocks

    def _extract_images_safely(self, page, doc, page_num) -> List[Dict]:
        """Return the page's images as dicts with PNG bytes, base64 and bbox.

        Images with four or more colour components (e.g. CMYK) are converted
        to RGB before PNG encoding instead of being dropped.  Images without
        a placement rectangle on the page are skipped.
        """
        images = []
        try:
            for img_index, img_info in enumerate(page.get_images(full=True)):
                pix = None
                try:
                    xref = img_info[0]
                    img_rects = list(page.get_image_rects(xref))
                    if not img_rects:
                        continue
                    bbox = img_rects[0]
                    pix = fitz.Pixmap(doc, xref)
                    if pix.n - pix.alpha >= 4:
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    img_data = pix.tobytes("png")
                    images.append({
                        "index": img_index,
                        "data": img_data,
                        "base64": base64.b64encode(img_data).decode(),
                        "bbox": (bbox.x0, bbox.y0, bbox.x1, bbox.y1)
                    })
                except Exception as e:
                    print(f"Error extracting image {img_index} on page {page_num+1}: {e}")
                finally:
                    pix = None  # release the pixmap promptly
        except Exception as e:
            print(f"General error in image extraction for page {page_num+1}: {e}")
        return images

    def _detect_tables_with_camelot(self, pdf_path: str, page_num: int) -> List[Dict]:
        """Detect tables on one page with Camelot.

        The ``lattice`` flavor is tried first; ``stream`` is used when it
        finds nothing or fails.  Tables whose cells are all blank are
        ignored.  Any error is logged and the tables found so far returned.
        """
        tables = []
        page_spec = str(page_num + 1)  # Camelot pages are 1-based strings

        def read_tables(flavor):
            return camelot.read_pdf(
                pdf_path,
                pages=page_spec,
                flavor=flavor,
                suppress_stdout=True
            )

        try:
            try:
                camelot_tables = read_tables('lattice')
                if len(camelot_tables) == 0:
                    camelot_tables = read_tables('stream')
            except Exception:
                camelot_tables = read_tables('stream')

            for table in camelot_tables:
                table_data = table.df.values.tolist()
                if not any(any(str(cell).strip() for cell in row) for row in table_data):
                    continue
                cleaned_data = [
                    [
                        self.clean_text_for_xml(str(cell).strip() if cell is not None else "")
                        for cell in row
                    ]
                    for row in table_data
                ]
                tables.append({
                    # NOTE(review): Camelot tables appear to expose the
                    # bounding box as ``_bbox`` rather than ``bbox``; accept
                    # either so a missing attribute cannot abort detection.
                    "bbox": getattr(table, "bbox", getattr(table, "_bbox", None)),
                    "data": cleaned_data,
                    "accuracy": getattr(table, 'accuracy', 0)
                })
                print(f"Found table with {len(cleaned_data)} rows and {len(cleaned_data[0]) if cleaned_data else 0} columns on page {page_num + 1}")
        except Exception as e:
            print(f"Error detecting tables with Camelot on page {page_num + 1}: {e}")
        return tables

    def _add_page_break(self, doc):
        """Append a page break, falling back to ``doc.add_page_break()``."""
        try:
            doc.add_paragraph().add_run().add_break(WD_BREAK.PAGE)
        except Exception:
            doc.add_page_break()

    def _set_font_properties(self, run, text_block: "TextBlock"):
        """Apply font family, size and style from ``text_block`` to ``run``.

        Font names are mapped onto a small set of common Word fonts, sizes
        are clamped to 6-72 pt, and math text always uses Cambria Math.  On
        any error the run falls back to plain 11 pt Calibri.
        """
        try:
            font_name = self.clean_font_name(text_block.font_name)
            if 'Times' in font_name or 'Roman' in font_name:
                run.font.name = 'Times New Roman'
            elif 'Arial' in font_name:
                run.font.name = 'Arial'
            elif 'Courier' in font_name:
                run.font.name = 'Courier New'
            else:
                run.font.name = 'Calibri'
            try:
                font_size = max(min(int(float(text_block.font_size)), 72), 6)
                run.font.size = Pt(font_size)
            except (ValueError, TypeError):
                print(f"Warning: Invalid font_size '{text_block.font_size}'. Using default 11pt.")
                run.font.size = Pt(11)
            run.font.bold = bool(text_block.is_bold)
            run.font.italic = bool(text_block.is_italic)
            if text_block.is_math:
                run.font.name = 'Cambria Math'
        except Exception as e:
            print(f"Error setting font properties for text_block: {e}")
            run.font.name = 'Calibri'
            run.font.size = Pt(11)
            run.font.bold = False
            run.font.italic = False

    def _group_text_blocks_by_lines(self, text_blocks: List["TextBlock"],
                                    y_tolerance: float = 5) -> List[List["TextBlock"]]:
        """Group blocks into visual lines by vertical position.

        Blocks are sorted top-to-bottom, then left-to-right.  A block joins
        the current line while its ``y`` is within ``y_tolerance`` of the
        line's first block; otherwise it starts a new line.
        """
        if not text_blocks:
            return []
        lines = []
        current_line = []
        anchor_y = None  # y of the first block of the current line
        for block in sorted(text_blocks, key=lambda b: (round(b.y, 1), b.x)):
            if anchor_y is None or abs(block.y - anchor_y) <= y_tolerance:
                current_line.append(block)
                if anchor_y is None:
                    anchor_y = block.y
            else:
                lines.append(current_line)
                current_line = [block]
                anchor_y = block.y
        if current_line:
            lines.append(current_line)
        return lines

    def _set_table_borders(self, table):
        """Give every cell of ``table`` a thin solid black border."""
        for row in table._tbl.tr_lst:
            for cell in row.tc_lst:
                # Create the cell properties element when the cell has none.
                tcPr = cell.get_or_add_tcPr()
                tcBorders = OxmlElement('w:tcBorders')
                for border_name in ['top', 'left', 'bottom', 'right']:
                    border = OxmlElement(f'w:{border_name}')
                    border.set(qn('w:val'), 'single')
                    border.set(qn('w:sz'), '4')
                    border.set(qn('w:space'), '0')
                    border.set(qn('w:color'), '000000')
                    tcBorders.append(border)
                tcPr.append(tcBorders)

    def _create_enhanced_table(self, doc, table_data):
        """Append a bordered, centered Word table built from ``table_data["data"]``.

        Short rows are padded with empty cells and the first row is styled
        as a bold, centered header.

        Returns:
            The created table, or ``None`` when the data is empty or an
            error occurred.
        """
        try:
            table_rows = table_data["data"]
            if not table_rows or not any(any(str(cell).strip() for cell in row) for row in table_rows):
                return None

            max_cols = max(len(row) for row in table_rows)
            if max_cols == 0:
                return None

            word_table = doc.add_table(rows=len(table_rows), cols=max_cols)
            self._set_table_borders(word_table)
            word_table.alignment = WD_TABLE_ALIGNMENT.CENTER
            word_table.autofit = False

            for row_idx, row_data in enumerate(table_rows):
                for col_idx in range(max_cols):
                    cell = word_table.cell(row_idx, col_idx)
                    cell_data = row_data[col_idx] if col_idx < len(row_data) else ""
                    clean_cell_data = self.clean_text_for_xml(str(cell_data) if cell_data else "")

                    paragraph = cell.paragraphs[0]
                    paragraph.clear()
                    run = paragraph.add_run(clean_cell_data)

                    if self.detect_mathematical_content(clean_cell_data):
                        run.font.name = 'Cambria Math'
                    else:
                        run.font.name = 'Calibri'
                    run.font.size = Pt(9)

                    if row_idx == 0:
                        run.font.bold = True
                        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

                    cell.vertical_alignment = docx.enum.table.WD_ALIGN_VERTICAL.CENTER

            print(f"Created table with {len(table_rows)} rows and {max_cols} columns")
            return word_table
        except Exception as e:
            print(f"Error creating enhanced table: {e}")
            return None

    def convert_to_word(self, pdf_content: Dict[str, Any], output_path: str = None) -> "Document":
        """Build a Word document from extracted PDF content.

        Each page contributes, in order, a page header, its images, its
        tables and its text lines, followed by a page break (except after
        the last page).

        Args:
            pdf_content: Result of ``extract_pdf_content``.
            output_path: Optional path to save the document to.  Parent
                directories are created; a save failure is logged, not raised.

        Returns:
            The python-docx ``Document``.
        """
        print("Creating Word document...")
        doc = Document()
        doc.core_properties.title = "PDF to Word Conversion"
        doc.core_properties.author = "PDF Converter"
        doc.core_properties.created = datetime.now()

        header_para = doc.add_paragraph()
        header_run = header_para.add_run("PDF Document Conversion")
        header_run.font.size = Pt(16)
        header_run.font.bold = True
        header_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

        info_para = doc.add_paragraph()
        info_run = info_para.add_run(f"Total Pages: {pdf_content.get('total_pages', 'Unknown')} | Converted on: {self._get_current_timestamp()}")
        info_run.font.size = Pt(10)
        info_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        doc.add_paragraph()

        pages = pdf_content["pages"]
        for page_idx, page in enumerate(pages):
            print(f"Converting page {page['page_number']}/{pdf_content.get('total_pages', '?')}")
            page_header = doc.add_paragraph()
            page_header_run = page_header.add_run(f"--- Page {page['page_number']} ---")
            page_header_run.font.bold = True
            page_header_run.font.size = Pt(12)
            page_header.alignment = WD_ALIGN_PARAGRAPH.CENTER

            for img in page["images"]:
                try:
                    img_para = doc.add_paragraph()
                    img_run = img_para.add_run()
                    img_bbox = img['bbox']
                    # PDF units are points (72 per inch); cap at 6.5 inches.
                    img_width = min(Inches((img_bbox[2] - img_bbox[0]) / 72), Inches(6.5))
                    img_run.add_picture(io.BytesIO(img['data']), width=img_width)
                    img_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
                except Exception as e:
                    print(f"Error adding image to Word document: {e}")
                    img_para = doc.add_paragraph()
                    img_run = img_para.add_run(f"[Image {img['index']} - Could not be inserted]")
                    img_run.font.italic = True

            for table_data in page["tables"]:
                try:
                    if self._create_enhanced_table(doc, table_data):
                        doc.add_paragraph()  # spacer after the table
                except Exception as e:
                    print(f"Error adding table to Word document: {e}")

            for line_blocks in self._group_text_blocks_by_lines(page["text_blocks"]):
                if not line_blocks:
                    continue
                para = doc.add_paragraph()
                for block in sorted(line_blocks, key=lambda b: b.x):
                    cleaned_text = self.clean_text_for_xml(block.text)
                    if cleaned_text:
                        run = para.add_run(cleaned_text + " ")
                        self._set_font_properties(run, block)

            if page_idx < len(pages) - 1:
                self._add_page_break(doc)

        if output_path:
            try:
                Path(output_path).parent.mkdir(parents=True, exist_ok=True)
                doc.save(output_path)
                print(f"Word document saved to: {output_path}")
            except Exception as e:
                print(f"Error saving Word document to {output_path}: {e}")
        return doc

    def _get_current_timestamp(self) -> str:
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def process_pdf_to_word(self, pdf_path: str, output_path: str = None, use_hf_models: bool = False) -> "Document":
        """Extract ``pdf_path`` and convert it to a Word document.

        Args:
            pdf_path: Path of the PDF to convert.
            output_path: Optional path to save the .docx file to.
            use_hf_models: Reserved; Hugging Face enhancement is not
                implemented and only a notice is printed.

        Returns:
            The python-docx ``Document``.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        print(f"Processing PDF to Word: {pdf_path}")
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
        print("Extracting PDF content...")
        pdf_content = self.extract_pdf_content(pdf_path)
        if use_hf_models and self.hf_token:
            print("Attempting to enhance with Hugging Face models...")
            print("Note: Hugging Face model integration requires further implementation.")
        print("Converting to Word document...")
        word_doc = self.convert_to_word(pdf_content, output_path)
        print("Processing complete!")
        return word_doc
535
+
536
def main():
    """Run a sample PDF-to-Word conversion and report the outcome on stdout."""
    token = os.getenv("HF_API_TOKEN")
    converter = PDFToWordConverter(huggingface_token=token)

    source_pdf = "supplychain (1).pdf"
    target_docx = "converted_document_enhanced.docx"

    try:
        converter.process_pdf_to_word(
            pdf_path=source_pdf,
            output_path=target_docx,
            use_hf_models=False
        )
    except FileNotFoundError as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
    else:
        # Reached only when the conversion raised nothing.
        print(f"Successfully converted '{source_pdf}' to '{target_docx}'")
        print(f"Open '{target_docx}' in Microsoft Word to view the result!")


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ PyMuPDF==1.23.26
2
+ Pillow==10.0.0
3
+ requests==2.31.0
4
+ transformers==4.35.0
5
+ torch==2.1.0
6
+ numpy==1.24.0
7
+ flask==2.3.3
8
+ flask-cors==4.0.0
9
+ werkzeug==2.3.7
10
+ camelot-py[cv]==0.11.0
11
+ gunicorn==21.2.0
12
+ python-docx==1.1.0
static/index.html ADDED
@@ -0,0 +1,896 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>PDF Converter Tool</title>
7
+ <style>
8
+ * {
9
+ margin: 0;
10
+ padding: 0;
11
+ box-sizing: border-box;
12
+ }
13
+
14
+ body {
15
+ font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif;
16
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
17
+ min-height: 100vh;
18
+ display: flex;
19
+ align-items: center;
20
+ justify-content: center;
21
+ padding: 20px;
22
+ }
23
+
24
+ .container {
25
+ background: rgba(255, 255, 255, 0.95);
26
+ backdrop-filter: blur(10px);
27
+ padding: 40px;
28
+ border-radius: 20px;
29
+ box-shadow: 0 20px 40px rgba(0, 0, 0, 0.1);
30
+ max-width: 600px;
31
+ width: 100%;
32
+ animation: slideIn 0.6s ease-out;
33
+ }
34
+
35
+ @keyframes slideIn {
36
+ from {
37
+ opacity: 0;
38
+ transform: translateY(30px);
39
+ }
40
+ to {
41
+ opacity: 1;
42
+ transform: translateY(0);
43
+ }
44
+ }
45
+
46
+ .header {
47
+ text-align: center;
48
+ margin-bottom: 40px;
49
+ }
50
+
51
+ .header h1 {
52
+ color: #333;
53
+ font-size: 2.5em;
54
+ margin-bottom: 10px;
55
+ background: linear-gradient(45deg, #667eea, #764ba2);
56
+ -webkit-background-clip: text;
57
+ -webkit-text-fill-color: transparent;
58
+ background-clip: text;
59
+ }
60
+
61
+ .header p {
62
+ color: #666;
63
+ font-size: 1.1em;
64
+ }
65
+
66
+ .status-indicator {
67
+ position: absolute;
68
+ top: 20px;
69
+ right: 20px;
70
+ padding: 8px 16px;
71
+ border-radius: 20px;
72
+ font-size: 0.8em;
73
+ font-weight: 600;
74
+ text-transform: uppercase;
75
+ letter-spacing: 0.5px;
76
+ }
77
+
78
+ .status-online {
79
+ background: #d4edda;
80
+ color: #155724;
81
+ border: 1px solid #c3e6cb;
82
+ }
83
+
84
+ .status-offline {
85
+ background: #f8d7da;
86
+ color: #721c24;
87
+ border: 1px solid #f5c6cb;
88
+ }
89
+
90
+ .conversion-options {
91
+ display: grid;
92
+ gap: 20px;
93
+ margin-bottom: 30px;
94
+ }
95
+
96
+ .option-card {
97
+ background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
98
+ border: none;
99
+ border-radius: 15px;
100
+ padding: 25px;
101
+ cursor: pointer;
102
+ transition: all 0.3s ease;
103
+ color: white;
104
+ text-align: left;
105
+ position: relative;
106
+ overflow: hidden;
107
+ }
108
+
109
+ .option-card:hover {
110
+ transform: translateY(-5px);
111
+ box-shadow: 0 15px 30px rgba(0, 0, 0, 0.2);
112
+ }
113
+
114
+ .option-card.html {
115
+ background: linear-gradient(135deg, #fa709a 0%, #fee140 100%);
116
+ }
117
+
118
+ .option-card.word {
119
+ background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
120
+ }
121
+
122
+ .option-card.json {
123
+ background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%);
124
+ }
125
+
126
+ /* New style for Excel option card */
127
+ .option-card.excel {
128
+ background: linear-gradient(135deg, #2ecc71 0%, #27ae60 100%); /* Green shades for Excel */
129
+ }
130
+
131
+
132
+ .option-card::before {
133
+ content: "";
134
+ position: absolute;
135
+ top: 0;
136
+ left: -100%;
137
+ width: 100%;
138
+ height: 100%;
139
+ background: linear-gradient(
140
+ 90deg,
141
+ transparent,
142
+ rgba(255, 255, 255, 0.2),
143
+ transparent
144
+ );
145
+ transition: left 0.5s;
146
+ }
147
+
148
+ .option-card:hover::before {
149
+ left: 100%;
150
+ }
151
+
152
+ .option-icon {
153
+ font-size: 2em;
154
+ margin-bottom: 10px;
155
+ }
156
+
157
+ .option-title {
158
+ font-size: 1.3em;
159
+ font-weight: bold;
160
+ margin-bottom: 5px;
161
+ }
162
+
163
+ .option-desc {
164
+ font-size: 0.9em;
165
+ opacity: 0.9;
166
+ }
167
+
168
+ .upload-section {
169
+ display: none;
170
+ background: #f8f9fa;
171
+ border-radius: 15px;
172
+ padding: 30px;
173
+ margin-top: 20px;
174
+ border: 2px dashed #ddd;
175
+ transition: all 0.3s ease;
176
+ }
177
+
178
+ .upload-section.active {
179
+ display: block;
180
+ animation: fadeIn 0.5s ease-out;
181
+ }
182
+
183
+ @keyframes fadeIn {
184
+ from {
185
+ opacity: 0;
186
+ }
187
+ to {
188
+ opacity: 1;
189
+ }
190
+ }
191
+
192
+ .file-input-wrapper {
193
+ position: relative;
194
+ display: inline-block;
195
+ width: 100%;
196
+ margin-bottom: 20px;
197
+ }
198
+
199
+ .file-input {
200
+ display: none;
201
+ }
202
+
203
+ .file-input-label {
204
+ display: block;
205
+ padding: 15px 25px;
206
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
207
+ color: white;
208
+ border-radius: 10px;
209
+ cursor: pointer;
210
+ text-align: center;
211
+ transition: all 0.3s ease;
212
+ font-weight: 500;
213
+ }
214
+
215
+ .file-input-label:hover {
216
+ transform: translateY(-2px);
217
+ box-shadow: 0 10px 20px rgba(0, 0, 0, 0.2);
218
+ }
219
+
220
+ .file-name {
221
+ margin-top: 10px;
222
+ padding: 10px;
223
+ background: #e9ecef;
224
+ border-radius: 8px;
225
+ font-size: 0.9em;
226
+ color: #495057;
227
+ display: none;
228
+ }
229
+
230
+ .output-name {
231
+ width: 100%;
232
+ padding: 15px;
233
+ border: 2px solid #e9ecef;
234
+ border-radius: 10px;
235
+ font-size: 1em;
236
+ margin-bottom: 20px;
237
+ transition: border-color 0.3s ease;
238
+ }
239
+
240
+ .output-name:focus {
241
+ outline: none;
242
+ border-color: #667eea;
243
+ }
244
+
245
+ .convert-btn {
246
+ width: 100%;
247
+ padding: 15px;
248
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
249
+ color: white;
250
+ border: none;
251
+ border-radius: 10px;
252
+ font-size: 1.1em;
253
+ font-weight: 600;
254
+ cursor: pointer;
255
+ transition: all 0.3s ease;
256
+ position: relative;
257
+ overflow: hidden;
258
+ }
259
+
260
+ .convert-btn:hover {
261
+ transform: translateY(-2px);
262
+ box-shadow: 0 10px 20px rgba(0, 0, 0, 0.2);
263
+ }
264
+
265
+ .convert-btn:disabled {
266
+ opacity: 0.7;
267
+ cursor: not-allowed;
268
+ transform: none;
269
+ }
270
+
271
+ .back-btn {
272
+ background: #6c757d;
273
+ color: white;
274
+ border: none;
275
+ padding: 10px 20px;
276
+ border-radius: 8px;
277
+ cursor: pointer;
278
+ margin-bottom: 20px;
279
+ transition: all 0.3s ease;
280
+ }
281
+
282
+ .back-btn:hover {
283
+ background: #5a6268;
284
+ transform: translateY(-1px);
285
+ }
286
+
287
+ .progress-bar {
288
+ width: 100%;
289
+ height: 6px;
290
+ background: #e9ecef;
291
+ border-radius: 3px;
292
+ margin: 20px 0;
293
+ overflow: hidden;
294
+ display: none;
295
+ }
296
+
297
+ .progress-fill {
298
+ height: 100%;
299
+ background: linear-gradient(90deg, #667eea, #764ba2);
300
+ width: 0%;
301
+ transition: width 0.3s ease;
302
+ border-radius: 3px;
303
+ }
304
+
305
+ .result-section {
306
+ margin-top: 20px;
307
+ padding: 20px;
308
+ border-radius: 12px;
309
+ display: none;
310
+ }
311
+
312
+ .result-success {
313
+ background: linear-gradient(135deg, #d4edda 0%, #c3e6cb 100%);
314
+ border: 1px solid #c3e6cb;
315
+ color: #155724;
316
+ }
317
+
318
+ .result-error {
319
+ background: linear-gradient(135deg, #f8d7da 0%, #f5c6cb 100%);
320
+ border: 1px solid #f5c6cb;
321
+ color: #721c24;
322
+ }
323
+
324
+ .loading {
325
+ display: none;
326
+ text-align: center;
327
+ margin: 20px 0;
328
+ }
329
+
330
+ .spinner {
331
+ border: 4px solid #f3f3f3;
332
+ border-top: 4px solid #667eea;
333
+ border-radius: 50%;
334
+ width: 40px;
335
+ height: 40px;
336
+ animation: spin 1s linear infinite;
337
+ margin: 0 auto 10px;
338
+ }
339
+
340
+ @keyframes spin {
341
+ 0% {
342
+ transform: rotate(0deg);
343
+ }
344
+ 100% {
345
+ transform: rotate(360deg);
346
+ }
347
+ }
348
+
349
+ .drag-over {
350
+ border-color: #667eea !important;
351
+ background: rgba(102, 126, 234, 0.1) !important;
352
+ }
353
+
354
+ .debug-info {
355
+ margin-top: 20px;
356
+ padding: 15px;
357
+ background: #f8f9fa;
358
+ border-radius: 8px;
359
+ font-size: 0.9em;
360
+ color: #6c757d;
361
+ border-left: 4px solid #007bff;
362
+ }
363
+
364
+ .error-details {
365
+ margin-top: 10px;
366
+ padding: 10px;
367
+ background: #fff3cd;
368
+ border: 1px solid #ffeaa7;
369
+ border-radius: 6px;
370
+ font-size: 0.85em;
371
+ color: #856404;
372
+ max-height: 200px;
373
+ overflow-y: auto;
374
+ }
375
+
376
+ @media (max-width: 768px) {
377
+ .container {
378
+ padding: 25px;
379
+ margin: 10px;
380
+ }
381
+
382
+ .header h1 {
383
+ font-size: 2em;
384
+ }
385
+
386
+ .option-card {
387
+ padding: 20px;
388
+ }
389
+
390
+ .status-indicator {
391
+ position: relative;
392
+ top: auto;
393
+ right: auto;
394
+ margin-bottom: 20px;
395
+ display: inline-block;
396
+ }
397
+ }
398
+ </style>
399
+ </head>
400
+ <body>
401
+ <div class="container">
402
+ <div id="status-indicator" class="status-indicator status-offline">
403
+ Server Offline
404
+ </div>
405
+
406
+ <div class="header">
407
+ <h1>🧠 PDF Converter</h1>
408
+ <p>Convert your PDF files to HTML, Word, JSON, or Excel format</p>
409
+ </div>
410
+
411
+ <div id="main-menu">
412
+ <div class="conversion-options">
413
+ <button class="option-card html" onclick="showUploadSection('html')">
414
+ <div class="option-icon">🌐</div>
415
+ <div class="option-title">Convert to HTML</div>
416
+ <div class="option-desc">
417
+ Transform PDF into web-ready HTML format
418
+ </div>
419
+ </button>
420
+
421
+ <button class="option-card word" onclick="showUploadSection('word')">
422
+ <div class="option-icon">📄</div>
423
+ <div class="option-title">Convert to Word</div>
424
+ <div class="option-desc">
425
+ Create editable Word documents from PDF
426
+ </div>
427
+ </button>
428
+
429
+ <button class="option-card json" onclick="showUploadSection('json')">
430
+ <div class="option-icon">📊</div>
431
+ <div class="option-title">Convert to JSON</div>
432
+ <div class="option-desc">
433
+ Extract structured data in JSON format
434
+ </div>
435
+ </button>
436
+
437
+ <button class="option-card excel" onclick="showUploadSection('excel')">
438
+ <div class="option-icon">📈</div>
439
+ <div class="option-title">Convert to Excel</div>
440
+ <div class="option-desc">
441
+ Organize PDF tables into an Excel spreadsheet
442
+ </div>
443
+ </button>
444
+ </div>
445
+ </div>
446
+
447
+ <div id="upload-section" class="upload-section">
448
+ <button class="back-btn" onclick="showMainMenu()">
449
+ ← Back to Menu
450
+ </button>
451
+
452
+ <div class="file-input-wrapper">
453
+ <input
454
+ type="file"
455
+ id="pdf-file"
456
+ class="file-input"
457
+ accept=".pdf"
458
+ onchange="handleFileSelect(event)"
459
+ />
460
+ <label for="pdf-file" class="file-input-label" id="file-label">
461
+ 📄 Choose PDF File or Drag & Drop Here
462
+ </label>
463
+ <div id="file-name" class="file-name"></div>
464
+ </div>
465
+
466
+ <div class="loading" id="loading">
467
+ <div class="spinner"></div>
468
+ <p>Converting your PDF file...</p>
469
+ </div>
470
+
471
+ <div class="progress-bar" id="progress-bar">
472
+ <div class="progress-fill" id="progress-fill"></div>
473
+ </div>
474
+
475
+ <button
476
+ class="convert-btn"
477
+ id="convert-btn"
478
+ onclick="convertFile()"
479
+ disabled
480
+ >
481
+ 🚀 Start Conversion
482
+ </button>
483
+
484
+ <div id="result-section" class="result-section">
485
+ <div id="result-message"></div>
486
+ </div>
487
+
488
+ <div id="debug-info" class="debug-info" style="display: none">
489
+ <strong>Debug Information:</strong>
490
+ <div id="debug-content"></div>
491
+ </div>
492
+ </div>
493
+ </div>
494
+
495
+ <script>
// Page-wide state shared by the handlers below.
let currentFormat = ""; // conversion target passed to showUploadSection()
let selectedFile = null; // File picked through the input or dropped on the panel
let serverOnline = false; // result of the most recent /health probe

// Probe the backend as soon as the DOM is ready, then again every 30 seconds.
document.addEventListener("DOMContentLoaded", () => {
  checkServerStatus();
  setInterval(checkServerStatus, 30000);
});
506
+
/**
 * Probe the backend's /health endpoint and update the status badge.
 *
 * The request is aborted after 5 seconds. An AbortController plus a timer is
 * used instead of AbortSignal.timeout() because the latter is missing in older
 * browsers, where calling it would throw and mark the server offline forever.
 */
async function checkServerStatus() {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), 5000);

  try {
    const response = await fetch("/health", {
      method: "GET",
      mode: "cors",
      headers: {
        Accept: "application/json",
      },
      signal: controller.signal,
    });

    if (response.ok) {
      const data = await response.json();
      updateServerStatus(true, data.message || "Server is online");
    } else {
      updateServerStatus(false, `Server returned ${response.status}`);
    }
  } catch (error) {
    // An abort here can only come from our own 5 second timer.
    const reason =
      error.name === "AbortError" ? "Health check timed out" : error.message;
    updateServerStatus(false, reason);
  } finally {
    clearTimeout(timeoutId);
  }
}
528
+
/**
 * Record the server state and reflect it in the status badge.
 *
 * @param {boolean} online  whether the last health probe succeeded
 * @param {string} message  detail shown as the badge tooltip
 */
function updateServerStatus(online, message) {
  serverOnline = online;

  const badge = document.getElementById("status-indicator");
  badge.className = online
    ? "status-indicator status-online"
    : "status-indicator status-offline";
  badge.textContent = online ? "Server Online" : "Server Offline";
  badge.title = online ? message : `Error: ${message}`;
}
543
+
/**
 * Switch from the main menu to the upload panel for the chosen format.
 *
 * @param {string} format  conversion target: "html", "word", "json" or "excel"
 */
function showUploadSection(format) {
  if (!serverOnline) {
    alert("Server is offline. Please start the Flask server first.");
    return;
  }

  currentFormat = format;
  document.getElementById("main-menu").style.display = "none";
  document.getElementById("upload-section").classList.add("active");

  resetForm(); // always start from a clean form when entering the upload panel

  // The page currently has no #output-name input (the output name is derived
  // from the uploaded file name), so only set the placeholder when the field
  // exists; dereferencing the missing element threw a TypeError on every call.
  const outputInput = document.getElementById("output-name");
  if (outputInput) {
    const extensions = { html: ".html", word: ".docx", json: ".json", excel: ".xlsx" };
    outputInput.placeholder = `Enter output filename (e.g., converted_file${extensions[format]})`;
  }
}
560
+
561
+
/**
 * Return from the upload panel to the main menu and clear the form.
 *
 * The reset is done in place. A full page reload would discard the DOM updates
 * made here and put the page back into its initial "Server Offline" state
 * until the next health probe completes.
 */
function showMainMenu() {
  document.getElementById("main-menu").style.display = "block";
  document.getElementById("upload-section").classList.remove("active");
  resetForm(); // also clears selectedFile
}
570
+
571
+
/**
 * Restore the upload panel to its initial state: no file selected, convert
 * button disabled, and the result, loading, progress and debug areas hidden.
 */
function resetForm() {
  selectedFile = null;

  // Clear the native file input so picking the same file again fires "change".
  document.getElementById("pdf-file").value = "";

  // Hide and empty the "Selected: ..." line.
  const fileName = document.getElementById("file-name");
  fileName.style.display = "none";
  fileName.textContent = "";

  // Restore the default picker label.
  document.getElementById("file-label").textContent =
    "📄 Choose PDF File or Drag & Drop Here";

  document.getElementById("convert-btn").disabled = true;
  for (const id of ["result-section", "loading", "progress-bar", "debug-info"]) {
    document.getElementById(id).style.display = "none";
  }

  // Drop the drag highlight in case a drag was abandoned over the panel.
  document.getElementById("upload-section").classList.remove("drag-over");
}
601
+
602
+
/**
 * Handle a change of the file input: accept a PDF, reject anything else.
 *
 * @param {Event} event  "change" event from the #pdf-file input
 */
function handleFileSelect(event) {
  const file = event.target.files[0];

  // The dialog was cancelled: clear the selection without an error alert.
  if (!file) {
    resetFileInput();
    return;
  }

  // Some platforms report an empty MIME type, so fall back to the extension.
  const isPdf = file.type === "application/pdf" || /\.pdf$/i.test(file.name);
  if (!isPdf) {
    alert("Please select a valid PDF file.");
    resetFileInput();
    return;
  }

  selectedFile = file;
  const sizeMb = (file.size / 1024 / 1024).toFixed(2);
  const fileName = document.getElementById("file-name");
  fileName.textContent = `Selected: ${file.name} (${sizeMb} MB)`;
  fileName.style.display = "block";
  document.getElementById("file-label").textContent = `✅ ${file.name} selected`;
  checkFormValidity();
}
620
+
/**
 * Forget the current file: clear the input, hide the "Selected: ..." line,
 * restore the default label and re-evaluate the convert button.
 */
function resetFileInput() {
  selectedFile = null;
  document.getElementById("pdf-file").value = "";
  document.getElementById("file-name").style.display = "none";
  document.getElementById("file-label").textContent =
    "📄 Choose PDF File or Drag & Drop Here";
  checkFormValidity();
}
629
+
/**
 * Enable the convert button only when a file is selected, the server is
 * online and, if an #output-name field is present, it is not blank.
 *
 * NOTE: a second `checkFormValidity` is declared further down this script;
 * being the later function declaration, it is the one that runs.
 */
function checkFormValidity() {
  const outputInput = document.getElementById("output-name");
  // The markup has no #output-name field; treat a missing field as satisfied
  // instead of dereferencing null.
  const hasOutputName = !outputInput || outputInput.value.trim() !== "";
  const convertBtn = document.getElementById("convert-btn");

  convertBtn.disabled = !(selectedFile && hasOutputName && serverOnline);
  convertBtn.textContent = serverOnline
    ? "🚀 Start Conversion"
    : "❌ Server Offline";
}

// Re-validate while the output name is edited. The field is optional: calling
// addEventListener on the missing element threw at load time and aborted the
// rest of this script, so the drag-and-drop listeners were never registered.
const outputNameInput = document.getElementById("output-name");
if (outputNameInput) {
  outputNameInput.addEventListener("input", checkFormValidity);
}
649
+
// Drag-and-drop support for the upload panel.
const uploadSection = document.getElementById("upload-section");

// Stop the browser from handling the drag itself and keep the event from
// bubbling further up.
function preventDefaults(e) {
  e.preventDefault();
  e.stopPropagation();
}

// Show the drop-target styling while a drag is over the panel.
function highlight() {
  uploadSection.classList.add("drag-over");
}

// Remove the drop-target styling again.
function unhighlight() {
  uploadSection.classList.remove("drag-over");
}

for (const eventName of ["dragenter", "dragover", "dragleave", "drop"]) {
  uploadSection.addEventListener(eventName, preventDefaults, false);
}

for (const eventName of ["dragenter", "dragover"]) {
  uploadSection.addEventListener(eventName, highlight, false);
}

for (const eventName of ["dragleave", "drop"]) {
  uploadSection.addEventListener(eventName, unhighlight, false);
}

uploadSection.addEventListener("drop", handleDrop, false);
679
+
/**
 * Handle a file dropped on the upload panel. Only the first dropped file is
 * considered, and it must be a PDF.
 *
 * @param {DragEvent} e  "drop" event from the upload panel
 */
function handleDrop(e) {
  const files = e.dataTransfer.files;
  if (files.length === 0) {
    return;
  }

  const file = files[0];
  // Some platforms report an empty MIME type, so fall back to the extension.
  const isPdf = file.type === "application/pdf" || /\.pdf$/i.test(file.name);
  if (!isPdf) {
    alert("Please drop a valid PDF file.");
    return;
  }

  selectedFile = file;
  const sizeMb = (file.size / 1024 / 1024).toFixed(2);
  const fileName = document.getElementById("file-name");
  fileName.textContent = `Selected: ${file.name} (${sizeMb} MB)`;
  fileName.style.display = "block";
  document.getElementById("file-label").textContent = `✅ ${file.name} selected`;
  checkFormValidity();
}
701
+
/**
 * Enable the convert button only when a file is selected and the server is
 * online, and label the button according to the server state.
 */
function checkFormValidity() {
  const convertBtn = document.getElementById("convert-btn");

  convertBtn.disabled = !(selectedFile && serverOnline);
  convertBtn.textContent = serverOnline
    ? "🚀 Start Conversion"
    : "❌ Server Offline";
}
714
+
// Escape a value for safe interpolation into an innerHTML template.
function escapeHtml(value) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  return String(value).replace(/[&<>"']/g, (ch) => entities[ch]);
}

/**
 * Upload the selected PDF to /convert and show the outcome.
 *
 * Sends the file, the chosen format and an output name (the file name without
 * its extension) as multipart form data. On success the result panel shows a
 * download link and the download is started automatically; on failure the
 * error and a debug panel are shown. All server- or user-supplied text is
 * HTML-escaped before it is inserted into the page.
 */
async function convertFile() {
  if (!selectedFile || !currentFormat) {
    alert("Please select a file and format.");
    return;
  }

  if (!serverOnline) {
    alert("Server is offline. Please start the Flask server first.");
    return;
  }

  const CONVERT_TIMEOUT_MS = 420000; // 7 minutes
  const outputName = selectedFile.name.replace(/\.[^/.]+$/, "");

  const loading = document.getElementById("loading");
  const progressBar = document.getElementById("progress-bar");
  const resultSection = document.getElementById("result-section");
  const resultMessage = document.getElementById("result-message");
  const debugElement = document.getElementById("debug-info");

  loading.style.display = "block";
  progressBar.style.display = "block";
  document.getElementById("convert-btn").disabled = true;
  resultSection.style.display = "none";
  debugElement.style.display = "none";

  simulateProgress();

  const formData = new FormData();
  formData.append("file", selectedFile);
  formData.append("format", currentFormat);
  formData.append("output_name", outputName);

  const debugInfo = {
    fileName: selectedFile.name,
    fileSize: selectedFile.size,
    format: currentFormat,
    outputName: outputName,
    timestamp: new Date().toISOString(),
  };

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), CONVERT_TIMEOUT_MS);

  try {
    console.log("🔄 Starting conversion...", debugInfo);

    const response = await fetch("/convert", {
      method: "POST",
      body: formData,
      headers: {
        Accept: "application/json",
      },
      mode: "cors",
      signal: controller.signal,
    });

    console.log("📡 Response status:", response.status);

    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`Server returned ${response.status}: ${errorText}`);
    }

    const result = await response.json();
    console.log("✅ Conversion result:", result);

    if (result.success) {
      resultSection.className = "result-section result-success";
      resultMessage.innerHTML = `<h3>✅ Conversion Successful!</h3>
        <p>Your PDF has been converted to ${escapeHtml(currentFormat.toUpperCase())} format.</p>
        <p><strong>Output file:</strong> ${escapeHtml(
          result.output_path || "Generated successfully"
        )}</p>`;

      if (result.download_url) {
        const downloadUrl = `${window.location.origin}${result.download_url}`;

        // Visible link for the user.
        resultMessage.innerHTML += `<p><a href="${escapeHtml(
          downloadUrl
        )}" target="_blank" rel="noopener" style="color: #155724; text-decoration: none; font-weight: bold;">📥 Download File</a></p>`;

        // Start the download automatically through a temporary anchor.
        const a = document.createElement("a");
        a.href = downloadUrl;
        a.download = result.output_path || "converted_file";
        document.body.appendChild(a);
        a.click();
        document.body.removeChild(a);
      }
    } else {
      resultSection.className = "result-section result-error";
      resultMessage.innerHTML = `
        <h3>❌ Conversion Failed</h3>
        <p>${escapeHtml(result.error || "An unexpected error occurred.")}</p>
      `;
    }
  } catch (error) {
    console.error("❌ Error during conversion:", error);

    resultSection.className = "result-section result-error";

    let errorMessage;
    if (error.name === "AbortError") {
      errorMessage =
        "Request timed out. The file might be too large or the server is taking too long to respond.";
    } else if (error instanceof TypeError) {
      // fetch() rejects with a TypeError when the request cannot reach the server.
      errorMessage = `Cannot connect to server. Please ensure the Flask server is running on ${window.location.origin}`;
    } else {
      errorMessage = error.message;
    }

    resultMessage.innerHTML = `
      <h3>❌ Conversion Error</h3>
      <p>${escapeHtml(errorMessage)}</p>
    `;

    // Show debug information.
    document.getElementById("debug-content").innerHTML = `
      <div class="error-details">
        <strong>Error Details:</strong><br>
        Type: ${escapeHtml(error.name)}<br>
        Message: ${escapeHtml(error.message)}<br>
        <br>
        <strong>Request Details:</strong><br>
        ${escapeHtml(JSON.stringify(debugInfo, null, 2))}
        <br>
        <strong>Troubleshooting:</strong><br>
        1. Ensure Flask server is running: python app.py<br>
        2. Check server logs for errors<br>
        3. Verify file size is under 100MB<br>
        4. Check browser console for additional errors
      </div>
    `;
    debugElement.style.display = "block";
  } finally {
    // Runs on success and failure: stop the timer and swap the loading
    // indicators for the result panel.
    clearTimeout(timeoutId);
    loading.style.display = "none";
    progressBar.style.display = "none";
    resultSection.style.display = "block";
  }

  checkFormValidity(); // re-enable the button according to the current state
}
873
+
/**
 * Animate the progress bar with a simulated value that creeps up to 90%.
 *
 * The bar is reset at the start of each run rather than by a fixed 5 second
 * timer, so it no longer snaps back to empty in the middle of a long
 * conversion. Any animation still running from a previous call is stopped.
 */
function simulateProgress() {
  const progressFill = document.getElementById("progress-fill");

  // Cancel a previous run so two intervals never drive the same bar.
  if (simulateProgress.timerId) {
    clearInterval(simulateProgress.timerId);
  }

  let progress = 0;
  progressFill.style.width = "0%";

  const timerId = setInterval(() => {
    // Random increments, capped at 90%: the real completion is unknown here.
    progress = Math.min(progress + Math.random() * 15, 90);
    progressFill.style.width = progress + "%";

    if (progress >= 90) {
      clearInterval(timerId);
    }
  }, 200);

  simulateProgress.timerId = timerId;
}
894
+ </script>
895
+ </body>
896
+ </html>