Upload 8 files
- app.py +180 -0
- dockerfile +53 -0
- pdf_excel.py +737 -0
- pdf_html.py +636 -0
- pdf_json.py +513 -0
- pdf_word.py +559 -0
- requirements.txt +12 -0
- static/index.html +896 -0
app.py
ADDED
@@ -0,0 +1,180 @@
from flask import Flask, request, jsonify, send_file, send_from_directory
from flask_cors import CORS
from werkzeug.utils import secure_filename
import os
import traceback
from pdf_html import PDFToHTMLConverter
from pdf_word import PDFToWordConverter
from pdf_json import PDFToJSONConverter
from pdf_excel import PDFToExcelConverter

app = Flask(__name__, static_folder='static')
CORS(app)

# Configure file size limits and folders
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024  # 100 MB limit
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['OUTPUT_FOLDER'] = 'outputs'
app.config['SECRET_KEY'] = 'your-secret-key-here'  # IMPORTANT: Change this in production!

# Create necessary directories if they don't exist
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)

# Placeholder for Hugging Face API Token
HF_TOKEN = "Api_token"  # Replace with your actual token

# Define allowed file extensions for uploads
ALLOWED_EXTENSIONS = {'pdf'}

def allowed_file(filename):
    """Checks if the uploaded file has an allowed extension."""
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

@app.route('/')
def serve_index():
    """Serves the main index.html file."""
    return send_from_directory('static', 'index.html')

@app.route('/<path:filename>')
def serve_static(filename):
    """Serves other static files (CSS, JS, etc.)."""
    return send_from_directory('static', filename)

@app.route('/convert', methods=['POST'])
def convert_pdf():
    """
    Handles PDF conversion requests.
    Expects a 'file' (PDF), 'format' (html, word, json, excel), and 'output_name'.
    """
    try:
        # Check if a file was included in the request
        if 'file' not in request.files:
            return jsonify({'success': False, 'error': 'No file uploaded.'}), 400

        file = request.files['file']
        format_type = request.form.get('format')
        output_name = request.form.get('output_name', 'converted_file')

        # Validate file and format
        if file.filename == '':
            return jsonify({'success': False, 'error': 'No file selected.'}), 400

        if not format_type or format_type not in ['html', 'word', 'json', 'excel']:
            return jsonify({'success': False, 'error': 'Invalid format specified. Must be html, word, json, or excel.'}), 400

        if not allowed_file(file.filename):
            return jsonify({'success': False, 'error': 'Only PDF files are allowed.'}), 400

        # Securely save the uploaded file
        filename_secured = secure_filename(file.filename)
        input_path = os.path.join(app.config['UPLOAD_FOLDER'], filename_secured)
        file.save(input_path)

        # Define output file extensions based on format
        extensions = {
            'html': '.html',
            'word': '.docx',
            'json': '.json',
            'excel': '.xlsx'
        }
        output_filename = f"{output_name.replace('.', '')}{extensions.get(format_type, '.out')}"
        output_path = os.path.join(app.config['OUTPUT_FOLDER'], output_filename)

        success_message = ""

        try:
            # Perform conversion based on the requested format
            if format_type == 'html':
                converter = PDFToHTMLConverter(huggingface_token=HF_TOKEN)
                try:
                    # First try with HF models
                    converter.process_pdf(pdf_path=input_path, output_path=output_path, use_hf_models=True)
                except AttributeError as ae:
                    if '_group_overlapping_text' in str(ae):
                        # Fall back to non-HF mode if the method is missing
                        converter.process_pdf(pdf_path=input_path, output_path=output_path, use_hf_models=False)
                    else:
                        raise
                success_message = "Successfully converted to HTML!"
            elif format_type == 'word':
                converter = PDFToWordConverter(huggingface_token=HF_TOKEN)
                converter.process_pdf_to_word(pdf_path=input_path, output_path=output_path, use_hf_models=False)
                success_message = "Successfully converted to Word!"
            elif format_type == 'json':
                converter = PDFToJSONConverter(huggingface_token=HF_TOKEN)
                converter.process_pdf_to_json(pdf_path=input_path, output_path=output_path, use_hf_models=False)
                success_message = "Successfully converted to JSON!"
            elif format_type == 'excel':
                # PDFToExcelConverter.__init__ takes no token, and its
                # process_pdf_to_excel has no use_hf_models flag (see pdf_excel.py)
                converter = PDFToExcelConverter()
                converter.process_pdf_to_excel(pdf_path=input_path, output_path=output_path)
                success_message = "Successfully converted to Excel!"
        except Exception as conv_e:
            # Clean up the output file if conversion failed
            if os.path.exists(output_path):
                try:
                    os.remove(output_path)
                except Exception as e:
                    print(f"Warning: Could not remove output file {output_path}: {e}")
            raise conv_e

        # Clean up the uploaded input file
        try:
            os.remove(input_path)
        except Exception as e:
            print(f"Warning: Could not remove input file {input_path}: {e}")

        # Return success response with download URL
        return jsonify({
            'success': True,
            'message': success_message,
            'download_url': f'/download/{output_filename}'
        }), 200

    except Exception as e:
        # Clean up input file in case of error
        if 'input_path' in locals() and os.path.exists(input_path):
            try:
                os.remove(input_path)
            except Exception as cleanup_e:
                print(f"Error during error cleanup for {input_path}: {cleanup_e}")

        traceback.print_exc()
        error_msg = str(e)
        if '_group_overlapping_text' in error_msg:
            error_msg = "HTML conversion failed due to incompatible converter version. Please try another format."
        return jsonify({
            'success': False,
            'error': f'Conversion failed: {error_msg}'
        }), 500

@app.route('/download/<filename>')
def download_file(filename):
    """Allows downloading of converted files."""
    try:
        file_path = os.path.join(app.config['OUTPUT_FOLDER'], filename)
        if os.path.exists(file_path):
            return send_from_directory(app.config['OUTPUT_FOLDER'], filename, as_attachment=True)
        return jsonify({'error': 'File not found.'}), 404
    except Exception as e:
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500

@app.route('/health')
def health_check():
    """Simple health check endpoint."""
    return jsonify({'status': 'healthy', 'message': 'PDF Converter API is running.'}), 200

@app.errorhandler(413)
def too_large(e):
    """Handles file too large errors."""
    return jsonify({'success': False, 'error': 'File too large. Maximum size is 100MB.'}), 413

@app.errorhandler(500)
def internal_error(e):
    """Handles general internal server errors."""
    traceback.print_exc()
    return jsonify({'success': False, 'error': 'Internal server error occurred.'}), 500

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=5000)
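For reference, a minimal client sketch for the /convert endpoint above, assuming the dev server started by `python app.py` on port 5000 (the Docker image below serves on 7860 instead). The form fields ('file', 'format', 'output_name') and the 'download_url' key in the JSON response come from the handler; the file names used here are placeholders:

import requests

BASE = "http://localhost:5000"  # assumption: local run via `python app.py`; use 7860 for the Docker image

# Upload a PDF and request Excel output
with open("sample.pdf", "rb") as f:  # hypothetical input file
    resp = requests.post(
        f"{BASE}/convert",
        files={"file": ("sample.pdf", f, "application/pdf")},
        data={"format": "excel", "output_name": "sample_converted"},
    )

payload = resp.json()
if payload.get("success"):
    # The handler responds with a relative URL like /download/<filename>
    download = requests.get(BASE + payload["download_url"])
    with open("sample_converted.xlsx", "wb") as out:
        out.write(download.content)
else:
    print("Conversion failed:", payload.get("error"))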
dockerfile
ADDED
@@ -0,0 +1,53 @@
# Use a minimal Python base image
FROM python:3.10-slim

# Install system dependencies including libcrypt and additional libraries for PyMuPDF
RUN apt-get update && \
    apt-get install -y \
    libcrypt1 \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgomp1 \
    poppler-utils \
    build-essential \
    libfontconfig1 \
    libxrender1 \
    libxtst6 \
    libxi6 \
    libfreetype6-dev \
    libjpeg-dev \
    libopenjp2-7-dev \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy requirements first for better caching
COPY requirements.txt .

# Upgrade pip before installing Python dependencies
RUN pip install --upgrade pip

# Install PyMuPDF first to check for issues early
RUN pip install --no-cache-dir PyMuPDF==1.23.0

# Test PyMuPDF import
RUN python -c "import fitz; print('PyMuPDF imported successfully')"

# Install remaining dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Verify python-docx installation
RUN python -c "from docx import Document; print('python-docx installed successfully')"

# Copy source code to container
COPY . .

# Expose the port the app will run on (important for Hugging Face)
EXPOSE 7860

# Run the Flask app under gunicorn
CMD ["gunicorn", "--bind", "0.0.0.0:7860", "app:app"]
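To try the image outside of Spaces, the usual `docker build -t pdf-converter .` followed by `docker run -p 7860:7860 pdf-converter` should serve the app on http://localhost:7860; on Hugging Face Spaces the Docker runtime builds and starts the container automatically. Note that the CMD line assumes gunicorn is listed in requirements.txt (not shown in this section).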
pdf_excel.py
ADDED
@@ -0,0 +1,737 @@
import os
import pandas as pd
import fitz  # PyMuPDF
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
from dataclasses import dataclass
from typing import List, Dict, Any, Tuple, Optional
import re
from pathlib import Path
import logging
from datetime import datetime
import numpy as np

# Optional imports with graceful fallback
try:
    import camelot  # For advanced table extraction
    CAMELOT_AVAILABLE = True
except ImportError:
    CAMELOT_AVAILABLE = False
    print("⚠️ Camelot not installed. Run: pip install camelot-py[cv]")

try:
    import tabula  # Alternative table extraction
    TABULA_AVAILABLE = True
except ImportError:
    TABULA_AVAILABLE = False
    print("⚠️ Tabula not installed. Run: pip install tabula-py")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

@dataclass
class TextBlock:
    text: str
    x: float
    y: float
    width: float
    height: float
    font_size: float
    font_name: str
    is_bold: bool = False
    is_italic: bool = False
    page_num: int = 1
    block_id: str = ""

@dataclass
class TableData:
    data: List[List[str]]
    bbox: Tuple[float, float, float, float]
    page_num: int
    confidence: float = 0.0
    has_header: bool = True

class PDFToExcelConverter:
    """
    Enhanced PDF to Excel converter with multiple extraction methods
    for better accuracy and handling of complex documents.
    """

    def __init__(self):
        # Check available extraction methods
        available_methods = ['pymupdf']  # Always available
        if CAMELOT_AVAILABLE:
            available_methods.append('camelot')
        if TABULA_AVAILABLE:
            available_methods.append('tabula')

        self.extraction_methods = available_methods
        self.output_formats = {
            'separate_sheets': 'Each table and text section on separate sheets',
            'combined': 'All content combined logically',
            'structured': 'Maintain document structure with proper formatting'
        }

        # Log available methods
        logger.info(f"Available extraction methods: {', '.join(available_methods)}")

    def extract_text_blocks_advanced(self, page, page_num: int) -> List[TextBlock]:
        """
        Advanced text extraction with better formatting detection
        """
        text_blocks = []

        try:
            # Method 1: Dictionary-based extraction (most detailed)
            page_dict = page.get_text("dict")

            for block_idx, block in enumerate(page_dict.get("blocks", [])):
                if block.get("type", 1) != 0:  # Skip non-text blocks
                    continue

                for line_idx, line in enumerate(block.get("lines", [])):
                    for span_idx, span in enumerate(line.get("spans", [])):
                        text_content = span.get("text", "").strip()
                        if not text_content:
                            continue

                        bbox = span["bbox"]
                        flags = span.get("flags", 0)

                        # Enhanced font detection
                        font_name = span.get("font", "Arial")
                        font_size = span.get("size", 12)
                        is_bold = bool(flags & 16) or "bold" in font_name.lower()
                        is_italic = bool(flags & 2) or "italic" in font_name.lower()

                        text_block = TextBlock(
                            text=text_content,
                            x=bbox[0], y=bbox[1],
                            width=bbox[2] - bbox[0],
                            height=bbox[3] - bbox[1],
                            font_size=font_size,
                            font_name=font_name,
                            is_bold=is_bold,
                            is_italic=is_italic,
                            page_num=page_num,
                            block_id=f"p{page_num}_b{block_idx}_l{line_idx}_s{span_idx}"
                        )
                        text_blocks.append(text_block)

        except Exception as e:
            logger.warning(f"Advanced text extraction failed for page {page_num}: {e}")
            # Fallback to simple extraction
            text_blocks = self._extract_text_simple_fallback(page, page_num)

        return text_blocks

    def _extract_text_simple_fallback(self, page, page_num: int) -> List[TextBlock]:
        """
        Fallback text extraction method
        """
        text_blocks = []
        try:
            text = page.get_text()
            if text.strip():
                # Create a single text block for the entire page content
                rect = page.rect
                text_block = TextBlock(
                    text=text.strip(),
                    x=0, y=0,
                    width=rect.width,
                    height=rect.height,
                    font_size=12,
                    font_name="Arial",
                    page_num=page_num,
                    block_id=f"p{page_num}_fallback"
                )
                text_blocks.append(text_block)
        except Exception as e:
            logger.error(f"Fallback text extraction failed for page {page_num}: {e}")

        return text_blocks

    def extract_tables_multiple_methods(self, pdf_path: str, page_num: int) -> List[TableData]:
        """
        Extract tables using multiple methods and combine results
        """
        all_tables = []

        # Method 1: PyMuPDF built-in table detection
        tables_pymupdf = self._extract_tables_pymupdf(pdf_path, page_num)
        all_tables.extend(tables_pymupdf)

        # Method 2: Camelot (if available)
        if CAMELOT_AVAILABLE:
            try:
                tables_camelot = self._extract_tables_camelot(pdf_path, page_num)
                all_tables.extend(tables_camelot)
            except Exception as e:
                logger.warning(f"Camelot extraction failed: {e}")

        # Method 3: Tabula (if available)
        if TABULA_AVAILABLE:
            try:
                tables_tabula = self._extract_tables_tabula(pdf_path, page_num)
                all_tables.extend(tables_tabula)
            except Exception as e:
                logger.warning(f"Tabula extraction failed: {e}")

        # Remove duplicates and return best tables
        return self._deduplicate_tables(all_tables)

    def _extract_tables_pymupdf(self, pdf_path: str, page_num: int) -> List[TableData]:
        """
        Extract tables using PyMuPDF
        """
        tables = []
        try:
            doc = fitz.open(pdf_path)
            page = doc[page_num - 1]  # Convert to 0-based index

            detected_tables = page.find_tables()
            for i, table in enumerate(detected_tables):
                try:
                    table_data = table.extract()
                    if table_data and len(table_data) > 0:
                        # Clean the table data
                        cleaned_data = []
                        for row in table_data:
                            cleaned_row = []
                            for cell in row:
                                cell_text = str(cell).strip() if cell else ""
                                cleaned_row.append(cell_text)
                            if any(cleaned_row):  # Only add non-empty rows
                                cleaned_data.append(cleaned_row)

                        if cleaned_data:
                            tables.append(TableData(
                                data=cleaned_data,
                                bbox=table.bbox,
                                page_num=page_num,
                                confidence=0.8,  # PyMuPDF generally reliable
                                has_header=True
                            ))
                except Exception as e:
                    logger.warning(f"Error extracting PyMuPDF table {i}: {e}")

            doc.close()
        except Exception as e:
            logger.error(f"PyMuPDF table extraction failed: {e}")

        return tables

    def _extract_tables_camelot(self, pdf_path: str, page_num: int) -> List[TableData]:
        """
        Extract tables using Camelot (only if available)
        """
        if not CAMELOT_AVAILABLE:
            return []

        tables = []
        try:
            # Camelot works with page numbers (1-based)
            camelot_tables = camelot.read_pdf(pdf_path, pages=str(page_num), flavor='lattice')

            for i, table in enumerate(camelot_tables):
                df = table.df
                if not df.empty:
                    # Convert DataFrame to list of lists
                    table_data = df.values.tolist()
                    # Add headers if they exist
                    if not df.columns.empty:
                        headers = df.columns.tolist()
                        table_data.insert(0, headers)

                    tables.append(TableData(
                        data=table_data,
                        bbox=(0, 0, 100, 100),  # Camelot doesn't provide bbox
                        page_num=page_num,
                        confidence=table.accuracy / 100.0 if hasattr(table, 'accuracy') else 0.7,
                        has_header=True
                    ))

        except Exception as e:
            logger.warning(f"Camelot extraction failed: {e}")

        return tables

    def _extract_tables_tabula(self, pdf_path: str, page_num: int) -> List[TableData]:
        """
        Extract tables using Tabula (only if available)
        """
        if not TABULA_AVAILABLE:
            return []

        tables = []
        try:
            # Tabula works with page numbers (1-based)
            tabula_tables = tabula.read_pdf(pdf_path, pages=page_num, multiple_tables=True)

            for i, df in enumerate(tabula_tables):
                if not df.empty:
                    # Convert DataFrame to list of lists
                    table_data = df.fillna('').values.tolist()
                    # Add headers
                    headers = df.columns.tolist()
                    table_data.insert(0, headers)

                    tables.append(TableData(
                        data=table_data,
                        bbox=(0, 0, 100, 100),  # Tabula doesn't provide bbox
                        page_num=page_num,
                        confidence=0.7,
                        has_header=True
                    ))

        except Exception as e:
            logger.warning(f"Tabula extraction failed: {e}")

        return tables

    def _deduplicate_tables(self, tables: List[TableData]) -> List[TableData]:
        """
        Remove duplicate tables by comparing content
        """
        if not tables:
            return tables

        unique_tables = []
        for table in tables:
            is_duplicate = False
            for existing_table in unique_tables:
                if self._tables_are_similar(table, existing_table):
                    # Keep the one with higher confidence
                    if table.confidence > existing_table.confidence:
                        unique_tables.remove(existing_table)
                        unique_tables.append(table)
                    is_duplicate = True
                    break

            if not is_duplicate:
                unique_tables.append(table)

        return unique_tables

    def _tables_are_similar(self, table1: TableData, table2: TableData, threshold: float = 0.8) -> bool:
        """
        Check if two tables are similar (likely duplicates)
        """
        if len(table1.data) != len(table2.data):
            return False

        if not table1.data or not table2.data:
            return False

        # Compare dimensions
        if len(table1.data[0]) != len(table2.data[0]):
            return False

        # Compare content similarity
        matching_cells = 0
        total_cells = len(table1.data) * len(table1.data[0])

        for i, (row1, row2) in enumerate(zip(table1.data, table2.data)):
            for j, (cell1, cell2) in enumerate(zip(row1, row2)):
                if str(cell1).strip().lower() == str(cell2).strip().lower():
                    matching_cells += 1

        similarity = matching_cells / total_cells if total_cells > 0 else 0
        return similarity >= threshold

    def process_pdf_to_excel(self, pdf_path: str, output_path: str, format_type: str = 'structured') -> str:
        """
        Convert PDF to Excel with enhanced processing
        """
        logger.info(f"Starting PDF to Excel conversion: {pdf_path}")

        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # Extract content from PDF
        pdf_content = self._extract_comprehensive_content(pdf_path)

        # Create Excel workbook
        output_path = self._create_excel_workbook(pdf_content, output_path, format_type)

        logger.info(f"Successfully converted PDF to Excel: {output_path}")
        return output_path

    def _extract_comprehensive_content(self, pdf_path: str) -> Dict[str, Any]:
        """
        Extract all content from PDF using multiple methods
        """
        content = {
            'pages': [],
            'total_pages': 0,
            'metadata': {}
        }

        try:
            doc = fitz.open(pdf_path)
            content['total_pages'] = doc.page_count
            content['metadata'] = doc.metadata

            logger.info(f"Processing {doc.page_count} pages...")

            for page_num in range(doc.page_count):
                page = doc[page_num]
                logger.info(f"Processing page {page_num + 1}/{doc.page_count}")

                # Extract text blocks
                text_blocks = self.extract_text_blocks_advanced(page, page_num + 1)

                # Extract tables using multiple methods
                tables = self.extract_tables_multiple_methods(pdf_path, page_num + 1)

                # Extract images (basic)
                images = self._extract_images_basic(page, page_num + 1)

                page_content = {
                    'page_number': page_num + 1,
                    'text_blocks': text_blocks,
                    'tables': tables,
                    'images': images,
                    'page_width': page.rect.width,
                    'page_height': page.rect.height
                }

                content['pages'].append(page_content)

            doc.close()

        except Exception as e:
            logger.error(f"Error extracting PDF content: {e}")
            raise

        return content

    def _extract_images_basic(self, page, page_num: int) -> List[Dict]:
        """
        Basic image extraction for reference
        """
        images = []
        try:
            image_list = page.get_images()
            for i, img in enumerate(image_list):
                images.append({
                    'index': i,
                    'page': page_num,
                    'bbox': img  # Simplified
                })
        except Exception as e:
            logger.warning(f"Image extraction failed for page {page_num}: {e}")

        return images

    def _create_excel_workbook(self, content: Dict[str, Any], output_path: str, format_type: str) -> str:
        """
        Create Excel workbook with proper formatting
        """
        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:

            if format_type == 'structured':
                self._create_structured_workbook(content, writer)
            elif format_type == 'combined':
                self._create_combined_workbook(content, writer)
            else:  # separate_sheets
                self._create_separate_sheets_workbook(content, writer)

            # Add summary sheet
            self._add_summary_sheet(content, writer)

        # Apply formatting (after the writer has closed and saved the file)
        self._apply_excel_formatting(output_path)

        return output_path

    def _create_structured_workbook(self, content: Dict[str, Any], writer):
        """
        Create structured workbook maintaining document flow
        """
        for page_data in content['pages']:
            page_num = page_data['page_number']

            # Process tables first
            table_count = 0
            for table in page_data['tables']:
                if table.data:
                    df = pd.DataFrame(table.data[1:], columns=table.data[0] if table.has_header else None)
                    sheet_name = f"P{page_num}_Table{table_count + 1}"[:31]
                    df.to_excel(writer, sheet_name=sheet_name, index=False)
                    table_count += 1

            # Process text content
            if page_data['text_blocks']:
                # Group text blocks by proximity and formatting
                text_groups = self._group_text_blocks(page_data['text_blocks'])

                for i, group in enumerate(text_groups):
                    if group['content'].strip():
                        text_df = pd.DataFrame([{
                            'Content': group['content'],
                            'Font_Size': group.get('font_size', 12),
                            'Is_Bold': group.get('is_bold', False),
                            'Position_X': group.get('x', 0),
                            'Position_Y': group.get('y', 0)
                        }])
                        sheet_name = f"P{page_num}_Text{i + 1}"[:31]
                        text_df.to_excel(writer, sheet_name=sheet_name, index=False)

    def _create_combined_workbook(self, content: Dict[str, Any], writer):
        """
        Create combined workbook with all tables and text together
        """
        all_tables = []
        all_text = []

        for page_data in content['pages']:
            page_num = page_data['page_number']

            # Collect all tables
            for i, table in enumerate(page_data['tables']):
                if table.data:
                    df = pd.DataFrame(table.data[1:], columns=table.data[0] if table.has_header else None)
                    df['Source_Page'] = page_num
                    df['Table_Index'] = i + 1
                    all_tables.append(df)

            # Collect all text
            text_content = '\n'.join([block.text for block in page_data['text_blocks']])
            if text_content.strip():
                all_text.append({
                    'Page': page_num,
                    'Content': text_content.strip()
                })

        # Write combined tables
        if all_tables:
            combined_tables = pd.concat(all_tables, ignore_index=True)
            combined_tables.to_excel(writer, sheet_name='All_Tables', index=False)

        # Write combined text
        if all_text:
            text_df = pd.DataFrame(all_text)
            text_df.to_excel(writer, sheet_name='All_Text', index=False)

    def _create_separate_sheets_workbook(self, content: Dict[str, Any], writer):
        """
        Create workbook with each element on separate sheets
        """
        table_counter = 1
        text_counter = 1

        for page_data in content['pages']:
            page_num = page_data['page_number']

            # Each table gets its own sheet
            for table in page_data['tables']:
                if table.data:
                    df = pd.DataFrame(table.data[1:], columns=table.data[0] if table.has_header else None)
                    sheet_name = f"Table_{table_counter}"[:31]
                    df.to_excel(writer, sheet_name=sheet_name, index=False)
                    table_counter += 1

            # Page text gets its own sheet
            if page_data['text_blocks']:
                text_content = '\n'.join([block.text for block in page_data['text_blocks']])
                if text_content.strip():
                    text_df = pd.DataFrame([{'Page': page_num, 'Content': text_content}])
                    sheet_name = f"Text_{text_counter}"[:31]
                    text_df.to_excel(writer, sheet_name=sheet_name, index=False)
                    text_counter += 1

    def _group_text_blocks(self, text_blocks: List[TextBlock]) -> List[Dict]:
        """
        Group text blocks by proximity and formatting
        """
        if not text_blocks:
            return []

        # Sort by position (top to bottom, left to right)
        sorted_blocks = sorted(text_blocks, key=lambda b: (b.y, b.x))

        groups = []
        current_group = {
            'content': '',
            'font_size': sorted_blocks[0].font_size,
            'is_bold': sorted_blocks[0].is_bold,
            'x': sorted_blocks[0].x,
            'y': sorted_blocks[0].y
        }

        for block in sorted_blocks:
            # Check if block should be in current group (similar formatting and position)
            if (abs(current_group['font_size'] - block.font_size) < 2 and
                    current_group['is_bold'] == block.is_bold):
                current_group['content'] += ' ' + block.text
            else:
                # Start new group
                if current_group['content'].strip():
                    groups.append(current_group)
                current_group = {
                    'content': block.text,
                    'font_size': block.font_size,
                    'is_bold': block.is_bold,
                    'x': block.x,
                    'y': block.y
                }

        # Add last group
        if current_group['content'].strip():
            groups.append(current_group)

        return groups

    def _add_summary_sheet(self, content: Dict[str, Any], writer):
        """
        Add summary sheet with document statistics
        """
        total_tables = sum(len(page['tables']) for page in content['pages'])
        total_text_blocks = sum(len(page['text_blocks']) for page in content['pages'])

        summary_data = {
            'Statistic': [
                'Total Pages',
                'Total Tables',
                'Total Text Blocks',
                'Processing Date',
                'Document Title'
            ],
            'Value': [
                content['total_pages'],
                total_tables,
                total_text_blocks,
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                content['metadata'].get('title', 'Unknown')
            ]
        }

        summary_df = pd.DataFrame(summary_data)
        summary_df.to_excel(writer, sheet_name='Summary', index=False)

    def _apply_excel_formatting(self, file_path: str):
        """
        Apply formatting to the Excel file
        """
        try:
            wb = openpyxl.load_workbook(file_path)

            # Define styles
            header_font = Font(bold=True, color="FFFFFF")
            header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
            border = Border(
                left=Side(style='thin'),
                right=Side(style='thin'),
                top=Side(style='thin'),
                bottom=Side(style='thin')
            )

            for sheet_name in wb.sheetnames:
                ws = wb[sheet_name]

                # Format headers
                if ws.max_row > 0:
                    for cell in ws[1]:
                        cell.font = header_font
                        cell.fill = header_fill
                        cell.alignment = Alignment(horizontal='center', vertical='center')
                        cell.border = border

                # Auto-adjust column widths
                for column in ws.columns:
                    max_length = 0
                    column_letter = column[0].column_letter

                    for cell in column:
                        try:
                            if len(str(cell.value)) > max_length:
                                max_length = len(str(cell.value))
                        except Exception:
                            pass

                    adjusted_width = min(max_length + 2, 50)
                    ws.column_dimensions[column_letter].width = adjusted_width

            wb.save(file_path)

        except Exception as e:
            logger.warning(f"Could not apply formatting: {e}")

# Usage example and main function
def install_dependencies():
    """
    Print installation instructions for missing dependencies
    """
    print("📦 INSTALLATION INSTRUCTIONS:")
    print("=" * 50)

    required_packages = [
        ("PyMuPDF", "pip install PyMuPDF", True),
        ("pandas", "pip install pandas", True),
        ("openpyxl", "pip install openpyxl", True),
        ("numpy", "pip install numpy", True),
        ("camelot-py", "pip install camelot-py[cv]", CAMELOT_AVAILABLE),
        ("tabula-py", "pip install tabula-py", TABULA_AVAILABLE)
    ]

    print("\n✅ CORE PACKAGES (Required):")
    for name, cmd, available in required_packages[:4]:
        status = "✅ Installed" if available else "❌ Missing"
        print(f"  {name}: {status}")
        if not available:
            print(f"    Install: {cmd}")

    print("\n🔧 OPTIONAL PACKAGES (For better table extraction):")
    for name, cmd, available in required_packages[4:]:
        status = "✅ Installed" if available else "❌ Missing"
        print(f"  {name}: {status}")
        if not available:
            print(f"    Install: {cmd}")

    print("\n💡 INSTALL ALL AT ONCE:")
    print("pip install PyMuPDF pandas openpyxl numpy camelot-py[cv] tabula-py")
    print("\n" + "=" * 50)

def main():
    """
    Main function to demonstrate usage
    """
    print("Enhanced PDF to Excel Converter")
    print("=" * 40)

    # Show installation status
    install_dependencies()

    converter = PDFToExcelConverter()

    # Example usage
    pdf_path = "input.pdf"  # Replace with your PDF path
    output_path = "output.xlsx"  # Replace with desired output path

    try:
        # Check if PDF file exists
        if not os.path.exists(pdf_path):
            print(f"\n❌ PDF file not found: {pdf_path}")
            print("Please update the 'pdf_path' variable with your actual PDF file path.")
            return

        print(f"\nConverting: {pdf_path}")
        result = converter.process_pdf_to_excel(
            pdf_path=pdf_path,
            output_path=output_path,
            format_type='structured'  # Options: 'structured', 'combined', 'separate_sheets'
        )
        print(f"✅ Conversion completed successfully: {result}")

    except Exception as e:
        print(f"❌ Conversion failed: {e}")
        print("\n🛠️ TROUBLESHOOTING:")
        print("1. Make sure all required packages are installed")
        print("2. Check that your PDF file exists and is readable")
        print("3. Ensure you have write permissions for the output directory")

if __name__ == "__main__":
    main()
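One caveat on the optional extractors above: camelot-py additionally needs Ghostscript (and OpenCV for the [cv] extra) at the system level, and tabula-py wraps the Java-based Tabula engine, so it requires a Java runtime. The dockerfile above installs neither. If the Python packages themselves are missing, the module degrades gracefully via the CAMELOT_AVAILABLE / TABULA_AVAILABLE flags; if the packages import but their system dependencies are absent, the per-method try/except blocks in extract_tables_multiple_methods catch the failure and PyMuPDF extraction is still used.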
pdf_html.py
ADDED
@@ -0,0 +1,636 @@
import os
import base64
import json
import requests
from typing import Dict, List, Any
import fitz  # PyMuPDF
from PIL import Image
import io
import re
from dataclasses import dataclass
from pathlib import Path
from datetime import datetime

@dataclass
class TextBlock:
    text: str
    x: float
    y: float
    width: float
    height: float
    font_size: float
    font_name: str
    is_bold: bool = False
    is_italic: bool = False
    block_id: str = ""

class PDFToHTMLConverter:
    def __init__(self, huggingface_token: str = None):
        self.hf_token = huggingface_token
        self.hf_headers = {
            "Authorization": f"Bearer {huggingface_token}" if huggingface_token else None
        }
        self.models = {
            "document_layout": "microsoft/layoutlm-base-uncased",
            "table_detection": "microsoft/table-transformer-detection",
            "ocr": "microsoft/trocr-base-printed",
            "math_detection": "facebook/detr-resnet-50"
        }
        self.hf_inference_url = "https://api-inference.huggingface.co/models"

    def pdf_to_base64(self, pdf_path: str) -> str:
        try:
            with open(pdf_path, "rb") as pdf_file:
                return base64.b64encode(pdf_file.read()).decode('utf-8')
        except Exception as e:
            raise Exception(f"Error converting PDF to base64: {str(e)}")

    def extract_pdf_content(self, pdf_path: str) -> Dict[str, Any]:
        doc = None
        try:
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")

            doc = fitz.open(pdf_path)

            if doc is None:
                raise RuntimeError("Failed to open PDF document")

            if doc.page_count == 0:
                raise ValueError("PDF document has no pages")

            print(f"PDF opened successfully: {doc.page_count} pages")

            pages_content = []

            for page_num in range(doc.page_count):
                try:
                    page = doc[page_num]
                    print(f"Processing page {page_num + 1}/{doc.page_count}")

                    text_blocks = []
                    try:
                        page_dict = page.get_text("dict")
                        text_blocks = self._extract_text_blocks_from_dict(page_dict, page_num)
                    except Exception as e:
                        print(f"⚠️ Dict method failed for page {page_num + 1}, falling back to simple text extraction: {e}")
                        text_blocks = self._extract_text_blocks_simple(page, page_num)

                    images = self._extract_images_safely(page, doc, page_num)
                    tables = self._detect_tables_safely(page)

                    page_rect = page.rect

                    pages_content.append({
                        "page_number": page_num + 1,
                        "text_blocks": text_blocks,
                        "images": images,
                        "tables": tables,
                        "page_width": page_rect.width,
                        "page_height": page_rect.height
                    })

                except Exception as e:
                    print(f"❌ Error processing page {page_num + 1}: {e}")
                    pages_content.append({
                        "page_number": page_num + 1,
                        "text_blocks": [],
                        "images": [],
                        "tables": [],
                        "page_width": 595,
                        "page_height": 842
                    })

            result = {
                "pages": pages_content,
                "total_pages": doc.page_count
            }
            return result

        except Exception as e:
            raise Exception(f"Error extracting PDF content: {str(e)}")
        finally:
            if doc is not None:
                try:
                    doc.close()
                    print("✅ PDF document closed successfully")
                except Exception as e:
                    print(f"⚠️ Error closing PDF document: {e}")

    def _extract_text_blocks_from_dict(self, page_dict: dict, page_num: int) -> List[TextBlock]:
        text_blocks = []

        for block_idx, block in enumerate(page_dict.get("blocks", [])):
            if "lines" not in block:
                continue

            for line_idx, line in enumerate(block["lines"]):
                for span_idx, span in enumerate(line["spans"]):
                    text_content = span.get("text", "").strip()
                    if text_content:
                        bbox = span["bbox"]
                        font_info = {
                            "size": span.get("size", 12),
                            "font": span.get("font", "Arial"),
                            "is_bold": "bold" in span.get("font", "").lower() or span.get("flags", 0) & 16,
                            "is_italic": "italic" in span.get("font", "").lower() or span.get("flags", 0) & 2
                        }

                        text_block = TextBlock(
                            text=text_content,
                            x=bbox[0],
                            y=bbox[1],
                            width=bbox[2] - bbox[0],
                            height=bbox[3] - bbox[1],
                            font_size=font_info["size"],
                            font_name=font_info["font"],
                            is_bold=font_info["is_bold"],
                            is_italic=font_info["is_italic"],
                            block_id=f"p{page_num}-b{block_idx}-l{line_idx}-s{span_idx}"
                        )
                        text_blocks.append(text_block)

        return text_blocks

    def _extract_text_blocks_simple(self, page, page_num: int) -> List[TextBlock]:
        text_blocks = []
        try:
            blocks_data = page.get_text("blocks")
            for block_idx, block in enumerate(blocks_data):
                if block[6] == 0:
                    text = block[4].strip()
                    if text:
                        x0, y0, x1, y1 = block[0], block[1], block[2], block[3]

                        lines = text.split('\n')
                        line_height = (y1 - y0) / max(len(lines), 1)

                        for line_idx, line in enumerate(lines):
                            if line.strip():
                                text_block = TextBlock(
                                    text=line.strip(),
                                    x=x0,
                                    y=y0 + (line_idx * line_height),
                                    width=x1 - x0,
                                    height=line_height,
                                    font_size=12,
                                    font_name="Arial",
                                    is_bold=False,
                                    is_italic=False,
                                    block_id=f"p{page_num}-simple-b{block_idx}-l{line_idx}"
                                )
                                text_blocks.append(text_block)
        except Exception as e:
            print(f"⚠️ Simple text block extraction failed: {e}")

        return text_blocks

    def _extract_images_safely(self, page, doc, page_num) -> List[Dict]:
        images = []
        try:
            image_list = page.get_images(full=True)
            for img_index, img_info in enumerate(image_list):
                try:
                    xref = img_info[0]

                    img_rects = [r for r in page.get_image_rects(xref)]
                    if not img_rects:
                        continue

                    bbox = img_rects[0]

                    pix = fitz.Pixmap(doc, xref)
                    if pix.n - pix.alpha < 4:
                        img_data = pix.tobytes("png")
                        img_base64 = base64.b64encode(img_data).decode()

                        images.append({
                            "index": img_index,
                            "data": img_base64,
                            "bbox": (bbox.x0, bbox.y0, bbox.x1, bbox.y1)
                        })
                    pix = None
                except Exception as e:
                    print(f"⚠️ Error extracting image {img_index} on page {page_num+1}: {e}")
                    continue
        except Exception as e:
            print(f"⚠️ General error in image extraction for page {page_num+1}: {e}")
        return images

    def _detect_tables_safely(self, page) -> List[Dict]:
        tables = []
        try:
            tabs = page.find_tables()
            for tab_index, tab in enumerate(tabs):
                try:
                    table_data = tab.extract()
                    if table_data:
                        cleaned_data = []
                        for row in table_data:
                            cleaned_row = [str(cell).strip() if cell else "" for cell in row]
                            if any(cleaned_row):
                                cleaned_data.append(cleaned_row)

                        if cleaned_data:
                            tables.append({
                                "bbox": (tab.bbox.x0, tab.bbox.y0, tab.bbox.x1, tab.bbox.y1),
                                "data": cleaned_data
                            })
                except Exception as e:
                    print(f"⚠️ Error extracting table {tab_index}: {e}")
                    continue
        except Exception as e:
            print(f"⚠️ General error in table detection: {e}")
        return tables

    def enhance_math_symbols(self, text: str) -> str:
        # Map math and Greek characters to their HTML entity equivalents
        math_replacements = {
            '±': '&plusmn;', '×': '&times;', '÷': '&divide;', '∑': '&sum;',
            '∏': '&prod;', '√': '&radic;', '∞': '&infin;', '∫': '&int;',
            '∂': '&part;', '∆': '&Delta;', '∇': '&nabla;', '∈': '&isin;',
            '∉': '&notin;', '⊂': '&sub;', '⊃': '&sup;', '⊆': '&sube;',
            '⊇': '&supe;', '∪': '&cup;', '∩': '&cap;', '≤': '&le;',
            '≥': '&ge;', '≠': '&ne;', '≡': '&equiv;', '≈': '&asymp;',
            '∝': '&prop;', '∴': '&there4;',
            'α': '&alpha;', 'β': '&beta;', 'γ': '&gamma;', 'δ': '&delta;',
            'ε': '&epsilon;', 'ζ': '&zeta;', 'η': '&eta;', 'θ': '&theta;',
            'ι': '&iota;', 'κ': '&kappa;', 'λ': '&lambda;', 'μ': '&mu;',
            'ν': '&nu;', 'ξ': '&xi;', 'π': '&pi;', 'ρ': '&rho;', 'σ': '&sigma;',
            'τ': '&tau;', 'υ': '&upsilon;', 'φ': '&phi;', 'χ': '&chi;',
            'ψ': '&psi;', 'ω': '&omega;',
            '½': '&frac12;', '⅓': '&frac13;', '¼': '&frac14;', '⅔': '&frac23;',
            '¾': '&frac34;', '⅛': '&frac18;', '²': '&sup2;', '³': '&sup3;',
            '¹': '&sup1;', '°': '&deg;'
        }

        for symbol, html_entity in math_replacements.items():
            text = text.replace(symbol, html_entity)

        return text

    def convert_to_html(self, pdf_content: Dict[str, Any], output_path: str = None) -> str:
        html_content = []
        html_content.append("""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>PDF Document</title>
    <style>
        * {
            box-sizing: border-box;
            margin: 0;
            padding: 0;
        }

        body {
            font-family: 'Times New Roman', Times, serif;
            background-color: #f5f5f5;
            padding: 20px;
            line-height: 1.2;
            color: #000000;
        }

        .document-container {
            max-width: 1200px;
            margin: 0 auto;
            background-color: white;
            box-shadow: 0 4px 12px rgba(0,0,0,0.1);
            border: 1px solid #ddd;
        }

        .page-wrapper {
            background-color: white;
            margin: 0;
            padding: 40px;
            border-bottom: 2px solid #000;
            position: relative;
            min-height: 800px;
            page-break-after: always;
            overflow: visible;
        }

        .page-header {
            background-color: #f8f8f8;
            padding: 10px 15px;
            margin: -40px -40px 30px -40px;
            border-bottom: 2px solid #000;
            font-weight: bold;
            color: #000;
            font-size: 14px;
            text-align: center;
        }

        .content-layer {
            position: relative;
            width: 100%;
            height: 100%;
        }

        .text-content {
            position: relative;
            z-index: 10;
            line-height: 1.4;
        }

        .text-block {
            margin-bottom: 8px;
            font-family: 'Times New Roman', Times, serif;
            color: #000;
            word-wrap: break-word;
            overflow-wrap: break-word;
        }

        .text-block.inline {
            display: inline;
            margin-bottom: 0;
            margin-right: 5px;
        }

        .text-group {
            margin-bottom: 12px;
            line-height: 1.3;
        }

        .bold {
            font-weight: bold;
        }

        .italic {
            font-style: italic;
        }

        .table-container {
            margin: 20px 0;
            background-color: white;
            overflow: auto;
            z-index: 20;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }

        .table {
            width: 100%;
            border-collapse: collapse;
            border: 2px solid #000;
            font-family: 'Times New Roman', Times, serif;
            font-size: 12px;
            color: #000;
            background-color: white;
            margin: 0;
        }

        .table td, .table th {
            border: 1px solid #000;
            padding: 8px 12px;
            text-align: left;
            vertical-align: top;
            background-color: white;
            font-family: 'Times New Roman', Times, serif;
            word-wrap: break-word;
            min-width: 60px;
        }

        .table th {
            background-color: #f0f0f0;
            font-weight: bold;
            text-align: center;
        }

        .table tr:nth-child(even) td {
            background-color: #f9f9f9;
        }

        .table tr:hover td {
            background-color: #f0f0f0;
        }

        .image-container {
            margin: 15px 0;
            border: 1px solid #ccc;
            background-color: white;
            text-align: center;
            overflow: hidden;
            z-index: 5;
        }

        .image {
            max-width: 100%;
            height: auto;
            display: block;
            margin: 0 auto;
        }

        .math-symbol {
            font-family: 'Times New Roman', serif;
        }

        .document-info {
            background-color: #f8f8f8;
            padding: 15px;
            border: 1px solid #ccc;
            margin-bottom: 20px;
            text-align: center;
            font-family: 'Times New Roman', Times, serif;
        }

        @media print {
            body {
                background-color: white;
                padding: 0;
            }
            .page-wrapper {
                border: none;
                box-shadow: none;
                margin: 0;
                page-break-after: always;
            }
            .document-info {
                display: none;
            }
            .table {
                border: 2px solid #000 !important;
            }
            .table td, .table th {
                border: 1px solid #000 !important;
            }
        }
    </style>
</head>
<body>
    <div class="document-container">""")

        html_content.append(f"""
        <div class="document-info">
            <h1>PDF Document Conversion</h1>
            <p><strong>Total Pages:</strong> {pdf_content.get('total_pages', 'Unknown')}</p>
            <p><strong>Converted on:</strong> {self._get_current_timestamp()}</p>
        </div>""")

        for page in pdf_content["pages"]:
            page_width = max(page["page_width"], 595)
            page_height = max(page["page_height"], 842)

            html_content.append(f"""
        <div class="page-wrapper">
            <div class="page-header">
                Page {page["page_number"]} ({page_width:.0f}×{page_height:.0f}px) - Tables: {len(page["tables"])}, Images: {len(page["images"])}, Text Blocks: {len(page["text_blocks"])}
            </div>
            <div class="content-layer">""")

            # Add images first
            for img in page["images"]:
                html_content.append(f"""
                <div class="image-container">
                    <img class="image" src="data:image/png;base64,{img['data']}"
                         alt="Page {page['page_number']} Image {img['index']}">
                </div>""")

            # Add tables with improved generation
            for table_idx, table in enumerate(page["tables"]):
                print(f"Generating HTML for table {table_idx} (source: {table.get('source', 'unknown')})")
                html_content.append(self._generate_html_table(
                    table["data"],
                    header_rows=table.get("header_rows", 1)
                ))

            # Add text content (non-overlapping groups)
            text_groups = self._group_overlapping_text(page["text_blocks"])

            html_content.append('            <div class="text-content">')

            for group in text_groups:
                if len(group) == 1:
                    block = group[0]
                    if block.text.strip():
                        enhanced_text = self.enhance_math_symbols(block.text)
                        enhanced_text = enhanced_text.replace('<', '&lt;').replace('>', '&gt;')

                        css_classes = ["text-block"]
                        if block.is_bold:
                            css_classes.append("bold")
                        if block.is_italic:
                            css_classes.append("italic")
                        if any(s in enhanced_text for s in ['&alpha;', '&beta;', '&gamma;', '&sum;', '&int;']):
                            css_classes.append("math-symbol")

                        font_family = "'Times New Roman', Times, serif"
                        if 'arial' in block.font_name.lower():
                            font_family = "Arial, sans-serif"
                        elif 'helvetica' in block.font_name.lower():
                            font_family = "Helvetica, Arial, sans-serif"
                        elif 'courier' in block.font_name.lower():
                            font_family = "'Courier New', monospace"

                        font_size = max(block.font_size * 0.9, 10)

                        html_content.append(f"""
                <div class="{' '.join(css_classes)}" style="font-size: {font_size}px; font-family: {font_family};">
                    {enhanced_text}
                </div>""")
                else:
                    group.sort(key=lambda b: b.x)
                    html_content.append('            <div class="text-group">')

                    for block in group:
                        if block.text.strip():
                            enhanced_text = self.enhance_math_symbols(block.text)
                            enhanced_text = enhanced_text.replace('<', '&lt;').replace('>', '&gt;')
538 |
+
|
539 |
+
css_classes = ["text-block", "inline"]
|
540 |
+
if block.is_bold:
|
541 |
+
css_classes.append("bold")
|
542 |
+
if block.is_italic:
|
543 |
+
css_classes.append("italic")
|
544 |
+
if any(s in enhanced_text for s in ['α', 'β', 'γ', '∑', '∫']):
|
545 |
+
css_classes.append("math-symbol")
|
546 |
+
|
547 |
+
font_family = "'Times New Roman', Times, serif"
|
548 |
+
if 'arial' in block.font_name.lower():
|
549 |
+
font_family = "Arial, sans-serif"
|
550 |
+
elif 'helvetica' in block.font_name.lower():
|
551 |
+
font_family = "Helvetica, Arial, sans-serif"
|
552 |
+
elif 'courier' in block.font_name.lower():
|
553 |
+
font_family = "'Courier New', monospace"
|
554 |
+
|
555 |
+
font_size = max(block.font_size * 0.9, 10)
|
556 |
+
|
557 |
+
html_content.append(f"""
|
558 |
+
<span class="{' '.join(css_classes)}" style="font-size: {font_size}px; font-family: {font_family};">
|
559 |
+
{enhanced_text}
|
560 |
+
</span>""")
|
561 |
+
|
562 |
+
html_content.append(' </div>')
|
563 |
+
|
564 |
+
html_content.append(""" </div>
|
565 |
+
</div>
|
566 |
+
</div>""")
|
567 |
+
|
568 |
+
html_content.append(" </div>")
|
569 |
+
html_content.append("""
|
570 |
+
</body>
|
571 |
+
</html>""")
|
572 |
+
final_html = "\n".join(html_content)
|
573 |
+
|
574 |
+
if output_path:
|
575 |
+
try:
|
576 |
+
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
|
577 |
+
with open(output_path, 'w', encoding='utf-8') as f:
|
578 |
+
f.write(final_html)
|
579 |
+
print(f"β
HTML saved to: {output_path}")
|
580 |
+
except Exception as e:
|
581 |
+
print(f"β οΈ Error saving HTML to {output_path}: {e}")
|
582 |
+
|
583 |
+
return final_html
|
584 |
+
|
585 |
+
def _get_current_timestamp(self) -> str:
|
586 |
+
return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
587 |
+
|
588 |
+
def process_pdf(self, pdf_path: str, output_path: str = None, use_hf_models: bool = False) -> str:
|
589 |
+
print(f"π Processing PDF: {pdf_path}")
|
590 |
+
|
591 |
+
if not os.path.exists(pdf_path):
|
592 |
+
raise FileNotFoundError(f"PDF file not found: {pdf_path}")
|
593 |
+
|
594 |
+
print("π Extracting PDF content...")
|
595 |
+
pdf_content = self.extract_pdf_content(pdf_path)
|
596 |
+
|
597 |
+
if use_hf_models and self.hf_token:
|
598 |
+
print("π€ Attempting to enhance with Hugging Face models...")
|
599 |
+
try:
|
600 |
+
print("Note: Hugging Face model integration requires further implementation.")
|
601 |
+
except Exception as e:
|
602 |
+
print(f"β οΈ Hugging Face enhancement failed: {e}")
|
603 |
+
|
604 |
+
print("π Converting to HTML...")
|
605 |
+
html_content = self.convert_to_html(pdf_content, output_path)
|
606 |
+
|
607 |
+
print("β
Processing complete!")
|
608 |
+
return html_content
|
609 |
+
|
610 |
+
def main():
|
611 |
+
HF_TOKEN = os.getenv("HF_API_TOKEN")
|
612 |
+
|
613 |
+
converter = PDFToHTMLConverter(huggingface_token=HF_TOKEN)
|
614 |
+
pdf_path = "new-pdf.pdf"
|
615 |
+
output_path = "sample_converted.html"
|
616 |
+
|
617 |
+
try:
|
618 |
+
html_content = converter.process_pdf(
|
619 |
+
pdf_path=pdf_path,
|
620 |
+
output_path=output_path,
|
621 |
+
use_hf_models=False
|
622 |
+
)
|
623 |
+
|
624 |
+
print(f"β
Successfully converted '{pdf_path}' to '{output_path}'")
|
625 |
+
print(f"π Open '{output_path}' in your web browser to view the result!")
|
626 |
+
|
627 |
+
except FileNotFoundError as e:
|
628 |
+
print(f"β Error: {e}")
|
629 |
+
print("Please ensure the PDF file exists at the specified path.")
|
630 |
+
except Exception as e:
|
631 |
+
print(f"β An unexpected error occurred: {str(e)}")
|
632 |
+
import traceback
|
633 |
+
traceback.print_exc()
|
634 |
+
|
635 |
+
if __name__ == "__main__":
|
636 |
+
main()
|
pdf_json.py
ADDED
@@ -0,0 +1,513 @@
import os
import base64
import json
import requests
from typing import Dict, List, Any, Optional
import fitz  # PyMuPDF
from PIL import Image
import io
import re
from dataclasses import dataclass, asdict
from pathlib import Path
from datetime import datetime


@dataclass
class TextBlock:
    text: str
    x: float
    y: float
    width: float
    height: float
    font_size: float
    font_name: str
    is_bold: bool = False
    is_italic: bool = False
    block_id: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Convert TextBlock to dictionary"""
        return asdict(self)


@dataclass
class ImageData:
    index: int
    base64_data: str
    bbox: tuple
    width: float
    height: float
    format: str = "PNG"

    def to_dict(self) -> Dict[str, Any]:
        """Convert ImageData to dictionary"""
        return asdict(self)


@dataclass
class TableData:
    bbox: tuple
    data: List[List[str]]
    rows: int
    columns: int

    def to_dict(self) -> Dict[str, Any]:
        """Convert TableData to dictionary"""
        return asdict(self)


@dataclass
class PageData:
    page_number: int
    text_blocks: List[TextBlock]
    images: List[ImageData]
    tables: List[TableData]
    page_width: float
    page_height: float
    word_count: int = 0
    character_count: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Convert PageData to dictionary"""
        return {
            "page_number": self.page_number,
            "text_blocks": [block.to_dict() for block in self.text_blocks],
            "images": [img.to_dict() for img in self.images],
            "tables": [table.to_dict() for table in self.tables],
            "page_width": self.page_width,
            "page_height": self.page_height,
            "word_count": self.word_count,
            "character_count": self.character_count
        }


class PDFToJSONConverter:
    def __init__(self, huggingface_token: str = None):
        self.hf_token = huggingface_token
        self.hf_headers = {
            "Authorization": f"Bearer {huggingface_token}" if huggingface_token else None
        }
        self.models = {
            "document_layout": "microsoft/layoutlm-base-uncased",
            "table_detection": "microsoft/table-transformer-detection",
            "ocr": "microsoft/trocr-base-printed",
            "math_detection": "facebook/detr-resnet-50"
        }
        self.hf_inference_url = "https://api-inference.huggingface.co/models"

    def pdf_to_base64(self, pdf_path: str) -> str:
        """Convert PDF file to base64 string"""
        try:
            with open(pdf_path, "rb") as pdf_file:
                return base64.b64encode(pdf_file.read()).decode('utf-8')
        except Exception as e:
            raise Exception(f"Error converting PDF to base64: {str(e)}")

    def extract_pdf_content(self, pdf_path: str) -> Dict[str, Any]:
        """Extract all content from PDF and return structured data"""
        doc = None
        try:
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")

            doc = fitz.open(pdf_path)

            if doc is None:
                raise RuntimeError("Failed to open PDF document")

            if doc.page_count == 0:
                raise ValueError("PDF document has no pages")

            print(f"PDF opened successfully: {doc.page_count} pages")

            pages_data = []
            document_stats = {
                "total_pages": doc.page_count,
                "total_words": 0,
                "total_characters": 0,
                "total_images": 0,
                "total_tables": 0
            }

            for page_num in range(doc.page_count):
                try:
                    page = doc[page_num]
                    print(f"Processing page {page_num + 1}/{doc.page_count}")

                    # Extract text blocks
                    text_blocks = []
                    try:
                        page_dict = page.get_text("dict")
                        text_blocks = self._extract_text_blocks_from_dict(page_dict, page_num)
                    except Exception as e:
                        print(f"⚠️ Dict method failed for page {page_num + 1}, falling back to simple text extraction: {e}")
                        text_blocks = self._extract_text_blocks_simple(page, page_num)

                    # Extract images
                    images = self._extract_images_safely(page, doc, page_num)

                    # Extract tables
                    tables = self._detect_tables_safely(page)

                    # Get page dimensions
                    page_rect = page.rect

                    # Calculate statistics
                    page_text = " ".join([block.text for block in text_blocks])
                    word_count = len(page_text.split())
                    char_count = len(page_text)

                    # Create page data
                    page_data = PageData(
                        page_number=page_num + 1,
                        text_blocks=text_blocks,
                        images=images,
                        tables=tables,
                        page_width=page_rect.width,
                        page_height=page_rect.height,
                        word_count=word_count,
                        character_count=char_count
                    )

                    pages_data.append(page_data)

                    # Update document statistics
                    document_stats["total_words"] += word_count
                    document_stats["total_characters"] += char_count
                    document_stats["total_images"] += len(images)
                    document_stats["total_tables"] += len(tables)

                except Exception as e:
                    print(f"❌ Error processing page {page_num + 1}: {e}")
                    # Create empty page data for failed pages
                    empty_page = PageData(
                        page_number=page_num + 1,
                        text_blocks=[],
                        images=[],
                        tables=[],
                        page_width=595,
                        page_height=842,
                        word_count=0,
                        character_count=0
                    )
                    pages_data.append(empty_page)

            result = {
                "document_info": {
                    "filename": os.path.basename(pdf_path),
                    "file_size": os.path.getsize(pdf_path),
                    "conversion_timestamp": self._get_current_timestamp(),
                    "converter_version": "1.0.0"
                },
                "document_statistics": document_stats,
                "pages": [page.to_dict() for page in pages_data]
            }

            return result

        except Exception as e:
            raise Exception(f"Error extracting PDF content: {str(e)}")
        finally:
            if doc is not None:
                try:
                    doc.close()
                    print("✅ PDF document closed successfully")
                except Exception as e:
                    print(f"⚠️ Error closing PDF document: {e}")

    def _extract_text_blocks_from_dict(self, page_dict: dict, page_num: int) -> List[TextBlock]:
        """Extract text blocks from page dictionary with detailed formatting"""
        text_blocks = []

        for block_idx, block in enumerate(page_dict.get("blocks", [])):
            if "lines" not in block:
                continue

            for line_idx, line in enumerate(block["lines"]):
                for span_idx, span in enumerate(line["spans"]):
                    text_content = span.get("text", "").strip()
                    if text_content:
                        bbox = span["bbox"]
                        font_info = {
                            "size": span.get("size", 12),
                            "font": span.get("font", "Arial"),
                            "is_bold": "bold" in span.get("font", "").lower() or span.get("flags", 0) & 16,
                            "is_italic": "italic" in span.get("font", "").lower() or span.get("flags", 0) & 2
                        }

                        text_block = TextBlock(
                            text=text_content,
                            x=round(bbox[0], 2),
                            y=round(bbox[1], 2),
                            width=round(bbox[2] - bbox[0], 2),
                            height=round(bbox[3] - bbox[1], 2),
                            font_size=round(font_info["size"], 2),
                            font_name=font_info["font"],
                            is_bold=font_info["is_bold"],
                            is_italic=font_info["is_italic"],
                            block_id=f"p{page_num}-b{block_idx}-l{line_idx}-s{span_idx}"
                        )
                        text_blocks.append(text_block)

        return text_blocks

    def _extract_text_blocks_simple(self, page, page_num: int) -> List[TextBlock]:
        """Fallback method for text extraction"""
        text_blocks = []
        try:
            blocks_data = page.get_text("blocks")
            for block_idx, block in enumerate(blocks_data):
                if block[6] == 0:  # Text block
                    text = block[4].strip()
                    if text:
                        x0, y0, x1, y1 = block[0], block[1], block[2], block[3]

                        lines = text.split('\n')
                        line_height = (y1 - y0) / max(len(lines), 1)

                        for line_idx, line in enumerate(lines):
                            if line.strip():
                                text_block = TextBlock(
                                    text=line.strip(),
                                    x=round(x0, 2),
                                    y=round(y0 + (line_idx * line_height), 2),
                                    width=round(x1 - x0, 2),
                                    height=round(line_height, 2),
                                    font_size=12.0,
                                    font_name="Arial",
                                    is_bold=False,
                                    is_italic=False,
                                    block_id=f"p{page_num}-simple-b{block_idx}-l{line_idx}"
                                )
                                text_blocks.append(text_block)
        except Exception as e:
            print(f"⚠️ Simple text block extraction failed: {e}")

        return text_blocks

    def _extract_images_safely(self, page, doc, page_num) -> List[ImageData]:
        """Extract images from page and return structured data"""
        images = []
        try:
            image_list = page.get_images(full=True)
            for img_index, img_info in enumerate(image_list):
                try:
                    xref = img_info[0]

                    # Get image rectangles
                    img_rects = [r for r in page.get_image_rects(xref)]
                    if not img_rects:
                        continue

                    bbox = img_rects[0]

                    # Extract image data
                    pix = fitz.Pixmap(doc, xref)
                    if pix.n - pix.alpha < 4:  # Valid image
                        img_data = pix.tobytes("png")
                        img_base64 = base64.b64encode(img_data).decode()

                        image_data = ImageData(
                            index=img_index,
                            base64_data=img_base64,
                            bbox=(round(bbox.x0, 2), round(bbox.y0, 2),
                                  round(bbox.x1, 2), round(bbox.y1, 2)),
                            width=round(bbox.x1 - bbox.x0, 2),
                            height=round(bbox.y1 - bbox.y0, 2),
                            format="PNG"
                        )
                        images.append(image_data)
                    pix = None
                except Exception as e:
                    print(f"⚠️ Error extracting image {img_index} on page {page_num+1}: {e}")
                    continue
        except Exception as e:
            print(f"⚠️ General error in image extraction for page {page_num+1}: {e}")
        return images

    def _detect_tables_safely(self, page) -> List[TableData]:
        """Extract tables from page and return structured data"""
        tables = []
        try:
            tabs = page.find_tables()
            for tab_index, tab in enumerate(tabs):
                try:
                    table_data = tab.extract()
                    if table_data:
                        # Clean table data
                        cleaned_data = []
                        for row in table_data:
                            cleaned_row = [str(cell).strip() if cell else "" for cell in row]
                            if any(cleaned_row):  # Only add non-empty rows
                                cleaned_data.append(cleaned_row)

                        if cleaned_data:
                            table_obj = TableData(
                                bbox=(round(tab.bbox.x0, 2), round(tab.bbox.y0, 2),
                                      round(tab.bbox.x1, 2), round(tab.bbox.y1, 2)),
                                data=cleaned_data,
                                rows=len(cleaned_data),
                                columns=max(len(row) for row in cleaned_data) if cleaned_data else 0
                            )
                            tables.append(table_obj)
                except Exception as e:
                    print(f"⚠️ Error extracting table {tab_index}: {e}")
                    continue
        except Exception as e:
            print(f"⚠️ General error in table detection: {e}")
        return tables

    def convert_to_json(self, pdf_content: Dict[str, Any], output_path: str = None,
                        pretty_print: bool = True, include_base64_images: bool = True) -> str:
        """Convert PDF content to JSON format"""
        print("Converting to JSON format...")

        try:
            # Create a copy of the content for modification
            json_content = pdf_content.copy()

            # Add metadata
            json_content["conversion_options"] = {
                "pretty_print": pretty_print,
                "include_base64_images": include_base64_images,
                "json_schema_version": "1.0"
            }

            # Optionally remove base64 image data to reduce file size
            if not include_base64_images:
                for page in json_content["pages"]:
                    for image in page["images"]:
                        image["base64_data"] = "[Base64 data removed - set include_base64_images=True to include]"

            # Convert to JSON string
            if pretty_print:
                json_string = json.dumps(json_content, indent=2, ensure_ascii=False)
            else:
                json_string = json.dumps(json_content, ensure_ascii=False)

            # Save to file if output path provided
            if output_path:
                try:
                    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
                    with open(output_path, 'w', encoding='utf-8') as f:
                        f.write(json_string)
                    print(f"✅ JSON saved to: {output_path}")
                    print(f"File size: {len(json_string):,} characters")
                except Exception as e:
                    print(f"⚠️ Error saving JSON to {output_path}: {e}")

            return json_string

        except Exception as e:
            raise Exception(f"Error converting to JSON: {str(e)}")

    def create_json_summary(self, pdf_content: Dict[str, Any]) -> Dict[str, Any]:
        """Create a summary of the PDF content without full data"""
        summary = {
            "document_info": pdf_content.get("document_info", {}),
            "document_statistics": pdf_content.get("document_statistics", {}),
            "page_summaries": []
        }

        for page in pdf_content.get("pages", []):
            page_summary = {
                "page_number": page["page_number"],
                "text_blocks_count": len(page["text_blocks"]),
                "images_count": len(page["images"]),
                "tables_count": len(page["tables"]),
                "word_count": page["word_count"],
                "character_count": page["character_count"],
                "page_dimensions": {
                    "width": page["page_width"],
                    "height": page["page_height"]
                },
                "sample_text": " ".join([block["text"] for block in page["text_blocks"][:3]])[:200] + "..." if page["text_blocks"] else ""
            }
            summary["page_summaries"].append(page_summary)

        return summary

    def _get_current_timestamp(self) -> str:
        """Get current timestamp as string"""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def process_pdf_to_json(self, pdf_path: str, output_path: str = None,
                            pretty_print: bool = True, include_base64_images: bool = True,
                            create_summary: bool = False, use_hf_models: bool = False) -> str:
        """Main method to process PDF and convert to JSON"""
        print(f"Processing PDF to JSON: {pdf_path}")

        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        print("Extracting PDF content...")
        pdf_content = self.extract_pdf_content(pdf_path)

        if use_hf_models and self.hf_token:
            print("Attempting to enhance with Hugging Face models...")
            try:
                print("Note: Hugging Face model integration requires further implementation.")
            except Exception as e:
                print(f"⚠️ Hugging Face enhancement failed: {e}")

        print("Converting to JSON...")
        json_content = self.convert_to_json(
            pdf_content,
            output_path,
            pretty_print,
            include_base64_images
        )

        # Create summary file if requested
        if create_summary and output_path:
            summary_path = output_path.replace('.json', '_summary.json')
            summary_data = self.create_json_summary(pdf_content)
            summary_json = json.dumps(summary_data, indent=2, ensure_ascii=False)

            try:
                with open(summary_path, 'w', encoding='utf-8') as f:
                    f.write(summary_json)
                print(f"✅ Summary JSON saved to: {summary_path}")
            except Exception as e:
                print(f"⚠️ Error saving summary: {e}")

        print("✅ Processing complete!")
        return json_content


def main():
    """Main function to demonstrate PDF to JSON conversion"""
    # Set your Hugging Face token if needed
    HF_TOKEN = os.getenv("HF_API_TOKEN")

    # Initialize converter
    converter = PDFToJSONConverter(huggingface_token=HF_TOKEN)

    # Define paths
    pdf_path = "new-pdf.pdf"  # Change this to your PDF file path
    output_path = "converted_document.json"  # Output JSON file path

    try:
        # Convert PDF to JSON
        json_content = converter.process_pdf_to_json(
            pdf_path=pdf_path,
            output_path=output_path,
            pretty_print=True,            # Format JSON with indentation
            include_base64_images=True,   # Include image data (set False to reduce file size)
            create_summary=True,          # Create additional summary file
            use_hf_models=False           # Set to True if you want to use HuggingFace models
        )

        print(f"✅ Successfully converted '{pdf_path}' to '{output_path}'")
        print(f"JSON length: {len(json_content):,} characters")
        print(f"Open '{output_path}' to view the structured JSON data!")

        # Optional: Print first 500 characters of JSON as preview
        print("\nJSON Preview (first 500 characters):")
        print("-" * 50)
        print(json_content[:500] + "..." if len(json_content) > 500 else json_content)

    except FileNotFoundError as e:
        print(f"❌ Error: {e}")
        print("Please ensure the PDF file exists at the specified path.")
    except Exception as e:
        print(f"❌ An unexpected error occurred: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
pdf_word.py
ADDED
@@ -0,0 +1,559 @@
import os
import base64
import json
from typing import Dict, List, Any
import fitz
from PIL import Image
import io
import re
from dataclasses import dataclass
from pathlib import Path
from datetime import datetime
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.oxml.shared import OxmlElement, qn
from docx.oxml.ns import nsdecls
from docx.oxml import parse_xml
import unicodedata
import docx
import camelot


@dataclass
class TextBlock:
    text: str
    x: float
    y: float
    width: float
    height: float
    font_size: float
    font_name: str
    is_bold: bool = False
    is_italic: bool = False
    block_id: str = ""
    is_math: bool = False


class PDFToWordConverter:
    def __init__(self, huggingface_token: str = None):
        self.hf_token = huggingface_token
        self.hf_headers = {
            "Authorization": f"Bearer {huggingface_token}" if huggingface_token else None
        }
        self.models = {
            "document_layout": "microsoft/layoutlm-base-uncased",
            "table_detection": "microsoft/table-transformer-detection",
            "ocr": "microsoft/trocr-base-printed",
            "math_detection": "facebook/detr-resnet-50"
        }
        self.hf_inference_url = "https://api-inference.huggingface.co/models"
        # Identity map of math symbols so they survive text cleaning
        self.math_symbols = {
            '∑': '∑', '∏': '∏', '√': '√', '∫': '∫', '∞': '∞', '≤': '≤', '≥': '≥', '≠': '≠', '±': '±',
            '×': '×', '÷': '÷', 'α': 'α', 'β': 'β', 'γ': 'γ', 'δ': 'δ', 'θ': 'θ', 'λ': 'λ', 'μ': 'μ',
            'π': 'π', 'σ': 'σ', 'φ': 'φ', 'ω': 'ω'
        }

    def detect_mathematical_content(self, text: str) -> bool:
        math_patterns = [
            r'\d+\s*[+\-*/=]\s*\d+', r'[a-zA-Z]\s*=\s*\d+', r'\b(?:sin|cos|tan|log|ln|exp)\s*\(',
            r'\d+\s*\^\s*\d+', r'√\d+', r'\d+/\d+', r'[∑∏∫]', r'[≤≥≠±×÷]', r'[αβγδθλμπσφω]',
            r'\bEquation\s+\d+', r'\d+\.\d+', r'\$\d+,?\d*', r'NORMSINV', r'using Equation'
        ]
        for pattern in math_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return True
        return False

    def preserve_mathematical_formatting(self, text: str) -> str:
        if not text:
            return ""
        text = text.replace('×', '×')
        text = text.replace('÷', '÷')
        text = text.replace('±', '±')
        text = text.replace('≤', '≤')
        text = text.replace('≥', '≥')
        text = text.replace('≠', '≠')
        text = text.replace('√', '√')
        text = text.replace('∑', '∑')
        text = text.replace('∏', '∏')
        text = text.replace('∫', '∫')
        text = text.replace('∞', '∞')
        text = re.sub(r'(\d+)\s*\^\s*(\d+)', r'\1^\2', text)
        text = re.sub(r'(\w+)\s*\(\s*([^)]+)\s*\)', r'\1(\2)', text)
        return text

    def clean_text_for_xml(self, text: str) -> str:
        if not text:
            return ""
        try:
            text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)
            text = text.replace('\ufeff', '')
            text = text.replace('\u0000', '')
            text = unicodedata.normalize('NFKC', text)
            printable_chars = []
            for char in text:
                if char.isprintable() or char.isspace() or char in self.math_symbols:
                    printable_chars.append(char)
                else:
                    printable_chars.append(' ')
            text = ''.join(printable_chars)
            text = re.sub(r'\s+', ' ', text).strip()
            text = text.encode('utf-8', errors='ignore').decode('utf-8')
            return self.preserve_mathematical_formatting(text)
        except Exception:
            return ''.join(char for char in str(text) if ord(char) < 128).strip()

    def clean_font_name(self, font_name: str) -> str:
        if not font_name:
            return "Calibri"
        try:
            cleaned = self.clean_text_for_xml(font_name)
            cleaned = re.sub(r'[^\w\s-]', '', cleaned)
            if not cleaned.strip():
                return "Calibri"
            return cleaned.strip()
        except Exception:
            return "Calibri"

    def pdf_to_base64(self, pdf_path: str) -> str:
        try:
            with open(pdf_path, "rb") as pdf_file:
                return base64.b64encode(pdf_file.read()).decode('utf-8')
        except Exception as e:
            raise Exception(f"Error converting PDF to base64: {str(e)}")

    def extract_pdf_content(self, pdf_path: str) -> Dict[str, Any]:
        doc = None
        try:
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")
            doc = fitz.open(pdf_path)
            if doc is None:
                raise RuntimeError("Failed to open PDF document")
            if doc.page_count == 0:
                raise ValueError("PDF document has no pages")
            print(f"PDF opened successfully: {doc.page_count} pages")
            pages_content = []
            for page_num in range(doc.page_count):
                try:
                    page = doc[page_num]
                    print(f"Processing page {page_num + 1}/{doc.page_count}")
                    text_blocks = []
                    try:
                        page_dict = page.get_text("dict")
                        text_blocks = self._extract_text_blocks_from_dict(page_dict, page_num)
                    except Exception as e:
                        print(f"Dict method failed for page {page_num + 1}, using fallback: {e}")
                        text_blocks = self._extract_text_blocks_simple(page, page_num)
                    images = self._extract_images_safely(page, doc, page_num)
                    tables = self._detect_tables_with_camelot(pdf_path, page_num)
                    page_rect = page.rect
                    pages_content.append({
                        "page_number": page_num + 1,
                        "text_blocks": text_blocks,
                        "images": images,
                        "tables": tables,
                        "page_width": page_rect.width,
                        "page_height": page_rect.height
                    })
                except Exception as e:
                    print(f"Error processing page {page_num + 1}: {e}")
                    pages_content.append({
                        "page_number": page_num + 1,
                        "text_blocks": [],
                        "images": [],
                        "tables": [],
                        "page_width": 595,
                        "page_height": 842
                    })
            result = {
                "pages": pages_content,
                "total_pages": doc.page_count
            }
            return result
        except Exception as e:
            raise Exception(f"Error extracting PDF content: {str(e)}")
        finally:
            if doc is not None:
                try:
                    doc.close()
                    print("PDF document closed successfully")
                except Exception as e:
                    print(f"Error closing PDF document: {e}")

    def _extract_text_blocks_from_dict(self, page_dict: dict, page_num: int) -> List[TextBlock]:
        text_blocks = []
        for block_idx, block in enumerate(page_dict.get("blocks", [])):
            if "lines" not in block:
                continue
            for line_idx, line in enumerate(block["lines"]):
                for span_idx, span in enumerate(line["spans"]):
                    text_content = span.get("text", "").strip()
                    if text_content:
                        cleaned_text = self.clean_text_for_xml(text_content)
                        if not cleaned_text:
                            continue
                        bbox = span["bbox"]
                        font_name = self.clean_font_name(span.get("font", "Arial"))
                        font_info = {
                            "size": max(span.get("size", 12), 6),
                            "font": font_name,
                            "is_bold": "bold" in font_name.lower() or bool(span.get("flags", 0) & 16),
                            "is_italic": "italic" in font_name.lower() or bool(span.get("flags", 0) & 2)
                        }
                        is_math = self.detect_mathematical_content(cleaned_text)
                        text_block = TextBlock(
                            text=cleaned_text,
                            x=bbox[0], y=bbox[1],
                            width=bbox[2] - bbox[0], height=bbox[3] - bbox[1],
                            font_size=font_info["size"], font_name=font_info["font"],
                            is_bold=font_info["is_bold"], is_italic=font_info["is_italic"],
                            block_id=f"p{page_num}-b{block_idx}-l{line_idx}-s{span_idx}",
                            is_math=is_math
                        )
                        text_blocks.append(text_block)
        return text_blocks

    def _extract_text_blocks_simple(self, page, page_num: int) -> List[TextBlock]:
        text_blocks = []
        try:
            blocks_data = page.get_text("blocks")
            for block_idx, block in enumerate(blocks_data):
                if block[6] == 0:
                    text = block[4].strip()
                    if text:
                        cleaned_text = self.clean_text_for_xml(text)
                        if not cleaned_text:
                            continue
                        x0, y0, x1, y1 = block[0], block[1], block[2], block[3]
                        lines = cleaned_text.split('\n')
                        line_height = (y1 - y0) / max(len(lines), 1)
                        for line_idx, line in enumerate(lines):
                            line_text = self.clean_text_for_xml(line)
                            if line_text:
                                is_math = self.detect_mathematical_content(line_text)
                                text_block = TextBlock(
                                    text=line_text,
                                    x=x0, y=y0 + (line_idx * line_height),
                                    width=x1 - x0, height=line_height,
                                    font_size=12, font_name="Arial",
                                    is_bold=False, is_italic=False,
                                    block_id=f"p{page_num}-simple-b{block_idx}-l{line_idx}",
                                    is_math=is_math
                                )
                                text_blocks.append(text_block)
        except Exception as e:
            print(f"Simple text block extraction failed: {e}")
        return text_blocks

    def _extract_images_safely(self, page, doc, page_num) -> List[Dict]:
        images = []
        try:
            image_list = page.get_images(full=True)
            for img_index, img_info in enumerate(image_list):
                try:
                    xref = img_info[0]
                    img_rects = [r for r in page.get_image_rects(xref)]
                    if not img_rects:
                        continue
                    bbox = img_rects[0]
                    pix = fitz.Pixmap(doc, xref)
                    if pix.n - pix.alpha < 4:
                        img_data = pix.tobytes("png")
                        img_base64 = base64.b64encode(img_data).decode()
                        images.append({
                            "index": img_index,
                            "data": img_data,
                            "base64": img_base64,
                            "bbox": (bbox.x0, bbox.y0, bbox.x1, bbox.y1)
                        })
                    pix = None
                except Exception as e:
                    print(f"Error extracting image {img_index} on page {page_num+1}: {e}")
                    continue
        except Exception as e:
            print(f"General error in image extraction for page {page_num+1}: {e}")
        return images

    def _detect_tables_with_camelot(self, pdf_path: str, page_num: int) -> List[Dict]:
        tables = []
        try:
            try:
                camelot_tables = camelot.read_pdf(
                    pdf_path,
                    pages=str(page_num + 1),
                    flavor='lattice',
                    suppress_stdout=True
                )
                if len(camelot_tables) == 0:
                    camelot_tables = camelot.read_pdf(
                        pdf_path,
                        pages=str(page_num + 1),
                        flavor='stream',
                        suppress_stdout=True
                    )
            except Exception:
                camelot_tables = camelot.read_pdf(
                    pdf_path,
                    pages=str(page_num + 1),
                    flavor='stream',
                    suppress_stdout=True
                )

            for table in camelot_tables:
                table_data = table.df.values.tolist()
                if table_data and any(any(str(cell).strip() for cell in row) for row in table_data):
                    cleaned_data = []
                    for row in table_data:
                        cleaned_row = []
                        for cell in row:
                            cell_text = str(cell).strip() if cell is not None else ""
                            cleaned_cell = self.clean_text_for_xml(cell_text)
                            cleaned_row.append(cleaned_cell)
                        cleaned_data.append(cleaned_row)

                    tables.append({
                        "bbox": table.bbox,
                        "data": cleaned_data,
                        "accuracy": getattr(table, 'accuracy', 0)
                    })
                    print(f"Found table with {len(cleaned_data)} rows and {len(cleaned_data[0]) if cleaned_data else 0} columns on page {page_num + 1}")
        except Exception as e:
            print(f"Error detecting tables with Camelot on page {page_num + 1}: {e}")
        return tables

    def _add_page_break(self, doc):
        try:
            paragraph = doc.add_paragraph()
            run = paragraph.runs[0] if paragraph.runs else paragraph.add_run()
            run.add_break(WD_BREAK.PAGE)
        except Exception:
            doc.add_page_break()

    def _set_font_properties(self, run, text_block: TextBlock):
        try:
            font_name = self.clean_font_name(text_block.font_name)
            if 'Times' in font_name or 'Roman' in font_name:
                run.font.name = 'Times New Roman'
            elif 'Arial' in font_name:
                run.font.name = 'Arial'
            elif 'Courier' in font_name:
                run.font.name = 'Courier New'
            else:
                run.font.name = 'Calibri'
            try:
                font_size_val = float(text_block.font_size)
                font_size = max(min(int(font_size_val), 72), 6)
                run.font.size = Pt(font_size)
            except (ValueError, TypeError):
                print(f"Warning: Invalid font_size '{text_block.font_size}'. Using default 11pt.")
                run.font.size = Pt(11)
            run.font.bold = bool(text_block.is_bold)
            run.font.italic = bool(text_block.is_italic)
            if text_block.is_math:
                run.font.name = 'Cambria Math'
        except Exception as e:
            print(f"Error setting font properties for text_block: {e}")
            run.font.name = 'Calibri'
            run.font.size = Pt(11)
            run.font.bold = False
            run.font.italic = False

    def _group_text_blocks_by_lines(self, text_blocks: List[TextBlock]) -> List[List[TextBlock]]:
        if not text_blocks:
            return []
        sorted_blocks = sorted(text_blocks, key=lambda b: (round(b.y, 1), b.x))
        lines = []
        current_line = []
        current_y = None
        for block in sorted_blocks:
            if current_y is None or abs(block.y - current_y) <= 5:
                current_line.append(block)
                current_y = block.y if current_y is None else current_y
            else:
                if current_line:
                    lines.append(current_line)
                current_line = [block]
                current_y = block.y
        if current_line:
            lines.append(current_line)
        return lines

    def _set_table_borders(self, table):
        tbl = table._tbl
        for row in tbl.tr_lst:
            for cell in row.tc_lst:
                tcPr = cell.tcPr
                tcBorders = OxmlElement('w:tcBorders')

                for border_name in ['top', 'left', 'bottom', 'right']:
                    border = OxmlElement(f'w:{border_name}')
                    border.set(qn('w:val'), 'single')
                    border.set(qn('w:sz'), '4')
                    border.set(qn('w:space'), '0')
                    border.set(qn('w:color'), '000000')
                    tcBorders.append(border)

                tcPr.append(tcBorders)

    def _create_enhanced_table(self, doc, table_data):
        try:
            table_rows = table_data["data"]
            if not table_rows or not any(any(str(cell).strip() for cell in row) for row in table_rows):
                return None

            max_cols = max(len(row) for row in table_rows) if table_rows else 0
            if max_cols == 0:
                return None

            word_table = doc.add_table(rows=len(table_rows), cols=max_cols)

            self._set_table_borders(word_table)
            word_table.alignment = WD_TABLE_ALIGNMENT.CENTER
            word_table.autofit = False

            for row_idx, row_data in enumerate(table_rows):
                for col_idx in range(max_cols):
                    cell = word_table.cell(row_idx, col_idx)
                    cell_data = row_data[col_idx] if col_idx < len(row_data) else ""
                    clean_cell_data = self.clean_text_for_xml(str(cell_data) if cell_data else "")

                    paragraph = cell.paragraphs[0]
                    paragraph.clear()
                    run = paragraph.add_run(clean_cell_data)

                    if self.detect_mathematical_content(clean_cell_data):
                        run.font.name = 'Cambria Math'
                    else:
                        run.font.name = 'Calibri'
                    run.font.size = Pt(9)

                    if row_idx == 0:
                        run.font.bold = True
                        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

                    cell.vertical_alignment = docx.enum.table.WD_ALIGN_VERTICAL.CENTER

            print(f"Created table with {len(table_rows)} rows and {max_cols} columns")
            return word_table
        except Exception as e:
            print(f"Error creating enhanced table: {e}")
            return None

    def convert_to_word(self, pdf_content: Dict[str, Any], output_path: str = None) -> Document:
        print("Creating Word document...")
        doc = Document()
        doc.core_properties.title = "PDF to Word Conversion"
        doc.core_properties.author = "PDF Converter"
        doc.core_properties.created = datetime.now()
        header_para = doc.add_paragraph()
        header_run = header_para.add_run("PDF Document Conversion")
        header_run.font.size = Pt(16)
        header_run.font.bold = True
        header_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        info_para = doc.add_paragraph()
        info_run = info_para.add_run(f"Total Pages: {pdf_content.get('total_pages', 'Unknown')} | Converted on: {self._get_current_timestamp()}")
        info_run.font.size = Pt(10)
        info_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        doc.add_paragraph()

        for page_idx, page in enumerate(pdf_content["pages"]):
            print(f"Converting page {page['page_number']}/{pdf_content.get('total_pages', '?')}")
            page_header = doc.add_paragraph()
            page_header_run = page_header.add_run(f"--- Page {page['page_number']} ---")
            page_header_run.font.bold = True
            page_header_run.font.size = Pt(12)
            page_header.alignment = WD_ALIGN_PARAGRAPH.CENTER

            for img in page["images"]:
                try:
                    img_para = doc.add_paragraph()
                    img_run = img_para.add_run()
                    img_stream = io.BytesIO(img['data'])
                    img_bbox = img['bbox']
                    img_width_px = img_bbox[2] - img_bbox[0]
                    page_width_px = page.get('page_width', 595)
                    img_width = min(Inches(img_width_px / 72), Inches(6.5))
                    img_run.add_picture(img_stream, width=img_width)
                    img_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
                except Exception as e:
                    print(f"Error adding image to Word document: {e}")
                    img_para = doc.add_paragraph()
                    img_run = img_para.add_run(f"[Image {img['index']} - Could not be inserted]")
                    img_run.font.italic = True

            if page["tables"]:
                for table_data in page["tables"]:
                    try:
                        enhanced_table = self._create_enhanced_table(doc, table_data)
                        if enhanced_table:
                            doc.add_paragraph()
                    except Exception as e:
                        print(f"Error adding table to Word document: {e}")

            text_lines = self._group_text_blocks_by_lines(page["text_blocks"])
            for line_blocks in text_lines:
                if not line_blocks:
                    continue
                para = doc.add_paragraph()
                line_blocks.sort(key=lambda b: b.x)
                for block in line_blocks:
                    cleaned_text = self.clean_text_for_xml(block.text)
                    if cleaned_text:
                        run = para.add_run(cleaned_text + " ")
                        self._set_font_properties(run, block)
            if page_idx < len(pdf_content["pages"]) - 1:
                self._add_page_break(doc)
        if output_path:
            try:
                Path(output_path).parent.mkdir(parents=True, exist_ok=True)
                doc.save(output_path)
                print(f"Word document saved to: {output_path}")
            except Exception as e:
                print(f"Error saving Word document to {output_path}: {e}")
        return doc

    def _get_current_timestamp(self) -> str:
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def process_pdf_to_word(self, pdf_path: str, output_path: str = None, use_hf_models: bool = False) -> Document:
        print(f"Processing PDF to Word: {pdf_path}")
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
        print("Extracting PDF content...")
        pdf_content = self.extract_pdf_content(pdf_path)
        if use_hf_models and self.hf_token:
            print("Attempting to enhance with Hugging Face models...")
            try:
                print("Note: Hugging Face model integration requires further implementation.")
            except Exception as e:
                print(f"Hugging Face enhancement failed: {e}")
        print("Converting to Word document...")
        word_doc = self.convert_to_word(pdf_content, output_path)
        print("Processing complete!")
        return word_doc


def main():
    HF_TOKEN = os.getenv("HF_API_TOKEN")

    converter = PDFToWordConverter(huggingface_token=HF_TOKEN)
    pdf_path = "supplychain (1).pdf"
    output_path = "converted_document_enhanced.docx"

    try:
        word_document = converter.process_pdf_to_word(
            pdf_path=pdf_path,
            output_path=output_path,
            use_hf_models=False
        )
        print(f"Successfully converted '{pdf_path}' to '{output_path}'")
        print(f"Open '{output_path}' in Microsoft Word to view the result!")
    except FileNotFoundError as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
requirements.txt
ADDED
@@ -0,0 +1,12 @@
PyMuPDF==1.23.26
Pillow==10.0.0
requests==2.31.0
transformers==4.35.0
torch==2.1.0
numpy==1.24.0
flask==2.3.3
flask-cors==4.0.0
werkzeug==2.3.7
camelot-py[cv]==0.11.0
gunicorn==21.2.0
python-docx==1.1.0
static/index.html
ADDED
@@ -0,0 +1,896 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>PDF Converter Tool</title>
    <style>
      * {
        margin: 0;
        padding: 0;
        box-sizing: border-box;
      }

      body {
        font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        min-height: 100vh;
        display: flex;
        align-items: center;
        justify-content: center;
        padding: 20px;
      }

      .container {
        background: rgba(255, 255, 255, 0.95);
        backdrop-filter: blur(10px);
        padding: 40px;
        border-radius: 20px;
        box-shadow: 0 20px 40px rgba(0, 0, 0, 0.1);
        max-width: 600px;
        width: 100%;
        animation: slideIn 0.6s ease-out;
      }

      @keyframes slideIn {
        from {
          opacity: 0;
          transform: translateY(30px);
        }
        to {
          opacity: 1;
          transform: translateY(0);
        }
      }

      .header {
        text-align: center;
        margin-bottom: 40px;
      }

      .header h1 {
        color: #333;
        font-size: 2.5em;
        margin-bottom: 10px;
        background: linear-gradient(45deg, #667eea, #764ba2);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        background-clip: text;
      }

      .header p {
        color: #666;
        font-size: 1.1em;
      }

      .status-indicator {
        position: absolute;
        top: 20px;
        right: 20px;
        padding: 8px 16px;
        border-radius: 20px;
        font-size: 0.8em;
        font-weight: 600;
        text-transform: uppercase;
        letter-spacing: 0.5px;
      }

      .status-online {
        background: #d4edda;
        color: #155724;
        border: 1px solid #c3e6cb;
      }

      .status-offline {
        background: #f8d7da;
        color: #721c24;
        border: 1px solid #f5c6cb;
      }

      .conversion-options {
        display: grid;
        gap: 20px;
        margin-bottom: 30px;
      }

      .option-card {
        background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
        border: none;
        border-radius: 15px;
        padding: 25px;
        cursor: pointer;
        transition: all 0.3s ease;
        color: white;
        text-align: left;
        position: relative;
        overflow: hidden;
      }

      .option-card:hover {
        transform: translateY(-5px);
        box-shadow: 0 15px 30px rgba(0, 0, 0, 0.2);
      }

      .option-card.html {
        background: linear-gradient(135deg, #fa709a 0%, #fee140 100%);
      }

      .option-card.word {
        background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
      }

      .option-card.json {
        background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%);
      }

      /* New style for Excel option card */
      .option-card.excel {
        background: linear-gradient(135deg, #2ecc71 0%, #27ae60 100%); /* Green shades for Excel */
      }

      .option-card::before {
        content: "";
        position: absolute;
        top: 0;
        left: -100%;
        width: 100%;
        height: 100%;
        background: linear-gradient(
          90deg,
          transparent,
          rgba(255, 255, 255, 0.2),
          transparent
        );
        transition: left 0.5s;
      }

      .option-card:hover::before {
        left: 100%;
      }

      .option-icon {
        font-size: 2em;
        margin-bottom: 10px;
      }

      .option-title {
        font-size: 1.3em;
        font-weight: bold;
        margin-bottom: 5px;
      }

      .option-desc {
        font-size: 0.9em;
        opacity: 0.9;
      }

      .upload-section {
        display: none;
        background: #f8f9fa;
        border-radius: 15px;
        padding: 30px;
        margin-top: 20px;
        border: 2px dashed #ddd;
        transition: all 0.3s ease;
      }

      .upload-section.active {
        display: block;
        animation: fadeIn 0.5s ease-out;
      }

      @keyframes fadeIn {
        from {
          opacity: 0;
        }
        to {
          opacity: 1;
        }
      }

      .file-input-wrapper {
        position: relative;
        display: inline-block;
        width: 100%;
        margin-bottom: 20px;
      }

      .file-input {
        display: none;
      }

      .file-input-label {
        display: block;
        padding: 15px 25px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        border-radius: 10px;
        cursor: pointer;
        text-align: center;
        transition: all 0.3s ease;
        font-weight: 500;
      }

      .file-input-label:hover {
        transform: translateY(-2px);
        box-shadow: 0 10px 20px rgba(0, 0, 0, 0.2);
      }

      .file-name {
        margin-top: 10px;
        padding: 10px;
        background: #e9ecef;
        border-radius: 8px;
        font-size: 0.9em;
        color: #495057;
        display: none;
      }

      .output-name {
        width: 100%;
        padding: 15px;
        border: 2px solid #e9ecef;
        border-radius: 10px;
        font-size: 1em;
        margin-bottom: 20px;
        transition: border-color 0.3s ease;
      }

      .output-name:focus {
        outline: none;
        border-color: #667eea;
      }

      .convert-btn {
        width: 100%;
        padding: 15px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        border: none;
        border-radius: 10px;
        font-size: 1.1em;
        font-weight: 600;
        cursor: pointer;
        transition: all 0.3s ease;
        position: relative;
        overflow: hidden;
      }

      .convert-btn:hover {
        transform: translateY(-2px);
        box-shadow: 0 10px 20px rgba(0, 0, 0, 0.2);
      }

      .convert-btn:disabled {
        opacity: 0.7;
        cursor: not-allowed;
        transform: none;
      }

      .back-btn {
        background: #6c757d;
        color: white;
        border: none;
        padding: 10px 20px;
        border-radius: 8px;
        cursor: pointer;
        margin-bottom: 20px;
        transition: all 0.3s ease;
      }

      .back-btn:hover {
        background: #5a6268;
        transform: translateY(-1px);
      }

      .progress-bar {
        width: 100%;
        height: 6px;
        background: #e9ecef;
        border-radius: 3px;
        margin: 20px 0;
        overflow: hidden;
        display: none;
      }

      .progress-fill {
        height: 100%;
        background: linear-gradient(90deg, #667eea, #764ba2);
        width: 0%;
        transition: width 0.3s ease;
        border-radius: 3px;
      }

      .result-section {
        margin-top: 20px;
        padding: 20px;
        border-radius: 12px;
        display: none;
      }

      .result-success {
        background: linear-gradient(135deg, #d4edda 0%, #c3e6cb 100%);
        border: 1px solid #c3e6cb;
        color: #155724;
      }

      .result-error {
        background: linear-gradient(135deg, #f8d7da 0%, #f5c6cb 100%);
        border: 1px solid #f5c6cb;
        color: #721c24;
      }

      .loading {
        display: none;
        text-align: center;
        margin: 20px 0;
      }

      .spinner {
        border: 4px solid #f3f3f3;
        border-top: 4px solid #667eea;
        border-radius: 50%;
        width: 40px;
        height: 40px;
        animation: spin 1s linear infinite;
        margin: 0 auto 10px;
      }

      @keyframes spin {
        0% {
          transform: rotate(0deg);
        }
        100% {
          transform: rotate(360deg);
        }
      }

      .drag-over {
        border-color: #667eea !important;
        background: rgba(102, 126, 234, 0.1) !important;
      }

      .debug-info {
        margin-top: 20px;
        padding: 15px;
        background: #f8f9fa;
        border-radius: 8px;
        font-size: 0.9em;
        color: #6c757d;
        border-left: 4px solid #007bff;
      }

      .error-details {
        margin-top: 10px;
        padding: 10px;
        background: #fff3cd;
        border: 1px solid #ffeaa7;
        border-radius: 6px;
        font-size: 0.85em;
        color: #856404;
        max-height: 200px;
        overflow-y: auto;
      }

      @media (max-width: 768px) {
        .container {
          padding: 25px;
          margin: 10px;
        }

        .header h1 {
          font-size: 2em;
        }

        .option-card {
          padding: 20px;
        }

        .status-indicator {
          position: relative;
          top: auto;
          right: auto;
          margin-bottom: 20px;
          display: inline-block;
        }
      }
    </style>
  </head>
  <body>
    <div class="container">
      <div id="status-indicator" class="status-indicator status-offline">
        Server Offline
      </div>

      <div class="header">
        <h1>🔧 PDF Converter</h1>
        <p>Convert your PDF files to HTML, Word, JSON, or Excel format</p>
      </div>

      <div id="main-menu">
        <div class="conversion-options">
          <button class="option-card html" onclick="showUploadSection('html')">
            <div class="option-icon">📄</div>
            <div class="option-title">Convert to HTML</div>
            <div class="option-desc">
              Transform PDF into web-ready HTML format
            </div>
          </button>

          <button class="option-card word" onclick="showUploadSection('word')">
            <div class="option-icon">📝</div>
            <div class="option-title">Convert to Word</div>
            <div class="option-desc">
              Create editable Word documents from PDF
            </div>
          </button>

          <button class="option-card json" onclick="showUploadSection('json')">
            <div class="option-icon">📋</div>
            <div class="option-title">Convert to JSON</div>
            <div class="option-desc">
              Extract structured data in JSON format
            </div>
          </button>

          <button class="option-card excel" onclick="showUploadSection('excel')">
            <div class="option-icon">📊</div>
            <div class="option-title">Convert to Excel</div>
            <div class="option-desc">
              Organize PDF tables into an Excel spreadsheet
            </div>
          </button>
        </div>
      </div>

      <div id="upload-section" class="upload-section">
        <button class="back-btn" onclick="showMainMenu()">
          ← Back to Menu
        </button>

        <div class="file-input-wrapper">
          <input
            type="file"
            id="pdf-file"
            class="file-input"
            accept=".pdf"
            onchange="handleFileSelect(event)"
          />
          <label for="pdf-file" class="file-input-label" id="file-label">
            📁 Choose PDF File or Drag & Drop Here
          </label>
          <div id="file-name" class="file-name"></div>
        </div>

        <div class="loading" id="loading">
          <div class="spinner"></div>
          <p>Converting your PDF file...</p>
        </div>

        <div class="progress-bar" id="progress-bar">
          <div class="progress-fill" id="progress-fill"></div>
        </div>

        <button
          class="convert-btn"
          id="convert-btn"
          onclick="convertFile()"
          disabled
        >
          🚀 Start Conversion
        </button>

        <div id="result-section" class="result-section">
          <div id="result-message"></div>
        </div>

        <div id="debug-info" class="debug-info" style="display: none">
          <strong>Debug Information:</strong>
          <div id="debug-content"></div>
        </div>
      </div>
    </div>
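
    <!--
      Wiring overview: the script below polls /health every 30 seconds to toggle
      the status badge, validates the selected PDF, POSTs it to /convert as
      multipart form data, and auto-downloads the result on success.
    -->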
    <script>
      let currentFormat = "";
      let selectedFile = null;
      let serverOnline = false;

      // Check server status on page load
      document.addEventListener("DOMContentLoaded", function () {
        checkServerStatus();
        // Check server status every 30 seconds
        setInterval(checkServerStatus, 30000);
      });

      async function checkServerStatus() {
        try {
          const response = await fetch("/health", {
            method: "GET",
            mode: "cors",
            headers: {
              Accept: "application/json",
            },
            signal: AbortSignal.timeout(5000), // 5 second timeout
          });

          if (response.ok) {
            const data = await response.json();
            updateServerStatus(true, data.message || "Server is online");
          } else {
            updateServerStatus(false, `Server returned ${response.status}`);
          }
        } catch (error) {
          updateServerStatus(false, error.message);
        }
      }

      function updateServerStatus(online, message) {
        serverOnline = online;
        const indicator = document.getElementById("status-indicator");

        if (online) {
          indicator.className = "status-indicator status-online";
          indicator.textContent = "Server Online";
          indicator.title = message;
        } else {
          indicator.className = "status-indicator status-offline";
          indicator.textContent = "Server Offline";
          indicator.title = `Error: ${message}`;
        }
      }

      function showUploadSection(format) {
        if (!serverOnline) {
          alert("Server is offline. Please start the Flask server first.");
          return;
        }

        currentFormat = format;
        document.getElementById("main-menu").style.display = "none";
        document.getElementById("upload-section").classList.add("active");

        resetForm(); // ✅ Always reset when entering the upload section

        // There is no output-name input in the markup; the output filename is
        // derived from the uploaded file's name in convertFile(), and the chosen
        // format maps to .html, .docx, .json, or .xlsx.
      }

      function showMainMenu() {
        document.getElementById("main-menu").style.display = "block";
        document.getElementById("upload-section").classList.remove("active");
        resetForm();
      }

      function resetForm() {
        selectedFile = null;

        const pdfInput = document.getElementById("pdf-file");
        const fileName = document.getElementById("file-name");
        const fileLabel = document.getElementById("file-label");

        // Clear inputs
        pdfInput.value = "";

        // Hide filename display
        fileName.style.display = "none";
        fileName.textContent = "";

        // Reset label text
        fileLabel.textContent = "📁 Choose PDF File or Drag & Drop Here";

        // Reset buttons and sections
        document.getElementById("convert-btn").disabled = true;
        document.getElementById("result-section").style.display = "none";
        document.getElementById("loading").style.display = "none";
        document.getElementById("progress-bar").style.display = "none";
        document.getElementById("debug-info").style.display = "none";

        // Also reset drag-over styling if stuck
        document.getElementById("upload-section").classList.remove("drag-over");
      }

      function handleFileSelect(event) {
        const file = event.target.files[0];
        if (file && file.type === "application/pdf") {
          selectedFile = file;
          document.getElementById("file-name").textContent = `Selected: ${
            file.name
          } (${(file.size / 1024 / 1024).toFixed(2)} MB)`;
          document.getElementById("file-name").style.display = "block";
          document.getElementById(
            "file-label"
          ).textContent = `✅ ${file.name} selected`;
          checkFormValidity();
        } else {
          alert("Please select a valid PDF file.");
          resetFileInput();
        }
      }

      function resetFileInput() {
        selectedFile = null;
        document.getElementById("pdf-file").value = "";
        document.getElementById("file-name").style.display = "none";
        document.getElementById("file-label").textContent =
          "📁 Choose PDF File or Drag & Drop Here";
        checkFormValidity();
      }

      function checkFormValidity() {
        const convertBtn = document.getElementById("convert-btn");
        if (selectedFile && serverOnline) {
          convertBtn.disabled = false;
          convertBtn.textContent = "🚀 Start Conversion";
        } else {
          convertBtn.disabled = true;
          convertBtn.textContent = serverOnline
            ? "🚀 Start Conversion"
            : "❌ Server Offline";
        }
      }

      // Drag and drop functionality
      const uploadSection = document.getElementById("upload-section");

      ["dragenter", "dragover", "dragleave", "drop"].forEach((eventName) => {
        uploadSection.addEventListener(eventName, preventDefaults, false);
      });

      function preventDefaults(e) {
        e.preventDefault();
        e.stopPropagation();
      }

      ["dragenter", "dragover"].forEach((eventName) => {
        uploadSection.addEventListener(eventName, highlight, false);
      });

      ["dragleave", "drop"].forEach((eventName) => {
        uploadSection.addEventListener(eventName, unhighlight, false);
      });

      function highlight() {
        uploadSection.classList.add("drag-over");
      }

      function unhighlight() {
        uploadSection.classList.remove("drag-over");
      }

      uploadSection.addEventListener("drop", handleDrop, false);

      function handleDrop(e) {
        const dt = e.dataTransfer;
        const files = dt.files;

        if (files.length > 0) {
          const file = files[0];
          if (file.type === "application/pdf") {
            selectedFile = file;
            document.getElementById("file-name").textContent = `Selected: ${
              file.name
            } (${(file.size / 1024 / 1024).toFixed(2)} MB)`;
            document.getElementById("file-name").style.display = "block";
            document.getElementById(
              "file-label"
            ).textContent = `✅ ${file.name} selected`;
            checkFormValidity();
          } else {
            alert("Please drop a valid PDF file.");
          }
        }
      }

      async function convertFile() {
        if (!selectedFile || !currentFormat) {
          alert("Please select a file and format.");
          return;
        }

        if (!serverOnline) {
          alert("Server is offline. Please start the Flask server first.");
          return;
        }

        // Derive the output filename from the uploaded file's name
        const outputName = selectedFile.name.replace(/\.[^/.]+$/, "");

        document.getElementById("loading").style.display = "block";
        document.getElementById("progress-bar").style.display = "block";
        document.getElementById("convert-btn").disabled = true;
        document.getElementById("result-section").style.display = "none";
        document.getElementById("debug-info").style.display = "none";

        simulateProgress();

        const formData = new FormData();
        formData.append("file", selectedFile);
        formData.append("format", currentFormat);
        formData.append("output_name", outputName);

        const debugInfo = {
          fileName: selectedFile.name,
          fileSize: selectedFile.size,
          format: currentFormat,
          outputName: outputName,
          timestamp: new Date().toISOString(),
        };

        try {
          console.log("🚀 Starting conversion...", debugInfo);

          const controller = new AbortController();
          const timeoutId = setTimeout(() => controller.abort(), 420000); // 7-minute (420,000 ms) timeout

          const response = await fetch("/convert", {
            method: "POST",
            body: formData,
            headers: {
              Accept: "application/json",
            },
            mode: "cors",
            signal: controller.signal,
          });

          clearTimeout(timeoutId);
          console.log("📡 Response status:", response.status);

          if (!response.ok) {
            const errorText = await response.text();
            throw new Error(`Server returned ${response.status}: ${errorText}`);
          }

          const result = await response.json();
          console.log("✅ Conversion result:", result);

          // Hide loading
          document.getElementById("loading").style.display = "none";
          document.getElementById("progress-bar").style.display = "none";

          // Show result
          const resultSection = document.getElementById("result-section");
          const resultMessage = document.getElementById("result-message");

          if (result.success) {
            resultSection.className = "result-section result-success";
            resultMessage.innerHTML = `<h3>✅ Conversion Successful!</h3>
              <p>Your PDF has been converted to ${currentFormat.toUpperCase()} format.</p>
              <p><strong>Output file:</strong> ${
                result.output_path || "Generated successfully"
              }</p>`;

            if (result.download_url) {
              const downloadUrl = `${window.location.origin}${result.download_url}`;

              // Add link for user
              resultMessage.innerHTML += `<p><a href="${downloadUrl}" target="_blank" style="color: #155724; text-decoration: none; font-weight: bold;">📥 Download File</a></p>`;

              // ⬇️ Auto-download
              const a = document.createElement("a");
              a.href = downloadUrl;
              a.download = result.output_path || "converted_file";
              document.body.appendChild(a);
              a.click();
              document.body.removeChild(a);
            }
          } else {
            resultSection.className = "result-section result-error";
            resultMessage.innerHTML = `
              <h3>❌ Conversion Failed</h3>
              <p>${
                result.error || "An unexpected error occurred."
              }</p>
            `;
          }

          resultSection.style.display = "block";
        } catch (error) {
          console.error("❌ Error during conversion:", error);

          // Hide loading
          document.getElementById("loading").style.display = "none";
          document.getElementById("progress-bar").style.display = "none";

          // Show error
          const resultSection = document.getElementById("result-section");
          const resultMessage = document.getElementById("result-message");

          resultSection.className = "result-section result-error";

          let errorMessage = "An unexpected error occurred.";
          if (error.name === "AbortError") {
            errorMessage =
              "Request timed out. The file might be too large or the server is taking too long to respond.";
          } else if (error.message.includes("Failed to fetch")) {
            errorMessage =
              "Cannot connect to server. Please ensure the Flask server is running on http://localhost:5000";
          } else {
            errorMessage = error.message;
          }

          resultMessage.innerHTML = `
            <h3>❌ Conversion Error</h3>
            <p>${errorMessage}</p>
          `;

          resultSection.style.display = "block";

          // Show debug information
          const debugElement = document.getElementById("debug-info");
          const debugContent = document.getElementById("debug-content");
          debugContent.innerHTML = `
            <div class="error-details">
              <strong>Error Details:</strong><br>
              Type: ${error.name}<br>
              Message: ${error.message}<br>
              <br>
              <strong>Request Details:</strong><br>
              ${JSON.stringify(debugInfo, null, 2)}
              <br>
              <strong>Troubleshooting:</strong><br>
              1. Ensure Flask server is running: python app.py<br>
              2. Check server logs for errors<br>
              3. Verify file size is under 100MB<br>
              4. Check browser console for additional errors
            </div>
          `;
          debugElement.style.display = "block";
        }

        document.getElementById("convert-btn").disabled = false;
        checkFormValidity(); // Update button state
      }
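
      // Note: the progress bar below is cosmetic; it advances in random steps
      // and parks at 90% until the /convert request actually resolves.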
      function simulateProgress() {
        const progressFill = document.getElementById("progress-fill");
        let progress = 0;

        const interval = setInterval(() => {
          progress += Math.random() * 15;
          if (progress > 90) progress = 90;

          progressFill.style.width = progress + "%";

          if (progress >= 90) {
            clearInterval(interval);
          }
        }, 200);

        // Reset progress after animation
        setTimeout(() => {
          progressFill.style.width = "0%";
        }, 5000);
      }
    </script>
  </body>
</html>
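
Two notes on wiring, since the app.py excerpt above ends before the remaining routes. First, the page polls a /health endpoint every 30 seconds to drive the status badge; a minimal sketch of such a route, assuming the full app.py defines something equivalent (names here are illustrative):

```python
# Sketch only: assumes the Flask `app` object defined in app.py.
from flask import jsonify

@app.route('/health', methods=['GET'])
def health():
    """Answer the frontend's 30-second poll so the badge flips to 'Server Online'."""
    return jsonify({'success': True, 'message': 'PDF converter server is running'})
```

Second, a hedged example of exercising /convert from Python rather than the browser; `requests` is already pinned in requirements.txt, the field names match the FormData the page builds, and `sample.pdf` is a placeholder path:

```python
import requests

with open('sample.pdf', 'rb') as f:  # placeholder input file
    resp = requests.post(
        'http://localhost:5000/convert',
        files={'file': ('sample.pdf', f, 'application/pdf')},
        data={'format': 'excel', 'output_name': 'sample'},
        timeout=420,  # mirrors the page's 7-minute client-side abort
    )
print(resp.status_code, resp.json())
```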