# PDF_2_Image / app.py
# Hugging Face Space file by broadfield-dev.
# Commit 67e4414 (verified): "Update app.py" -- file size 4.85 kB.
import os
import tempfile
import logging
from typing import List
import gradio as gr
import requests
from PIL import Image
from pdf2image import convert_from_path, convert_from_bytes
from pdf2image.exceptions import PDFInfoNotInstalledError, PDFPageCountError
# Configure the root logger once at import time: INFO and above, each record
# prefixed with a timestamp and its severity level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-scoped logger used by the functions in this file.
logger = logging.getLogger(__name__)
def stitch_images_vertically(images: List[Image.Image]) -> Image.Image:
    """Stack *images* top-to-bottom on a single white RGB canvas.

    The canvas is as wide as the widest image and as tall as the sum of all
    image heights. Every image is pasted with its left edge at x = 0, so any
    image narrower than the canvas leaves white space on its right.

    Args:
        images: Pillow images, in the order they should appear from top to
            bottom.

    Returns:
        The combined image, or ``None`` when *images* is empty (or otherwise
        falsy).
    """
    # Guard clause: nothing to stitch.
    if not images:
        return None

    canvas_size = (
        max(page.width for page in images),
        sum(page.height for page in images),
    )
    canvas = Image.new('RGB', canvas_size, (255, 255, 255))

    # Walk down the canvas, placing each page directly below the previous one.
    top = 0
    for page in images:
        canvas.paste(page, (0, top))
        top += page.height

    return canvas
def _safe_stem(raw_name, fallback="document", max_length=50):
    """Return an extension-less, filesystem-safe base name taken from *raw_name*.

    Only the final path component is used. Characters other than letters,
    digits, spaces and ``-_.()`` are replaced with ``_``, and the result is
    truncated to *max_length* characters so that it can be embedded in a
    temp-file prefix without exceeding file-name length limits. *fallback* is
    returned when nothing usable is left.
    """
    stem = os.path.splitext(os.path.basename(raw_name))[0]
    cleaned = "".join(
        ch if ch.isalnum() or ch in " -_.()" else "_" for ch in stem
    ).strip(" ._")
    return cleaned[:max_length] or fallback


def process_pdf(pdf_file, pdf_url, progress=gr.Progress()):
    """Convert a PDF into one tall PNG and return the path to that PNG twice.

    The PDF comes either from an uploaded file or, when no file is given, from
    a URL that is downloaded with ``requests``. Every page is rendered at
    200 DPI, the pages are stacked with ``stitch_images_vertically`` and the
    result is written to a temporary ``.png`` file that is NOT deleted
    automatically.

    Args:
        pdf_file: Uploaded file. Either an object exposing the file path as
            ``.name`` or a plain path string; ``None`` when nothing was
            uploaded.
        pdf_url: URL of a PDF. Only consulted when *pdf_file* is ``None``.
        progress: Progress callback. The ``gr.Progress()`` default is kept
            from the original signature.

    Returns:
        ``(output_path, output_path)`` -- the same path for both the image
        preview and the download component.

    Raises:
        gr.Error: If no input is supplied, the download fails, the PDF cannot
            be converted, or no pages are produced.
    """
    pdf_input_source = None
    is_bytes = False
    source_name = "document"

    progress(0, desc="Validating input...")

    if pdf_file is not None:
        # Accept both a file-like object with ``.name`` and a bare path string.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        logger.info("Processing uploaded file: %s", pdf_path)
        pdf_input_source = pdf_path
        source_name = _safe_stem(pdf_path)
    elif pdf_url and pdf_url.strip():
        # Local import keeps this block self-contained (stdlib only).
        from urllib.parse import unquote, urlsplit

        url = pdf_url.strip()
        logger.info("Processing file from URL: %s", url)
        progress(0.1, desc="Downloading PDF from URL...")
        # NOTE(review): the URL is user-supplied and fetched server-side with
        # no host allow-list and no response-size cap -- consider restricting
        # both if this app is exposed publicly.
        try:
            response = requests.get(url, timeout=45)
            response.raise_for_status()
        except requests.RequestException as e:
            raise gr.Error(f"Failed to download PDF from URL. Error: {e}") from e

        pdf_input_source = response.content
        is_bytes = True

        # Derive the output name from the URL *path* only, so query strings
        # and fragments are ignored and percent-escapes are decoded before
        # sanitising.
        try:
            url_path = unquote(urlsplit(url).path)
        except ValueError:
            url_path = ""
        source_name = _safe_stem(url_path)
    else:
        raise gr.Error("Please upload a PDF file or provide a valid URL.")

    progress(0.3, desc="Converting PDF pages to images...")
    try:
        if is_bytes:
            images = convert_from_bytes(pdf_input_source, dpi=200)
        else:
            images = convert_from_path(pdf_input_source, dpi=200)
    except (PDFInfoNotInstalledError, FileNotFoundError) as e:
        logger.exception("PDF conversion failed: Poppler not available.")
        raise gr.Error("Server configuration error: Poppler dependency is missing.") from e
    except Exception as e:
        # Boundary handler: any other conversion failure (including
        # PDFPageCountError) is surfaced to the user as a gr.Error.
        logger.exception("PDF conversion failed.")
        raise gr.Error(
            f"Failed to process the PDF. It might be corrupted or password-protected. Error: {e}"
        ) from e

    if not images:
        raise gr.Error("Could not extract any pages from the PDF. The file might be empty or invalid.")

    logger.info("Successfully converted %d pages to images.", len(images))
    progress(0.7, desc=f"Stitching {len(images)} images together...")

    stitched_image = stitch_images_vertically(images)
    if stitched_image is None:
        raise gr.Error("Image stitching failed.")
    logger.info("Image stitching complete.")

    progress(0.9, desc="Saving final image...")
    # Reserve a unique path first, then write to it after the handle is
    # closed: saving by name while the handle is still open is not portable.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".png", prefix=f"{source_name}_stitched_"
    ) as tmp_file:
        output_path = tmp_file.name
    stitched_image.save(output_path, "PNG")
    logger.info("Final image saved to temporary path: %s", output_path)

    progress(1, desc="Done!")
    return output_path, output_path
# Gradio UI: inputs (upload tab / URL tab + submit button) in the left column,
# stitched-image preview and download link in the right column.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Heading and usage blurb. The emoji in the heading were previously
    # mis-decoded (mojibake) and are restored here. The text is kept
    # flush-left so it contains no leading indentation.
    gr.Markdown(
        """
# PDF Page Stitcher 📄 ➡️ 🖼️
Upload a PDF file or provide a URL. This tool will convert every page of the PDF into an image
and then append them beneath each other to create a single, tall image that you can download.
"""
    )
    with gr.Row():
        # Left column: choose the PDF source and trigger processing.
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.TabItem("Upload PDF"):
                    pdf_file_input = gr.File(label="Upload PDF File", file_types=[".pdf"])
                with gr.TabItem("From URL"):
                    pdf_url_input = gr.Textbox(
                        label="PDF URL",
                        placeholder="e.g., https://arxiv.org/pdf/1706.03762.pdf"
                    )
            submit_btn = gr.Button("Stitch PDF Pages", variant="primary")
        # Right column: read-only preview plus a downloadable copy.
        with gr.Column(scale=2):
            gr.Markdown("## Output")
            output_image_preview = gr.Image(
                label="Stitched Image Preview",
                type="filepath",
                interactive=False,
                height=600,
            )
            output_image_download = gr.File(
                label="Download Stitched Image",
                interactive=False
            )
    # process_pdf returns the same file path twice: once for the preview and
    # once for the download component.
    submit_btn.click(
        fn=process_pdf,
        inputs=[pdf_file_input, pdf_url_input],
        outputs=[output_image_preview, output_image_download]
    )

# Listen on all interfaces on port 7860, with debug output enabled.
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)