import logging
import os
import re
import subprocess
import tempfile
from typing import List, Optional
from urllib.parse import urlparse

import gradio as gr
import requests
from PIL import Image
from pdf2image import convert_from_bytes, convert_from_path
from pdf2image.exceptions import PDFInfoNotInstalledError, PDFPageCountError

# --- Logging Configuration ---
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Base name used for the output file when none can be derived from the input.
DEFAULT_SOURCE_NAME = "document"
# Seconds to wait for the remote server before giving up on a URL download.
DOWNLOAD_TIMEOUT_SECONDS = 45
# Resolution passed to pdf2image when rasterising pages.
RENDER_DPI = 200
# Characters that are path separators, control characters, or reserved on
# common filesystems; they are replaced before building the temp-file prefix.
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]+')


def check_poppler():
    """
    Checks if the Poppler PDF rendering utility is installed and accessible.

    Returns:
        True if ``pdftoppm -v`` exits with status 0 or prints a version banner
        on stderr, False otherwise (including when the command is missing).
    """
    try:
        # Run a simple poppler command to check for its existence and version
        result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True, check=False)
    except FileNotFoundError:
        logger.error("Poppler (pdftoppm command) not found. Ensure poppler-utils is installed and in the system's PATH.")
        return False
    except Exception as e:
        # Best-effort startup probe: report the problem instead of crashing.
        logger.error("An unexpected error occurred during Poppler check: %s", e)
        return False

    # The version banner is looked for on stderr as well, so a non-zero exit
    # status alone does not fail the check.
    if result.returncode == 0 or "pdftoppm version" in result.stderr:
        logger.info("Poppler check successful.")
        return True

    logger.error("Poppler check failed. stderr: %s", result.stderr.strip())
    return False


def stitch_images_vertically(images: List[Image.Image]) -> Optional[Image.Image]:
    """
    Stitches a list of PIL Images together vertically.

    Args:
        images: A list of PIL Image objects.

    Returns:
        A single RGB PIL Image containing all input images stacked top to
        bottom on a white canvas, or None when ``images`` is empty. Images
        narrower than the widest one are left-aligned.
    """
    if not images:
        return None

    # The canvas is as wide as the widest page and as tall as all pages combined.
    max_width = max(img.width for img in images)
    total_height = sum(img.height for img in images)

    # Create a new blank image (canvas) with a white background
    stitched_image = Image.new("RGB", (max_width, total_height), (255, 255, 255))

    # Paste each image onto the canvas, one below the other
    current_y = 0
    for img in images:
        stitched_image.paste(img, (0, current_y))
        current_y += img.height

    return stitched_image


def _safe_stem(raw_name):
    """Return the extension-less base name of *raw_name*, made safe for use in a file name."""
    stem = os.path.splitext(os.path.basename(raw_name))[0]
    stem = _UNSAFE_FILENAME_CHARS.sub("_", stem).strip()
    # An empty stem (e.g. a URL whose path ends in "/") falls back to the default.
    return stem or DEFAULT_SOURCE_NAME


def process_pdf(pdf_file, pdf_url, progress=gr.Progress()):
    """
    The main processing function for the Gradio interface.

    It takes a PDF (either as an uploaded file or a URL), converts all its
    pages to images, and stitches them into a single tall image.

    Args:
        pdf_file: The uploaded file from the ``gr.File`` input, or None. Either
            an object exposing the temp path as ``.name`` or a plain path string.
        pdf_url: URL of a PDF to download; only used when ``pdf_file`` is None.
        progress: Gradio progress tracker used to report the current step.

    Returns:
        A tuple ``(output_path, output_path)``: the same PNG path twice, once
        for the image preview and once for the file download component.

    Raises:
        gr.Error: If no input is given, the download fails, Poppler is
            missing, the PDF cannot be converted, or it yields no pages.
    """
    pdf_input_source = None
    is_bytes = False
    source_name = DEFAULT_SOURCE_NAME  # Default name for output file

    # --- 1. Determine Input Source ---
    progress(0, desc="Validating input...")

    if pdf_file is not None:
        # Use `.name` when the upload object has one, otherwise treat the
        # value itself as the path.
        upload_path = getattr(pdf_file, "name", pdf_file)
        logger.info("Processing uploaded file: %s", upload_path)
        pdf_input_source = upload_path
        source_name = _safe_stem(upload_path)
    elif pdf_url and pdf_url.strip():
        url = pdf_url.strip()
        logger.info("Processing file from URL: %s", url)
        progress(0.1, desc="Downloading PDF from URL...")
        try:
            response = requests.get(url, timeout=DOWNLOAD_TIMEOUT_SECONDS)
            response.raise_for_status()
        except requests.RequestException as e:
            raise gr.Error(f"Failed to download PDF from URL. Error: {e}") from e
        pdf_input_source = response.content
        # Only the URL path names the document; query string and fragment are dropped.
        source_name = _safe_stem(urlparse(url).path)
        is_bytes = True
    else:
        raise gr.Error("Please upload a PDF file or provide a valid URL.")

    # --- 2. Convert PDF to a List of Images ---
    progress(0.3, desc="Converting PDF pages to images...")
    try:
        if is_bytes:
            images = convert_from_bytes(pdf_input_source, dpi=RENDER_DPI)
        else:
            images = convert_from_path(pdf_input_source, dpi=RENDER_DPI)
    except (PDFInfoNotInstalledError, FileNotFoundError) as e:
        raise gr.Error(
            "Poppler not found. Please ensure poppler-utils is installed and in your system's PATH."
        ) from e
    except Exception as e:
        # UI boundary: any other conversion failure (PDFPageCountError
        # included) is surfaced to the user as a Gradio error.
        raise gr.Error(
            f"Failed to process the PDF. It might be corrupted or password-protected. Error: {e}"
        ) from e

    if not images:
        raise gr.Error("Could not extract any pages from the PDF. The file might be empty or invalid.")

    logger.info("Successfully converted %s pages to images.", len(images))

    # --- 3. Stitch the Images Together ---
    progress(0.7, desc=f"Stitching {len(images)} images together...")
    stitched_image = stitch_images_vertically(images)
    if stitched_image is None:
        raise gr.Error("Image stitching failed.")
    logger.info("Image stitching complete.")

    # --- 4. Save the Final Image to a Temporary File ---
    progress(0.9, desc="Saving final image...")
    # delete=False keeps the file on disk after the handle closes so it still
    # exists when the returned path is used. The image is written through the
    # open handle rather than re-opening the path while it is still held.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".png", prefix=f"{source_name}_stitched_"
    ) as tmp_file:
        stitched_image.save(tmp_file, "PNG")
        output_path = tmp_file.name

    logger.info("Final image saved to temporary path: %s", output_path)
    progress(1, desc="Done!")

    # --- 5. Return the path for the Gradio output components ---
    # The first path is for the gr.Image preview, the second for the gr.File download.
    return output_path, output_path


# --- Gradio Interface Definition ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # PDF Page Stitcher 📄 ➡️ 🖼️
        Upload a PDF file or provide a URL. This tool will convert every page of the PDF into an image
        and then append them beneath each other to create a single, tall image that you can download.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.TabItem("Upload PDF"):
                    pdf_file_input = gr.File(label="Upload PDF File", file_types=[".pdf"])
                with gr.TabItem("From URL"):
                    pdf_url_input = gr.Textbox(
                        label="PDF URL",
                        placeholder="e.g., https://arxiv.org/pdf/1706.03762.pdf"
                    )
            submit_btn = gr.Button("Stitch PDF Pages", variant="primary")

        with gr.Column(scale=2):
            gr.Markdown("## Output")
            output_image_preview = gr.Image(
                label="Stitched Image Preview",
                type="filepath",
                interactive=False,
                height=600,  # Set a fixed height for the preview area
            )
            output_image_download = gr.File(
                label="Download Stitched Image",
                interactive=False
            )

    # Connect the button click event to the processing function
    submit_btn.click(
        fn=process_pdf,
        inputs=[pdf_file_input, pdf_url_input],
        outputs=[output_image_preview, output_image_download]
    )

    # NOTE(review): with cache_examples=True Gradio presumably runs process_pdf
    # on these examples ahead of user interaction, which needs network access
    # and a working Poppler install - confirm this is acceptable at startup.
    gr.Examples(
        examples=[
            [None, "https://arxiv.org/pdf/1706.03762.pdf"],  # "Attention is All You Need" paper
            [None, "https://bitcoin.org/bitcoin.pdf"],  # Bitcoin whitepaper
        ],
        inputs=[pdf_file_input, pdf_url_input],
        outputs=[output_image_preview, output_image_download],
        fn=process_pdf,
        cache_examples=True  # Cache results for faster demo
    )

# --- Main Execution ---
if __name__ == '__main__':
    # Perform a check for Poppler when the script starts
    if not check_poppler():
        logger.warning(
            "Poppler utilities could not be verified. The application may fail to process PDFs."
        )

    # Launch the Gradio application
    demo.launch()