broadfield-dev committed on
Commit
45cdd58
·
verified ·
1 Parent(s): 1fe9990

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +208 -0
app.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import logging
4
+ import subprocess
5
+ from typing import List
6
+
7
+ import gradio as gr
8
+ import requests
9
+ from PIL import Image
10
+ from pdf2image import convert_from_path, convert_from_bytes
11
+ from pdf2image.exceptions import PDFInfoNotInstalledError, PDFPageCountError
12
+
13
# --- Logging Configuration ---
# Root logging emits INFO and above, formatted as "<time> - <level> - <message>".
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)
16
+
17
+
18
def check_poppler():
    """
    Check whether the Poppler ``pdftoppm`` utility is installed and runnable.

    Runs ``pdftoppm -v`` and treats Poppler as available when the command
    exits with status 0 or prints its version banner on stderr.

    Returns:
        bool: True if the check succeeded, False otherwise. Every failure is
        logged and reported through the return value; nothing is raised.
    """
    try:
        # Only the subprocess call can raise, so it is the only thing inside
        # the try. The timeout keeps a hung binary from blocking startup.
        result = subprocess.run(
            ["pdftoppm", "-v"],
            capture_output=True,
            text=True,
            check=False,
            timeout=15,
        )
    except FileNotFoundError:
        logger.error("Poppler (pdftoppm command) not found. Ensure poppler-utils is installed and in the system's PATH.")
        return False
    except subprocess.TimeoutExpired:
        logger.error("Poppler check timed out while running 'pdftoppm -v'.")
        return False
    except Exception as e:
        # Best-effort startup probe: report anything unexpected instead of crashing.
        logger.error(f"An unexpected error occurred during Poppler check: {e}")
        return False

    # The version banner is accepted even when the exit status is non-zero.
    if result.returncode == 0 or "pdftoppm version" in result.stderr:
        logger.info("Poppler check successful.")
        return True

    logger.error(f"Poppler check failed. stderr: {result.stderr.strip()}")
    return False
37
+
38
+
39
def stitch_images_vertically(images: List[Image.Image]) -> Image.Image:
    """
    Stack PIL images top-to-bottom, in list order, on one white RGB canvas.

    The canvas is as wide as the widest input and as tall as all inputs
    combined. Each image is pasted flush with the left edge, so any area to
    the right of a narrower image stays white.

    Args:
        images: PIL Image objects to stack.

    Returns:
        A new PIL Image containing every input image, or None when
        ``images`` is empty.
    """
    if not images:
        return None

    # Collect the dimensions once; they determine the canvas size.
    widths = [page.width for page in images]
    heights = [page.height for page in images]
    canvas = Image.new('RGB', (max(widths), sum(heights)), (255, 255, 255))

    # Walk down the canvas, placing each image directly below the previous one.
    offset_y = 0
    for page, page_height in zip(images, heights):
        canvas.paste(page, (0, offset_y))
        offset_y += page_height

    return canvas
68
+
69
+
70
def _derive_source_name(location, default="document"):
    """Return the file-name stem of *location*, dropping only a trailing ``.pdf``."""
    base_name = os.path.basename(location)
    stem, extension = os.path.splitext(base_name)
    # Strip the extension only when it really is ".pdf"; a blind splitext
    # would truncate names such as "1706.03762" to "1706".
    if extension.lower() != ".pdf":
        stem = base_name
    return stem or default


def process_pdf(pdf_file, pdf_url, progress=gr.Progress()):
    """
    The main processing function for the Gradio interface.

    Takes a PDF (either an uploaded file or a URL), converts all of its pages
    to images at 200 DPI, stitches them into a single tall image, and saves
    that image as a PNG in a temporary file.

    Args:
        pdf_file: The uploaded PDF, or None. Accepted either as a path
            (str / os.PathLike) or as an object exposing the path as ``.name``.
        pdf_url: URL of a PDF to download; used only when ``pdf_file`` is None.
        progress: Gradio progress tracker used to report status.

    Returns:
        A ``(path, path)`` tuple: the same PNG path twice, once for the image
        preview component and once for the file download component.

    Raises:
        gr.Error: If no input was given, the download failed, Poppler is
            missing, the PDF could not be converted, or no pages were found.
    """
    pdf_input_source = None
    is_bytes = False
    source_name = "document"  # Default name for output file

    # --- 1. Determine Input Source ---
    progress(0, desc="Validating input...")
    if pdf_file is not None:
        # Depending on the Gradio version/configuration the upload arrives as
        # a plain path or as a tempfile-like object with .name; accept both.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = pdf_file.name
        logger.info(f"Processing uploaded file: {pdf_path}")
        pdf_input_source = pdf_path
        source_name = _derive_source_name(pdf_path)
        is_bytes = False
    elif pdf_url and pdf_url.strip():
        url = pdf_url.strip()
        logger.info(f"Processing file from URL: {url}")
        progress(0.1, desc="Downloading PDF from URL...")
        try:
            response = requests.get(url, timeout=45)
            response.raise_for_status()
        except requests.RequestException as e:
            raise gr.Error(f"Failed to download PDF from URL. Error: {e}") from e
        pdf_input_source = response.content
        # Drop the fragment and query string before deriving the output name.
        source_name = _derive_source_name(url.split('#')[0].split('?')[0])
        is_bytes = True
    else:
        raise gr.Error("Please upload a PDF file or provide a valid URL.")

    # --- 2. Convert PDF to a List of Images ---
    progress(0.3, desc="Converting PDF pages to images...")
    try:
        if is_bytes:
            images = convert_from_bytes(pdf_input_source, dpi=200)
        else:
            images = convert_from_path(pdf_input_source, dpi=200)
    except (PDFInfoNotInstalledError, FileNotFoundError) as e:
        raise gr.Error("Poppler not found. Please ensure poppler-utils is installed and in your system's PATH.") from e
    except (PDFPageCountError, Exception) as e:
        # UI boundary: every other conversion failure gets one user-facing message.
        raise gr.Error(f"Failed to process the PDF. It might be corrupted or password-protected. Error: {e}") from e

    if not images:
        raise gr.Error("Could not extract any pages from the PDF. The file might be empty or invalid.")

    logger.info(f"Successfully converted {len(images)} pages to images.")

    # --- 3. Stitch the Images Together ---
    progress(0.7, desc=f"Stitching {len(images)} images together...")

    stitched_image = stitch_images_vertically(images)
    if stitched_image is None:
        raise gr.Error("Image stitching failed.")

    logger.info("Image stitching complete.")

    # --- 4. Save the Final Image to a Temporary File ---
    progress(0.9, desc="Saving final image...")

    # Reserve a uniquely named file that outlives this call (delete=False) so
    # Gradio can serve it afterwards.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png", prefix=f"{source_name}_stitched_") as tmp_file:
        output_path = tmp_file.name
    # Write only after the handle is closed: reopening a still-open temporary
    # file by name is not portable across platforms.
    stitched_image.save(output_path, "PNG")

    logger.info(f"Final image saved to temporary path: {output_path}")
    progress(1, desc="Done!")

    # --- 5. Return the path for the Gradio output components ---
    # The first path is for the gr.Image preview, the second for the gr.File download.
    return output_path, output_path
142
+
143
+
144
# --- Gradio Interface Definition ---
# NOTE(review): the nesting below was reconstructed from context because the
# original indentation was not available; confirm the layout renders as intended.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # PDF Page Stitcher 📄 ➡️ 🖼️
        Upload a PDF file or provide a URL. This tool will convert every page of the PDF into an image
        and then append them beneath each other to create a single, tall image that you can download.
        """
    )

    with gr.Row():
        # Left column: the two alternative input methods plus the action button.
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.TabItem("Upload PDF"):
                    pdf_file_input = gr.File(label="Upload PDF File", file_types=[".pdf"])
                with gr.TabItem("From URL"):
                    pdf_url_input = gr.Textbox(
                        label="PDF URL",
                        placeholder="e.g., https://arxiv.org/pdf/1706.03762.pdf",
                    )

            submit_btn = gr.Button("Stitch PDF Pages", variant="primary")

        # Right column: preview of the stitched image and a download link for it.
        with gr.Column(scale=2):
            gr.Markdown("## Output")
            output_image_preview = gr.Image(
                label="Stitched Image Preview",
                type="filepath",
                interactive=False,
                height=600,  # Fixed height for the preview area
            )
            output_image_download = gr.File(
                label="Download Stitched Image",
                interactive=False,
            )

    # The button and the examples drive the same function with the same
    # components, so the input/output lists are named once and reused.
    stitch_inputs = [pdf_file_input, pdf_url_input]
    stitch_outputs = [output_image_preview, output_image_download]

    # Run process_pdf when the button is clicked.
    submit_btn.click(
        fn=process_pdf,
        inputs=stitch_inputs,
        outputs=stitch_outputs,
    )

    gr.Examples(
        examples=[
            [None, "https://arxiv.org/pdf/1706.03762.pdf"],  # "Attention is All You Need" paper
            [None, "https://bitcoin.org/bitcoin.pdf"],  # Bitcoin whitepaper
        ],
        inputs=stitch_inputs,
        outputs=stitch_outputs,
        fn=process_pdf,
        cache_examples=True,  # Cache results for faster demo
    )
197
+
198
+
199
# --- Main Execution ---
if __name__ == '__main__':
    # Probe for Poppler before serving; a failed probe only produces a
    # warning, and the app is launched regardless.
    poppler_available = check_poppler()
    if not poppler_available:
        logger.warning(
            "Poppler utilities could not be verified. The application may fail to process PDFs."
        )

    # Start the Gradio application.
    demo.launch()