broadfield-dev committed on
Commit
40f4278
·
verified ·
1 Parent(s): bb0bd3e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -74
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
  import tempfile
3
  import logging
4
- import subprocess
5
  from typing import List
6
 
7
  import gradio as gr
@@ -10,55 +9,18 @@ from PIL import Image
10
  from pdf2image import convert_from_path, convert_from_bytes
11
  from pdf2image.exceptions import PDFInfoNotInstalledError, PDFPageCountError
12
 
13
- # --- Logging Configuration ---
14
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
15
  logger = logging.getLogger(__name__)
16
 
17
-
18
- def check_poppler():
19
- """
20
- Checks if the Poppler PDF rendering utility is installed and accessible.
21
- """
22
- try:
23
- # Run a simple poppler command to check for its existence and version
24
- result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True, check=False)
25
- if result.returncode == 0 or "pdftoppm version" in result.stderr:
26
- logger.info("Poppler check successful.")
27
- return True
28
- else:
29
- logger.error(f"Poppler check failed. stderr: {result.stderr.strip()}")
30
- return False
31
- except FileNotFoundError:
32
- logger.error("Poppler (pdftoppm command) not found. Ensure poppler-utils is installed and in the system's PATH.")
33
- return False
34
- except Exception as e:
35
- logger.error(f"An unexpected error occurred during Poppler check: {e}")
36
- return False
37
-
38
-
39
  def stitch_images_vertically(images: List[Image.Image]) -> Image.Image:
40
- """
41
- Stitches a list of PIL Images together vertically.
42
-
43
- Args:
44
- images: A list of PIL Image objects.
45
-
46
- Returns:
47
- A single PIL Image object containing all input images stitched together.
48
- """
49
  if not images:
50
  return None
51
 
52
- # Find the maximum width among all images to use as the canvas width
53
  max_width = max(img.width for img in images)
54
-
55
- # Calculate the total height by summing the height of all images
56
  total_height = sum(img.height for img in images)
57
 
58
- # Create a new blank image (canvas) with a white background
59
  stitched_image = Image.new('RGB', (max_width, total_height), (255, 255, 255))
60
 
61
- # Paste each image onto the canvas, one below the other
62
  current_y = 0
63
  for img in images:
64
  stitched_image.paste(img, (0, current_y))
@@ -66,28 +28,20 @@ def stitch_images_vertically(images: List[Image.Image]) -> Image.Image:
66
 
67
  return stitched_image
68
 
69
-
70
  def process_pdf(pdf_file, pdf_url, progress=gr.Progress()):
71
- """
72
- The main processing function for the Gradio interface.
73
- It takes a PDF (either as an uploaded file or a URL), converts all its
74
- pages to images, and stitches them into a single tall image.
75
- """
76
  pdf_input_source = None
77
  is_bytes = False
78
- source_name = "document" # Default name for output file
79
 
80
- # --- 1. Determine Input Source ---
81
  progress(0, desc="Validating input...")
82
  if pdf_file is not None:
83
  logger.info(f"Processing uploaded file: {pdf_file.name}")
84
- pdf_input_source = pdf_file.name # .name provides the temp path in Gradio
85
  source_name = os.path.splitext(os.path.basename(pdf_file.name))[0]
86
- is_bytes = False
87
  elif pdf_url and pdf_url.strip():
88
  url = pdf_url.strip()
89
  logger.info(f"Processing file from URL: {url}")
90
- progress(0.1, desc=f"Downloading PDF from URL...")
91
  try:
92
  response = requests.get(url, timeout=45)
93
  response.raise_for_status()
@@ -99,7 +53,6 @@ def process_pdf(pdf_file, pdf_url, progress=gr.Progress()):
99
  else:
100
  raise gr.Error("Please upload a PDF file or provide a valid URL.")
101
 
102
- # --- 2. Convert PDF to a List of Images ---
103
  progress(0.3, desc="Converting PDF pages to images...")
104
  try:
105
  if is_bytes:
@@ -107,7 +60,7 @@ def process_pdf(pdf_file, pdf_url, progress=gr.Progress()):
107
  else:
108
  images = convert_from_path(pdf_input_source, dpi=200)
109
  except (PDFInfoNotInstalledError, FileNotFoundError):
110
- raise gr.Error("Poppler not found. Please ensure poppler-utils is installed and in your system's PATH.")
111
  except (PDFPageCountError, Exception) as e:
112
  raise gr.Error(f"Failed to process the PDF. It might be corrupted or password-protected. Error: {e}")
113
 
@@ -116,7 +69,6 @@ def process_pdf(pdf_file, pdf_url, progress=gr.Progress()):
116
 
117
  logger.info(f"Successfully converted {len(images)} pages to images.")
118
 
119
- # --- 3. Stitch the Images Together ---
120
  progress(0.7, desc=f"Stitching {len(images)} images together...")
121
 
122
  stitched_image = stitch_images_vertically(images)
@@ -125,10 +77,8 @@ def process_pdf(pdf_file, pdf_url, progress=gr.Progress()):
125
 
126
  logger.info("Image stitching complete.")
127
 
128
- # --- 4. Save the Final Image to a Temporary File ---
129
  progress(0.9, desc="Saving final image...")
130
 
131
- # Use a named temporary file that Gradio can serve
132
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png", prefix=f"{source_name}_stitched_") as tmp_file:
133
  stitched_image.save(tmp_file.name, "PNG")
134
  output_path = tmp_file.name
@@ -136,12 +86,8 @@ def process_pdf(pdf_file, pdf_url, progress=gr.Progress()):
136
  logger.info(f"Final image saved to temporary path: {output_path}")
137
  progress(1, desc="Done!")
138
 
139
- # --- 5. Return the path for the Gradio output components ---
140
- # The first path is for the gr.Image preview, the second for the gr.File download.
141
  return output_path, output_path
142
 
143
-
144
- # --- Gradio Interface Definition ---
145
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
146
  gr.Markdown(
147
  """
@@ -170,14 +116,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
170
  label="Stitched Image Preview",
171
  type="filepath",
172
  interactive=False,
173
- height=600, # Set a fixed height for the preview area
174
  )
175
  output_image_download = gr.File(
176
  label="Download Stitched Image",
177
  interactive=False
178
  )
179
 
180
- # Connect the button click event to the processing function
181
  submit_btn.click(
182
  fn=process_pdf,
183
  inputs=[pdf_file_input, pdf_url_input],
@@ -186,23 +131,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
186
 
187
  gr.Examples(
188
  examples=[
189
- [None, "https://arxiv.org/pdf/1706.03762.pdf"], # "Attention is All You Need" paper
190
- [None, "https://bitcoin.org/bitcoin.pdf"], # Bitcoin whitepaper
191
  ],
192
  inputs=[pdf_file_input, pdf_url_input],
193
  outputs=[output_image_preview, output_image_download],
194
  fn=process_pdf,
195
- cache_examples=False # Cache results for faster demo
196
  )
197
 
198
-
199
- # --- Main Execution ---
200
- if __name__ == '__main__':
201
- # Perform a check for Poppler when the script starts
202
- if not check_poppler():
203
- logger.warning(
204
- "Poppler utilities could not be verified. The application may fail to process PDFs."
205
- )
206
-
207
- # Launch the Gradio application
208
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  import os
2
  import tempfile
3
  import logging
 
4
  from typing import List
5
 
6
  import gradio as gr
 
9
  from pdf2image import convert_from_path, convert_from_bytes
10
  from pdf2image.exceptions import PDFInfoNotInstalledError, PDFPageCountError
11
 
 
12
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
13
  logger = logging.getLogger(__name__)
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def stitch_images_vertically(images: List[Image.Image]) -> Image.Image:
 
 
 
 
 
 
 
 
 
16
  if not images:
17
  return None
18
 
 
19
  max_width = max(img.width for img in images)
 
 
20
  total_height = sum(img.height for img in images)
21
 
 
22
  stitched_image = Image.new('RGB', (max_width, total_height), (255, 255, 255))
23
 
 
24
  current_y = 0
25
  for img in images:
26
  stitched_image.paste(img, (0, current_y))
 
28
 
29
  return stitched_image
30
 
 
31
  def process_pdf(pdf_file, pdf_url, progress=gr.Progress()):
 
 
 
 
 
32
  pdf_input_source = None
33
  is_bytes = False
34
+ source_name = "document"
35
 
 
36
  progress(0, desc="Validating input...")
37
  if pdf_file is not None:
38
  logger.info(f"Processing uploaded file: {pdf_file.name}")
39
+ pdf_input_source = pdf_file.name
40
  source_name = os.path.splitext(os.path.basename(pdf_file.name))[0]
 
41
  elif pdf_url and pdf_url.strip():
42
  url = pdf_url.strip()
43
  logger.info(f"Processing file from URL: {url}")
44
+ progress(0.1, desc="Downloading PDF from URL...")
45
  try:
46
  response = requests.get(url, timeout=45)
47
  response.raise_for_status()
 
53
  else:
54
  raise gr.Error("Please upload a PDF file or provide a valid URL.")
55
 
 
56
  progress(0.3, desc="Converting PDF pages to images...")
57
  try:
58
  if is_bytes:
 
60
  else:
61
  images = convert_from_path(pdf_input_source, dpi=200)
62
  except (PDFInfoNotInstalledError, FileNotFoundError):
63
+ raise gr.Error("Server configuration error: Poppler dependency is missing.")
64
  except (PDFPageCountError, Exception) as e:
65
  raise gr.Error(f"Failed to process the PDF. It might be corrupted or password-protected. Error: {e}")
66
 
 
69
 
70
  logger.info(f"Successfully converted {len(images)} pages to images.")
71
 
 
72
  progress(0.7, desc=f"Stitching {len(images)} images together...")
73
 
74
  stitched_image = stitch_images_vertically(images)
 
77
 
78
  logger.info("Image stitching complete.")
79
 
 
80
  progress(0.9, desc="Saving final image...")
81
 
 
82
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png", prefix=f"{source_name}_stitched_") as tmp_file:
83
  stitched_image.save(tmp_file.name, "PNG")
84
  output_path = tmp_file.name
 
86
  logger.info(f"Final image saved to temporary path: {output_path}")
87
  progress(1, desc="Done!")
88
 
 
 
89
  return output_path, output_path
90
 
 
 
91
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
92
  gr.Markdown(
93
  """
 
116
  label="Stitched Image Preview",
117
  type="filepath",
118
  interactive=False,
119
+ height=600,
120
  )
121
  output_image_download = gr.File(
122
  label="Download Stitched Image",
123
  interactive=False
124
  )
125
 
 
126
  submit_btn.click(
127
  fn=process_pdf,
128
  inputs=[pdf_file_input, pdf_url_input],
 
131
 
132
  gr.Examples(
133
  examples=[
134
+ [None, "https://arxiv.org/pdf/1706.03762.pdf"],
135
+ [None, "https://bitcoin.org/bitcoin.pdf"],
136
  ],
137
  inputs=[pdf_file_input, pdf_url_input],
138
  outputs=[output_image_preview, output_image_download],
139
  fn=process_pdf,
140
+ cache_examples=True
141
  )
142
 
143
+ demo.launch()