ctranslate2-4you committed on
Commit
2431bf9
·
verified ·
1 Parent(s): 3829982

Create pymupdf_test_WIP.py

Browse files
Files changed (1) hide show
  1. pymupdf_test_WIP.py +428 -0
pymupdf_test_WIP.py ADDED
@@ -0,0 +1,428 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ── Standard library ────────────────────────────────────────────────────────
2
+ import os
3
+ import sys
4
+ import json
5
+ import math
6
+ import queue
7
+ import shutil
8
+ import logging
9
+ import tempfile
10
+ import threading
11
+ import subprocess
12
+ import multiprocessing
13
+ from pathlib import Path
14
+ from multiprocessing import Pool
15
+
16
+ # ── Third-party ─────────────────────────────────────────────────────────────
17
+ import fitz # PyMuPDF
18
+ import tkinter as tk
19
+ from tkinter import filedialog, messagebox
20
+ from joblib import cpu_count, Parallel, delayed
21
+
22
+ # ── Parser configuration ────────────────────────────────────────────────────
23
# Page count at or below which a PDF is handled serially inside one process;
# larger documents are fanned out over a multiprocessing pool.
PARALLEL_THRESHOLD = 16   # pages – switch to multiprocessing above this
# Maximum gap between ruling positions that are merged into one row/column.
LINE_TOLERANCE = 1        # pts for snapping nearly-identical rulings
# Grid cells with a smaller area than this are dropped by the table detector.
MIN_RECT_AREA = 1e4       # pts² – ignore tiny rectangles
26
+
27
+ # ── pdfplumber-style clustering helpers ─────────────────────────────────────
28
def cluster_list(xs, tol):
    """Group the values of `xs` into runs of neighbours at most `tol` apart.

    The values are sorted first; a value joins the current cluster when its
    distance to the cluster's last (largest) member is <= `tol`, otherwise it
    opens a new cluster.  Returns a list of lists in ascending order.
    """
    clusters = []
    for value in sorted(xs):
        if clusters and value - clusters[-1][-1] <= tol:
            clusters[-1].append(value)
        else:
            clusters.append([value])
    return clusters
42
+
43
def make_cluster_dict(vals, tol):
    """Return a dict mapping every distinct value in `vals` to its cluster id.

    Cluster ids are consecutive integers starting at 0, assigned in ascending
    order of the clusters produced by `cluster_list` with tolerance `tol`.
    """
    distinct = sorted(set(vals))
    return {
        value: cluster_id
        for cluster_id, members in enumerate(cluster_list(distinct, tol))
        for value in members
    }
51
+
52
+ # ── Utility funcs ───────────────────────────────────────────────────────────
53
def clean_cell_text(text):
    """Normalise the text of one table cell.

    Non-string input yields "".  For strings, a hyphen directly followed by a
    line break is removed (re-joining hyphenated words) and every run of
    whitespace, including remaining line breaks, collapses to a single space.
    """
    if isinstance(text, str):
        dehyphenated = text.replace("-\n", "")
        # str.split() with no argument treats "\n" like any other whitespace,
        # so the remaining line breaks become single spaces here.
        return " ".join(dehyphenated.split())
    return ""
58
+
59
def safe_join(row):
    """Return `row` as a list of cleaned strings.

    `None` cells become ""; every other cell is converted with `str()` and
    passed through `clean_cell_text`.
    """
    cleaned = []
    for cell in row:
        cleaned.append("" if cell is None else clean_cell_text(str(cell)))
    return cleaned
61
+
62
def clamp_bbox(bbox, page_rect):
    """Clip the 4-tuple `bbox` (x0, y0, x1, y1) to the bounds of `page_rect`.

    `page_rect` only needs `x0`, `y0`, `x1` and `y1` attributes.  Each
    coordinate is limited independently to the page's range on its axis and
    the result is returned as a plain tuple.
    """
    left, top, right, bottom = bbox

    def _clip(value, lo, hi):
        return max(lo, min(value, hi))

    return (
        _clip(left, page_rect.x0, page_rect.x1),
        _clip(top, page_rect.y0, page_rect.y1),
        _clip(right, page_rect.x0, page_rect.x1),
        _clip(bottom, page_rect.y0, page_rect.y1),
    )
69
+
70
+ # ── Improved table detection with snapping ─────────────────────────────────
71
def detect_table_bboxes(page: fitz.Page, tol=LINE_TOLERANCE):
    """
    Detect table cell rectangles from the ruling lines drawn on `page`.

    Steps:
      1. Collect stroked paths whose bounding box is thinner than 2 pt,
         either vertically (horizontal ruling) or horizontally (vertical
         ruling).
      2. Snap their positions into clusters using tolerance `tol`.
      3. Form a grid from the averaged row and column positions.
      4. Keep grid cells whose area is at least MIN_RECT_AREA.

    Returns a list[fitz.Rect]; the list is empty when the page has no
    horizontal or no vertical rulings.
    """
    horiz_ys, vert_xs = [], []
    for d in page.get_drawings():
        # PyMuPDF reports the path type as a string: "f" (fill), "s" (stroke)
        # or "fs" (both).  The previous comparison against the integer 1 was
        # always unequal, so every drawing was skipped and no table was ever
        # detected.
        if "s" not in str(d.get("type", "")):
            continue
        # The path's bounding box is stored under "rect"; there is no "bbox"
        # key in the dictionaries returned by get_drawings().
        bbox = d.get("rect")
        if bbox is None:
            continue
        x0, y0, x1, y1 = bbox
        if abs(y1 - y0) < 2:      # horizontal line
            horiz_ys.append((y0 + y1) / 2)
        elif abs(x1 - x0) < 2:    # vertical line
            vert_xs.append((x0 + x1) / 2)

    # A grid needs rulings in both directions.
    if not horiz_ys or not vert_xs:
        return []

    row_pos = _cluster_means(horiz_ys, tol)
    col_pos = _cluster_means(vert_xs, tol)

    # Every pair of adjacent rows x adjacent columns is a candidate cell.
    rects = []
    for r0, r1 in zip(row_pos, row_pos[1:]):
        for c0, c1 in zip(col_pos, col_pos[1:]):
            rect = fitz.Rect(c0, r0, c1, r1)
            if rect.get_area() >= MIN_RECT_AREA:
                rects.append(rect)

    # Remove duplicates / contained rects
    unique = []
    for rect in rects:
        if not any(u.contains(rect) or rect.contains(u) for u in unique):
            unique.append(rect)

    return unique


def _cluster_means(values, tol):
    """Return the sorted mean position of each tolerance cluster of `values`."""
    cluster_of = make_cluster_dict(values, tol)
    members = {}
    for v in values:
        members.setdefault(cluster_of[v], []).append(v)
    return sorted(sum(m) / len(m) for m in members.values())
124
+
125
+ # ── Table extraction (simple text grouping) ────────────────────────────────
126
def extract_table(page: fitz.Page, table_rect: fitz.Rect):
    """Turn the words inside `table_rect` into a list of row dicts.

    Words whose top-left corner (w[0], w[1]) lies inside the rectangle are
    sorted by y then x and grouped into text lines: a word starts a new line
    when its y differs by more than 5 from the y of the line's first word.
    The first line is used as the (single) header key and each later line
    becomes a one-entry dict {header: line_text}.  Returns [] when no word
    falls inside the rectangle.
    """
    inside = sorted(
        (
            w for w in page.get_text("words")
            if table_rect.x0 <= w[0] <= table_rect.x1
            and table_rect.y0 <= w[1] <= table_rect.y1
        ),
        key=lambda w: (w[1], w[0]),
    )
    if not inside:
        return []

    # Group words into lines; `anchor_y` is the y of the line's first word.
    lines = []
    anchor_y = None
    for word in inside:
        if anchor_y is None or abs(word[1] - anchor_y) > 5:
            lines.append([word])
            anchor_y = word[1]
        else:
            lines[-1].append(word)

    header_text, *body_texts = [" ".join(w[4] for w in line) for line in lines]
    headers = safe_join([header_text])
    rows = [safe_join([body_text]) for body_text in body_texts]
    return [dict(zip(headers, row)) for row in rows]
155
+
156
+ # ── Per-page worker ────────────────────────────────────────────────────────
157
def process_page(args):
    """Extract the text and detected tables of one PDF page.

    `args` is a `(page_number, pdf_path)` tuple with a zero-based page
    number.  The document is opened inside this function.  Returns
    `(page_number, text)`, where `text` holds a "Page N" heading, the words
    outside any detected table grouped into lines, and one JSON dump per
    non-empty table.  On any exception the text is an "[ERROR] ..." message
    instead.
    """
    page_number, pdf_path = args
    try:
        with fitz.open(pdf_path) as doc:
            page = doc.load_page(page_number)
            page_rect = page.rect

            # Detect tables and serialise the non-empty ones.
            table_rects = detect_table_bboxes(page)
            table_jsons = []
            for rect in table_rects:
                table = extract_table(page, rect)
                if table:
                    table_jsons.append(json.dumps(table, indent=1, ensure_ascii=False))

            # Words outside tables: a word counts as "inside" when its
            # top-left corner falls within a (page-clamped) table box.
            boxes = [clamp_bbox(rect, page_rect) for rect in table_rects]

            def _in_some_table(word):
                return any(
                    box[0] <= word[0] <= box[2] and box[1] <= word[1] <= box[3]
                    for box in boxes
                )

            outside = [w for w in page.get_text("words") if not _in_some_table(w)]
            outside.sort(key=lambda w: (w[1], w[0]))

            # Start a new text line when y moves more than 10 away from the
            # y of the line's first word.
            text_lines = []
            line_y = None
            for word in outside:
                if line_y is None or abs(word[1] - line_y) > 10:
                    text_lines.append([word[4]])
                    line_y = word[1]
                else:
                    text_lines[-1].append(word[4])
            body = "\n".join(" ".join(parts) for parts in text_lines).strip()

            pieces = [f"Page {page_number + 1}\n", body + "\n"]
            for idx, table_json in enumerate(table_jsons, 1):
                pieces.append(f'"table {idx}":\n{table_json}\n')
            return page_number, "".join(pieces)

    except fitz.FileDataError as e:
        return page_number, f"[ERROR] Page {page_number+1} ({pdf_path}): encrypted / unreadable – {e}"
    except Exception as e:
        return page_number, f"[ERROR] Page {page_number+1} ({pdf_path}): {e}"
202
+
203
+ # ── Document-level processing ───────────────────────────────────────────────
204
def process_pdf(pdf_path):
    """Parse one PDF and write the result next to it as `<name>.txt`.

    Documents with at most PARALLEL_THRESHOLD pages are processed serially,
    larger ones through `run_parallel`.  Page outputs are joined in page
    order and written as UTF-8.

    Returns a status string in every case: "[INFO] Processing complete: ..."
    on success, or an "[ERROR] ..." / "[INFO] ... interrupted ..." message.
    Callers print the return value, so success must not return None
    (previously that made them print "None" for every successful file).
    """
    try:
        if not os.path.exists(pdf_path):
            return f"[ERROR] File not found: {pdf_path}"

        print(f"[INFO] Starting processing: {pdf_path}")
        # Open once up front only to learn the page count; each page worker
        # re-opens the document itself.
        try:
            with fitz.open(pdf_path) as doc:
                num_pages = doc.page_count
        except fitz.FileDataError as e:
            return f"[ERROR] Cannot open PDF: {pdf_path} – {e}"
        except Exception as e:
            return f"[ERROR] General error opening PDF: {pdf_path} – {e}"

        pages = [(i, pdf_path) for i in range(num_pages)]
        results = run_serial(pages) if num_pages <= PARALLEL_THRESHOLD else run_parallel(pages)

        # Restore page order before concatenating.
        results.sort(key=lambda x: x[0])
        final_output = "\n".join(t for _, t in results)

        base = os.path.splitext(os.path.basename(pdf_path))[0]
        out_dir = os.path.dirname(pdf_path)
        out_path = os.path.join(out_dir, f"{base}.txt")
        with open(out_path, "w", encoding="utf-8", errors="ignore") as f:
            f.write(final_output)
        # Returned (not printed) so the caller's print() emits it exactly once.
        return f"[INFO] Processing complete: {out_path}"
    except (EOFError, BrokenPipeError, KeyboardInterrupt):
        return "[INFO] Processing interrupted by user."
    except Exception as e:
        return f"[ERROR] Unexpected error with '{pdf_path}': {e}"
234
+
235
def run_serial(pages):
    """Run `process_page` on each entry of `pages` in order, in this process.

    Returns the list of results in input order.
    """
    results = []
    for page_args in pages:
        results.append(process_page(page_args))
    return results
236
+
237
+ def run_parallel(pages):
238
+ cores = min(max(1, cpu_count() - 2), len(pages))
239
+ print(f"Starting parallel processing with {cores} cores…")
240
+ with Pool(cores) as pool:
241
+ return pool.map(process_page, pages)
242
+
243
+ # ── Batch CLI entrypoint ────────────────────────────────────────────────────
244
def process_pdfs_main():
    """Command-line entry point: parse every PDF named in `sys.argv[1:]`.

    Files are split by page count.  PDFs with at most PARALLEL_THRESHOLD
    pages are processed concurrently, one PDF per joblib job (phase 1);
    larger PDFs are then processed one after another (phase 2).  Each
    `process_pdf` return value is printed.
    """
    pdfs = sys.argv[1:]
    if not pdfs:
        print("No PDF files provided.")
        return

    # Sort the inputs into the two phases by page count.
    small, large = [], []
    for path in pdfs:
        if not os.path.exists(path):
            print(f"File not found: {path}")
            continue
        try:
            with fitz.open(path) as doc:
                bucket = small if doc.page_count <= PARALLEL_THRESHOLD else large
                bucket.append(path)
        except fitz.FileDataError:
            print(f"[ERROR] Password-protected PDF skipped: {path}")
        except Exception as e:
            print(f"[ERROR] Error opening {path}: {e}")

    if small:
        cores = min(max(1, cpu_count() - 2), len(small))
        print(f"\n[Phase 1] Parallel processing of {len(small)} small PDFs with {cores} cores …")
        jobs = (delayed(process_pdf)(path) for path in small)
        for result in Parallel(n_jobs=cores)(jobs):
            print(result)

    for path in large:
        print(f"\n[Phase 2] Processing large PDF: {os.path.basename(path)}")
        print(process_pdf(path))
272
+
273
+ # ── Tkinter GUI ─────────────────────────────────────────────────────────────
274
class FileManager:
    """Tkinter front end for selecting PDFs and running the parser on them.

    The parser runs in a child process (this same script invoked with the
    selected paths as arguments).  A background thread relays the child's
    output into the progress pane; the upper text pane shows the `.txt`
    result that belongs to the selected PDF, if one exists.
    """

    def __init__(self, master):
        """Build all widgets inside `master` (the Tk root window)."""
        self.master = master
        # The title previously contained a mis-encoded em dash.
        master.title("Parser-Sevenof9 — PyMuPDF")

        self.files, self.last_selected = [], None
        self.parser_proc = None   # subprocess.Popen of the running parser, if any
        self._worker = None       # thread relaying the parser's output, if any

        tk.Label(master, text="Selected PDF files:").pack(pady=5)

        # ── File list with scrollbar ─────────────────────────────────────
        list_frame = tk.Frame(master)
        list_frame.pack(pady=5)
        sb_list = tk.Scrollbar(list_frame)
        self.listbox = tk.Listbox(
            list_frame, selectmode=tk.MULTIPLE, width=80, height=6,
            yscrollcommand=sb_list.set,
        )
        sb_list.config(command=self.listbox.yview)
        self.listbox.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_list.pack(side=tk.RIGHT, fill=tk.Y)
        self.listbox.bind("<<ListboxSelect>>", self.show_text)
        self.listbox.bind("<Button-1>", self.on_click)
        self.listbox.bind("<Shift-Button-1>", self.on_shift_click)

        # ── Right-click context menu ─────────────────────────────────────
        self.ctx = tk.Menu(master, tearoff=0)
        self.ctx.add_command(label="Remove selected", command=self.remove_file)
        self.listbox.bind("<Button-3>", self._show_context_menu)

        # ── Buttons ──────────────────────────────────────────────────────
        btn_frame = tk.Frame(master)
        btn_frame.pack(pady=10)
        tk.Button(btn_frame, text="Add Folder", command=self.add_folder).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Select Files", command=self.add_file).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Remove Selected", command=self.remove_file).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Remove All", command=self.remove_all).pack(side=tk.LEFT, padx=5)
        tk.Button(master, text="Stop", command=self.stop_parser).pack(pady=5)
        tk.Button(master, text="Start Parser", command=self.start_parser).pack(pady=10)

        # ── Result text pane ─────────────────────────────────────────────
        tx_frame = tk.Frame(master)
        tx_frame.pack(padx=10, pady=5)
        sb_text = tk.Scrollbar(tx_frame)
        self.text = tk.Text(tx_frame, height=15, width=100, wrap=tk.WORD, yscrollcommand=sb_text.set)
        sb_text.config(command=self.text.yview)
        self.text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_text.pack(side=tk.RIGHT, fill=tk.Y)

        # ── Progress pane (read-only) ────────────────────────────────────
        tk.Label(master, text="Progress:").pack()
        prog_frame = tk.Frame(master)
        prog_frame.pack(padx=10, pady=5)
        sb_prog = tk.Scrollbar(prog_frame)
        self.prog = tk.Text(prog_frame, height=8, width=100, state=tk.DISABLED, yscrollcommand=sb_prog.set)
        sb_prog.config(command=self.prog.yview)
        self.prog.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_prog.pack(side=tk.RIGHT, fill=tk.Y)

    # ── Listbox helpers ───────────────────────────────────────────────────
    def _show_context_menu(self, e):
        """Pop up the context menu at the pointer, but only with a selection."""
        if self.listbox.curselection():
            self.ctx.tk_popup(e.x_root, e.y_root)

    def on_click(self, e):
        """Select only the clicked entry and show its text file."""
        idx = self.listbox.nearest(e.y)
        if idx < 0:  # empty listbox: nothing to select
            return "break"
        self.listbox.selection_clear(0, tk.END)
        self.listbox.selection_set(idx)
        self.last_selected = idx
        self.show_text(None)
        return "break"  # suppress the default MULTIPLE-mode toggle

    def on_shift_click(self, e):
        """Select the contiguous range between the last click and this one."""
        idx = self.listbox.nearest(e.y)
        if idx < 0:  # empty listbox: nothing to select
            return "break"
        if self.last_selected is None:
            self.last_selected = idx
        lo, hi = sorted((self.last_selected, idx))
        self.listbox.selection_clear(0, tk.END)
        for i in range(lo, hi + 1):
            self.listbox.selection_set(i)
        return "break"

    # ── File ops ─────────────────────────────────────────────────────────
    def _add_path(self, p):
        """Append `p` to the file list and the listbox unless already present."""
        if p not in self.files:
            self.files.append(p)
            self.listbox.insert(tk.END, p)

    def add_folder(self):
        """Ask for a folder and add every `.pdf` found below it (recursive)."""
        folder = filedialog.askdirectory(title="Select Folder")
        if not folder:
            return
        for root, _, fs in os.walk(folder):
            for f in fs:
                if f.lower().endswith(".pdf"):
                    self._add_path(os.path.join(root, f))

    def add_file(self):
        """Ask for one or more PDF files and add them to the list."""
        for p in filedialog.askopenfilenames(title="Select PDF Files", filetypes=[("PDF Files", "*.pdf")]):
            self._add_path(p)

    def remove_file(self):
        """Remove the selected entries from the listbox and the file list."""
        sel = self.listbox.curselection()
        if not sel:
            messagebox.showwarning("Notice", "Please select an entry to remove.")
            return
        # Delete from the end so the remaining indices stay valid.
        for idx in reversed(sel):
            self.listbox.delete(idx)
            del self.files[idx]
        self.last_selected = None  # old index no longer refers to the same row
        self.text.delete(1.0, tk.END)

    def remove_all(self):
        """Clear the listbox, the file list and the text pane."""
        self.listbox.delete(0, tk.END)
        self.files.clear()
        self.last_selected = None
        self.text.delete(1.0, tk.END)

    # ── Parser control ───────────────────────────────────────────────────
    def start_parser(self):
        """Start the parser for the listed files in a background thread."""
        if not self.files:
            messagebox.showinfo("No Files", "Please select at least one file.")
            return
        # A second run would overwrite self.parser_proc, leaving the first
        # child process unreachable for the Stop button.
        if self._worker is not None and self._worker.is_alive():
            messagebox.showinfo("Parser Running", "The parser is already running.")
            return
        self.prog.config(state=tk.NORMAL)
        self.prog.delete(1.0, tk.END)
        self.prog.insert(tk.END, "Starting parser…\n")
        self.prog.config(state=tk.DISABLED)
        self._worker = threading.Thread(target=self.run_parser)
        self._worker.start()

    def stop_parser(self):
        """Terminate the running parser process, if there is one."""
        # Snapshot the attribute: the worker thread resets it to None when the
        # parser ends, which could otherwise happen between check and use.
        proc = self.parser_proc
        if proc is not None and proc.poll() is None:
            proc.terminate()
            self.append_prog("Parser process was stopped.\n")
        else:
            self.append_prog("No active parser process to stop.\n")

    def run_parser(self):
        """Run the parser subprocess and relay its output (worker thread)."""
        try:
            proc = subprocess.Popen(
                [sys.executable, __file__, *self.files],
                stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                text=True, encoding="utf-8", errors="ignore", bufsize=4096,
            )
            self.parser_proc = proc
            # The context manager closes the pipe and waits for the child,
            # also when relaying a line raises.
            with proc:
                for line in proc.stdout:
                    self.append_prog(line)
            if proc.returncode == 0:
                self.append_prog("\nParser finished successfully.\n")
                self.shell_msg("Parser Done", "The parser was executed successfully.")
            else:
                self.append_prog("\nError while running the parser.\n")
                self.shell_msg("Error", "Error while running the parser.")
        except Exception as e:
            self.append_prog(f"Error: {e}\n")
            self.shell_msg("Error", f"Execution error:\n{e}")
        finally:
            self.parser_proc = None

    # ── GUI helpers ──────────────────────────────────────────────────────
    def append_prog(self, txt):
        """Schedule `txt` to be appended to the progress pane."""
        self.prog.after(0, lambda: self._ins(txt))

    def _ins(self, txt):
        """Append `txt` to the read-only progress pane and scroll to the end."""
        self.prog.config(state=tk.NORMAL)
        self.prog.insert(tk.END, txt)
        self.prog.see(tk.END)
        self.prog.config(state=tk.DISABLED)

    def shell_msg(self, title, msg):
        """Schedule an info dialog with `title` and `msg`."""
        self.master.after(0, lambda: messagebox.showinfo(title, msg))

    def show_text(self, _):
        """Show the `.txt` file belonging to the first selected PDF."""
        sel = self.listbox.curselection()
        if not sel:
            return
        path = self.files[sel[0]]
        txt = os.path.splitext(path)[0] + ".txt"
        self.text.delete(1.0, tk.END)
        if os.path.exists(txt):
            try:
                with open(txt, "r", encoding="utf-8", errors="ignore") as f:
                    self.text.insert(tk.END, f.read())
            except Exception as e:
                self.text.insert(tk.END, f"Error loading text file:\n{e}")
        else:
            self.text.insert(tk.END, "[No corresponding .txt file found]")
421
+
422
+ # ── Main guard ─────────────────────────────────────────────────────────────
423
if __name__ == "__main__":
    multiprocessing.freeze_support()
    # Without arguments the script opens the GUI; with arguments it acts as
    # the batch parser (this is how the GUI launches it as a subprocess).
    if len(sys.argv) <= 1:
        root = tk.Tk()
        FileManager(root)
        root.mainloop()
    else:
        process_pdfs_main()