# ── Standard library ────────────────────────────────────────────────────────
import os
import sys
import json
import math
import queue
import shutil
import logging
import tempfile
import threading
import subprocess
import multiprocessing
from pathlib import Path
from multiprocessing import Pool

# ── Third-party ─────────────────────────────────────────────────────────────
import fitz  # PyMuPDF
import tkinter as tk
from tkinter import filedialog, messagebox
from joblib import cpu_count, Parallel, delayed

# ── Parser configuration ────────────────────────────────────────────────────
PARALLEL_THRESHOLD = 16  # pages – switch to multiprocessing above this
LINE_TOLERANCE = 1  # pts for snapping nearly-identical rulings
MIN_RECT_AREA = 1e4  # pts² – ignore tiny rectangles


# ── pdfplumber-style clustering helpers ─────────────────────────────────────
def cluster_list(xs, tol):
    """Group the values of `xs` into runs of neighbours at most `tol` apart.

    The values are sorted first; a value joins the current cluster when its
    distance to the cluster's last (largest) member is <= `tol`, otherwise it
    opens a new cluster. Returns a list of lists in ascending order.
    """
    clusters = []
    for value in sorted(xs):
        if clusters and value - clusters[-1][-1] <= tol:
            clusters[-1].append(value)
        else:
            clusters.append([value])
    return clusters


def make_cluster_dict(vals, tol):
    """Return a dict mapping every distinct value to its cluster id (0, 1, 2, …).

    Cluster ids follow the ascending order of the clusters produced by
    `cluster_list` with the same tolerance.
    """
    distinct = sorted(set(vals))
    return {
        value: cluster_id
        for cluster_id, members in enumerate(cluster_list(distinct, tol))
        for value in members
    }


# ── Utility funcs ───────────────────────────────────────────────────────────
def clean_cell_text(text):
    """Normalise cell text: non-strings become "", whitespace is collapsed.

    A hyphen directly followed by a newline is removed (joining the two
    halves), remaining newlines become spaces, and runs of whitespace are
    squeezed to single spaces.
    """
    if isinstance(text, str):
        unwrapped = text.replace("-\n", "").replace("\n", " ")
        return " ".join(unwrapped.split())
    return ""


def safe_join(row):
    """Return the cells of `row` as cleaned strings; `None` cells become ""."""
    cleaned = []
    for cell in row:
        cleaned.append("" if cell is None else clean_cell_text(str(cell)))
    return cleaned


def clamp_bbox(bbox, page_rect):
    """Clamp each coordinate of `bbox` into `page_rect`; return a 4-tuple.

    `bbox` is unpacked as (x0, y0, x1, y1); `page_rect` supplies the limits
    through its `x0`, `y0`, `x1`, `y1` attributes.
    """
    def _clip(value, low, high):
        return max(low, min(value, high))

    left, top, right, bottom = bbox
    return (
        _clip(left, page_rect.x0, page_rect.x1),
        _clip(top, page_rect.y0, page_rect.y1),
        _clip(right, page_rect.x0, page_rect.x1),
        _clip(bottom, page_rect.y0, page_rect.y1),
    )


# ── Improved table
# detection with snapping ─────────────────────────────────
def detect_table_bboxes(page: fitz.Page, tol=LINE_TOLERANCE):
    """
    Detect table rectangles by:
    1. Collecting very thin horizontal & vertical strokes
    2. Snapping their positions with tolerance `tol`
    3. Forming a grid from unique row & column positions
    4. Returning a list[fitz.Rect] for each cell rectangle

    Only cells whose area is at least MIN_RECT_AREA are kept. An empty list
    is returned when the page has no horizontal or no vertical rulings.
    """
    horiz_raw, vert_raw = [], []
    for d in page.get_drawings():
        # PyMuPDF reports the path type as a string ("f", "s" or "fs"), so the
        # former comparison with the integer 1 rejected every drawing.
        if "s" not in str(d.get("type", "")):  # stroked paths only
            continue
        # The bounding rectangle of a path is stored under "rect".
        path_rect = d.get("rect")
        if path_rect is None:
            continue
        x0, y0, x1, y1 = path_rect
        if abs(y1 - y0) < 2:  # horizontal line
            y_mid = (y0 + y1) / 2
            horiz_raw.append((y_mid, x0, x1))
        elif abs(x1 - x0) < 2:  # vertical line
            x_mid = (x0 + x1) / 2
            vert_raw.append((x_mid, y0, y1))

    if not horiz_raw or not vert_raw:
        return []

    row_map = make_cluster_dict([y for y, _, _ in horiz_raw], tol)
    col_map = make_cluster_dict([x for x, _, _ in vert_raw], tol)

    # Average positions per cluster id
    rows = {}
    for y, _, _ in horiz_raw:
        rows.setdefault(row_map[y], []).append(y)
    cols = {}
    for x, _, _ in vert_raw:
        cols.setdefault(col_map[x], []).append(x)

    row_pos = sorted(sum(v) / len(v) for v in rows.values())
    col_pos = sorted(sum(v) / len(v) for v in cols.values())

    # Every pair of neighbouring rows x neighbouring columns is a grid cell.
    rects = []
    for r0, r1 in zip(row_pos, row_pos[1:]):
        for c0, c1 in zip(col_pos, col_pos[1:]):
            rect = fitz.Rect(c0, r0, c1, r1)
            if rect.get_area() >= MIN_RECT_AREA:
                rects.append(rect)

    # Remove duplicates / contained rects
    unique = []
    for rect in rects:
        if not any(u.contains(rect) or rect.contains(u) for u in unique):
            unique.append(rect)
    return unique


# ── Table extraction (simple text grouping) ────────────────────────────────
def extract_table(page: fitz.Page, table_rect: fitz.Rect):
    """Group words inside `table_rect` into JSON rows [dict].

    Words (tuples from `page.get_text("words")`: x0, y0, x1, y1, text, …) whose
    top-left corner lies inside `table_rect` are grouped into text lines. The
    first line is used as the single header key and every following line
    becomes one `{header: line_text}` dict. Returns [] when no word matches.
    """
    words = [
        w for w in page.get_text("words")
        if table_rect.x0 <= w[0] <= table_rect.x1
        and table_rect.y0 <= w[1] <= table_rect.y1
    ]
    words.sort(key=lambda w: (w[1], w[0]))  # sort by y then x

    # cluster words by line: a jump of more than 5 pt in y starts a new line
    lines, cury, cur = [], None, []
    for w in words:
        if cury is None or abs(w[1] - cury) > 5:
            if cur:
                lines.append(cur)
            cur = [w]
            cury = w[1]
        else:
            cur.append(w)
    if cur:
        lines.append(cur)

    if not lines:
        return []

    line_texts = [" ".join(w[4] for w in ln) for ln in lines]
    headers = safe_join([line_texts[0]])
    rows = [safe_join([lt]) for lt in line_texts[1:]]
    return [dict(zip(headers, r)) for r in rows]


# ── Per-page worker ────────────────────────────────────────────────────────
def process_page(args):
    """Render one page to text; return `(page_number, text)`.

    `args` is a `(page_number, pdf_path)` tuple (0-based page number) so the
    function can be mapped over a process pool. The document is opened inside
    the worker. Any failure is reported as an "[ERROR] …" text for that page
    instead of being raised.
    """
    page_number, pdf_path = args
    try:
        with fitz.open(pdf_path) as doc:
            page = doc.load_page(page_number)
            page_rect = page.rect
            output = f"Page {page_number + 1}\n"

            # Detect tables
            table_rects = detect_table_bboxes(page)
            table_jsons = []
            for rect in table_rects:
                tbl = extract_table(page, rect)
                if tbl:
                    table_jsons.append(json.dumps(tbl, indent=1, ensure_ascii=False))

            # Words outside tables
            tbl_boxes = [clamp_bbox(rect, page_rect) for rect in table_rects]
            words = page.get_text("words")
            outside = [
                w for w in words
                if not any(
                    b[0] <= w[0] <= b[2] and b[1] <= w[1] <= b[3]
                    for b in tbl_boxes
                )
            ]
            outside.sort(key=lambda w: (w[1], w[0]))

            # A jump of more than 10 pt in y starts a new output line.
            text_lines, cury, cur = [], None, []
            for w in outside:
                if cury is None or abs(w[1] - cury) > 10:
                    if cur:
                        text_lines.append(" ".join(cur))
                    cur, cury = [w[4]], w[1]
                else:
                    cur.append(w[4])
            if cur:
                text_lines.append(" ".join(cur))

            output += "\n".join(text_lines).strip() + "\n"
            for idx, tbl in enumerate(table_jsons, 1):
                output += f'"table {idx}":\n{tbl}\n'
            return page_number, output
    except fitz.FileDataError as e:
        return page_number, f"[ERROR] Page {page_number+1} ({pdf_path}): encrypted / unreadable – {e}"
    except Exception as e:
        return page_number, f"[ERROR] Page {page_number+1} ({pdf_path}): {e}"


# ── Document-level processing ───────────────────────────────────────────────
def process_pdf(pdf_path):
    """Convert one PDF to a `.txt` file next to it and return a status line.

    Documents with at most PARALLEL_THRESHOLD pages are processed serially,
    larger ones through `run_parallel`. The return value is always a string
    ("[INFO] …" on success or interruption, "[ERROR] …" on failure), so
    callers can print it directly; previously success returned `None`.
    """
    try:
        if not os.path.exists(pdf_path):
            return f"[ERROR] File not found: {pdf_path}"

        print(f"[INFO] Starting processing: {pdf_path}")
        try:
            with fitz.open(pdf_path) as doc:
                num_pages = doc.page_count
        except fitz.FileDataError as e:
            return f"[ERROR] Cannot open PDF: {pdf_path} – {e}"
        except Exception as e:
            return f"[ERROR] General error opening PDF: {pdf_path} – {e}"

        pages = [(i, pdf_path) for i in range(num_pages)]
        if num_pages <= PARALLEL_THRESHOLD:
            results = run_serial(pages)
        else:
            results = run_parallel(pages)
        results.sort(key=lambda x: x[0])
        final_output = "\n".join(t for _, t in results)

        base = os.path.splitext(os.path.basename(pdf_path))[0]
        out_dir = os.path.dirname(pdf_path)
        out_path = os.path.join(out_dir, f"{base}.txt")
        with open(out_path, "w", encoding="utf-8", errors="ignore") as f:
            f.write(final_output)

        # Returned (not printed) so that each caller reports it exactly once.
        return f"[INFO] Processing complete: {out_path}"
    except (EOFError, BrokenPipeError, KeyboardInterrupt):
        return "[INFO] Processing interrupted by user."
    except Exception as e:
        return f"[ERROR] Unexpected error with '{pdf_path}': {e}"


def run_serial(pages):
    """Process the `(page_number, pdf_path)` tuples one after another."""
    return [process_page(a) for a in pages]


def run_parallel(pages):
    """Process the page tuples in a process pool.

    Uses `cpu_count() - 2` workers (at least 1), never more than there are
    pages.
    """
    cores = min(max(1, cpu_count() - 2), len(pages))
    print(f"Starting parallel processing with {cores} cores…")
    with Pool(cores) as pool:
        return pool.map(process_page, pages)


# ── Batch CLI entrypoint ────────────────────────────────────────────────────
def process_pdfs_main():
    """Process every PDF path given on the command line.

    PDFs with at most PARALLEL_THRESHOLD pages are processed in parallel, one
    file per joblib worker (phase 1). Larger PDFs are then processed one at a
    time, each parallelised per page by `process_pdf` (phase 2). Missing,
    unreadable and password-protected files are reported and skipped.
    """
    pdfs = sys.argv[1:]
    if not pdfs:
        print("No PDF files provided.")
        return

    small, large = [], []
    for p in pdfs:
        if not os.path.exists(p):
            print(f"File not found: {p}")
            continue
        try:
            with fitz.open(p) as doc:
                # Encrypted documents open without raising; they are flagged
                # through `needs_pass` instead.
                if doc.needs_pass:
                    print(f"[ERROR] Password-protected PDF skipped: {p}")
                    continue
                (small if doc.page_count <= PARALLEL_THRESHOLD else large).append(p)
        except fitz.FileDataError:
            # FileDataError signals invalid document data, not encryption.
            print(f"[ERROR] Unreadable or damaged PDF skipped: {p}")
        except Exception as e:
            print(f"[ERROR] Error opening {p}: {e}")

    if small:
        cores = min(max(1, cpu_count() - 2), len(small))
        print(f"\n[Phase 1] Parallel processing of {len(small)} small PDFs with {cores} cores …")
        for r in Parallel(n_jobs=cores)(delayed(process_pdf)(p) for p in small):
            print(r)

    for p in large:
        print(f"\n[Phase 2] Processing large PDF: {os.path.basename(p)}")
        print(process_pdf(p))


# ── Tkinter GUI ─────────────────────────────────────────────────────────────
class FileManager:
    """Tk front end: pick PDFs, run the parser in a subprocess, show results."""

    def __init__(self, master):
        """Build all widgets inside `master` (the Tk root window)."""
        self.master = master
        master.title("Parser-Sevenof9 — PyMuPDF")
        # `files` mirrors the listbox rows; `last_selected` anchors shift-click.
        self.files, self.last_selected = [], None

        tk.Label(master, text="Selected PDF files:").pack(pady=5)

        list_frame = tk.Frame(master)
        list_frame.pack(pady=5)
        sb_list = tk.Scrollbar(list_frame)
        self.listbox = tk.Listbox(list_frame, selectmode=tk.MULTIPLE, width=80,
                                  height=6, yscrollcommand=sb_list.set)
        sb_list.config(command=self.listbox.yview)
        self.listbox.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_list.pack(side=tk.RIGHT, fill=tk.Y)

        # The event sequences had been stripped to "<>" / "", which Tk cannot
        # bind; restored to the events the handlers are written for.
        self.listbox.bind("<<ListboxSelect>>", self.show_text)
        self.listbox.bind("<Button-1>", self.on_click)
        self.listbox.bind("<Shift-Button-1>", self.on_shift_click)

        self.ctx = tk.Menu(master, tearoff=0)
        self.ctx.add_command(label="Remove selected", command=self.remove_file)
        # Right-click opens the context menu, but only with an active selection.
        self.listbox.bind(
            "<Button-3>",
            lambda e: self.ctx.tk_popup(e.x_root, e.y_root)
            if self.listbox.curselection() else None,
        )

        btn_frame = tk.Frame(master)
        btn_frame.pack(pady=10)
        tk.Button(btn_frame, text="Add Folder", command=self.add_folder).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Select Files", command=self.add_file).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Remove Selected", command=self.remove_file).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Remove All", command=self.remove_all).pack(side=tk.LEFT, padx=5)
        tk.Button(master, text="Stop", command=self.stop_parser).pack(pady=5)
        tk.Button(master, text="Start Parser", command=self.start_parser).pack(pady=10)

        tx_frame = tk.Frame(master)
        tx_frame.pack(padx=10, pady=5)
        sb_text = tk.Scrollbar(tx_frame)
        self.text = tk.Text(tx_frame, height=15, width=100, wrap=tk.WORD,
                            yscrollcommand=sb_text.set)
        sb_text.config(command=self.text.yview)
        self.text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_text.pack(side=tk.RIGHT, fill=tk.Y)

        tk.Label(master, text="Progress:").pack()
        prog_frame = tk.Frame(master)
        prog_frame.pack(padx=10, pady=5)
        sb_prog = tk.Scrollbar(prog_frame)
        self.prog = tk.Text(prog_frame, height=8, width=100, state=tk.DISABLED,
                            yscrollcommand=sb_prog.set)
        sb_prog.config(command=self.prog.yview)
        self.prog.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_prog.pack(side=tk.RIGHT, fill=tk.Y)

        self.parser_proc = None    # running subprocess.Popen, or None
        self.parser_thread = None  # thread executing run_parser, or None

    # ── Listbox helpers ───────────────────────────────────────────────────
    def on_click(self, e):
        """Select only the clicked row and show its text file."""
        idx = self.listbox.nearest(e.y)
        self.listbox.selection_clear(0, tk.END)
        self.listbox.selection_set(idx)
        self.last_selected = idx
        self.show_text(None)
        return "break"  # suppress the default MULTIPLE toggle behaviour

    def on_shift_click(self, e):
        """Select the range between the last plain click and this row."""
        idx = self.listbox.nearest(e.y)
        if self.last_selected is None:
            self.last_selected = idx
        lo, hi = sorted((self.last_selected, idx))
        self.listbox.selection_clear(0, tk.END)
        for i in range(lo, hi + 1):
            self.listbox.selection_set(i)
        return "break"

    # ── File ops ─────────────────────────────────────────────────────────
    def add_folder(self):
        """Ask for a folder and add every *.pdf below it (recursively)."""
        folder = filedialog.askdirectory(title="Select Folder")
        if not folder:
            return
        for root, _, fs in os.walk(folder):
            for f in fs:
                if f.lower().endswith(".pdf"):
                    p = os.path.join(root, f)
                    if p not in self.files:
                        self.files.append(p)
                        self.listbox.insert(tk.END, p)

    def add_file(self):
        """Ask for one or more PDF files and add those not yet listed."""
        for p in filedialog.askopenfilenames(title="Select PDF Files",
                                             filetypes=[("PDF Files", "*.pdf")]):
            if p not in self.files:
                self.files.append(p)
                self.listbox.insert(tk.END, p)

    def remove_file(self):
        """Remove the selected rows from the listbox and from `self.files`."""
        sel = self.listbox.curselection()
        if not sel:
            messagebox.showwarning("Notice", "Please select an entry to remove.")
            return
        # Delete from the bottom up so the remaining indices stay valid.
        for idx in reversed(sel):
            self.listbox.delete(idx)
            del self.files[idx]
        self.text.delete(1.0, tk.END)

    def remove_all(self):
        """Clear the file list and the text preview."""
        self.listbox.delete(0, tk.END)
        self.files.clear()
        self.text.delete(1.0, tk.END)

    # ── Parser control ───────────────────────────────────────────────────
    def start_parser(self):
        """Start the parser subprocess for all listed files in a worker thread."""
        if not self.files:
            messagebox.showinfo("No Files", "Please select at least one file.")
            return
        # A second start would spawn a second subprocess and lose the handle
        # to the first, so refuse while a run is still in progress.
        running = getattr(self, "parser_thread", None)
        if running is not None and running.is_alive():
            messagebox.showinfo("Parser Running", "The parser is already running.")
            return
        self.prog.config(state=tk.NORMAL)
        self.prog.delete(1.0, tk.END)
        self.prog.insert(tk.END, "Starting parser…\n")
        self.prog.config(state=tk.DISABLED)
        self.parser_thread = threading.Thread(target=self.run_parser)
        self.parser_thread.start()

    def stop_parser(self):
        """Terminate the parser subprocess if one is still running."""
        # Work on a local reference: the worker thread resets
        # `self.parser_proc` to None when it finishes.
        proc = self.parser_proc
        if proc is not None and proc.poll() is None:
            proc.terminate()
            self.append_prog("Parser process was stopped.\n")
        else:
            self.append_prog("No active parser process to stop.\n")

    def run_parser(self):
        """Run this script as a subprocess on `self.files` and stream its output.

        Runs in the worker thread started by `start_parser`. GUI updates go
        through `append_prog` / `shell_msg`, which schedule them with `after`.
        """
        try:
            proc = subprocess.Popen(
                [sys.executable, __file__] + self.files,
                stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                text=True, encoding="utf-8", errors="ignore", bufsize=4096
            )
            self.parser_proc = proc
            for line in proc.stdout:
                self.append_prog(line)
            proc.stdout.close()
            proc.wait()
            if proc.returncode == 0:
                self.append_prog("\nParser finished successfully.\n")
                self.shell_msg("Parser Done", "The parser was executed successfully.")
            else:
                self.append_prog("\nError while running the parser.\n")
                self.shell_msg("Error", "Error while running the parser.")
        except Exception as e:
            self.append_prog(f"Error: {e}\n")
            self.shell_msg("Error", f"Execution error:\n{e}")
        finally:
            self.parser_proc = None

    # ── GUI helpers ──────────────────────────────────────────────────────
    def append_prog(self, txt):
        """Schedule `txt` to be appended to the progress box."""
        self.prog.after(0, lambda: self._ins(txt))

    def _ins(self, txt):
        """Append `txt` to the (normally read-only) progress box and scroll down."""
        self.prog.config(state=tk.NORMAL)
        self.prog.insert(tk.END, txt)
        self.prog.see(tk.END)
        self.prog.config(state=tk.DISABLED)

    def shell_msg(self, title, msg):
        """Schedule an info message box with `title` and `msg`."""
        self.master.after(0, lambda: messagebox.showinfo(title, msg))

    def show_text(self, _):
        """Show the `.txt` file belonging to the first selected PDF, if any."""
        sel = self.listbox.curselection()
        if not sel:
            return
        path = self.files[sel[0]]
        txt = os.path.splitext(path)[0] + ".txt"
        self.text.delete(1.0, tk.END)
        if os.path.exists(txt):
            try:
                with open(txt, "r", encoding="utf-8", errors="ignore") as f:
                    self.text.insert(tk.END, f.read())
            except Exception as e:
                self.text.insert(tk.END, f"Error loading text file:\n{e}")
        else:
            self.text.insert(tk.END, "[No corresponding .txt file found]")


# ── Main guard ─────────────────────────────────────────────────────────────
if __name__ == "__main__":
    multiprocessing.freeze_support()
    # With file arguments the script acts as the batch parser (this is how the
    # GUI invokes it); without arguments it opens the GUI.
    if len(sys.argv) > 1:
        process_pdfs_main()
    else:
        root = tk.Tk()
        FileManager(root)
        root.mainloop()