# kalle07/pdf2txt_parser_converter

+# ── Standard library ────────────────────────────────────────────────────────
+import os
+import sys
+import json
+import math
+import queue
+import shutil
+import logging
+import tempfile
+import threading
+import subprocess
+import multiprocessing
+from pathlib import Path
+from multiprocessing import Pool
+# ── Third-party ─────────────────────────────────────────────────────────────
+import fitz  # PyMuPDF
+import tkinter as tk
+from tkinter import filedialog, messagebox
+from joblib import cpu_count, Parallel, delayed
# ── Parser configuration ────────────────────────────────────────────────────
PARALLEL_THRESHOLD = 16   # pages – documents with more pages than this use a process pool
LINE_TOLERANCE = 1        # pts – ruling positions this close are snapped together
MIN_RECT_AREA = 1e4       # pts² – grid rectangles with a smaller area are ignored
# ── pdfplumber-style clustering helpers ─────────────────────────────────────
def cluster_list(xs, tol):
    """Group numbers into runs whose neighbouring values are at most `tol` apart.

    The values are sorted first; a new cluster starts whenever the gap to the
    previous value exceeds `tol`. Clusters are chained, so the first and last
    member of one cluster may be further apart than `tol`.

    Args:
        xs: Iterable of numbers (duplicates are kept).
        tol: Largest allowed gap between consecutive sorted values.

    Returns:
        A list of clusters, each a non-empty ascending list; `[]` for empty input.
    """
    ordered = sorted(xs)
    if not ordered:
        return []
    clusters = [[ordered[0]]]
    for value in ordered[1:]:
        if value - clusters[-1][-1] <= tol:
            clusters[-1].append(value)
        else:
            clusters.append([value])
    return clusters
def make_cluster_dict(vals, tol):
    """Map every distinct value in `vals` to the id of its tolerance cluster.

    Args:
        vals: Iterable of numbers; duplicates collapse to a single key.
        tol: Tolerance forwarded to `cluster_list`.

    Returns:
        Dict `{value: cluster_id}` where ids are 0, 1, 2, … in ascending
        order of the clustered values.
    """
    # cluster_list sorts its input itself, so a plain set is enough here.
    return {
        value: cluster_id
        for cluster_id, cluster in enumerate(cluster_list(set(vals), tol))
        for value in cluster
    }
# ── Utility funcs ───────────────────────────────────────────────────────────
def clean_cell_text(text):
    """Normalise cell text to a single line.

    A hyphen directly followed by a newline is removed together with the
    newline (re-joining a word split across lines), remaining newlines become
    spaces, and every run of whitespace collapses to one space.

    Args:
        text: Value to clean; anything that is not a `str` yields "".

    Returns:
        The cleaned string without leading or trailing whitespace.
    """
    if not isinstance(text, str):
        return ""
    unwrapped = text.replace("-\n", "").replace("\n", " ")
    return " ".join(unwrapped.split())
def safe_join(row):
    """Return the cells of `row` as cleaned strings.

    Args:
        row: Iterable of cell values.

    Returns:
        A list with one string per cell: "" for `None`, otherwise the cell
        converted with `str` and passed through `clean_cell_text`.
    """
    return ["" if cell is None else clean_cell_text(str(cell)) for cell in row]
def clamp_bbox(bbox, page_rect):
    """Clamp each coordinate of a bounding box to the page rectangle.

    Args:
        bbox: Four values `(x0, y0, x1, y1)`.
        page_rect: Object exposing `x0`, `y0`, `x1` and `y1` attributes.

    Returns:
        Tuple `(x0, y0, x1, y1)` with every coordinate limited to the
        corresponding range of `page_rect`.
    """
    def clamp(value, low, high):
        return max(low, min(value, high))

    x0, y0, x1, y1 = bbox
    return (
        clamp(x0, page_rect.x0, page_rect.x1),
        clamp(y0, page_rect.y0, page_rect.y1),
        clamp(x1, page_rect.x0, page_rect.x1),
        clamp(y1, page_rect.y0, page_rect.y1),
    )
# ── Improved table detection with snapping ─────────────────────────────────
def _snap_positions(values, tol):
    """Merge positions within `tol` of each other; return the sorted cluster means."""
    cluster_of = make_cluster_dict(values, tol)
    members = {}
    for value in values:
        members.setdefault(cluster_of[value], []).append(value)
    return sorted(sum(vs) / len(vs) for vs in members.values())


def detect_table_bboxes(page: fitz.Page, tol=LINE_TOLERANCE):
    """
    Detect table rectangles by:
    1. Collecting very thin horizontal & vertical strokes
    2. Snapping their positions with tolerance `tol`
    3. Forming a grid from unique row & column positions
    4. Returning a list[fitz.Rect] for each grid rectangle whose area is at
       least MIN_RECT_AREA

    Args:
        page: Page whose vector drawings are inspected.
        tol: Snapping tolerance in points for nearly identical rulings.

    Returns:
        List of `fitz.Rect`; empty when the page has no horizontal or no
        vertical rulings.
    """
    horiz_y, vert_x = [], []
    for d in page.get_drawings():
        # PyMuPDF reports the path type as a string: "s" (stroke), "f" (fill)
        # or "fs" (both). Comparing against the integer 1 never matched, so
        # every drawing used to be skipped.
        # NOTE(review): rulings drawn as thin filled rectangles ("f") are still
        # ignored here – confirm whether they should count as lines too.
        if d.get("type") not in ("s", "fs"):
            continue
        # The bounding box of a path lives under "rect" (there is no "bbox" key).
        x0, y0, x1, y1 = d["rect"]
        if abs(y1 - y0) < 2:           # horizontal line
            horiz_y.append((y0 + y1) / 2)
        elif abs(x1 - x0) < 2:         # vertical line
            vert_x.append((x0 + x1) / 2)
    if not horiz_y or not vert_x:
        return []

    row_pos = _snap_positions(horiz_y, tol)
    col_pos = _snap_positions(vert_x, tol)

    # One rectangle per pair of neighbouring rows and neighbouring columns.
    # NOTE(review): MIN_RECT_AREA is applied to each grid rectangle, so grids
    # with small cells produce no result – confirm this is intended.
    rects = []
    for r0, r1 in zip(row_pos, row_pos[1:]):
        for c0, c1 in zip(col_pos, col_pos[1:]):
            rect = fitz.Rect(c0, r0, c1, r1)
            if rect.get_area() >= MIN_RECT_AREA:
                rects.append(rect)

    # Remove duplicates / contained rects
    unique = []
    for rect in rects:
        if not any(u.contains(rect) or rect.contains(u) for u in unique):
            unique.append(rect)
    return unique
# ── Table extraction (simple text grouping) ────────────────────────────────
def extract_table(page: fitz.Page, table_rect: fitz.Rect):
    """Group words inside `table_rect` into JSON rows [dict].

    A word belongs to the table when its top-left corner lies inside
    `table_rect`. Words are clustered into text lines (a new line starts when
    a word's top is more than 5 pts away from the top of the line's first
    word). The first line is used as the single key of every row dict and
    each following line is that row's value.

    Args:
        page: Page to read words from.
        table_rect: Area that delimits the table.

    Returns:
        List of single-entry dicts `{header_line: row_line}`; empty when the
        area holds no words or only one line.
    """
    inside = [
        w for w in page.get_text("words")
        if table_rect.x0 <= w[0] <= table_rect.x1
        and table_rect.y0 <= w[1] <= table_rect.y1
    ]
    inside.sort(key=lambda w: (w[1], w[0]))          # sort by y then x

    # cluster words by line
    lines, anchor_y = [], None
    for w in inside:
        if anchor_y is None or abs(w[1] - anchor_y) > 5:
            lines.append([w])
            anchor_y = w[1]
        else:
            lines[-1].append(w)
    if not lines:
        return []

    # Words of one visual line may differ slightly in y, which the global
    # (y, x) sort turns into a scrambled reading order – restore left-to-right.
    line_texts = [
        " ".join(w[4] for w in sorted(ln, key=lambda w: w[0]))
        for ln in lines
    ]
    headers = safe_join([line_texts[0]])
    rows = [safe_join([lt]) for lt in line_texts[1:]]
    return [dict(zip(headers, r)) for r in rows]
# ── Per-page worker ────────────────────────────────────────────────────────
def process_page(args):
    """Render one page as text followed by its detected tables as JSON.

    The document is opened inside the worker so the function can be mapped
    over `(page_number, pdf_path)` tuples by a process pool.

    Args:
        args: Tuple `(page_number, pdf_path)` with a zero-based page number.

    Returns:
        Tuple `(page_number, text)`. On failure `text` is an "[ERROR] …"
        message instead of page content; no exception escapes.
    """
    page_number, pdf_path = args
    try:
        with fitz.open(pdf_path) as doc:
            page = doc.load_page(page_number)
            page_rect = page.rect

            # Detect tables
            table_rects = detect_table_bboxes(page)
            table_jsons = []
            for rect in table_rects:
                tbl = extract_table(page, rect)
                if tbl:
                    table_jsons.append(json.dumps(tbl, indent=1, ensure_ascii=False))

            # Words outside tables (decided by the word's top-left corner)
            tbl_boxes = [clamp_bbox(rect, page_rect) for rect in table_rects]
            outside = [
                w for w in page.get_text("words")
                if not any(b[0] <= w[0] <= b[2] and b[1] <= w[1] <= b[3] for b in tbl_boxes)
            ]
            outside.sort(key=lambda w: (w[1], w[0]))

            # Cluster into text lines: a new line starts when a word's top is
            # more than 10 pts away from the top of the line's first word.
            lines, anchor_y = [], None
            for w in outside:
                if anchor_y is None or abs(w[1] - anchor_y) > 10:
                    lines.append([w])
                    anchor_y = w[1]
                else:
                    lines[-1].append(w)

            # Within a line, order strictly left-to-right; the global (y, x)
            # sort alone scrambles words whose tops differ by a fraction.
            text = "\n".join(
                " ".join(w[4] for w in sorted(ln, key=lambda w: w[0]))
                for ln in lines
            )

            parts = [f"Page {page_number + 1}\n", text.strip() + "\n"]
            parts.extend(
                f'"table {idx}":\n{tbl}\n'
                for idx, tbl in enumerate(table_jsons, 1)
            )
            return page_number, "".join(parts)
    except fitz.FileDataError as e:
        return page_number, f"[ERROR] Page {page_number+1} ({pdf_path}): encrypted / unreadable – {e}"
    except Exception as e:
        return page_number, f"[ERROR] Page {page_number+1} ({pdf_path}): {e}"
# ── Document-level processing ───────────────────────────────────────────────
def process_pdf(pdf_path):
    """Convert one PDF to a `.txt` file next to it and report the outcome.

    Documents with at most PARALLEL_THRESHOLD pages are processed serially,
    longer ones through `run_parallel`. Page results are concatenated in page
    order and written as UTF-8.

    Args:
        pdf_path: Path of the PDF to convert.

    Returns:
        A status line: "[INFO] Processing complete: …" on success, otherwise
        an "[ERROR] …" / "[INFO] …interrupted…" message. Callers print this
        value, so success now returns the message instead of `None`.
    """
    try:
        if not os.path.exists(pdf_path):
            return f"[ERROR] File not found: {pdf_path}"
        print(f"[INFO] Starting processing: {pdf_path}")

        try:
            with fitz.open(pdf_path) as doc:
                num_pages = doc.page_count
        except fitz.FileDataError as e:
            return f"[ERROR] Cannot open PDF: {pdf_path} – {e}"
        except Exception as e:
            return f"[ERROR] General error opening PDF: {pdf_path} – {e}"

        pages = [(i, pdf_path) for i in range(num_pages)]
        if num_pages <= PARALLEL_THRESHOLD:
            results = run_serial(pages)
        else:
            results = run_parallel(pages)
        results.sort(key=lambda item: item[0])
        final_output = "\n".join(text for _, text in results)

        # Same directory and base name as the PDF, with a .txt extension.
        out_path = os.path.splitext(pdf_path)[0] + ".txt"
        with open(out_path, "w", encoding="utf-8", errors="ignore") as f:
            f.write(final_output)
        # Returned rather than printed so the caller's print() shows it once
        # instead of printing "None".
        return f"[INFO] Processing complete: {out_path}"
    except (EOFError, BrokenPipeError, KeyboardInterrupt):
        return "[INFO] Processing interrupted by user."
    except Exception as e:
        return f"[ERROR] Unexpected error with '{pdf_path}': {e}"
def run_serial(pages):
    """Process `(page_number, pdf_path)` tuples one after another.

    Returns:
        List of `process_page` results in input order.
    """
    return [process_page(page_args) for page_args in pages]
def run_parallel(pages):
    """Process `(page_number, pdf_path)` tuples in a multiprocessing pool.

    The pool uses all but two CPU cores (at least one), capped at the number
    of pages.

    Args:
        pages: List of `(page_number, pdf_path)` tuples.

    Returns:
        List of `process_page` results in input order; `[]` for no pages.
    """
    if not pages:
        # min(..., len(pages)) would be 0 and Pool(0) raises ValueError.
        return []
    cores = min(max(1, cpu_count() - 2), len(pages))
    print(f"Starting parallel processing with {cores} cores…")
    with Pool(cores) as pool:
        return pool.map(process_page, pages)
# ── Batch CLI entrypoint ────────────────────────────────────────────────────
def process_pdfs_main():
    """Convert every PDF named on the command line.

    Files are split by page count: PDFs with at most PARALLEL_THRESHOLD pages
    are converted concurrently (one joblib job per file), larger PDFs are
    converted one at a time so each can use its own page-level process pool.
    Missing, password-protected and unreadable files are reported and skipped.
    """
    pdfs = sys.argv[1:]
    if not pdfs:
        print("No PDF files provided.")
        return

    small, large = [], []
    for p in pdfs:
        if not os.path.exists(p):
            print(f"File not found: {p}")
            continue
        try:
            with fitz.open(p) as doc:
                # An encrypted file opens without raising; it has to be
                # detected explicitly through needs_pass.
                if doc.needs_pass:
                    print(f"[ERROR] Password-protected PDF skipped: {p}")
                    continue
                (small if doc.page_count <= PARALLEL_THRESHOLD else large).append(p)
        except fitz.FileDataError as e:
            # FileDataError signals broken/invalid file data, not a password.
            print(f"[ERROR] Unreadable or damaged PDF skipped: {p} – {e}")
        except Exception as e:
            print(f"[ERROR] Error opening {p}: {e}")

    if small:
        cores = min(max(1, cpu_count() - 2), len(small))
        print(f"\n[Phase 1] Parallel processing of {len(small)} small PDFs with {cores} cores …")
        for result in Parallel(n_jobs=cores)(delayed(process_pdf)(p) for p in small):
            if result is not None:
                print(result)

    for p in large:
        print(f"\n[Phase 2] Processing large PDF: {os.path.basename(p)}")
        result = process_pdf(p)
        if result is not None:
            print(result)
# ── Tkinter GUI ─────────────────────────────────────────────────────────────
class FileManager:
    """Tk window that collects PDF files and runs the parser in a child process.

    The parser is this same script started with the selected files as
    arguments; its combined stdout/stderr is streamed into the progress box.
    Selecting a list entry shows the `.txt` file that sits next to the PDF.
    """

    def __init__(self, master):
        """Build all widgets inside `master` (a Tk root or toplevel)."""
        self.master = master
        master.title("Parser-Sevenof9 — PyMuPDF")
        self.files = []               # paths, index-aligned with the listbox rows
        self.last_selected = None     # anchor row for shift-click range selection
        self.parser_proc = None       # running child process, if any
        self._worker = None           # thread that streams the child's output
        self._stop_requested = False  # True once the user pressed "Stop"

        tk.Label(master, text="Selected PDF files:").pack(pady=5)
        list_frame = tk.Frame(master)
        list_frame.pack(pady=5)
        sb_list = tk.Scrollbar(list_frame)
        self.listbox = tk.Listbox(list_frame, selectmode=tk.MULTIPLE, width=80, height=6,
                                  yscrollcommand=sb_list.set)
        sb_list.config(command=self.listbox.yview)
        self.listbox.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_list.pack(side=tk.RIGHT, fill=tk.Y)
        self.listbox.bind("<<ListboxSelect>>", self.show_text)
        self.listbox.bind("<Button-1>", self.on_click)
        self.listbox.bind("<Shift-Button-1>", self.on_shift_click)

        self.ctx = tk.Menu(master, tearoff=0)
        self.ctx.add_command(label="Remove selected", command=self.remove_file)
        self.listbox.bind("<Button-3>", self._show_context_menu)

        btn_frame = tk.Frame(master)
        btn_frame.pack(pady=10)
        tk.Button(btn_frame, text="Add Folder", command=self.add_folder).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Select Files", command=self.add_file).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Remove Selected", command=self.remove_file).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Remove All", command=self.remove_all).pack(side=tk.LEFT, padx=5)
        tk.Button(master, text="Stop", command=self.stop_parser).pack(pady=5)
        tk.Button(master, text="Start Parser", command=self.start_parser).pack(pady=10)

        tx_frame = tk.Frame(master)
        tx_frame.pack(padx=10, pady=5)
        sb_text = tk.Scrollbar(tx_frame)
        self.text = tk.Text(tx_frame, height=15, width=100, wrap=tk.WORD, yscrollcommand=sb_text.set)
        sb_text.config(command=self.text.yview)
        self.text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_text.pack(side=tk.RIGHT, fill=tk.Y)

        tk.Label(master, text="Progress:").pack()
        prog_frame = tk.Frame(master)
        prog_frame.pack(padx=10, pady=5)
        sb_prog = tk.Scrollbar(prog_frame)
        self.prog = tk.Text(prog_frame, height=8, width=100, state=tk.DISABLED, yscrollcommand=sb_prog.set)
        sb_prog.config(command=self.prog.yview)
        self.prog.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_prog.pack(side=tk.RIGHT, fill=tk.Y)

    # ── Listbox helpers ───────────────────────────────────────────────────
    def _show_context_menu(self, e):
        """Pop up the context menu, but only while something is selected."""
        if self.listbox.curselection():
            self.ctx.tk_popup(e.x_root, e.y_root)

    def on_click(self, e):
        """Select exactly the clicked row and preview its text file."""
        idx = self.listbox.nearest(e.y)
        if idx < 0:                     # empty listbox – nothing to select
            return "break"
        self.listbox.selection_clear(0, tk.END)
        self.listbox.selection_set(idx)
        self.last_selected = idx
        self.show_text(None)
        return "break"                  # suppress the default MULTIPLE toggle

    def on_shift_click(self, e):
        """Select the range between the last clicked row and this one."""
        idx = self.listbox.nearest(e.y)
        if idx < 0:
            return "break"
        if self.last_selected is None:
            self.last_selected = idx
        lo, hi = sorted((self.last_selected, idx))
        self.listbox.selection_clear(0, tk.END)
        self.listbox.selection_set(lo, hi)
        return "break"

    # ── File ops ─────────────────────────────────────────────────────────
    def _add_path(self, path):
        """Append `path` to the list unless it is already present."""
        # File dialogs and os.walk can spell the same file with different
        # separators; normalising keeps the duplicate check meaningful.
        path = os.path.normpath(path)
        if path not in self.files:
            self.files.append(path)
            self.listbox.insert(tk.END, path)

    def add_folder(self):
        """Ask for a folder and add every `.pdf` found below it."""
        folder = filedialog.askdirectory(title="Select Folder")
        if not folder:
            return
        for root, _, names in os.walk(folder):
            for name in names:
                if name.lower().endswith(".pdf"):
                    self._add_path(os.path.join(root, name))

    def add_file(self):
        """Ask for one or more PDF files and add them."""
        chosen = filedialog.askopenfilenames(title="Select PDF Files", filetypes=[("PDF Files", "*.pdf")])
        for path in chosen:
            self._add_path(path)

    def remove_file(self):
        """Remove the selected rows from the listbox and the file list."""
        sel = self.listbox.curselection()
        if not sel:
            messagebox.showwarning("Notice", "Please select an entry to remove.")
            return
        # Delete from the bottom up so earlier indices stay valid.
        for idx in reversed(sel):
            self.listbox.delete(idx)
            del self.files[idx]
        self.last_selected = None       # the old anchor row may no longer exist
        self.text.delete(1.0, tk.END)

    def remove_all(self):
        """Clear the file list and the preview."""
        self.listbox.delete(0, tk.END)
        self.files.clear()
        self.last_selected = None
        self.text.delete(1.0, tk.END)

    # ── Parser control ───────────────────────────────────────────────────
    @staticmethod
    def _parser_command(files):
        """Return the argv list that runs this script on `files`."""
        # A frozen executable is its own entry point; there is no script path.
        if getattr(sys, "frozen", False):
            return [sys.executable, *files]
        return [sys.executable, os.path.abspath(__file__), *files]

    @staticmethod
    def _child_env():
        """Return the environment for the parser child process."""
        env = dict(os.environ)
        # The pipe is decoded as UTF-8 below, so make the child emit UTF-8 ...
        env["PYTHONIOENCODING"] = "utf-8"
        # ... and unbuffered, so progress lines arrive while it is running.
        env["PYTHONUNBUFFERED"] = "1"
        return env

    def start_parser(self):
        """Start the parser on a background thread unless one is running."""
        if not self.files:
            messagebox.showinfo("No Files", "Please select at least one file.")
            return
        if self._worker is not None and self._worker.is_alive():
            messagebox.showinfo("Parser Running", "The parser is already running.")
            return
        self.prog.config(state=tk.NORMAL)
        self.prog.delete(1.0, tk.END)
        self.prog.insert(tk.END, "Starting parser…\n")
        self.prog.config(state=tk.DISABLED)
        self._stop_requested = False
        # Daemon thread: closing the window must not be blocked by the reader.
        self._worker = threading.Thread(target=self.run_parser, daemon=True)
        self._worker.start()

    def stop_parser(self):
        """Terminate the running parser process, if any."""
        proc = self.parser_proc         # snapshot: the worker may reset the attribute
        if proc is not None and proc.poll() is None:
            self._stop_requested = True
            proc.terminate()
            self.append_prog("Parser process was stopped.\n")
        else:
            self.append_prog("No active parser process to stop.\n")

    def run_parser(self):
        """Run the parser child process and stream its output (worker thread)."""
        try:
            proc = subprocess.Popen(
                self._parser_command(list(self.files)),
                stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                text=True, encoding="utf-8", errors="ignore", bufsize=4096,
                env=self._child_env(),
            )
            self.parser_proc = proc
            with proc.stdout:
                for line in proc.stdout:
                    self.append_prog(line)
            proc.wait()
            if self._stop_requested:
                return                  # stop_parser already reported it
            if proc.returncode == 0:
                self.append_prog("\nParser finished successfully.\n")
                self.shell_msg("Parser Done", "The parser was executed successfully.")
            else:
                self.append_prog("\nError while running the parser.\n")
                self.shell_msg("Error", "Error while running the parser.")
        except Exception as e:
            self.append_prog(f"Error: {e}\n")
            self.shell_msg("Error", f"Execution error:\n{e}")
        finally:
            self.parser_proc = None

    # ── GUI helpers ──────────────────────────────────────────────────────
    def append_prog(self, txt):
        """Schedule `txt` to be appended to the progress box."""
        self.prog.after(0, lambda: self._ins(txt))

    def _ins(self, txt):
        """Append `txt` to the read-only progress box and scroll to the end."""
        self.prog.config(state=tk.NORMAL)
        self.prog.insert(tk.END, txt)
        self.prog.see(tk.END)
        self.prog.config(state=tk.DISABLED)

    def shell_msg(self, title, msg):
        """Schedule an info dialog with `title` and `msg`."""
        self.master.after(0, lambda: messagebox.showinfo(title, msg))

    def show_text(self, _):
        """Show the `.txt` next to the first selected PDF in the preview."""
        sel = self.listbox.curselection()
        if not sel:
            return
        txt = os.path.splitext(self.files[sel[0]])[0] + ".txt"
        self.text.delete(1.0, tk.END)
        if not os.path.exists(txt):
            self.text.insert(tk.END, "[No corresponding .txt file found]")
            return
        try:
            with open(txt, "r", encoding="utf-8", errors="ignore") as f:
                self.text.insert(tk.END, f.read())
        except Exception as e:
            self.text.insert(tk.END, f"Error loading text file:\n{e}")
# ── Main guard ─────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Needed so a frozen Windows executable can start multiprocessing workers.
    multiprocessing.freeze_support()
    if len(sys.argv) > 1:
        # File arguments given: run the batch parser without a GUI.
        process_pdfs_main()
    else:
        root = tk.Tk()
        FileManager(root)
        root.mainloop()