|
|
|
import json
import logging
import math
import multiprocessing
import os
import queue
import shutil
import subprocess
import sys
import tempfile
import threading
import tkinter as tk
from multiprocessing import Pool
from pathlib import Path
from tkinter import filedialog, messagebox

import fitz
from joblib import cpu_count, Parallel, delayed
|
|
|
|
|
# PDFs with at most this many pages are processed page-by-page in one process;
# larger ones have their pages spread over a multiprocessing pool.
PARALLEL_THRESHOLD = 16

# Default maximum gap between stroke positions that are snapped onto the same
# grid line (presumably PDF points, the unit PyMuPDF reports -- confirm).
LINE_TOLERANCE = 1

# Grid cells whose area is below this value are discarded as table candidates.
MIN_RECT_AREA = 1e4
|
|
|
|
|
def cluster_list(xs, tol):
    """Group values into clusters of neighbours lying at most ``tol`` apart.

    The values are sorted first.  A value joins the current cluster when its
    distance to that cluster's last (largest) member is ``<= tol``; otherwise
    it opens a new cluster.  Because membership is decided against the
    previous value only, a cluster may span more than ``tol`` overall when
    its members chain together.

    Args:
        xs: Iterable of mutually comparable numbers.
        tol: Largest allowed gap between consecutive sorted values.

    Returns:
        A list of clusters in ascending order, each cluster being a sorted
        list.  Empty input yields an empty list.
    """
    ordered = sorted(xs)
    if not ordered:
        return []

    groups = [[ordered[0]]]
    for value in ordered[1:]:
        if value - groups[-1][-1] <= tol:
            groups[-1].append(value)
        else:
            groups.append([value])
    return groups
|
|
|
def make_cluster_dict(vals, tol):
    """Map every distinct value to the id (0, 1, 2, ...) of its cluster.

    Clusters are computed by ``cluster_list`` over the distinct values, so ids
    increase with the values themselves.

    Args:
        vals: Iterable of hashable, mutually comparable numbers.
        tol: Tolerance forwarded to ``cluster_list``.

    Returns:
        A dict ``{value: cluster_id}`` with one entry per distinct value.
    """
    # cluster_list sorts its input itself, so a plain set is sufficient here.
    clusters = cluster_list(set(vals), tol)
    return {
        value: cluster_id
        for cluster_id, cluster in enumerate(clusters)
        for value in cluster
    }
|
|
|
|
|
def clean_cell_text(text):
    """Normalise the text of one cell to a single, whitespace-collapsed line.

    Args:
        text: The raw cell content.  Anything that is not a ``str`` is treated
            as empty.

    Returns:
        The text with end-of-line hyphenation (``"-\\n"``) removed, remaining
        newlines turned into spaces and every run of whitespace collapsed to
        one space; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # Re-join words split by a hyphen at a line break before flattening lines.
    flattened = text.replace("-\n", "").replace("\n", " ")
    return " ".join(flattened.split())
|
|
|
def safe_join(row):
    """Return the cells of ``row`` as cleaned strings.

    Args:
        row: Iterable of cell values of any type.

    Returns:
        A list with one string per cell: ``""`` for ``None``, otherwise the
        result of ``clean_cell_text(str(cell))``.
    """
    return ["" if cell is None else clean_cell_text(str(cell)) for cell in row]
|
|
|
def clamp_bbox(bbox, page_rect):
    """Clamp a bounding box so that all four coordinates lie on the page.

    Args:
        bbox: Sequence ``(x0, y0, x1, y1)``.
        page_rect: Object exposing ``x0``, ``y0``, ``x1`` and ``y1``
            attributes that describe the page bounds.

    Returns:
        A tuple ``(x0, y0, x1, y1)`` with each coordinate limited to the
        corresponding page range.
    """
    def _clamp(value, low, high):
        return max(low, min(value, high))

    x0, y0, x1, y1 = bbox
    return (
        _clamp(x0, page_rect.x0, page_rect.x1),
        _clamp(y0, page_rect.y0, page_rect.y1),
        _clamp(x1, page_rect.x0, page_rect.x1),
        _clamp(y1, page_rect.y0, page_rect.y1),
    )
|
|
|
|
|
# Path types PyMuPDF documents for Page.get_drawings(): "s" (stroke), "f"
# (fill) and "fs" (both).  The integer 1 is kept only so that input which did
# carry the value the earlier code compared against still matches.
_LINE_PATH_TYPES = ("s", "f", "fs", 1)


def _cluster_means(values, tol):
    """Return the sorted mean position of every tolerance cluster in ``values``."""
    cluster_of = make_cluster_dict(values, tol)
    buckets = {}
    for value in values:
        buckets.setdefault(cluster_of[value], []).append(value)
    return sorted(sum(bucket) / len(bucket) for bucket in buckets.values())


def detect_table_bboxes(page: fitz.Page, tol=LINE_TOLERANCE):
    """Detect candidate table cell rectangles from thin ruling lines.

    Steps:
        1. Collect drawings whose bounding rectangle is less than 2 units
           high (horizontal rules) or less than 2 units wide (vertical rules).
        2. Snap their mid positions together with tolerance ``tol``.
        3. Form a grid from the resulting row and column positions.
        4. Keep the grid cells whose area is at least ``MIN_RECT_AREA``.

    Args:
        page: The page to analyse.
        tol: Maximum gap between rule positions that are merged into one
            grid line.

    Returns:
        A list of ``fitz.Rect`` objects, one per retained grid cell; empty
        when the page has no horizontal or no vertical rules.
    """
    row_positions, col_positions = [], []
    for drawing in page.get_drawings():
        # PyMuPDF reports the path type as a string, so a comparison with the
        # integer 1 alone would skip every drawing.
        if drawing.get("type") not in _LINE_PATH_TYPES:
            continue
        # The documented key for the covered area is "rect"; "bbox" is only a
        # fallback.  Explicit None checks are used because an all-zero Rect
        # may evaluate as false.
        box = drawing.get("rect")
        if box is None:
            box = drawing.get("bbox")
        if box is None:
            continue

        x0, y0, x1, y1 = box
        if abs(y1 - y0) < 2:
            row_positions.append((y0 + y1) / 2)
        elif abs(x1 - x0) < 2:
            col_positions.append((x0 + x1) / 2)

    if not row_positions or not col_positions:
        return []

    row_pos = _cluster_means(row_positions, tol)
    col_pos = _cluster_means(col_positions, tol)

    # Every pair of neighbouring grid lines spans one candidate cell.
    cells = []
    for top, bottom in zip(row_pos, row_pos[1:]):
        for left, right in zip(col_pos, col_pos[1:]):
            cell = fitz.Rect(left, top, right, bottom)
            if cell.get_area() >= MIN_RECT_AREA:
                cells.append(cell)

    # Drop rectangles that contain, or are contained in, one already kept.
    unique = []
    for cell in cells:
        if not any(kept.contains(cell) or cell.contains(kept) for kept in unique):
            unique.append(cell)
    return unique
|
|
|
|
|
def extract_table(page: fitz.Page, table_rect: fitz.Rect):
    """Turn the words inside ``table_rect`` into a list of JSON-ready rows.

    Words whose top-left corner lies inside ``table_rect`` are grouped into
    text lines: a word starts a new line when its top coordinate differs by
    more than 5 units from the first word of the current line.  The first
    line is used as the (single) header; each following line becomes one row.

    Args:
        page: Page providing ``get_text("words")``; each word is indexed as
            ``(x0, y0, x1, y1, text, ...)``.
        table_rect: Rectangle with ``x0``, ``y0``, ``x1`` and ``y1``.

    Returns:
        A list of one-key dicts ``{header_line: row_line}``; empty when the
        rectangle holds no words or only a single line.
    """
    words = [
        w for w in page.get_text("words")
        if table_rect.x0 <= w[0] <= table_rect.x1
        and table_rect.y0 <= w[1] <= table_rect.y1
    ]
    words.sort(key=lambda w: (w[1], w[0]))

    lines = []
    line_top = None
    for word in words:
        if line_top is None or abs(word[1] - line_top) > 5:
            lines.append([word])
            line_top = word[1]
        else:
            lines[-1].append(word)

    if not lines:
        return []

    # Words of one line may differ slightly in their top coordinate, so the
    # (y, x) sort above does not guarantee left-to-right order inside a line.
    line_texts = [
        " ".join(w[4] for w in sorted(line, key=lambda w: w[0]))
        for line in lines
    ]
    header = clean_cell_text(line_texts[0])
    return [{header: clean_cell_text(text)} for text in line_texts[1:]]
|
|
|
|
|
def process_page(args):
    """Extract the plain text and detected tables of one PDF page.

    The document is opened inside this function so that it can be used as a
    worker with ``multiprocessing.Pool.map``.

    Args:
        args: Tuple ``(page_number, pdf_path)`` with a zero-based page number.

    Returns:
        A tuple ``(page_number, text)``.  ``text`` starts with a
        ``"Page N"`` heading, followed by the words outside all detected
        table rectangles (grouped into lines) and one ``"table i"`` JSON
        section per non-empty table.  On failure ``text`` is an ``[ERROR]``
        message instead; no exception is propagated.
    """
    page_number, pdf_path = args
    try:
        with fitz.open(pdf_path) as doc:
            page = doc.load_page(page_number)

            table_rects = detect_table_bboxes(page)
            table_jsons = []
            for rect in table_rects:
                table = extract_table(page, rect)
                if table:
                    table_jsons.append(
                        json.dumps(table, indent=1, ensure_ascii=False)
                    )

            # Words whose top-left corner falls into a table are left out of
            # the running text.
            boxes = [clamp_bbox(rect, page.rect) for rect in table_rects]
            outside = [
                w for w in page.get_text("words")
                if not any(
                    b[0] <= w[0] <= b[2] and b[1] <= w[1] <= b[3] for b in boxes
                )
            ]
            outside.sort(key=lambda w: (w[1], w[0]))

            lines = []
            line_top = None
            for word in outside:
                if line_top is None or abs(word[1] - line_top) > 10:
                    lines.append([word])
                    line_top = word[1]
                else:
                    lines[-1].append(word)

            # Re-sort each line by x: the (y, x) sort alone can misorder
            # words whose top coordinates differ slightly within one line.
            text = "\n".join(
                " ".join(w[4] for w in sorted(line, key=lambda w: w[0]))
                for line in lines
            )

            parts = [f"Page {page_number + 1}\n", text.strip() + "\n"]
            parts.extend(
                f'"table {idx}":\n{tbl}\n'
                for idx, tbl in enumerate(table_jsons, 1)
            )
            return page_number, "".join(parts)

    except fitz.FileDataError as e:
        return page_number, f"[ERROR] Page {page_number+1} ({pdf_path}): encrypted / unreadable – {e}"
    except Exception as e:
        # Worker boundary: report the failure as page text instead of
        # aborting the whole document.
        return page_number, f"[ERROR] Page {page_number+1} ({pdf_path}): {e}"
|
|
|
|
|
def process_pdf(pdf_path):
    """Convert one PDF into a ``.txt`` file stored next to it.

    Pages are processed serially when the document has at most
    ``PARALLEL_THRESHOLD`` pages and through ``run_parallel`` otherwise.  The
    per-page results are written, in page order, to ``<name>.txt`` in the
    directory of the PDF.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A status line for the caller to print: ``[INFO] Processing
        complete: ...`` on success, otherwise an ``[ERROR]`` / ``[INFO]``
        message describing why processing stopped.  No exception is
        propagated.
    """
    try:
        if not os.path.exists(pdf_path):
            return f"[ERROR] File not found: {pdf_path}"

        print(f"[INFO] Starting processing: {pdf_path}")
        try:
            with fitz.open(pdf_path) as doc:
                num_pages = doc.page_count
        except fitz.FileDataError as e:
            return f"[ERROR] Cannot open PDF: {pdf_path} – {e}"
        except Exception as e:
            return f"[ERROR] General error opening PDF: {pdf_path} – {e}"

        pages = [(i, pdf_path) for i in range(num_pages)]
        if num_pages <= PARALLEL_THRESHOLD:
            results = run_serial(pages)
        else:
            results = run_parallel(pages)

        results.sort(key=lambda item: item[0])
        final_output = "\n".join(text for _, text in results)

        base = os.path.splitext(os.path.basename(pdf_path))[0]
        out_path = os.path.join(os.path.dirname(pdf_path), f"{base}.txt")
        with open(out_path, "w", encoding="utf-8", errors="ignore") as f:
            f.write(final_output)

        # Returned rather than printed: callers print the result, and a
        # missing return value would show up there as "None".
        return f"[INFO] Processing complete: {out_path}"
    except (EOFError, BrokenPipeError, KeyboardInterrupt):
        return "[INFO] Processing interrupted by user."
    except Exception as e:
        return f"[ERROR] Unexpected error with '{pdf_path}': {e}"
|
|
|
def run_serial(pages):
    """Process every ``(page_number, pdf_path)`` job in this process.

    Returns:
        The list of ``process_page`` results in input order.
    """
    return [process_page(job) for job in pages]
|
|
|
def run_parallel(pages):
    """Process the ``(page_number, pdf_path)`` jobs in a multiprocessing pool.

    The pool uses ``cpu_count() - 2`` workers, but at least one and never
    more than there are jobs.

    Args:
        pages: Sequence of ``(page_number, pdf_path)`` tuples.

    Returns:
        The list of ``process_page`` results in input order; an empty list
        when there is nothing to do.
    """
    if not pages:
        # Pool(0) raises ValueError, so do not create a pool for no work.
        return []
    cores = min(max(1, cpu_count() - 2), len(pages))
    print(f"Starting parallel processing with {cores} cores…")
    with Pool(cores) as pool:
        return pool.map(process_page, pages)
|
|
|
|
|
def process_pdfs_main(pdfs=None):
    """Command-line entry point: convert every given PDF to text.

    PDFs with at most ``PARALLEL_THRESHOLD`` pages are processed together in
    a joblib worker pool (phase 1); larger ones are processed one after the
    other (phase 2), each using its own page-level pool.  Status lines are
    printed to stdout.

    Args:
        pdfs: Paths of the PDFs to process.  Defaults to ``sys.argv[1:]``.
    """
    if pdfs is None:
        pdfs = sys.argv[1:]
    if not pdfs:
        print("No PDF files provided.")
        return

    small, large = [], []
    for path in pdfs:
        if not os.path.exists(path):
            print(f"File not found: {path}")
            continue
        try:
            with fitz.open(path) as doc:
                is_small = doc.page_count <= PARALLEL_THRESHOLD
        except fitz.FileDataError:
            print(f"[ERROR] Password-protected PDF skipped: {path}")
        except Exception as e:
            print(f"[ERROR] Error opening {path}: {e}")
        else:
            (small if is_small else large).append(path)

    if small:
        cores = min(max(1, cpu_count() - 2), len(small))
        print(f"\n[Phase 1] Parallel processing of {len(small)} small PDFs with {cores} cores …")
        for result in Parallel(n_jobs=cores)(delayed(process_pdf)(p) for p in small):
            print(result)

    for path in large:
        print(f"\n[Phase 2] Processing large PDF: {os.path.basename(path)}")
        print(process_pdf(path))
|
|
|
|
|
class FileManager:
    """Tkinter front end for selecting PDFs and running the parser on them.

    The window shows the selected files, a preview of the ``.txt`` file that
    belongs to the selected PDF, and the output of the parser, which is run
    as a child process of this same script with the PDF paths as arguments.
    """

    def __init__(self, master):
        """Build all widgets inside ``master`` (the Tk root window)."""
        self.master = master
        master.title("Parser-Sevenof9 – PyMuPDF")

        self.files = []            # PDF paths, parallel to the listbox rows
        self.last_selected = None  # anchor row for shift-click ranges
        self.parser_proc = None    # running subprocess.Popen, if any
        self._worker = None        # thread that reads the parser output

        tk.Label(master, text="Selected PDF files:").pack(pady=5)

        # File list with scrollbar.
        list_frame = tk.Frame(master)
        list_frame.pack(pady=5)
        sb_list = tk.Scrollbar(list_frame)
        self.listbox = tk.Listbox(
            list_frame, selectmode=tk.MULTIPLE, width=80, height=6,
            yscrollcommand=sb_list.set,
        )
        sb_list.config(command=self.listbox.yview)
        self.listbox.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_list.pack(side=tk.RIGHT, fill=tk.Y)
        self.listbox.bind("<<ListboxSelect>>", self.show_text)
        self.listbox.bind("<Button-1>", self.on_click)
        self.listbox.bind("<Shift-Button-1>", self.on_shift_click)

        # Context menu on right click.
        self.ctx = tk.Menu(master, tearoff=0)
        self.ctx.add_command(label="Remove selected", command=self.remove_file)
        self.listbox.bind("<Button-3>", self._show_context_menu)

        # Buttons.
        btn_frame = tk.Frame(master)
        btn_frame.pack(pady=10)
        tk.Button(btn_frame, text="Add Folder", command=self.add_folder).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Select Files", command=self.add_file).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Remove Selected", command=self.remove_file).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Remove All", command=self.remove_all).pack(side=tk.LEFT, padx=5)
        tk.Button(master, text="Stop", command=self.stop_parser).pack(pady=5)
        tk.Button(master, text="Start Parser", command=self.start_parser).pack(pady=10)

        # Preview of the extracted text.
        tx_frame = tk.Frame(master)
        tx_frame.pack(padx=10, pady=5)
        sb_text = tk.Scrollbar(tx_frame)
        self.text = tk.Text(
            tx_frame, height=15, width=100, wrap=tk.WORD,
            yscrollcommand=sb_text.set,
        )
        sb_text.config(command=self.text.yview)
        self.text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_text.pack(side=tk.RIGHT, fill=tk.Y)

        # Read-only progress log.
        tk.Label(master, text="Progress:").pack()
        prog_frame = tk.Frame(master)
        prog_frame.pack(padx=10, pady=5)
        sb_prog = tk.Scrollbar(prog_frame)
        self.prog = tk.Text(
            prog_frame, height=8, width=100, state=tk.DISABLED,
            yscrollcommand=sb_prog.set,
        )
        sb_prog.config(command=self.prog.yview)
        self.prog.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_prog.pack(side=tk.RIGHT, fill=tk.Y)

    # ----------------------------------------------------------- selection

    def _show_context_menu(self, e):
        """Open the context menu at the pointer if any row is selected."""
        if self.listbox.curselection():
            self.ctx.tk_popup(e.x_root, e.y_root)

    def on_click(self, e):
        """Select only the clicked row and show its text preview."""
        idx = self.listbox.nearest(e.y)
        if idx < 0:
            # Empty listbox: nothing to select.
            return "break"
        self.listbox.selection_clear(0, tk.END)
        self.listbox.selection_set(idx)
        self.last_selected = idx
        self.show_text(None)
        # Stop Tk's default handler from toggling the selection again.
        return "break"

    def on_shift_click(self, e):
        """Select the range between the last clicked row and this one."""
        idx = self.listbox.nearest(e.y)
        if self.last_selected is None:
            self.last_selected = idx
        lo, hi = sorted((self.last_selected, idx))
        self.listbox.selection_clear(0, tk.END)
        for i in range(lo, hi + 1):
            self.listbox.selection_set(i)
        return "break"

    # ------------------------------------------------------ file handling

    def _add_path(self, path):
        """Append ``path`` to the file list unless it is already present."""
        if path not in self.files:
            self.files.append(path)
            self.listbox.insert(tk.END, path)

    def add_folder(self):
        """Ask for a folder and add every ``.pdf`` found below it."""
        folder = filedialog.askdirectory(title="Select Folder")
        if not folder:
            return
        for root, _, names in os.walk(folder):
            for name in names:
                if name.lower().endswith(".pdf"):
                    self._add_path(os.path.join(root, name))

    def add_file(self):
        """Ask for one or more PDF files and add them to the list."""
        chosen = filedialog.askopenfilenames(
            title="Select PDF Files", filetypes=[("PDF Files", "*.pdf")]
        )
        for path in chosen:
            self._add_path(path)

    def remove_file(self):
        """Remove the selected rows from the list."""
        sel = self.listbox.curselection()
        if not sel:
            messagebox.showwarning("Notice", "Please select an entry to remove.")
            return
        # Delete from the bottom up so earlier indices stay valid.
        for idx in reversed(sel):
            self.listbox.delete(idx)
            del self.files[idx]
        # The remembered anchor row may no longer exist.
        self.last_selected = None
        self.text.delete("1.0", tk.END)

    def remove_all(self):
        """Clear the file list and the text preview."""
        self.listbox.delete(0, tk.END)
        self.files.clear()
        self.last_selected = None
        self.text.delete("1.0", tk.END)

    # ------------------------------------------------------------- parser

    def _parser_command(self):
        """Return the argv used to run this script as the parser process."""
        # -u: unbuffered child output, so progress lines arrive as they are
        # printed instead of when the child's pipe buffer fills.
        return [sys.executable, "-u", os.path.abspath(__file__)] + list(self.files)

    def _parser_env(self):
        """Return the child's environment with UTF-8 standard streams."""
        env = os.environ.copy()
        # The output is decoded as UTF-8 in run_parser; without this the
        # child would encode it with the locale's encoding.
        env["PYTHONIOENCODING"] = "utf-8"
        return env

    def start_parser(self):
        """Start the parser for the listed files in a background thread."""
        if not self.files:
            messagebox.showinfo("No Files", "Please select at least one file.")
            return
        worker = getattr(self, "_worker", None)
        if worker is not None and worker.is_alive():
            messagebox.showinfo("Parser Running", "The parser is already running.")
            return
        self.prog.config(state=tk.NORMAL)
        self.prog.delete("1.0", tk.END)
        self.prog.insert(tk.END, "Starting parser…\n")
        self.prog.config(state=tk.DISABLED)
        self._worker = threading.Thread(target=self.run_parser)
        self._worker.start()

    def stop_parser(self):
        """Terminate the running parser process, if there is one."""
        # Work on a local reference: run_parser resets self.parser_proc from
        # another thread when the process ends.
        proc = self.parser_proc
        if proc is not None and proc.poll() is None:
            proc.terminate()
            self.append_prog("Parser process was stopped.\n")
        else:
            self.append_prog("No active parser process to stop.\n")

    def run_parser(self):
        """Run the parser process and forward its output to the log.

        Intended to run in a worker thread; all widget updates are scheduled
        through ``append_prog`` and ``shell_msg``.
        """
        try:
            proc = subprocess.Popen(
                self._parser_command(),
                stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                text=True, encoding="utf-8", errors="ignore", bufsize=4096,
                env=self._parser_env(),
            )
            self.parser_proc = proc
            with proc.stdout:
                for line in proc.stdout:
                    self.append_prog(line)
            proc.wait()
            if proc.returncode == 0:
                self.append_prog("\nParser finished successfully.\n")
                self.shell_msg("Parser Done", "The parser was executed successfully.")
            else:
                self.append_prog("\nError while running the parser.\n")
                self.shell_msg("Error", "Error while running the parser.")
        except Exception as e:
            self.append_prog(f"Error: {e}\n")
            self.shell_msg("Error", f"Execution error:\n{e}")
        finally:
            self.parser_proc = None

    # ---------------------------------------------------------- UI helpers

    def append_prog(self, txt):
        """Schedule ``txt`` to be appended to the progress log."""
        self.prog.after(0, lambda: self._ins(txt))

    def _ins(self, txt):
        """Append ``txt`` to the read-only progress log and scroll to it."""
        self.prog.config(state=tk.NORMAL)
        self.prog.insert(tk.END, txt)
        self.prog.see(tk.END)
        self.prog.config(state=tk.DISABLED)

    def shell_msg(self, title, msg):
        """Schedule a message box; the title "Error" uses the error style."""
        show = messagebox.showerror if title == "Error" else messagebox.showinfo
        self.master.after(0, lambda: show(title, msg))

    def show_text(self, _):
        """Show the ``.txt`` file belonging to the first selected PDF."""
        sel = self.listbox.curselection()
        if not sel:
            return
        txt_path = os.path.splitext(self.files[sel[0]])[0] + ".txt"
        self.text.delete("1.0", tk.END)
        if not os.path.exists(txt_path):
            self.text.insert(tk.END, "[No corresponding .txt file found]")
            return
        try:
            with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                self.text.insert(tk.END, f.read())
        except Exception as e:
            self.text.insert(tk.END, f"Error loading text file:\n{e}")
|
|
|
|
|
if __name__ == "__main__":
    # Must run before any worker process is created when the script is
    # frozen into a Windows executable.
    multiprocessing.freeze_support()
    if len(sys.argv) > 1:
        # Called with PDF paths (this is how the GUI starts the parser).
        process_pdfs_main()
    else:
        # No arguments: open the graphical file manager.
        root = tk.Tk()
        app = FileManager(root)
        root.mainloop()
|
|