Create pymupdf_test_WIP.py
Browse files- pymupdf_test_WIP.py +428 -0
pymupdf_test_WIP.py
ADDED
@@ -0,0 +1,428 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ── Standard library ────────────────────────────────────────────────────────
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
import json
|
5 |
+
import math
|
6 |
+
import queue
|
7 |
+
import shutil
|
8 |
+
import logging
|
9 |
+
import tempfile
|
10 |
+
import threading
|
11 |
+
import subprocess
|
12 |
+
import multiprocessing
|
13 |
+
from pathlib import Path
|
14 |
+
from multiprocessing import Pool
|
15 |
+
|
16 |
+
# ── Third-party ─────────────────────────────────────────────────────────────
|
17 |
+
import fitz # PyMuPDF
|
18 |
+
import tkinter as tk
|
19 |
+
from tkinter import filedialog, messagebox
|
20 |
+
from joblib import cpu_count, Parallel, delayed
|
21 |
+
|
22 |
+
# ── Parser configuration ────────────────────────────────────────────────────
|
23 |
+
# Tunables shared by the page/table parsing functions below.
PARALLEL_THRESHOLD = 16  # page count; PDFs with more pages use multiprocessing
LINE_TOLERANCE = 1  # points; rulings this close together are snapped into one
MIN_RECT_AREA = 1e4  # square points; rectangles smaller than this are ignored
|
26 |
+
|
27 |
+
# ── pdfplumber-style clustering helpers ─────────────────────────────────────
|
28 |
+
def cluster_list(xs, tol):
    """Split values into clusters of neighbours that are at most `tol` apart.

    The values are sorted ascending first. A value joins the current cluster
    when it is no more than `tol` above the previous value; otherwise it
    starts a new cluster.

    Returns:
        A list of clusters (each a list of values) in ascending order.
        Empty input yields an empty list.
    """
    clusters = []
    for value in sorted(xs):
        # Compare against the last member of the last cluster, i.e. the
        # immediately preceding value in sorted order.
        if clusters and value - clusters[-1][-1] <= tol:
            clusters[-1].append(value)
        else:
            clusters.append([value])
    return clusters
|
42 |
+
|
43 |
+
def make_cluster_dict(vals, tol):
    """Map every distinct value in `vals` to the index of its cluster.

    Clusters are produced by `cluster_list` with tolerance `tol` and are
    numbered 0, 1, 2, ... in ascending order of their values.
    """
    distinct = sorted(set(vals))
    return {
        value: cluster_id
        for cluster_id, members in enumerate(cluster_list(distinct, tol))
        for value in members
    }
|
51 |
+
|
52 |
+
# ── Utility funcs ───────────────────────────────────────────────────────────
|
53 |
+
def clean_cell_text(text):
    """Normalise the text of one cell to a single line.

    Non-string input yields "". A hyphen directly followed by a newline is
    removed (re-joining a word split across lines), remaining newlines become
    spaces, and every run of whitespace collapses to one space with leading
    and trailing whitespace dropped.
    """
    if isinstance(text, str):
        dehyphenated = text.replace("-\n", "")
        single_line = dehyphenated.replace("\n", " ")
        return " ".join(single_line.split())
    return ""
|
58 |
+
|
59 |
+
def safe_join(row):
    """Return a list with every cell of `row` cleaned by `clean_cell_text`.

    None cells become ""; any other cell is converted with str() first.
    """
    cleaned = []
    for cell in row:
        cleaned.append("" if cell is None else clean_cell_text(str(cell)))
    return cleaned
|
61 |
+
|
62 |
+
def clamp_bbox(bbox, page_rect):
    """Clamp each coordinate of `bbox` into the bounds of `page_rect`.

    Args:
        bbox: Four values ``(x0, y0, x1, y1)``.
        page_rect: Object exposing ``x0``, ``y0``, ``x1`` and ``y1``.

    Returns:
        A plain ``(x0, y0, x1, y1)`` tuple; x values are limited to
        ``[page_rect.x0, page_rect.x1]`` and y values to
        ``[page_rect.y0, page_rect.y1]``.
    """
    def _limit(value, low, high):
        return max(low, min(value, high))

    left, top, right, bottom = bbox
    return (
        _limit(left, page_rect.x0, page_rect.x1),
        _limit(top, page_rect.y0, page_rect.y1),
        _limit(right, page_rect.x0, page_rect.x1),
        _limit(bottom, page_rect.y0, page_rect.y1),
    )
|
69 |
+
|
70 |
+
# ── Improved table detection with snapping ──────────────────────────────────
|
71 |
+
def detect_table_bboxes(page: fitz.Page, tol=LINE_TOLERANCE):
    """
    Detect table rectangles by:
    1. Collecting very thin horizontal & vertical stroked paths
    2. Snapping their positions with tolerance `tol`
    3. Forming a grid from unique row & column positions
    4. Returning a list[fitz.Rect] with one rectangle per grid cell

    Only grid cells whose area is at least MIN_RECT_AREA are kept, and a cell
    is dropped when it contains, or is contained in, a cell already kept.
    Returns an empty list when the page has no horizontal or no vertical
    rulings.
    """
    horiz_raw, vert_raw = [], []
    for d in page.get_drawings():
        # PyMuPDF reports the path type as a string ("s" stroke, "f" fill,
        # "fs" both), never as an integer. Keep only paths that are stroked.
        if "s" not in str(d.get("type", "")):
            continue
        # The bounding rectangle of a path is stored under "rect".
        x0, y0, x1, y1 = d["rect"]
        if abs(y1 - y0) < 2:        # flat bounding box: horizontal ruling
            horiz_raw.append((y0 + y1) / 2)
        elif abs(x1 - x0) < 2:      # narrow bounding box: vertical ruling
            vert_raw.append((x0 + x1) / 2)

    if not horiz_raw or not vert_raw:
        return []

    def _snapped_positions(values):
        # Group near-identical coordinates and replace each group by the
        # mean of all raw coordinates that fell into it.
        cluster_of = make_cluster_dict(values, tol)
        members = {}
        for value in values:
            members.setdefault(cluster_of[value], []).append(value)
        return sorted(sum(m) / len(m) for m in members.values())

    row_pos = _snapped_positions(horiz_raw)
    col_pos = _snapped_positions(vert_raw)

    # One candidate rectangle per pair of neighbouring rows and columns.
    rects = []
    for r0, r1 in zip(row_pos, row_pos[1:]):
        for c0, c1 in zip(col_pos, col_pos[1:]):
            rect = fitz.Rect(c0, r0, c1, r1)
            if rect.get_area() >= MIN_RECT_AREA:
                rects.append(rect)

    # Remove duplicates / contained rects
    unique = []
    for rect in rects:
        if not any(u.contains(rect) or rect.contains(u) for u in unique):
            unique.append(rect)

    return unique
|
124 |
+
|
125 |
+
# ── Table extraction (simple text grouping) ─────────────────────────────────
|
126 |
+
def extract_table(page: fitz.Page, table_rect: fitz.Rect):
    """Turn the words inside `table_rect` into a list of row dicts.

    Words are taken from ``page.get_text("words")``; a word counts as inside
    when its first two fields (its x/y position) lie within the rectangle.
    Words are grouped into text lines: a new line starts whenever a word's y
    differs by more than 5 from the y of the line's first word. The first
    line is the header; every later line becomes a one-entry dict mapping the
    cleaned header text to the cleaned line text.

    Returns:
        A list of dicts, empty when no word falls inside the rectangle or
        only a header line exists.
    """
    inside = []
    for word in page.get_text("words"):
        wx, wy = word[0], word[1]
        if (table_rect.x0 <= wx <= table_rect.x1
                and table_rect.y0 <= wy <= table_rect.y1):
            inside.append(word)
    inside.sort(key=lambda word: (word[1], word[0]))  # top-to-bottom, then left-to-right

    # Group the sorted words into lines, anchored on each line's first word.
    grouped = []
    anchor_y = None
    for word in inside:
        if anchor_y is not None and abs(word[1] - anchor_y) <= 5:
            grouped[-1].append(word)
        else:
            grouped.append([word])
            anchor_y = word[1]

    if not grouped:
        return []

    texts = [" ".join(word[4] for word in line) for line in grouped]
    header_text, *body_texts = texts
    headers = safe_join([header_text])
    return [dict(zip(headers, safe_join([text]))) for text in body_texts]
|
155 |
+
|
156 |
+
# ── Per-page worker ─────────────────────────────────────────────────────────
|
157 |
+
def process_page(args):
    """Extract the text and tables of a single PDF page.

    Args:
        args: ``(page_number, pdf_path)`` tuple; ``page_number`` is
            zero-based. Taking one tuple keeps the function usable with
            ``map``-style APIs.

    Returns:
        ``(page_number, text)``. On success ``text`` starts with a
        ``Page N`` line (one-based), followed by the words lying outside all
        detected table rectangles grouped into lines, then one
        ``"table K":`` JSON block per non-empty table. On any failure
        ``text`` is an ``[ERROR] ...`` message; no exception propagates.
    """
    page_number, pdf_path = args
    try:
        with fitz.open(pdf_path) as doc:
            page = doc.load_page(page_number)
            page_rect = page.rect

            # Detect tables
            table_rects = detect_table_bboxes(page)
            table_jsons = []
            for rect in table_rects:
                tbl = extract_table(page, rect)
                if tbl:
                    table_jsons.append(json.dumps(tbl, indent=1, ensure_ascii=False))

            # Words outside tables
            tbl_boxes = [clamp_bbox(rect, page_rect) for rect in table_rects]
            outside = [
                w for w in page.get_text("words")
                if not any(b[0] <= w[0] <= b[2] and b[1] <= w[1] <= b[3] for b in tbl_boxes)
            ]
            outside.sort(key=lambda w: (w[1], w[0]))

            # A new text line starts when a word's y is more than 10 away
            # from the y of the current line's first word.
            lines, anchor_y, current = [], None, []
            for w in outside:
                if anchor_y is None or abs(w[1] - anchor_y) > 10:
                    if current:
                        lines.append(" ".join(current))
                    current, anchor_y = [w[4]], w[1]
                else:
                    current.append(w[4])
            if current:
                lines.append(" ".join(current))

            # Assemble once with join instead of repeated string concatenation.
            parts = [f"Page {page_number + 1}\n", "\n".join(lines).strip() + "\n"]
            parts.extend(
                f'"table {idx}":\n{tbl}\n' for idx, tbl in enumerate(table_jsons, 1)
            )
            return page_number, "".join(parts)

    except fitz.FileDataError as e:
        return page_number, f"[ERROR] Page {page_number+1} ({pdf_path}): encrypted / unreadable – {e}"
    except Exception as e:
        # Worker boundary: report the failure as text so one bad page does
        # not abort the whole document.
        return page_number, f"[ERROR] Page {page_number+1} ({pdf_path}): {e}"
|
202 |
+
|
203 |
+
# ── Document-level processing ───────────────────────────────────────────────
|
204 |
+
def process_pdf(pdf_path):
    """Parse one PDF and write the result next to it as ``<name>.txt``.

    Pages are processed serially when the document has at most
    PARALLEL_THRESHOLD pages and with a process pool otherwise. The per-page
    texts are joined in page order and written as UTF-8.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A status message: ``[INFO] Processing complete: <out_path>`` on
        success, otherwise an ``[ERROR] ...`` or ``[INFO] ... interrupted``
        message. The function reports failures through this string rather
        than raising.
    """
    try:
        if not os.path.exists(pdf_path):
            return f"[ERROR] File not found: {pdf_path}"

        print(f"[INFO] Starting processing: {pdf_path}")
        try:
            with fitz.open(pdf_path) as doc:
                num_pages = doc.page_count
        except fitz.FileDataError as e:
            return f"[ERROR] Cannot open PDF: {pdf_path} – {e}"
        except Exception as e:
            return f"[ERROR] General error opening PDF: {pdf_path} – {e}"

        pages = [(i, pdf_path) for i in range(num_pages)]
        results = run_serial(pages) if num_pages <= PARALLEL_THRESHOLD else run_parallel(pages)

        results.sort(key=lambda item: item[0])
        final_output = "\n".join(text for _, text in results)

        # Same derivation the GUI uses to locate the result file.
        out_path = os.path.splitext(pdf_path)[0] + ".txt"
        with open(out_path, "w", encoding="utf-8", errors="ignore") as f:
            f.write(final_output)
        # Return the message instead of printing it: both callers print the
        # return value, which used to be None on success.
        return f"[INFO] Processing complete: {out_path}"
    except (EOFError, BrokenPipeError, KeyboardInterrupt):
        return "[INFO] Processing interrupted by user."
    except Exception as e:
        return f"[ERROR] Unexpected error with '{pdf_path}': {e}"
|
234 |
+
|
235 |
+
def run_serial(pages):
    """Run `process_page` on every item of `pages` in this process.

    Returns the list of results in input order.
    """
    return list(map(process_page, pages))
|
236 |
+
|
237 |
+
def run_parallel(pages):
    """Run `process_page` on every item of `pages` in a process pool.

    The pool size is the CPU count minus two, at least one and at most one
    worker per page.

    Returns:
        The list of `process_page` results in input order; an empty list for
        empty input.
    """
    if not pages:
        # The size formula would yield 0 workers, which Pool rejects.
        return []
    cores = min(max(1, cpu_count() - 2), len(pages))
    print(f"Starting parallel processing with {cores} cores…")
    with Pool(cores) as pool:
        return pool.map(process_page, pages)
|
242 |
+
|
243 |
+
# ── Batch CLI entrypoint ────────────────────────────────────────────────────
|
244 |
+
def process_pdfs_main():
    """Batch entry point: parse every PDF path given in ``sys.argv[1:]``.

    Missing or unopenable files are reported and skipped. PDFs with at most
    PARALLEL_THRESHOLD pages are processed first, several files at a time
    via joblib. Larger PDFs follow one by one, since `process_pdf`
    parallelises those across their pages.
    """
    pdfs = sys.argv[1:]
    if not pdfs:
        print("No PDF files provided.")
        return

    small, large = [], []
    for p in pdfs:
        if not os.path.exists(p):
            print(f"File not found: {p}")
            continue
        try:
            with fitz.open(p) as doc:
                (small if doc.page_count <= PARALLEL_THRESHOLD else large).append(p)
        except fitz.FileDataError:
            print(f"[ERROR] Password-protected PDF skipped: {p}")
        except Exception as e:
            print(f"[ERROR] Error opening {p}: {e}")

    if small:
        cores = min(max(1, cpu_count() - 2), len(small))
        print(f"\n[Phase 1] Parallel processing of {len(small)} small PDFs with {cores} cores …")
        for r in Parallel(n_jobs=cores)(delayed(process_pdf)(p) for p in small):
            if r:  # skip empty results instead of printing "None"
                print(r)

    for p in large:
        print(f"\n[Phase 2] Processing large PDF: {os.path.basename(p)}")
        r = process_pdf(p)
        if r:
            print(r)
|
272 |
+
|
273 |
+
# ── Tkinter GUI ─────────────────────────────────────────────────────────────
|
274 |
+
class FileManager:
    """Tkinter window for choosing PDFs, running the parser and viewing results.

    The parser runs as a child process (this script re-invoked with the PDF
    paths as arguments). A worker thread reads its output and schedules the
    GUI updates through ``after``.
    """

    def __init__(self, master):
        """Build all widgets inside `master` (the Tk root window)."""
        self.master = master
        master.title("Parser-Sevenof9 – PyMuPDF")

        # self.files mirrors the listbox content index for index.
        self.files, self.last_selected = [], None
        tk.Label(master, text="Selected PDF files:").pack(pady=5)

        list_frame = tk.Frame(master)
        list_frame.pack(pady=5)
        sb_list = tk.Scrollbar(list_frame)
        self.listbox = tk.Listbox(list_frame, selectmode=tk.MULTIPLE, width=80, height=6,
                                  yscrollcommand=sb_list.set)
        sb_list.config(command=self.listbox.yview)
        self.listbox.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_list.pack(side=tk.RIGHT, fill=tk.Y)
        self.listbox.bind("<<ListboxSelect>>", self.show_text)
        self.listbox.bind("<Button-1>", self.on_click)
        self.listbox.bind("<Shift-Button-1>", self.on_shift_click)

        self.ctx = tk.Menu(master, tearoff=0)
        self.ctx.add_command(label="Remove selected", command=self.remove_file)
        self.listbox.bind("<Button-3>", self._show_context_menu)

        btn_frame = tk.Frame(master)
        btn_frame.pack(pady=10)
        tk.Button(btn_frame, text="Add Folder", command=self.add_folder).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Select Files", command=self.add_file).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Remove Selected", command=self.remove_file).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Remove All", command=self.remove_all).pack(side=tk.LEFT, padx=5)
        tk.Button(master, text="Stop", command=self.stop_parser).pack(pady=5)
        tk.Button(master, text="Start Parser", command=self.start_parser).pack(pady=10)

        tx_frame = tk.Frame(master)
        tx_frame.pack(padx=10, pady=5)
        sb_text = tk.Scrollbar(tx_frame)
        self.text = tk.Text(tx_frame, height=15, width=100, wrap=tk.WORD, yscrollcommand=sb_text.set)
        sb_text.config(command=self.text.yview)
        self.text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_text.pack(side=tk.RIGHT, fill=tk.Y)

        tk.Label(master, text="Progress:").pack()
        prog_frame = tk.Frame(master)
        prog_frame.pack(padx=10, pady=5)
        sb_prog = tk.Scrollbar(prog_frame)
        self.prog = tk.Text(prog_frame, height=8, width=100, state=tk.DISABLED, yscrollcommand=sb_prog.set)
        sb_prog.config(command=self.prog.yview)
        self.prog.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_prog.pack(side=tk.RIGHT, fill=tk.Y)

        self.parser_proc = None   # running child process, if any
        self._running = False     # True from Start click until the worker thread ends

    # ── Listbox helpers ─────────────────────────────────────────────────────
    def _show_context_menu(self, e):
        """Open the right-click menu, but only when something is selected."""
        if self.listbox.curselection():
            self.ctx.tk_popup(e.x_root, e.y_root)

    def on_click(self, e):
        """Plain click: select only the clicked row and show its text."""
        idx = self.listbox.nearest(e.y)
        self.listbox.selection_clear(0, tk.END)
        self.listbox.selection_set(idx)
        self.last_selected = idx
        self.show_text(None)
        return "break"  # suppress the default listbox click handling

    def on_shift_click(self, e):
        """Shift-click: select the range from the last plain click to here."""
        idx = self.listbox.nearest(e.y)
        if self.last_selected is None:
            self.last_selected = idx
        lo, hi = sorted((self.last_selected, idx))
        self.listbox.selection_clear(0, tk.END)
        for i in range(lo, hi + 1):
            self.listbox.selection_set(i)
        return "break"

    # ── File ops ────────────────────────────────────────────────────────────
    def _add_path(self, path):
        """Append `path` to the file list and listbox unless already present."""
        if path not in self.files:
            self.files.append(path)
            self.listbox.insert(tk.END, path)

    def add_folder(self):
        """Ask for a folder and add every ``.pdf`` found in it, recursively."""
        folder = filedialog.askdirectory(title="Select Folder")
        if not folder:
            return
        for root, _, fs in os.walk(folder):
            for f in fs:
                if f.lower().endswith(".pdf"):
                    self._add_path(os.path.join(root, f))

    def add_file(self):
        """Ask for one or more PDF files and add them."""
        for p in filedialog.askopenfilenames(title="Select PDF Files", filetypes=[("PDF Files", "*.pdf")]):
            self._add_path(p)

    def remove_file(self):
        """Remove the selected entries from the list and clear the text pane."""
        sel = self.listbox.curselection()
        if not sel:
            messagebox.showwarning("Notice", "Please select an entry to remove.")
            return
        # Delete from the end so earlier indices stay valid.
        for idx in reversed(sel):
            self.listbox.delete(idx)
            del self.files[idx]
        self.text.delete(1.0, tk.END)

    def remove_all(self):
        """Remove every entry and clear the text pane."""
        self.listbox.delete(0, tk.END)
        self.files.clear()
        self.text.delete(1.0, tk.END)

    # ── Parser control ──────────────────────────────────────────────────────
    def start_parser(self):
        """Start the parser for all listed files in a background thread."""
        if not self.files:
            messagebox.showinfo("No Files", "Please select at least one file.")
            return
        if self._running:
            # A second run would overwrite parser_proc and make the first
            # child impossible to stop.
            self.append_prog("Parser is already running.\n")
            return
        self._running = True
        self.prog.config(state=tk.NORMAL)
        self.prog.delete(1.0, tk.END)
        self.prog.insert(tk.END, "Starting parser…\n")
        self.prog.config(state=tk.DISABLED)
        threading.Thread(target=self.run_parser).start()

    def stop_parser(self):
        """Terminate the running parser process, if there is one."""
        # Snapshot: the worker thread resets self.parser_proc to None when
        # the child exits, possibly between the check and the call.
        proc = self.parser_proc
        if proc and proc.poll() is None:
            proc.terminate()
            self.append_prog("Parser process was stopped.\n")
        else:
            self.append_prog("No active parser process to stop.\n")

    def run_parser(self):
        """Worker-thread body: run the child parser and relay its output."""
        try:
            # The pipe is decoded as UTF-8 below, so make the child write
            # UTF-8 (it would otherwise use the locale encoding) and flush
            # every line so progress shows up while it is running.
            child_env = dict(os.environ, PYTHONIOENCODING="utf-8", PYTHONUNBUFFERED="1")
            proc = subprocess.Popen(
                [sys.executable, __file__] + self.files,
                stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                text=True, encoding="utf-8", errors="ignore", bufsize=4096,
                env=child_env,
            )
            self.parser_proc = proc
            for line in proc.stdout:
                self.append_prog(line)
            proc.stdout.close()
            proc.wait()
            if proc.returncode == 0:
                self.append_prog("\nParser finished successfully.\n")
                self.shell_msg("Parser Done", "The parser was executed successfully.")
            else:
                self.append_prog("\nError while running the parser.\n")
                self.shell_msg("Error", "Error while running the parser.")
        except Exception as e:
            self.append_prog(f"Error: {e}\n")
            self.shell_msg("Error", f"Execution error:\n{e}")
        finally:
            self.parser_proc = None
            self._running = False

    # ── GUI helpers ─────────────────────────────────────────────────────────
    def append_prog(self, txt):
        """Schedule `txt` to be appended to the progress pane."""
        # NOTE(review): this is called from the worker thread; it relies on
        # Tk accepting `after` calls from other threads - confirm for the
        # target Tcl/Tk build.
        self.prog.after(0, lambda: self._ins(txt))

    def _ins(self, txt):
        """Append `txt` to the read-only progress pane and scroll to the end."""
        self.prog.config(state=tk.NORMAL)
        self.prog.insert(tk.END, txt)
        self.prog.see(tk.END)
        self.prog.config(state=tk.DISABLED)

    def shell_msg(self, title, msg):
        """Schedule a message box; the title "Error" gets an error dialog."""
        show = messagebox.showerror if title == "Error" else messagebox.showinfo
        self.master.after(0, lambda: show(title, msg))

    def show_text(self, _):
        """Show the ``.txt`` result of the first selected PDF in the text pane."""
        sel = self.listbox.curselection()
        if not sel:
            return
        path = self.files[sel[0]]
        txt = os.path.splitext(path)[0] + ".txt"
        self.text.delete(1.0, tk.END)
        if os.path.exists(txt):
            try:
                with open(txt, "r", encoding="utf-8", errors="ignore") as f:
                    self.text.insert(tk.END, f.read())
            except Exception as e:
                self.text.insert(tk.END, f"Error loading text file:\n{e}")
        else:
            self.text.insert(tk.END, "[No corresponding .txt file found]")
|
421 |
+
|
422 |
+
# ── Main guard ──────────────────────────────────────────────────────────────
|
423 |
+
if __name__ == "__main__":
    multiprocessing.freeze_support()
    if len(sys.argv) <= 1:
        # No arguments: open the GUI.
        root = tk.Tk()
        FileManager(root)
        root.mainloop()
    else:
        # Arguments are PDF paths: run the batch parser. The GUI starts this
        # script this way as a child process.
        process_pdfs_main()
|