kalle07 commited on
Commit
d380f75
·
verified ·
1 Parent(s): 4e8efba

Upload 4 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ parser_sevenof9_v1_de.exe filter=lfs diff=lfs merge=lfs -text
37
+ parser_sevenof9_v1_en.exe filter=lfs diff=lfs merge=lfs -text
parser_sevenof9_v1_de.exe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c1157651ec219957876f09ec4acbf0d09f5f6469e986ab45c9d7f42df53b3ec
3
+ size 25577192
parser_sevenof9_v1_de.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import tkinter as tk
4
+ from tkinter import filedialog, messagebox
5
+ import subprocess
6
+ import threading
7
+ import tempfile
8
+ import shutil
9
+ import json
10
+ import logging
11
+ import pdfplumber
12
+ from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
13
+ from pdfplumber.utils.exceptions import PdfminerException
14
+ from joblib import delayed, cpu_count, parallel_backend, Parallel
15
+ import multiprocessing # Wichtig für frozen support
16
+ from multiprocessing import Pool
17
+
18
+
19
+ # ========================
20
+ # Parser-Konfiguration
21
+ # ========================
22
+
23
# Keyword arguments handed to page.extract_words() for every page
# (they travel to process_page inside each work tuple).
TEXT_EXTRACTION_SETTINGS = {
    "x_tolerance": 1,
    "y_tolerance": 3,
    "keep_blank_chars": False,
    "use_text_flow": True
}


# On Windows everything written to stderr is discarded. Note that this also
# hides tracebacks of this program itself, not only library warnings.
if sys.platform == "win32":
    sys.stderr = open(os.devnull, 'w')

# PDFs with at most this many pages are processed page by page in a single
# process; larger ones have their pages distributed over a process pool.
PARALLEL_THRESHOLD = 16
35
+
36
def suppress_pdfminer_logging():
    """Set the pdfminer loggers to ERROR so lower-level messages are dropped.

    The package logger and each listed sub-logger get the level assigned
    explicitly.
    """
    noisy_loggers = (
        "pdfminer",
        "pdfminer.pdfparser",
        "pdfminer.pdfdocument",
        "pdfminer.pdfpage",
        "pdfminer.converter",
        "pdfminer.layout",
        "pdfminer.cmapdb",
        "pdfminer.utils",
    )
    for name in noisy_loggers:
        pdf_logger = logging.getLogger(name)
        pdf_logger.setLevel(logging.ERROR)
48
+
49
def clean_cell_text(text):
    """Normalise the text of one table cell.

    A hyphen directly followed by a line break is removed (re-joining a word
    split across lines), remaining line breaks become spaces and every run
    of whitespace is collapsed to a single space. Anything that is not a
    ``str`` yields an empty string.
    """
    if isinstance(text, str):
        rejoined = text.replace("-\n", "")
        single_line = rejoined.replace("\n", " ")
        return " ".join(single_line.split())
    return ""
54
+
55
def safe_join(row):
    """Return the cells of *row* as a list of cleaned strings.

    ``None`` cells become ``""``; every other cell is converted with
    ``str()`` and passed through ``clean_cell_text``.
    """
    cleaned_cells = []
    for cell in row:
        if cell is None:
            cleaned_cells.append("")
        else:
            cleaned_cells.append(clean_cell_text(str(cell)))
    return cleaned_cells
57
+
58
def clamp_bbox(bbox, page_width, page_height):
    """Clip a ``(x0, top, x1, bottom)`` box to the page rectangle.

    Horizontal coordinates are limited to ``[0, page_width]`` and vertical
    ones to ``[0, page_height]``. Returns the clipped box as a tuple in the
    same order.
    """
    def _limit(value, upper):
        return max(0, min(value, upper))

    left, upper_edge, right, lower_edge = bbox
    return (
        _limit(left, page_width),
        _limit(upper_edge, page_height),
        _limit(right, page_width),
        _limit(lower_edge, page_height),
    )
65
+
66
def process_page(args):
    """Extract the text and the tables of a single PDF page.

    Args:
        args: Tuple ``(page_number, pdf_path, text_settings)``:
            zero-based page index, path of the PDF to open, and the keyword
            arguments forwarded to ``page.extract_words()``.

    Returns:
        Tuple ``(page_number, text)``. ``text`` starts with a
        ``Page <n>`` header, followed by the words found outside of table
        areas (grouped into lines) and one JSON dump per non-empty table.
        Exceptions are not propagated; they are returned as a
        ``[FEHLER] ...`` text for that page instead.
    """
    suppress_pdfminer_logging()
    try:
        page_number, pdf_path, text_settings = args
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[page_number]
            page_width = page.width
            page_height = page.height

            table_bbox_list = []
            table_json_outputs = []

            for table in page.find_tables():
                bbox = clamp_bbox(table.bbox, page_width, page_height)
                # Recorded before the emptiness check: the area is excluded
                # from the running text even if the table is skipped below.
                table_bbox_list.append(bbox)

                if not page.crop(bbox).chars:
                    continue

                table_data = table.extract()
                if table_data:
                    # First row supplies the keys, each further row one object.
                    headers = safe_join(table_data[0])
                    json_table = [
                        dict(zip(headers, safe_join(row)))
                        for row in table_data[1:]
                    ]
                    table_json_outputs.append(
                        json.dumps(json_table, indent=1, ensure_ascii=False)
                    )

            # A word belongs to a table when its top-left corner lies inside
            # one of the table boxes.
            words_outside_tables = [
                word for word in page.extract_words(**text_settings)
                if not any(
                    bbox[0] <= float(word['x0']) <= bbox[2] and
                    bbox[1] <= float(word['top']) <= bbox[3]
                    for bbox in table_bbox_list
                )
            ]

            # Start a new line whenever a word's top differs by more than 10
            # from the top of the first word of the current line.
            text_lines = []
            current_y = None
            line = []
            for word in words_outside_tables:
                if current_y is None or abs(word['top'] - current_y) > 10:
                    if line:
                        text_lines.append(" ".join(line))
                    line = [word['text']]
                    current_y = word['top']
                else:
                    line.append(word['text'])
            if line:
                text_lines.append(" ".join(line))

            parts = [
                f"Page {page_number + 1}\n",
                "\n".join(text_lines).strip() + "\n",
            ]
            for idx, table in enumerate(table_json_outputs, start=1):
                parts.append(f'"tabelle {idx}":\n{table}\n')

            return page_number, "".join(parts)

    except Exception as e:
        return args[0], f"[FEHLER] Seite {args[0]+1} ({args[1]}): {str(e)}"
132
+
133
def verarbeite_pdf(pdf_path):
    """Convert one PDF into a ``.txt`` file written next to the PDF.

    Pages are processed serially when the document has at most
    ``PARALLEL_THRESHOLD`` pages, otherwise through ``run_parallel``. The
    page texts are joined in page order and written as UTF-8.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        str: A status line for the caller to print: the completion message
        with the output path on success, or an ``[FEHLER]``/``[INFO]``
        message describing why nothing was written. (Previously the success
        path returned ``None``, which callers printed literally.)
    """
    suppress_pdfminer_logging()
    try:
        if not os.path.exists(pdf_path):
            return f"[FEHLER] Datei nicht gefunden: {pdf_path}"

        print(f"[INFO] Beginne Verarbeitung: {pdf_path}")
        try:
            # Opened once up front only to learn the page count.
            with pdfplumber.open(pdf_path) as pdf:
                num_pages = len(pdf.pages)
        except PdfminerException as e:
            return f"[FEHLER] PDF kann nicht geöffnet werden: {pdf_path} – {str(e)}"
        except Exception as e:
            return f"[FEHLER] Allgemeiner Fehler beim Öffnen: {pdf_path} – {str(e)}"

        pages = [(i, pdf_path, TEXT_EXTRACTION_SETTINGS) for i in range(num_pages)]

        try:
            results = run_serial(pages) if num_pages <= PARALLEL_THRESHOLD else run_parallel(pages)
        except (EOFError, BrokenPipeError, KeyboardInterrupt):
            return "[INFO] Verarbeitung wurde abgebrochen."

        sorted_results = sorted(results, key=lambda x: x[0])
        final_output = "\n".join(text for _, text in sorted_results)

        base_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_dir = os.path.dirname(pdf_path)
        output_path = os.path.join(output_dir, f"{base_name}.txt")

        with open(output_path, "w", encoding="utf-8", errors="ignore") as f:
            f.write(final_output)

        # Returned instead of printed: every caller prints the return value.
        return f"[INFO] Verarbeitung abgeschlossen: {output_path}"

    except (EOFError, BrokenPipeError, KeyboardInterrupt):
        return "[INFO] Verarbeitung durch Benutzer abgebrochen."
    except Exception as e:
        return f"[FEHLER] Unerwarteter Fehler bei '{pdf_path}': {str(e)}"
171
+
172
def run_serial(pages):
    """Run ``process_page`` on every work tuple in order, in this process.

    Returns the list of ``process_page`` results in input order.
    """
    results = []
    for page_args in pages:
        results.append(process_page(page_args))
    return results
174
+
175
def run_parallel(pages):
    """Run ``process_page`` on all work tuples using a process pool.

    The pool size is the CPU count minus two (at least one), capped at the
    number of pages.

    Args:
        pages: List of ``(page_number, pdf_path, text_settings)`` tuples.

    Returns:
        List of ``process_page`` results in input order; an empty list when
        *pages* is empty.
    """
    if not pages:
        # Pool(processes=0) raises ValueError, so there is nothing to start.
        return []
    available_cores = max(1, cpu_count() - 2)  # keep two cores free, use at least one
    num_cores = min(available_cores, len(pages))
    # num_cores already has the two reserved cores subtracted.
    print(f"Starte Parallelverarbeitung mit {num_cores} Kernen...")
    with Pool(processes=num_cores) as pool:
        return pool.map(process_page, pages)
181
+
182
def verarbeite_pdfs_main():
    """Command-line mode: convert every PDF path given in ``sys.argv[1:]``.

    Files are first sorted by page count. PDFs with at most
    ``PARALLEL_THRESHOLD`` pages are converted concurrently (one joblib job
    per file); larger PDFs are converted one after another, each of them
    parallelising over its own pages. Files that are missing or cannot be
    opened are reported and skipped. Status lines are printed to stdout.
    """
    suppress_pdfminer_logging()
    pdf_dateien = sys.argv[1:]
    if not pdf_dateien:
        print("Keine PDF-Dateien übergeben.")
        return

    kleine_pdfs = []
    grosse_pdfs = []

    for pfad in pdf_dateien:
        if not os.path.exists(pfad):
            print(f"Datei nicht gefunden: {pfad}")
            continue
        try:
            with pdfplumber.open(pfad) as pdf:
                seitenzahl = len(pdf.pages)
        except PdfminerException as e:
            # Raised for any pdfminer failure, not only for encrypted files,
            # so the message states both causes and shows the details.
            print(f"[FEHLER] PDF nicht lesbar (beschädigt oder passwortgeschützt): {pfad} – {e} – wird übersprungen.")
            continue
        except Exception as e:
            print(f"[FEHLER] Fehler beim Öffnen von {pfad}: {str(e)}")
            continue

        if seitenzahl <= PARALLEL_THRESHOLD:
            kleine_pdfs.append(pfad)
        else:
            grosse_pdfs.append(pfad)

    if kleine_pdfs:
        available_cores = max(1, cpu_count() - 2)
        num_cores = min(available_cores, len(kleine_pdfs))
        # num_cores already has the two reserved cores subtracted.
        print(f"\n[Phase 1] Starte Parallelverarbeitung kleiner PDFs mit {num_cores} Kernen...")
        results = Parallel(n_jobs=num_cores)(
            delayed(verarbeite_pdf)(pfad) for pfad in kleine_pdfs
        )
        for r in results:
            if r is not None:  # never print a literal "None"
                print(r)

    for pfad in grosse_pdfs:
        print(f"\n[Phase 2] Verarbeitung großer PDFs: {os.path.basename(pfad)}")
        meldung = verarbeite_pdf(pfad)
        if meldung is not None:
            print(meldung)
220
+
221
+
222
+
223
+ # ========================
224
+ # GUI-Klasse
225
+ # ========================
226
+
227
class DateiManager:
    """Tkinter main window: collect PDF files, run the parser, show results.

    The parser runs in a separate process (this same program started with
    the PDF paths as arguments). A worker thread streams the process output
    into the progress box; widget updates are scheduled via ``after()``.
    """

    def __init__(self, master):
        """Create all widgets inside *master* (the Tk root window)."""
        self.master = master
        self.master.title("Parser-Sevenof9")
        self.dateien = []  # PDF paths, index-aligned with the listbox rows
        self.last_selected_index = None  # anchor row for shift-click ranges
        self.parser_process = None  # Popen of the running parser (set by the worker thread)
        self.parser_thread = None  # worker thread started by parser_starten

        self.label = tk.Label(master, text="Ausgewählte PDF-Dateien:")
        self.label.pack(pady=5)

        listbox_frame = tk.Frame(master)
        listbox_frame.pack(pady=5)

        scrollbar_listbox = tk.Scrollbar(listbox_frame)
        self.listbox = tk.Listbox(
            listbox_frame, selectmode=tk.MULTIPLE, width=80, height=6,
            yscrollcommand=scrollbar_listbox.set,
        )
        scrollbar_listbox.config(command=self.listbox.yview)

        self.listbox.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar_listbox.pack(side=tk.RIGHT, fill=tk.Y)

        self.listbox.bind("<<ListboxSelect>>", self.zeige_textdatei)
        self.listbox.bind("<Button-1>", self.on_listbox_click)
        self.listbox.bind("<Shift-Button-1>", self.on_listbox_shift_click)

        self.context_menu = tk.Menu(master, tearoff=0)
        self.context_menu.add_command(label="Ausgewählte entfernen", command=self.datei_entfernen)
        self.listbox.bind("<Button-3>", self.show_context_menu)

        self.frame = tk.Frame(master)
        self.frame.pack(pady=10)

        tk.Button(self.frame, text="Ordner hinzufügen", command=self.ordner_hinzufuegen).pack(side=tk.LEFT, padx=5)
        tk.Button(self.frame, text="Dateien auswählen", command=self.datei_hinzufuegen).pack(side=tk.LEFT, padx=5)
        tk.Button(self.frame, text="Ausgewählte entfernen", command=self.datei_entfernen).pack(side=tk.LEFT, padx=5)
        tk.Button(self.frame, text="Alle entfernen", command=self.alle_entfernen).pack(side=tk.LEFT, padx=5)
        tk.Button(master, text="Stop", command=self.parser_stoppen).pack(pady=5)

        tk.Button(master, text="Parser starten", command=self.parser_starten).pack(pady=10)

        textfeld_frame = tk.Frame(master)
        textfeld_frame.pack(padx=10, pady=5)

        scrollbar_textfeld = tk.Scrollbar(textfeld_frame)
        self.textfeld = tk.Text(
            textfeld_frame, height=15, width=100, wrap=tk.WORD,
            yscrollcommand=scrollbar_textfeld.set,
        )
        scrollbar_textfeld.config(command=self.textfeld.yview)

        self.textfeld.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar_textfeld.pack(side=tk.RIGHT, fill=tk.Y)

        tk.Label(master, text="Fortschritt:").pack()

        progress_frame = tk.Frame(master)
        progress_frame.pack(padx=10, pady=5)

        scrollbar_progress = tk.Scrollbar(progress_frame)
        # Kept read-only; _text_einfuegen enables it only while inserting.
        self.progress_text = tk.Text(
            progress_frame, height=8, width=100, state=tk.DISABLED,
            yscrollcommand=scrollbar_progress.set,
        )
        scrollbar_progress.config(command=self.progress_text.yview)

        self.progress_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar_progress.pack(side=tk.RIGHT, fill=tk.Y)

    def on_listbox_click(self, event):
        """Plain click: select only the clicked row and show its text file."""
        index = self.listbox.nearest(event.y)
        self.listbox.selection_clear(0, tk.END)
        self.listbox.selection_set(index)
        self.last_selected_index = index
        self.zeige_textdatei(None)
        return "break"  # stop further handlers for this event

    def on_listbox_shift_click(self, event):
        """Shift-click: select the range between the anchor row and the clicked row."""
        index = self.listbox.nearest(event.y)
        if self.last_selected_index is None:
            self.last_selected_index = index
        start, end = sorted((self.last_selected_index, index))
        self.listbox.selection_clear(0, tk.END)
        for i in range(start, end + 1):
            self.listbox.selection_set(i)
        return "break"

    def show_context_menu(self, event):
        """Open the context menu at the pointer if at least one row is selected."""
        if self.listbox.curselection():
            self.context_menu.tk_popup(event.x_root, event.y_root)

    def ordner_hinzufuegen(self):
        """Ask for a folder and add every ``.pdf`` file found below it."""
        ordner = filedialog.askdirectory(title="Ordner auswählen")
        if not ordner:
            return
        for root, _, files in os.walk(ordner):
            for file in files:
                if file.lower().endswith(".pdf"):
                    pfad = os.path.join(root, file)
                    if pfad not in self.dateien:
                        self.dateien.append(pfad)
                        self.listbox.insert(tk.END, pfad)

    def datei_hinzufuegen(self):
        """Ask for PDF files and add those that are not listed yet."""
        pfade = filedialog.askopenfilenames(title="PDF-Dateien auswählen", filetypes=[("PDF-Dateien", "*.pdf")])
        for pfad in pfade:
            if pfad not in self.dateien:
                self.dateien.append(pfad)
                self.listbox.insert(tk.END, pfad)

    def datei_entfernen(self):
        """Remove the selected rows from the listbox and from ``self.dateien``."""
        selektion = self.listbox.curselection()
        if not selektion:
            messagebox.showwarning("Hinweis", "Bitte wählen Sie einen Eintrag zum Entfernen.")
            return
        # Delete from the end so the remaining indices stay valid.
        for index in reversed(selektion):
            self.listbox.delete(index)
            del self.dateien[index]
        self.textfeld.delete(1.0, tk.END)

    def alle_entfernen(self):
        """Clear the file list and the preview text."""
        self.listbox.delete(0, tk.END)
        self.dateien.clear()
        self.textfeld.delete(1.0, tk.END)

    def parser_starten(self):
        """Start the parser for all listed files in a background thread."""
        if not self.dateien:
            messagebox.showinfo("Keine Dateien", "Bitte wählen Sie mindestens eine Datei aus.")
            return
        # A second run would overwrite self.parser_process and make the first
        # process impossible to stop, so only one run is allowed at a time.
        if self.parser_thread is not None and self.parser_thread.is_alive():
            messagebox.showinfo("Parser läuft", "Der Parser läuft bereits.")
            return
        self.progress_text.config(state=tk.NORMAL)
        self.progress_text.delete(1.0, tk.END)
        self.progress_text.insert(tk.END, "Starte Parser...\n")
        self.progress_text.config(state=tk.DISABLED)
        self.parser_thread = threading.Thread(target=self.parser_ausfuehren)
        self.parser_thread.start()

    def parser_stoppen(self):
        """Terminate the running parser process, if there is one."""
        # Snapshot: the worker thread resets the attribute to None when it
        # finishes, which could happen between the check and terminate().
        prozess = self.parser_process
        if prozess is not None and prozess.poll() is None:
            prozess.terminate()
            self.progress_text_einfuegen("Parser-Prozess wurde gestoppt.\n")
        else:
            self.progress_text_einfuegen("Kein laufender Parser-Prozess zum Stoppen.\n")

    def _parser_kommando(self):
        """Return the command line that re-runs this program in parser mode."""
        if getattr(sys, "frozen", False):
            # Frozen executable: sys.executable is the program itself, so the
            # script path must not be passed (it would be taken for a PDF).
            return [sys.executable] + list(self.dateien)
        return [sys.executable, __file__] + list(self.dateien)

    def parser_ausfuehren(self):
        """Worker thread body: run the parser process and relay its output."""
        try:
            prozess = subprocess.Popen(
                self._parser_kommando(),
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                encoding='utf-8',
                errors='ignore',
                bufsize=4096
            )
            self.parser_process = prozess
            for line in prozess.stdout:
                self.progress_text_einfuegen(line)
            prozess.stdout.close()
            prozess.wait()

            if prozess.returncode == 0:
                self.progress_text_einfuegen("\nParser abgeschlossen.\n")
                self.show_messagebox_threadsafe("Parser abgeschlossen", "Der Parser wurde erfolgreich ausgeführt.")
            else:
                self.progress_text_einfuegen("\nFehler beim Ausführen des Parsers.\n")
                self.show_messagebox_threadsafe("Fehler", "Fehler beim Ausführen des Parsers.", fehler=True)
        except Exception as e:
            self.progress_text_einfuegen(f"Fehler: {e}\n")
            self.show_messagebox_threadsafe("Fehler", f"Fehler beim Ausführen:\n{e}", fehler=True)
        finally:
            self.parser_process = None

    def progress_text_einfuegen(self, text):
        """Schedule *text* to be appended to the progress box via ``after()``."""
        self.progress_text.after(0, lambda: self._text_einfuegen(text))

    def _text_einfuegen(self, text):
        """Append *text* to the read-only progress box and scroll to the end."""
        self.progress_text.config(state=tk.NORMAL)
        self.progress_text.insert(tk.END, text)
        self.progress_text.see(tk.END)
        self.progress_text.config(state=tk.DISABLED)

    def show_messagebox_threadsafe(self, titel, nachricht, fehler=False):
        """Schedule a message box via ``after()``.

        With ``fehler=True`` an error dialog is shown instead of an info dialog.
        """
        anzeigen = messagebox.showerror if fehler else messagebox.showinfo
        self.master.after(0, lambda: anzeigen(titel, nachricht))

    def zeige_textdatei(self, event):
        """Show the ``.txt`` file belonging to the first selected PDF."""
        selektion = self.listbox.curselection()
        if not selektion:
            return
        index = selektion[0]
        pfad = self.dateien[index]
        txt_pfad = os.path.splitext(pfad)[0] + ".txt"
        self.textfeld.delete(1.0, tk.END)
        if os.path.exists(txt_pfad):
            try:
                with open(txt_pfad, "r", encoding="utf-8", errors="ignore") as f:
                    self.textfeld.insert(tk.END, f.read())
            except Exception as e:
                self.textfeld.insert(tk.END, f"Fehler beim Laden der Textdatei:\n{e}")
        else:
            self.textfeld.insert(tk.END, "[Keine zugehörige .txt-Datei vorhanden]")
420
+
421
+ # ========================
422
+ # Einstiegspunkt
423
+ # ========================
424
+
425
+
426
if __name__ == "__main__":
    # Has to be the first call in the main block (needed for multiprocessing
    # in a frozen executable).
    multiprocessing.freeze_support()

    cli_arguments = sys.argv[1:]
    if not cli_arguments:
        # No arguments: interactive use, open the GUI.
        root = tk.Tk()
        app = DateiManager(root)
        root.mainloop()
    else:
        # Arguments present: they are the PDF paths to convert.
        verarbeite_pdfs_main()
parser_sevenof9_v1_en.exe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d8187a2413c1e5d9db9bd2d2eb21e9e811d82b42d63a4245d1d5a900172e6dc
3
+ size 25576676
parser_sevenof9_v1_en.py ADDED
@@ -0,0 +1,429 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import tkinter as tk # internal
4
+ from tkinter import filedialog, messagebox # internal
5
+ import subprocess
6
+ import threading
7
+ import tempfile
8
+ import shutil
9
+ import json
10
+ import logging
11
+ import pdfplumber
12
+ from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
13
+ from pdfplumber.utils.exceptions import PdfminerException
14
+ from joblib import delayed, cpu_count, parallel_backend, Parallel
15
+ import multiprocessing # internal
16
+ from multiprocessing import Pool # internal
17
+
18
+
19
+ # ========================
20
+ # Parser Configuration
21
+ # ========================
22
+
23
# Keyword arguments handed to page.extract_words() for every page
# (they travel to process_page inside each work tuple).
TEXT_EXTRACTION_SETTINGS = {
    "x_tolerance": 1,
    "y_tolerance": 3,
    "keep_blank_chars": False,
    "use_text_flow": True
}

# On Windows everything written to stderr is discarded. Note that this also
# hides tracebacks of this program itself, not only library warnings.
if sys.platform == "win32":
    sys.stderr = open(os.devnull, 'w')

# PDFs with at most this many pages are processed page by page in a single
# process; larger ones have their pages distributed over a process pool.
PARALLEL_THRESHOLD = 16
34
+
35
def suppress_pdfminer_logging():
    """Set the pdfminer loggers to ERROR so lower-level messages are dropped.

    The package logger and each listed sub-logger get the level assigned
    explicitly.
    """
    noisy_loggers = (
        "pdfminer",
        "pdfminer.pdfparser",
        "pdfminer.pdfdocument",
        "pdfminer.pdfpage",
        "pdfminer.converter",
        "pdfminer.layout",
        "pdfminer.cmapdb",
        "pdfminer.utils",
    )
    for name in noisy_loggers:
        pdf_logger = logging.getLogger(name)
        pdf_logger.setLevel(logging.ERROR)
47
+
48
def clean_cell_text(text):
    """Normalise the text of one table cell.

    A hyphen directly followed by a line break is removed (re-joining a word
    split across lines), remaining line breaks become spaces and every run
    of whitespace is collapsed to a single space. Anything that is not a
    ``str`` yields an empty string.
    """
    if isinstance(text, str):
        rejoined = text.replace("-\n", "")
        single_line = rejoined.replace("\n", " ")
        return " ".join(single_line.split())
    return ""
53
+
54
def safe_join(row):
    """Return the cells of *row* as a list of cleaned strings.

    ``None`` cells become ``""``; every other cell is converted with
    ``str()`` and passed through ``clean_cell_text``.
    """
    cleaned_cells = []
    for cell in row:
        if cell is None:
            cleaned_cells.append("")
        else:
            cleaned_cells.append(clean_cell_text(str(cell)))
    return cleaned_cells
56
+
57
def clamp_bbox(bbox, page_width, page_height):
    """Clip a ``(x0, top, x1, bottom)`` box to the page rectangle.

    Horizontal coordinates are limited to ``[0, page_width]`` and vertical
    ones to ``[0, page_height]``. Returns the clipped box as a tuple in the
    same order.
    """
    def _limit(value, upper):
        return max(0, min(value, upper))

    left, upper_edge, right, lower_edge = bbox
    return (
        _limit(left, page_width),
        _limit(upper_edge, page_height),
        _limit(right, page_width),
        _limit(lower_edge, page_height),
    )
64
+
65
def process_page(args):
    """Extract the text and the tables of a single PDF page.

    Args:
        args: Tuple ``(page_number, pdf_path, text_settings)``:
            zero-based page index, path of the PDF to open, and the keyword
            arguments forwarded to ``page.extract_words()``.

    Returns:
        Tuple ``(page_number, text)``. ``text`` starts with a
        ``Page <n>`` header, followed by the words found outside of table
        areas (grouped into lines) and one JSON dump per non-empty table.
        Exceptions are not propagated; they are returned as an
        ``[ERROR] ...`` text for that page instead.
    """
    suppress_pdfminer_logging()
    try:
        page_number, pdf_path, text_settings = args
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[page_number]
            width, height = page.width, page.height

            table_bboxes = []
            table_json_outputs = []

            for table in page.find_tables():
                bbox = clamp_bbox(table.bbox, width, height)
                # Recorded before the emptiness check: the area is excluded
                # from the running text even if the table is skipped below.
                table_bboxes.append(bbox)

                if not page.crop(bbox).chars:
                    continue

                table_data = table.extract()
                if table_data:
                    # First row supplies the keys, each further row one object.
                    headers = safe_join(table_data[0])
                    json_table = [
                        dict(zip(headers, safe_join(row)))
                        for row in table_data[1:]
                    ]
                    table_json_outputs.append(
                        json.dumps(json_table, indent=1, ensure_ascii=False)
                    )

            # A word belongs to a table when its top-left corner lies inside
            # one of the table boxes.
            words_outside_tables = [
                word for word in page.extract_words(**text_settings)
                if not any(
                    bbox[0] <= float(word['x0']) <= bbox[2] and
                    bbox[1] <= float(word['top']) <= bbox[3]
                    for bbox in table_bboxes
                )
            ]

            # Start a new line whenever a word's top differs by more than 10
            # from the top of the first word of the current line.
            text_lines = []
            current_y = None
            line = []
            for word in words_outside_tables:
                if current_y is None or abs(word['top'] - current_y) > 10:
                    if line:
                        text_lines.append(" ".join(line))
                    line = [word['text']]
                    current_y = word['top']
                else:
                    line.append(word['text'])
            if line:
                text_lines.append(" ".join(line))

            parts = [
                f"Page {page_number + 1}\n",
                "\n".join(text_lines).strip() + "\n",
            ]
            for idx, table in enumerate(table_json_outputs, start=1):
                parts.append(f'"table {idx}":\n{table}\n')

            return page_number, "".join(parts)

    except Exception as e:
        return args[0], f"[ERROR] Page {args[0]+1} ({args[1]}): {str(e)}"
129
+
130
def process_pdf(pdf_path):
    """Convert one PDF into a ``.txt`` file written next to the PDF.

    Pages are processed serially when the document has at most
    ``PARALLEL_THRESHOLD`` pages, otherwise through ``run_parallel``. The
    page texts are joined in page order and written as UTF-8.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        str: A status line for the caller to print: the completion message
        with the output path on success, or an ``[ERROR]``/``[INFO]``
        message describing why nothing was written. (Previously the success
        path returned ``None``, which callers printed literally.)
    """
    suppress_pdfminer_logging()
    try:
        if not os.path.exists(pdf_path):
            return f"[ERROR] File not found: {pdf_path}"

        print(f"[INFO] Starting processing: {pdf_path}")
        try:
            # Opened once up front only to learn the page count.
            with pdfplumber.open(pdf_path) as pdf:
                num_pages = len(pdf.pages)
        except PdfminerException as e:
            return f"[ERROR] Cannot open PDF: {pdf_path} – {str(e)}"
        except Exception as e:
            return f"[ERROR] General error opening PDF: {pdf_path} – {str(e)}"

        pages = [(i, pdf_path, TEXT_EXTRACTION_SETTINGS) for i in range(num_pages)]

        try:
            results = run_serial(pages) if num_pages <= PARALLEL_THRESHOLD else run_parallel(pages)
        except (EOFError, BrokenPipeError, KeyboardInterrupt):
            return "[INFO] Processing was interrupted."

        sorted_results = sorted(results, key=lambda x: x[0])
        final_output = "\n".join(text for _, text in sorted_results)

        base_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_dir = os.path.dirname(pdf_path)
        output_path = os.path.join(output_dir, f"{base_name}.txt")

        with open(output_path, "w", encoding="utf-8", errors="ignore") as f:
            f.write(final_output)

        # Returned instead of printed: every caller prints the return value.
        return f"[INFO] Processing complete: {output_path}"

    except (EOFError, BrokenPipeError, KeyboardInterrupt):
        return "[INFO] Processing interrupted by user."
    except Exception as e:
        return f"[ERROR] Unexpected error with '{pdf_path}': {str(e)}"
168
+
169
def run_serial(pages):
    """Run ``process_page`` on every work tuple in order, in this process.

    Returns the list of ``process_page`` results in input order.
    """
    results = []
    for page_args in pages:
        results.append(process_page(page_args))
    return results
171
+
172
def run_parallel(pages):
    """Run ``process_page`` on all work tuples using a process pool.

    The pool size is the CPU count minus two (at least one), capped at the
    number of pages.

    Args:
        pages: List of ``(page_number, pdf_path, text_settings)`` tuples.

    Returns:
        List of ``process_page`` results in input order; an empty list when
        *pages* is empty.
    """
    if not pages:
        # Pool(processes=0) raises ValueError, so there is nothing to start.
        return []
    available_cores = max(1, cpu_count() - 2)  # keep two cores free, use at least one
    num_cores = min(available_cores, len(pages))
    print(f"Starting parallel processing with {num_cores} cores...")
    with Pool(processes=num_cores) as pool:
        return pool.map(process_page, pages)
178
+
179
def process_pdfs_main():
    """Command-line mode: convert every PDF path given in ``sys.argv[1:]``.

    Files are first sorted by page count. PDFs with at most
    ``PARALLEL_THRESHOLD`` pages are converted concurrently (one joblib job
    per file); larger PDFs are converted one after another, each of them
    parallelising over its own pages. Files that are missing or cannot be
    opened are reported and skipped. Status lines are printed to stdout.
    """
    suppress_pdfminer_logging()
    pdf_files = sys.argv[1:]
    if not pdf_files:
        print("No PDF files provided.")
        return

    small_pdfs = []
    large_pdfs = []

    for path in pdf_files:
        if not os.path.exists(path):
            print(f"File not found: {path}")
            continue
        try:
            with pdfplumber.open(path) as pdf:
                page_count = len(pdf.pages)
        except PdfminerException as e:
            # Raised for any pdfminer failure, not only for encrypted files,
            # so the message states both causes and shows the details.
            print(f"[ERROR] Unreadable PDF (corrupt or password-protected) skipped: {path} – {e}")
            continue
        except Exception as e:
            print(f"[ERROR] Error opening {path}: {str(e)}")
            continue

        if page_count <= PARALLEL_THRESHOLD:
            small_pdfs.append(path)
        else:
            large_pdfs.append(path)

    if small_pdfs:
        available_cores = max(1, cpu_count() - 2)
        num_cores = min(available_cores, len(small_pdfs))
        print(f"\n[Phase 1] Starting parallel processing of small PDFs with {num_cores} cores...")
        results = Parallel(n_jobs=num_cores)(
            delayed(process_pdf)(path) for path in small_pdfs
        )
        for r in results:
            if r is not None:  # never print a literal "None"
                print(r)

    for path in large_pdfs:
        print(f"\n[Phase 2] Processing large PDF: {os.path.basename(path)}")
        message = process_pdf(path)
        if message is not None:
            print(message)
217
+
218
+
219
+ # ========================
220
+ # GUI Class
221
+ # ========================
222
+
223
class FileManager:
    """Tkinter main window: collect PDF files, run the parser, show results.

    The parser runs in a separate process (this same program started with
    the PDF paths as arguments). A worker thread streams the process output
    into the progress box; widget updates are scheduled via ``after()``.
    """

    def __init__(self, master):
        """Create all widgets inside *master* (the Tk root window)."""
        self.master = master
        self.master.title("Parser-Sevenof9")
        self.files = []  # PDF paths, index-aligned with the listbox rows
        self.last_selected_index = None  # anchor row for shift-click ranges
        self.parser_process = None  # Popen of the running parser (set by the worker thread)
        self.parser_thread = None  # worker thread started by start_parser

        self.label = tk.Label(master, text="Selected PDF files:")
        self.label.pack(pady=5)

        listbox_frame = tk.Frame(master)
        listbox_frame.pack(pady=5)

        scrollbar_listbox = tk.Scrollbar(listbox_frame)
        self.listbox = tk.Listbox(
            listbox_frame, selectmode=tk.MULTIPLE, width=80, height=6,
            yscrollcommand=scrollbar_listbox.set,
        )
        scrollbar_listbox.config(command=self.listbox.yview)

        self.listbox.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar_listbox.pack(side=tk.RIGHT, fill=tk.Y)

        self.listbox.bind("<<ListboxSelect>>", self.show_text_file)
        self.listbox.bind("<Button-1>", self.on_listbox_click)
        self.listbox.bind("<Shift-Button-1>", self.on_listbox_shift_click)

        self.context_menu = tk.Menu(master, tearoff=0)
        self.context_menu.add_command(label="Remove selected", command=self.remove_file)
        self.listbox.bind("<Button-3>", self.show_context_menu)

        self.frame = tk.Frame(master)
        self.frame.pack(pady=10)

        tk.Button(self.frame, text="Add Folder", command=self.add_folder).pack(side=tk.LEFT, padx=5)
        tk.Button(self.frame, text="Select Files", command=self.add_file).pack(side=tk.LEFT, padx=5)
        tk.Button(self.frame, text="Remove Selected", command=self.remove_file).pack(side=tk.LEFT, padx=5)
        tk.Button(self.frame, text="Remove All", command=self.remove_all).pack(side=tk.LEFT, padx=5)
        tk.Button(master, text="Stop", command=self.stop_parser).pack(pady=5)

        tk.Button(master, text="Start Parser", command=self.start_parser).pack(pady=10)

        text_frame = tk.Frame(master)
        text_frame.pack(padx=10, pady=5)

        scrollbar_text = tk.Scrollbar(text_frame)
        self.text_widget = tk.Text(
            text_frame, height=15, width=100, wrap=tk.WORD,
            yscrollcommand=scrollbar_text.set,
        )
        scrollbar_text.config(command=self.text_widget.yview)

        self.text_widget.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar_text.pack(side=tk.RIGHT, fill=tk.Y)

        tk.Label(master, text="Progress:").pack()

        progress_frame = tk.Frame(master)
        progress_frame.pack(padx=10, pady=5)

        scrollbar_progress = tk.Scrollbar(progress_frame)
        # Kept read-only; _insert_text enables it only while inserting.
        self.progress_text = tk.Text(
            progress_frame, height=8, width=100, state=tk.DISABLED,
            yscrollcommand=scrollbar_progress.set,
        )
        scrollbar_progress.config(command=self.progress_text.yview)

        self.progress_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar_progress.pack(side=tk.RIGHT, fill=tk.Y)

    def on_listbox_click(self, event):
        """Plain click: select only the clicked row and show its text file."""
        index = self.listbox.nearest(event.y)
        self.listbox.selection_clear(0, tk.END)
        self.listbox.selection_set(index)
        self.last_selected_index = index
        self.show_text_file(None)
        return "break"  # stop further handlers for this event

    def on_listbox_shift_click(self, event):
        """Shift-click: select the range between the anchor row and the clicked row."""
        index = self.listbox.nearest(event.y)
        if self.last_selected_index is None:
            self.last_selected_index = index
        start, end = sorted((self.last_selected_index, index))
        self.listbox.selection_clear(0, tk.END)
        for i in range(start, end + 1):
            self.listbox.selection_set(i)
        return "break"

    def show_context_menu(self, event):
        """Open the context menu at the pointer if at least one row is selected."""
        if self.listbox.curselection():
            self.context_menu.tk_popup(event.x_root, event.y_root)

    def add_folder(self):
        """Ask for a folder and add every ``.pdf`` file found below it."""
        folder = filedialog.askdirectory(title="Select Folder")
        if not folder:
            return
        for root, _, files in os.walk(folder):
            for file in files:
                if file.lower().endswith(".pdf"):
                    path = os.path.join(root, file)
                    if path not in self.files:
                        self.files.append(path)
                        self.listbox.insert(tk.END, path)

    def add_file(self):
        """Ask for PDF files and add those that are not listed yet."""
        paths = filedialog.askopenfilenames(title="Select PDF Files", filetypes=[("PDF Files", "*.pdf")])
        for path in paths:
            if path not in self.files:
                self.files.append(path)
                self.listbox.insert(tk.END, path)

    def remove_file(self):
        """Remove the selected rows from the listbox and from ``self.files``."""
        selection = self.listbox.curselection()
        if not selection:
            messagebox.showwarning("Notice", "Please select an entry to remove.")
            return
        # Delete from the end so the remaining indices stay valid.
        for index in reversed(selection):
            self.listbox.delete(index)
            del self.files[index]
        self.text_widget.delete(1.0, tk.END)

    def remove_all(self):
        """Clear the file list and the preview text."""
        self.listbox.delete(0, tk.END)
        self.files.clear()
        self.text_widget.delete(1.0, tk.END)

    def start_parser(self):
        """Start the parser for all listed files in a background thread."""
        if not self.files:
            messagebox.showinfo("No Files", "Please select at least one file.")
            return
        # A second run would overwrite self.parser_process and make the first
        # process impossible to stop, so only one run is allowed at a time.
        if self.parser_thread is not None and self.parser_thread.is_alive():
            messagebox.showinfo("Parser running", "The parser is already running.")
            return
        self.progress_text.config(state=tk.NORMAL)
        self.progress_text.delete(1.0, tk.END)
        self.progress_text.insert(tk.END, "Starting parser...\n")
        self.progress_text.config(state=tk.DISABLED)
        self.parser_thread = threading.Thread(target=self.run_parser)
        self.parser_thread.start()

    def stop_parser(self):
        """Terminate the running parser process, if there is one."""
        # Snapshot: the worker thread resets the attribute to None when it
        # finishes, which could happen between the check and terminate().
        process = self.parser_process
        if process is not None and process.poll() is None:
            process.terminate()
            self.append_progress_text("Parser process was stopped.\n")
        else:
            self.append_progress_text("No active parser process to stop.\n")

    def _parser_command(self):
        """Return the command line that re-runs this program in parser mode."""
        if getattr(sys, "frozen", False):
            # Frozen executable: sys.executable is the program itself, so the
            # script path must not be passed (it would be taken for a PDF).
            return [sys.executable] + list(self.files)
        return [sys.executable, __file__] + list(self.files)

    def run_parser(self):
        """Worker thread body: run the parser process and relay its output."""
        try:
            process = subprocess.Popen(
                self._parser_command(),
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                encoding='utf-8',
                errors='ignore',
                bufsize=4096
            )
            self.parser_process = process
            for line in process.stdout:
                self.append_progress_text(line)
            process.stdout.close()
            process.wait()

            if process.returncode == 0:
                self.append_progress_text("\nParser finished successfully.\n")
                self.show_messagebox_threadsafe("Parser Done", "The parser was executed successfully.")
            else:
                self.append_progress_text("\nError while running the parser.\n")
                self.show_messagebox_threadsafe("Error", "Error while running the parser.", error=True)
        except Exception as e:
            self.append_progress_text(f"Error: {e}\n")
            self.show_messagebox_threadsafe("Error", f"Error during execution:\n{e}", error=True)
        finally:
            self.parser_process = None

    def append_progress_text(self, text):
        """Schedule *text* to be appended to the progress box via ``after()``."""
        self.progress_text.after(0, lambda: self._insert_text(text))

    def _insert_text(self, text):
        """Append *text* to the read-only progress box and scroll to the end."""
        self.progress_text.config(state=tk.NORMAL)
        self.progress_text.insert(tk.END, text)
        self.progress_text.see(tk.END)
        self.progress_text.config(state=tk.DISABLED)

    def show_messagebox_threadsafe(self, title, message, error=False):
        """Schedule a message box via ``after()``.

        With ``error=True`` an error dialog is shown instead of an info dialog.
        """
        show = messagebox.showerror if error else messagebox.showinfo
        self.master.after(0, lambda: show(title, message))

    def show_text_file(self, event):
        """Show the ``.txt`` file belonging to the first selected PDF."""
        selection = self.listbox.curselection()
        if not selection:
            return
        index = selection[0]
        path = self.files[index]
        txt_path = os.path.splitext(path)[0] + ".txt"
        self.text_widget.delete(1.0, tk.END)
        if os.path.exists(txt_path):
            try:
                with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                    self.text_widget.insert(tk.END, f.read())
            except Exception as e:
                self.text_widget.insert(tk.END, f"Error loading text file:\n{e}")
        else:
            self.text_widget.insert(tk.END, "[No corresponding .txt file found]")
415
+
416
+ # ========================
417
+ # Entry Point
418
+ # ========================
419
+
420
if __name__ == "__main__":
    # Must be first in main for compatibility with multiprocessing on Windows.
    multiprocessing.freeze_support()

    cli_arguments = sys.argv[1:]
    if not cli_arguments:
        # No arguments: interactive use, open the GUI.
        root = tk.Tk()
        app = FileManager(root)
        root.mainloop()
    else:
        # Arguments present: they are the PDF paths to convert.
        process_pdfs_main()
429
+