# ── Standard library ────────────────────────────────────────────────────────
import os
import sys
import json
import math
import queue
import shutil
import logging
import tempfile
import threading
import subprocess
import multiprocessing
from pathlib import Path
from multiprocessing import Pool

# ── Third-party ─────────────────────────────────────────────────────────────
import fitz  # PyMuPDF
import tkinter as tk
from tkinter import filedialog, messagebox
from joblib import cpu_count, Parallel, delayed

# ── Parser configuration ────────────────────────────────────────────────────
PARALLEL_THRESHOLD = 16       # pages – switch to multiprocessing above this
LINE_TOLERANCE      = 1       # pts for snapping nearly-identical rulings
MIN_RECT_AREA       = 1e4     # pts² – ignore tiny rectangles

# ── pdfplumber-style clustering helpers ─────────────────────────────────────
def cluster_list(xs, tol):
    """Group values into runs whose sorted neighbours differ by at most `tol`.

    The values are sorted first. A new cluster is opened whenever the gap
    between a value and the previous one exceeds `tol`. Returns a list of
    lists; an empty input yields an empty list.
    """
    clusters = []
    for value in sorted(xs):
        # Extend the open cluster when the value is close enough to its most
        # recent member; otherwise start a new cluster with this value.
        if clusters and value - clusters[-1][-1] <= tol:
            clusters[-1].append(value)
        else:
            clusters.append([value])
    return clusters

def make_cluster_dict(vals, tol):
    """Return a dict mapping each distinct value to its cluster index.

    Duplicates are removed, the remaining values are clustered with
    `cluster_list` using tolerance `tol`, and the clusters are numbered
    0, 1, 2, … in ascending order of their values.
    """
    distinct = sorted(set(vals))
    return {
        value: cluster_id
        for cluster_id, members in enumerate(cluster_list(distinct, tol))
        for value in members
    }

# ── Utility funcs ───────────────────────────────────────────────────────────
def clean_cell_text(text):
    """Flatten raw cell text to a single, whitespace-normalised line.

    Anything that is not a `str` yields an empty string. A hyphen directly
    followed by a newline is removed (re-joining a word split across lines),
    remaining newlines become spaces, and runs of whitespace collapse to a
    single space with leading/trailing whitespace dropped.
    """
    if isinstance(text, str):
        rejoined = text.replace("-\n", "")
        single_line = rejoined.replace("\n", " ")
        return " ".join(single_line.split())
    return ""

def safe_join(row):
    """Return a list with the cleaned text of every cell in `row`.

    `None` cells become empty strings; every other cell is converted with
    `str()` and passed through `clean_cell_text`.
    """
    cleaned = []
    for cell in row:
        cleaned.append("" if cell is None else clean_cell_text(str(cell)))
    return cleaned

def clamp_bbox(bbox, page_rect):
    """Clip an (x0, y0, x1, y1) box so each coordinate lies within `page_rect`.

    `page_rect` must expose `x0`, `y0`, `x1` and `y1` attributes. The result
    is returned as a plain 4-tuple.
    """
    left, top, right, bottom = bbox

    def _clip(value, lower, upper):
        # Limit from above first, then from below.
        return max(lower, min(value, upper))

    return (
        _clip(left, page_rect.x0, page_rect.x1),
        _clip(top, page_rect.y0, page_rect.y1),
        _clip(right, page_rect.x0, page_rect.x1),
        _clip(bottom, page_rect.y0, page_rect.y1),
    )

# ── Improved table detection with snapping ─────────────────────────────────
def detect_table_bboxes(page: fitz.Page, tol=LINE_TOLERANCE):
    """
    Detect grid-cell rectangles from the ruling lines drawn on `page`.

    Steps:
    1. Collect very thin horizontal & vertical stroked paths
    2. Snap their positions together with tolerance `tol`
    3. Form a grid from the snapped row & column positions
    4. Keep every grid cell whose area is at least MIN_RECT_AREA

    Returns a list[fitz.Rect], one per retained cell. The list is empty when
    the page has no horizontal or no vertical rulings.
    """
    horiz_pos, vert_pos = [], []
    for d in page.get_drawings():
        # PyMuPDF reports the path type as a string ("f" fill, "s" stroke,
        # "fs" both, ...). Comparing against an integer never matches, which
        # previously caused every path to be skipped.
        if d.get("type") not in ("s", "fs"):
            continue
        # The area covered by the path is stored under the "rect" key.
        bbox = d.get("rect")
        if bbox is None:
            continue
        x0, y0, x1, y1 = bbox
        if abs(y1 - y0) < 2:           # horizontal line
            horiz_pos.append((y0 + y1) / 2)
        elif abs(x1 - x0) < 2:         # vertical line
            vert_pos.append((x0 + x1) / 2)

    if not horiz_pos or not vert_pos:
        return []

    def _snapped(values):
        # Average all raw positions that fall into the same cluster so that
        # nearly identical rulings collapse to one grid line.
        cluster_of = make_cluster_dict(values, tol)
        grouped = {}
        for v in values:
            grouped.setdefault(cluster_of[v], []).append(v)
        return sorted(sum(g) / len(g) for g in grouped.values())

    row_pos = _snapped(horiz_pos)
    col_pos = _snapped(vert_pos)

    # Every pair of adjacent grid lines bounds one candidate cell.
    rects = []
    for r0, r1 in zip(row_pos, row_pos[1:]):
        for c0, c1 in zip(col_pos, col_pos[1:]):
            rect = fitz.Rect(c0, r0, c1, r1)
            if rect.get_area() >= MIN_RECT_AREA:
                rects.append(rect)

    # Remove duplicates / contained rects
    unique = []
    for rect in rects:
        if not any(u.contains(rect) or rect.contains(u) for u in unique):
            unique.append(rect)

    return unique

# ── Table extraction (simple text grouping) ────────────────────────────────
def extract_table(page: fitz.Page, table_rect: fitz.Rect):
    """Group words inside `table_rect` into JSON rows [dict].

    A word belongs to the table when its top-left corner (items 0 and 1 of
    the word tuple) lies inside `table_rect`. Words whose top coordinate is
    within 5 pt of the first word of a line are merged into that line.

    The first line is used as the single header key; every following line
    produces one dict `{header_text: line_text}`. Returns an empty list when
    no word falls inside the rectangle (or when there is only a header line).
    """
    words = [
        w for w in page.get_text("words")
        if table_rect.x0 <= w[0] <= table_rect.x1
        and table_rect.y0 <= w[1] <= table_rect.y1
    ]
    words.sort(key=lambda w: (w[1], w[0]))          # sort by y then x

    # cluster words by line
    lines, line_top = [], None
    for w in words:
        if line_top is None or abs(w[1] - line_top) > 5:
            lines.append([w])
            line_top = w[1]
        else:
            lines[-1].append(w)

    if not lines:
        return []

    # The global (y, x) sort orders words of one line by their exact top
    # coordinate first, so small baseline differences would scramble the
    # reading order. Re-sort each line left to right before joining.
    line_texts = [
        " ".join(w[4] for w in sorted(ln, key=lambda w: w[0]))
        for ln in lines
    ]
    headers = safe_join([line_texts[0]])
    rows    = [safe_join([lt]) for lt in line_texts[1:]]
    return [dict(zip(headers, r)) for r in rows]

# ── Per-page worker ────────────────────────────────────────────────────────
def process_page(args):
    """Extract the plain text and detected tables of a single PDF page.

    `args` is a `(page_number, pdf_path)` tuple with a zero-based page
    number. The document is opened inside this call, so the only inputs are
    the path and the page number.

    Returns `(page_number, text)`. The text starts with a "Page N" heading,
    followed by the words outside any detected table (grouped into lines with
    a 10 pt tolerance) and then one JSON block per non-empty table. On
    failure the text is an "[ERROR] ..." message; no exception propagates.
    """
    page_number, pdf_path = args
    try:
        with fitz.open(pdf_path) as doc:
            page = doc.load_page(page_number)
            page_rect = page.rect

            # Detect tables and serialise each non-empty one as JSON.
            table_rects = detect_table_bboxes(page)
            table_jsons = []
            for rect in table_rects:
                tbl = extract_table(page, rect)
                if tbl:
                    table_jsons.append(json.dumps(tbl, indent=1, ensure_ascii=False))

            # Words outside tables (decided by each word's top-left corner).
            tbl_boxes = [clamp_bbox(rect, page_rect) for rect in table_rects]
            outside = [
                w for w in page.get_text("words")
                if not any(b[0] <= w[0] <= b[2] and b[1] <= w[1] <= b[3] for b in tbl_boxes)
            ]
            outside.sort(key=lambda w: (w[1], w[0]))

            text_lines, line_top = [], None
            for w in outside:
                if line_top is None or abs(w[1] - line_top) > 10:
                    text_lines.append([w])
                    line_top = w[1]
                else:
                    text_lines[-1].append(w)

            # Within a grouped line the (y, x) sort can leave words out of
            # horizontal order when their tops differ slightly, so each line
            # is re-sorted left to right before it is joined.
            text = "".join(
                " ".join(w[4] for w in sorted(line, key=lambda w: w[0])) + "\n"
                for line in text_lines
            )

            parts = [f"Page {page_number + 1}\n", text.strip() + "\n"]
            parts.extend(
                f'"table {idx}":\n{tbl}\n' for idx, tbl in enumerate(table_jsons, 1)
            )
            return page_number, "".join(parts)

    except fitz.FileDataError as e:
        return page_number, f"[ERROR] Page {page_number+1} ({pdf_path}): encrypted / unreadable – {e}"
    except Exception as e:
        return page_number, f"[ERROR] Page {page_number+1} ({pdf_path}): {e}"

# ── Document-level processing ───────────────────────────────────────────────
def process_pdf(pdf_path):
    """Convert one PDF into a `.txt` file stored next to it.

    Pages are processed serially when the document has at most
    PARALLEL_THRESHOLD pages and with a process pool otherwise. The page
    outputs are concatenated in page order and written as UTF-8 to
    `<pdf name without extension>.txt` in the PDF's directory.

    Returns an "[ERROR] ..." / "[INFO] ..." string when processing fails or
    is interrupted, and `None` after a successful run (progress is printed).
    """
    try:
        if not os.path.exists(pdf_path):
            return f"[ERROR] File not found: {pdf_path}"

        print(f"[INFO] Starting processing: {pdf_path}")

        # Open once up front only to learn the page count.
        try:
            with fitz.open(pdf_path) as doc:
                num_pages = doc.page_count
        except fitz.FileDataError as e:
            return f"[ERROR] Cannot open PDF: {pdf_path} – {e}"
        except Exception as e:
            return f"[ERROR] General error opening PDF: {pdf_path} – {e}"

        jobs = [(page_no, pdf_path) for page_no in range(num_pages)]
        if num_pages > PARALLEL_THRESHOLD:
            results = run_parallel(jobs)
        else:
            results = run_serial(jobs)

        # Restore page order before concatenating the per-page texts.
        ordered = sorted(results, key=lambda item: item[0])
        final_output = "\n".join(page_text for _, page_text in ordered)

        stem, _ = os.path.splitext(os.path.basename(pdf_path))
        out_path = os.path.join(os.path.dirname(pdf_path), f"{stem}.txt")
        with open(out_path, "w", encoding="utf-8", errors="ignore") as f:
            f.write(final_output)
        print(f"[INFO] Processing complete: {out_path}")
    except (EOFError, BrokenPipeError, KeyboardInterrupt):
        return "[INFO] Processing interrupted by user."
    except Exception as e:
        return f"[ERROR] Unexpected error with '{pdf_path}': {e}"

def run_serial(pages):   return [process_page(a) for a in pages]

def run_parallel(pages):
    """Run `process_page` on every job in `pages` using a process pool.

    The pool size is the CPU count minus two (at least one), capped at the
    number of jobs. Results are returned in the order of `pages`. An empty
    job list returns `[]` without creating a pool, because a pool with zero
    workers cannot be constructed.
    """
    if not pages:
        return []
    cores = min(max(1, cpu_count() - 2), len(pages))
    print(f"Starting parallel processing with {cores} cores…")
    with Pool(cores) as pool:
        return pool.map(process_page, pages)

# ── Batch CLI entrypoint ────────────────────────────────────────────────────
def process_pdfs_main():
    """Batch entry point: process every PDF path given on the command line.

    Paths are taken from `sys.argv[1:]`. Missing or unopenable files are
    reported and skipped. Documents with at most PARALLEL_THRESHOLD pages are
    processed concurrently, one worker per file (phase 1); larger documents
    are processed one after another, each using page-level parallelism
    inside `process_pdf` (phase 2).
    """
    pdfs = sys.argv[1:]
    if not pdfs:
        print("No PDF files provided.")
        return

    small, large = [], []
    for p in pdfs:
        if not os.path.exists(p):
            print(f"File not found: {p}")
            continue
        try:
            with fitz.open(p) as doc:
                (small if doc.page_count <= PARALLEL_THRESHOLD else large).append(p)
        except fitz.FileDataError:
            print(f"[ERROR] Password-protected PDF skipped: {p}")
        except Exception as e:
            print(f"[ERROR] Error opening {p}: {e}")

    if small:
        cores = min(max(1, cpu_count() - 2), len(small))
        print(f"\n[Phase 1] Parallel processing of {len(small)} small PDFs with {cores} cores …")
        for r in Parallel(n_jobs=cores)(delayed(process_pdf)(p) for p in small):
            # process_pdf returns None on success and a message otherwise;
            # printing None would put a literal "None" line in the output.
            if r is not None:
                print(r)

    for p in large:
        print(f"\n[Phase 2] Processing large PDF: {os.path.basename(p)}")
        r = process_pdf(p)
        if r is not None:
            print(r)

# ── Tkinter GUI ─────────────────────────────────────────────────────────────
class FileManager:
    """Tkinter front end for selecting PDFs, running the parser and viewing output.

    The parser runs in a child Python process (this same script started with
    the selected paths as arguments). Its combined stdout/stderr is streamed
    into the progress box by a worker thread.
    """

    def __init__(self, master):
        """Build all widgets inside `master` and initialise the selection state."""
        self.master = master
        master.title("Parser-Sevenof9 — PyMuPDF")

        # self.files mirrors the listbox contents index-for-index.
        self.files, self.last_selected = [], None
        tk.Label(master, text="Selected PDF files:").pack(pady=5)

        list_frame = tk.Frame(master)
        list_frame.pack(pady=5)
        sb_list = tk.Scrollbar(list_frame)
        self.listbox = tk.Listbox(list_frame, selectmode=tk.MULTIPLE, width=80, height=6,
                                  yscrollcommand=sb_list.set)
        sb_list.config(command=self.listbox.yview)
        self.listbox.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_list.pack(side=tk.RIGHT, fill=tk.Y)
        self.listbox.bind("<<ListboxSelect>>", self.show_text)
        self.listbox.bind("<Button-1>", self.on_click)
        self.listbox.bind("<Shift-Button-1>", self.on_shift_click)

        # Right-click context menu, shown only when something is selected.
        self.ctx = tk.Menu(master, tearoff=0)
        self.ctx.add_command(label="Remove selected", command=self.remove_file)
        self.listbox.bind("<Button-3>", lambda e: self.ctx.tk_popup(e.x_root, e.y_root) if self.listbox.curselection() else None)

        btn_frame = tk.Frame(master)
        btn_frame.pack(pady=10)
        tk.Button(btn_frame, text="Add Folder",     command=self.add_folder).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Select Files",   command=self.add_file).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Remove Selected",command=self.remove_file).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Remove All",     command=self.remove_all).pack(side=tk.LEFT, padx=5)
        tk.Button(master, text="Stop",  command=self.stop_parser).pack(pady=5)
        tk.Button(master, text="Start Parser", command=self.start_parser).pack(pady=10)

        # Preview of the .txt file that belongs to the selected PDF.
        tx_frame = tk.Frame(master)
        tx_frame.pack(padx=10, pady=5)
        sb_text = tk.Scrollbar(tx_frame)
        self.text = tk.Text(tx_frame, height=15, width=100, wrap=tk.WORD, yscrollcommand=sb_text.set)
        sb_text.config(command=self.text.yview)
        self.text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_text.pack(side=tk.RIGHT, fill=tk.Y)

        # Read-only log of the parser's output.
        tk.Label(master, text="Progress:").pack()
        prog_frame = tk.Frame(master)
        prog_frame.pack(padx=10, pady=5)
        sb_prog = tk.Scrollbar(prog_frame)
        self.prog = tk.Text(prog_frame, height=8, width=100, state=tk.DISABLED, yscrollcommand=sb_prog.set)
        sb_prog.config(command=self.prog.yview)
        self.prog.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_prog.pack(side=tk.RIGHT, fill=tk.Y)

        self.parser_proc = None       # running subprocess.Popen, or None
        self._parser_thread = None    # thread currently executing run_parser

    # ── Listbox helpers ───────────────────────────────────────────────────
    def on_click(self, e):
        """Select only the clicked row and remember it as the shift-click anchor."""
        idx = self.listbox.nearest(e.y)
        self.listbox.selection_clear(0, tk.END)
        self.listbox.selection_set(idx)
        self.last_selected = idx
        self.show_text(None)
        return "break"  # suppress the default listbox click handling

    def on_shift_click(self, e):
        """Select the contiguous range between the anchor row and the clicked row."""
        idx = self.listbox.nearest(e.y)
        if self.last_selected is None:
            self.last_selected = idx
        lo, hi = sorted((self.last_selected, idx))
        self.listbox.selection_clear(0, tk.END)
        for i in range(lo, hi + 1):
            self.listbox.selection_set(i)
        return "break"

    # ── File ops ─────────────────────────────────────────────────────────
    def add_folder(self):
        """Ask for a folder and add every not-yet-listed *.pdf found below it."""
        folder = filedialog.askdirectory(title="Select Folder")
        if not folder:
            return
        for root, _, fs in os.walk(folder):
            for f in fs:
                if f.lower().endswith(".pdf"):
                    p = os.path.join(root, f)
                    if p not in self.files:
                        self.files.append(p)
                        self.listbox.insert(tk.END, p)

    def add_file(self):
        """Ask for one or more PDF files and add those not already listed."""
        for p in filedialog.askopenfilenames(title="Select PDF Files", filetypes=[("PDF Files","*.pdf")]):
            if p not in self.files:
                self.files.append(p)
                self.listbox.insert(tk.END, p)

    def remove_file(self):
        """Remove the selected rows from the listbox and from `self.files`."""
        sel = self.listbox.curselection()
        if not sel:
            messagebox.showwarning("Notice","Please select an entry to remove.")
            return
        # Delete from the end so the remaining indices stay valid.
        for idx in reversed(sel):
            self.listbox.delete(idx)
            del self.files[idx]
        self.text.delete(1.0, tk.END)

    def remove_all(self):
        """Clear the file list and the preview box."""
        self.listbox.delete(0, tk.END)
        self.files.clear()
        self.text.delete(1.0, tk.END)

    # ── Parser control ───────────────────────────────────────────────────
    def start_parser(self):
        """Start the parser on the listed files in a background thread.

        Does nothing beyond showing a notice when no file is listed or when a
        previous run is still in progress; a second concurrent run would
        overwrite `self.parser_proc` and make the first one unstoppable.
        """
        if not self.files:
            messagebox.showinfo("No Files","Please select at least one file.")
            return
        if self._parser_thread is not None and self._parser_thread.is_alive():
            messagebox.showinfo("Parser running", "The parser is already running.")
            return
        self.prog.config(state=tk.NORMAL)
        self.prog.delete(1.0, tk.END)
        self.prog.insert(tk.END,"Starting parser…\n")
        self.prog.config(state=tk.DISABLED)
        self._parser_thread = threading.Thread(target=self.run_parser)
        self._parser_thread.start()

    def stop_parser(self):
        """Terminate the running parser process, if there is one."""
        # Read the attribute once: the worker thread resets it to None when
        # the process ends, which could otherwise happen between the check
        # and the call.
        proc = self.parser_proc
        if proc is not None and proc.poll() is None:
            proc.terminate()
            self.append_prog("Parser process was stopped.\n")
        else:
            self.append_prog("No active parser process to stop.\n")

    def run_parser(self):
        """Worker-thread body: run the child parser and relay its output.

        Each output line is forwarded to the progress box; once the process
        exits, a message box reports success (exit code 0) or failure.
        """
        try:
            proc = subprocess.Popen(
                [sys.executable, __file__] + self.files,
                stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                text=True, encoding="utf-8", errors="ignore", bufsize=4096
            )
            self.parser_proc = proc
            for line in proc.stdout:
                self.append_prog(line)
            proc.stdout.close()
            proc.wait()
            if proc.returncode == 0:
                self.append_prog("\nParser finished successfully.\n")
                self.shell_msg("Parser Done","The parser was executed successfully.")
            else:
                self.append_prog("\nError while running the parser.\n")
                self.shell_msg("Error","Error while running the parser.")
        except Exception as e:
            self.append_prog(f"Error: {e}\n")
            self.shell_msg("Error",f"Execution error:\n{e}")
        finally:
            self.parser_proc = None

    # ── GUI helpers ──────────────────────────────────────────────────────
    def append_prog(self, txt):
        """Schedule `txt` to be appended to the progress box via Tk's `after`."""
        self.prog.after(0, lambda: self._ins(txt))

    def _ins(self, txt):
        """Append `txt` to the read-only progress box and scroll to the end."""
        self.prog.config(state=tk.NORMAL)
        self.prog.insert(tk.END, txt)
        self.prog.see(tk.END)
        self.prog.config(state=tk.DISABLED)

    def shell_msg(self, title, msg):
        """Schedule an info message box with the given title and text."""
        self.master.after(0, lambda: messagebox.showinfo(title, msg))

    def show_text(self, _):
        """Show the .txt file next to the first selected PDF in the preview box."""
        sel = self.listbox.curselection()
        if not sel:
            return
        path = self.files[sel[0]]
        txt = os.path.splitext(path)[0] + ".txt"
        self.text.delete(1.0, tk.END)
        if os.path.exists(txt):
            try:
                with open(txt,"r",encoding="utf-8",errors="ignore") as f:
                    self.text.insert(tk.END, f.read())
            except Exception as e:
                self.text.insert(tk.END,f"Error loading text file:\n{e}")
        else:
            self.text.insert(tk.END,"[No corresponding .txt file found]")

# ── Main guard ─────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Must run first so that frozen executables can spawn worker processes.
    multiprocessing.freeze_support()
    if sys.argv[1:]:
        # Paths were passed on the command line: run the batch parser.
        process_pdfs_main()
    else:
        # No arguments: start the graphical front end.
        root = tk.Tk()
        FileManager(root)
        root.mainloop()