import os
import re

import fitz  # PyMuPDF
import gradio as gr
import numpy as np
import spacy
from paddleocr import PPStructure
from pdf2image import convert_from_path
from spacy.matcher import Matcher
from tqdm.auto import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# Ensure Poppler (required by pdf2image) is available in hosted environments.
os.system("apt-get update -y && apt-get install -y poppler-utils")

# --- Initialization ---
structure_engine = PPStructure(table=True, ocr=True, layout=True)
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Regex & matcher setup.
# Dates like "01-Jan-23" or "01.01.23".
date_pattern = r"\d{2}-[A-Za-z]{3}-\d{2}|\d{2}\.\d{2}\.\d{2}"
# NOTE: the character class includes whitespace, so this can over-capture
# trailing words; tighten it for your corpus if needed.
party_pattern = r"M/s [A-Za-z\s&-]+(?:Consortium)?"
# "claimant", optional punctuation, then a token inside an ORG entity.
pattern = [{"LOWER": "claimant"}, {"IS_PUNCT": True, "OP": "?"}, {"ENT_TYPE": "ORG"}]
matcher.add("CLAIMANT", [pattern])

# Load Legal-BERT pipelines.
# NOTE: nlpaueb/legal-bert-base-uncased is a masked-LM checkpoint, so the
# token-classification and text-classification heads below are randomly
# initialized; substitute checkpoints fine-tuned for legal NER and clause
# classification to get meaningful labels.
ner_model = "nlpaueb/legal-bert-base-uncased"
token_model = AutoModelForTokenClassification.from_pretrained(ner_model)
tokenizer = AutoTokenizer.from_pretrained(ner_model)
ner_pipeline = pipeline("ner", model=token_model, tokenizer=tokenizer,
                        aggregation_strategy="simple")
clf_pipeline = pipeline("text-classification", model=ner_model)


# --- Helper functions ---
def extract_text_from_pdf(pdf_path):
    """Extract the embedded text layer of each page with PyMuPDF."""
    doc = fitz.open(pdf_path)
    pages = [{"page": i, "text": page.get_text("text") or ""}
             for i, page in enumerate(doc, start=1)]
    doc.close()
    return pages


def extract_content_from_images(pdf_path):
    """Rasterize each page and run PP-Structure layout analysis on it."""
    images = convert_from_path(pdf_path)
    results = []
    for i, img in enumerate(tqdm(images, desc="OCR"), start=1):
        res = structure_engine(np.array(img))
        text_lines, tables = [], []
        for block in res:
            if block['type'] == 'text':
                text_lines += [line['text'] for line in block['res'] if 'text' in line]
            elif block['type'] == 'table' and 'html' in block['res']:
                tables.append(block['res']['html'])
        results.append({"page": i, "ocr_text": " ".join(text_lines), "tables_html": tables})
    return results


def extract_metadata(text):
    """Combine regex, spaCy, and Legal-BERT passes over one page of text."""
    meta = {"dates": [], "parties": [], "claimants": [], "tribunals": [],
            "relationships": [], "clauses": []}  # "relationships" reserved for future use

    # Regex pass
    meta['dates'] = re.findall(date_pattern, text)
    meta['parties'] = re.findall(party_pattern, text)

    # spaCy pass
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == 'ORG' and ent.text not in meta['parties']:
            meta['parties'].append(ent.text)
        if ent.label_ == 'GPE' and ent.text not in meta['tribunals']:
            meta['tribunals'].append(ent.text)
    for _match_id, start, end in matcher(doc):
        span_text = doc[start:end].text
        if span_text not in meta['claimants']:
            meta['claimants'].append(span_text)

    # Legal-BERT NER pass (pages longer than the model's 512-token limit
    # should be chunked before this call).
    for ent in ner_pipeline(text):
        grp = ent['entity_group']
        if grp in ('ORG', 'PARTY') and ent['word'] not in meta['parties']:
            meta['parties'].append(ent['word'])
        if grp == 'GPE' and ent['word'] not in meta['tribunals']:
            meta['tribunals'].append(ent['word'])

    # Clause classification, sentence by sentence (spaCy's parser supplies
    # the sentence boundaries).
    for sent in doc.sents:
        sent_text = sent.text.strip()
        if len(sent_text) < 10:
            continue
        try:
            res = clf_pipeline(sent_text)[0]
            if res['score'] > 0.7:
                meta['clauses'].append({'type': res['label'], 'text': sent_text})
        except Exception:
            pass
    return meta
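
# A quick sanity check for extract_metadata. The sentence below is a
# hypothetical example, not drawn from any real filing; only the regex hits
# are predictable, since the model-based passes depend on the checkpoints used.
#
#   sample = "The Claimant, M/s Acme Infra Consortium, wrote on 01-Jan-23."
#   extract_metadata(sample)["dates"]    # -> ['01-Jan-23']
#   extract_metadata(sample)["parties"]  # -> includes 'M/s Acme Infra Consortium'
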
def process_pdf(file_obj):
    # Gradio hands the upload over as a temp file object; .name is its path.
    pdf_path = file_obj.name
    # 1. Embedded text layer
    text_pages = extract_text_from_pdf(pdf_path)
    # 2. OCR & tables
    img_content = extract_content_from_images(pdf_path)
    # 3. Per-page metadata
    metadata = [{"page": page['page'], "metadata": extract_metadata(page['text'])}
                for page in text_pages]
    # Combine
    return {
        "text_pages": text_pages,
        "image_content": img_content,
        "metadata": metadata,
    }


# --- Gradio interface ---
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF", file_types=['.pdf']),
    outputs=gr.JSON(label="Extraction Result"),
    title="PDF OCR & Metadata Extractor",
    description=("Upload a PDF, wait for processing, and view structured JSON "
                 "output including text, OCR, tables, and metadata."),
)

if __name__ == '__main__':
    iface.launch()
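
# --- Programmatic use (a minimal sketch, not part of the app) ---
# process_pdf only reads the .name attribute of the uploaded file object, so
# the pipeline can be exercised without the UI; "sample.pdf" below is a
# hypothetical local path.
#
#   from types import SimpleNamespace
#   result = process_pdf(SimpleNamespace(name="sample.pdf"))
#   print(list(result))  # ['text_pages', 'image_content', 'metadata']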