"""Gradio app that classifies an uploaded document as contract / not a contract.

When the document is classified as a contract, the involved parties are
extracted with two regex rules plus a named-entity-recognition pass.
"""

import os
import re

import gradio as gr
import PyPDF2
from docx import Document
from transformers import pipeline

# Only the first N characters of a document are sent to each model.
CLASSIFIER_CHAR_LIMIT = 1000
NER_CHAR_LIMIT = 1000

SUPPORTED_EXTENSIONS = (".txt", ".pdf", ".docx")
UNSUPPORTED_FORMAT_MESSAGE = "Unsupported file format"

# Rule: "between X and Y" terminated by a period, comma or newline.
BETWEEN_PATTERN = re.compile(
    r'between\s+(.*?)\s+and\s+(.*?)[\.,\n]', re.IGNORECASE
)
# Rule: a quoted name followed by "(... Party A)" or "(... Party B)".
# Both typographic and straight double quotes are accepted.
NAMED_PARTY_PATTERN = re.compile(r'[“"]([^”"]+)[”"]\s*\(.*?Party [AB]\)')

# Load pipelines
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# aggregation_strategy="simple" replaces the deprecated grouped_entities=True
# flag; both merge sub-word tokens into whole entities.
ner = pipeline(
    "ner",
    model="Jean-Baptiste/roberta-large-ner-english",
    aggregation_strategy="simple",
)


# File reading
def _upload_path(file_obj):
    """Return the filesystem path (or name) associated with an upload.

    Accepts either a path-like value or an object exposing ``.name``.
    NOTE(review): which of the two Gradio passes presumably depends on the
    installed Gradio version / ``gr.File`` ``type`` setting -- confirm.
    """
    if isinstance(file_obj, (str, os.PathLike)):
        return os.fspath(file_obj)
    return file_obj.name


def read_file(file_obj):
    """Extract plain text from a ``.txt``, ``.pdf`` or ``.docx`` upload.

    Args:
        file_obj: A path-like value, or a file-like object with a ``.name``
            attribute (and ``.read()`` when it is not backed by a file on
            disk).

    Returns:
        The extracted text, or ``UNSUPPORTED_FORMAT_MESSAGE`` when the
        extension is not one of ``SUPPORTED_EXTENSIONS``.
    """
    path = _upload_path(file_obj)
    # Compare extensions case-insensitively so "REPORT.PDF" is accepted.
    extension = os.path.splitext(path)[1].lower()
    if extension not in SUPPORTED_EXTENSIONS:
        return UNSUPPORTED_FORMAT_MESSAGE

    # Prefer re-opening from disk: it does not depend on the mode or the
    # current position of the handle that was passed in. Fall back to the
    # object itself for in-memory streams that merely carry a name.
    read_from_path = isinstance(file_obj, (str, os.PathLike)) or os.path.isfile(path)
    source = path if read_from_path else file_obj

    if extension == ".txt":
        if read_from_path:
            with open(path, "rb") as handle:
                raw = handle.read()
        else:
            raw = file_obj.read()
        # A text-mode stream already yields str; only bytes need decoding.
        if isinstance(raw, bytes):
            return raw.decode("utf-8", errors="ignore")
        return raw

    if extension == ".pdf":
        reader = PyPDF2.PdfReader(source)
        # Extract each page once and skip pages that yield no text.
        page_texts = (page.extract_text() for page in reader.pages)
        return " ".join(page_text for page_text in page_texts if page_text)

    # Remaining supported case: .docx
    document = Document(source)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)


# Contract classification
def is_contract(text):
    """Zero-shot classify *text* as "contract" vs "not a contract".

    Only the first ``CLASSIFIER_CHAR_LIMIT`` characters are classified.

    Returns:
        A ``(flag, result)`` tuple: ``flag`` is True when the first entry of
        ``result['labels']`` is "contract"; ``result`` is the raw classifier
        output.
    """
    result = classifier(text[:CLASSIFIER_CHAR_LIMIT], ["contract", "not a contract"])
    return result['labels'][0] == 'contract', result


# Rule-based + NER-based party extraction
def extract_parties_with_rules(text):
    """Collect candidate party names from *text*.

    Combines three sources: the "between X and Y" rule, the quoted-name
    "(Party A/B)" rule, and NER entities in the ORG / PER groups taken from
    the first ``NER_CHAR_LIMIT`` characters.

    Returns:
        A sorted list of unique, non-empty, whitespace-stripped names.
    """
    candidates = set()

    # Rule-based: between X and Y
    for first, second in BETWEEN_PATTERN.findall(text):
        candidates.update((first, second))

    # Rule-based: "X" (Party A), etc.
    candidates.update(NAMED_PARTY_PATTERN.findall(text))

    # NER fallback
    candidates.update(
        entity['word']
        for entity in ner(text[:NER_CHAR_LIMIT])
        if entity['entity_group'] in ('ORG', 'PER')
    )

    # Strip padding and drop blanks so the joined output has no empty items;
    # sort because set iteration order is not stable between runs.
    cleaned = {candidate.strip() for candidate in candidates}
    cleaned.discard("")
    return sorted(cleaned)


# Main logic
def process_file(file):
    """Gradio callback: classify the upload and list parties for contracts.

    Args:
        file: The value supplied by the ``gr.File`` input, or None when the
            form was submitted without a file.

    Returns:
        A ``(classification message, parties)`` pair for the two output
        textboxes.
    """
    if file is None:
        return "Empty or unreadable file.", None

    # Reject unsupported extensions up front so the sentinel message is never
    # mistaken for document text and sent to the classifier.
    extension = os.path.splitext(_upload_path(file))[1].lower()
    if extension not in SUPPORTED_EXTENSIONS:
        return UNSUPPORTED_FORMAT_MESSAGE, None

    text = read_file(file)
    if not text.strip():
        return "Empty or unreadable file.", None

    is_contract_flag, _ = is_contract(text)
    if not is_contract_flag:
        return "❌ This is NOT a contract.", ""

    parties = extract_parties_with_rules(text)
    return "✅ This is a contract.", ", ".join(parties)


# Gradio interface
iface = gr.Interface(
    fn=process_file,
    inputs=gr.File(file_types=[".txt", ".pdf", ".docx"], label="Upload a document"),
    outputs=[
        gr.Textbox(label="Classification Result"),
        gr.Textbox(label="Detected Parties (ORG/PER or Rule-based)"),
    ],
    title="Contract Classifier with RoBERTa",
    description="Upload a document (.pdf, .txt, .docx) to detect if it's a contract and extract involved parties using RoBERTa + Rule-based matching.",
)

iface.launch()