# Spaces status: Sleeping (page-capture residue, not part of the program)
import os

import gradio as gr
import PyPDF2
from docx import Document
from transformers import pipeline
# Load pipelines
# Both models are loaded once at import time so every request reuses them.

# Zero-shot classifier used to decide whether a document is a contract.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# NER pipeline used to pull out party names. `aggregation_strategy="simple"`
# is the supported spelling of the deprecated `grouped_entities=True` flag:
# sub-word tokens are merged into whole entities tagged with `entity_group`.
ner = pipeline(
    "ner",
    model="Jean-Baptiste/roberta-large-ner-english",
    aggregation_strategy="simple",
)
# File reading
def read_file(file_obj):
    """Extract plain text from an uploaded ``.txt``, ``.pdf`` or ``.docx`` file.

    Args:
        file_obj: Either a file-like object exposing ``name`` (plus ``read()``
            for text files), or a filesystem path given as ``str`` or
            ``os.PathLike``. NOTE(review): newer Gradio releases appear to
            hand callbacks a path string instead of an open file; confirm
            against the installed version.

    Returns:
        The extracted text, or the literal string "Unsupported file format"
        when the extension is not one of the three supported ones.

    Raises:
        UnicodeDecodeError: If a ``.txt`` file holds bytes that are not
            valid UTF-8.
    """
    if isinstance(file_obj, (str, os.PathLike)):
        path = os.fspath(file_obj)
        source = path
    else:
        path = file_obj.name
        source = file_obj

    # Compare case-insensitively so names such as "REPORT.PDF" are accepted.
    lowered = path.lower()

    if lowered.endswith(".txt"):
        if hasattr(source, "read"):
            data = source.read()
        else:
            with open(path, "rb") as handle:
                data = handle.read()
        # A stream opened in text mode already yields str; only bytes need
        # decoding.
        if isinstance(data, (bytes, bytearray)):
            return data.decode("utf-8")
        return data

    if lowered.endswith(".pdf"):
        reader = PyPDF2.PdfReader(source)
        # Extract each page exactly once, then skip pages that yielded
        # nothing.
        page_texts = (page.extract_text() for page in reader.pages)
        return " ".join(chunk for chunk in page_texts if chunk)

    if lowered.endswith(".docx"):
        doc = Document(source)
        return "\n".join(para.text for para in doc.paragraphs)

    return "Unsupported file format"
# Contract classification
def is_contract(text):
    """Decide by zero-shot classification whether *text* is a contract.

    Only the first 1000 characters are passed to the module-level
    ``classifier``, with the candidate labels "contract" and
    "not a contract".

    Returns:
        A ``(flag, result)`` tuple. ``flag`` is True when the first entry of
        ``result['labels']`` is "contract"; ``result`` is the classifier's
        raw output.
    """
    candidate_labels = ["contract", "not a contract"]
    excerpt = text[:1000]
    result = classifier(excerpt, candidate_labels)
    top_label = result['labels'][0]
    return top_label == 'contract', result
# Party extraction
def extract_parties(text):
    """Return the distinct organisation and person names found in *text*.

    The module-level ``ner`` pipeline is run over the first 1000 characters
    and only entities whose ``entity_group`` is ``ORG`` or ``PER`` are kept.

    Returns:
        A list of unique names, in the order they first appear in the NER
        output.
    """
    entities = ner(text[:1000])
    # Strip surrounding whitespace so " Acme" and "Acme" count as one party.
    # NOTE(review): this tokenizer appears to emit words with a leading
    # space; confirm on real output.
    names = (
        ent['word'].strip()
        for ent in entities
        if ent['entity_group'] in ('ORG', 'PER')
    )
    # dict.fromkeys de-duplicates while keeping first-seen order; set()
    # ordering of strings differs from one interpreter run to the next.
    return list(dict.fromkeys(name for name in names if name))
# Main logic
def process_file(file):
    """Classify an uploaded document and list its parties if it is a contract.

    Args:
        file: The upload to analyse, forwarded unchanged to ``read_file``.
            ``None`` means nothing was uploaded.

    Returns:
        A ``(message, parties)`` tuple. ``parties`` is ``None`` when the file
        could not be analysed (missing, empty or unsupported), an empty list
        when the document is not a contract, and the result of
        ``extract_parties`` otherwise.
    """
    if file is None:
        # Submitting the form without an upload must not crash the handler.
        return "No file uploaded.", None

    text = read_file(file)
    if not text.strip():
        return "Empty or unreadable file.", None
    if text == "Unsupported file format":
        # read_file signals an unknown extension with this sentinel string;
        # report it instead of running the classifier on the message itself.
        return text, None

    is_contract_flag, _ = is_contract(text)
    # The status prefixes were mojibake ("β") in the source; restored here.
    # NOTE(review): confirm these are the intended symbols.
    if not is_contract_flag:
        return "❌ This is NOT a contract.", []
    return "✅ This is a contract.", extract_parties(text)
# Gradio interface
# One file upload in, two outputs out: the classification message and the
# detected parties.
iface = gr.Interface(
    fn=process_file,
    inputs=gr.File(file_types=[".txt", ".pdf", ".docx"], label="Upload a document"),
    outputs=[
        gr.Textbox(label="Classification Result"),
        # process_file returns a list of names (or None) here. gr.Label only
        # accepts a single label or a label->confidence dict, so a list would
        # be rejected; gr.JSON renders lists and None.
        gr.JSON(label="Detected Parties"),
    ],
    title="Contract Classifier with RoBERTa",
    description="Upload a document (.pdf, .txt, .docx) to detect if it's a contract and extract involved parties using RoBERTa.",
)

# Start the web server; this blocks while the app is being served.
iface.launch()