manish-aggarwal committed on
Commit
c1ccd2b
·
verified ·
1 Parent(s): 3faa170

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +59 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+ import PyPDF2
4
+ from docx import Document
5
+
6
# Load pipelines once at import time so every request reuses the same models.
# Zero-shot classifier used to decide whether a document is a contract.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# `aggregation_strategy="simple"` is the supported replacement for the
# deprecated `grouped_entities=True` flag: sub-word tokens are merged into
# whole entities and each result exposes an `entity_group` key.
ner = pipeline(
    "ner",
    model="Jean-Baptiste/roberta-large-ner-english",
    aggregation_strategy="simple",
)
9
+
10
+ # File reading
11
def read_file(file_obj):
    """Return the text content of an uploaded .txt, .pdf or .docx file.

    Args:
        file_obj: Either a filesystem path (``str``) or a file-like object
            exposing a ``name`` attribute. Both shapes are accepted because
            the upload widget may hand over either one depending on the
            Gradio version / ``type`` setting (NOTE(review): confirm against
            the installed Gradio release).

    Returns:
        The extracted text, or the literal string ``"Unsupported file format"``
        when the extension is not one of the three handled types.
    """
    # Lower-case the name so "REPORT.PDF" is treated like "report.pdf".
    name = str(getattr(file_obj, "name", file_obj)).lower()

    if name.endswith(".txt"):
        if hasattr(file_obj, "read"):
            raw = file_obj.read()
        else:
            # A bare path was supplied: open it ourselves and close it again.
            with open(file_obj, "rb") as handle:
                raw = handle.read()
        if isinstance(raw, bytes):
            # Replace undecodable bytes instead of failing on a stray byte.
            return raw.decode("utf-8", errors="replace")
        return raw
    elif name.endswith(".pdf"):
        reader = PyPDF2.PdfReader(file_obj)
        # Extract each page exactly once; pages yielding no text are skipped.
        page_texts = (page.extract_text() for page in reader.pages)
        return " ".join(chunk for chunk in page_texts if chunk)
    elif name.endswith(".docx"):
        doc = Document(file_obj)
        return "\n".join(para.text for para in doc.paragraphs)
    else:
        return "Unsupported file format"
23
+
24
+ # Contract classification
25
def is_contract(text):
    """Decide whether *text* looks like a contract.

    Only the first 1000 characters are sent to the module-level zero-shot
    ``classifier`` together with the two candidate labels.

    Returns:
        A ``(flag, result)`` tuple: ``flag`` is True when the first entry of
        the classifier's ``'labels'`` list is ``'contract'`` (presumably the
        highest-scoring label — confirm against the pipeline docs), and
        ``result`` is the raw classifier output.
    """
    snippet = text[:1000]
    candidate_labels = ["contract", "not a contract"]
    outcome = classifier(snippet, candidate_labels)
    top_label = outcome['labels'][0]
    return top_label == 'contract', outcome
28
+
29
+ # Party extraction
30
def extract_parties(text):
    """Return the distinct organisation and person names found in *text*.

    Only the first 1000 characters are passed to the module-level ``ner``
    pipeline. Entities whose ``entity_group`` is ``'ORG'`` or ``'PER'`` are
    kept.

    Returns:
        A list of unique names in order of first appearance.
    """
    entities = ner(text[:1000])
    # Entity words may carry surrounding whitespace from the tokenizer, which
    # would make " Acme" and "Acme" look like different parties; strip first.
    names = (
        ent['word'].strip()
        for ent in entities
        if ent['entity_group'] in ('ORG', 'PER')
    )
    # dict.fromkeys de-duplicates while keeping a stable, first-seen order
    # (a plain set would reorder the names from run to run).
    return list(dict.fromkeys(name for name in names if name))
33
+
34
+ # Main logic
35
def process_file(file):
    """Classify an uploaded document and list the parties if it is a contract.

    Args:
        file: The uploaded file as delivered by the UI: a path string, an
            object with a ``name`` attribute, or ``None`` when nothing was
            uploaded.

    Returns:
        A ``(message, parties)`` tuple. ``parties`` is the list produced by
        ``extract_parties`` for contracts and ``None`` whenever there are no
        parties to report (no file, unsupported type, empty text, or not a
        contract).
    """
    # Submitting the form without a file would otherwise fail on attribute
    # access inside read_file.
    if file is None:
        return "No file uploaded.", None

    # Reject unknown extensions up front; read_file signals them with an
    # error string that would otherwise be classified as document text.
    name = str(getattr(file, "name", file)).lower()
    if not name.endswith((".txt", ".pdf", ".docx")):
        return "Unsupported file format.", None

    text = read_file(file)
    if not text.strip():
        return "Empty or unreadable file.", None

    is_contract_flag, _ = is_contract(text)
    if not is_contract_flag:
        return "❌ This is NOT a contract.", None

    return "✅ This is a contract.", extract_parties(text)
46
+
47
+ # Gradio interface
48
# Web UI: one file upload in, a status message and the detected parties out.
iface = gr.Interface(
    fn=process_file,
    inputs=gr.File(file_types=[".txt", ".pdf", ".docx"], label="Upload a document"),
    outputs=[
        gr.Textbox(label="Classification Result"),
        # process_file returns the parties as a plain list of names (or None).
        # gr.Label only accepts a single label or a {label: confidence} dict
        # and raises on a list, so a JSON viewer is used to display them.
        gr.JSON(label="Detected Parties"),
    ],
    title="Contract Classifier with RoBERTa",
    description="Upload a document (.pdf, .txt, .docx) to detect if it's a contract and extract involved parties using RoBERTa.",
)

iface.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ torch
4
+ python-docx
5
+ PyPDF2