tdnathmlenthusiast committed
Commit 8c6bdbd · verified · Parent: 2ddd8a4

solved poppler-utils

Files changed (1): app.py (+123 -121)
app.py CHANGED
@@ -1,121 +1,123 @@
(Old side of the diff omitted: the only change in this commit is the Poppler setup added after the imports. The resulting app.py follows.)
import os
import fitz  # PyMuPDF
from paddleocr import PPStructure
from pdf2image import convert_from_path
import numpy as np
import json
import re
import spacy
from spacy.matcher import Matcher
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import gradio as gr
from tqdm.auto import tqdm

# Ensure Poppler is available (pdf2image shells out to its pdftoppm/pdftocairo tools)
os.system("apt-get update -y && apt-get install -y poppler-utils")
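# Note: the apt-get call assumes a root-writable image; on Hugging Face Spaces
# the declarative alternative is a packages.txt file at the repo root
# containing the single line "poppler-utils".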

# --- Initialization ---
structure_engine = PPStructure(table=True, ocr=True, layout=True)
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Regex & matcher setup
date_pattern = r"\d{2}-[A-Za-z]{3}-\d{2}|\d{2}\.\d{2}\.\d{2}"  # e.g. 01-Jan-24 or 01.01.24
party_pattern = r"M/s [A-Za-z\s&-]+(?:Consortium)?"  # e.g. "M/s Foo & Bar Consortium"
# "claimant", optional punctuation, then an ORG entity
pattern = [{"LOWER": "claimant"}, {"IS_PUNCT": True, "OP": "?"}, {"ENT_TYPE": "ORG"}]
matcher.add("CLAIMANT", [pattern])

# Load Legal-BERT pipelines
ner_model = "nlpaueb/legal-bert-base-uncased"
token_model = AutoModelForTokenClassification.from_pretrained(ner_model)
tokenizer = AutoTokenizer.from_pretrained(ner_model)
ner_pipeline = pipeline("ner", model=token_model, tokenizer=tokenizer, aggregation_strategy="simple")
clf_pipeline = pipeline("text-classification", model=ner_model)
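# Caveat: legal-bert-base-uncased is a masked-LM checkpoint, so the NER and
# text-classification heads built above start out randomly initialized;
# fine-tuned checkpoints are needed for meaningful entity/label outputs.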

# Helper functions
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    pages = []
    for i in range(len(doc)):
        page = doc[i]
        pages.append({"page": i + 1, "text": page.get_text("text") or ""})
    doc.close()
    return pages


def extract_content_from_images(pdf_path):
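    # Rasterize each page with pdf2image (the call that needs Poppler), then run
    # PaddleOCR structure analysis to pull out text lines and HTML tables.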
    images = convert_from_path(pdf_path)
    results = []
    for i, img in enumerate(images, start=1):
        img_np = np.array(img)
        res = structure_engine(img_np)
        text_lines, tables = [], []
        for block in res:
            if block['type'] == 'text':
                text_lines += [line['text'] for line in block['res'] if 'text' in line]
            elif block['type'] == 'table' and 'html' in block['res']:
                tables.append(block['res']['html'])
        results.append({"page": i, "ocr_text": " ".join(text_lines), "tables_html": tables})
    return results


def extract_metadata(text):
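    # Layered extraction: regexes for dates/parties, spaCy NER plus the rule-based
    # Matcher, Legal-BERT NER, and finally per-sentence clause classification.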
    meta = {"dates": [], "parties": [], "claimants": [], "tribunals": [], "relationships": [], "clauses": []}
    # Regex
    meta['dates'] = re.findall(date_pattern, text)
    meta['parties'] = re.findall(party_pattern, text)
    # SpaCy
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == 'ORG' and ent.text not in meta['parties']:
            meta['parties'].append(ent.text)
        if ent.label_ == 'GPE':
            meta['tribunals'].append(ent.text)
    for match_id, start, end in matcher(doc):
        meta['claimants'].append(doc[start:end].text)
    # Legal-BERT NER
    for ent in ner_pipeline(text):
        grp = ent['entity_group']
        if grp in ('ORG', 'PARTY') and ent['word'] not in meta['parties']:
            meta['parties'].append(ent['word'])
        if grp == 'GPE' and ent['word'] not in meta['tribunals']:
            meta['tribunals'].append(ent['word'])
    # Clause classification (naive sentence split on '. ')
    for sent in text.split('. '):
        if len(sent) < 10:
            continue
        try:
            res = clf_pipeline(sent)[0]
            if res['score'] > 0.7:
                meta['clauses'].append({'type': res['label'], 'text': sent})
        except Exception:
            # Skip sentences the classifier cannot handle (e.g. over-long inputs)
            pass
    return meta


def process_pdf(file_obj):
    # Gradio hands us the uploaded file already saved to disk; use its path
    pdf_path = file_obj.name
    # 1. Embedded text layer
    text_pages = extract_text_from_pdf(pdf_path)
    # 2. OCR & tables
    img_content = extract_content_from_images(pdf_path)
    # 3. Metadata per page
    metadata = []
    for page in text_pages:
        metadata.append({"page": page['page'], "metadata": extract_metadata(page['text'])})
    # Combine
    output = {
        "text_pages": text_pages,
        "image_content": img_content,
        "metadata": metadata
    }
    return output

# Gradio Interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF", file_types=['.pdf']),
    outputs=gr.JSON(label="Extraction Result"),
    title="PDF OCR & Metadata Extractor",
    description="Upload a PDF, wait for processing, and view structured JSON output including text, OCR, tables, and metadata."
)
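# Note: multi-page PDFs can take minutes to process; enabling iface.queue()
# before launch() would keep long requests from hitting HTTP timeouts.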

if __name__ == '__main__':
    iface.launch()
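
For a quick smoke test outside the Gradio UI, process_pdf can be called directly. A minimal sketch, assuming app.py is importable and a local sample.pdf exists (the filename and the stand-in upload object are hypothetical):

import json
from types import SimpleNamespace

from app import process_pdf

# process_pdf only reads .name, so a SimpleNamespace mimics Gradio's upload object
result = process_pdf(SimpleNamespace(name="sample.pdf"))
print(json.dumps(result, indent=2)[:500])  # preview the structured JSON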