Commit de2d856
Parent(s): 8faf6d7

Text extract added + initiate gradio interface

Browse files:
- app.py +0 -0
- pdf_summarization.ipynb +227 -0
- requirements.txt +7 -0
app.py
ADDED
File without changes (added as an empty file)
pdf_summarization.ipynb
ADDED
@@ -0,0 +1,227 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
+      "  _torch_pytree._register_pytree_node(\n",
+      "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
+      "  _torch_pytree._register_pytree_node(\n"
+     ]
+    }
+   ],
+   "source": [
+    "import gradio as gr\n",
+    "import PyPDF2\n",
+    "from PyPDF2 import PdfReader\n",
+    "from io import BytesIO\n",
+    "import pytesseract\n",
+    "from PIL import Image\n",
+    "import spacy\n",
+    "import json\n",
+    "\n",
+    "from transformers import pipeline\n",
+    "from PyPDF2 import PdfReader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n",
+      "- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+      "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:149: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\salma\\.cache\\huggingface\\hub\\models--facebook--bart-large-cnn. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+      "  warnings.warn(message)\n",
+      "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:149: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\salma\\.cache\\huggingface\\hub\\models--d4data--biomedical-ner-all. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+      "  warnings.warn(message)\n",
+      "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:149: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\salma\\.cache\\huggingface\\hub\\models--dslim--bert-base-NER. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+      "  warnings.warn(message)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# initiate model\n",
+    "ner_model = pipeline(\"token-classification\", model=\"dslim/bert-large-NER\")\n",
+    "summarization_pipeline = pipeline(\"summarization\", model=\"facebook/bart-large-cnn\")\n",
+    "ner_models = {\n",
+    "    \"bert-large-NER\": \"dslim/bert-large-NER\",\n",
+    "    \"bioNER\": \"d4data/biomedical-ner-all\",\n",
+    "    \"SpaCy English NER\": \"en_core_web_trf\",\n",
+    "}\n",
+    "spacy_ner_model = spacy.load(ner_models[\"SpaCy English NER\"])\n",
+    "ner_model_bio = pipeline(\"token-classification\", model=\"d4data/biomedical-ner-all\")\n",
+    "\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"dslim/bert-base-NER\")\n",
+    "from spacy import displacy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extracting text from pdf & image file\n",
+    "\n",
+    "\n",
+    "def extract_text_from_pdf(pdf_bytes):\n",
+    "    text = \"\"\n",
+    "    pdf_file = BytesIO(pdf_bytes)\n",
+    "    pdf_reader = PdfReader(pdf_file)\n",
+    "\n",
+    "    for page_number in range(len(pdf_reader.pages)):\n",
+    "        page = pdf_reader.pages[page_number]\n",
+    "        text += page.extract_text()\n",
+    "    return text\n",
+    "\n",
+    "\n",
+    "def extract_text_from_image_or_pdf(file_bytes):\n",
+    "    try:\n",
+    "        if file_bytes.startswith(b'%PDF'):\n",
+    "            text = extract_text_from_pdf(file_bytes)\n",
+    "            print(text)\n",
+    "        else:\n",
+    "            image = Image.open(BytesIO(file_bytes))\n",
+    "            text = pytesseract.image_to_string(image)\n",
+    "\n",
+    "        return text\n",
+    "    except Exception as e:\n",
+    "        return f\"Error extracting file\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def image_ner_tool(file, model_name):\n",
+    "    reformatted_ner_output = \"\"\n",
+    "    try:\n",
+    "        if isinstance(file, str):\n",
+    "            with open(file, \"rb\") as file_stream:\n",
+    "                file_bytes = file_stream.read()\n",
+    "        else:\n",
+    "            file_bytes = file.getvalue()\n",
+    "        text = extract_text_from_image_or_pdf(file_bytes)\n",
+    "        return text\n",
+    "    except Exception as e:\n",
+    "        error_message = f\"Error processing file:{str(e)}\"\n",
+    "        return error_message, \"\", reformatted_ner_output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on local URL: http://127.0.0.1:7864\n",
+      "\n",
+      "Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"http://127.0.0.1:7864/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": []
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Gradio\n",
+    "with gr.Blocks() as demo:\n",
+    "    gr.Markdown(\n",
+    "        \"\"\"\n",
+    "        <p style=\"text-align: center; font-weight: bold; font-size: 44px;\">\n",
+    "            Intelligent Document Processing\n",
+    "        </p>\n",
+    "        <p style=\"text-align: center;\">\n",
+    "            Upload a PDF or an image file to extract text and identify named entities\n",
+    "        </p>\n",
+    "        \"\"\"\n",
+    "    )\n",
+    "\n",
+    "    with gr.Row() as row:\n",
+    "        with gr.Column():\n",
+    "            text1 = gr.File(label=\"Upload File\")\n",
+    "            model = gr.Dropdown(list(ner_models.keys()), label=\"Select NER Model\")\n",
+    "            btn = gr.Button(\"Submit\")\n",
+    "\n",
+    "        with gr.Column():\n",
+    "            with gr.Tab(\"Extract Text\"):\n",
+    "                output1 = gr.Textbox(label=\"Extracted Text\", container=True)\n",
+    "\n",
+    "    btn.click(\n",
+    "        image_ner_tool,\n",
+    "        [text1, model],\n",
+    "        [output1],\n",
+    "    )\n",
+    "\n",
+    "demo.launch(share=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
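Note that app.py is still empty in this commit, so the notebook is the only runnable entry point. As a rough standalone sketch of the extraction path the notebook wires into Gradio (assuming the PyPDF2, pytesseract, and Pillow packages implied by requirements.txt, and a hypothetical local file sample.pdf that is not part of this commit):

# Standalone sketch, not part of this commit: the notebook's PDF/image text
# extraction without the Gradio wrapper. "sample.pdf" is a hypothetical input.
from io import BytesIO

import pytesseract
from PIL import Image
from PyPDF2 import PdfReader


def extract_text_from_image_or_pdf(file_bytes: bytes) -> str:
    """Route PDF bytes through PyPDF2 and anything else through Tesseract OCR."""
    if file_bytes.startswith(b"%PDF"):
        reader = PdfReader(BytesIO(file_bytes))
        # extract_text() can return None on image-only pages, hence the "or ''"
        return "".join(page.extract_text() or "" for page in reader.pages)
    image = Image.open(BytesIO(file_bytes))
    return pytesseract.image_to_string(image)


if __name__ == "__main__":
    with open("sample.pdf", "rb") as fh:  # hypothetical sample document
        print(extract_text_from_image_or_pdf(fh.read()))

Unlike the committed helper, this sketch lets exceptions propagate rather than returning an "Error extracting file" string, so failures surface while the interface is still being built.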
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+PyPDF2
+transformers
+pytesseract
+torch
+spacy-transformers
+gradio
+https://huggingface.co/spacy/en_core_web_trf/resolve/main/en_core_web_trf-any-py3-none-any.whl
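The requirements are unpinned, and the last line installs the spaCy en_core_web_trf transformer model straight from its wheel URL. A small sanity-check script (a sketch under the assumption that the install above succeeded; the model names are the ones hard-coded in the notebook) can confirm the environment before launching the Gradio demo:

# Environment sanity check, a sketch rather than part of this commit: confirm
# that the models the notebook hard-codes can be loaded with these requirements.
import spacy
from transformers import pipeline

# spaCy model installed from the wheel URL in requirements.txt
nlp = spacy.load("en_core_web_trf")

# Hugging Face pipelines used in the notebook; the first call downloads weights
ner = pipeline("token-classification", model="dslim/bert-large-NER")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

print([(ent.text, ent.label_) for ent in nlp("Steve Jobs founded Apple in California.").ents])
print(ner("Hugging Face is based in New York City."))
print(summarizer(
    "PyPDF2 reads text from digital PDFs, while pytesseract runs Tesseract "
    "OCR on scanned images so both kinds of documents can be summarized.",
    max_length=30, min_length=5, do_sample=False,
))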