SalmanSakibSrizon committed on
Commit
de2d856
·
1 Parent(s): 8faf6d7

Text extraction added + initiate Gradio interface

Browse files
Files changed (3) hide show
  1. app.py +0 -0
  2. pdf_summarization.ipynb +227 -0
  3. requirements.txt +7 -0
app.py ADDED
File without changes
pdf_summarization.ipynb ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
13
+ " _torch_pytree._register_pytree_node(\n",
14
+ "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
15
+ " _torch_pytree._register_pytree_node(\n"
16
+ ]
17
+ }
18
+ ],
19
+ "source": [
20
+ "import gradio as gr\n",
21
+ "import PyPDF2\n",
22
+ "from PyPDF2 import PdfReader\n",
23
+ "from io import BytesIO\n",
24
+ "import pytesseract\n",
25
+ "from PIL import Image\n",
26
+ "import spacy\n",
27
+ "import json\n",
28
+ "\n",
29
+ "from transformers import pipeline\n",
30
+ "from PyPDF2 import PdfReader"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 6,
36
+ "metadata": {},
37
+ "outputs": [
38
+ {
39
+ "name": "stderr",
40
+ "output_type": "stream",
41
+ "text": [
42
+ "Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n",
43
+ "- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
44
+ "- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
45
+ "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:149: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\salma\\.cache\\huggingface\\hub\\models--facebook--bart-large-cnn. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
46
+ "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
47
+ " warnings.warn(message)\n",
48
+ "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:149: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\salma\\.cache\\huggingface\\hub\\models--d4data--biomedical-ner-all. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
49
+ "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
50
+ " warnings.warn(message)\n",
51
+ "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:149: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\salma\\.cache\\huggingface\\hub\\models--dslim--bert-base-NER. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
52
+ "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
53
+ " warnings.warn(message)\n"
54
+ ]
55
+ }
56
+ ],
57
+ "source": [
58
+ "# initiate model\n",
59
+ "ner_model = pipeline(\"token-classification\", model=\"dslim/bert-large-NER\")\n",
60
+ "summarization_pipeline = pipeline(\"summarization\", model=\"facebook/bart-large-cnn\")\n",
61
+ "ner_models = {\n",
62
+ " \"bert-large-NER\": \"dslim/bert-large-NER\",\n",
63
+ " \"bioNER\": \"d4data/biomedical-ner-all\",\n",
64
+ " \"SpaCy English NER\": \"en_core_web_trf\",\n",
65
+ "}\n",
66
+ "spacy_ner_model = spacy.load(ner_models[\"SpaCy English NER\"])\n",
67
+ "ner_model_bio = pipeline(\"token-classification\", model=\"d4data/biomedical-ner-all\")\n",
68
+ "\n",
69
+ "from transformers import AutoTokenizer\n",
70
+ "\n",
71
+ "tokenizer = AutoTokenizer.from_pretrained(\"dslim/bert-base-NER\")\n",
72
+ "from spacy import displacy"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": 12,
78
+ "metadata": {},
79
+ "outputs": [],
80
+ "source": [
81
+ "# Extracting text from pdf & image file\n",
82
+ "\n",
83
+ "\n",
84
+ "def extract_text_from_pdf(pdf_bytes):\n",
85
+ " text = \"\"\n",
86
+ " pdf_file = BytesIO(pdf_bytes)\n",
87
+ " pdf_reader = PdfReader(pdf_file)\n",
88
+ "\n",
89
+ " for page_number in range(len(pdf_reader.pages)):\n",
90
+ " page = pdf_reader.pages[page_number]\n",
91
+ " text += page.extract_text()\n",
92
+ " return text\n",
93
+ "\n",
94
+ "\n",
95
+ "def extract_text_from_image_or_pdf(file_bytes):\n",
96
+ " try:\n",
97
+ " if file_bytes.startswith(b'%PDF'):\n",
98
+ " text = extract_text_from_pdf(file_bytes)\n",
99
+ " print(text)\n",
100
+ " else:\n",
101
+ " image = Image.open(BytesIO(file_bytes))\n",
102
+ " text = pytesseract.image_to_string(image)\n",
103
+ "\n",
104
+ " return text\n",
105
+ " except Exception as e:\n",
106
+ " return f\"Error extracting file\""
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": 13,
112
+ "metadata": {},
113
+ "outputs": [],
114
+ "source": [
115
+ "def image_ner_tool(file, model_name):\n",
116
+ " reformatted_ner_output = \"\"\n",
117
+ " try:\n",
118
+ " if isinstance(file, str):\n",
119
+ " with open(file, \"rb\") as file_stream:\n",
120
+ " file_bytes = file_stream.read()\n",
121
+ " else:\n",
122
+ " file_bytes = file.getvalue()\n",
123
+ " text = extract_text_from_image_or_pdf(file_bytes)\n",
124
+ " return text\n",
125
+ " except Exception as e:\n",
126
+ " error_message = f\"Error processing file:{str(e)}\"\n",
127
+ " return error_message, \"\", reformatted_ner_output"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": 15,
133
+ "metadata": {},
134
+ "outputs": [
135
+ {
136
+ "name": "stdout",
137
+ "output_type": "stream",
138
+ "text": [
139
+ "Running on local URL: http://127.0.0.1:7864\n",
140
+ "\n",
141
+ "Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.\n"
142
+ ]
143
+ },
144
+ {
145
+ "data": {
146
+ "text/html": [
147
+ "<div><iframe src=\"http://127.0.0.1:7864/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
148
+ ],
149
+ "text/plain": [
150
+ "<IPython.core.display.HTML object>"
151
+ ]
152
+ },
153
+ "metadata": {},
154
+ "output_type": "display_data"
155
+ },
156
+ {
157
+ "data": {
158
+ "text/plain": []
159
+ },
160
+ "execution_count": 15,
161
+ "metadata": {},
162
+ "output_type": "execute_result"
163
+ }
164
+ ],
165
+ "source": [
166
+ "# Gradio\n",
167
+ "with gr.Blocks() as demo:\n",
168
+ " gr.Markdown(\n",
169
+ " \"\"\"\n",
170
+ " <p style=\"text-align: center; font-weight: bold; font-size: 44px;\">\n",
171
+ " Intelligent Document Processing\n",
172
+ " </p>\n",
173
+ " <p style=\"text-align: center;\">\n",
174
+ " Upload a PDF or an image file to extract text and identify named entities\n",
175
+ " </p>\n",
176
+ " \"\"\"\n",
177
+ " )\n",
178
+ "\n",
179
+ " with gr.Row() as row:\n",
180
+ " with gr.Column():\n",
181
+ " text1 = gr.File(label=\"Upload File\")\n",
182
+ " model = gr.Dropdown(list(ner_models.keys()), label=\"Select NER Model\")\n",
183
+ " btn = gr.Button(\"Submit\")\n",
184
+ "\n",
185
+ " with gr.Column():\n",
186
+ " with gr.Tab(\"Extract Text\"):\n",
187
+ " output1 = gr.Textbox(label=\"Extracted Text\", container=True)\n",
188
+ "\n",
189
+ " btn.click(\n",
190
+ " image_ner_tool,\n",
191
+ " [text1, model],\n",
192
+ " [output1],\n",
193
+ " )\n",
194
+ "\n",
195
+ "demo.launch(share=True)"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": null,
201
+ "metadata": {},
202
+ "outputs": [],
203
+ "source": []
204
+ }
205
+ ],
206
+ "metadata": {
207
+ "kernelspec": {
208
+ "display_name": ".venv",
209
+ "language": "python",
210
+ "name": "python3"
211
+ },
212
+ "language_info": {
213
+ "codemirror_mode": {
214
+ "name": "ipython",
215
+ "version": 3
216
+ },
217
+ "file_extension": ".py",
218
+ "mimetype": "text/x-python",
219
+ "name": "python",
220
+ "nbconvert_exporter": "python",
221
+ "pygments_lexer": "ipython3",
222
+ "version": "3.11.8"
223
+ }
224
+ },
225
+ "nbformat": 4,
226
+ "nbformat_minor": 2
227
+ }
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ PyPDF2
2
+ transformers
3
+ pytesseract
4
+ torch
5
+ spacy-transformers
6
+ gradio
7
+ https://huggingface.co/spacy/en_core_web_trf/resolve/main/en_core_web_trf-any-py3-none-any.whl