SalmanSakibSrizon commited on
Commit
63e3895
·
1 Parent(s): 18438fa

added all functions to app.py

Browse files
Files changed (4) hide show
  1. .gitignore +2 -1
  2. README.md +8 -7
  3. app.py +170 -3
  4. pdf_summarization.ipynb +272 -21
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  .venv
2
- .md
 
 
1
  .venv
2
+ *.md
3
+ *.ipynb
README.md CHANGED
@@ -1,9 +1,10 @@
 
 
1
  title: Pdf Image Ner Application
2
- emoji: 📚
3
- colorFrom: indigo
4
- colorTo: pink
5
  sdk: gradio
6
- sdk_version: 4.19.2
7
- app_file: app.py
8
- pinned: false
9
- license: apache-2.0
 
 
 
1
+ ---
2
+ license: apache-2.0
3
  title: Pdf Image Ner Application
 
 
 
4
  sdk: gradio
5
+ emoji: 📚
6
+ colorFrom: yellow
7
+ colorTo: green
8
+ pinned: true
9
+ short_description: Pdf Image Ner Application
10
+ ---
app.py CHANGED
@@ -56,6 +56,149 @@ def extract_text_from_image_or_pdf(file_bytes):
56
  return f"Error extracting file"
57
 
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  def image_ner_tool(file, model_name):
60
  reformatted_ner_output = ""
61
  try:
@@ -65,14 +208,32 @@ def image_ner_tool(file, model_name):
65
  else:
66
  file_bytes = file.getvalue()
67
  text = extract_text_from_image_or_pdf(file_bytes)
68
- return text
 
 
 
 
 
 
 
69
  except Exception as e:
70
  error_message = f"Error processing file:{str(e)}"
71
  return error_message, "", reformatted_ner_output
72
 
73
 
74
  # Gradio
75
- with gr.Blocks() as demo:
 
 
 
 
 
 
 
 
 
 
 
76
  gr.Markdown(
77
  """
78
  <p style="text-align: center; font-weight: bold; font-size: 44px;">
@@ -93,11 +254,17 @@ with gr.Blocks() as demo:
93
  with gr.Column():
94
  with gr.Tab("Extract Text"):
95
  output1 = gr.Textbox(label="Extracted Text", container=True)
 
 
 
 
 
 
96
 
97
  btn.click(
98
  image_ner_tool,
99
  [text1, model],
100
- [output1],
101
  )
102
 
103
  demo.launch(share=True)
 
56
  return f"Error extracting file"
57
 
58
 
59
# Performs Named Entity Recognition (NER) on the given text.
def perform_ner(text, model_name):
    """Extract named entities from *text* using the model selected by name.

    Args:
        text: Raw input text to analyze.
        model_name: "SpaCy English NER" for the spaCy pipeline,
            "bert-large-NER" for the BERT token-classification pipeline;
            anything else falls back to the biomedical NER pipeline.

    Returns:
        A list of dicts with keys "text", "type", "start_index" and
        "end_index" — or an error string if extraction fails.
    """
    try:
        if model_name == "SpaCy English NER":
            doc = spacy_ner_model(text)
            return [
                {
                    "text": ent.text,
                    "type": ent.label_,
                    "start_index": ent.start_char,
                    "end_index": ent.end_char,
                }
                for ent in doc.ents
            ]

        # Both HuggingFace pipelines emit the same output schema, so the two
        # transformer branches differ only in which pipeline is invoked.
        pipeline = ner_model if model_name == "bert-large-NER" else ner_model_bio
        entities = pipeline(text)
        return [
            {
                "text": entity["word"],
                "type": entity["entity"],
                "start_index": entity["start"],
                "end_index": entity["end"],
            }
            for entity in entities
        ]
    except Exception as e:
        # NOTE(review): returning a string on failure forces callers to
        # type-check the result; kept as-is for interface compatibility.
        return f"Error Performing NER: {str(e)}"
99
+
100
+
101
# Takes raw text plus a list of entities (with start/end indices) and wraps
# each entity in a colored, labeled <mark> tag from the assigned color mapping.
def highlight_entities_with_colors_and_label_tokenized(text, entities, color_mapping, tokenizer):
    """Return *text* as HTML with every entity highlighted and labeled.

    Args:
        text: Source text that the entity indices refer to.
        entities: Iterable of dicts with "start_index", "end_index" and
            "type"; assumed sorted by start index and non-overlapping —
            TODO confirm against perform_ner output.
        color_mapping: Maps entity type -> CSS color; unknown types fall
            back to "#4D94FF".
        tokenizer: Unused (the original tokenized each entity and discarded
            the result); parameter kept so existing call sites still work.

    Returns:
        An HTML string with a <mark> span around each entity.
    """
    pieces = []
    cursor = 0

    for ent in entities:
        start = ent.get("start_index", 0)
        end = ent.get("end_index", 0)
        label = ent.get("type", "0")
        entity_text = text[start:end]

        # Plain text between the previous entity and this one.
        pieces.append(text[cursor:start])

        # Highlighted entity with its label; unknown labels use the default color.
        # NOTE(review): entity_text/label are not HTML-escaped — fine for
        # trusted model output, but confirm before feeding untrusted input.
        color = color_mapping.get(label, "#4D94FF")
        pieces.append(
            f"<mark style='background-color:{color}' title='{label}'>{entity_text} ({label})</mark>"
        )

        cursor = end

    # Any remaining non-entity text after the last entity.
    pieces.append(text[cursor:])
    return "".join(pieces)
132
+
133
+
134
# Highlight named entities in the text using the model-appropriate color mapping.
def highlight_entities(text, entities, model_name):
    """Render the named entities found in *text* as colored HTML.

    SpaCy results are rendered through displacy's entity visualizer;
    transformer (BIO-tagged) results go through the local <mark>-based
    highlighter. On failure an error string is returned instead of HTML.
    """
    try:
        if model_name == "SpaCy English NER":
            # Per-label colors for the spaCy entity renderer.
            spacy_colors = {
                "DATE": "#4D94FF",  # Blue
                "PERSON": "#4CAF50",  # Green
                "EVENT": "#FF6666",  # Salmon
                "FAC": "#66B2FF",  # Sky Blue
                "GPE": "#FFCC99",  # Light Apricot
                "LANGUAGE": "#FF80BF",  # Pink
                "LAW": "#66FF99",  # Mint
                "LOC": "#809FFF",  # Lavender Blue
                "MONEY": "#FFFF99",  # Light Yellow
                "NORP": "#808000",  # Olive Green
                "ORDINAL": "#FF9999",  # Misty Rose
                "ORG": "#FFB366",  # Light Peach
                "PERCENT": "#FF99FF",  # Orchid
                "PRODUCT": "#FF6666",  # Salmon
                "QUANTITY": "#CC99FF",  # Pastel Purple
                "TIME": "#FFD54F",  # Amber
                "WORK_OF_ART": "#FFC266",  # Light Orange
                "CARDINAL": "#008080",  # Teal
            }

            parsed = spacy_ner_model(text)
            render_options = {
                "ents": [entity["type"] for entity in entities],
                "colors": spacy_colors,
            }
            # displacy returns a full HTML page (page=True) ready for gr.HTML.
            return displacy.render(parsed, style="ent", options=render_options, page=True)

        # BIO-tag colors shared by the transformer-based pipelines.
        bio_colors = {
            "O": "pink",
            "B-MIS": "red",
            "I-MIS": "brown",
            "B-PER": "green",
            "I-PER": "#FFD54F",
            "B-ORG": "orange",
            "I-ORG": "#FF6666",
            "B-LOC": "purple",
            "I-LOC": "#FFCC99",
        }
        return highlight_entities_with_colors_and_label_tokenized(
            text, entities, bio_colors, tokenizer
        )
    except Exception as e:
        return f"Error highlighted entities: {str(e)}"
185
+
186
+
187
# Summarize text with the configured summarization pipeline.
def summarized_text(input_text):
    """Return a beam-search abstractive summary (50–150 tokens) of *input_text*."""
    result = summarization_pipeline(
        input_text,
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    # The pipeline returns a list of result dicts; only the text is needed.
    return result[0]["summary_text"]
200
+
201
+
202
  def image_ner_tool(file, model_name):
203
  reformatted_ner_output = ""
204
  try:
 
208
  else:
209
  file_bytes = file.getvalue()
210
  text = extract_text_from_image_or_pdf(file_bytes)
211
+ entities = perform_ner(text, model_name)
212
+ highlighted_text = highlight_entities(text, entities, model_name)
213
+
214
+ reformatted_ner_output = json.dumps(entities, indent=2)
215
+
216
+ summary = summarized_text(text)
217
+
218
+ return text, highlighted_text, reformatted_ner_output, summary
219
  except Exception as e:
220
  error_message = f"Error processing file:{str(e)}"
221
  return error_message, "", reformatted_ner_output
222
 
223
 
224
  # Gradio
225
+
226
+ # # adding custom css
227
+ # css ="""
228
+
229
+ # """
230
+
231
+ # # adding custom js
232
+ # js ="""
233
+
234
+ # """
235
+
236
+ with gr.Blocks(theme='shivi/calm_seafoam') as demo:
237
  gr.Markdown(
238
  """
239
  <p style="text-align: center; font-weight: bold; font-size: 44px;">
 
254
  with gr.Column():
255
  with gr.Tab("Extract Text"):
256
  output1 = gr.Textbox(label="Extracted Text", container=True)
257
+ with gr.Tab("Highlighted Entitiled"):
258
+ output2 = gr.HTML("Summarize Text")
259
+ with gr.Tab("Summarized Text"):
260
+ output3 = gr.HTML("Summarize Text")
261
+ with gr.Tab("Named Entities Extracted"):
262
+ output4 = gr.HTML(label="Named Entities")
263
 
264
  btn.click(
265
  image_ner_tool,
266
  [text1, model],
267
+ [output1, output2, output4, output3],
268
  )
269
 
270
  demo.launch(share=True)
pdf_summarization.ipynb CHANGED
@@ -2,13 +2,15 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 4,
6
  "metadata": {},
7
  "outputs": [
8
  {
9
  "name": "stderr",
10
  "output_type": "stream",
11
  "text": [
 
 
12
  "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
13
  " _torch_pytree._register_pytree_node(\n",
14
  "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
@@ -32,7 +34,7 @@
32
  },
33
  {
34
  "cell_type": "code",
35
- "execution_count": 6,
36
  "metadata": {},
37
  "outputs": [
38
  {
@@ -41,16 +43,7 @@
41
  "text": [
42
  "Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n",
43
  "- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
44
- "- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
45
- "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:149: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\salma\\.cache\\huggingface\\hub\\models--facebook--bart-large-cnn. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
46
- "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
47
- " warnings.warn(message)\n",
48
- "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:149: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\salma\\.cache\\huggingface\\hub\\models--d4data--biomedical-ner-all. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
49
- "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
50
- " warnings.warn(message)\n",
51
- "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:149: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\salma\\.cache\\huggingface\\hub\\models--dslim--bert-base-NER. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
52
- "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
53
- " warnings.warn(message)\n"
54
  ]
55
  }
56
  ],
@@ -74,7 +67,7 @@
74
  },
75
  {
76
  "cell_type": "code",
77
- "execution_count": 12,
78
  "metadata": {},
79
  "outputs": [],
80
  "source": [
@@ -108,7 +101,217 @@
108
  },
109
  {
110
  "cell_type": "code",
111
- "execution_count": 13,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  "metadata": {},
113
  "outputs": [],
114
  "source": [
@@ -121,7 +324,14 @@
121
  " else:\n",
122
  " file_bytes = file.getvalue()\n",
123
  " text = extract_text_from_image_or_pdf(file_bytes)\n",
124
- " return text\n",
 
 
 
 
 
 
 
125
  " except Exception as e:\n",
126
  " error_message = f\"Error processing file:{str(e)}\"\n",
127
  " return error_message, \"\", reformatted_ner_output"
@@ -129,22 +339,23 @@
129
  },
130
  {
131
  "cell_type": "code",
132
- "execution_count": 15,
133
  "metadata": {},
134
  "outputs": [
135
  {
136
  "name": "stdout",
137
  "output_type": "stream",
138
  "text": [
139
- "Running on local URL: http://127.0.0.1:7864\n",
 
140
  "\n",
141
- "Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.\n"
142
  ]
143
  },
144
  {
145
  "data": {
146
  "text/html": [
147
- "<div><iframe src=\"http://127.0.0.1:7864/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
148
  ],
149
  "text/plain": [
150
  "<IPython.core.display.HTML object>"
@@ -157,13 +368,47 @@
157
  "data": {
158
  "text/plain": []
159
  },
160
- "execution_count": 15,
161
  "metadata": {},
162
  "output_type": "execute_result"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  }
164
  ],
165
  "source": [
166
  "# Gradio\n",
 
 
 
 
 
 
 
 
 
 
 
167
  "with gr.Blocks() as demo:\n",
168
  " gr.Markdown(\n",
169
  " \"\"\"\n",
@@ -185,11 +430,17 @@
185
  " with gr.Column():\n",
186
  " with gr.Tab(\"Extract Text\"):\n",
187
  " output1 = gr.Textbox(label=\"Extracted Text\", container=True)\n",
 
 
 
 
 
 
188
  "\n",
189
  " btn.click(\n",
190
  " image_ner_tool,\n",
191
  " [text1, model],\n",
192
- " [output1],\n",
193
  " )\n",
194
  "\n",
195
  "demo.launch(share=True)"
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 1,
6
  "metadata": {},
7
  "outputs": [
8
  {
9
  "name": "stderr",
10
  "output_type": "stream",
11
  "text": [
12
+ "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from .autonotebook import tqdm as notebook_tqdm\n",
14
  "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
15
  " _torch_pytree._register_pytree_node(\n",
16
  "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
 
34
  },
35
  {
36
  "cell_type": "code",
37
+ "execution_count": 2,
38
  "metadata": {},
39
  "outputs": [
40
  {
 
43
  "text": [
44
  "Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n",
45
  "- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
46
+ "- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
 
 
 
 
 
 
 
 
 
47
  ]
48
  }
49
  ],
 
67
  },
68
  {
69
  "cell_type": "code",
70
+ "execution_count": 3,
71
  "metadata": {},
72
  "outputs": [],
73
  "source": [
 
101
  },
102
  {
103
  "cell_type": "code",
104
+ "execution_count": 27,
105
+ "metadata": {},
106
+ "outputs": [
107
+ {
108
+ "name": "stdout",
109
+ "output_type": "stream",
110
+ "text": [
111
+ "SALMANSAKIB\n",
112
+ "Flat-1104,Building-18D,(Chameli),RajukUttaraApartmentproject,Sector-18,Uttara-1230,Dhaka,Bangladesh(+880)1682359817||[email protected]||website||Github||Linkedin\n",
113
+ "Highlymotivatedandresults-orientedDataAnalystwith6+yearsofexperienceinthee-commerceand\u0000\u0000ntechindustries.Possessesastrongunderstandingofdataanalysisprinciples,dataminingtechniques,andpro\u0000\u0000ciencyinvariousdataanalysistoolsandsoftware.Provenabilitytocollect,clean,analyze,andinterpretlargedatasetstogenerateactionableinsightsandsupportinformeddecision-making.\n",
114
+ "PROFESSIONALEXPERIENCE\n",
115
+ "DarazBangladeshLtdSeniorDataAnalystDhaka,BangladeshOct2022–Present\n",
116
+ "●Data-drivenstrategydevelopment:Leverageinsightstobuildatargeted,e\u0000\u0000\u0000cient,andimpactfulbusinessstrategy.\n",
117
+ "●Segmentation:Customer,market,product,etc.(segmentationbasedonpurchase,buyingpatterns).Whichresultedina21%dropinCAC,Quarterlyaround10%incrementinCR.\n",
118
+ "●ChannelPerformance:Analyze&trackperformanceacrossvariouschannels(e.g.,socialmedia,website,CRM)\n",
119
+ "●ContentStrategy:Evaluate&strategizecontente\u0000fectiveness&userengagement(SEO,CR,CPO,PPC,CAC)UCBFintechCo.LtdSeniorStrategyAnalystDhaka,BangladeshOct2021–Sept2022\n",
120
+ "●Strategy&Processdevelopment:Developlong-termvision&goals,SWOTanalysis,objectivesetting,performanceevaluation,Data-drivendecision-makingforstrategyimprovement\n",
121
+ "●Decision-making&resourceallocation:Translatestrategyintoactionabletasks&resourceallocationthatincreaseddailytotal\u0000\u0000nancialtransactionvolumeby56%,Customeracquisitionby40%DarazBangladeshLtdSeniorDataAnalystDhaka,BangladeshOct2017–Sept2021\n",
122
+ "●E\u0000\u0000\u0000cientstrategyexecution:Bridgethegapbetweenstrategy&action.\n",
123
+ "●Data-drivendecision-making:Informbusinessdecisionswithinsightsfromproposals&campaignanalysis.\n",
124
+ "●Developactionableplans:Translatestrategyintoconcretetasks&resourceallocation.Whichleadsdropsoperationalleadtimeby5%EDUCATIONIndependentUniversity,BangladeshExecutiveMBADhaka,BangladeshMay2021\n",
125
+ "AmericanInternationalUniversityBachelorinComputerScience&EngineeringDhaka,BangladeshDec2016\n",
126
+ "ADDITIONALSKILLS\n",
127
+ "●Pro\u0000\u0000cientinSQL,TechnicalAnalytics,Statistics,BusinessIntelligence,Databas,MicrosoftExcel,Tableau,PowerBI,Python,DataModeling,DataVisualization,EDA,ETL,DataMining,BigData,Pivotal,PredictiveModeling,Clustering,MachineLearning.\n"
128
+ ]
129
+ }
130
+ ],
131
+ "source": [
132
+ "# Performs Named Entity Recognition (NER) on given text\n",
133
+ "def perform_ner(text, model_name):\n",
134
+ " try:\n",
135
+ " if model_name == \"SpaCy English NER\":\n",
136
+ " doc = spacy_ner_model(text)\n",
137
+ " extracted_entities = [\n",
138
+ " {\n",
139
+ " \"text\": ent.text,\n",
140
+ " \"type\": ent.label_,\n",
141
+ " \"start_index\": ent.start_char,\n",
142
+ " \"end_index\": ent.end_char,\n",
143
+ " }\n",
144
+ " for ent in doc.ents\n",
145
+ " ]\n",
146
+ " elif model_name == \"bert-large-NER\":\n",
147
+ " entities = ner_model(text)\n",
148
+ " extracted_entities = [\n",
149
+ " {\n",
150
+ " \"text\": entity[\"word\"],\n",
151
+ " \"type\": entity[\"entity\"],\n",
152
+ " \"start_index\": entity[\"start\"],\n",
153
+ " \"end_index\": entity[\"end\"],\n",
154
+ " }\n",
155
+ " for entity in entities\n",
156
+ " ]\n",
157
+ " else:\n",
158
+ " entities = ner_model_bio(text)\n",
159
+ " extracted_entities = [\n",
160
+ " {\n",
161
+ " \"text\": entity[\"word\"],\n",
162
+ " \"type\": entity[\"entity\"],\n",
163
+ " \"start_index\": entity[\"start\"],\n",
164
+ " \"end_index\": entity[\"end\"],\n",
165
+ " }\n",
166
+ " for entity in entities\n",
167
+ " ]\n",
168
+ "\n",
169
+ " return extracted_entities\n",
170
+ " except Exception as e:\n",
171
+ " return f\"Error Performing NER: {str(e)}\""
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": 15,
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "# this function takes row text , a list of entities with their start and end indices and maps with the assigned color\n",
181
+ "def highlight_entities_with_colors_and_label_tokenized(\n",
182
+ " text, entities, color_mapping, tokenizer\n",
183
+ "):\n",
184
+ " highlighted_text = \"\"\n",
185
+ " current_pos = 0\n",
186
+ "\n",
187
+ " for ent in entities:\n",
188
+ " start, end, label = (\n",
189
+ " ent.get(\"start_index\", 0),\n",
190
+ " ent.get(\"end_index\", 0),\n",
191
+ " ent.get(\"type\", \"0\"),\n",
192
+ " )\n",
193
+ " entity_text = text[start:end]\n",
194
+ "\n",
195
+ " # tokenize the text\n",
196
+ " encoded_entity = tokenizer.encode(entity_text, add_special_tokens=False)\n",
197
+ " tokenized_entity_text = tokenizer.convert_ids_to_tokens(encoded_entity)\n",
198
+ " tokenized_entity_length = len(tokenized_entity_text)\n",
199
+ "\n",
200
+ " # adding non entity text\n",
201
+ " highlighted_text += text[current_pos:start]\n",
202
+ "\n",
203
+ " # adding highlighted entity text with color and label on the same time\n",
204
+ " color = color_mapping.get(label, \"#4D94FF\")\n",
205
+ " highlighted_text += f\"<mark style='background-color:{color}' title='{label}'>{entity_text} ({label})</mark>\"\n",
206
+ "\n",
207
+ " # Update current position\n",
208
+ " current_pos = end\n",
209
+ "\n",
210
+ " # add any non remaining non-entity text\n",
211
+ " highlighted_text += text[current_pos:]\n",
212
+ " return highlighted_text"
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "execution_count": 16,
218
+ "metadata": {},
219
+ "outputs": [],
220
+ "source": [
221
+ "# Highlight named entities in the given color maping\n",
222
+ "def highlight_entities(text, entities, model_name):\n",
223
+ " try:\n",
224
+ " if model_name == \"SpaCy English NER\":\n",
225
+ " doc = spacy_ner_model(text)\n",
226
+ " color_mapping = {\n",
227
+ " \"DATE\": \"#4D94FF\", # Blue\n",
228
+ " \"PERSON\": \"#4CAF50\", # Green\n",
229
+ " \"EVENT\": \"#FF6666\", # Salmon\n",
230
+ " \"FAC\": \"#66B2FF\", # Sky Blue\n",
231
+ " \"GPE\": \"#FFCC99\", # Light Apricot\n",
232
+ " \"LANGUAGE\": \"#FF80BF\", # Pink\n",
233
+ " \"LAW\": \"#66FF99\", # Mint\n",
234
+ " \"LOC\": \"#809FFF\", # Lavender Blue\n",
235
+ " \"MONEY\": \"#FFFF99\", # Light Yellow\n",
236
+ " \"NORP\": \"#808000\", # Olive Green\n",
237
+ " \"ORDINAL\": \"#FF9999\", # Misty Rose\n",
238
+ " \"ORG\": \"#FFB366\", # Light Peach\n",
239
+ " \"PERCENT\": \"#FF99FF\", # Orchid\n",
240
+ " \"PRODUCT\": \"#FF6666\", # Salmon\n",
241
+ " \"QUANTITY\": \"#CC99FF\", # Pastel Purple\n",
242
+ " \"TIME\": \"#FFD54F\", # Amber\n",
243
+ " \"WORK_OF_ART\": \"#FFC266\", # Light Orange\n",
244
+ " \"CARDINAL\": \"#008080\", # Teal\n",
245
+ " }\n",
246
+ "\n",
247
+ " options = {\n",
248
+ " \"ents\": [entity[\"type\"] for entity in entities],\n",
249
+ " \"colors\": color_mapping,\n",
250
+ " }\n",
251
+ " html = displacy.render(doc, style=\"ent\", options=options, page=True)\n",
252
+ " colored_text = html\n",
253
+ " return colored_text\n",
254
+ " else:\n",
255
+ " color_mapping = {\n",
256
+ " \"O\": \"pink\",\n",
257
+ " \"B-MIS\": \"red\",\n",
258
+ " \"I-MIS\": \"brown\",\n",
259
+ " \"B-PER\": \"green\",\n",
260
+ " \"I-PER\": \"#FFD54F\",\n",
261
+ " \"B-ORG\": \"orange\",\n",
262
+ " \"I-ORG\": \"#FF6666\",\n",
263
+ " \"B-LOC\": \"purple\",\n",
264
+ " \"I-LOC\": \"#FFCC99\",\n",
265
+ " }\n",
266
+ " highlighted_example = highlight_entities_with_colors_and_label_tokenized(\n",
267
+ " text, entities, color_mapping, tokenizer\n",
268
+ " )\n",
269
+ " return highlighted_example\n",
270
+ " except Exception as e:\n",
271
+ " return f\"Error highlighted entities: {str(e)}\""
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": 32,
277
+ "metadata": {},
278
+ "outputs": [
279
+ {
280
+ "name": "stdout",
281
+ "output_type": "stream",
282
+ "text": [
283
+ "SALMANSAKIB\n",
284
+ "Flat-1104,Building-18D,(Chameli),RajukUttaraApartmentproject,Sector-18,Uttara-1230,Dhaka,Bangladesh(+880)1682359817||[email protected]||website||Github||Linkedin\n",
285
+ "Highlymotivatedandresults-orientedDataAnalystwith6+yearsofexperienceinthee-commerceand\u0000\u0000ntechindustries.Possessesastrongunderstandingofdataanalysisprinciples,dataminingtechniques,andpro\u0000\u0000ciencyinvariousdataanalysistoolsandsoftware.Provenabilitytocollect,clean,analyze,andinterpretlargedatasetstogenerateactionableinsightsandsupportinformeddecision-making.\n",
286
+ "PROFESSIONALEXPERIENCE\n",
287
+ "DarazBangladeshLtdSeniorDataAnalystDhaka,BangladeshOct2022–Present\n",
288
+ "●Data-drivenstrategydevelopment:Leverageinsightstobuildatargeted,e\u0000\u0000\u0000cient,andimpactfulbusinessstrategy.\n",
289
+ "●Segmentation:Customer,market,product,etc.(segmentationbasedonpurchase,buyingpatterns).Whichresultedina21%dropinCAC,Quarterlyaround10%incrementinCR.\n",
290
+ "●ChannelPerformance:Analyze&trackperformanceacrossvariouschannels(e.g.,socialmedia,website,CRM)\n",
291
+ "●ContentStrategy:Evaluate&strategizecontente\u0000fectiveness&userengagement(SEO,CR,CPO,PPC,CAC)UCBFintechCo.LtdSeniorStrategyAnalystDhaka,BangladeshOct2021–Sept2022\n",
292
+ "●Strategy&Processdevelopment:Developlong-termvision&goals,SWOTanalysis,objectivesetting,performanceevaluation,Data-drivendecision-makingforstrategyimprovement\n",
293
+ "●Decision-making&resourceallocation:Translatestrategyintoactionabletasks&resourceallocationthatincreaseddailytotal\u0000\u0000nancialtransactionvolumeby56%,Customeracquisitionby40%DarazBangladeshLtdSeniorDataAnalystDhaka,BangladeshOct2017–Sept2021\n",
294
+ "●E\u0000\u0000\u0000cientstrategyexecution:Bridgethegapbetweenstrategy&action.\n",
295
+ "●Data-drivendecision-making:Informbusinessdecisionswithinsightsfromproposals&campaignanalysis.\n",
296
+ "●Developactionableplans:Translatestrategyintoconcretetasks&resourceallocation.Whichleadsdropsoperationalleadtimeby5%EDUCATIONIndependentUniversity,BangladeshExecutiveMBADhaka,BangladeshMay2021\n",
297
+ "AmericanInternationalUniversityBachelorinComputerScience&EngineeringDhaka,BangladeshDec2016\n",
298
+ "ADDITIONALSKILLS\n",
299
+ "●Pro\u0000\u0000cientinSQL,TechnicalAnalytics,Statistics,BusinessIntelligence,Databas,MicrosoftExcel,Tableau,PowerBI,Python,DataModeling,DataVisualization,EDA,ETL,DataMining,BigData,Pivotal,PredictiveModeling,Clustering,MachineLearning.\n"
300
+ ]
301
+ }
302
+ ],
303
+ "source": [
304
+ "# Summarize text \n",
305
+ "def summarized_text(input_text):\n",
306
+ " summarized_text = summarization_pipeline(input_text, max_length= 150, min_length = 50, length_penalty = 2.0, num_beams = 4, early_stopping = True )\n",
307
+ " summarized_text = summarized_text[0]['summary_text']\n",
308
+ "\n",
309
+ " return summarized_text"
310
+ ]
311
+ },
312
+ {
313
+ "cell_type": "code",
314
+ "execution_count": 30,
315
  "metadata": {},
316
  "outputs": [],
317
  "source": [
 
324
  " else:\n",
325
  " file_bytes = file.getvalue()\n",
326
  " text = extract_text_from_image_or_pdf(file_bytes)\n",
327
+ " entities = perform_ner(text, model_name)\n",
328
+ " highlighted_text = highlight_entities(text, entities, model_name)\n",
329
+ "\n",
330
+ " reformatted_ner_output = json.dumps(entities, indent=2)\n",
331
+ "\n",
332
+ " summary = summarized_text(text)\n",
333
+ "\n",
334
+ " return text, highlighted_text, reformatted_ner_output, summary\n",
335
  " except Exception as e:\n",
336
  " error_message = f\"Error processing file:{str(e)}\"\n",
337
  " return error_message, \"\", reformatted_ner_output"
 
339
  },
340
  {
341
  "cell_type": "code",
342
+ "execution_count": 31,
343
  "metadata": {},
344
  "outputs": [
345
  {
346
  "name": "stdout",
347
  "output_type": "stream",
348
  "text": [
349
+ "Running on local URL: http://127.0.0.1:7872\n",
350
+ "Running on public URL: https://6117c2357b407721d0.gradio.live\n",
351
  "\n",
352
+ "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
353
  ]
354
  },
355
  {
356
  "data": {
357
  "text/html": [
358
+ "<div><iframe src=\"https://6117c2357b407721d0.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
359
  ],
360
  "text/plain": [
361
  "<IPython.core.display.HTML object>"
 
368
  "data": {
369
  "text/plain": []
370
  },
371
+ "execution_count": 31,
372
  "metadata": {},
373
  "output_type": "execute_result"
374
+ },
375
+ {
376
+ "name": "stdout",
377
+ "output_type": "stream",
378
+ "text": [
379
+ "SALMANSAKIB\n",
380
+ "Flat-1104,Building-18D,(Chameli),RajukUttaraApartmentproject,Sector-18,Uttara-1230,Dhaka,Bangladesh(+880)1682359817||[email protected]||website||Github||Linkedin\n",
381
+ "Highlymotivatedandresults-orientedDataAnalystwith6+yearsofexperienceinthee-commerceand\u0000\u0000ntechindustries.Possessesastrongunderstandingofdataanalysisprinciples,dataminingtechniques,andpro\u0000\u0000ciencyinvariousdataanalysistoolsandsoftware.Provenabilitytocollect,clean,analyze,andinterpretlargedatasetstogenerateactionableinsightsandsupportinformeddecision-making.\n",
382
+ "PROFESSIONALEXPERIENCE\n",
383
+ "DarazBangladeshLtdSeniorDataAnalystDhaka,BangladeshOct2022–Present\n",
384
+ "●Data-drivenstrategydevelopment:Leverageinsightstobuildatargeted,e\u0000\u0000\u0000cient,andimpactfulbusinessstrategy.\n",
385
+ "●Segmentation:Customer,market,product,etc.(segmentationbasedonpurchase,buyingpatterns).Whichresultedina21%dropinCAC,Quarterlyaround10%incrementinCR.\n",
386
+ "●ChannelPerformance:Analyze&trackperformanceacrossvariouschannels(e.g.,socialmedia,website,CRM)\n",
387
+ "●ContentStrategy:Evaluate&strategizecontente\u0000fectiveness&userengagement(SEO,CR,CPO,PPC,CAC)UCBFintechCo.LtdSeniorStrategyAnalystDhaka,BangladeshOct2021–Sept2022\n",
388
+ "●Strategy&Processdevelopment:Developlong-termvision&goals,SWOTanalysis,objectivesetting,performanceevaluation,Data-drivendecision-makingforstrategyimprovement\n",
389
+ "●Decision-making&resourceallocation:Translatestrategyintoactionabletasks&resourceallocationthatincreaseddailytotal\u0000\u0000nancialtransactionvolumeby56%,Customeracquisitionby40%DarazBangladeshLtdSeniorDataAnalystDhaka,BangladeshOct2017–Sept2021\n",
390
+ "●E\u0000\u0000\u0000cientstrategyexecution:Bridgethegapbetweenstrategy&action.\n",
391
+ "●Data-drivendecision-making:Informbusinessdecisionswithinsightsfromproposals&campaignanalysis.\n",
392
+ "●Developactionableplans:Translatestrategyintoconcretetasks&resourceallocation.Whichleadsdropsoperationalleadtimeby5%EDUCATIONIndependentUniversity,BangladeshExecutiveMBADhaka,BangladeshMay2021\n",
393
+ "AmericanInternationalUniversityBachelorinComputerScience&EngineeringDhaka,BangladeshDec2016\n",
394
+ "ADDITIONALSKILLS\n",
395
+ "●Pro\u0000\u0000cientinSQL,TechnicalAnalytics,Statistics,BusinessIntelligence,Databas,MicrosoftExcel,Tableau,PowerBI,Python,DataModeling,DataVisualization,EDA,ETL,DataMining,BigData,Pivotal,PredictiveModeling,Clustering,MachineLearning.\n"
396
+ ]
397
  }
398
  ],
399
  "source": [
400
  "# Gradio\n",
401
+ "\n",
402
+ "# # adding custom css \n",
403
+ "# css =\"\"\"\n",
404
+ "\n",
405
+ "# \"\"\"\n",
406
+ "\n",
407
+ "# # adding custom js \n",
408
+ "# js =\"\"\" \n",
409
+ "\n",
410
+ "# \"\"\"\n",
411
+ "\n",
412
  "with gr.Blocks() as demo:\n",
413
  " gr.Markdown(\n",
414
  " \"\"\"\n",
 
430
  " with gr.Column():\n",
431
  " with gr.Tab(\"Extract Text\"):\n",
432
  " output1 = gr.Textbox(label=\"Extracted Text\", container=True)\n",
433
+ " with gr.Tab(\"Highlighted Entitiled\"):\n",
434
+ " output2 = gr.HTML(\"Summarize Text\")\n",
435
+ " with gr.Tab(\"Summarized Text\"):\n",
436
+ " output3 = gr.HTML(\"Summarize Text\")\n",
437
+ " with gr.Tab(\"Named Entities Extracted\"):\n",
438
+ " output4=gr.HTML(label=\"Named Entities\")\n",
439
  "\n",
440
  " btn.click(\n",
441
  " image_ner_tool,\n",
442
  " [text1, model],\n",
443
+ " [output1, output2, output4,output3],\n",
444
  " )\n",
445
  "\n",
446
  "demo.launch(share=True)"