SalmanSakibSrizon commited on
Commit
63e3895
·
1 Parent(s): 18438fa

added all functions to app.py

Browse files
Files changed (4) hide show
  1. .gitignore +2 -1
  2. README.md +8 -7
  3. app.py +170 -3
  4. pdf_summarization.ipynb +272 -21
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  .venv
2
- .md
 
 
1
  .venv
2
+ *.md
3
+ *.ipynb
README.md CHANGED
@@ -1,9 +1,10 @@
 
 
1
  title: Pdf Image Ner Application
2
- emoji: 📚
3
- colorFrom: indigo
4
- colorTo: pink
5
  sdk: gradio
6
- sdk_version: 4.19.2
7
- app_file: app.py
8
- pinned: false
9
- license: apache-2.0
 
 
 
1
+ ---
2
+ license: apache-2.0
3
  title: Pdf Image Ner Application
 
 
 
4
  sdk: gradio
5
+ emoji: 📚
6
+ colorFrom: yellow
7
+ colorTo: green
8
+ pinned: true
9
+ short_description: Pdf Image Ner Application
10
+ ---
app.py CHANGED
@@ -56,6 +56,149 @@ def extract_text_from_image_or_pdf(file_bytes):
56
  return f"Error extracting file"
57
 
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  def image_ner_tool(file, model_name):
60
  reformatted_ner_output = ""
61
  try:
@@ -65,14 +208,32 @@ def image_ner_tool(file, model_name):
65
  else:
66
  file_bytes = file.getvalue()
67
  text = extract_text_from_image_or_pdf(file_bytes)
68
- return text
 
 
 
 
 
 
 
69
  except Exception as e:
70
  error_message = f"Error processing file:{str(e)}"
71
  return error_message, "", reformatted_ner_output
72
 
73
 
74
  # Gradio
75
- with gr.Blocks() as demo:
 
 
 
 
 
 
 
 
 
 
 
76
  gr.Markdown(
77
  """
78
  <p style="text-align: center; font-weight: bold; font-size: 44px;">
@@ -93,11 +254,17 @@ with gr.Blocks() as demo:
93
  with gr.Column():
94
  with gr.Tab("Extract Text"):
95
  output1 = gr.Textbox(label="Extracted Text", container=True)
 
 
 
 
 
 
96
 
97
  btn.click(
98
  image_ner_tool,
99
  [text1, model],
100
- [output1],
101
  )
102
 
103
  demo.launch(share=True)
 
56
  return f"Error extracting file"
57
 
58
 
59
# Performs Named Entity Recognition (NER) on the given text.
def perform_ner(text, model_name):
    """Extract named entities from *text* using the model selected by name.

    Args:
        text: Raw input text to analyze.
        model_name: "SpaCy English NER" for the spaCy pipeline,
            "bert-large-NER" for the BERT token-classification pipeline;
            anything else falls back to the biomedical NER pipeline.

    Returns:
        A list of dicts with keys "text", "type", "start_index" and
        "end_index" — or an error string if extraction fails.
    """
    try:
        if model_name == "SpaCy English NER":
            doc = spacy_ner_model(text)
            return [
                {
                    "text": ent.text,
                    "type": ent.label_,
                    "start_index": ent.start_char,
                    "end_index": ent.end_char,
                }
                for ent in doc.ents
            ]

        # Both HuggingFace pipelines emit the same output schema, so the two
        # transformer branches differ only in which pipeline is invoked.
        pipeline = ner_model if model_name == "bert-large-NER" else ner_model_bio
        entities = pipeline(text)
        return [
            {
                "text": entity["word"],
                "type": entity["entity"],
                "start_index": entity["start"],
                "end_index": entity["end"],
            }
            for entity in entities
        ]
    except Exception as e:
        # NOTE(review): returning a string on failure forces callers to
        # type-check the result; kept as-is for interface compatibility.
        return f"Error Performing NER: {str(e)}"
99
+
100
+
101
# Takes raw text plus a list of entities (with start/end indices) and wraps
# each entity in a colored, labeled <mark> tag from the assigned color mapping.
def highlight_entities_with_colors_and_label_tokenized(text, entities, color_mapping, tokenizer):
    """Return *text* as HTML with every entity highlighted and labeled.

    Args:
        text: Source text that the entity indices refer to.
        entities: Iterable of dicts with "start_index", "end_index" and
            "type"; assumed sorted by start index and non-overlapping —
            TODO confirm against perform_ner output.
        color_mapping: Maps entity type -> CSS color; unknown types fall
            back to "#4D94FF".
        tokenizer: Unused (the original tokenized each entity and discarded
            the result); parameter kept so existing call sites still work.

    Returns:
        An HTML string with a <mark> span around each entity.
    """
    pieces = []
    cursor = 0

    for ent in entities:
        start = ent.get("start_index", 0)
        end = ent.get("end_index", 0)
        label = ent.get("type", "0")
        entity_text = text[start:end]

        # Plain text between the previous entity and this one.
        pieces.append(text[cursor:start])

        # Highlighted entity with its label; unknown labels use the default color.
        # NOTE(review): entity_text/label are not HTML-escaped — fine for
        # trusted model output, but confirm before feeding untrusted input.
        color = color_mapping.get(label, "#4D94FF")
        pieces.append(
            f"<mark style='background-color:{color}' title='{label}'>{entity_text} ({label})</mark>"
        )

        cursor = end

    # Any remaining non-entity text after the last entity.
    pieces.append(text[cursor:])
    return "".join(pieces)
132
+
133
+
134
# Highlight named entities in the text using the model-appropriate color mapping.
def highlight_entities(text, entities, model_name):
    """Render the named entities found in *text* as colored HTML.

    SpaCy results are rendered through displacy's entity visualizer;
    transformer (BIO-tagged) results go through the local <mark>-based
    highlighter. On failure an error string is returned instead of HTML.
    """
    try:
        if model_name == "SpaCy English NER":
            # Per-label colors for the spaCy entity renderer.
            spacy_colors = {
                "DATE": "#4D94FF",  # Blue
                "PERSON": "#4CAF50",  # Green
                "EVENT": "#FF6666",  # Salmon
                "FAC": "#66B2FF",  # Sky Blue
                "GPE": "#FFCC99",  # Light Apricot
                "LANGUAGE": "#FF80BF",  # Pink
                "LAW": "#66FF99",  # Mint
                "LOC": "#809FFF",  # Lavender Blue
                "MONEY": "#FFFF99",  # Light Yellow
                "NORP": "#808000",  # Olive Green
                "ORDINAL": "#FF9999",  # Misty Rose
                "ORG": "#FFB366",  # Light Peach
                "PERCENT": "#FF99FF",  # Orchid
                "PRODUCT": "#FF6666",  # Salmon
                "QUANTITY": "#CC99FF",  # Pastel Purple
                "TIME": "#FFD54F",  # Amber
                "WORK_OF_ART": "#FFC266",  # Light Orange
                "CARDINAL": "#008080",  # Teal
            }

            parsed = spacy_ner_model(text)
            render_options = {
                "ents": [entity["type"] for entity in entities],
                "colors": spacy_colors,
            }
            # displacy returns a full HTML page (page=True) ready for gr.HTML.
            return displacy.render(parsed, style="ent", options=render_options, page=True)

        # BIO-tag colors shared by the transformer-based pipelines.
        bio_colors = {
            "O": "pink",
            "B-MIS": "red",
            "I-MIS": "brown",
            "B-PER": "green",
            "I-PER": "#FFD54F",
            "B-ORG": "orange",
            "I-ORG": "#FF6666",
            "B-LOC": "purple",
            "I-LOC": "#FFCC99",
        }
        return highlight_entities_with_colors_and_label_tokenized(
            text, entities, bio_colors, tokenizer
        )
    except Exception as e:
        return f"Error highlighted entities: {str(e)}"
185
+
186
+
187
# Summarize text with the configured summarization pipeline.
def summarized_text(input_text):
    """Return a beam-search abstractive summary (50–150 tokens) of *input_text*."""
    result = summarization_pipeline(
        input_text,
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    # The pipeline returns a list of result dicts; only the text is needed.
    return result[0]["summary_text"]
200
+
201
+
202
  def image_ner_tool(file, model_name):
203
  reformatted_ner_output = ""
204
  try:
 
208
  else:
209
  file_bytes = file.getvalue()
210
  text = extract_text_from_image_or_pdf(file_bytes)
211
+ entities = perform_ner(text, model_name)
212
+ highlighted_text = highlight_entities(text, entities, model_name)
213
+
214
+ reformatted_ner_output = json.dumps(entities, indent=2)
215
+
216
+ summary = summarized_text(text)
217
+
218
+ return text, highlighted_text, reformatted_ner_output, summary
219
  except Exception as e:
220
  error_message = f"Error processing file:{str(e)}"
221
  return error_message, "", reformatted_ner_output
222
 
223
 
224
  # Gradio
225
+
226
+ # # adding custom css
227
+ # css ="""
228
+
229
+ # """
230
+
231
+ # # adding custom js
232
+ # js ="""
233
+
234
+ # """
235
+
236
+ with gr.Blocks(theme='shivi/calm_seafoam') as demo:
237
  gr.Markdown(
238
  """
239
  <p style="text-align: center; font-weight: bold; font-size: 44px;">
 
254
  with gr.Column():
255
  with gr.Tab("Extract Text"):
256
  output1 = gr.Textbox(label="Extracted Text", container=True)
257
+ with gr.Tab("Highlighted Entitiled"):
258
+ output2 = gr.HTML("Summarize Text")
259
+ with gr.Tab("Summarized Text"):
260
+ output3 = gr.HTML("Summarize Text")
261
+ with gr.Tab("Named Entities Extracted"):
262
+ output4 = gr.HTML(label="Named Entities")
263
 
264
  btn.click(
265
  image_ner_tool,
266
  [text1, model],
267
+ [output1, output2, output4, output3],
268
  )
269
 
270
  demo.launch(share=True)
pdf_summarization.ipynb CHANGED
@@ -2,13 +2,15 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 4,
6
  "metadata": {},
7
  "outputs": [
8
  {
9
  "name": "stderr",
10
  "output_type": "stream",
11
  "text": [
 
 
12
  "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
13
  " _torch_pytree._register_pytree_node(\n",
14
  "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
@@ -32,7 +34,7 @@
32
  },
33
  {
34
  "cell_type": "code",
35
- "execution_count": 6,
36
  "metadata": {},
37
  "outputs": [
38
  {
@@ -41,16 +43,7 @@
41
  "text": [
42
  "Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n",
43
  "- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
44
- "- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
45
- "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:149: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\salma\\.cache\\huggingface\\hub\\models--facebook--bart-large-cnn. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
46
- "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
47
- " warnings.warn(message)\n",
48
- "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:149: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\salma\\.cache\\huggingface\\hub\\models--d4data--biomedical-ner-all. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
49
- "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
50
- " warnings.warn(message)\n",
51
- "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:149: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\salma\\.cache\\huggingface\\hub\\models--dslim--bert-base-NER. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
52
- "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
53
- " warnings.warn(message)\n"
54
  ]
55
  }
56
  ],
@@ -74,7 +67,7 @@
74
  },
75
  {
76
  "cell_type": "code",
77
- "execution_count": 12,
78
  "metadata": {},
79
  "outputs": [],
80
  "source": [
@@ -108,7 +101,217 @@
108
  },
109
  {
110
  "cell_type": "code",
111
- "execution_count": 13,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  "metadata": {},
113
  "outputs": [],
114
  "source": [
@@ -121,7 +324,14 @@
121
  " else:\n",
122
  " file_bytes = file.getvalue()\n",
123
  " text = extract_text_from_image_or_pdf(file_bytes)\n",
124
- " return text\n",
 
 
 
 
 
 
 
125
  " except Exception as e:\n",
126
  " error_message = f\"Error processing file:{str(e)}\"\n",
127
  " return error_message, \"\", reformatted_ner_output"
@@ -129,22 +339,23 @@
129
  },
130
  {
131
  "cell_type": "code",
132
- "execution_count": 15,
133
  "metadata": {},
134
  "outputs": [
135
  {
136
  "name": "stdout",
137
  "output_type": "stream",
138
  "text": [
139
- "Running on local URL: http://127.0.0.1:7864\n",
 
140
  "\n",
141
- "Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.\n"
142
  ]
143
  },
144
  {
145
  "data": {
146
  "text/html": [
147
- "<div><iframe src=\"http://127.0.0.1:7864/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
148
  ],
149
  "text/plain": [
150
  "<IPython.core.display.HTML object>"
@@ -157,13 +368,47 @@
157
  "data": {
158
  "text/plain": []
159
  },
160
- "execution_count": 15,
161
  "metadata": {},
162
  "output_type": "execute_result"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  }
164
  ],
165
  "source": [
166
  "# Gradio\n",
 
 
 
 
 
 
 
 
 
 
 
167
  "with gr.Blocks() as demo:\n",
168
  " gr.Markdown(\n",
169
  " \"\"\"\n",
@@ -185,11 +430,17 @@
185
  " with gr.Column():\n",
186
  " with gr.Tab(\"Extract Text\"):\n",
187
  " output1 = gr.Textbox(label=\"Extracted Text\", container=True)\n",
 
 
 
 
 
 
188
  "\n",
189
  " btn.click(\n",
190
  " image_ner_tool,\n",
191
  " [text1, model],\n",
192
- " [output1],\n",
193
  " )\n",
194
  "\n",
195
  "demo.launch(share=True)"
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 1,
6
  "metadata": {},
7
  "outputs": [
8
  {
9
  "name": "stderr",
10
  "output_type": "stream",
11
  "text": [
12
+ "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from .autonotebook import tqdm as notebook_tqdm\n",
14
  "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
15
  " _torch_pytree._register_pytree_node(\n",
16
  "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
 
34
  },
35
  {
36
  "cell_type": "code",
37
+ "execution_count": 2,
38
  "metadata": {},
39
  "outputs": [
40
  {
 
43
  "text": [
44
  "Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n",
45
  "- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
46
+ "- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
 
 
 
 
 
 
 
 
 
47
  ]
48
  }
49
  ],
 
67
  },
68
  {
69
  "cell_type": "code",
70
+ "execution_count": 3,
71
  "metadata": {},
72
  "outputs": [],
73
  "source": [
 
101
  },
102
  {
103
  "cell_type": "code",
104
+ "execution_count": 27,
105
+ "metadata": {},
106
+ "outputs": [
107
+ {
108
+ "name": "stdout",
109
+ "output_type": "stream",
110
+ "text": [
111
+ "SALMANSAKIB\n",
112
+ "Flat-1104,Building-18D,(Chameli),RajukUttaraApartmentproject,Sector-18,Uttara-1230,Dhaka,Bangladesh(+880)1682359817||[email protected]||website||Github||Linkedin\n",
113
+ "Highlymotivatedandresults-orientedDataAnalystwith6+yearsofexperienceinthee-commerceand\u0000\u0000ntechindustries.Possessesastrongunderstandingofdataanalysisprinciples,dataminingtechniques,andpro\u0000\u0000ciencyinvariousdataanalysistoolsandsoftware.Provenabilitytocollect,clean,analyze,andinterpretlargedatasetstogenerateactionableinsightsandsupportinformeddecision-making.\n",
114
+ "PROFESSIONALEXPERIENCE\n",
115
+ "DarazBangladeshLtdSeniorDataAnalystDhaka,BangladeshOct2022–Present\n",
116
+ "●Data-drivenstrategydevelopment:Leverageinsightstobuildatargeted,e\u0000\u0000\u0000cient,andimpactfulbusinessstrategy.\n",
117
+ "●Segmentation:Customer,market,product,etc.(segmentationbasedonpurchase,buyingpatterns).Whichresultedina21%dropinCAC,Quarterlyaround10%incrementinCR.\n",
118
+ "●ChannelPerformance:Analyze&trackperformanceacrossvariouschannels(e.g.,socialmedia,website,CRM)\n",
119
+ "●ContentStrategy:Evaluate&strategizecontente\u0000fectiveness&userengagement(SEO,CR,CPO,PPC,CAC)UCBFintechCo.LtdSeniorStrategyAnalystDhaka,BangladeshOct2021–Sept2022\n",
120
+ "●Strategy&Processdevelopment:Developlong-termvision&goals,SWOTanalysis,objectivesetting,performanceevaluation,Data-drivendecision-makingforstrategyimprovement\n",
121
+ "●Decision-making&resourceallocation:Translatestrategyintoactionabletasks&resourceallocationthatincreaseddailytotal\u0000\u0000nancialtransactionvolumeby56%,Customeracquisitionby40%DarazBangladeshLtdSeniorDataAnalystDhaka,BangladeshOct2017–Sept2021\n",
122
+ "●E\u0000\u0000\u0000cientstrategyexecution:Bridgethegapbetweenstrategy&action.\n",
123
+ "●Data-drivendecision-making:Informbusinessdecisionswithinsightsfromproposals&campaignanalysis.\n",
124
+ "●Developactionableplans:Translatestrategyintoconcretetasks&resourceallocation.Whichleadsdropsoperationalleadtimeby5%EDUCATIONIndependentUniversity,BangladeshExecutiveMBADhaka,BangladeshMay2021\n",
125
+ "AmericanInternationalUniversityBachelorinComputerScience&EngineeringDhaka,BangladeshDec2016\n",
126
+ "ADDITIONALSKILLS\n",
127
+ "●Pro\u0000\u0000cientinSQL,TechnicalAnalytics,Statistics,BusinessIntelligence,Databas,MicrosoftExcel,Tableau,PowerBI,Python,DataModeling,DataVisualization,EDA,ETL,DataMining,BigData,Pivotal,PredictiveModeling,Clustering,MachineLearning.\n"
128
+ ]
129
+ }
130
+ ],
131
+ "source": [
132
+ "# Performs Named Entity Recognition (NER) on given text\n",
133
+ "def perform_ner(text, model_name):\n",
134
+ " try:\n",
135
+ " if model_name == \"SpaCy English NER\":\n",
136
+ " doc = spacy_ner_model(text)\n",
137
+ " extracted_entities = [\n",
138
+ " {\n",
139
+ " \"text\": ent.text,\n",
140
+ " \"type\": ent.label_,\n",
141
+ " \"start_index\": ent.start_char,\n",
142
+ " \"end_index\": ent.end_char,\n",
143
+ " }\n",
144
+ " for ent in doc.ents\n",
145
+ " ]\n",
146
+ " elif model_name == \"bert-large-NER\":\n",
147
+ " entities = ner_model(text)\n",
148
+ " extracted_entities = [\n",
149
+ " {\n",
150
+ " \"text\": entity[\"word\"],\n",
151
+ " \"type\": entity[\"entity\"],\n",
152
+ " \"start_index\": entity[\"start\"],\n",
153
+ " \"end_index\": entity[\"end\"],\n",
154
+ " }\n",
155
+ " for entity in entities\n",
156
+ " ]\n",
157
+ " else:\n",
158
+ " entities = ner_model_bio(text)\n",
159
+ " extracted_entities = [\n",
160
+ " {\n",
161
+ " \"text\": entity[\"word\"],\n",
162
+ " \"type\": entity[\"entity\"],\n",
163
+ " \"start_index\": entity[\"start\"],\n",
164
+ " \"end_index\": entity[\"end\"],\n",
165
+ " }\n",
166
+ " for entity in entities\n",
167
+ " ]\n",
168
+ "\n",
169
+ " return extracted_entities\n",
170
+ " except Exception as e:\n",
171
+ " return f\"Error Performing NER: {str(e)}\""
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": 15,
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "# this function takes row text , a list of entities with their start and end indices and maps with the assigned color\n",
181
+ "def highlight_entities_with_colors_and_label_tokenized(\n",
182
+ " text, entities, color_mapping, tokenizer\n",
183
+ "):\n",
184
+ " highlighted_text = \"\"\n",
185
+ " current_pos = 0\n",
186
+ "\n",
187
+ " for ent in entities:\n",
188
+ " start, end, label = (\n",
189
+ " ent.get(\"start_index\", 0),\n",
190
+ " ent.get(\"end_index\", 0),\n",
191
+ " ent.get(\"type\", \"0\"),\n",
192
+ " )\n",
193
+ " entity_text = text[start:end]\n",
194
+ "\n",
195
+ " # tokenize the text\n",
196
+ " encoded_entity = tokenizer.encode(entity_text, add_special_tokens=False)\n",
197
+ " tokenized_entity_text = tokenizer.convert_ids_to_tokens(encoded_entity)\n",
198
+ " tokenized_entity_length = len(tokenized_entity_text)\n",
199
+ "\n",
200
+ " # adding non entity text\n",
201
+ " highlighted_text += text[current_pos:start]\n",
202
+ "\n",
203
+ " # adding highlighted entity text with color and label on the same time\n",
204
+ " color = color_mapping.get(label, \"#4D94FF\")\n",
205
+ " highlighted_text += f\"<mark style='background-color:{color}' title='{label}'>{entity_text} ({label})</mark>\"\n",
206
+ "\n",
207
+ " # Update current position\n",
208
+ " current_pos = end\n",
209
+ "\n",
210
+ " # add any non remaining non-entity text\n",
211
+ " highlighted_text += text[current_pos:]\n",
212
+ " return highlighted_text"
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "execution_count": 16,
218
+ "metadata": {},
219
+ "outputs": [],
220
+ "source": [
221
+ "# Highlight named entities in the given color maping\n",
222
+ "def highlight_entities(text, entities, model_name):\n",
223
+ " try:\n",
224
+ " if model_name == \"SpaCy English NER\":\n",
225
+ " doc = spacy_ner_model(text)\n",
226
+ " color_mapping = {\n",
227
+ " \"DATE\": \"#4D94FF\", # Blue\n",
228
+ " \"PERSON\": \"#4CAF50\", # Green\n",
229
+ " \"EVENT\": \"#FF6666\", # Salmon\n",
230
+ " \"FAC\": \"#66B2FF\", # Sky Blue\n",
231
+ " \"GPE\": \"#FFCC99\", # Light Apricot\n",
232
+ " \"LANGUAGE\": \"#FF80BF\", # Pink\n",
233
+ " \"LAW\": \"#66FF99\", # Mint\n",
234
+ " \"LOC\": \"#809FFF\", # Lavender Blue\n",
235
+ " \"MONEY\": \"#FFFF99\", # Light Yellow\n",
236
+ " \"NORP\": \"#808000\", # Olive Green\n",
237
+ " \"ORDINAL\": \"#FF9999\", # Misty Rose\n",
238
+ " \"ORG\": \"#FFB366\", # Light Peach\n",
239
+ " \"PERCENT\": \"#FF99FF\", # Orchid\n",
240
+ " \"PRODUCT\": \"#FF6666\", # Salmon\n",
241
+ " \"QUANTITY\": \"#CC99FF\", # Pastel Purple\n",
242
+ " \"TIME\": \"#FFD54F\", # Amber\n",
243
+ " \"WORK_OF_ART\": \"#FFC266\", # Light Orange\n",
244
+ " \"CARDINAL\": \"#008080\", # Teal\n",
245
+ " }\n",
246
+ "\n",
247
+ " options = {\n",
248
+ " \"ents\": [entity[\"type\"] for entity in entities],\n",
249
+ " \"colors\": color_mapping,\n",
250
+ " }\n",
251
+ " html = displacy.render(doc, style=\"ent\", options=options, page=True)\n",
252
+ " colored_text = html\n",
253
+ " return colored_text\n",
254
+ " else:\n",
255
+ " color_mapping = {\n",
256
+ " \"O\": \"pink\",\n",
257
+ " \"B-MIS\": \"red\",\n",
258
+ " \"I-MIS\": \"brown\",\n",
259
+ " \"B-PER\": \"green\",\n",
260
+ " \"I-PER\": \"#FFD54F\",\n",
261
+ " \"B-ORG\": \"orange\",\n",
262
+ " \"I-ORG\": \"#FF6666\",\n",
263
+ " \"B-LOC\": \"purple\",\n",
264
+ " \"I-LOC\": \"#FFCC99\",\n",
265
+ " }\n",
266
+ " highlighted_example = highlight_entities_with_colors_and_label_tokenized(\n",
267
+ " text, entities, color_mapping, tokenizer\n",
268
+ " )\n",
269
+ " return highlighted_example\n",
270
+ " except Exception as e:\n",
271
+ " return f\"Error highlighted entities: {str(e)}\""
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": 32,
277
+ "metadata": {},
278
+ "outputs": [
279
+ {
280
+ "name": "stdout",
281
+ "output_type": "stream",
282
+ "text": [
283
+ "SALMANSAKIB\n",
284
+ "Flat-1104,Building-18D,(Chameli),RajukUttaraApartmentproject,Sector-18,Uttara-1230,Dhaka,Bangladesh(+880)1682359817||[email protected]||website||Github||Linkedin\n",
285
+ "Highlymotivatedandresults-orientedDataAnalystwith6+yearsofexperienceinthee-commerceand\u0000\u0000ntechindustries.Possessesastrongunderstandingofdataanalysisprinciples,dataminingtechniques,andpro\u0000\u0000ciencyinvariousdataanalysistoolsandsoftware.Provenabilitytocollect,clean,analyze,andinterpretlargedatasetstogenerateactionableinsightsandsupportinformeddecision-making.\n",
286
+ "PROFESSIONALEXPERIENCE\n",
287
+ "DarazBangladeshLtdSeniorDataAnalystDhaka,BangladeshOct2022–Present\n",
288
+ "●Data-drivenstrategydevelopment:Leverageinsightstobuildatargeted,e\u0000\u0000\u0000cient,andimpactfulbusinessstrategy.\n",
289
+ "●Segmentation:Customer,market,product,etc.(segmentationbasedonpurchase,buyingpatterns).Whichresultedina21%dropinCAC,Quarterlyaround10%incrementinCR.\n",
290
+ "●ChannelPerformance:Analyze&trackperformanceacrossvariouschannels(e.g.,socialmedia,website,CRM)\n",
291
+ "●ContentStrategy:Evaluate&strategizecontente\u0000fectiveness&userengagement(SEO,CR,CPO,PPC,CAC)UCBFintechCo.LtdSeniorStrategyAnalystDhaka,BangladeshOct2021–Sept2022\n",
292
+ "●Strategy&Processdevelopment:Developlong-termvision&goals,SWOTanalysis,objectivesetting,performanceevaluation,Data-drivendecision-makingforstrategyimprovement\n",
293
+ "●Decision-making&resourceallocation:Translatestrategyintoactionabletasks&resourceallocationthatincreaseddailytotal\u0000\u0000nancialtransactionvolumeby56%,Customeracquisitionby40%DarazBangladeshLtdSeniorDataAnalystDhaka,BangladeshOct2017–Sept2021\n",
294
+ "●E\u0000\u0000\u0000cientstrategyexecution:Bridgethegapbetweenstrategy&action.\n",
295
+ "●Data-drivendecision-making:Informbusinessdecisionswithinsightsfromproposals&campaignanalysis.\n",
296
+ "●Developactionableplans:Translatestrategyintoconcretetasks&resourceallocation.Whichleadsdropsoperationalleadtimeby5%EDUCATIONIndependentUniversity,BangladeshExecutiveMBADhaka,BangladeshMay2021\n",
297
+ "AmericanInternationalUniversityBachelorinComputerScience&EngineeringDhaka,BangladeshDec2016\n",
298
+ "ADDITIONALSKILLS\n",
299
+ "●Pro\u0000\u0000cientinSQL,TechnicalAnalytics,Statistics,BusinessIntelligence,Databas,MicrosoftExcel,Tableau,PowerBI,Python,DataModeling,DataVisualization,EDA,ETL,DataMining,BigData,Pivotal,PredictiveModeling,Clustering,MachineLearning.\n"
300
+ ]
301
+ }
302
+ ],
303
+ "source": [
304
+ "# Summarize text \n",
305
+ "def summarized_text(input_text):\n",
306
+ " summarized_text = summarization_pipeline(input_text, max_length= 150, min_length = 50, length_penalty = 2.0, num_beams = 4, early_stopping = True )\n",
307
+ " summarized_text = summarized_text[0]['summary_text']\n",
308
+ "\n",
309
+ " return summarized_text"
310
+ ]
311
+ },
312
+ {
313
+ "cell_type": "code",
314
+ "execution_count": 30,
315
  "metadata": {},
316
  "outputs": [],
317
  "source": [
 
324
  " else:\n",
325
  " file_bytes = file.getvalue()\n",
326
  " text = extract_text_from_image_or_pdf(file_bytes)\n",
327
+ " entities = perform_ner(text, model_name)\n",
328
+ " highlighted_text = highlight_entities(text, entities, model_name)\n",
329
+ "\n",
330
+ " reformatted_ner_output = json.dumps(entities, indent=2)\n",
331
+ "\n",
332
+ " summary = summarized_text(text)\n",
333
+ "\n",
334
+ " return text, highlighted_text, reformatted_ner_output, summary\n",
335
  " except Exception as e:\n",
336
  " error_message = f\"Error processing file:{str(e)}\"\n",
337
  " return error_message, \"\", reformatted_ner_output"
 
339
  },
340
  {
341
  "cell_type": "code",
342
+ "execution_count": 31,
343
  "metadata": {},
344
  "outputs": [
345
  {
346
  "name": "stdout",
347
  "output_type": "stream",
348
  "text": [
349
+ "Running on local URL: http://127.0.0.1:7872\n",
350
+ "Running on public URL: https://6117c2357b407721d0.gradio.live\n",
351
  "\n",
352
+ "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
353
  ]
354
  },
355
  {
356
  "data": {
357
  "text/html": [
358
+ "<div><iframe src=\"https://6117c2357b407721d0.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
359
  ],
360
  "text/plain": [
361
  "<IPython.core.display.HTML object>"
 
368
  "data": {
369
  "text/plain": []
370
  },
371
+ "execution_count": 31,
372
  "metadata": {},
373
  "output_type": "execute_result"
374
+ },
375
+ {
376
+ "name": "stdout",
377
+ "output_type": "stream",
378
+ "text": [
379
+ "SALMANSAKIB\n",
380
+ "Flat-1104,Building-18D,(Chameli),RajukUttaraApartmentproject,Sector-18,Uttara-1230,Dhaka,Bangladesh(+880)1682359817||[email protected]||website||Github||Linkedin\n",
381
+ "Highlymotivatedandresults-orientedDataAnalystwith6+yearsofexperienceinthee-commerceand\u0000\u0000ntechindustries.Possessesastrongunderstandingofdataanalysisprinciples,dataminingtechniques,andpro\u0000\u0000ciencyinvariousdataanalysistoolsandsoftware.Provenabilitytocollect,clean,analyze,andinterpretlargedatasetstogenerateactionableinsightsandsupportinformeddecision-making.\n",
382
+ "PROFESSIONALEXPERIENCE\n",
383
+ "DarazBangladeshLtdSeniorDataAnalystDhaka,BangladeshOct2022–Present\n",
384
+ "●Data-drivenstrategydevelopment:Leverageinsightstobuildatargeted,e\u0000\u0000\u0000cient,andimpactfulbusinessstrategy.\n",
385
+ "●Segmentation:Customer,market,product,etc.(segmentationbasedonpurchase,buyingpatterns).Whichresultedina21%dropinCAC,Quarterlyaround10%incrementinCR.\n",
386
+ "●ChannelPerformance:Analyze&trackperformanceacrossvariouschannels(e.g.,socialmedia,website,CRM)\n",
387
+ "●ContentStrategy:Evaluate&strategizecontente\u0000fectiveness&userengagement(SEO,CR,CPO,PPC,CAC)UCBFintechCo.LtdSeniorStrategyAnalystDhaka,BangladeshOct2021–Sept2022\n",
388
+ "●Strategy&Processdevelopment:Developlong-termvision&goals,SWOTanalysis,objectivesetting,performanceevaluation,Data-drivendecision-makingforstrategyimprovement\n",
389
+ "●Decision-making&resourceallocation:Translatestrategyintoactionabletasks&resourceallocationthatincreaseddailytotal\u0000\u0000nancialtransactionvolumeby56%,Customeracquisitionby40%DarazBangladeshLtdSeniorDataAnalystDhaka,BangladeshOct2017–Sept2021\n",
390
+ "●E\u0000\u0000\u0000cientstrategyexecution:Bridgethegapbetweenstrategy&action.\n",
391
+ "●Data-drivendecision-making:Informbusinessdecisionswithinsightsfromproposals&campaignanalysis.\n",
392
+ "●Developactionableplans:Translatestrategyintoconcretetasks&resourceallocation.Whichleadsdropsoperationalleadtimeby5%EDUCATIONIndependentUniversity,BangladeshExecutiveMBADhaka,BangladeshMay2021\n",
393
+ "AmericanInternationalUniversityBachelorinComputerScience&EngineeringDhaka,BangladeshDec2016\n",
394
+ "ADDITIONALSKILLS\n",
395
+ "●Pro\u0000\u0000cientinSQL,TechnicalAnalytics,Statistics,BusinessIntelligence,Databas,MicrosoftExcel,Tableau,PowerBI,Python,DataModeling,DataVisualization,EDA,ETL,DataMining,BigData,Pivotal,PredictiveModeling,Clustering,MachineLearning.\n"
396
+ ]
397
  }
398
  ],
399
  "source": [
400
  "# Gradio\n",
401
+ "\n",
402
+ "# # adding custom css \n",
403
+ "# css =\"\"\"\n",
404
+ "\n",
405
+ "# \"\"\"\n",
406
+ "\n",
407
+ "# # adding custom js \n",
408
+ "# js =\"\"\" \n",
409
+ "\n",
410
+ "# \"\"\"\n",
411
+ "\n",
412
  "with gr.Blocks() as demo:\n",
413
  " gr.Markdown(\n",
414
  " \"\"\"\n",
 
430
  " with gr.Column():\n",
431
  " with gr.Tab(\"Extract Text\"):\n",
432
  " output1 = gr.Textbox(label=\"Extracted Text\", container=True)\n",
433
+ " with gr.Tab(\"Highlighted Entitiled\"):\n",
434
+ " output2 = gr.HTML(\"Summarize Text\")\n",
435
+ " with gr.Tab(\"Summarized Text\"):\n",
436
+ " output3 = gr.HTML(\"Summarize Text\")\n",
437
+ " with gr.Tab(\"Named Entities Extracted\"):\n",
438
+ " output4=gr.HTML(label=\"Named Entities\")\n",
439
  "\n",
440
  " btn.click(\n",
441
  " image_ner_tool,\n",
442
  " [text1, model],\n",
443
+ " [output1, output2, output4,output3],\n",
444
  " )\n",
445
  "\n",
446
  "demo.launch(share=True)"