{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", " _torch_pytree._register_pytree_node(\n", "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", " _torch_pytree._register_pytree_node(\n" ] } ], "source": [ "import gradio as gr\n", "import PyPDF2\n", "from PyPDF2 import PdfReader\n", "from io import BytesIO\n", "import pytesseract\n", "from PIL import Image\n", "import spacy\n", "import json\n", "\n", "from transformers import pipeline\n", "from PyPDF2 import PdfReader" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n", "- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
# initiate model
from spacy import displacy
from transformers import AutoTokenizer

# UI display name -> identifier of the model to load. This dictionary is the
# single source of truth for NER model ids, so the loaders below read from it
# instead of repeating the literals.
ner_models = {
    "bert-large-NER": "dslim/bert-large-NER",
    "bioNER": "d4data/biomedical-ner-all",
    "SpaCy English NER": "en_core_web_trf",
}

# Every model is loaded once, when this cell runs, and then reused by the
# notebook's NER / summarization functions.
ner_model = pipeline("token-classification", model=ner_models["bert-large-NER"])
summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")
spacy_ner_model = spacy.load(ner_models["SpaCy English NER"])
ner_model_bio = pipeline("token-classification", model=ner_models["bioNER"])

# NOTE(review): this tokenizer comes from "dslim/bert-base-NER" while `ner_model`
# uses "dslim/bert-large-NER" -- confirm the mismatch is intentional.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")


# Extracting text from pdf & image file


def extract_text_from_pdf(pdf_bytes):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_bytes: The raw bytes of the PDF document.

    Returns:
        One string holding the extracted text of all pages, joined without any
        separator between pages.
    """
    pdf_reader = PdfReader(BytesIO(pdf_bytes))
    # `or ""` is defensive: a page that yields a falsy result contributes
    # nothing instead of breaking the join.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def extract_text_from_image_or_pdf(file_bytes):
    """Extract text from the raw bytes of a PDF or of an image.

    Bytes starting with the ``%PDF`` signature are parsed as a PDF; anything
    else is opened as an image and passed to Tesseract OCR.

    Args:
        file_bytes: The raw bytes of the uploaded file.

    Returns:
        The extracted text. If extraction fails, a string starting with
        ``"Error extracting file"`` is returned instead of raising, so callers
        receive the error message in place of document text.
    """
    try:
        if file_bytes.startswith(b"%PDF"):
            # The extracted text is deliberately not printed: documents may
            # contain personal data that would otherwise be stored in the
            # notebook's cell output.
            return extract_text_from_pdf(file_bytes)

        image = Image.open(BytesIO(file_bytes))
        return pytesseract.image_to_string(image)
    except Exception as e:
        # Keep the best-effort contract (return, don't raise) but surface the
        # cause instead of discarding it.
        return f"Error extracting file: {str(e)}"
"text": [ "SALMANSAKIB\n", "Flat-1104,Building-18D,(Chameli),RajukUttaraApartmentproject,Sector-18,Uttara-1230,Dhaka,Bangladesh(+880)1682359817||salmansrizon2016@gmail.com||website||Github||Linkedin\n", "Highlymotivatedandresults-orientedDataAnalystwith6+yearsofexperienceinthee-commerceand\u0000\u0000ntechindustries.Possessesastrongunderstandingofdataanalysisprinciples,dataminingtechniques,andpro\u0000\u0000ciencyinvariousdataanalysistoolsandsoftware.Provenabilitytocollect,clean,analyze,andinterpretlargedatasetstogenerateactionableinsightsandsupportinformeddecision-making.\n", "PROFESSIONALEXPERIENCE\n", "DarazBangladeshLtdSeniorDataAnalystDhaka,BangladeshOct2022–Present\n", "●Data-drivenstrategydevelopment:Leverageinsightstobuildatargeted,e\u0000\u0000\u0000cient,andimpactfulbusinessstrategy.\n", "●Segmentation:Customer,market,product,etc.(segmentationbasedonpurchase,buyingpatterns).Whichresultedina21%dropinCAC,Quarterlyaround10%incrementinCR.\n", "●ChannelPerformance:Analyze&trackperformanceacrossvariouschannels(e.g.,socialmedia,website,CRM)\n", "●ContentStrategy:Evaluate&strategizecontente\u0000fectiveness&userengagement(SEO,CR,CPO,PPC,CAC)UCBFintechCo.LtdSeniorStrategyAnalystDhaka,BangladeshOct2021–Sept2022\n", "●Strategy&Processdevelopment:Developlong-termvision&goals,SWOTanalysis,objectivesetting,performanceevaluation,Data-drivendecision-makingforstrategyimprovement\n", "●Decision-making&resourceallocation:Translatestrategyintoactionabletasks&resourceallocationthatincreaseddailytotal\u0000\u0000nancialtransactionvolumeby56%,Customeracquisitionby40%DarazBangladeshLtdSeniorDataAnalystDhaka,BangladeshOct2017–Sept2021\n", "●E\u0000\u0000\u0000cientstrategyexecution:Bridgethegapbetweenstrategy&action.\n", "●Data-drivendecision-making:Informbusinessdecisionswithinsightsfromproposals&campaignanalysis.\n", 
"●Developactionableplans:Translatestrategyintoconcretetasks&resourceallocation.Whichleadsdropsoperationalleadtimeby5%EDUCATIONIndependentUniversity,BangladeshExecutiveMBADhaka,BangladeshMay2021\n", "AmericanInternationalUniversityBachelorinComputerScience&EngineeringDhaka,BangladeshDec2016\n", "ADDITIONALSKILLS\n", "●Pro\u0000\u0000cientinSQL,TechnicalAnalytics,Statistics,BusinessIntelligence,Databas,MicrosoftExcel,Tableau,PowerBI,Python,DataModeling,DataVisualization,EDA,ETL,DataMining,BigData,Pivotal,PredictiveModeling,Clustering,MachineLearning.\n" ] } ], "source": [ "# Performs Named Entity Recognition (NER) on given text\n", "def perform_ner(text, model_name):\n", " try:\n", " if model_name == \"SpaCy English NER\":\n", " doc = spacy_ner_model(text)\n", " extracted_entities = [\n", " {\n", " \"text\": ent.text,\n", " \"type\": ent.label_,\n", " \"start_index\": ent.start_char,\n", " \"end_index\": ent.end_char,\n", " }\n", " for ent in doc.ents\n", " ]\n", " elif model_name == \"bert-large-NER\":\n", " entities = ner_model(text)\n", " extracted_entities = [\n", " {\n", " \"text\": entity[\"word\"],\n", " \"type\": entity[\"entity\"],\n", " \"start_index\": entity[\"start\"],\n", " \"end_index\": entity[\"end\"],\n", " }\n", " for entity in entities\n", " ]\n", " else:\n", " entities = ner_model_bio(text)\n", " extracted_entities = [\n", " {\n", " \"text\": entity[\"word\"],\n", " \"type\": entity[\"entity\"],\n", " \"start_index\": entity[\"start\"],\n", " \"end_index\": entity[\"end\"],\n", " }\n", " for entity in entities\n", " ]\n", "\n", " return extracted_entities\n", " except Exception as e:\n", " return f\"Error Performing NER: {str(e)}\"" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# this function takes row text , a list of entities with their start and end indices and maps with the assigned color\n", "def highlight_entities_with_colors_and_label_tokenized(\n", " text, entities, color_mapping, 
tokenizer\n", "):\n", " highlighted_text = \"\"\n", " current_pos = 0\n", "\n", " for ent in entities:\n", " start, end, label = (\n", " ent.get(\"start_index\", 0),\n", " ent.get(\"end_index\", 0),\n", " ent.get(\"type\", \"0\"),\n", " )\n", " entity_text = text[start:end]\n", "\n", " # tokenize the text\n", " encoded_entity = tokenizer.encode(entity_text, add_special_tokens=False)\n", " tokenized_entity_text = tokenizer.convert_ids_to_tokens(encoded_entity)\n", " tokenized_entity_length = len(tokenized_entity_text)\n", "\n", " # adding non entity text\n", " highlighted_text += text[current_pos:start]\n", "\n", " # adding highlighted entity text with color and label on the same time\n", " color = color_mapping.get(label, \"#4D94FF\")\n", " highlighted_text += f\"{entity_text} ({label})\"\n", "\n", " # Update current position\n", " current_pos = end\n", "\n", " # add any non remaining non-entity text\n", " highlighted_text += text[current_pos:]\n", " return highlighted_text" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Highlight named entities in the given color maping\n", "def highlight_entities(text, entities, model_name):\n", " try:\n", " if model_name == \"SpaCy English NER\":\n", " doc = spacy_ner_model(text)\n", " color_mapping = {\n", " \"DATE\": \"#4D94FF\", # Blue\n", " \"PERSON\": \"#4CAF50\", # Green\n", " \"EVENT\": \"#FF6666\", # Salmon\n", " \"FAC\": \"#66B2FF\", # Sky Blue\n", " \"GPE\": \"#FFCC99\", # Light Apricot\n", " \"LANGUAGE\": \"#FF80BF\", # Pink\n", " \"LAW\": \"#66FF99\", # Mint\n", " \"LOC\": \"#809FFF\", # Lavender Blue\n", " \"MONEY\": \"#FFFF99\", # Light Yellow\n", " \"NORP\": \"#808000\", # Olive Green\n", " \"ORDINAL\": \"#FF9999\", # Misty Rose\n", " \"ORG\": \"#FFB366\", # Light Peach\n", " \"PERCENT\": \"#FF99FF\", # Orchid\n", " \"PRODUCT\": \"#FF6666\", # Salmon\n", " \"QUANTITY\": \"#CC99FF\", # Pastel Purple\n", " \"TIME\": \"#FFD54F\", # Amber\n", " \"WORK_OF_ART\": 
\"#FFC266\", # Light Orange\n", " \"CARDINAL\": \"#008080\", # Teal\n", " }\n", "\n", " options = {\n", " \"ents\": [entity[\"type\"] for entity in entities],\n", " \"colors\": color_mapping,\n", " }\n", " html = displacy.render(doc, style=\"ent\", options=options, page=True)\n", " colored_text = html\n", " return colored_text\n", " else:\n", " color_mapping = {\n", " \"O\": \"pink\",\n", " \"B-MIS\": \"red\",\n", " \"I-MIS\": \"brown\",\n", " \"B-PER\": \"green\",\n", " \"I-PER\": \"#FFD54F\",\n", " \"B-ORG\": \"orange\",\n", " \"I-ORG\": \"#FF6666\",\n", " \"B-LOC\": \"purple\",\n", " \"I-LOC\": \"#FFCC99\",\n", " }\n", " highlighted_example = highlight_entities_with_colors_and_label_tokenized(\n", " text, entities, color_mapping, tokenizer\n", " )\n", " return highlighted_example\n", " except Exception as e:\n", " return f\"Error highlighted entities: {str(e)}\"" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SALMANSAKIB\n", "Flat-1104,Building-18D,(Chameli),RajukUttaraApartmentproject,Sector-18,Uttara-1230,Dhaka,Bangladesh(+880)1682359817||salmansrizon2016@gmail.com||website||Github||Linkedin\n", "Highlymotivatedandresults-orientedDataAnalystwith6+yearsofexperienceinthee-commerceand\u0000\u0000ntechindustries.Possessesastrongunderstandingofdataanalysisprinciples,dataminingtechniques,andpro\u0000\u0000ciencyinvariousdataanalysistoolsandsoftware.Provenabilitytocollect,clean,analyze,andinterpretlargedatasetstogenerateactionableinsightsandsupportinformeddecision-making.\n", "PROFESSIONALEXPERIENCE\n", "DarazBangladeshLtdSeniorDataAnalystDhaka,BangladeshOct2022–Present\n", "●Data-drivenstrategydevelopment:Leverageinsightstobuildatargeted,e\u0000\u0000\u0000cient,andimpactfulbusinessstrategy.\n", "●Segmentation:Customer,market,product,etc.(segmentationbasedonpurchase,buyingpatterns).Whichresultedina21%dropinCAC,Quarterlyaround10%incrementinCR.\n", 
"●ChannelPerformance:Analyze&trackperformanceacrossvariouschannels(e.g.,socialmedia,website,CRM)\n", "●ContentStrategy:Evaluate&strategizecontente\u0000fectiveness&userengagement(SEO,CR,CPO,PPC,CAC)UCBFintechCo.LtdSeniorStrategyAnalystDhaka,BangladeshOct2021–Sept2022\n", "●Strategy&Processdevelopment:Developlong-termvision&goals,SWOTanalysis,objectivesetting,performanceevaluation,Data-drivendecision-makingforstrategyimprovement\n", "●Decision-making&resourceallocation:Translatestrategyintoactionabletasks&resourceallocationthatincreaseddailytotal\u0000\u0000nancialtransactionvolumeby56%,Customeracquisitionby40%DarazBangladeshLtdSeniorDataAnalystDhaka,BangladeshOct2017–Sept2021\n", "●E\u0000\u0000\u0000cientstrategyexecution:Bridgethegapbetweenstrategy&action.\n", "●Data-drivendecision-making:Informbusinessdecisionswithinsightsfromproposals&campaignanalysis.\n", "●Developactionableplans:Translatestrategyintoconcretetasks&resourceallocation.Whichleadsdropsoperationalleadtimeby5%EDUCATIONIndependentUniversity,BangladeshExecutiveMBADhaka,BangladeshMay2021\n", "AmericanInternationalUniversityBachelorinComputerScience&EngineeringDhaka,BangladeshDec2016\n", "ADDITIONALSKILLS\n", "●Pro\u0000\u0000cientinSQL,TechnicalAnalytics,Statistics,BusinessIntelligence,Databas,MicrosoftExcel,Tableau,PowerBI,Python,DataModeling,DataVisualization,EDA,ETL,DataMining,BigData,Pivotal,PredictiveModeling,Clustering,MachineLearning.\n" ] } ], "source": [ "# Summarize text \n", "def summarized_text(input_text):\n", " summarized_text = summarization_pipeline(input_text, max_length= 150, min_length = 50, length_penalty = 2.0, num_beams = 4, early_stopping = True )\n", " summarized_text = summarized_text[0]['summary_text']\n", "\n", " return summarized_text" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "def image_ner_tool(file, model_name):\n", " reformatted_ner_output = \"\"\n", " try:\n", " if isinstance(file, str):\n", " with open(file, \"rb\") 
as file_stream:\n", " file_bytes = file_stream.read()\n", " else:\n", " file_bytes = file.getvalue()\n", " text = extract_text_from_image_or_pdf(file_bytes)\n", " entities = perform_ner(text, model_name)\n", " highlighted_text = highlight_entities(text, entities, model_name)\n", "\n", " reformatted_ner_output = json.dumps(entities, indent=2)\n", "\n", " summary = summarized_text(text)\n", "\n", " return text, highlighted_text, reformatted_ner_output, summary\n", " except Exception as e:\n", " error_message = f\"Error processing file:{str(e)}\"\n", " return error_message, \"\", reformatted_ner_output" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Running on local URL: http://127.0.0.1:7872\n", "Running on public URL: https://6117c2357b407721d0.gradio.live\n", "\n", "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "\n", " Intelligent Document Processing\n", "
\n", "\n", " Upload a PDF or an image file to extract text and identify named entities\n", "
\n", " \"\"\"\n", " )\n", "\n", " with gr.Row() as row:\n", " with gr.Column():\n", " text1 = gr.File(label=\"Upload File\")\n", " model = gr.Dropdown(list(ner_models.keys()), label=\"Select NER Model\")\n", " btn = gr.Button(\"Submit\")\n", "\n", " with gr.Column():\n", " with gr.Tab(\"Extract Text\"):\n", " output1 = gr.Textbox(label=\"Extracted Text\", container=True)\n", " with gr.Tab(\"Highlighted Entitiled\"):\n", " output2 = gr.HTML(\"Summarize Text\")\n", " with gr.Tab(\"Summarized Text\"):\n", " output3 = gr.HTML(\"Summarize Text\")\n", " with gr.Tab(\"Named Entities Extracted\"):\n", " output4=gr.HTML(label=\"Named Entities\")\n", "\n", " btn.click(\n", " image_ner_tool,\n", " [text1, model],\n", " [output1, output2, output4,output3],\n", " )\n", "\n", "demo.launch(share=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.8" } }, "nbformat": 4, "nbformat_minor": 2 }