Commit de2d856
Parent(s): 8faf6d7

Text extract added + initiate gradio interface

Browse files:
- app.py +0 -0
- pdf_summarization.ipynb +227 -0
- requirements.txt +7 -0
app.py
ADDED
File without changes (added as an empty file)
pdf_summarization.ipynb
ADDED
@@ -0,0 +1,227 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
+      "  _torch_pytree._register_pytree_node(\n",
+      "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
+      "  _torch_pytree._register_pytree_node(\n"
+     ]
+    }
+   ],
+   "source": [
+    "import gradio as gr\n",
+    "import PyPDF2\n",
+    "from PyPDF2 import PdfReader\n",
+    "from io import BytesIO\n",
+    "import pytesseract\n",
+    "from PIL import Image\n",
+    "import spacy\n",
+    "import json\n",
+    "\n",
+    "from transformers import pipeline\n",
+    "from PyPDF2 import PdfReader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n",
+      "- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+      "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:149: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\salma\\.cache\\huggingface\\hub\\models--facebook--bart-large-cnn. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+      "  warnings.warn(message)\n",
+      "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:149: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\salma\\.cache\\huggingface\\hub\\models--d4data--biomedical-ner-all. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+      "  warnings.warn(message)\n",
+      "e:\\pdf_summarization\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:149: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\salma\\.cache\\huggingface\\hub\\models--dslim--bert-base-NER. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+      "  warnings.warn(message)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# initiate model\n",
+    "ner_model = pipeline(\"token-classification\", model=\"dslim/bert-large-NER\")\n",
+    "summarization_pipeline = pipeline(\"summarization\", model=\"facebook/bart-large-cnn\")\n",
+    "ner_models = {\n",
+    "    \"bert-large-NER\": \"dslim/bert-large-NER\",\n",
+    "    \"bioNER\": \"d4data/biomedical-ner-all\",\n",
+    "    \"SpaCy English NER\": \"en_core_web_trf\",\n",
+    "}\n",
+    "spacy_ner_model = spacy.load(ner_models[\"SpaCy English NER\"])\n",
+    "ner_model_bio = pipeline(\"token-classification\", model=\"d4data/biomedical-ner-all\")\n",
+    "\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"dslim/bert-base-NER\")\n",
+    "from spacy import displacy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extracting text from pdf & image file\n",
+    "\n",
+    "\n",
+    "def extract_text_from_pdf(pdf_bytes):\n",
+    "    text = \"\"\n",
+    "    pdf_file = BytesIO(pdf_bytes)\n",
+    "    pdf_reader = PdfReader(pdf_file)\n",
+    "\n",
+    "    for page_number in range(len(pdf_reader.pages)):\n",
+    "        page = pdf_reader.pages[page_number]\n",
+    "        text += page.extract_text()\n",
+    "    return text\n",
+    "\n",
+    "\n",
+    "def extract_text_from_image_or_pdf(file_bytes):\n",
+    "    try:\n",
+    "        if file_bytes.startswith(b'%PDF'):\n",
+    "            text = extract_text_from_pdf(file_bytes)\n",
+    "            print(text)\n",
+    "        else:\n",
+    "            image = Image.open(BytesIO(file_bytes))\n",
+    "            text = pytesseract.image_to_string(image)\n",
+    "\n",
+    "        return text\n",
+    "    except Exception as e:\n",
+    "        return f\"Error extracting file\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def image_ner_tool(file, model_name):\n",
+    "    reformatted_ner_output = \"\"\n",
+    "    try:\n",
+    "        if isinstance(file, str):\n",
+    "            with open(file, \"rb\") as file_stream:\n",
+    "                file_bytes = file_stream.read()\n",
+    "        else:\n",
+    "            file_bytes = file.getvalue()\n",
+    "        text = extract_text_from_image_or_pdf(file_bytes)\n",
+    "        return text\n",
+    "    except Exception as e:\n",
+    "        error_message = f\"Error processing file:{str(e)}\"\n",
+    "        return error_message, \"\", reformatted_ner_output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on local URL: http://127.0.0.1:7864\n",
+      "\n",
+      "Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"http://127.0.0.1:7864/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": []
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Gradio\n",
+    "with gr.Blocks() as demo:\n",
+    "    gr.Markdown(\n",
+    "        \"\"\"\n",
+    "        <p style=\"text-align: center; font-weight: bold; font-size: 44px;\">\n",
+    "            Intelligent Document Processing\n",
+    "        </p>\n",
+    "        <p style=\"text-align: center;\">\n",
+    "            Upload a PDF or an image file to extract text and identify named entities\n",
+    "        </p>\n",
+    "        \"\"\"\n",
+    "    )\n",
+    "\n",
+    "    with gr.Row() as row:\n",
+    "        with gr.Column():\n",
+    "            text1 = gr.File(label=\"Upload File\")\n",
+    "            model = gr.Dropdown(list(ner_models.keys()), label=\"Select NER Model\")\n",
+    "            btn = gr.Button(\"Submit\")\n",
+    "\n",
+    "        with gr.Column():\n",
+    "            with gr.Tab(\"Extract Text\"):\n",
+    "                output1 = gr.Textbox(label=\"Extracted Text\", container=True)\n",
+    "\n",
+    "    btn.click(\n",
+    "        image_ner_tool,\n",
+    "        [text1, model],\n",
+    "        [output1],\n",
+    "    )\n",
+    "\n",
+    "demo.launch(share=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
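Note that app.py is still empty in this commit, so the notebook is the only runnable entry point. As a rough standalone sketch of the extraction path the notebook wires into Gradio (assuming the PyPDF2, pytesseract, and Pillow packages implied by requirements.txt, and a hypothetical local file sample.pdf that is not part of this commit):

# Standalone sketch, not part of this commit: the notebook's PDF/image text
# extraction without the Gradio wrapper. "sample.pdf" is a hypothetical input.
from io import BytesIO

import pytesseract
from PIL import Image
from PyPDF2 import PdfReader


def extract_text_from_image_or_pdf(file_bytes: bytes) -> str:
    """Route PDF bytes through PyPDF2 and anything else through Tesseract OCR."""
    if file_bytes.startswith(b"%PDF"):
        reader = PdfReader(BytesIO(file_bytes))
        # extract_text() can return None on image-only pages, hence the "or ''"
        return "".join(page.extract_text() or "" for page in reader.pages)
    image = Image.open(BytesIO(file_bytes))
    return pytesseract.image_to_string(image)


if __name__ == "__main__":
    with open("sample.pdf", "rb") as fh:  # hypothetical sample document
        print(extract_text_from_image_or_pdf(fh.read()))

Unlike the committed helper, this sketch lets exceptions propagate rather than returning an "Error extracting file" string, so failures surface while the interface is still being built.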
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+PyPDF2
+transformers
+pytesseract
+torch
+spacy-transformers
+gradio
+https://huggingface.co/spacy/en_core_web_trf/resolve/main/en_core_web_trf-any-py3-none-any.whl
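The requirements are unpinned, and the last line installs the spaCy en_core_web_trf transformer model straight from its wheel URL. A small sanity-check script (a sketch under the assumption that the install above succeeded; the model names are the ones hard-coded in the notebook) can confirm the environment before launching the Gradio demo:

# Environment sanity check, a sketch rather than part of this commit: confirm
# that the models the notebook hard-codes can be loaded with these requirements.
import spacy
from transformers import pipeline

# spaCy model installed from the wheel URL in requirements.txt
nlp = spacy.load("en_core_web_trf")

# Hugging Face pipelines used in the notebook; the first call downloads weights
ner = pipeline("token-classification", model="dslim/bert-large-NER")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

print([(ent.text, ent.label_) for ent in nlp("Steve Jobs founded Apple in California.").ents])
print(ner("Hugging Face is based in New York City."))
print(summarizer(
    "PyPDF2 reads text from digital PDFs, while pytesseract runs Tesseract "
    "OCR on scanned images so both kinds of documents can be summarized.",
    max_length=30, min_length=5, do_sample=False,
))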