checking the code

app.py CHANGED
@@ -10,11 +10,8 @@ from transformers import (DebertaTokenizerFast,
 import tensorflow as tf
 import spacy
 import streamlit as st
-from scraper import scrape_text
 
 
-os.environ['TF_USE_LEGACY_KERAS'] = "1"
-
 class NERLabelEncoder:
     '''
     Label Encoder to encode and decode the entity labels
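One note on the dropped environment variable: TF_USE_LEGACY_KERAS=1 pins tf.keras to Keras 2 (via the tf-keras package) on TensorFlow builds that ship Keras 3, and it only takes effect if set before TensorFlow is imported. A minimal sketch of that pattern, should the Space ever need it back:

import os
os.environ["TF_USE_LEGACY_KERAS"] = "1"  # must be set before the TensorFlow import

import tensorflow as tf  # tf.keras now resolves to Keras 2 when tf-keras is installed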
@@ -75,7 +72,6 @@ def load_ner_models():
 
 ner_model, ner_label_encoder, ner_tokenizer, nlp = load_ner_models()
 
-
 ############ NER MODEL & VARS INITIALIZATION END ####################
 
 ############ NER LOGIC START ####################
@@ -151,9 +147,10 @@ def ner_inference_long_text(txt):
     entities = []
     doc = nlp(txt)
     for sent in doc.sents:
-        entities.
+        entities.extend(ner_inference(sent.text))
     return entities
 
+
 def get_ner_text(article_txt, ner_result):
     res_txt = ''
     start = 0
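The fix above completes the per-sentence loop: long articles are sentence-split with spaCy and the transformer NER runs on each sentence, with extend flattening the per-sentence results into one list. A self-contained sketch of the same pattern (the ner_inference body here is a hypothetical stand-in for the DeBERTa call in app.py):

import spacy

nlp = spacy.load("en_core_web_sm")  # sentence segmenter, as in app.py

def ner_inference(sent_text):
    # Hypothetical stand-in: app.py's ner_inference runs the DeBERTa model
    # and returns a list of entity predictions for one sentence.
    return []

def ner_inference_long_text(txt):
    entities = []
    doc = nlp(txt)
    for sent in doc.sents:
        # extend (not append) keeps a flat list of entities across sentences
        entities.extend(ner_inference(sent.text))
    return entities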
@@ -177,7 +174,6 @@ def get_ner_text(article_txt, ner_result):
 
 ############ NER LOGIC END ####################
 
-
 ############ SUMMARIZATION MODEL & VARS INITIALIZATION START ####################
 SUMM_CHECKPOINT = "facebook/bart-base"
 SUMM_INPUT_N_TOKENS = 400
@@ -213,23 +209,13 @@ def summ_inference_tokenize(input_: list, n_tokens: int):
     tokenized_data = summ_tokenizer(text=input_, max_length=SUMM_TARGET_N_TOKENS, truncation=True, padding="max_length", return_tensors="tf")
     return summ_tokenizer, tokenized_data
 
-def clean_summary(summary: str):
-    summary = summary.strip()
-    if summary[-1] != '.':
-        sents = summary.split(". ")
-        summary = ". ".join(sents[:-1])
-        summary += "."
-    summary = re.sub(r'^-', "", summary)
-    summary = summary.strip()
-    if len(summary) <= 5:
-        summary = ""
-    return summary
-
 def summ_inference(txt: str):
     txt = summ_preprocess(txt)
-
+    test_data = [txt]
+    inference_tokenizer, tokenized_data = summ_inference_tokenize(input_=test_data, n_tokens=SUMM_INPUT_N_TOKENS)
     pred = summ_model.generate(**tokenized_data, max_new_tokens=SUMM_TARGET_N_TOKENS)
-    result =
+    result = inference_tokenizer.decode(pred[0])
+    result = re.sub("<.*?>", "", result).strip()
     return result
 ############ SUMMARIZATION MODEL & VARS INITIALIZATION END ####################
 
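The new decode path leaves BART's special tokens (<s>, </s>, <pad>) in the string, which the re.sub("<.*?>", "", result) pass then strips; decode(pred[0], skip_special_tokens=True) would be the library-native alternative. A minimal demonstration of the cleanup in isolation:

import re
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/bart-base")
ids = tok("Quarterly revenue rose sharply.")["input_ids"]

decoded = tok.decode(ids)                     # "<s>Quarterly revenue rose sharply.</s>"
clean = re.sub("<.*?>", "", decoded).strip()  # regex strip, as in summ_inference
print(clean)                                  # Quarterly revenue rose sharply.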