Spaces:
Build error
Build error
File size: 9,762 Bytes
fb6f6f9 02ac4f2 7f3cfe8 44f0002 7f3cfe8 1704eda 7f3cfe8 52022f7 7f3cfe8 336de0c 7f3cfe8 52022f7 7f3cfe8 48029cd 7d24d84 336de0c 6ed92b7 336de0c 6ed92b7 7d24d84 336de0c 7d24d84 336de0c 48029cd 7f3cfe8 336de0c 7d24d84 336de0c 7f3cfe8 48029cd 94ab8d9 336de0c 7d24d84 336de0c 7d24d84 336de0c 7d24d84 336de0c 7d24d84 336de0c 7f3cfe8 48029cd 7d24d84 336de0c 7d24d84 336de0c 7d24d84 336de0c 7d24d84 336de0c 6ed92b7 7d24d84 336de0c 7d24d84 336de0c 7d24d84 336de0c 7d24d84 34f4c1d 7d24d84 336de0c 7d24d84 336de0c 7d24d84 f195951 7d24d84 7f3cfe8 6ed92b7 7d24d84 6ed92b7 7d24d84 6ed92b7 7d24d84 6ed92b7 7d24d84 6ed92b7 02ac4f2 7d24d84 02ac4f2 6ed92b7 7d24d84 02ac4f2 7d24d84 48029cd 7d24d84 02ac4f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 |
import time
import pandas as pd
import torch
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from keyphrase_vectorizers import KeyphraseCountVectorizer
from transformers import T5ForConditionalGeneration,T5Tokenizer
#from fastT5 import export_and_get_onnx_model, set_auth_token
import nltk
from nltk.tokenize import sent_tokenize
from huggingface_hub import snapshot_download, HfFolder
import streamlit as st
import traceback
import logging
# Module-level logger; the exception handlers in this file write full
# tracebacks to it.
logger = logging.getLogger(__name__)
# Run the T5 model on the GPU when torch reports CUDA as available,
# otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Persist the Hugging Face token taken from the Streamlit secrets; the model
# loads in this file pass `use_auth_token=True` and presumably rely on it.
HfFolder.save_token(st.secrets["hf-auth-token"])
@st.cache(allow_output_mutation=True)
def load_base_model():
    """Load the keyword extractor and the T5 paraphrasing model.

    Downloads the NLTK ``stopwords`` and ``punkt`` resources, wraps the
    ``valurank/MiniLM-L6-Keyword-Extraction`` sentence transformer in a
    KeyBERT extractor, and loads the ``t5-base`` tokenizer together with the
    ``valurank/t5-paraphraser`` model, which is moved to ``device``.

    Returns:
        tuple: ``(kw_extractor, t5_model, t5_tokenizer)``.

    Raises:
        Exception: any loading error is shown in the UI, logged with its
            traceback and then re-raised.
    """
    try:
        nltk.download('stopwords')
        nltk.download('punkt')

        # Load KeyBert Model
        tmp_model = SentenceTransformer(
            'valurank/MiniLM-L6-Keyword-Extraction', use_auth_token=True
        )
        kw_extractor = KeyBERT(tmp_model)

        # Load T5 for Paraphrasing
        t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
        t5_model = T5ForConditionalGeneration.from_pretrained(
            'valurank/t5-paraphraser', use_auth_token=True
        )
        t5_model = t5_model.to(device)

        return kw_extractor, t5_model, t5_tokenizer
    except Exception:
        st.error('Error Loading Models. Please contact admin')
        logger.error(traceback.format_exc())
        # Re-raise instead of falling through: an implicit `None` return
        # would be stored by the cache decorator and would make the
        # three-way unpacking of this function's result fail with an
        # unrelated TypeError that hides the real cause.
        raise
def get_keybert_results_with_vectorizer(text, number_of_results=20, min_score=0.25):
    """Extract keyphrases from ``text`` with KeyBERT and a keyphrase vectorizer.

    Args:
        text: Document (or sentence) to extract keyphrases from.
        number_of_results: ``top_n`` passed to KeyBERT.
        min_score: Minimum score a candidate needs to be kept. Defaults to
            0.25, the threshold that used to be hard-coded.

    Returns:
        list: the ``(keyword, score)`` entries returned by KeyBERT whose score
        is at least ``min_score``. An empty list is returned when extraction
        fails, so callers can always iterate the result.
    """
    try:
        candidates = kw_extractor.extract_keywords(
            text,
            vectorizer=KeyphraseCountVectorizer(),
            stop_words=None,
            top_n=number_of_results,
        )
        return [kw for kw in candidates if kw[1] >= min_score]
    except Exception:
        st.error('Error running Keybert. Please contact admin')
        logger.error(traceback.format_exc())
        # Callers loop over the result; `None` would raise a second,
        # misleading TypeError on top of the error reported above.
        return []
def t5_paraphraser(text, number_of_results=5):
    """Generate sampled paraphrases of ``text`` with the T5 model.

    Args:
        text: Sentence or short passage to paraphrase. It is prefixed with
            ``"paraphrase: "`` before being tokenized.
        number_of_results: Number of sequences to sample
            (``num_return_sequences``).

    Returns:
        list[str]: the decoded paraphrases, or an empty list when generation
        fails, so callers can always iterate the result.
    """
    try:
        prompt = "paraphrase: " + text
        # Only one sequence is encoded, so no padding is requested. The
        # deprecated `pad_to_max_length=True` padded every input up to the
        # tokenizer's maximum length, and those positions were masked out by
        # the attention mask anyway.
        encoding = t5_tokenizer(prompt, return_tensors="pt")
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)

        sampled_outputs = t5_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            do_sample=True,
            max_length=1024,
            top_k=50,
            top_p=0.95,
            early_stopping=True,
            num_return_sequences=number_of_results,
        )

        return [
            t5_tokenizer.decode(
                sequence,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
            for sequence in sampled_outputs
        ]
    except Exception:
        st.error('Error running T5 Paraphrasing. Please contact admin')
        logger.error(traceback.format_exc())
        # Callers loop over the paraphrases; returning `None` would raise a
        # second TypeError on top of the error reported above.
        return []
def _paraphrase_and_extract(target_sentences, number_of_paraphrases):
    """Paraphrase every target text and collect keywords from the paraphrases."""
    collected = []
    for sent in target_sentences:
        # `or []`: both helpers report their own failures and may hand back
        # nothing; treat that as "no results" instead of crashing here.
        paraphrases = t5_paraphraser(sent, number_of_results=number_of_paraphrases) or []
        for paraphrase in paraphrases:
            found = get_keybert_results_with_vectorizer(paraphrase) or []
            collected.extend((kw[0], kw[1]) for kw in found)
    return collected


def _build_keyword_frames(original_keywords, paraphrased_keywords):
    """Build the (paraphrased, original, unique-new) keyword DataFrames."""
    columns = ['Keyword', 'Score']

    def ranked(rows):
        # Highest score first, keeping only the best row per keyword.
        return (
            pd.DataFrame(rows, columns=columns)
            .sort_values(by='Score', ascending=False)
            .drop_duplicates(subset=['Keyword'], keep='first')
            .reset_index(drop=True)
        )

    original_keywords_df = pd.DataFrame(original_keywords, columns=columns)
    t5_keywords_df = ranked(paraphrased_keywords)

    # A paraphrased keyword counts as new when no original keyword contains
    # it (case-insensitive, literal substring match).
    known = original_keywords_df['Keyword']
    novel = [
        row for row in paraphrased_keywords
        if not known.str.contains(row[0], regex=False, case=False).any()
    ]
    unique_keywords_df = ranked(novel)
    return t5_keywords_df, original_keywords_df, unique_keywords_df


def _group_sentences_by_tokens(sentences, max_tokens):
    """Greedily join consecutive sentences into groups of at most ``max_tokens`` T5 tokens."""
    groups = []
    current = []
    current_tokens = 0
    for sentence in sentences:
        n_tokens = len(t5_tokenizer.encode(sentence))
        # Close the running group only if it has content, so an over-long
        # first sentence does not produce an empty-string group.
        if current and current_tokens + n_tokens > max_tokens:
            groups.append(' '.join(current))
            current = []
            current_tokens = 0
        current.append(sentence)
        current_tokens += n_tokens
    # Flush the trailing group. Without this the last sentences of the
    # document (or the whole document, when it fits in one group) were
    # never paraphrased.
    if current:
        groups.append(' '.join(current))
    return groups


def _failed_extraction_result(started_at):
    """Return empty result frames plus elapsed time, matching the success shape."""
    columns = ['Keyword', 'Score']
    return (
        pd.DataFrame(columns=columns),
        pd.DataFrame(columns=columns),
        pd.DataFrame(columns=columns),
        time.time() - started_at,
    )


def run_long_extraction(article, number_of_paraphrases):
    """Run the paraphrase-augmented keyword pipeline on a long document.

    Keywords are extracted from the whole article first; only the sentences
    that mention one of those keywords are paraphrased with T5, and keywords
    are then extracted again from the paraphrases.

    Args:
        article: Full document text.
        number_of_paraphrases: Paraphrases to generate per target sentence.

    Returns:
        tuple: ``(t5_keywords_df, original_keywords_df, unique_keywords_df,
        total_end)`` where the first three are DataFrames with ``Keyword`` and
        ``Score`` columns and ``total_end`` is the elapsed time in seconds.
        On failure the error is reported and three empty DataFrames are
        returned, so the caller's four-way unpacking still works.
    """
    start1 = time.time()
    try:
        with st.spinner('Extraction Keywords from Original Document...'):
            original_keywords = get_keybert_results_with_vectorizer(article, number_of_results=30) or []
            article_sentences = sent_tokenize(article)
            # Compare case-insensitively: the extracted keyphrases are
            # presumably lower-cased by the vectorizer (its documented
            # default), so a case-sensitive test would miss capitalised
            # occurrences such as a keyword at the start of a sentence.
            lowered_keywords = [kw[0].lower() for kw in original_keywords]
            target_sentences = []
            for sent in article_sentences:
                lowered_sent = sent.lower()
                if any(kw in lowered_sent for kw in lowered_keywords):
                    target_sentences.append(sent)
        st.success('Keyword Extraction from Original Document finished in {}'.format(time.time() - start1))
        st.info(f'Total Sentences in Article : {len(article_sentences)}')
        st.info(f'Total Target Sentences Selected : {len(target_sentences)}')

        start2 = time.time()
        with st.spinner('Extracting Keywords from Paraphrased Target Sentences...'):
            t5_paraphrasing_keywords = _paraphrase_and_extract(target_sentences, number_of_paraphrases)
        st.success('Keyword Extraction from Paraphrased Target Sentences finished in {}'.format(time.time() - start2))

        t5_keywords_df, original_keywords_df, unique_keywords_df = _build_keyword_frames(
            original_keywords, t5_paraphrasing_keywords
        )
        total_end = time.time() - start1
        return t5_keywords_df, original_keywords_df, unique_keywords_df, total_end
    except Exception:
        st.error('Error running Extraction Pipeline. Please contact admin')
        logger.error(traceback.format_exc())
        return _failed_extraction_result(start1)


def run_short_extraction(article, number_of_paraphrases, max_group_tokens=96):
    """Run the paraphrase-augmented keyword pipeline on a short document.

    All sentences are used: consecutive sentences are joined into groups of
    at most ``max_group_tokens`` T5 tokens, each group is paraphrased, and
    keywords are extracted from the paraphrases.

    Args:
        article: Full document text.
        number_of_paraphrases: Paraphrases to generate per sentence group.
        max_group_tokens: Token budget per sentence group. Defaults to 96,
            the value that used to be hard-coded.

    Returns:
        tuple: ``(t5_keywords_df, original_keywords_df, unique_keywords_df,
        total_end)`` with the same layout as ``run_long_extraction``. On
        failure the error is reported and three empty DataFrames are
        returned, so the caller's four-way unpacking still works.
    """
    start1 = time.time()
    try:
        original_keywords = get_keybert_results_with_vectorizer(article) or []
        article_sentences = sent_tokenize(article)
        st.info(f'Total Sentences in Article : {len(article_sentences)}')

        target_sentences = _group_sentences_by_tokens(article_sentences, max_group_tokens)

        start2 = time.time()
        with st.spinner('Extracting Keywords from Paraphrased Sentences Groups...'):
            t5_paraphrasing_keywords = _paraphrase_and_extract(target_sentences, number_of_paraphrases)
        st.success('Keyword Extraction from Paraphrased Grouped Sentences finished in {}'.format(time.time() - start2))

        t5_keywords_df, original_keywords_df, unique_keywords_df = _build_keyword_frames(
            original_keywords, t5_paraphrasing_keywords
        )
        total_end = time.time() - start1
        return t5_keywords_df, original_keywords_df, unique_keywords_df, total_end
    except Exception:
        st.error('Error running Extraction Pipeline. Please contact admin')
        logger.error(traceback.format_exc())
        return _failed_extraction_result(start1)
def check_document_length(article, number_of_paraphrases):
    """Dispatch the article to the long or short pipeline by T5 token count.

    The article is encoded with the T5 tokenizer; 512 tokens or more selects
    ``run_long_extraction``, anything shorter ``run_short_extraction``. The
    token count and the chosen route are shown in the UI.

    Args:
        article: Full document text.
        number_of_paraphrases: Forwarded unchanged to the chosen pipeline.

    Returns:
        tuple: ``(t5_keywords_df, original_keywords_df, unique_keywords_df,
        total_end)`` exactly as produced by the chosen pipeline.
    """
    total_tokens = len(t5_tokenizer.encode(article))
    st.info(f'Token Counts for Encoded Document: {total_tokens}')

    if total_tokens < 512:
        st.info('Running Extraction for Short Document')
        outcome = run_short_extraction(article, number_of_paraphrases)
    else:
        st.info('Running Extraction for Long Document')
        outcome = run_long_extraction(article, number_of_paraphrases)

    # Unpack and repack so the result is always a fresh 4-tuple.
    t5_keywords_df, original_keywords_df, unique_keywords_df, total_end = outcome
    return t5_keywords_df, original_keywords_df, unique_keywords_df, total_end
# Models used by the functions defined earlier in this file; they are read as
# module-level globals.
kw_extractor, t5_model, t5_tokenizer = load_base_model()

st.title('Exhaustive Keyword Extraction with Paraphrasing')

# Sidebar: overview text and the single tunable parameter.
with st.sidebar:
    st.header('Overview')
    st.markdown('This demo allows users to input text article and generate synonym-aware keywords. The pipeline includes the use of T5 Model for paraphrasing target sentences, and Sentence-transformers based Keyword Extraction')
    st.header('Parameters')
    # number_of_keywords = st.slider('Number of Keywords to extract for each target sentence', min_value=5, max_value=50, step=5, value=20)
    number_of_paraphrases = st.slider(
        'Number of Paraphrased versions to generate for each target sentence',
        min_value=1,
        max_value=20,
        step=1,
        value=5,
    )
    st.header('Specifications')
    # st.markdown('To generate context aware and OOV keywords for long, we first run KeyBert for keyword extraction on the original article. The sentences which had Keywords are then passed through T5 for generating multiple paraphrased versions. These paraphrased sentences are then run through Keyword Extraction again to generate the final results')

doc = st.text_area("Enter a custom document")

# Run the pipeline only once the user has entered some text.
if doc:
    t5_keywords_df, original_keywords_df, unique_keywords_df, total_end = check_document_length(doc, number_of_paraphrases)
    # extract_paraphrased_article(input_list[0])
    st.text(f'PIPELINE RUNTIME: {total_end}\n')

    # Render the three result tables in their fixed order.
    for heading, frame in (
        ('\nOriginal Keywords Extracted:\n\n', original_keywords_df),
        ('\nT5 Unique New Keywords Extracted:\n\n', unique_keywords_df),
        ('\nT5 Keywords Extracted:\n\n', t5_keywords_df),
    ):
        st.subheader(heading)
        st.dataframe(frame)
|