import time
import pandas as pd

import torch
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from keyphrase_vectorizers import KeyphraseCountVectorizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
#from fastT5 import export_and_get_onnx_model, set_auth_token

import nltk
from nltk.tokenize import sent_tokenize

from huggingface_hub import snapshot_download, HfFolder
import streamlit as st

import traceback
import logging

logger = logging.getLogger(__name__)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Persist the Hugging Face token from Streamlit secrets so the private models can be downloaded
HfFolder.save_token(st.secrets["hf-auth-token"])

@st.cache(allow_output_mutation=True)
def load_base_model():
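    """Load the KeyBERT extractor and the T5 paraphrasing model.

    Cached with st.cache so the models are only loaded once across Streamlit reruns.
    Returns (kw_extractor, t5_model, t5_tokenizer), or None if loading fails.
    """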
    try:
        nltk.download('stopwords')
        nltk.download('punkt')
        # Load KeyBert Model
        tmp_model = SentenceTransformer('valurank/MiniLM-L6-Keyword-Extraction', use_auth_token=True)
        kw_extractor = KeyBERT(tmp_model)

        # Load T5 for Paraphrasing
        t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
        t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=True)
        t5_model = t5_model.to(device)
        return kw_extractor, t5_model, t5_tokenizer
    except Exception:
        st.error('Error Loading Models. Please contact admin')
        logger.error(traceback.format_exc())
    
    

def get_keybert_results_with_vectorizer(text, number_of_results=20):
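    """Extract keyphrases from `text` using KeyBERT with a KeyphraseCountVectorizer.

    Only candidates scoring at least 0.25 are kept.
    Returns a list of (keyword, score) tuples, or None on failure.
    """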
    try:
        keywords = kw_extractor.extract_keywords(text, vectorizer=KeyphraseCountVectorizer(), stop_words=None, top_n=number_of_results)
        keywords = [kw for kw in keywords if kw[1] >= 0.25]  # keep only reasonably confident candidates
        return keywords
    except Exception:
        st.error('Error running Keybert. Please contact admin')
        logger.error(traceback.format_exc())



def t5_paraphraser(text, number_of_results=5):
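    """Paraphrase `text` with the fine-tuned T5 model.

    Generation uses top-k/top-p sampling, so the returned sequences can differ.
    Returns a list of `number_of_results` paraphrases, or None on failure.
    """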
    try:
        text = "paraphrase: " + text  # prepend the task prefix expected by the paraphrasing model
        encoding = t5_tokenizer.encode_plus(text, padding='max_length', return_tensors="pt")
        input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

        beam_outputs = t5_model.generate(
            input_ids=input_ids, attention_mask=attention_masks,
            do_sample=True,
            max_length=1024,
            top_k=50,
            top_p=0.95,
            early_stopping=True,
            num_return_sequences=number_of_results
        )

        final_outputs = []
        for beam_output in beam_outputs:
            sent = t5_tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            final_outputs.append(sent)
        return final_outputs
    except Exception:
        st.error('Error running T5 Paraphrasing. Please contact admin')
        logger.error(traceback.format_exc())
    
    
    
def run_long_extraction(article, number_of_paraphrases):
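    """Keyword-extraction pipeline for long documents (512 tokens or more).

    Extracts keywords from the full article, keeps only the sentences that
    contain one of them, paraphrases those sentences with T5, and re-runs
    keyword extraction on the paraphrases.
    Returns (t5_keywords_df, original_keywords_df, unique_keywords_df, runtime).
    """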
    try:
        start1 = time.time()
        with st.spinner('Extracting Keywords from Original Document...'):
            original_keywords = get_keybert_results_with_vectorizer(article, number_of_results=30)
            article_sentences = sent_tokenize(article)
            target_sentences = [sent for sent in article_sentences if any(kw[0] in sent for kw in original_keywords)]

        st.success('Keyword Extraction from Original Document finished in {:.2f}s'.format(time.time() - start1))
        st.info(f'Total Sentences in Article: {len(article_sentences)}')
        st.info(f'Total Target Sentences Selected: {len(target_sentences)}')

        start2 = time.time()
        with st.spinner('Extracting Keywords from Paraphrased Target Sentences...'):
            t5_paraphrasing_keywords = []
            for sent in target_sentences:
                ### T5
                t5_paraphrased = t5_paraphraser(sent, number_of_results=number_of_paraphrases)
                t5_keywords = [get_keybert_results_with_vectorizer(i) for i in t5_paraphrased]
                t5_keywords = [(word[0], word[1]) for s in t5_keywords for word in s]  # flatten the per-paraphrase keyword lists
                t5_paraphrasing_keywords.extend(t5_keywords)
        st.success('Keyword Extraction from Paraphrased Target Sentences finished in {:.2f}s'.format(time.time() - start2))

        original_keywords_df = pd.DataFrame(original_keywords, columns=['Keyword', 'Score'])

        t5_keywords_df = pd.DataFrame(t5_paraphrasing_keywords, columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first').reset_index(drop=True)

        # Keep only paraphrase keywords that do not already occur, as a case-insensitive substring, among the original keywords
        unique_keywords_df = pd.DataFrame([kw for kw in t5_paraphrasing_keywords if not original_keywords_df['Keyword'].str.contains(kw[0], regex=False, case=False).any()], columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first').reset_index(drop=True)

        total_end = time.time() - start1

        return t5_keywords_df, original_keywords_df, unique_keywords_df, total_end
    except Exception:
        st.error('Error running Extraction Pipeline. Please contact admin')
        logger.error(traceback.format_exc())     
          
          
          
def run_short_extraction(article, number_of_paraphrases):
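    """Keyword-extraction pipeline for short documents (fewer than 512 tokens).

    Groups consecutive sentences into chunks of at most 96 T5 tokens, paraphrases
    each chunk with T5, and re-runs keyword extraction on the paraphrases.
    Returns (t5_keywords_df, original_keywords_df, unique_keywords_df, runtime).
    """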
    try:
        start1 = time.time()
        original_keywords = get_keybert_results_with_vectorizer(article)
        article_sentences = sent_tokenize(article)
        st.info(f'Total Sentences in Article: {len(article_sentences)}')

        # Group consecutive sentences into chunks of at most 96 T5 tokens,
        # so each paraphrasing call receives a reasonably sized input
        target_sentences = []
        tmp = []
        token_count = 0
        for i in article_sentences:
            enc = t5_tokenizer.encode(i)
            if token_count + len(enc) <= 96:
                tmp.append(i)
                token_count += len(enc)
            else:
                if tmp:
                    target_sentences.append(' '.join(tmp))
                token_count = len(enc)
                tmp = [i]
        if tmp:  # flush the final, partially filled chunk
            target_sentences.append(' '.join(tmp))
     
        start2 = time.time()
        with st.spinner('Extracting Keywords from Paraphrased Sentence Groups...'):
            t5_paraphrasing_keywords = []
            for sent in target_sentences:
                ### T5
                t5_paraphrased = t5_paraphraser(sent, number_of_results=number_of_paraphrases)
                t5_keywords = [get_keybert_results_with_vectorizer(i) for i in t5_paraphrased]
                t5_keywords = [(word[0], word[1]) for s in t5_keywords for word in s]  # flatten the per-paraphrase keyword lists
                t5_paraphrasing_keywords.extend(t5_keywords)
        st.success('Keyword Extraction from Paraphrased Grouped Sentences finished in {:.2f}s'.format(time.time() - start2))

        original_keywords_df = pd.DataFrame(original_keywords, columns=['Keyword', 'Score'])

        t5_keywords_df = pd.DataFrame(t5_paraphrasing_keywords, columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first').reset_index(drop=True)

        # Keep only paraphrase keywords that do not already occur, as a case-insensitive substring, among the original keywords
        unique_keywords_df = pd.DataFrame([kw for kw in t5_paraphrasing_keywords if not original_keywords_df['Keyword'].str.contains(kw[0], regex=False, case=False).any()], columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first').reset_index(drop=True)

        total_end = time.time() - start1

        return t5_keywords_df, original_keywords_df, unique_keywords_df, total_end
    except Exception:
        st.error('Error running Extraction Pipeline. Please contact admin')
        logger.error(traceback.format_exc())     
        



def check_document_length(article, number_of_paraphrases):
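    """Route the article to the long or short extraction pipeline based on its T5 token count."""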
    total_tokens = len(t5_tokenizer.encode(article))
    st.info(f'Token Count for Encoded Document: {total_tokens}')

    if total_tokens >= 512:
        st.info('Running Extraction for Long Document')
        t5_keywords_df, original_keywords_df, unique_keywords_df, total_end = run_long_extraction(article, number_of_paraphrases)
    else:
        st.info('Running Extraction for Short Document')
        t5_keywords_df, original_keywords_df, unique_keywords_df, total_end = run_short_extraction(article, number_of_paraphrases)
    
    return t5_keywords_df, original_keywords_df, unique_keywords_df, total_end
    
    

# Load the models once at startup; the functions above reference these globals
kw_extractor, t5_model, t5_tokenizer = load_base_model()


st.title('Exhaustive Keyword Extraction with Paraphrasing')
with st.sidebar:
    st.header('Overview')
    st.markdown('This demo lets users input a text article and generate synonym-aware keywords. The pipeline uses a T5 model to paraphrase target sentences and a sentence-transformers model for keyword extraction.')

    st.header('Parameters')
    # number_of_keywords = st.slider('Number of Keywords to extract for each target sentence', min_value=5, max_value=50, step=5, value=20)
    number_of_paraphrases = st.slider('Number of Paraphrased versions to generate for each target sentence', min_value=1, max_value=20, step=1, value=5)

    st.header('Specifications')
    st.markdown('To generate context-aware and out-of-vocabulary keywords for long documents, we first run KeyBERT on the original article. The sentences containing those keywords are then passed through T5 to generate multiple paraphrased versions, which are run through keyword extraction again to produce the final results.')



doc = st.text_area("Enter a custom document")
if doc:
    t5_keywords_df, original_keywords_df, unique_keywords_df, total_end = check_document_length(doc, number_of_paraphrases)

    st.text(f'PIPELINE RUNTIME: {total_end:.2f}s\n')

    st.subheader('Original Keywords Extracted')
    st.dataframe(original_keywords_df)

    st.subheader('T5 Unique New Keywords Extracted')
    st.dataframe(unique_keywords_df)

    st.subheader('T5 Keywords Extracted')
    st.dataframe(t5_keywords_df)