import time

import torch

from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
from sentence_transformers import SentenceTransformer
from transformers import T5ForConditionalGeneration, T5Tokenizer

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('stopwords')
nltk.download('punkt')

import streamlit as st

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the KeyBERT model. KeyBERT's constructor does not accept an auth token,
# so the embedding model is loaded through sentence-transformers first and
# passed in (use_auth_token is the pre-`token` parameter name).
kw_model = SentenceTransformer('valurank/MiniLM-L6-Keyword-Extraction', use_auth_token=st.secrets["hf-auth-token"])
kw_extractor = KeyBERT(model=kw_model)

# Load T5 for Paraphrasing
t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=st.secrets["hf-auth-token"])
t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
t5_model = t5_model.to(device)

def get_keybert_results_with_vectorizer(text, number_of_results=20):
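    """Return up to `number_of_results` (keyphrase, score) tuples for `text`.

    KeyphraseCountVectorizer restricts the candidates to noun phrases, and
    KeyBERT scores each candidate by embedding similarity to the document.
    """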
    keywords = kw_extractor.extract_keywords(text, vectorizer=KeyphraseCountVectorizer(), stop_words=None, top_n=number_of_results)
    return keywords

def t5_paraphraser(text, number_of_results=10):
    # T5Tokenizer appends the </s> EOS token itself, so only the task prefix is added
    text = "paraphrase: " + text
    max_len = 2048
    encoding = t5_tokenizer.encode_plus(text, max_length=max_len, truncation=True, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    # Top-k / nucleus sampling, returning several candidate paraphrases
    sampled_outputs = t5_model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        do_sample=True,
        max_length=2048,
        top_k=50,
        top_p=0.95,
        num_return_sequences=number_of_results
    )

    final_outputs = []
    for output in sampled_outputs:
        sent = t5_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        final_outputs.append(sent)

    return final_outputs
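
# Illustrative call (output varies between runs because of sampling):
#   t5_paraphraser("The weather was nice today.", number_of_results=2)
#   -> ['Today the weather was pleasant.', 'It was nice outside today.']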
    
  
#### Extract Sentences with Keywords -> Paraphrase multiple versions -> Extract Keywords again

def extract_paraphrased_sentences(article):
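    """Paraphrase keyword-bearing sentences with T5, then re-extract keyphrases.

    Returns every keyphrase found across all paraphrases; the console output
    also lists the ones that are new relative to the original document.
    """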

    # original_keywords holds the phrase strings only; scores are dropped
    original_keywords = [kw[0] for kw in get_keybert_results_with_vectorizer(article)]

    article_sentences = sent_tokenize(article)
    # Keep the sentences that contain at least one extracted keyphrase
    target_sentences = [sent for sent in article_sentences if any(kw in sent for kw in original_keywords)]

    start1 = time.time()
    t5_paraphrasing_keywords = []

    for sent in target_sentences:
        ### T5
        t5_paraphrased = t5_paraphraser(sent)
        t5_keywords = [get_keybert_results_with_vectorizer(i) for i in t5_paraphrased]
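        # Flatten: each paraphrase yields its own list of (phrase, score) tuples;
        # keep just the phrase strings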
        t5_keywords = [word[0] for s in t5_keywords for word in s]

        t5_paraphrasing_keywords.extend(t5_keywords)

    print(f'T5 Approach2 PARAPHRASING RUNTIME: {time.time()-start1}\n')

    print('T5 Keywords Extracted: \n{}\n\n'.format(t5_paraphrasing_keywords))
    print('----------------------------')
    print('T5 Unique New Keywords Extracted: \n{}\n\n'.format([i for i in set(t5_paraphrasing_keywords) 
                                                                if i not in original_keywords]))
    
    return t5_paraphrasing_keywords
    

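# Minimal Streamlit UI: paste a document, run the pipeline, display the keywords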
doc = st.text_area("Enter a custom document")

if doc:
    keywords = extract_paraphrased_sentences(doc)
    st.write(keywords)