numBery committed on
Commit
336de0c
1 Parent(s): a52f9ec

Update app.py

Files changed (1)
  1. app.py +88 -65
app.py CHANGED
@@ -10,12 +10,19 @@ from transformers import T5ForConditionalGeneration,T5Tokenizer
 
 import nltk
 from nltk.tokenize import sent_tokenize
-nltk.download('stopwords')
-nltk.download('punkt')
 
 from huggingface_hub import snapshot_download, HfFolder
 import streamlit as st
 
+import traceback
+import logging
+
+
+nltk.download('stopwords')
+nltk.download('punkt')
+
+logger = logging.getLogger(__name__)
+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 HfFolder.save_token(st.secrets["hf-auth-token"])
@@ -23,85 +30,101 @@ HfFolder.save_token(st.secrets["hf-auth-token"])
 
 @st.cache(allow_output_mutation=True)
 def load_model():
-    # Load KeyBert Model
-    tmp_model = SentenceTransformer('valurank/MiniLM-L6-Keyword-Extraction', use_auth_token=True)
-    kw_extractor = KeyBERT(tmp_model)
-
-    # Load T5 for Paraphrasing
-    t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=True)
-    t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
-    t5_model = t5_model.to(device)
-    return kw_extractor, t5_model, t5_tokenizer
+    try:
+        # Load KeyBert Model
+        tmp_model = SentenceTransformer('valurank/MiniLM-L6-Keyword-Extraction', use_auth_token=True)
+        kw_extractor = KeyBERT(tmp_model)
+
+        # Load T5 for Paraphrasing
+        t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=True)
+        t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
+        t5_model = t5_model.to(device)
+        return kw_extractor, t5_model, t5_tokenizer
+    except Exception:
+        st.error('Error Loading Models. Please contact admin')
+        logger.error(traceback.format_exc())
 
 kw_extractor, t5_model, t5_tokenizer = load_model()
 
 
 @st.cache()
 def get_keybert_results_with_vectorizer(text, number_of_results=20):
-    keywords = kw_extractor.extract_keywords(text, vectorizer=KeyphraseCountVectorizer(), stop_words=None, top_n=number_of_results)
-    return keywords
+
+    try:
+        keywords = kw_extractor.extract_keywords(text, vectorizer=KeyphraseCountVectorizer(), stop_words=None, top_n=number_of_results)
+        return keywords
+    except Exception:
+        st.error('Error running Keybert. Please contact admin')
+        logger.error(traceback.format_exc())
+
 
 
 @st.cache()
 def t5_paraphraser(text, number_of_results=5):
-    text = "paraphrase: " + text + " </s>"
-    max_len = 2048
-    encoding = t5_tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt")
-    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
-
-    beam_outputs = t5_model.generate(
-        input_ids=input_ids, attention_mask=attention_masks,
-        do_sample=True,
-        max_length=2048,
-        top_k=50,
-        top_p=0.95,
-        early_stopping=True,
-        num_return_sequences=number_of_results
-    )
-
-    final_outputs =[]
-    for beam_output in beam_outputs:
-        sent = t5_tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
-        final_outputs.append(sent)
-
-    return final_outputs
+    try:
+        text = "paraphrase: " + text + " </s>"
+        max_len = 2048
+        encoding = t5_tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt")
+        input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
+
+        beam_outputs = t5_model.generate(
+            input_ids=input_ids, attention_mask=attention_masks,
+            do_sample=True,
+            max_length=2048,
+            top_k=50,
+            top_p=0.95,
+            early_stopping=True,
+            num_return_sequences=number_of_results
+        )
+
+        final_outputs =[]
+        for beam_output in beam_outputs:
+            sent = t5_tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+            final_outputs.append(sent)
+        return final_outputs
+    except Exception:
+        st.error('Error running T5 Paraphrasing. Please contact admin')
+        logger.error(traceback.format_exc())
 
 
 #### Extract Sentences with Keywords -> Paraphrase multiple versions -> Extract Keywords again
 def extract_paraphrased_sentences(article):
-
-    start1 = time.time()
-    with st.spinner('Extraction Keywords from Original Document...'):
-        original_keywords = [(i[0], i[1]) for i in get_keybert_results_with_vectorizer(article)]
-
-    article_sentences = sent_tokenize(article)
-    target_sentences = [sent for sent in article_sentences if any(kw[0] in sent for kw in original_keywords)]
-    st.success('Keyword Extraction from Original Document finished in {}'.format(time.time() - start1))
-
-
-    start2 = time.time()
-    with st.spinner('Extracting Keywords from Paraphrased Target Sentences...'):
-        t5_paraphrasing_keywords = []
-
-        for sent in target_sentences:
-            ### T5
-            t5_paraphrased = t5_paraphraser(sent)
-            t5_keywords = [get_keybert_results_with_vectorizer(i) for i in t5_paraphrased]
-            t5_keywords = [(word[0], word[1]) for s in t5_keywords for word in s]
-
-            t5_paraphrasing_keywords.extend(t5_keywords)
-    st.success('Keyword Extraction from Paraphrased Target Sentences finished in {}'.format(time.time() - start2))
-
-    original_keywords_df = pd.DataFrame(original_keywords, columns=['Keyword', 'Score'])
-
-    t5_keywords_df = pd.DataFrame(t5_paraphrasing_keywords, columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first')
-    st.dataframe(t5_keywords_df)
-    unique_keywords_df = pd.DataFrame([i for i in t5_paraphrasing_keywords if not original_keywords_df['Keyword'].str.contains(i[0]).any()], columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first')
-
-    total_end = time.time()-start1
-
-    return t5_keywords_df, original_keywords_df, unique_keywords_df, total_end
-
+    try:
+        start1 = time.time()
+        with st.spinner('Extraction Keywords from Original Document...'):
+            original_keywords = get_keybert_results_with_vectorizer(article)
+
+        article_sentences = sent_tokenize(article)
+        target_sentences = [sent for sent in article_sentences if any(kw[0] in sent for kw in original_keywords)]
+        st.success('Keyword Extraction from Original Document finished in {}'.format(time.time() - start1))
+
+        start2 = time.time()
+        with st.spinner('Extracting Keywords from Paraphrased Target Sentences...'):
+            t5_paraphrasing_keywords = []
+
+            for sent in target_sentences:
+                ### T5
+                t5_paraphrased = t5_paraphraser(sent)
+                t5_keywords = [get_keybert_results_with_vectorizer(i) for i in t5_paraphrased]
+                t5_keywords = [(word[0], word[1]) for s in t5_keywords for word in s]
+
+                t5_paraphrasing_keywords.extend(t5_keywords)
+        st.success('Keyword Extraction from Paraphrased Target Sentences finished in {}'.format(time.time() - start2))
+
+        original_keywords_df = pd.DataFrame(original_keywords, columns=['Keyword', 'Score'])
+
+        t5_keywords_df = pd.DataFrame(t5_paraphrasing_keywords, columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first').reset_index(drop=True)
+
+        unique_keywords_df = pd.DataFrame([i for i in t5_paraphrasing_keywords if not original_keywords_df['Keyword'].str.contains(i[0]).any()], columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first').reset_index(drop=True)
+
+        total_end = time.time()-start1
+
+        return t5_keywords_df, original_keywords_df, unique_keywords_df, total_end
+    except Exception:
+        st.error('Error running Extraction Pipeline. Please contact admin')
+        logger.error(traceback.format_exc())
 
 doc = st.text_area("Enter a custom document")
 
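The same guard appears four times in the new version: wrap the step in try/except, surface a generic st.error to the user, and send the full traceback to the logger. If a follow-up cleanup is wanted, that pattern could be factored into a decorator. A minimal sketch, assuming only streamlit and the standard library; the name guarded and the wrapped example are illustrative, not part of this commit:

import logging
import traceback
from functools import wraps

import streamlit as st

logger = logging.getLogger(__name__)

def guarded(user_message):
    """Show a generic st.error on failure and log the full traceback."""
    def decorator(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            try:
                return fn(*args, **kwargs)
            except Exception:
                st.error(user_message)                # friendly message in the UI
                logger.error(traceback.format_exc())  # full details in the server log
                return None                           # callers must tolerate None
        return wrapper
    return decorator

# Illustrative usage: each per-function try/except in this commit collapses to one line.
@guarded('Error running Keybert. Please contact admin')
def get_keybert_results_with_vectorizer(text, number_of_results=20):
    ...

Note that, as committed, a swallowed exception makes the failing function return None, so downstream code raises a second exception that is then caught by the outer guard in extract_paraphrased_sentences; checking for None (or re-raising after logging) would make that failure mode explicit.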