numBery committed on
Commit
336de0c
1 Parent(s): a52f9ec

Update app.py

Files changed (1)
  1. app.py +88 -65
app.py CHANGED
@@ -10,12 +10,19 @@ from transformers import T5ForConditionalGeneration,T5Tokenizer
 
 import nltk
 from nltk.tokenize import sent_tokenize
-nltk.download('stopwords')
-nltk.download('punkt')
 
 from huggingface_hub import snapshot_download, HfFolder
 import streamlit as st
 
+import traceback
+import logging
+
+
+nltk.download('stopwords')
+nltk.download('punkt')
+
+logger = logging.getLogger(__name__)
+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 HfFolder.save_token(st.secrets["hf-auth-token"])
@@ -23,85 +30,101 @@ HfFolder.save_token(st.secrets["hf-auth-token"])
 
 @st.cache(allow_output_mutation=True)
 def load_model():
-    # Load KeyBert Model
-    tmp_model = SentenceTransformer('valurank/MiniLM-L6-Keyword-Extraction', use_auth_token=True)
-    kw_extractor = KeyBERT(tmp_model)
-
-    # Load T5 for Paraphrasing
-    t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=True)
-    t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
-    t5_model = t5_model.to(device)
-    return kw_extractor, t5_model, t5_tokenizer
+    try:
+        # Load KeyBert Model
+        tmp_model = SentenceTransformer('valurank/MiniLM-L6-Keyword-Extraction', use_auth_token=True)
+        kw_extractor = KeyBERT(tmp_model)
+
+        # Load T5 for Paraphrasing
+        t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=True)
+        t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
+        t5_model = t5_model.to(device)
+        return kw_extractor, t5_model, t5_tokenizer
+    except Exception:
+        st.error('Error Loading Models. Please contact admin')
+        logger.error(traceback.format_exc())
 
 kw_extractor, t5_model, t5_tokenizer = load_model()
 
 
 @st.cache()
 def get_keybert_results_with_vectorizer(text, number_of_results=20):
-    keywords = kw_extractor.extract_keywords(text, vectorizer=KeyphraseCountVectorizer(), stop_words=None, top_n=number_of_results)
-    return keywords
+
+    try:
+        keywords = kw_extractor.extract_keywords(text, vectorizer=KeyphraseCountVectorizer(), stop_words=None, top_n=number_of_results)
+        return keywords
+    except Exception:
+        st.error('Error running Keybert. Please contact admin')
+        logger.error(traceback.format_exc())
+
 
 
 @st.cache()
 def t5_paraphraser(text, number_of_results=5):
-    text = "paraphrase: " + text + " </s>"
-    max_len = 2048
-    encoding = t5_tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt")
-    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
-
-    beam_outputs = t5_model.generate(
-        input_ids=input_ids, attention_mask=attention_masks,
-        do_sample=True,
-        max_length=2048,
-        top_k=50,
-        top_p=0.95,
-        early_stopping=True,
-        num_return_sequences=number_of_results
-    )
-
-    final_outputs =[]
-    for beam_output in beam_outputs:
-        sent = t5_tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
-        final_outputs.append(sent)
-
-    return final_outputs
+    try:
+        text = "paraphrase: " + text + " </s>"
+        max_len = 2048
+        encoding = t5_tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt")
+        input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
+
+        beam_outputs = t5_model.generate(
+            input_ids=input_ids, attention_mask=attention_masks,
+            do_sample=True,
+            max_length=2048,
+            top_k=50,
+            top_p=0.95,
+            early_stopping=True,
+            num_return_sequences=number_of_results
+        )
+
+        final_outputs =[]
+        for beam_output in beam_outputs:
+            sent = t5_tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+            final_outputs.append(sent)
+        return final_outputs
+    except Exception:
+        st.error('Error running T5 Paraphrasing. Please contact admin')
+        logger.error(traceback.format_exc())
 
 
 #### Extract Sentences with Keywords -> Paraphrase multiple versions -> Extract Keywords again
 def extract_paraphrased_sentences(article):
-
-    start1 = time.time()
-    with st.spinner('Extraction Keywords from Original Document...'):
-        original_keywords = [(i[0], i[1]) for i in get_keybert_results_with_vectorizer(article)]
-
-    article_sentences = sent_tokenize(article)
-    target_sentences = [sent for sent in article_sentences if any(kw[0] in sent for kw in original_keywords)]
-    st.success('Keyword Extraction from Original Document finished in {}'.format(time.time() - start1))
-
-
-    start2 = time.time()
-    with st.spinner('Extracting Keywords from Paraphrased Target Sentences...'):
-        t5_paraphrasing_keywords = []
-
-        for sent in target_sentences:
-            ### T5
-            t5_paraphrased = t5_paraphraser(sent)
-            t5_keywords = [get_keybert_results_with_vectorizer(i) for i in t5_paraphrased]
-            t5_keywords = [(word[0], word[1]) for s in t5_keywords for word in s]
-
-            t5_paraphrasing_keywords.extend(t5_keywords)
-    st.success('Keyword Extraction from Paraphrased Target Sentences finished in {}'.format(time.time() - start2))
-
-    original_keywords_df = pd.DataFrame(original_keywords, columns=['Keyword', 'Score'])
-
-    t5_keywords_df = pd.DataFrame(t5_paraphrasing_keywords, columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first')
-    st.dataframe(t5_keywords_df)
-    unique_keywords_df = pd.DataFrame([i for i in t5_paraphrasing_keywords if not original_keywords_df['Keyword'].str.contains(i[0]).any()], columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first')
-
-    total_end = time.time()-start1
-
-    return t5_keywords_df, original_keywords_df, unique_keywords_df, total_end
-
+    try:
+        start1 = time.time()
+        with st.spinner('Extraction Keywords from Original Document...'):
+            original_keywords = get_keybert_results_with_vectorizer(article)
+
+        article_sentences = sent_tokenize(article)
+        target_sentences = [sent for sent in article_sentences if any(kw[0] in sent for kw in original_keywords)]
+        st.success('Keyword Extraction from Original Document finished in {}'.format(time.time() - start1))
+
+        start2 = time.time()
+        with st.spinner('Extracting Keywords from Paraphrased Target Sentences...'):
+            t5_paraphrasing_keywords = []
+
+            for sent in target_sentences:
+                ### T5
+                t5_paraphrased = t5_paraphraser(sent)
+                t5_keywords = [get_keybert_results_with_vectorizer(i) for i in t5_paraphrased]
+                t5_keywords = [(word[0], word[1]) for s in t5_keywords for word in s]
+
+                t5_paraphrasing_keywords.extend(t5_keywords)
+        st.success('Keyword Extraction from Paraphrased Target Sentences finished in {}'.format(time.time() - start2))
+
+        original_keywords_df = pd.DataFrame(original_keywords, columns=['Keyword', 'Score'])
+
+        t5_keywords_df = pd.DataFrame(t5_paraphrasing_keywords, columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first').reset_index(drop=True)
+
+        unique_keywords_df = pd.DataFrame([i for i in t5_paraphrasing_keywords if not original_keywords_df['Keyword'].str.contains(i[0]).any()], columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first').reset_index(drop=True)
+
+        total_end = time.time()-start1
+
+        return t5_keywords_df, original_keywords_df, unique_keywords_df, total_end
+    except Exception:
+        st.error('Error running Extraction Pipeline. Please contact admin')
+        logger.error(traceback.format_exc())
 
 doc = st.text_area("Enter a custom document")
 
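The same guard appears four times in the new version: wrap the step in try/except, surface a generic st.error to the user, and send the full traceback to the logger. If a follow-up cleanup is wanted, that pattern could be factored into a decorator. A minimal sketch, assuming only streamlit and the standard library; the name guarded and the wrapped example are illustrative, not part of this commit:

import logging
import traceback
from functools import wraps

import streamlit as st

logger = logging.getLogger(__name__)

def guarded(user_message):
    """Show a generic st.error on failure and log the full traceback."""
    def decorator(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            try:
                return fn(*args, **kwargs)
            except Exception:
                st.error(user_message)                # friendly message in the UI
                logger.error(traceback.format_exc())  # full details in the server log
                return None                           # callers must tolerate None
        return wrapper
    return decorator

# Illustrative usage: each per-function try/except in this commit collapses to one line.
@guarded('Error running Keybert. Please contact admin')
def get_keybert_results_with_vectorizer(text, number_of_results=20):
    ...

Note that, as committed, a swallowed exception makes the failing function return None, so downstream code raises a second exception that is then caught by the outer guard in extract_paraphrased_sentences; checking for None (or re-raising after logging) would make that failure mode explicit.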