Paula Leonova committed
Commit · e452a5c
1 Parent(s): 1f1805f

Add keyword extraction model and clean up custom models import reference

Files changed:
- app.py  +9 -8
- models.py  +18 -0
- requirements.txt  +1 -0
app.py
CHANGED

@@ -8,7 +8,8 @@ import streamlit as st
 from sklearn.metrics import classification_report
 
 
-from models import create_nest_sentences, load_summary_model, summarizer_gen, load_model, classifier_zero
+# from models import create_nest_sentences, load_summary_model, summarizer_gen, load_model, classifier_zero
+import models as md
 from utils import plot_result, plot_dual_bar_chart, examples_load, example_long_text_load
 import json
 
@@ -46,12 +47,12 @@ with st.form(key='my_form'):
 
 with st.spinner('Loading pretrained summarizer mnli model...'):
     start = time.time()
-    summarizer = load_summary_model()
+    summarizer = md.load_summary_model()
     st.success(f'Time taken to load summarizer mnli model: {round(time.time() - start,4)} seconds')
 
 with st.spinner('Loading pretrained classifier mnli model...'):
     start = time.time()
-    classifier = load_model()
+    classifier = md.load_model()
     st.success(f'Time taken to load classifier mnli model: {round(time.time() - start,4)} seconds')
 
 
@@ -63,7 +64,7 @@ if submit_button:
     my_expander = st.expander(label='Expand to see summary generation details')
     with my_expander:
         # For each body of text, create text chunks of a certain token size required for the transformer
-        nested_sentences = create_nest_sentences(document = text_input, token_max_length = 1024)
+        nested_sentences = md.create_nest_sentences(document = text_input, token_max_length = 1024)
 
         summary = []
         # st.markdown("### Text Chunk & Summaries")
@@ -77,21 +78,21 @@ if submit_button:
             st.markdown(f"###### Original Text Chunk {n+1}/{len(nested_sentences)}" )
             st.markdown(text_chunk)
 
-            chunk_summary = summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens = 300, minimum_tokens = 20)
+            chunk_summary = md.summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens = 300, minimum_tokens = 20)
             summary.append(chunk_summary)
             st.markdown(f"###### Partial Summary {n+1}/{len(nested_sentences)}")
             st.markdown(chunk_summary)
         # Combine all the summaries into a list and compress into one document, again
         final_summary = " \n\n".join(list(summary))
 
-        # final_summary = summarizer_gen(summarizer, sequence=text_input, maximum_tokens = 30, minimum_tokens = 100)
+        # final_summary = md.summarizer_gen(summarizer, sequence=text_input, maximum_tokens = 30, minimum_tokens = 100)
         st.markdown("### Combined Summary")
         st.markdown(final_summary)
 
 
     st.markdown("### Top Label Predictions on Summary & Full Text")
     with st.spinner('Matching labels...'):
-        topics, scores = classifier_zero(classifier, sequence=final_summary, labels=labels, multi_class=True)
+        topics, scores = md.classifier_zero(classifier, sequence=final_summary, labels=labels, multi_class=True)
         # st.markdown("### Top Label Predictions: Combined Summary")
         # plot_result(topics[::-1][:], scores[::-1][:])
         # st.markdown("### Download Data")
@@ -103,7 +104,7 @@ if submit_button:
         # unsafe_allow_html = True
         # )
 
-        topics_ex_text, scores_ex_text = classifier_zero(classifier, sequence=example_text, labels=labels, multi_class=True)
+        topics_ex_text, scores_ex_text = md.classifier_zero(classifier, sequence=example_text, labels=labels, multi_class=True)
         plot_dual_bar_chart(topics, scores, topics_ex_text, scores_ex_text)
 
         data_ex_text = pd.DataFrame({'label': topics_ex_text, 'scores_from_full_text': scores_ex_text})
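The app now reaches every helper in models.py through the `md` alias instead of importing each function by name. The keyword extractor added below is not yet called from app.py in this commit; the following is only a minimal sketch of how it could be wired in using the same spinner-and-`md.*` pattern. The spinner label, the `kw_model` variable name, and the placeholder input string are illustrative assumptions, not part of the commit.

# Hypothetical wiring for the new keyword extractor (not part of this commit).
import time
import streamlit as st
import models as md

with st.spinner('Loading pretrained keyword extraction model...'):
    start = time.time()
    kw_model = md.load_keyword_model()  # cached KeyBERT instance
    st.success(f'Time taken to load keyword model: {round(time.time() - start,4)} seconds')

# In the submit handler, final_summary would be the combined summary produced above;
# a placeholder string stands in for it here.
final_summary = "Streamlit app that chunks long documents, summarizes each chunk, and labels the result."
keywords = md.keyword_gen(kw_model, final_summary)  # list of (keyword, score) tuples
st.markdown("### Top Keywords on Summary")
st.markdown(", ".join(kw for kw, score in keywords))
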
models.py
CHANGED

@@ -1,6 +1,7 @@
 import torch
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import streamlit as st
+from keybert import KeyBERT
 
 
 import spacy
@@ -29,6 +30,23 @@ def create_nest_sentences(document:str, token_max_length = 1024):
         nested.append(sent)
     return nested
 
+# Reference: https://github.com/MaartenGr/KeyBERT
+@st.cache(allow_output_mutation=True)
+def load_keyword_model():
+    kw_model = KeyBERT()
+    return kw_model
+
+def keyword_gen(kw_model, sequence: str):
+    keywords = kw_model.extract_keywords(sequence,
+                                         keyphrase_ngram_range=(1, 1),
+                                         stop_words='english',
+                                         use_mmr=True,
+                                         diversity=0.5,
+                                         top_n=10)
+    return keywords
+
+
+
 # Reference: https://huggingface.co/facebook/bart-large-mnli
 @st.cache(allow_output_mutation=True)
 def load_summary_model():
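For reference, the KeyBERT call that keyword_gen wraps can be exercised on its own. The document string below is only an example; the extract_keywords arguments mirror those in the diff above (unigrams, English stop-word filtering, MMR re-ranking with diversity 0.5, top 10 results).

from keybert import KeyBERT

kw_model = KeyBERT()  # loads a default sentence-transformers backend on first use
doc = ("Zero-shot classification with BART MNLI lets you score arbitrary labels "
       "against a summary of a long document.")
# Returns a list of (keyword, relevance) tuples.
keywords = kw_model.extract_keywords(doc,
                                     keyphrase_ngram_range=(1, 1),
                                     stop_words='english',
                                     use_mmr=True,
                                     diversity=0.5,
                                     top_n=10)
print(keywords)
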
requirements.txt
CHANGED

@@ -4,5 +4,6 @@ streamlit
 plotly
 torch
 sklearn
+KeyBERT
 spacy>=2.2.0,<3.0.0
 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz#egg=en_core_web_sm