Paula Leonova committed
Commit · e452a5c
1 Parent(s): 1f1805f

Add keyword extraction model and clean up custom models import reference

Files changed:
- app.py  +9 -8
- models.py  +18 -0
- requirements.txt  +1 -0
app.py
CHANGED

@@ -8,7 +8,8 @@ import streamlit as st
 from sklearn.metrics import classification_report
 
 
-from models import create_nest_sentences, load_summary_model, summarizer_gen, load_model, classifier_zero
+# from models import create_nest_sentences, load_summary_model, summarizer_gen, load_model, classifier_zero
+import models as md
 from utils import plot_result, plot_dual_bar_chart, examples_load, example_long_text_load
 import json
 
@@ -46,12 +47,12 @@ with st.form(key='my_form'):
 
 with st.spinner('Loading pretrained summarizer mnli model...'):
     start = time.time()
-    summarizer = load_summary_model()
+    summarizer = md.load_summary_model()
     st.success(f'Time taken to load summarizer mnli model: {round(time.time() - start,4)} seconds')
 
 with st.spinner('Loading pretrained classifier mnli model...'):
     start = time.time()
-    classifier = load_model()
+    classifier = md.load_model()
     st.success(f'Time taken to load classifier mnli model: {round(time.time() - start,4)} seconds')
 
 
@@ -63,7 +64,7 @@ if submit_button:
     my_expander = st.expander(label='Expand to see summary generation details')
     with my_expander:
         # For each body of text, create text chunks of a certain token size required for the transformer
-        nested_sentences = create_nest_sentences(document = text_input, token_max_length = 1024)
+        nested_sentences = md.create_nest_sentences(document = text_input, token_max_length = 1024)
 
         summary = []
         # st.markdown("### Text Chunk & Summaries")
@@ -77,21 +78,21 @@ if submit_button:
             st.markdown(f"###### Original Text Chunk {n+1}/{len(nested_sentences)}" )
             st.markdown(text_chunk)
 
-            chunk_summary = summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens = 300, minimum_tokens = 20)
+            chunk_summary = md.summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens = 300, minimum_tokens = 20)
             summary.append(chunk_summary)
             st.markdown(f"###### Partial Summary {n+1}/{len(nested_sentences)}")
             st.markdown(chunk_summary)
         # Combine all the summaries into a list and compress into one document, again
         final_summary = " \n\n".join(list(summary))
 
-        # final_summary = summarizer_gen(summarizer, sequence=text_input, maximum_tokens = 30, minimum_tokens = 100)
+        # final_summary = md.summarizer_gen(summarizer, sequence=text_input, maximum_tokens = 30, minimum_tokens = 100)
         st.markdown("### Combined Summary")
         st.markdown(final_summary)
 
 
     st.markdown("### Top Label Predictions on Summary & Full Text")
     with st.spinner('Matching labels...'):
-        topics, scores = classifier_zero(classifier, sequence=final_summary, labels=labels, multi_class=True)
+        topics, scores = md.classifier_zero(classifier, sequence=final_summary, labels=labels, multi_class=True)
         # st.markdown("### Top Label Predictions: Combined Summary")
         # plot_result(topics[::-1][:], scores[::-1][:])
         # st.markdown("### Download Data")
@@ -103,7 +104,7 @@ if submit_button:
         # unsafe_allow_html = True
         # )
 
-        topics_ex_text, scores_ex_text = classifier_zero(classifier, sequence=example_text, labels=labels, multi_class=True)
+        topics_ex_text, scores_ex_text = md.classifier_zero(classifier, sequence=example_text, labels=labels, multi_class=True)
         plot_dual_bar_chart(topics, scores, topics_ex_text, scores_ex_text)
 
         data_ex_text = pd.DataFrame({'label': topics_ex_text, 'scores_from_full_text': scores_ex_text})
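The app now reaches every helper in models.py through the `md` alias instead of importing each function by name. The keyword extractor added below is not yet called from app.py in this commit; the following is only a minimal sketch of how it could be wired in using the same spinner-and-`md.*` pattern. The spinner label, the `kw_model` variable name, and the placeholder input string are illustrative assumptions, not part of the commit.

# Hypothetical wiring for the new keyword extractor (not part of this commit).
import time
import streamlit as st
import models as md

with st.spinner('Loading pretrained keyword extraction model...'):
    start = time.time()
    kw_model = md.load_keyword_model()  # cached KeyBERT instance
    st.success(f'Time taken to load keyword model: {round(time.time() - start,4)} seconds')

# In the submit handler, final_summary would be the combined summary produced above;
# a placeholder string stands in for it here.
final_summary = "Streamlit app that chunks long documents, summarizes each chunk, and labels the result."
keywords = md.keyword_gen(kw_model, final_summary)  # list of (keyword, score) tuples
st.markdown("### Top Keywords on Summary")
st.markdown(", ".join(kw for kw, score in keywords))
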
models.py
CHANGED

@@ -1,6 +1,7 @@
 import torch
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import streamlit as st
+from keybert import KeyBERT
 
 
 import spacy
@@ -29,6 +30,23 @@ def create_nest_sentences(document:str, token_max_length = 1024):
         nested.append(sent)
     return nested
 
+# Reference: https://github.com/MaartenGr/KeyBERT
+@st.cache(allow_output_mutation=True)
+def load_keyword_model():
+    kw_model = KeyBERT()
+    return kw_model
+
+def keyword_gen(kw_model, sequence: str):
+    keywords = kw_model.extract_keywords(sequence,
+                                         keyphrase_ngram_range=(1, 1),
+                                         stop_words='english',
+                                         use_mmr=True,
+                                         diversity=0.5,
+                                         top_n=10)
+    return keywords
+
+
+
 # Reference: https://huggingface.co/facebook/bart-large-mnli
 @st.cache(allow_output_mutation=True)
 def load_summary_model():
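For reference, the KeyBERT call that keyword_gen wraps can be exercised on its own. The document string below is only an example; the extract_keywords arguments mirror those in the diff above (unigrams, English stop-word filtering, MMR re-ranking with diversity 0.5, top 10 results).

from keybert import KeyBERT

kw_model = KeyBERT()  # loads a default sentence-transformers backend on first use
doc = ("Zero-shot classification with BART MNLI lets you score arbitrary labels "
       "against a summary of a long document.")
# Returns a list of (keyword, relevance) tuples.
keywords = kw_model.extract_keywords(doc,
                                     keyphrase_ngram_range=(1, 1),
                                     stop_words='english',
                                     use_mmr=True,
                                     diversity=0.5,
                                     top_n=10)
print(keywords)
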
requirements.txt
CHANGED

@@ -4,5 +4,6 @@ streamlit
 plotly
 torch
 sklearn
+KeyBERT
 spacy>=2.2.0,<3.0.0
 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz#egg=en_core_web_sm