Commit 99b1da3 · 1 Parent(s): 338f4fe

Use tokenizer to split sentences

Changed files:
- Summarizer.py +35 -4
- app.py +2 -1
- requirements.txt +2 -1
- test_summarizer.py +26 -0
Summarizer.py CHANGED

@@ -1,4 +1,4 @@
-
+import string
 
 from sumy.parsers import DocumentParser
 from sumy.parsers.html import HtmlParser
@@ -7,12 +7,14 @@ from sumy.nlp.tokenizers import Tokenizer
 from sumy.nlp.stemmers import Stemmer
 from sumy.summarizers.lsa import LsaSummarizer
 from sumy.utils import get_stop_words
-from transformers import Pipeline
+from transformers import Pipeline, BertTokenizer
 
 
 class Summarizer:
     DEFAULT_LANGUAGE = "english"
     DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
+    TOKENIZER = BertTokenizer.from_pretrained('bert-base-cased')
+    STOP_WORDS = list(get_stop_words(language=DEFAULT_LANGUAGE)) + list(string.punctuation)
 
     def __init__(self, pipeline: Pipeline):
         self.pipeline = pipeline
@@ -27,6 +29,30 @@ class Summarizer:
         summarized_list.append(sentence._text)
         return summarized_list
 
+    @staticmethod
+    def join_sentences(summary_sentences: list) -> str:
+        return " ".join([sentence for sentence in summary_sentences])
+
+    @staticmethod
+    def split_sentences_by_token_length(summary_sentences: list, max_token_length: int) -> list:
+        accumulated_lists = []
+        result_list = []
+        cumulative_token_length = 0
+        for sentence in summary_sentences:
+            result_list.append(sentence)
+            token_list = Summarizer.TOKENIZER.tokenize(sentence)
+            token_words = [token for token in token_list if token.lower() not in Summarizer.STOP_WORDS]
+            token_length = len(token_words)
+            if token_length + cumulative_token_length >= max_token_length:
+                accumulated_lists.append(Summarizer.join_sentences(result_list))
+                result_list = []
+                cumulative_token_length = 0
+            else:
+                cumulative_token_length += token_length
+        if result_list:
+            accumulated_lists.append(Summarizer.join_sentences(result_list))
+        return accumulated_lists
+
     def __extractive_summary(self, parser: DocumentParser, sentences_count) -> list:
         summarized_sentences = self.lsa_summarizer(parser.document, sentences_count)
         summarized_list = Summarizer.sentence_list(summarized_sentences)
@@ -41,8 +67,13 @@ class Summarizer:
         return self.__extractive_summary(parser, sentences_count)
 
     def abstractive_summary(self, extract_summary_sentences: list) -> list:
-
-
+        """
+        :param extract_summary_sentences: Extractive summary of sentences after Latent semantic analysis
+        :return: List of abstractive summary of sentences after calling distilbart-tos-summarizer-tosdr tokenizer
+        """
+        wrapped_sentences = Summarizer.split_sentences_by_token_length(extract_summary_sentences,
+                                                                       max_token_length=1000)
+        # The ml6team/distilbart-tos-summarizer-tosdr tokenizer supports a max of 1024 tokens per input
        abstractive_summary_list = []
        for result in self.pipeline(wrapped_sentences, min_length=5, max_length=512):
            abstractive_summary_list.append(result['summary_text'])
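For context, a minimal sketch of how the new methods fit together end to end, assuming the Space builds its summarization pipeline from the ml6team/distilbart-tos-summarizer-tosdr checkpoint named in the in-code comment (the pipeline construction and the sample sentences below are illustrative, not taken from app.py):

from transformers import pipeline
from Summarizer import Summarizer

# Assumed pipeline setup; how app.py actually constructs its pipeline is not shown in this diff.
summarization_pipeline = pipeline("summarization", model="ml6team/distilbart-tos-summarizer-tosdr")
summarizer = Summarizer(summarization_pipeline)

# These extracted sentences would normally come from the LSA-based extractive step;
# placeholder strings are used here.
extracted = ["First extracted sentence.", "Second extracted sentence."]

# abstractive_summary() first calls split_sentences_by_token_length() to group the
# sentences into chunks of at most ~1000 content tokens, then summarizes each chunk.
abstractive = summarizer.abstractive_summary(extracted)
print(abstractive)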
app.py CHANGED

@@ -87,7 +87,8 @@ def main() -> None:
 
     sentences_length = st.number_input(
         label='Number of sentences to be extracted:',
-        min_value=
+        min_value=5,
+        max_value=15,
         value=st.session_state.sentences_length
     )
     sample_choice = st.selectbox(
requirements.txt CHANGED

@@ -5,4 +5,5 @@ torchvision==0.10.1
 transformers==4.10.3
 sumy==0.9.0
 nltk==3.6.7
-validators==0.18.2
+validators==0.18.2
+pytest==6.2.5
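pytest==6.2.5 is pinned here so the new test module can be run; assuming a standard invocation from the repository root, something like `pytest test_summarizer.py` (or simply `pytest`) would exercise it.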
test_summarizer.py ADDED

@@ -0,0 +1,26 @@
+from Summarizer import Summarizer
+
+
+def test_split_sentences_by_token_length():
+    summary_sentences = [
+        'Python is a programming language.',
+        'Memory allocation.',
+        'Free.'
+    ]
+
+    split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, max_token_length=3)
+    assert split_sentences == [
+        'Python is a programming language.',
+        'Memory allocation. Free.'
+    ]
+
+    split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, max_token_length=5)
+    assert split_sentences == [
+        'Python is a programming language. Memory allocation.',
+        'Free.'
+    ]
+
+    split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, max_token_length=10)
+    assert split_sentences == [
+        'Python is a programming language. Memory allocation. Free.'
+    ]
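The expected groupings rest on how many non-stop-word tokens each sentence contributes under the token budget. A quick way to check that assumption (it presumes bert-base-cased keeps these particular words whole rather than splitting them into sub-word pieces):

from Summarizer import Summarizer

# Count content tokens per sentence the way split_sentences_by_token_length does:
# tokenize with the class-level BERT tokenizer, then drop stop words and punctuation.
for sentence in ['Python is a programming language.', 'Memory allocation.', 'Free.']:
    tokens = Summarizer.TOKENIZER.tokenize(sentence)
    content_tokens = [t for t in tokens if t.lower() not in Summarizer.STOP_WORDS]
    print(sentence, len(content_tokens))  # roughly 3, 2 and 1, which the assertions above imply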