YouTubeTextSummarizer / summary.py
Rehman1603's picture
Create summary.py
3a86bc0
raw
history blame
1.89 kB
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import traceback
import sys
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
nltk.download('stopwords')
nltk.download('punkt')
def summary_nlp(text):
stopWords = set(stopwords.words("english"))
words = word_tokenize(text)
freqTable = dict()
for word in words:
word = word.lower()
if word in stopWords:
continue
if word in freqTable:
freqTable[word] += 1
else:
freqTable[word] = 1
sentences = sent_tokenize(text)
sentenceValue = dict()
for sentence in sentences:
for word, freq in freqTable.items():
if word in sentence.lower():
if sentence in sentenceValue:
sentenceValue[sentence] += freq
else:
sentenceValue[sentence] = freq
sumValues = 0
for sentence in sentenceValue:
sumValues += sentenceValue[sentence]
average = int(sumValues / len(sentenceValue))
summary = ''
for sentence in sentences:
if (sentence in sentenceValue) and (sentenceValue[sentence] > (1.2 * average)):
summary += " " + sentence
return summary
def Summary_BART(text):
checkpoint = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
inputs = tokenizer(text,
max_length=1024,
truncation=True,
return_tensors="pt")
summary_ids = model.generate(inputs["input_ids"])
summary = tokenizer.batch_decode(summary_ids,
skip_special_tokens=True,
clean_up_tokenization_spaces=False)
return summary[0]