import streamlit as st
import pandas as pd
import streamlit.components.v1 as stc
# NLTK: NLP package used for tokenizing, tagging, and lemmatizing text
import nltk
# Download only the corpora this app actually uses (much lighter than 'all')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag, StanfordNERTagger
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# from nltk import ne_chunk
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from rouge import Rouge
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from collections import Counter
from textblob import TextBlob
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import base64
import time
# StanfordNERTagger expects file *paths* (strings), not open file handles
stanford_ner_jar_path = 'stanford-ner.jar'
# Path to the pre-trained NER model file
stanford_ner_model_path = 'english.all.3class.distsim.crf.ser.gz'
timestr = time.strftime("%Y%m%d-%H%M%S")
# from spacy import displacy
# Text cleaning packages: remove stopwords, special characters, URLs, and
# HTML tags; normalize text; correct common spelling mistakes
import neattext as nt
import neattext.functions as nfx
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid red; border-radius: 0.25rem; padding: 1rem;">{}</div>"""
def evaluate_summary(summary, reference):
    """Score a summary against a reference text with ROUGE."""
    r = Rouge()
    eval_score = r.get_scores(summary, reference)
    eval_score_df = pd.DataFrame(eval_score[0])
    return eval_score_df
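# Usage sketch (hypothetical strings; the DataFrame has rouge-1/rouge-2/rouge-l
# columns with r/p/f rows):
#   scores_df = evaluate_summary("the cat sat", "the cat sat on the mat")
#   st.dataframe(scores_df)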
def bart_summary(docx):
    """Abstractive summarization with a pre-trained BART model."""
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    inputs = tokenizer.batch_encode_plus([docx], truncation=True, padding='longest', max_length=1024, return_tensors='pt')
    summary_ids = model.generate(inputs['input_ids'], num_beams=6, max_length=100, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
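# Note: from_pretrained() reloads the weights from disk on every call, which
# makes Streamlit reruns slow. One option (a sketch, assuming Streamlit >= 1.18
# where st.cache_resource exists) is to load the model once per process:
#   @st.cache_resource
#   def load_bart():
#       return (BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn'),
#               BartTokenizer.from_pretrained('facebook/bart-large-cnn'))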
def T5_summary(docx):
    """Abstractive summarization with a pre-trained T5 model."""
    model = T5ForConditionalGeneration.from_pretrained('t5-base')
    tokenizer = T5Tokenizer.from_pretrained('t5-base')
    input_text = "summarize: " + docx  # T5 expects a task prefix
    # Truncate to T5's 512-token input limit to avoid overlength warnings/errors
    input_ids = tokenizer.encode(input_text, return_tensors='pt', max_length=512, truncation=True)
    summary_ids = model.generate(input_ids, max_length=100, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
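# Usage sketch (`article` is any placeholder document string):
#   st.write(bart_summary(article))  # larger model, usually more fluent output
#   st.write(T5_summary(article))    # t5-base is smaller, so faster on CPU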
def sumy_summarizer(docx, num=5):
    """Extractive summary: LexRank ranks sentences by graph centrality."""
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    lex_summ = LexRankSummarizer()
    summary = lex_summ(parser.document, sentences_count=num)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result
def sumy_text_summarizer(docx, num=5):
    """Extractive summary: TextRank, a PageRank-style sentence ranker."""
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    text_rank_summarizer = TextRankSummarizer()
    summary = text_rank_summarizer(parser.document, sentences_count=num)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result
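# Usage sketch: both extractive summarizers return `num` original sentences
# joined into one string, so unlike BART/T5 they never invent new wording:
#   st.write(sumy_summarizer(article, num=3))       # LexRank
#   st.write(sumy_text_summarizer(article, num=3))  # TextRank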
def nlp_analysis(text):
    """Tokenize text and return one row per token: tag, lemma, and flags."""
    token_data = []
    tokens = word_tokenize(text)
    # Penn Treebank tags, e.g. NN noun, VBD past-tense verb, DT determiner,
    # CC coordinating conjunction, PRP personal pronoun
    tagged_tokens = pos_tag(tokens)
    stop_words = set(stopwords.words('english'))  # function words: "a", "an", "the", "is", "in", ...
    lemmatizer = WordNetLemmatizer()
    for token_text, token_pos in tagged_tokens:
        token_shape = None  # placeholder: orthographic shape is not computed
        token_lemma = lemmatizer.lemmatize(token_text)
        token_is_alpha = token_text.isalpha()
        token_is_stop = token_text.lower() in stop_words
        token_data.append([token_text, token_shape, token_pos, token_lemma, token_is_alpha, token_is_stop])
    df = pd.DataFrame(token_data, columns=['Token', 'Shape', 'PoS_Tag', 'Lemma', 'Is_Alpha', 'Is_Stopword'])
    return df
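# Usage sketch:
#   df = nlp_analysis("The striped bats were hanging")
#   st.dataframe(df)  # e.g. the "bats" row: tag 'NNS', lemma 'bat', Is_Alpha True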
def find_entities(text):
    """Tag PERSON/LOCATION/ORGANIZATION entities with Stanford NER."""
    stan = StanfordNERTagger(stanford_ner_model_path, stanford_ner_jar_path)
    text = text.replace("\n\n", "\n")
    tokens = nltk.word_tokenize(text)
    tagged_tokens = stan.tag(tokens)
    entities = [(token, tag) for token, tag in tagged_tokens if tag != 'O']  # 'O' = outside any entity
    entities = HTML_WRAPPER.format(entities)
    return entities
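# Note: StanfordNERTagger shells out to Java, so the jar and model files must
# exist at the paths above and a Java runtime must be on PATH, or tagging will
# raise at call time. Usage sketch:
#   stc.html(find_entities("Barack Obama visited Paris"))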
def file_download(data):
    """Offer a DataFrame for download as a base64-encoded CSV link."""
    csv_file = data.to_csv()
    b64 = base64.b64encode(csv_file.encode()).decode()
    new_filename = "result_{}.csv".format(timestr)
    st.markdown('### 🗃️ Download csv file')
    href = f'<a href="data:file/csv;base64,{b64}" download="{new_filename}"> Click Here! </a>'
    st.markdown(href, unsafe_allow_html=True)
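# Note: recent Streamlit versions ship a built-in widget that avoids the
# hand-rolled base64 link (a sketch, assuming st.download_button is available):
#   st.download_button("Download CSV", data.to_csv(),
#                      file_name="result_{}.csv".format(timestr), mime="text/csv")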
def get_most_common_tokens(text):
    """Count whitespace-separated tokens, most frequent first."""
    word_tokens = Counter(text.split())
    most_common = dict(word_tokens.most_common())  # no argument returns all tokens, sorted by count
    return most_common

def get_semantics(text):
    """Sentiment via TextBlob: polarity in [-1, 1], subjectivity in [0, 1]."""
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment
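# Usage sketch (TextBlob values are approximate):
#   get_most_common_tokens("to be or not to be")  # {'to': 2, 'be': 2, 'or': 1, 'not': 1}
#   get_semantics("I love this movie")            # Sentiment(polarity~0.5, subjectivity~0.6)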
def plot_wordcloud(text):
    """Render a word cloud; a word's size reflects its frequency."""
    text_wordcloud = WordCloud().generate(text)
    fig = plt.figure()
    plt.imshow(text_wordcloud, interpolation='bilinear')
    plt.axis('off')
    st.pyplot(fig)
def pos_tags(text):
    """Tag tokens with TextBlob and return a token/tag DataFrame."""
    blob = TextBlob(text)
    tagged_text = blob.tags
    tagged_df = pd.DataFrame(tagged_text, columns=['tokens', 'tags'])
    return tagged_df
# Map Penn Treebank tags to display colors, grouped by word class
TAGS = {
    # nouns
    'NN': 'green', 'NNS': 'green', 'NNP': 'green', 'NNPS': 'green',
    # verbs
    'VB': 'blue', 'VBD': 'blue', 'VBG': 'blue', 'VBN': 'blue', 'VBP': 'blue', 'VBZ': 'blue',
    # adjectives
    'JJ': 'red', 'JJR': 'red', 'JJS': 'red',
    # adverbs
    'RB': 'cyan', 'RBR': 'cyan', 'RBS': 'cyan',
    'IN': 'gray',        # was 'darkwhite', which is not a valid CSS color
    'POS': 'goldenrod',  # was 'darkyellow', also not a valid CSS color
    'PRP': 'magenta',    # the original listed 'PRP$' twice; duplicate dict keys are silently collapsed
    'PRP$': 'magenta',
    'DT': 'black',       # Penn Treebank uses 'DT' for determiner, not 'DET'
    'CC': 'black', 'CD': 'black', 'WDT': 'black', 'WP': 'black', 'WP$': 'black', 'WRB': 'black',
    # everything else
    'EX': 'yellow', 'FW': 'yellow', 'LS': 'yellow', 'MD': 'yellow',
    'PDT': 'yellow', 'RP': 'yellow', 'SYM': 'yellow', 'TO': 'yellow',
    'None': 'off',       # fallback from the original; not a CSS color, effectively unused
}
def tag_visualize(tagged_df):
    """Wrap each token in a <span> colored by its POS tag."""
    colored_text = []
    # Iterating a DataFrame directly yields column names, so walk the rows instead
    for token, tag in tagged_df.itertuples(index=False):
        if tag in TAGS:
            color_of_text = TAGS[tag]
            changed_text = '<span style="color:{}">{}</span>'.format(color_of_text, token)
            colored_text.append(changed_text)
    result = ' '.join(colored_text)  # join with spaces so tokens don't run together
    return result
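# Usage sketch: color-code the tags and render them inside the HTML wrapper.
#   tagged_df = pos_tags("The quick brown fox jumps over the lazy dog")
#   stc.html(HTML_WRAPPER.format(tag_visualize(tagged_df)))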