|  | import streamlit as st | 
					
						
						|  | import pandas as pd | 
					
						
						|  | import streamlit.components.v1 as stc | 
					
						
						|  | import nltk | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | import nltk | 
					
						
						|  | nltk.download('all') | 
					
						
						|  | from sumy.parsers.plaintext import PlaintextParser | 
					
						
						|  | from nltk.tokenize import word_tokenize | 
					
						
						|  | from nltk.tag import pos_tag | 
					
						
						|  | from nltk.stem import WordNetLemmatizer | 
					
						
						|  | from sumy.summarizers.lex_rank import LexRankSummarizer | 
					
						
						|  | from sumy.summarizers.text_rank import TextRankSummarizer | 
					
						
						|  | from nltk.corpus import stopwords | 
					
						
						|  | from nltk.tokenize import sent_tokenize | 
					
						
						|  | from sumy.nlp.tokenizers import Tokenizer | 
					
						
						|  | from rouge import Rouge | 
					
						
						|  | from transformers import BartForConditionalGeneration, BartTokenizer | 
					
						
						|  | from transformers import T5ForConditionalGeneration, T5Tokenizer | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | from nltk.tag import StanfordNERTagger | 
					
						
						|  |  | 
					
						
						|  | from collections import Counter | 
					
						
						|  |  | 
					
						
						|  | from textblob import TextBlob | 
					
						
						|  | import seaborn as sns | 
					
						
						|  | import matplotlib.pyplot as plt | 
					
						
						|  |  | 
					
						
						|  | from wordcloud import WordCloud | 
					
						
						|  |  | 
					
						
						|  | import base64 | 
					
						
						|  | import time | 
					
						
						|  |  | 
					
						
						|  | stanford_ner_jar_path = 'stanford_model/stanford-ner.jar' | 
					
						
						|  |  | 
					
						
						|  | stanford_ner_model_path ='stanford_model/english.all.3class.distsim.crf.ser.gz' | 
					
						
						|  |  | 
					
						
						|  | timestr = time.strftime("%Y%m%d-%H%M%S") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | import neattext as nt | 
					
						
						|  | import neattext.functions as nfx | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid red; border-radius: 0.25rem; padding: 1rem";>{} | 
					
						
						|  | </div> | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | def evaluate_summary(summary,reference): | 
					
						
						|  | r=Rouge() | 
					
						
						|  | eval_score=r.get_scores(summary,reference) | 
					
						
						|  | eval_score_df=pd.DataFrame(eval_score[0]) | 
					
						
						|  | return eval_score_df | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def bart_summary(docx): | 
					
						
						|  | model=BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn') | 
					
						
						|  | tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn') | 
					
						
						|  | inputs = tokenizer.batch_encode_plus([docx], truncation=True, padding='longest', max_length=1024, return_tensors='pt') | 
					
						
						|  | summary_ids = model.generate(inputs['input_ids'], num_beams=6, max_length=100, early_stopping=True) | 
					
						
						|  | summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True) | 
					
						
						|  | return  summary | 
					
						
						|  |  | 
					
						
						|  | def T5_summary(docx): | 
					
						
						|  | model = T5ForConditionalGeneration.from_pretrained('t5-base') | 
					
						
						|  | tokenizer = T5Tokenizer.from_pretrained('t5-base') | 
					
						
						|  | input_text = "summarize: " + docx | 
					
						
						|  | input_ids = tokenizer.encode(input_text, return_tensors='pt') | 
					
						
						|  | summary_ids = model.generate(input_ids, max_length=100, num_beams=4, early_stopping=True) | 
					
						
						|  | summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True) | 
					
						
						|  | return summary | 
					
						
						|  |  | 
					
						
						|  | def sumy_summarizer(docx,num=5): | 
					
						
						|  | parser=PlaintextParser.from_string(docx,Tokenizer("english")) | 
					
						
						|  | lex_summ=LexRankSummarizer() | 
					
						
						|  | summary=lex_summ(parser.document,sentences_count= num) | 
					
						
						|  | summary_list=[str(sentence) for sentence in summary] | 
					
						
						|  | result=' '.join(summary_list) | 
					
						
						|  | return result | 
					
						
						|  |  | 
					
						
						|  | def sumy_text_summarizer(docx, num=5): | 
					
						
						|  | parser = PlaintextParser.from_string(docx, Tokenizer("english")) | 
					
						
						|  | text_rank_summarizer = TextRankSummarizer() | 
					
						
						|  | summary = text_rank_summarizer(parser.document, sentences_count=num) | 
					
						
						|  | summary_list = [str(sentence) for sentence in summary] | 
					
						
						|  | result = ' '.join(summary_list) | 
					
						
						|  | return result | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def nlp_analysis(text): | 
					
						
						|  | token_data = [] | 
					
						
						|  | tokens=word_tokenize(text) | 
					
						
						|  | tagged_tokens = pos_tag(tokens) | 
					
						
						|  | stop_words = set(stopwords.words('english')) | 
					
						
						|  | lemmatizer = WordNetLemmatizer() | 
					
						
						|  | for token in tagged_tokens: | 
					
						
						|  | token_text=token[0] | 
					
						
						|  | token_shape = None | 
					
						
						|  | token_pos = token[1] | 
					
						
						|  | token_lemma = lemmatizer.lemmatize(token_text) | 
					
						
						|  | token_is_alpha = token_text.isalpha() | 
					
						
						|  | token_is_stop = token_text.lower() in stop_words | 
					
						
						|  | token_data.append([token_text,token_shape,token_pos,token_lemma,token_is_alpha,token_is_stop]) | 
					
						
						|  | df=pd.DataFrame(token_data,columns=['Token','Shape','Position','lemma','Contains_Alphabets','Contains_Stop_words']) | 
					
						
						|  | return df | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def find_entities(text): | 
					
						
						|  | stan = StanfordNERTagger(stanford_ner_model_path, stanford_ner_jar_path) | 
					
						
						|  | text=text.replace("\n\n","\n") | 
					
						
						|  | tokens = nltk.word_tokenize(text) | 
					
						
						|  | tagged_tokens = stan.tag(tokens) | 
					
						
						|  | entities = [(token, tag) for token, tag in tagged_tokens if tag != 'O'] | 
					
						
						|  | entities=HTML_WRAPPER.format(entities) | 
					
						
						|  | return entities | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def file_download(data): | 
					
						
						|  | csv_file= data.to_csv() | 
					
						
						|  | b64=base64.b64encode(csv_file.encode()).decode() | 
					
						
						|  | new_filename="result_{}.csv".format(timestr) | 
					
						
						|  | st.markdown('### 🗃️ Download csv file ') | 
					
						
						|  | href=f'<a href="data:file/csv;base64,{b64}" download="{new_filename}"> Click Here! </a>' | 
					
						
						|  | st.markdown(href, unsafe_allow_html=True) | 
					
						
						|  |  | 
					
						
						|  | def get_most_common_tokens(text): | 
					
						
						|  | word_tokens=Counter(text.split()) | 
					
						
						|  | most_common=dict(word_tokens.most_common(len(text))) | 
					
						
						|  | return most_common | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def get_semantics(text): | 
					
						
						|  | blob=TextBlob(text) | 
					
						
						|  | sentiment=blob.sentiment | 
					
						
						|  | return sentiment | 
					
						
						|  |  | 
					
						
						|  | def plot_wordcloud(text): | 
					
						
						|  | text_workcloud= WordCloud().generate(text) | 
					
						
						|  | fig=plt.figure() | 
					
						
						|  | plt.imshow(text_workcloud,interpolation='bilinear') | 
					
						
						|  | plt.axis('off') | 
					
						
						|  | st.pyplot(fig) | 
					
						
						|  |  | 
					
						
						|  | def pos_tags(text): | 
					
						
						|  | blob=TextBlob(text) | 
					
						
						|  | tagged_text=blob.tags | 
					
						
						|  | tagged_df=pd.DataFrame(tagged_text,columns=['tokens','tags']) | 
					
						
						|  | return tagged_df | 
					
						
						|  |  | 
					
						
						|  | TAGS = { | 
					
						
						|  | 'NN'   : 'green', | 
					
						
						|  | 'NNS'  : 'green', | 
					
						
						|  | 'NNP'  : 'green', | 
					
						
						|  | 'NNPS' : 'green', | 
					
						
						|  | 'VB'   : 'blue', | 
					
						
						|  | 'VBD'  : 'blue', | 
					
						
						|  | 'VBG'  : 'blue', | 
					
						
						|  | 'VBN'  : 'blue', | 
					
						
						|  | 'VBP'  : 'blue', | 
					
						
						|  | 'VBZ'  : 'blue', | 
					
						
						|  | 'JJ'   : 'red', | 
					
						
						|  | 'JJR'  : 'red', | 
					
						
						|  | 'JJS'  : 'red', | 
					
						
						|  | 'RB'   : 'cyan', | 
					
						
						|  | 'RBR'  : 'cyan', | 
					
						
						|  | 'RBS'  : 'cyan', | 
					
						
						|  | 'IN'   : 'darkwhite', | 
					
						
						|  | 'POS'  : 'darkyellow', | 
					
						
						|  | 'PRP$' : 'magenta', | 
					
						
						|  | 'PRP$' : 'magenta', | 
					
						
						|  | 'DET'   : 'black', | 
					
						
						|  | 'CC'   : 'black', | 
					
						
						|  | 'CD'   : 'black', | 
					
						
						|  | 'WDT'  : 'black', | 
					
						
						|  | 'WP'   : 'black', | 
					
						
						|  | 'WP$'  : 'black', | 
					
						
						|  | 'WRB'  : 'black', | 
					
						
						|  | 'EX'   : 'yellow', | 
					
						
						|  | 'FW'   : 'yellow', | 
					
						
						|  | 'LS'   : 'yellow', | 
					
						
						|  | 'MD'   : 'yellow', | 
					
						
						|  | 'PDT'  : 'yellow', | 
					
						
						|  | 'RP'   : 'yellow', | 
					
						
						|  | 'SYM'  : 'yellow', | 
					
						
						|  | 'TO'   : 'yellow', | 
					
						
						|  | 'None' : 'off' | 
					
						
						|  | } | 
					
						
						|  |  | 
					
						
						|  | def tag_visualize(tagged_df): | 
					
						
						|  | colored_text=[] | 
					
						
						|  | for i in tagged_df: | 
					
						
						|  | if i[1] in TAGS.keys(): | 
					
						
						|  | token=i[0] | 
					
						
						|  | color_of_text=TAGS.get(i[1]) | 
					
						
						|  | changed_text='<span style=color:{}>{}</span>'.format(color_of_text,token) | 
					
						
						|  | colored_text.append(changed_text) | 
					
						
						|  | result=''.join(colored_text) | 
					
						
						|  | return result |