import os
from warnings import filterwarnings

import numpy as np
import spacy
import nltk
import tensorflow as tf
import streamlit as st
from streamlit_extras.add_vertical_space import add_vertical_space
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

filterwarnings('ignore')


def streamlit_config():

    # page configuration
    st.set_page_config(page_title='Classification', layout='centered')

    # page header transparent color
    # (the original CSS was lost; this minimal rule assumes the intent was to
    #  make the default Streamlit header background transparent)
    page_background_color = """
    <style>
    [data-testid="stHeader"] {
        background: rgba(0, 0, 0, 0);
    }
    </style>
    """
    st.markdown(page_background_color, unsafe_allow_html=True)

    # title and position (styling approximated with a plain centered heading)
    st.markdown('<h1 style="text-align: center;">Financial Document Classification</h1>',
                unsafe_allow_html=True)
    add_vertical_space(4)


def text_extract_from_html(html_file):

    # Read the uploaded HTML file
    html_content = html_file.read().decode('utf-8')

    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract the text
    text = soup.get_text()

    # Split the text and remove unwanted whitespace
    result = [i.strip() for i in text.split()]

    return result


def text_processing(text):

    # spaCy engine
    nlp = spacy.load('en_core_web_lg')

    # Process the text with spaCy
    doc = nlp(' '.join(text))

    # Tokenization and lemmatization; remove stopwords, punctuation, and digits
    token_list = [
        token.lemma_.lower().strip()
        for token in doc
        if token.text.lower() not in nlp.Defaults.stop_words and token.text.isalpha()
    ]

    if len(token_list) > 0:
        return ' '.join(token_list)
    else:
        return 'empty'


def sentence_embeddings(sentence):

    # Split the sentence into separate words
    words = word_tokenize(sentence)

    # Load the trained Word2Vec model
    model = Word2Vec.load(os.path.join('model', 'word2vec_model.bin'))

    # Get the vector for each word that is in the vocabulary
    vectors = [model.wv[word] for word in words if word in model.wv]

    if vectors:
        # Return the average of the word vectors
        return np.mean(vectors, axis=0)
    else:
        # The model was trained with vector_size = 300
        return np.zeros(model.vector_size)


def prediction(html_file):

    # Extract the text from the HTML document
    extracted_text = text_extract_from_html(html_file)

    # Preprocess the text
    preprocessed_text = text_processing(extracted_text)

    # Convert the text into embeddings
    features = sentence_embeddings(preprocessed_text)

    # Reshape the features to match the expected input shape of the model: (batch, 300, 1)
    features = np.expand_dims(features, axis=0)
    features = np.expand_dims(features, axis=2)

    # Convert into tensors
    features_tensors = tf.convert_to_tensor(features, dtype=tf.float32)

    # Load the model and predict
    model = tf.keras.models.load_model(os.path.join('model', 'model.h5'))
    probabilities = model.predict(features_tensors)

    # Find the index of the maximum probability value
    target_label = np.argmax(probabilities)

    # Map the index to the target label name
    target = {0: 'Balance Sheets', 1: 'Cash Flow', 2: 'Income Statement', 3: 'Notes', 4: 'Others'}
    predicted_class = target[target_label]

    # Find the confidence
    confidence = round(np.max(probabilities) * 100, 2)

    add_vertical_space(1)
    st.markdown(f'<h4 style="text-align: center;">{confidence}% Match Found</h4>',
                unsafe_allow_html=True)

    # Display the uploaded HTML document in Streamlit
    # (the file stream was consumed above, so rewind and re-read it as a string)
    html_file.seek(0)
    st.components.v1.html(html_file.read().decode('utf-8'), height=600, scrolling=True)

    add_vertical_space(1)
    st.markdown(f'<h3 style="text-align: center;">{predicted_class}</h3>',
                unsafe_allow_html=True)


# Streamlit configuration setup
streamlit_config()


# Check whether 'punkt' is already downloaded; if not, download it
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')


# File uploader to upload the HTML file
input_file = st.file_uploader('Upload an HTML file', type='html')

if input_file is not None:
    prediction(input_file)
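

# --- Usage note (assumed file name and layout; adjust to the actual project) ---
# Expected layout:
#   app.py                      (this script; the name is an assumption)
#   model/word2vec_model.bin    (trained gensim Word2Vec model, vector_size=300)
#   model/model.h5              (trained Keras classifier)
#
# The spaCy model used above must be installed once beforehand:
#   python -m spacy download en_core_web_lg
#
# Run the app from the project root with:
#   streamlit run app.py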