Spaces:

gopiashokan
/

Financial-Document-Classification

Build error

File size: 4,516 Bytes

import numpy as np
import spacy
import nltk
import tensorflow as tf
import streamlit as st
from streamlit_extras.add_vertical_space import add_vertical_space
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from warnings import filterwarnings
filterwarnings('ignore')



def streamlit_config():

    # page configuration
    st.set_page_config(page_title='Document Classification', layout='centered')

    # page header transparent color
    page_background_color = """

    <style>



    [data-testid="stHeader"] 

    {

    background: rgba(0,0,0,0);

    }



    </style>

    """
    st.markdown(page_background_color, unsafe_allow_html=True)

    # title and position
    st.markdown(f'<h1 style="text-align: center;">Financial Document Classification</h1>',
                unsafe_allow_html=True)
    add_vertical_space(4)


def text_extract_from_html(html_file):

    # Read the uploaded HTML file
    html_content = html_file.read().decode('utf-8')

    # Parse the HTML Content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract the Text
    text = soup.get_text()

    # Split the Text and Remove Unwanted Space
    result = [i.strip() for i in text.split()]

    return result


def text_processing(text):

    # spaCy Engine
    nlp = spacy.load('en_core_web_lg')

    # Process the Text with spaCy
    doc = nlp(' '.join(text))

    # Tokenization, Lemmatization, and Remove Stopwords, punctuation, digits
    token_list = [
                  token.lemma_.lower().strip()
                  for token in doc
                  if token.text.lower() not in nlp.Defaults.stop_words and token.text.isalpha()
                 ]

    if len(token_list) > 0:
        return ' '.join(token_list)
    else:
        return 'empty'
    

def sentence_embeddings(sentence):

    # split the sentence into separate words
    words = word_tokenize(sentence)                         

    # load the trained model
    model = Word2Vec.load('word2vec_model.bin')

    # get the vectors of each words
    vectors = [model.wv[word] for word in words if word in model.wv]
    
    if vectors:
        # return the average of vectors
        return np.mean(vectors, axis=0)

    else:
        # we set the model parameter in training ---> vector_size = 300
        return np.zeros(model.vector_size)


def prediction(input_file):

    # Extract the Text from HTML Document
    html_content = text_extract_from_html(input_file)

    # Preprocess the Text
    preprocessed_text = text_processing(html_content)

    # Text Convert into Embeddings
    features = sentence_embeddings(preprocessed_text)

    # Reshape the features into match the expected input shape of Model
    features = np.expand_dims(features, axis=0)
    features = np.expand_dims(features, axis=2)

    # Convert into Tensors
    features_tensors = tf.convert_to_tensor(features, dtype=tf.float32)

    # Load the Model and Prediction
    model = tf.keras.models.load_model('model.h5', custom_objects = {'Orthogonal': tf.keras.initializers.Orthogonal})
    prediction = model.predict(features_tensors)

    # Find the Maximum Probability Value
    target_label = np.argmax(prediction)

    # Find the Target_Label Name
    target = {0:'Balance Sheets', 1:'Cash Flow', 2:'Income Statement', 3:'Notes', 4:'Others'}
    predicted_class = target[target_label]

    # Find the Confidence
    confidence = round(np.max(prediction)*100, 2)

    add_vertical_space(2)
    st.markdown(f'<h4 style="text-align: center; color: orange;">{confidence}% Match Found</h4>', 
                    unsafe_allow_html=True)
    
    add_vertical_space(1)
    st.markdown(f'<h3 style="text-align: center; color: green;">Prdicted Class = {predicted_class}</h3>', 
                    unsafe_allow_html=True)



# Streamlit Configuration Setup
streamlit_config()
    

# File uploader to upload the HTML file
input_file = st.file_uploader('Upload an HTML file', type='html')

if input_file is not None:

    try:
        # Predict the Input_HTML_File_Class
        prediction(input_file)

    except:
        # Check 'punkt' Already Downloaded or Not
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

        # Predict the Input_HTML_File_Class
        prediction(input_file)