Spaces:

gopiashokan
/

Financial-Document-Classification

Build error

App Files Files Community

gopiashokan committed on Jun 9, 2024

Commit

ae0301a

verified ·

1 Parent(s): 0713211

Upload 4 files

Browse files

Files changed (4) hide show

.streamlit/config.toml +7 -0
app.py +167 -0
requirements.txt +11 -0
setup.sh +2 -0

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,7 @@

+[theme]
+base="dark"
+primaryColor="#FF4B4B"
+backgroundColor="#0E1117"
+secondaryBackgroundColor="#262730"
+textColor="#FAFAFA"
+font="sans serif"

app.py ADDED Viewed

	@@ -0,0 +1,167 @@

+import os
+import numpy as np
+import spacy
+import nltk
+import tensorflow as tf
+import streamlit as st
+from streamlit_extras.add_vertical_space import add_vertical_space
+from bs4 import BeautifulSoup
+from gensim.models import Word2Vec
+from nltk.tokenize import word_tokenize
+from warnings import filterwarnings
+filterwarnings('ignore')
+def streamlit_config():
+    # page configuration
+    st.set_page_config(page_title='Classification', layout='centered')
+    # page header transparent color
+    page_background_color = """
+    <style>
+    [data-testid="stHeader"]
+    {
+    background: rgba(0,0,0,0);
+    }
+    </style>
+    """
+    st.markdown(page_background_color, unsafe_allow_html=True)
+    # title and position
+    st.markdown(f'<h1 style="text-align: center;">Financial Document Classification</h1>',
+                unsafe_allow_html=True)
+    add_vertical_space(4)
+def text_extract_from_html(html_file):
+    # Read the uploaded HTML file
+    html_content = html_file.read().decode('utf-8')
+    # Parse the HTML Content
+    soup = BeautifulSoup(html_content, 'html.parser')
+    # Extract the Text
+    text = soup.get_text()
+    # Split the Text and Remove Unwanted Space
+    result = [i.strip() for i in text.split()]
+    return result
+def text_processing(text):
+    # spaCy Engine
+    nlp = spacy.load('en_core_web_lg')
+    # Process the Text with spaCy
+    doc = nlp(' '.join(text))
+    # Tokenization, Lemmatization, and Remove Stopwords, punctuation, digits
+    token_list = [
+                  token.lemma_.lower().strip()
+                  for token in doc
+                  if token.text.lower() not in nlp.Defaults.stop_words and token.text.isalpha()
+                 ]
+    if len(token_list) > 0:
+        return ' '.join(token_list)
+    else:
+        return 'empty'
+def sentence_embeddings(sentence):
+    # split the sentence into separate words
+    words = word_tokenize(sentence)
+    # load the trained model
+    model = Word2Vec.load(os.path.join('model', 'word2vec_model.bin'))
+    # get the vectors of each words
+    vectors = [model.wv[word] for word in words if word in model.wv]
+    if vectors:
+        # return the average of vectors
+        return np.mean(vectors, axis=0)
+    else:
+        # we set the model parameter in training ---> vector_size = 300
+        return np.zeros(model.vector_size)
+def prediction(html_file):
+    # Extract the Text from HTML Document
+    extracted_text = text_extract_from_html(html_file)
+    # Preprocess the Text
+    preprocessed_text = text_processing(extracted_text)
+    # Text Convert into Embeddings
+    features = sentence_embeddings(preprocessed_text)
+    # Reshape the features into match the expected input shape of Model
+    features = np.expand_dims(features, axis=0)
+    features = np.expand_dims(features, axis=2)
+    # Convert into Tensors
+    features_tensors = tf.convert_to_tensor(features, dtype=tf.float32)
+    # Load the Model and Prediction
+    model = tf.keras.models.load_model(os.path.join('model', 'model.h5'))
+    prediction = model.predict(features_tensors)
+    # Find the Maximum Probability Value
+    target_label = np.argmax(prediction)
+    # Find the Target_Label Name
+    target = {0:'Balance Sheets', 1:'Cash Flow', 2:'Income Statement', 3:'Notes', 4:'Others'}
+    predicted_class = target[target_label]
+    # Find the Confidence
+    confidence = round(np.max(prediction)*100, 2)
+    add_vertical_space(1)
+    st.markdown(f'<h4 style="text-align: center; color: orange;">{confidence}% Match Found</h4>',
+                    unsafe_allow_html=True)
+    # Display the HTML content in Streamlit
+    st.components.v1.html(html_file, height=600, scrolling=True)
+    add_vertical_space(1)
+    st.markdown(f'<h3 style="text-align: center; color: green;">{predicted_class}</h3>',
+                    unsafe_allow_html=True)
+# Streamlit Configuration Setup
+streamlit_config()
+# Check 'punkt' Already Downloaded or Not
+try:
+    nltk.data.find('tokenizers/punkt')
+except LookupError:
+    nltk.download('punkt')
+# Load spaCy model
+try:
+    nlp = spacy.load('en_core_web_lg')
+except OSError:
+    from spacy.cli import download
+    download('en_core_web_lg')
+    nlp = spacy.load('en_core_web_lg')
+# File uploader to upload the HTML file
+input_file = st.file_uploader('Upload an HTML file', type='html')
+if input_file is not None:
+    prediction(input_file)

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+numpy
+pandas
+matplotlib
+spacy
+nltk
+tensorflow
+streamlit
+streamlit_extras
+beautifulsoup4
+gensim
+imblearn

setup.sh ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ python -m spacy download en_core_web_lg
2	+ python -c "import nltk; nltk.download('punkt')"