Spaces: gopiashokan (Build error)
Commit: Upload app.py
app.py CHANGED
@@ -1,158 +1,187 @@
(Removed: previous app.py, 158 lines. Only fragments survived extraction: the same six leading imports (os, numpy, spacy, nltk, tensorflow, streamlit), four truncated `from` imports, a `filterwarnings` call, and a bare `return` near old line 95.)
import os
import numpy as np
import spacy
import nltk
import tensorflow as tf
import streamlit as st
import streamlit.components.v1 as components
from streamlit_extras.add_vertical_space import add_vertical_space
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from warnings import filterwarnings
filterwarnings('ignore')

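# The functions below load three local assets from the working directory:
#   word2vec_model.bin - trained gensim Word2Vec embeddings (vector_size=300)
#   model.h5           - trained Keras classification model
# plus the spaCy pipeline 'en_core_web_lg', which must be installed separately.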

def streamlit_config():

    # page configuration
    st.set_page_config(page_title='Document Classification', layout='centered')

    # make the page header background transparent
    page_background_color = """
    <style>

    [data-testid="stHeader"]
    {
    background: rgba(0,0,0,0);
    }

    </style>
    """
    st.markdown(page_background_color, unsafe_allow_html=True)

    # title and position
    st.markdown('<h1 style="text-align: center;">Financial Document Classification</h1>',
                unsafe_allow_html=True)
    add_vertical_space(4)

def text_color_change(input_file):

    # Add style to change the text color to white
    styled_html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <style>
            body {{
                color: white;
            }}
        </style>
    </head>
    <body>
        {input_file}
    </body>
    </html>
    """
    return styled_html


def text_extract_from_html(html_file):

    # Read the uploaded HTML file
    html_content = html_file.read().decode('utf-8')

    # Parse the HTML Content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract the Text
    text = soup.get_text()

    # Split the Text and Remove Unwanted Space
    result = [i.strip() for i in text.split()]

    return result

def text_processing(text):

    # spaCy Engine
    nlp = spacy.load('en_core_web_lg')

    # Process the Text with spaCy
    doc = nlp(' '.join(text))

    # Tokenization and Lemmatization; drop Stopwords, Punctuation, and Digits
    token_list = [
        token.lemma_.lower().strip()
        for token in doc
        if token.text.lower() not in nlp.Defaults.stop_words and token.text.isalpha()
    ]

    if len(token_list) > 0:
        return ' '.join(token_list)
    else:
        return 'empty'

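# Illustrative example (hypothetical input, not from the commit):
#   text_processing(['Total', 'Assets:', '$1,000'])
# drops the non-alphabetic tokens ':' and '$1,000', lemmatizes the rest,
# and returns 'total asset'.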

def sentence_embeddings(sentence):

    # split the sentence into separate words
    words = word_tokenize(sentence)

    # load the trained model
    model = Word2Vec.load('word2vec_model.bin')

    # get the vector of each word
    vectors = [model.wv[word] for word in words if word in model.wv]

    if vectors:
        # return the average of the word vectors
        return np.mean(vectors, axis=0)

    else:
        # we set the model parameter in training ---> vector_size = 300
        return np.zeros(model.vector_size)

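# Shape note (follows from the code above): with vector_size = 300, both
# branches return a NumPy vector of shape (300,), e.g. (sentence illustrative):
#   sentence_embeddings('cash flow from operations').shape  ->  (300,)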

def prediction(input_file):

    # Extract the Text from the HTML Document
    html_content = text_extract_from_html(input_file)

    # Preprocess the Text
    preprocessed_text = text_processing(html_content)

    # Convert the Text into Embeddings
    features = sentence_embeddings(preprocessed_text)

    # Reshape the features to match the expected input shape of the Model
    features = np.expand_dims(features, axis=0)
    features = np.expand_dims(features, axis=2)

    # Convert into Tensors
    features_tensors = tf.convert_to_tensor(features, dtype=tf.float32)

    # Load the Model and Predict
    model = tf.keras.models.load_model('model.h5', custom_objects={'Orthogonal': tf.keras.initializers.Orthogonal})
    pred = model.predict(features_tensors)

    # Find the Maximum Probability Value
    target_label = np.argmax(pred)

    # Find the Target_Label Name
    target = {0: 'Balance Sheets', 1: 'Cash Flow', 2: 'Income Statement', 3: 'Notes', 4: 'Others'}
    predicted_class = target[target_label]

    # Find the Confidence
    confidence = round(np.max(pred) * 100, 2)

    return predicted_class, confidence

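# Shape note: after the two expand_dims calls the model input is (1, 300, 1),
# i.e. a batch of one, 300 timesteps, one feature per step. The Orthogonal
# custom object suggests a recurrent layer inside model.h5, but the exact
# architecture is not part of this commit.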

# Streamlit Configuration Setup
streamlit_config()


# Check whether 'punkt' is already downloaded
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')


# File uploader to upload the HTML file
input_file = st.file_uploader('Upload an HTML file', type='html')

if input_file is not None:

    # Read and Display the HTML content in Streamlit
    add_vertical_space(1)
    html_content = input_file.read().decode('utf-8')
    html_content = text_color_change(html_content)
    components.html(html=html_content, width=700, height=300, scrolling=True)

    # Rewind the uploaded file: the read() above moved the cursor to EOF, so
    # without seek(0) the second read inside prediction() would return ''
    input_file.seek(0)

    predicted_class, confidence = prediction(input_file)

    add_vertical_space(1)
    st.markdown(f'<h4 style="text-align: center; color: orange;">{confidence}% Match Found</h4>',
                unsafe_allow_html=True)

    add_vertical_space(1)
    st.markdown(f'<h3 style="text-align: center; color: green;">{predicted_class}</h3>',
                unsafe_allow_html=True)
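
To try the app locally, a minimal setup sketch: the package list is inferred from the imports above (this commit does not include a requirements.txt), and it assumes word2vec_model.bin and model.h5 sit next to app.py.

    pip install streamlit streamlit-extras beautifulsoup4 gensim nltk spacy tensorflow numpy
    python -m spacy download en_core_web_lg
    streamlit run app.py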