gopiashokan committed
Commit ae0301a · verified · 1 Parent(s): 0713211

Upload 4 files

Files changed (4)
  1. .streamlit/config.toml +7 -0
  2. app.py +167 -0
  3. requirements.txt +11 -0
  4. setup.sh +2 -0
.streamlit/config.toml ADDED
@@ -0,0 +1,7 @@
+ [theme]
+ base="dark"
+ primaryColor="#FF4B4B"
+ backgroundColor="#0E1117"
+ secondaryBackgroundColor="#262730"
+ textColor="#FAFAFA"
+ font="sans serif"
app.py ADDED
@@ -0,0 +1,167 @@
+ import os
+ import numpy as np
+ import spacy
+ import nltk
+ import tensorflow as tf
+ import streamlit as st
+ from streamlit_extras.add_vertical_space import add_vertical_space
+ from bs4 import BeautifulSoup
+ from gensim.models import Word2Vec
+ from nltk.tokenize import word_tokenize
+ from warnings import filterwarnings
+ filterwarnings('ignore')
+
+
+
+ def streamlit_config():
+
+     # page configuration
+     st.set_page_config(page_title='Classification', layout='centered')
+
+     # page header transparent color
+     page_background_color = """
+     <style>
+
+     [data-testid="stHeader"]
+     {
+     background: rgba(0,0,0,0);
+     }
+
+     </style>
+     """
+     st.markdown(page_background_color, unsafe_allow_html=True)
+
+     # title and position
+     st.markdown('<h1 style="text-align: center;">Financial Document Classification</h1>',
+                 unsafe_allow_html=True)
+     add_vertical_space(4)
+
+
+ def text_extract_from_html(html_file):
+
+     # Read the uploaded HTML file
+     html_content = html_file.read().decode('utf-8')
+
+     # Parse the HTML Content
+     soup = BeautifulSoup(html_content, 'html.parser')
+
+     # Extract the Text
+     text = soup.get_text()
+
+     # Split the Text and Remove Unwanted Space
+     result = [i.strip() for i in text.split()]
+
+     return result
+
+
+ def text_processing(text):
+
+     # spaCy Engine
+     nlp = spacy.load('en_core_web_lg')
+
+     # Process the Text with spaCy
+     doc = nlp(' '.join(text))
+
+     # Tokenization and Lemmatization; Remove Stopwords, Punctuation, and Digits
+     token_list = [
+         token.lemma_.lower().strip()
+         for token in doc
+         if token.text.lower() not in nlp.Defaults.stop_words and token.text.isalpha()
+     ]
+
+     if len(token_list) > 0:
+         return ' '.join(token_list)
+     else:
+         return 'empty'
+
+
+ def sentence_embeddings(sentence):
+
+     # split the sentence into separate words
+     words = word_tokenize(sentence)
+
+     # load the trained Word2Vec model
+     model = Word2Vec.load(os.path.join('model', 'word2vec_model.bin'))
+
+     # get the vector of each word
+     vectors = [model.wv[word] for word in words if word in model.wv]
+
+     if vectors:
+         # return the average of the word vectors
+         return np.mean(vectors, axis=0)
+
+     else:
+         # the model was trained with vector_size = 300
+         return np.zeros(model.vector_size)
+
+
+ def prediction(html_file):
+
+     # Extract the Text from the HTML Document
+     extracted_text = text_extract_from_html(html_file)
+
+     # Preprocess the Text
+     preprocessed_text = text_processing(extracted_text)
+
+     # Convert the Text into Embeddings
+     features = sentence_embeddings(preprocessed_text)
+
+     # Reshape the Features to match the expected input shape of the Model
+     features = np.expand_dims(features, axis=0)
+     features = np.expand_dims(features, axis=2)
+
+     # Convert into Tensors
+     features_tensors = tf.convert_to_tensor(features, dtype=tf.float32)
+
+     # Load the Model and make the Prediction
+     model = tf.keras.models.load_model(os.path.join('model', 'model.h5'))
+     prediction = model.predict(features_tensors)
+
+     # Find the Maximum Probability Value
+     target_label = np.argmax(prediction)
+
+     # Find the Target_Label Name
+     target = {0:'Balance Sheets', 1:'Cash Flow', 2:'Income Statement', 3:'Notes', 4:'Others'}
+     predicted_class = target[target_label]
+
+     # Find the Confidence
+     confidence = round(np.max(prediction)*100, 2)
+
+     add_vertical_space(1)
+     st.markdown(f'<h4 style="text-align: center; color: orange;">{confidence}% Match Found</h4>',
+                 unsafe_allow_html=True)
+
+     # Display the HTML content in Streamlit (getvalue() returns the full buffer even after read())
+     st.components.v1.html(html_file.getvalue().decode('utf-8'), height=600, scrolling=True)
+
+     add_vertical_space(1)
+     st.markdown(f'<h3 style="text-align: center; color: green;">{predicted_class}</h3>',
+                 unsafe_allow_html=True)
+
+
+
+ # Streamlit Configuration Setup
+ streamlit_config()
+
+
+ # Check whether 'punkt' is Already Downloaded
+ try:
+     nltk.data.find('tokenizers/punkt')
+ except LookupError:
+     nltk.download('punkt')
+
+
+ # Load spaCy model, downloading it if necessary
+ try:
+     nlp = spacy.load('en_core_web_lg')
+ except OSError:
+     from spacy.cli import download
+     download('en_core_web_lg')
+     nlp = spacy.load('en_core_web_lg')
+
+
+ # File uploader to upload the HTML file
+ input_file = st.file_uploader('Upload an HTML file', type='html')
+
+ if input_file is not None:
+     prediction(input_file)
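
Note: app.py expects a 'model/' directory containing the trained assets 'word2vec_model.bin' and 'model.h5', neither of which is part of this commit. A minimal sketch, assuming those files are present locally, to confirm the assets load and that the Word2Vec vector size lines up with the (1, vector_size, 1) input the classifier receives:

    import os
    import numpy as np
    import tensorflow as tf
    from gensim.models import Word2Vec

    # hypothetical local check; the paths mirror those hard-coded in app.py
    w2v = Word2Vec.load(os.path.join('model', 'word2vec_model.bin'))
    clf = tf.keras.models.load_model(os.path.join('model', 'model.h5'))

    # app.py feeds the classifier one averaged word vector reshaped to (1, vector_size, 1)
    dummy = np.zeros((1, w2v.vector_size, 1), dtype=np.float32)
    print(w2v.vector_size)           # expected 300, per the comment in sentence_embeddings()
    print(clf.predict(dummy).shape)  # expected (1, 5) for the five target classes
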
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ numpy
+ pandas
+ matplotlib
+ spacy
+ nltk
+ tensorflow
+ streamlit
+ streamlit_extras
+ beautifulsoup4
+ gensim
+ imblearn
setup.sh ADDED
@@ -0,0 +1,2 @@
+ python -m spacy download en_core_web_lg
+ python -c "import nltk; nltk.download('punkt')"
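
Usage note: with the packages from requirements.txt installed, setup.sh is presumably run once to fetch the spaCy 'en_core_web_lg' model and the NLTK 'punkt' tokenizer (app.py also downloads both on startup if they are missing), after which the app starts with the standard Streamlit command: streamlit run app.py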