gopiashokan committed (verified)
Commit: fbb1cfd
1 Parent(s): e61fb48

Upload app.py

Files changed (1):
  1. app.py  +187 -158
app.py CHANGED
@@ -1,158 +1,187 @@
- import os
- import numpy as np
- import spacy
- import nltk
- import tensorflow as tf
- import streamlit as st
- from streamlit_extras.add_vertical_space import add_vertical_space
- from bs4 import BeautifulSoup
- from gensim.models import Word2Vec
- from nltk.tokenize import word_tokenize
- from warnings import filterwarnings
- filterwarnings('ignore')
-
-
-
- def streamlit_config():
-
-     # page configuration
-     st.set_page_config(page_title='Classification', layout='centered')
-
-     # page header transparent color
-     page_background_color = """
-     <style>
-
-     [data-testid="stHeader"]
-     {
-     background: rgba(0,0,0,0);
-     }
-
-     </style>
-     """
-     st.markdown(page_background_color, unsafe_allow_html=True)
-
-     # title and position
-     st.markdown(f'<h1 style="text-align: center;">Financial Document Classification</h1>',
-                 unsafe_allow_html=True)
-     add_vertical_space(4)
-
-
- def text_extract_from_html(html_file):
-
-     # Read the uploaded HTML file
-     html_content = html_file.read().decode('utf-8')
-
-     # Parse the HTML Content
-     soup = BeautifulSoup(html_content, 'html.parser')
-
-     # Extract the Text
-     text = soup.get_text()
-
-     # Split the Text and Remove Unwanted Space
-     result = [i.strip() for i in text.split()]
-
-     return result
-
-
- def text_processing(text):
-
-     # spaCy Engine
-     nlp = spacy.load('en_core_web_lg')
-
-     # Process the Text with spaCy
-     doc = nlp(' '.join(text))
-
-     # Tokenization, Lemmatization, and Remove Stopwords, punctuation, digits
-     token_list = [
-         token.lemma_.lower().strip()
-         for token in doc
-         if token.text.lower() not in nlp.Defaults.stop_words and token.text.isalpha()
-     ]
-
-     if len(token_list) > 0:
-         return ' '.join(token_list)
-     else:
-         return 'empty'
-
-
- def sentence_embeddings(sentence):
-
-     # split the sentence into separate words
-     words = word_tokenize(sentence)
-
-     # load the trained model
-     model = Word2Vec.load('word2vec_model.bin')
-
-     # get the vectors of each words
-     vectors = [model.wv[word] for word in words if word in model.wv]
-
-     if vectors:
-         # return the average of vectors
-         return np.mean(vectors, axis=0)
-
-     else:
-         # we set the model parameter in training ---> vector_size = 300
-         return np.zeros(model.vector_size)
-
-
- def prediction(html_file):
-
-     # Extract the Text from HTML Document
-     extracted_text = text_extract_from_html(html_file)
-
-     # Preprocess the Text
-     preprocessed_text = text_processing(extracted_text)
-
-     # Text Convert into Embeddings
-     features = sentence_embeddings(preprocessed_text)
-
-     # Reshape the features into match the expected input shape of Model
-     features = np.expand_dims(features, axis=0)
-     features = np.expand_dims(features, axis=2)
-
-     # Convert into Tensors
-     features_tensors = tf.convert_to_tensor(features, dtype=tf.float32)
-
-     # Load the Model and Prediction
-     model = tf.keras.models.load_model('model.h5', custom_objects = {'Orthogonal': tf.keras.initializers.Orthogonal})
-     prediction = model.predict(features_tensors)
-
-     # Find the Maximum Probability Value
-     target_label = np.argmax(prediction)
-
-     # Find the Target_Label Name
-     target = {0:'Balance Sheets', 1:'Cash Flow', 2:'Income Statement', 3:'Notes', 4:'Others'}
-     predicted_class = target[target_label]
-
-     # Find the Confidence
-     confidence = round(np.max(prediction)*100, 2)
-
-     add_vertical_space(1)
-     st.markdown(f'<h4 style="text-align: center; color: orange;">{confidence}% Match Found</h4>',
-                 unsafe_allow_html=True)
-
-     # Display the HTML content in Streamlit
-     st.html(html_file, height=600, scrolling=True)
-
-     add_vertical_space(1)
-     st.markdown(f'<h3 style="text-align: center; color: green;">{predicted_class}</h3>',
-                 unsafe_allow_html=True)
-
-
-
- # Streamlit Configuration Setup
- streamlit_config()
-
-
- # Check 'punkt' Already Downloaded or Not
- try:
-     nltk.data.find('tokenizers/punkt')
- except LookupError:
-     nltk.download('punkt')
-
-
- # File uploader to upload the HTML file
- input_file = st.file_uploader('Upload an HTML file', type='html')
-
- if input_file is not None:
-     prediction(input_file)
+ import os
+ import numpy as np
+ import spacy
+ import nltk
+ import tensorflow as tf
+ import streamlit as st
+ import streamlit.components.v1 as components
+ from streamlit_extras.add_vertical_space import add_vertical_space
+ from bs4 import BeautifulSoup
+ from gensim.models import Word2Vec
+ from nltk.tokenize import word_tokenize
+ from warnings import filterwarnings
+ filterwarnings('ignore')
+
+
+
+ def streamlit_config():
+
+     # page configuration
+     st.set_page_config(page_title='Document Classification', layout='centered')
+
+     # page header transparent color
+     page_background_color = """
+     <style>
+
+     [data-testid="stHeader"]
+     {
+     background: rgba(0,0,0,0);
+     }
+
+     </style>
+     """
+     st.markdown(page_background_color, unsafe_allow_html=True)
+
+     # title and position
+     st.markdown(f'<h1 style="text-align: center;">Financial Document Classification</h1>',
+                 unsafe_allow_html=True)
+     add_vertical_space(4)
+
+
+ def text_color_change(input_file):
+
+     # Add style to change the text color to white
+     styled_html = f"""
+     <!DOCTYPE html>
+     <html>
+     <head>
+     <style>
+     body {{
+         color: white;
+     }}
+     </style>
+     </head>
+     <body>
+     {input_file}
+     </body>
+     </html>
+     """
+     return styled_html
+
+
+ def text_extract_from_html(html_file):
+
+     # Read the uploaded HTML file
+     html_content = html_file.read().decode('utf-8')
+
+     # Parse the HTML Content
+     soup = BeautifulSoup(html_content, 'html.parser')
+
+     # Extract the Text
+     text = soup.get_text()
+
+     # Split the Text and Remove Unwanted Space
+     result = [i.strip() for i in text.split()]
+
+     return result
+
+
+ def text_processing(text):
+
+     # spaCy Engine
+     nlp = spacy.load('en_core_web_lg')
+
+     # Process the Text with spaCy
+     doc = nlp(' '.join(text))
+
+     # Tokenization, Lemmatization, and Remove Stopwords, punctuation, digits
+     token_list = [
+         token.lemma_.lower().strip()
+         for token in doc
+         if token.text.lower() not in nlp.Defaults.stop_words and token.text.isalpha()
+     ]
+
+     if len(token_list) > 0:
+         return ' '.join(token_list)
+     else:
+         return 'empty'
+
+
+ def sentence_embeddings(sentence):
+
+     # split the sentence into separate words
+     words = word_tokenize(sentence)
+
+     # load the trained model
+     model = Word2Vec.load('word2vec_model.bin')
+
+     # get the vectors of each words
+     vectors = [model.wv[word] for word in words if word in model.wv]
+
+     if vectors:
+         # return the average of vectors
+         return np.mean(vectors, axis=0)
+
+     else:
+         # we set the model parameter in training ---> vector_size = 300
+         return np.zeros(model.vector_size)
+
+
+ def prediction(input_file):
+
+     # Extract the Text from HTML Document
+     html_content = text_extract_from_html(input_file)
+
+     # Preprocess the Text
+     preprocessed_text = text_processing(html_content)
+
+     # Text Convert into Embeddings
+     features = sentence_embeddings(preprocessed_text)
+
+     # Reshape the features into match the expected input shape of Model
+     features = np.expand_dims(features, axis=0)
+     features = np.expand_dims(features, axis=2)
+
+     # Convert into Tensors
+     features_tensors = tf.convert_to_tensor(features, dtype=tf.float32)
+
+     # Load the Model and Prediction
+     model = tf.keras.models.load_model('model.h5', custom_objects = {'Orthogonal': tf.keras.initializers.Orthogonal})
+     prediction = model.predict(features_tensors)
+
+     # Find the Maximum Probability Value
+     target_label = np.argmax(prediction)
+
+     # Find the Target_Label Name
+     target = {0:'Balance Sheets', 1:'Cash Flow', 2:'Income Statement', 3:'Notes', 4:'Others'}
+     predicted_class = target[target_label]
+
+     # Find the Confidence
+     confidence = round(np.max(prediction)*100, 2)
+
+     return predicted_class, confidence
+
+
+
+ # Streamlit Configuration Setup
+ streamlit_config()
+
+
+ # Check 'punkt' Already Downloaded or Not
+ try:
+     nltk.data.find('tokenizers/punkt')
+ except LookupError:
+     nltk.download('punkt')
+
+
+ # File uploader to upload the HTML file
+ input_file = st.file_uploader('Upload an HTML file', type='html')
+
+ if input_file is not None:
+
+     # Read and Display the HTML content in Streamlit
+     add_vertical_space(1)
+     html_content = input_file.read().decode('utf-8')
+     html_content = text_color_change(html_content)
+     components.html(html=html_content, width=700, height=300, scrolling=True)
+
+     predicted_class, confidence = prediction(input_file)
+
+     add_vertical_space(1)
+     st.markdown(f'<h4 style="text-align: center; color: orange;">{confidence}% Match Found</h4>',
+                 unsafe_allow_html=True)
+
+     add_vertical_space(1)
+     st.markdown(f'<h3 style="text-align: center; color: green;">{predicted_class}</h3>',
+                 unsafe_allow_html=True)
+
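The substantive change in this commit is the rendering path: the old prediction() both classified the document and tried to display it with st.html(...), while the new app.py wraps the raw markup in a white-text style via text_color_change() and embeds it with streamlit.components.v1.html, leaving prediction() to return only the label and confidence. The following is a minimal standalone sketch of that embedding pattern, not the repository's exact code; it assumes a recent Streamlit release where components.html accepts width, height, and scrolling, and the 700x300 viewport simply mirrors the values in the diff.

    # sketch.py -- hypothetical example of the rendering pattern used in the new app.py
    import streamlit as st
    import streamlit.components.v1 as components

    uploaded = st.file_uploader('Upload an HTML file', type='html')

    if uploaded is not None:
        # Read the uploaded file once and decode it to text
        raw_html = uploaded.read().decode('utf-8')

        # Wrap the markup so the body text renders white on a dark theme,
        # mirroring text_color_change() in the diff
        styled = f"""<html><head><style>body {{ color: white; }}</style></head>
        <body>{raw_html}</body></html>"""

        # Embed the document in a fixed-size, scrollable frame
        components.html(styled, width=700, height=300, scrolling=True)

components.html renders the markup inside its own sandboxed iframe, which is why the new code passes an explicit viewport size and scrolling flag rather than relying on the page layout.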