gopiashokan committed
Commit e61fb48 · verified · 1 Parent(s): 6c1481f

Update app.py

Files changed (1):
  1. app.py +158 -158
app.py CHANGED
@@ -1,158 +1,158 @@
- import os
- import numpy as np
- import spacy
- import nltk
- import tensorflow as tf
- import streamlit as st
- from streamlit_extras.add_vertical_space import add_vertical_space
- from bs4 import BeautifulSoup
- from gensim.models import Word2Vec
- from nltk.tokenize import word_tokenize
- from warnings import filterwarnings
- filterwarnings('ignore')
-
-
-
- def streamlit_config():
-
-     # page configuration
-     st.set_page_config(page_title='Classification', layout='centered')
-
-     # page header transparent color
-     page_background_color = """
-     <style>
-
-     [data-testid="stHeader"]
-     {
-     background: rgba(0,0,0,0);
-     }
-
-     </style>
-     """
-     st.markdown(page_background_color, unsafe_allow_html=True)
-
-     # title and position
-     st.markdown(f'<h1 style="text-align: center;">Financial Document Classification</h1>',
-                 unsafe_allow_html=True)
-     add_vertical_space(4)
-
-
- def text_extract_from_html(html_file):
-
-     # Read the uploaded HTML file
-     html_content = html_file.read().decode('utf-8')
-
-     # Parse the HTML Content
-     soup = BeautifulSoup(html_content, 'html.parser')
-
-     # Extract the Text
-     text = soup.get_text()
-
-     # Split the Text and Remove Unwanted Space
-     result = [i.strip() for i in text.split()]
-
-     return result
-
-
- def text_processing(text):
-
-     # spaCy Engine
-     nlp = spacy.load('en_core_web_lg')
-
-     # Process the Text with spaCy
-     doc = nlp(' '.join(text))
-
-     # Tokenization, Lemmatization, and Remove Stopwords, punctuation, digits
-     token_list = [
-         token.lemma_.lower().strip()
-         for token in doc
-         if token.text.lower() not in nlp.Defaults.stop_words and token.text.isalpha()
-     ]
-
-     if len(token_list) > 0:
-         return ' '.join(token_list)
-     else:
-         return 'empty'
-
-
- def sentence_embeddings(sentence):
-
-     # split the sentence into separate words
-     words = word_tokenize(sentence)
-
-     # load the trained model
-     model = Word2Vec.load('word2vec_model.bin')
-
-     # get the vectors of each words
-     vectors = [model.wv[word] for word in words if word in model.wv]
-
-     if vectors:
-         # return the average of vectors
-         return np.mean(vectors, axis=0)
-
-     else:
-         # we set the model parameter in training ---> vector_size = 300
-         return np.zeros(model.vector_size)
-
-
- def prediction(html_file):
-
-     # Extract the Text from HTML Document
-     extracted_text = text_extract_from_html(html_file)
-
-     # Preprocess the Text
-     preprocessed_text = text_processing(extracted_text)
-
-     # Text Convert into Embeddings
-     features = sentence_embeddings(preprocessed_text)
-
-     # Reshape the features into match the expected input shape of Model
-     features = np.expand_dims(features, axis=0)
-     features = np.expand_dims(features, axis=2)
-
-     # Convert into Tensors
-     features_tensors = tf.convert_to_tensor(features, dtype=tf.float32)
-
-     # Load the Model and Prediction
-     model = tf.keras.models.load_model('model.h5', custom_objects = {'Orthogonal': tf.keras.initializers.Orthogonal})
-     prediction = model.predict(features_tensors)
-
-     # Find the Maximum Probability Value
-     target_label = np.argmax(prediction)
-
-     # Find the Target_Label Name
-     target = {0:'Balance Sheets', 1:'Cash Flow', 2:'Income Statement', 3:'Notes', 4:'Others'}
-     predicted_class = target[target_label]
-
-     # Find the Confidence
-     confidence = round(np.max(prediction)*100, 2)
-
-     add_vertical_space(1)
-     st.markdown(f'<h4 style="text-align: center; color: orange;">{confidence}% Match Found</h4>',
-                 unsafe_allow_html=True)
-
-     # Display the HTML content in Streamlit
-     st.components.v1.html(html_file, height=600, scrolling=True)
-
-     add_vertical_space(1)
-     st.markdown(f'<h3 style="text-align: center; color: green;">{predicted_class}</h3>',
-                 unsafe_allow_html=True)
-
-
-
- # Streamlit Configuration Setup
- streamlit_config()
-
-
- # Check 'punkt' Already Downloaded or Not
- try:
-     nltk.data.find('tokenizers/punkt')
- except LookupError:
-     nltk.download('punkt')
-
-
- # File uploader to upload the HTML file
- input_file = st.file_uploader('Upload an HTML file', type='html')
-
- if input_file is not None:
-     prediction(input_file)
 
+ import os
+ import numpy as np
+ import spacy
+ import nltk
+ import tensorflow as tf
+ import streamlit as st
+ from streamlit_extras.add_vertical_space import add_vertical_space
+ from bs4 import BeautifulSoup
+ from gensim.models import Word2Vec
+ from nltk.tokenize import word_tokenize
+ from warnings import filterwarnings
+ filterwarnings('ignore')
+
+
+
+ def streamlit_config():
+
+     # page configuration
+     st.set_page_config(page_title='Classification', layout='centered')
+
+     # page header transparent color
+     page_background_color = """
+     <style>
+
+     [data-testid="stHeader"]
+     {
+     background: rgba(0,0,0,0);
+     }
+
+     </style>
+     """
+     st.markdown(page_background_color, unsafe_allow_html=True)
+
+     # title and position
+     st.markdown(f'<h1 style="text-align: center;">Financial Document Classification</h1>',
+                 unsafe_allow_html=True)
+     add_vertical_space(4)
+
+
+ def text_extract_from_html(html_file):
+
+     # Read the uploaded HTML file
+     html_content = html_file.read().decode('utf-8')
+
+     # Parse the HTML Content
+     soup = BeautifulSoup(html_content, 'html.parser')
+
+     # Extract the Text
+     text = soup.get_text()
+
+     # Split the Text and Remove Unwanted Space
+     result = [i.strip() for i in text.split()]
+
+     return result
+
+
+ def text_processing(text):
+
+     # spaCy Engine
+     nlp = spacy.load('en_core_web_lg')
+
+     # Process the Text with spaCy
+     doc = nlp(' '.join(text))
+
+     # Tokenization, Lemmatization, and Remove Stopwords, punctuation, digits
+     token_list = [
+         token.lemma_.lower().strip()
+         for token in doc
+         if token.text.lower() not in nlp.Defaults.stop_words and token.text.isalpha()
+     ]
+
+     if len(token_list) > 0:
+         return ' '.join(token_list)
+     else:
+         return 'empty'
+
+
+ def sentence_embeddings(sentence):
+
+     # split the sentence into separate words
+     words = word_tokenize(sentence)
+
+     # load the trained model
+     model = Word2Vec.load('word2vec_model.bin')
+
+     # get the vectors of each words
+     vectors = [model.wv[word] for word in words if word in model.wv]
+
+     if vectors:
+         # return the average of vectors
+         return np.mean(vectors, axis=0)
+
+     else:
+         # we set the model parameter in training ---> vector_size = 300
+         return np.zeros(model.vector_size)
+
+
+ def prediction(html_file):
+
+     # Extract the Text from HTML Document
+     extracted_text = text_extract_from_html(html_file)
+
+     # Preprocess the Text
+     preprocessed_text = text_processing(extracted_text)
+
+     # Text Convert into Embeddings
+     features = sentence_embeddings(preprocessed_text)
+
+     # Reshape the features into match the expected input shape of Model
+     features = np.expand_dims(features, axis=0)
+     features = np.expand_dims(features, axis=2)
+
+     # Convert into Tensors
+     features_tensors = tf.convert_to_tensor(features, dtype=tf.float32)
+
+     # Load the Model and Prediction
+     model = tf.keras.models.load_model('model.h5', custom_objects = {'Orthogonal': tf.keras.initializers.Orthogonal})
+     prediction = model.predict(features_tensors)
+
+     # Find the Maximum Probability Value
+     target_label = np.argmax(prediction)
+
+     # Find the Target_Label Name
+     target = {0:'Balance Sheets', 1:'Cash Flow', 2:'Income Statement', 3:'Notes', 4:'Others'}
+     predicted_class = target[target_label]
+
+     # Find the Confidence
+     confidence = round(np.max(prediction)*100, 2)
+
+     add_vertical_space(1)
+     st.markdown(f'<h4 style="text-align: center; color: orange;">{confidence}% Match Found</h4>',
+                 unsafe_allow_html=True)
+
+     # Display the HTML content in Streamlit
+     st.html(html_file, height=600, scrolling=True)
+
+     add_vertical_space(1)
+     st.markdown(f'<h3 style="text-align: center; color: green;">{predicted_class}</h3>',
+                 unsafe_allow_html=True)
+
+
+
+ # Streamlit Configuration Setup
+ streamlit_config()
+
+
+ # Check 'punkt' Already Downloaded or Not
+ try:
+     nltk.data.find('tokenizers/punkt')
+ except LookupError:
+     nltk.download('punkt')
+
+
+ # File uploader to upload the HTML file
+ input_file = st.file_uploader('Upload an HTML file', type='html')
+
+ if input_file is not None:
+     prediction(input_file)
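
The only line whose content changes in this diff is the display call inside prediction(): the old version embeds the uploaded document with st.components.v1.html, while the new version calls st.html. As a minimal, hypothetical sketch (not part of the commit), the components API can render the uploaded file if its bytes are first decoded to a string; the decoding step and the html_string variable below are illustrative assumptions, not code from app.py:

import streamlit as st
import streamlit.components.v1 as components

input_file = st.file_uploader('Upload an HTML file', type='html')
if input_file is not None:
    # decode the uploaded bytes to a string before rendering; getvalue() does not
    # consume the buffer, so any later .read() calls elsewhere still work
    html_string = input_file.getvalue().decode('utf-8')
    components.html(html_string, height=600, scrolling=True)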