import os
import requests
import streamlit as st
import streamlit.components.v1 as components
from streamlit_extras.add_vertical_space import add_vertical_space
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from warnings import filterwarnings

filterwarnings('ignore')


def streamlit_config():
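    """Configure the Streamlit page and render the centered app title."""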
    st.set_page_config(page_title='Document Classification', layout='centered')

    # Make the default Streamlit header bar transparent
    page_background_color = """
    <style>
    [data-testid="stHeader"] {
        background: rgba(0,0,0,0);
    }
    </style>
    """
    st.markdown(page_background_color, unsafe_allow_html=True)

    st.markdown('<h1 style="text-align: center;">Financial Document Classification</h1>',
                unsafe_allow_html=True)
    add_vertical_space(2)


def display_html_document(input_file):
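    """Render the uploaded HTML file inside a scrollable, fixed-size frame."""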
    # getvalue() returns the full upload as bytes without moving the read cursor,
    # so the file can still be read again later for text extraction
    html_content = input_file.getvalue().decode('utf-8')

    styled_html = f"""
    <div style="width: 610px; height: 300px;
                overflow: auto; border: 1px solid #ddd;
                padding: 10px; background-color: white;
                color: black; white-space: normal;
                display: block;">
        {html_content}
    </div>
    """

    components.html(styled_html, height=320, width=650, scrolling=False)


def text_extract_from_html(html_file):
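    """Parse the HTML file and return its visible text with whitespace collapsed."""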
    html_content = html_file.read().decode('utf-8')

    soup = BeautifulSoup(html_content, 'html.parser')

    # get_text() strips all tags, leaving only the document's visible text
    text = soup.get_text()

    # Collapse runs of whitespace and newlines into single spaces
    result = ' '.join(text.split())

    return result


def classify_text_with_huggingface_api(extracted_text):
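    """Send the extracted text to the Hugging Face Inference API and return
    the label/score predictions, or None if the request fails."""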
    # Read HUGGINGFACE_TOKEN from a local .env file
    load_dotenv()
    hf_token = os.getenv("HUGGINGFACE_TOKEN")

    API_URL = "https://api-inference.huggingface.co/models/gopiashokan/Financial-Document-Classification-using-Deep-Learning"
    HEADERS = {"Authorization": f"Bearer {hf_token}"}

    response = requests.post(API_URL, headers=HEADERS, json={"inputs": extracted_text})

    if response.status_code == 200:
        # The API wraps the predictions in an outer list; unwrap it
        result = response.json()
        return result[0]
    else:
        return None


def prediction(input_file):
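    """Extract text from the uploaded file, classify it, and display the result."""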
    extracted_text = text_extract_from_html(input_file)

    # Rough truncation to the first 512 characters, keeping the request
    # well within the model's input limit
    extracted_text = extracted_text[0:512]

    result = classify_text_with_huggingface_api(extracted_text)

    if result is not None:

        # Pick the label with the highest confidence score
        best = max(result, key=lambda x: x['score'])

        label_mapping = {'LABEL_0': 'Others', 'LABEL_1': 'Balance Sheets', 'LABEL_2': 'Notes',
                         'LABEL_3': 'Cash Flow', 'LABEL_4': 'Income Statement'}

        predicted_class = label_mapping[best['label']]
        confidence = best['score'] * 100

        add_vertical_space(1)
        st.markdown(f"""
            <div style="text-align: center; line-height: 1; padding: 0px;">
                <h4 style="color: orange; margin: 0px; padding: 0px;">{confidence:.2f}% Match Found</h4>
                <h3 style="color: green; margin-top: 10px; padding: 0px;">Predicted Class = {predicted_class}</h3>
            </div>
        """, unsafe_allow_html=True)

    else:
        add_vertical_space(1)
        st.markdown('<h4 style="text-align: center; color: orange; margin-top: 10px;">Refresh the Page and Try Again</h4>',
                    unsafe_allow_html=True)


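# Build the page layout, then handle upload, preview, and classification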
streamlit_config()

try:
    input_file = st.file_uploader('Upload an HTML file', type='html')

    if input_file is not None:

        # Show a scrollable preview of the uploaded document
        display_html_document(input_file)

        # Classify the document while showing a progress spinner
        with st.spinner('Processing'):
            prediction(input_file)

except Exception as e:
    st.markdown(f'<h3 style="text-align: center;">{e}</h3>', unsafe_allow_html=True)