# Source: file published by gopiashokan (revision a145b29, verified)
import os
import requests
import streamlit as st
import streamlit.components.v1 as components
from streamlit_extras.add_vertical_space import add_vertical_space
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from warnings import filterwarnings
def streamlit_config():
    """Configure the Streamlit page and render the centered app title.

    Sets the page title and a centered layout, injects a CSS rule giving the
    page header a fully transparent background, and writes the main heading.
    """
    # page configuration
    st.set_page_config(page_title='Document Classification', layout='centered')

    # page header transparent color
    # NOTE(review): the <style> wrapper and the header selector were missing
    # from this block as received (only the `background` rule survived);
    # they are reconstructed here from the comment above. Confirm the
    # selector against the Streamlit version in use.
    page_background_color = """
    <style>
    [data-testid="stHeader"]
    {
    background: rgba(0,0,0,0);
    }
    </style>
    """
    st.markdown(page_background_color, unsafe_allow_html=True)

    # title and position
    st.markdown('<h1 style="text-align: center;">Financial Document Classification</h1>',
                unsafe_allow_html=True)
def display_html_document(input_file):
    """Render the uploaded HTML document inside a fixed-size, scrollable box.

    Args:
        input_file: Uploaded file object whose ``getvalue()`` returns the
            document as bytes; the bytes are decoded as UTF-8.
    """
    # Read the file content
    html_content = input_file.getvalue().decode("utf-8")

    # Define CSS to control the container size and center content
    styled_html = f"""
    <div style="width: 610px; height: 300px;
                overflow: auto; border: 1px solid #ddd;
                padding: 10px; background-color: white;
                color: black; white-space: normal;
                display: block;">
        {html_content}
    </div>
    """

    # Display the HTML content inside a fixed-size container
    components.html(styled_html, height=320, width=650, scrolling=False)
def text_extract_from_html(html_file):
    """Return the visible text of an uploaded HTML file as one clean line.

    Args:
        html_file: Uploaded file object whose ``getvalue()`` returns the
            document as bytes; the bytes are decoded as UTF-8.

    Returns:
        The document text with every run of whitespace (including newlines)
        collapsed to a single space and no leading/trailing whitespace.
    """
    # Read the uploaded HTML file
    html_content = html_file.getvalue().decode('utf-8')

    # Parse the HTML Content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract the Text
    text = soup.get_text()

    # Split the Text and Remove Unwanted Space.
    # str.split() with no argument already drops all surrounding whitespace,
    # so the tokens need no further stripping before being re-joined.
    return ' '.join(text.split())
def classify_text_with_huggingface_api(extracted_text):
    """Send text to the Hugging Face API and return the first result entry.

    Args:
        extracted_text: Text submitted as the ``inputs`` field of the request.

    Returns:
        The first element of the decoded JSON response when the API answers
        with HTTP 200, otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: If the request cannot be sent
            or does not complete within the timeout.
    """
    # Load environment variables from .env file
    load_dotenv()

    # Retrieve the Hugging Face API token from environment variables
    hf_token = os.getenv("HUGGINGFACE_TOKEN")

    # Define the Hugging Face API URL for the model
    # NOTE(review): the URL is empty in this file as received; the model
    # endpoint must be filled in before this request can succeed.
    API_URL = ""

    # Set the authorization headers with the Hugging Face token
    HEADERS = {"Authorization": f"Bearer {hf_token}"}

    # Send a POST request to the Hugging Face API with the extracted text.
    # A timeout keeps the app from hanging forever on an unresponsive server.
    response = requests.post(
        API_URL,
        headers=HEADERS,
        json={"inputs": extracted_text},
        timeout=30,
    )

    # Parse and return the JSON response
    if response.status_code == 200:
        result = response.json()
        return result[0]

    return None
def prediction(input_file):
    """Classify the uploaded HTML document and show the outcome on the page.

    Extracts the document text, sends its first 512 characters to the
    classifier, and renders either the predicted class with its confidence
    or a retry message when the classifier returned nothing.

    Args:
        input_file: Uploaded HTML file object, passed through to
            ``text_extract_from_html``.
    """
    # Extract text from the uploaded HTML file
    extracted_text = text_extract_from_html(input_file)

    # Limit the extracted text to the first 512 characters to avoid API input limits
    extracted_text = extracted_text[0:512]

    # Classify the extracted text using the Hugging Face API
    result = classify_text_with_huggingface_api(extracted_text)

    if result is not None:
        # Select the prediction with the highest confidence score
        # (named `best` so the local does not shadow this function's name)
        best = max(result, key=lambda x: x['score'])

        # Map model labels to human-readable class names
        label_mapping = {'LABEL_0':'Others', 'LABEL_1':'Balance Sheets', 'LABEL_2':'Notes', 'LABEL_3':'Cash Flow', 'LABEL_4':'Income Statement'}

        # Get the predicted class name based on the model output
        predicted_class = label_mapping[best['label']]

        # Convert the confidence score to a percentage
        confidence = best['score'] * 100

        # Display the prediction results
        st.markdown(f"""
            <div style="text-align: center; line-height: 1; padding: 0px;">
            <h4 style="color: orange; margin: 0px; padding: 0px;">{confidence:.2f}% Match Found</h4>
            <h3 style="color: green; margin-top: 10px; padding: 0px;">Predicted Class = {predicted_class}</h3>
            </div>
            """, unsafe_allow_html=True)

    else:
        # The classifier returned no result (non-200 response)
        st.markdown('<h4 style="text-align: center; color: orange; margin-top: 10px;">Refresh the Page and Try Again</h4>',
                    unsafe_allow_html=True)
# Application entry point: configure the page, accept an HTML upload, show
# it, and classify it. Any error raised along the way is caught here, at the
# top-level boundary, and shown to the user as a centered message.
try:
    # Streamlit Configuration Setup
    streamlit_config()

    # File uploader to upload the HTML file
    input_file = st.file_uploader('Upload an HTML file', type='html')

    if input_file is not None:

        # Display the HTML Document to User Interface
        display_html_document(input_file)

        # Predict the Class and Confidence Score
        with st.spinner('Processing'):
            prediction(input_file)

except Exception as e:
    st.markdown(f'<h3 style="text-align: center;">{e}</h3>', unsafe_allow_html=True)