import os
import requests
import streamlit as st
import streamlit.components.v1 as components
from streamlit_extras.add_vertical_space import add_vertical_space
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from warnings import filterwarnings
filterwarnings('ignore')
def streamlit_config():
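    """Configure the Streamlit page and render the centered application title."""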
    # page configuration
    st.set_page_config(page_title='Document Classification', layout='centered')

    # page header transparent color
    page_background_color = """
    <style>
    [data-testid="stHeader"]
    {
        background: rgba(0,0,0,0);
    }
    </style>
    """
    st.markdown(page_background_color, unsafe_allow_html=True)

    # title and position
    st.markdown('<h1 style="text-align: center;">Financial Document Classification</h1>',
                unsafe_allow_html=True)
    add_vertical_space(2)

def display_html_document(input_file):
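    """Render the uploaded HTML file inside a fixed-size, scrollable container."""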
    # Read the file content as a UTF-8 string
    html_content = input_file.getvalue().decode("utf-8")

    # Define CSS to control the container size and center content
    styled_html = f"""
    <div style="width: 610px; height: 300px;
                overflow: auto; border: 1px solid #ddd;
                padding: 10px; background-color: white;
                color: black; white-space: normal;
                display: block;">
        {html_content}
    </div>
    """

    # Display the HTML content inside a fixed-size container
    components.html(styled_html, height=320, width=650, scrolling=False)

def text_extract_from_html(html_file):
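    """Extract plain text from the uploaded HTML file and collapse extra whitespace."""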
    # Read the uploaded HTML file
    html_content = html_file.read().decode('utf-8')

    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract the text
    text = soup.get_text()

    # Split the text and remove unwanted whitespace
    result = [i.strip() for i in text.split()]
    result = ' '.join(result)

    return result

def classify_text_with_huggingface_api(extracted_text):
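    """Send the extracted text to the Hugging Face Inference API and return the
    list of label/score predictions, or None if the request fails."""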
    # Load environment variables from the .env file
    load_dotenv()

    # Retrieve the Hugging Face API token from the environment
    hf_token = os.getenv("HUGGINGFACE_TOKEN")

    # Define the Hugging Face Inference API URL for the model
    API_URL = "https://api-inference.huggingface.co/models/gopiashokan/Financial-Document-Classification-using-Deep-Learning"

    # Set the authorization header with the Hugging Face token
    HEADERS = {"Authorization": f"Bearer {hf_token}"}

    # Send a POST request to the Inference API with the extracted text
    response = requests.post(API_URL, headers=HEADERS, json={"inputs": extracted_text})

    # On success, parse the JSON response and return the list of predictions
    if response.status_code == 200:
        result = response.json()
        return result[0]
    else:
        return None

def prediction(input_file):
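    """Extract text from the uploaded HTML file, classify it via the Hugging Face
    API, and display the predicted class with its confidence score."""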
    # Extract text from the uploaded HTML file
    extracted_text = text_extract_from_html(input_file)

    # Truncate to the first 512 characters to stay within the model's input limit
    extracted_text = extracted_text[0:512]

    # Classify the extracted text using the Hugging Face API
    result = classify_text_with_huggingface_api(extracted_text)

    if result is not None:
        # Select the prediction with the highest confidence score
        best_prediction = max(result, key=lambda x: x['score'])

        # Map model labels to human-readable class names
        label_mapping = {'LABEL_0': 'Others', 'LABEL_1': 'Balance Sheets', 'LABEL_2': 'Notes',
                         'LABEL_3': 'Cash Flow', 'LABEL_4': 'Income Statement'}

        # Get the predicted class name based on the model output
        predicted_class = label_mapping[best_prediction['label']]

        # Convert the confidence score to a percentage
        confidence = best_prediction['score'] * 100

        # Display the prediction results
        add_vertical_space(1)
        st.markdown(f"""
        <div style="text-align: center; line-height: 1; padding: 0px;">
            <h4 style="color: orange; margin: 0px; padding: 0px;">{confidence:.2f}% Match Found</h4>
            <h3 style="color: green; margin-top: 10px; padding: 0px;">Predicted Class = {predicted_class}</h3>
        </div>
        """, unsafe_allow_html=True)

    else:
        add_vertical_space(1)
        st.markdown('<h4 style="text-align: center; color: orange; margin-top: 10px;">Refresh the Page and Try Again</h4>',
                    unsafe_allow_html=True)

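
# Run the app locally with: streamlit run app.py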
# Streamlit Configuration Setup
streamlit_config()
try:
    # File uploader to upload the HTML file
    input_file = st.file_uploader('Upload an HTML file', type='html')

    if input_file is not None:

        # Display the HTML document in the user interface
        display_html_document(input_file)

        # Predict the class and confidence score
        with st.spinner('Processing'):
            prediction(input_file)

        add_vertical_space(2)

except Exception as e:
    st.markdown(f'<h3 style="text-align: center;">{e}</h3>', unsafe_allow_html=True)