import os import requests import streamlit as st import streamlit.components.v1 as components from streamlit_extras.add_vertical_space import add_vertical_space from bs4 import BeautifulSoup from dotenv import load_dotenv from warnings import filterwarnings filterwarnings('ignore') def streamlit_config(): # page configuration st.set_page_config(page_title='Document Classification', layout='centered') # page header transparent color page_background_color = """ """ st.markdown(page_background_color, unsafe_allow_html=True) # title and position st.markdown(f'

Financial Document Classification

', unsafe_allow_html=True) add_vertical_space(2) def display_html_document(input_file): # Read the file content html_content = input_file.getvalue().decode("utf-8") # Define CSS to control the container size and center content styled_html = f"""
{html_content}
""" # Display the HTML content inside a fixed-size container components.html(styled_html, height=320, width=650, scrolling=False) def text_extract_from_html(html_file): # Read the uploaded HTML file html_content = html_file.read().decode('utf-8') # Parse the HTML Content soup = BeautifulSoup(html_content, 'html.parser') # Extract the Text text = soup.get_text() # Split the Text and Remove Unwanted Space result = [i.strip() for i in text.split()] result = ' '.join(result) return result def classify_text_with_huggingface_api(extracted_text): # Load environment variables from .env file load_dotenv() # Retrieve the Hugging Face API token from environment variables hf_token = os.getenv("HUGGINGFACE_TOKEN") # Define the Hugging Face API URL for the model API_URL = "https://api-inference.huggingface.co/models/gopiashokan/Financial-Document-Classification-using-Deep-Learning" # Set the authorization headers with the Hugging Face token HEADERS = {"Authorization": f"Bearer {hf_token}"} # Send a POST request to the Hugging Face API with the extracted text response = requests.post(API_URL, headers=HEADERS, json={"inputs": extracted_text}) # Parse and return the JSON response if response.status_code == 200: result = response.json() return result[0] else: return None def prediction(input_file): # Extract text from the uploaded HTML file extracted_text = text_extract_from_html(input_file) # Limit the extracted text to the first 512 characters to avoid API input limits extracted_text = extracted_text[0:512] # Classify the extracted text using the Hugging Face API result = classify_text_with_huggingface_api(extracted_text) if result is not None: # Select the prediction with the highest confidence score prediction = max(result, key=lambda x: x['score']) # Map model labels to human-readable class names label_mapping = {'LABEL_0':'Others', 'LABEL_1':'Balance Sheets', 'LABEL_2':'Notes', 'LABEL_3':'Cash Flow', 'LABEL_4':'Income Statement'} # Get the predicted class name based on the model output predicted_class = label_mapping[prediction['label']] # Convert the confidence score to a percentage confidence = prediction['score'] * 100 # Display the prediction results add_vertical_space(1) st.markdown(f"""

{confidence:.2f}% Match Found

Predicted Class = {predicted_class}

""", unsafe_allow_html=True) else: add_vertical_space(1) st.markdown(f'

Refresh the Page and Try Again

', unsafe_allow_html=True) # Streamlit Configuration Setup streamlit_config() try: # File uploader to upload the HTML file input_file = st.file_uploader('Upload an HTML file', type='html') if input_file is not None: # Display the HTML Document to User Interface display_html_document(input_file) # Predict the Class and Confidence Score with st.spinner('Processing'): prediction(input_file) add_vertical_space(2) except Exception as e: st.markdown(f'

{e}

', unsafe_allow_html=True)