Spaces:

koulsahil
/

Regulatory_Document_Analyzer

Running

App Files Files Community

koulsahil committed on Apr 10

Commit

2f9fd21

verified ·

1 Parent(s): b02a40e

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -77

app.py CHANGED Viewed

@@ -4,22 +4,11 @@ import pandas as pd
 import re
 import spacy
 import torch
-from transformers import pipeline
 import base64
 import io
 from datetime import datetime
 import json
-from pathlib import Path
-import os
-import sys
-# Print Python and package versions for debugging
-st.sidebar.write(f"Python version: {sys.version}")
-st.sidebar.write(f"Transformers version: {__import__('transformers').__version__}")
-# Configuration for Docker deployment
-UPLOAD_FOLDER = os.getenv('UPLOAD_FOLDER', '/tmp/uploads')
-Path(UPLOAD_FOLDER).mkdir(exist_ok=True)  # Ensure directory exists
 # Set page config
 st.set_page_config(
@@ -41,9 +30,11 @@ This application analyzes SEC filings (10-K, 13F, etc.) to extract:
 # Sidebar for model selection and settings
 st.sidebar.header("Analysis Settings")
-# We're standardizing on just one model to avoid issues
-nlp_model = "deepset/deberta-v3-base-squad2"
-st.sidebar.info(f"Using model: {nlp_model}")
 # Entity types to identify
 entity_types = st.sidebar.multiselect(
@@ -180,30 +171,17 @@ def perform_ner(text, entity_types):
     return entities
-# Function to load QA model (using simpler pipeline approach)
 @st.cache_resource
 def load_qa_model(model_name):
-    try:
-        # Use the simpler pipeline API that worked in the Dockerfile
-        qa_pipeline = pipeline("question-answering", model=model_name, cache_dir='/tmp/huggingface')
-        return qa_pipeline
-    except Exception as e:
-        st.error(f"Error loading model: {str(e)}")
-        # If error, try with minimal requirements
-        try:
-            qa_pipeline = pipeline("question-answering", model=model_name)
-            return qa_pipeline
-        except Exception as e2:
-            st.error(f"Failed to load model: {str(e2)}")
-            return None
-# Function to perform Question Answering with better error handling
 def perform_qa(text, questions, qa_pipeline, confidence_threshold):
-    if qa_pipeline is None:
-        return [{"question": q, "answer": "Model loading failed", "confidence": 0, "context": ""} for q in questions]
     # Split text into chunks if it's too long
-    max_length = 384  # Reduced for DeBERTa to avoid tokenization issues
     chunks = []
     # Simple chunking by sentences
@@ -240,7 +218,7 @@ def perform_qa(text, questions, qa_pipeline, confidence_threshold):
                         "context": chunk[max(0, result["start"] - 100):min(len(chunk), result["end"] + 100)]
                     }
             except Exception as e:
-                st.warning(f"Error processing chunk with question '{question}': {str(e)}")
                 continue
         if best_answer["answer"]:
@@ -276,14 +254,9 @@ def get_download_link(data, filename, text):
 uploaded_file = st.file_uploader("Upload SEC Filing (PDF)", type=["pdf"])
 if uploaded_file:
-    # Save the uploaded file to the upload folder for better processing
-    file_path = os.path.join(UPLOAD_FOLDER, uploaded_file.name)
-    with open(file_path, "wb") as f:
-        f.write(uploaded_file.getbuffer())
     with st.spinner("Processing PDF file..."):
         # Extract text from PDF
-        full_text, text_by_page = extract_text_from_pdf(file_path)
         # Show text extraction status
         st.success(f"Successfully extracted text from {len(text_by_page)} pages")
@@ -346,47 +319,41 @@ if uploaded_file:
         # Question Answering
         with qa_tab:
             if qa_mode:
-                with st.spinner("Loading QA model and performing analysis..."):
                     try:
-                        # Load the QA model
                         qa_pipeline = load_qa_model(nlp_model)
-                        if qa_pipeline:
-                            qa_results = perform_qa(full_text, custom_questions, qa_pipeline, confidence_threshold)
-                            # Display QA results
-                            for result in qa_results:
-                                st.subheader(result["question"])
-                                if result["confidence"] > 0:
-                                    st.markdown(f"**Answer:** {result['answer']}")
-                                    st.markdown(f"**Confidence:** {result['confidence']:.2f}")
-                                    with st.expander("Show Context"):
-                                        # Highlight the answer in the context
-                                        highlighted_context = result["context"].replace(
-                                            result["answer"],
-                                            f"**:blue[{result['answer']}]**"
-                                        )
-                                        st.markdown(highlighted_context)
-                                else:
-                                    st.info("No answer found with sufficient confidence.")
-                            # Provide download link for QA results
-                            qa_df = pd.DataFrame(qa_results)
-                            st.markdown(
-                                get_download_link(
-                                    qa_df,
-                                    "qa_results.csv",
-                                    "Download QA Results as CSV"
-                                ),
-                                unsafe_allow_html=True
-                            )
-                        else:
-                            st.error("Failed to load QA model. Check logs for details.")
                     except Exception as e:
                         st.error(f"Error performing question answering: {str(e)}")
-                        st.info("If you're seeing model loading errors, ensure the Docker container has adequate memory and the model is properly downloaded.")
             else:
                 st.info("Question Answering is disabled. Enable it from the sidebar.")
@@ -533,5 +500,4 @@ else:
         st.markdown("Download structured analysis results for review by your legal and compliance teams.")
 # Add footer with information
-st.markdown("---")
-st.markdown("Regulatory Report Checker - NLP-powered document analysis for compliance teams")

 import re
 import spacy
 import torch
+from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForTokenClassification, pipeline
 import base64
 import io
 from datetime import datetime
 import json
 # Set page config
 st.set_page_config(
 # Sidebar for model selection and settings
 st.sidebar.header("Analysis Settings")
+# Model selection
+nlp_model = st.sidebar.selectbox(
+    "Select NLP Model",
+    ["distilbert-base-uncased", "deepset/deberta-v3-base-squad2", "distilbert-base-cased-distilled-squad"]
+)
 # Entity types to identify
 entity_types = st.sidebar.multiselect(
     return entities
+# Function to load the Question Answering pipeline (cached via st.cache_resource)
 @st.cache_resource
 def load_qa_model(model_name):
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
+    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
+    return qa_pipeline
 def perform_qa(text, questions, qa_pipeline, confidence_threshold):
     # Split text into chunks if it's too long
+    max_length = 512  # Typical max length for transformer models
     chunks = []
     # Simple chunking by sentences
                         "context": chunk[max(0, result["start"] - 100):min(len(chunk), result["end"] + 100)]
                     }
             except Exception as e:
+                st.error(f"Error processing chunk with question '{question}': {str(e)}")
                 continue
         if best_answer["answer"]:
 uploaded_file = st.file_uploader("Upload SEC Filing (PDF)", type=["pdf"])
 if uploaded_file:
     with st.spinner("Processing PDF file..."):
         # Extract text from PDF
+        full_text, text_by_page = extract_text_from_pdf(uploaded_file)
         # Show text extraction status
         st.success(f"Successfully extracted text from {len(text_by_page)} pages")
         # Question Answering
         with qa_tab:
             if qa_mode:
+                with st.spinner("Performing Question Answering..."):
                     try:
                         qa_pipeline = load_qa_model(nlp_model)
+                        qa_results = perform_qa(full_text, custom_questions, qa_pipeline, confidence_threshold)
+                        # Display QA results
+                        for result in qa_results:
+                            st.subheader(result["question"])
+                            if result["confidence"] > 0:
+                                st.markdown(f"**Answer:** {result['answer']}")
+                                st.markdown(f"**Confidence:** {result['confidence']:.2f}")
+                                with st.expander("Show Context"):
+                                    # Highlight the answer in the context
+                                    highlighted_context = result["context"].replace(
+                                        result["answer"],
+                                        f"**:blue[{result['answer']}]**"
+                                    )
+                                    st.markdown(highlighted_context)
+                            else:
+                                st.info("No answer found with sufficient confidence.")
+                        # Provide download link for QA results
+                        qa_df = pd.DataFrame(qa_results)
+                        st.markdown(
+                            get_download_link(
+                                qa_df,
+                                "qa_results.csv",
+                                "Download QA Results as CSV"
+                            ),
+                            unsafe_allow_html=True
+                        )
                     except Exception as e:
                         st.error(f"Error performing question answering: {str(e)}")
             else:
                 st.info("Question Answering is disabled. Enable it from the sidebar.")
         st.markdown("Download structured analysis results for review by your legal and compliance teams.")
 # Add footer with information
+st.markdown("---")