Spaces:

koulsahil
/

Regulatory_Document_Analyzer

Running

App Files Files Community

koulsahil committed on Apr 9

Commit

4380ad1

verified ·

1 Parent(s): 59151b3

Upload 4 files

Browse files

Files changed (4) hide show

Dockerfile +30 -0
README.md +22 -12
app.py +504 -0
requirements.txt +7 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,30 @@

+FROM python:3.9-slim
+WORKDIR /app
+# Install system packages (build-essential, curl, software-properties-common, git) and delete the apt lists afterwards
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    software-properties-common \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements.txt on its own first so the pip install layer can be cached independently of the app code
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Download the spaCy model that app.py loads by name (en_core_web_sm)
+RUN python -m spacy download en_core_web_sm
+# Copy the rest of the build context (the app code) into /app
+COPY . .
+# Expose the port Streamlit is started on by the CMD below
+EXPOSE 8501
+# Set environment variables (PYTHONUNBUFFERED=1 turns off Python's output buffering)
+ENV PYTHONUNBUFFERED=1
+# Run the Streamlit application on port 8501, bound to all interfaces
+CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]

README.md CHANGED Viewed

@@ -1,12 +1,22 @@
----
-title: Regulatory Document Analyzer
-emoji: 🐢
-colorFrom: green
-colorTo: red
-sdk: docker
-pinned: false
-license: other
-short_description: This application analyzes SEC filings (10-K, 13F, etc.)
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# Regulatory Report Checker
+This application analyzes SEC filings (10-K, 13F, etc.) to extract:
+- Regulatory obligations
+- Risk statements
+- Regulatory agency references
+- Potential violations
+## Features
+- PDF text extraction
+- Named Entity Recognition for regulatory entities
+- Question Answering for regulatory information
+- Risk analysis with scoring and highlighting
+- Export capabilities (CSV/JSON)
+## How to Use
+1. Upload an SEC filing PDF
+2. Configure analysis settings in the sidebar
+3. Review results across different tabs
+4. Download analysis reports
+Built with Streamlit, Hugging Face Transformers, spaCy, and PDFPlumber.

app.py ADDED Viewed

	@@ -0,0 +1,504 @@

+import streamlit as st
+import pdfplumber
+import pandas as pd
+import re
+import spacy
+import torch
+from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForTokenClassification, pipeline
+import base64
+import io
+from datetime import datetime
+import json
+# Set page config
+st.set_page_config(
+    page_title="Regulatory Report Checker",
+    page_icon="📋",
+    layout="wide"
+)
+# Application title and description
+st.title("Regulatory Report Checker")
+st.markdown("""
+This application analyzes SEC filings (10-K, 13F, etc.) to extract:
+- Regulatory obligations
+- Risk statements
+- Regulatory agency references
+- Potential violations
+""")
+# Sidebar for model selection and settings
+st.sidebar.header("Analysis Settings")
+# Model selection
+nlp_model = st.sidebar.selectbox(
+    "Select NLP Model",
+    ["distilbert-base-uncased", "deepset/deberta-v3-base-squad2", "distilbert-base-cased-distilled-squad"]
+)
+# Entity types to identify
+entity_types = st.sidebar.multiselect(
+    "Entity Types to Extract",
+    ["Obligation", "Regulatory Agency", "Risk", "Deadline", "Penalty", "Amount"],
+    default=["Obligation", "Regulatory Agency", "Risk"]
+)
+# QA mode selection
+qa_mode = st.sidebar.checkbox("Enable Question Answering", value=True)
+# Custom questions for QA
+if qa_mode:
+    default_questions = [
+        "What are the regulatory obligations mentioned?",
+        "Are there any violations or risk statements?",
+        "What regulatory agencies are mentioned?",
+        "What are the compliance deadlines?"
+    ]
+    # Allow users to edit questions or add new ones
+    st.sidebar.subheader("Custom Questions")
+    custom_questions = []
+    # Start with default questions that can be modified
+    for i, default_q in enumerate(default_questions):
+        q = st.sidebar.text_input(f"Question {i+1}", value=default_q)
+        if q:
+            custom_questions.append(q)
+    # Option to add more questions
+    new_q = st.sidebar.text_input("Additional Question")
+    if new_q:
+        custom_questions.append(new_q)
+# Risk keyword settings
+st.sidebar.subheader("Risk Keywords")
+default_risk_keywords = "non-compliance, penalty, violation, risk, fine, investigation, audit, failure, breach, warning"
+risk_keywords = st.sidebar.text_area("Enter risk keywords (comma separated)", value=default_risk_keywords)
+risk_keywords_list = [keyword.strip() for keyword in risk_keywords.split(",")]
+# Add confidence threshold slider
+confidence_threshold = st.sidebar.slider("Confidence Threshold", 0.0, 1.0, 0.5)
+# Function to extract text from PDF
+@st.cache_data
+def extract_text_from_pdf(pdf_file):
+    text_by_page = {}
+    with pdfplumber.open(pdf_file) as pdf:
+        for i, page in enumerate(pdf.pages):
+            text = page.extract_text()
+            if text:
+                text_by_page[i+1] = text
+    # Combine all text
+    full_text = "\n\n".join(text_by_page.values())
+    return full_text, text_by_page
+# Function to highlight risk keywords in text
+def highlight_risk_terms(text, risk_terms):
+    highlighted_text = text
+    for term in risk_terms:
+        pattern = re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)
+        highlighted_text = pattern.sub(f"**:red[{term}]**", highlighted_text)
+    return highlighted_text
+# Function to perform NER using spaCy with custom rules
+def perform_ner(text, entity_types):
+    # Load spaCy model
+    nlp = spacy.load("en_core_web_sm")
+    # Add custom rules for regulatory entities
+    ruler = nlp.add_pipe("entity_ruler")
+    # Define patterns for each entity type
+    patterns = []
+    # Regulatory agency patterns
+    if "Regulatory Agency" in entity_types:
+        agencies = ["SEC", "FINRA", "CFTC", "FDIC", "Federal Reserve", "OCC", "CFPB",
+                   "FTC", "IRS", "DOJ", "EPA", "FDA", "OSHA", "Securities and Exchange Commission"]
+        for agency in agencies:
+            patterns.append({"label": "REGULATORY_AGENCY", "pattern": agency})
+    # Obligation patterns
+    if "Obligation" in entity_types:
+        obligation_triggers = ["must", "required to", "shall", "obligation to", "mandated",
+                              "compliance with", "comply with", "required by", "in accordance with"]
+        for trigger in obligation_triggers:
+            patterns.append({"label": "OBLIGATION", "pattern": [{"LOWER": trigger}]})
+    # Risk patterns
+    if "Risk" in entity_types:
+        risk_triggers = ["risk", "exposure", "vulnerable", "susceptible", "hazard",
+                        "threat", "danger", "liability", "non-compliance", "violation"]
+        for trigger in risk_triggers:
+            patterns.append({"label": "RISK", "pattern": trigger})
+    # Deadline patterns
+    if "Deadline" in entity_types:
+        deadline_triggers = ["by", "due", "deadline", "within", "no later than"]
+        for trigger in deadline_triggers:
+            patterns.append({"label": "DEADLINE", "pattern": [{"LOWER": trigger}, {"ENT_TYPE": "DATE"}]})
+    # Penalty patterns
+    if "Penalty" in entity_types:
+        penalty_triggers = ["fine", "penalty", "sanction", "enforcement", "punitive", "disciplinary"]
+        for trigger in penalty_triggers:
+            patterns.append({"label": "PENALTY", "pattern": trigger})
+    # Add patterns to ruler
+    ruler.add_patterns(patterns)
+    # Process text
+    doc = nlp(text)
+    # Extract entities
+    entities = []
+    for ent in doc.ents:
+        if ent.label_ in ["REGULATORY_AGENCY", "OBLIGATION", "RISK", "DEADLINE", "PENALTY"] or ent.label_ == "MONEY":
+            entity_type = ent.label_
+            if ent.label_ == "MONEY" and "Amount" in entity_types:
+                entity_type = "AMOUNT"
+            entities.append({
+                "text": ent.text,
+                "start": ent.start_char,
+                "end": ent.end_char,
+                "type": entity_type,
+                "context": text[max(0, ent.start_char - 50):min(len(text), ent.end_char + 50)]
+            })
+    return entities
+# Function to perform Question Answering
+@st.cache_resource
+def load_qa_model(model_name):
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
+    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
+    return qa_pipeline
+def perform_qa(text, questions, qa_pipeline, confidence_threshold):
+    # Split text into chunks if it's too long
+    max_length = 512  # Typical max length for transformer models
+    chunks = []
+    # Simple chunking by sentences
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    current_chunk = ""
+    for sentence in sentences:
+        if len(current_chunk) + len(sentence) < max_length:
+            current_chunk += sentence + " "
+        else:
+            chunks.append(current_chunk.strip())
+            current_chunk = sentence + " "
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+    # If text is still short enough, just use it directly
+    if not chunks:
+        chunks = [text]
+    # Process each question across all chunks
+    results = []
+    for question in questions:
+        best_answer = {"answer": "", "score": 0, "context": ""}
+        for chunk in chunks:
+            try:
+                result = qa_pipeline(question=question, context=chunk)
+                if result["score"] > best_answer["score"] and result["score"] >= confidence_threshold:
+                    best_answer = {
+                        "answer": result["answer"],
+                        "score": result["score"],
+                        "context": chunk[max(0, result["start"] - 100):min(len(chunk), result["end"] + 100)]
+                    }
+            except Exception as e:
+                st.error(f"Error processing chunk with question '{question}': {str(e)}")
+                continue
+        if best_answer["answer"]:
+            results.append({
+                "question": question,
+                "answer": best_answer["answer"],
+                "confidence": best_answer["score"],
+                "context": best_answer["context"]
+            })
+        else:
+            results.append({
+                "question": question,
+                "answer": "No answer found with sufficient confidence.",
+                "confidence": 0,
+                "context": ""
+            })
+    return results
+# Function to create downloadable file
+def get_download_link(data, filename, text):
+    """Generate a link to download the given data as a file"""
+    if isinstance(data, pd.DataFrame):
+        csv = data.to_csv(index=False)
+        b64 = base64.b64encode(csv.encode()).decode()
+    else:  # Assume JSON
+        b64 = base64.b64encode(json.dumps(data, indent=4).encode()).decode()
+    href = f'<a href="data:file/txt;base64,{b64}" download="{filename}">{text}</a>'
+    return href
+# File upload: the analysis below runs only once a PDF has been provided
+uploaded_file = st.file_uploader("Upload SEC Filing (PDF)", type=["pdf"])
+if uploaded_file:
+    with st.spinner("Processing PDF file..."):
+        # Extract text from PDF (joined text plus a page-number -> text dict)
+        full_text, text_by_page = extract_text_from_pdf(uploaded_file)
+        # Show text extraction status (counts only pages that yielded text)
+        st.success(f"Successfully extracted text from {len(text_by_page)} pages")
+        # Allow user to view the extracted text, either all pages or a single page
+        with st.expander("View Extracted Text"):
+            page_selection = st.selectbox(
+                "Select page to view",
+                ["All"] + list(text_by_page.keys())
+            )
+            if page_selection == "All":
+                st.text_area("Full Text", full_text, height=300)
+            else:
+                st.text_area(f"Page {page_selection}", text_by_page[page_selection], height=300)
+        # Begin analysis section
+        st.header("Analysis Results")
+        # Create tabs for different analysis methods
+        ner_tab, qa_tab, risk_tab, summary_tab = st.tabs(["Entity Recognition", "Question Answering", "Risk Analysis", "Summary"])
+        # NER Analysis: one table and one CSV download link per entity type found
+        with ner_tab:
+            with st.spinner("Performing Entity Recognition..."):
+                entities = perform_ner(full_text, entity_types)
+                if entities:
+                    # Group entities by type
+                    entities_by_type = {}
+                    for entity in entities:
+                        if entity["type"] not in entities_by_type:
+                            entities_by_type[entity["type"]] = []
+                        entities_by_type[entity["type"]].append(entity)
+                    # Display entities by type
+                    for entity_type, type_entities in entities_by_type.items():
+                        st.subheader(f"{entity_type} Entities")
+                        # Create a dataframe for better display (matched text and its surrounding context)
+                        df = pd.DataFrame([{
+                            "Text": e["text"],
+                            "Context": e["context"]
+                        } for e in type_entities])
+                        st.dataframe(df, use_container_width=True)
+                        # Provide download link for this entity type
+                        st.markdown(
+                            get_download_link(
+                                df,
+                                f"{entity_type.lower()}_entities.csv",
+                                f"Download {entity_type} Entities as CSV"
+                            ),
+                            unsafe_allow_html=True
+                        )
+                else:
+                    st.info("No entities detected. Try adjusting the entity types in the sidebar.")
+        # Question Answering: runs only when enabled in the sidebar; any exception is shown as an error message
+        with qa_tab:
+            if qa_mode:
+                with st.spinner("Performing Question Answering..."):
+                    try:
+                        qa_pipeline = load_qa_model(nlp_model)
+                        qa_results = perform_qa(full_text, custom_questions, qa_pipeline, confidence_threshold)
+                        # Display QA results (a confidence of 0 marks "no answer found")
+                        for result in qa_results:
+                            st.subheader(result["question"])
+                            if result["confidence"] > 0:
+                                st.markdown(f"**Answer:** {result['answer']}")
+                                st.markdown(f"**Confidence:** {result['confidence']:.2f}")
+                                with st.expander("Show Context"):
+                                    # Highlight the answer in the context
+                                    highlighted_context = result["context"].replace(
+                                        result["answer"],
+                                        f"**:blue[{result['answer']}]**"
+                                    )
+                                    st.markdown(highlighted_context)
+                            else:
+                                st.info("No answer found with sufficient confidence.")
+                        # Provide download link for QA results
+                        qa_df = pd.DataFrame(qa_results)
+                        st.markdown(
+                            get_download_link(
+                                qa_df,
+                                "qa_results.csv",
+                                "Download QA Results as CSV"
+                            ),
+                            unsafe_allow_html=True
+                        )
+                    except Exception as e:
+                        st.error(f"Error performing question answering: {str(e)}")
+            else:
+                st.info("Question Answering is disabled. Enable it from the sidebar.")
+        # Risk Analysis
+        with risk_tab:
+            with st.spinner("Analyzing Risk Keywords..."):
+                # Find paragraphs with risk keywords
+                paragraphs = re.split(r'\n\n+', full_text)
+                risk_paragraphs = []
+                for para in paragraphs:
+                    if any(re.search(r'\b' + re.escape(keyword) + r'\b', para, re.IGNORECASE) for keyword in risk_keywords_list):
+                        # Count how many risk keywords are found
+                        keyword_count = sum(1 for keyword in risk_keywords_list if re.search(r'\b' + re.escape(keyword) + r'\b', para, re.IGNORECASE))
+                        # Calculate a simple risk score based on keyword density
+                        risk_score = min(1.0, keyword_count / 10)  # Cap at 1.0
+                        risk_paragraphs.append({
+                            "paragraph": para,
+                            "keyword_count": keyword_count,
+                            "risk_score": risk_score,
+                            "highlighted_text": highlight_risk_terms(para, risk_keywords_list)
+                        })
+                if risk_paragraphs:
+                    # Sort by risk score (highest first)
+                    risk_paragraphs.sort(key=lambda x: x["risk_score"], reverse=True)
+                    # Display risk paragraphs
+                    st.subheader(f"Found {len(risk_paragraphs)} Paragraphs with Risk Keywords")
+                    # Overall document risk score (average of top 5 paragraphs)
+                    top_paragraphs = risk_paragraphs[:min(5, len(risk_paragraphs))]
+                    overall_risk = sum(p["risk_score"] for p in top_paragraphs) / len(top_paragraphs)
+                    # Display risk meter
+                    st.subheader("Document Risk Assessment")
+                    st.progress(overall_risk)
+                    risk_level = "Low" if overall_risk < 0.4 else "Medium" if overall_risk < 0.7 else "High"
+                    st.markdown(f"**Risk Level: :{'green' if risk_level == 'Low' else 'orange' if risk_level == 'Medium' else 'red'}[{risk_level}]** (Score: {overall_risk:.2f})")
+                    # Display individual paragraphs
+                    for i, para in enumerate(risk_paragraphs):
+                        with st.expander(f"Risk Paragraph {i+1} (Score: {para['risk_score']:.2f})"):
+                            st.markdown(para["highlighted_text"])
+                    # Provide download link for risk paragraphs
+                    risk_df = pd.DataFrame([{
+                        "Risk Score": p["risk_score"],
+                        "Keyword Count": p["keyword_count"],
+                        "Paragraph": p["paragraph"]
+                    } for p in risk_paragraphs])
+                    st.markdown(
+                        get_download_link(
+                            risk_df,
+                            "risk_paragraphs.csv",
+                            "Download Risk Analysis as CSV"
+                        ),
+                        unsafe_allow_html=True
+                    )
+                else:
+                    st.info("No risk keywords found in the document.")
+        # Summary Tab
+        with summary_tab:
+            st.subheader("Executive Summary")
+            # Create a simple executive summary based on findings
+            summary_points = []
+            # Add entity summary
+            if entities:
+                entity_counts = {}
+                for entity in entities:
+                    entity_type = entity["type"]
+                    if entity_type not in entity_counts:
+                        entity_counts[entity_type] = 0
+                    entity_counts[entity_type] += 1
+                entity_summary = ", ".join([f"{count} {entity_type}" for entity_type, count in entity_counts.items()])
+                summary_points.append(f"Found {entity_summary}.")
+            # Add risk summary
+            if 'risk_paragraphs' in locals() and risk_paragraphs:
+                top_risk = risk_paragraphs[0]
+                summary_points.append(f"Highest risk section identified with score {top_risk['risk_score']:.2f} containing keywords: {', '.join([kw for kw in risk_keywords_list if re.search(r'\b' + re.escape(kw) + r'\b', top_risk['paragraph'], re.IGNORECASE)])}.")
+                # Add document risk level
+                if 'overall_risk' in locals():
+                    summary_points.append(f"Overall document risk level: {risk_level}.")
+            # Add QA summary
+            if qa_mode and 'qa_results' in locals() and qa_results:
+                # Find the highest confidence answer
+                best_qa = max(qa_results, key=lambda x: x["confidence"])
+                if best_qa["confidence"] > 0:
+                    summary_points.append(f"Key finding: In response to '{best_qa['question']}', the document states '{best_qa['answer']}' (confidence: {best_qa['confidence']:.2f}).")
+            if summary_points:
+                for point in summary_points:
+                    st.markdown(f"• {point}")
+            else:
+                st.info("Not enough data to generate a summary. Try adjusting analysis parameters.")
+            # Export all results as JSON
+            all_results = {
+                "filename": uploaded_file.name,
+                "analysis_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                "entities": entities if 'entities' in locals() else [],
+                "qa_results": qa_results if 'qa_results' in locals() else [],
+                "risk_paragraphs": [{k: v for k, v in p.items() if k != 'highlighted_text'} for p in risk_paragraphs] if 'risk_paragraphs' in locals() else [],
+                "summary_points": summary_points
+            }
+            st.markdown(
+                get_download_link(
+                    all_results,
+                    f"regulatory_analysis_{datetime.now().strftime('%Y%m%d%H%M%S')}.json",
+                    "Download Complete Analysis Results (JSON)"
+                ),
+                unsafe_allow_html=True
+            )
+else:
+    # Show a demo or instructions
+    st.info("Upload a PDF file to begin analysis. The tool will extract text and perform NLP analysis to identify regulatory obligations, risks, and more.")
+    # Sample visualization of what the tool does
+    st.subheader("What This Tool Does")
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        st.markdown("**1. Extract Text**")
+        st.markdown("Upload SEC filings and extract all text content from PDFs.")
+    with col2:
+        st.markdown("**2. Analyze Content**")
+        st.markdown("Use NLP to identify regulatory entities, answer questions, and flag risk language.")
+    with col3:
+        st.markdown("**3. Export Results**")
+        st.markdown("Download structured analysis results for review by your legal and compliance teams.")
+# Add footer with information
+st.markdown("---")
+st.markdown("Regulatory Report Checker - NLP-powered document analysis for compliance teams")

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+streamlit==1.24.0
+pdfplumber==0.9.0
+spacy==3.5.3
+torch==2.0.1
+transformers==4.30.2
+pandas==2.0.3
+tqdm==4.65.0