Upload 4 files
Browse files- Dockerfile +30 -0
- README.md +22 -12
- app.py +504 -0
- requirements.txt +7 -0
Dockerfile
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.9-slim

WORKDIR /app

# Install system dependencies.
# software-properties-common is deliberately not installed: nothing in this
# image calls add-apt-repository, and the package is not available on newer
# Debian releases, where requesting it makes the build fail.
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better caching
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install spaCy model
RUN python -m spacy download en_core_web_sm

# Copy the app code
COPY . .

# Expose port for Streamlit
EXPOSE 8501

# Set environment variables
ENV PYTHONUNBUFFERED=1

# Run the application
CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
README.md
CHANGED
@@ -1,12 +1,22 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Regulatory Report Checker
|
2 |
+
|
3 |
+
This application analyzes SEC filings (10-K, 13F, etc.) to extract:
|
4 |
+
- Regulatory obligations
|
5 |
+
- Risk statements
|
6 |
+
- Regulatory agency references
|
7 |
+
- Potential violations
|
8 |
+
|
9 |
+
## Features
|
10 |
+
- PDF text extraction
|
11 |
+
- Named Entity Recognition for regulatory entities
|
12 |
+
- Question Answering for regulatory information
|
13 |
+
- Risk analysis with scoring and highlighting
|
14 |
+
- Export capabilities (CSV/JSON)
|
15 |
+
|
16 |
+
## How to Use
|
17 |
+
1. Upload an SEC filing PDF
|
18 |
+
2. Configure analysis settings in the sidebar
|
19 |
+
3. Review results across different tabs
|
20 |
+
4. Download analysis reports
|
21 |
+
|
22 |
+
Built with Streamlit, Hugging Face Transformers, spaCy, and pdfplumber.
|
app.py
ADDED
@@ -0,0 +1,504 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pdfplumber
|
3 |
+
import pandas as pd
|
4 |
+
import re
|
5 |
+
import spacy
|
6 |
+
import torch
|
7 |
+
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForTokenClassification, pipeline
|
8 |
+
import base64
|
9 |
+
import io
|
10 |
+
from datetime import datetime
|
11 |
+
import json
|
12 |
+
|
13 |
+
# Set page config
st.set_page_config(
    page_title="Regulatory Report Checker",
    page_icon="📋",
    layout="wide",
)

# Application title and description
st.title("Regulatory Report Checker")
st.markdown("""
This application analyzes SEC filings (10-K, 13F, etc.) to extract:
- Regulatory obligations
- Risk statements
- Regulatory agency references
- Potential violations
""")

# Sidebar for model selection and settings
st.sidebar.header("Analysis Settings")

# Model selection.
# Every option must be a checkpoint fine-tuned for extractive question
# answering: a plain base checkpoint (e.g. "distilbert-base-uncased") has no
# trained QA head, so its answers and confidence scores would be meaningless.
nlp_model = st.sidebar.selectbox(
    "Select NLP Model",
    [
        "distilbert-base-uncased-distilled-squad",
        "deepset/deberta-v3-base-squad2",
        "distilbert-base-cased-distilled-squad",
    ],
)

# Entity types to identify
entity_types = st.sidebar.multiselect(
    "Entity Types to Extract",
    ["Obligation", "Regulatory Agency", "Risk", "Deadline", "Penalty", "Amount"],
    default=["Obligation", "Regulatory Agency", "Risk"],
)

# QA mode selection
qa_mode = st.sidebar.checkbox("Enable Question Answering", value=True)

# Custom questions for QA. Always defined so later code can rely on the name
# even when question answering is switched off.
custom_questions = []

if qa_mode:
    default_questions = [
        "What are the regulatory obligations mentioned?",
        "Are there any violations or risk statements?",
        "What regulatory agencies are mentioned?",
        "What are the compliance deadlines?",
    ]

    # Allow users to edit questions or add new ones
    st.sidebar.subheader("Custom Questions")

    # Start with default questions that can be modified; a question whose
    # text box has been cleared is dropped.
    for i, default_q in enumerate(default_questions):
        q = st.sidebar.text_input(f"Question {i+1}", value=default_q)
        if q:
            custom_questions.append(q)

    # Option to add more questions
    new_q = st.sidebar.text_input("Additional Question")
    if new_q:
        custom_questions.append(new_q)

# Risk keyword settings
st.sidebar.subheader("Risk Keywords")
default_risk_keywords = "non-compliance, penalty, violation, risk, fine, investigation, audit, failure, breach, warning"
risk_keywords = st.sidebar.text_area("Enter risk keywords (comma separated)", value=default_risk_keywords)
# Drop blank entries (trailing or doubled commas): an empty keyword turns the
# word-boundary regex used downstream into one that matches every paragraph.
# dict.fromkeys removes repeated keywords while keeping the entered order.
risk_keywords_list = list(dict.fromkeys(
    keyword.strip() for keyword in risk_keywords.split(",") if keyword.strip()
))

# Add confidence threshold slider
confidence_threshold = st.sidebar.slider("Confidence Threshold", 0.0, 1.0, 0.5)
|
81 |
+
|
82 |
+
# Function to extract text from PDF
|
83 |
+
@st.cache_data
def extract_text_from_pdf(pdf_file):
    """Extract the text of a PDF, both combined and per page.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts; the caller in this
            file passes the object returned by ``st.file_uploader``.

    Returns:
        A ``(full_text, text_by_page)`` tuple. ``text_by_page`` maps 1-based
        page numbers to that page's text; pages for which ``extract_text()``
        returns nothing are left out. ``full_text`` is the page texts joined
        with a blank line between pages.
    """
    with pdfplumber.open(pdf_file) as document:
        text_by_page = {
            page_number: page_text
            for page_number, page in enumerate(document.pages, start=1)
            if (page_text := page.extract_text())
        }

    return "\n\n".join(text_by_page.values()), text_by_page
|
97 |
+
|
98 |
+
# Function to highlight risk keywords in text
|
99 |
+
def highlight_risk_terms(text, risk_terms):
    """Wrap every whole-word occurrence of a risk term in red/bold markdown.

    Matching is case-insensitive, but the text as it appears in the document
    is what gets wrapped, so the original capitalisation is preserved.

    Args:
        text: The text to annotate.
        risk_terms: Iterable of keyword strings. Blank entries are ignored.

    Returns:
        ``text`` with each match replaced by ``**:red[<matched text>]**``.
        The input is returned unchanged when there are no usable terms.
    """
    usable_terms = {term.strip() for term in risk_terms if term and term.strip()}
    if not usable_terms:
        return text

    # A single pass over the text (instead of one substitution per term) means
    # a later term can never match inside markup inserted for an earlier one,
    # e.g. the keyword "red" hitting ":red[". Longest-first ordering makes the
    # alternation prefer "non-compliance" over "compliance".
    ordered_terms = sorted(usable_terms, key=lambda term: (-len(term), term))
    pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(term) for term in ordered_terms) + r')\b',
        re.IGNORECASE,
    )
    return pattern.sub(lambda match: f"**:red[{match.group(0)}]**", text)
|
105 |
+
|
106 |
+
# Function to perform NER using spaCy with custom rules
|
107 |
+
def perform_ner(text, entity_types):
    """Find regulatory entities in ``text`` using spaCy plus rule patterns.

    Args:
        text: The document text to analyse.
        entity_types: Collection of the sidebar labels to extract. Recognised
            values are "Regulatory Agency", "Obligation", "Risk", "Deadline",
            "Penalty" and "Amount".

    Returns:
        A list of dicts, one per entity, with keys ``text``, ``start``,
        ``end`` (character offsets into ``text``), ``type`` (one of
        REGULATORY_AGENCY, OBLIGATION, RISK, DEADLINE, PENALTY, AMOUNT) and
        ``context`` (the entity plus up to 50 characters either side).
        Only entity types that were requested are returned.
    """
    label_by_type = {
        "Regulatory Agency": "REGULATORY_AGENCY",
        "Obligation": "OBLIGATION",
        "Risk": "RISK",
        "Deadline": "DEADLINE",
        "Penalty": "PENALTY",
    }
    wanted_labels = {
        label for type_name, label in label_by_type.items() if type_name in entity_types
    }
    include_amounts = "Amount" in entity_types

    # Nothing requested: skip loading the model altogether.
    if not wanted_labels and not include_amounts:
        return []

    # Load spaCy model.
    # NOTE(review): this reloads the model on every call; caching it would
    # need care because the rule component below is added to the pipeline.
    nlp = spacy.load("en_core_web_sm")

    def lower_tokens(phrase):
        # Tokenise with the pipeline's own tokenizer so multi-word and
        # hyphenated triggers line up with how the document is tokenised,
        # and match on LOWER so sentence-initial capitals still match.
        return [{"LOWER": token.lower_} for token in nlp.make_doc(phrase)]

    # Define patterns for each entity type
    patterns = []

    # Regulatory agency patterns (exact-case phrases: "SEC" should not match "sec")
    if "Regulatory Agency" in entity_types:
        agencies = ["SEC", "FINRA", "CFTC", "FDIC", "Federal Reserve", "OCC", "CFPB",
                    "FTC", "IRS", "DOJ", "EPA", "FDA", "OSHA", "Securities and Exchange Commission"]
        for agency in agencies:
            patterns.append({"label": "REGULATORY_AGENCY", "pattern": agency})

    # Obligation patterns. Each trigger is expanded to one token spec per
    # word; a single {"LOWER": "required to"} spec can never match because no
    # individual token contains a space.
    if "Obligation" in entity_types:
        obligation_triggers = ["must", "required to", "shall", "obligation to", "mandated",
                               "compliance with", "comply with", "required by", "in accordance with"]
        for trigger in obligation_triggers:
            patterns.append({"label": "OBLIGATION", "pattern": lower_tokens(trigger)})

    # Risk patterns
    if "Risk" in entity_types:
        risk_triggers = ["risk", "exposure", "vulnerable", "susceptible", "hazard",
                         "threat", "danger", "liability", "non-compliance", "violation"]
        for trigger in risk_triggers:
            patterns.append({"label": "RISK", "pattern": lower_tokens(trigger)})

    # Deadline patterns: a trigger followed by one or more tokens that the
    # statistical NER tagged as DATE, so the whole date is captured.
    if "Deadline" in entity_types:
        deadline_triggers = ["by", "due", "deadline", "within", "no later than"]
        for trigger in deadline_triggers:
            patterns.append({
                "label": "DEADLINE",
                "pattern": lower_tokens(trigger) + [{"ENT_TYPE": "DATE", "OP": "+"}],
            })

    # Penalty patterns
    if "Penalty" in entity_types:
        penalty_triggers = ["fine", "penalty", "sanction", "enforcement", "punitive", "disciplinary"]
        for trigger in penalty_triggers:
            patterns.append({"label": "PENALTY", "pattern": lower_tokens(trigger)})

    if patterns:
        # The ruler is appended after the statistical "ner" component so the
        # DATE annotations used by the deadline patterns already exist.
        # overwrite_ents lets a rule match replace an overlapping NER entity;
        # without it, matches such as "SEC" (already tagged by NER) or
        # "by <DATE>" are discarded.
        ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
        ruler.add_patterns(patterns)

    # Process text
    doc = nlp(text)

    # Extract entities
    entities = []
    for ent in doc.ents:
        if ent.label_ in wanted_labels:
            entity_type = ent.label_
        elif ent.label_ == "MONEY" and include_amounts:
            # spaCy's built-in MONEY label is reported as AMOUNT.
            entity_type = "AMOUNT"
        else:
            continue

        entities.append({
            "text": ent.text,
            "start": ent.start_char,
            "end": ent.end_char,
            "type": entity_type,
            "context": text[max(0, ent.start_char - 50):min(len(text), ent.end_char + 50)],
        })

    return entities
|
173 |
+
|
174 |
+
# Function to perform Question Answering
|
175 |
+
@st.cache_resource
def load_qa_model(model_name):
    """Build a Hugging Face question-answering pipeline for ``model_name``.

    The tokenizer and the question-answering model are both loaded from the
    same checkpoint name. The function is wrapped in ``st.cache_resource``.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        The ``pipeline("question-answering", ...)`` object.
    """
    return pipeline(
        "question-answering",
        tokenizer=AutoTokenizer.from_pretrained(model_name),
        model=AutoModelForQuestionAnswering.from_pretrained(model_name),
    )
|
181 |
+
|
182 |
+
def perform_qa(text, questions, qa_pipeline, confidence_threshold, max_chunk_chars=512):
    """Answer each question against ``text``, keeping the best-scoring answer.

    The text is split into sentence-aligned chunks, every question is run
    against every chunk, and the highest-scoring answer that reaches
    ``confidence_threshold`` is kept.

    Args:
        text: Document text to search.
        questions: Iterable of question strings.
        qa_pipeline: Callable invoked as ``qa_pipeline(question=..., context=...)``
            and returning a mapping with ``answer``, ``score``, ``start`` and
            ``end`` keys.
        confidence_threshold: Minimum score an answer needs to be reported.
        max_chunk_chars: Target chunk size in characters (not tokens). A
            single sentence longer than this is still passed on as one chunk.

    Returns:
        A list with one dict per question, with keys ``question``, ``answer``,
        ``confidence`` and ``context``. When nothing qualifies, ``answer`` is
        a fixed "no answer" message, ``confidence`` is 0 and ``context`` is
        empty.
    """
    # Simple chunking by sentences
    chunks = []
    current_chunk = ""

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(current_chunk) + len(sentence) < max_chunk_chars:
            current_chunk += sentence + " "
            continue

        # Never emit an empty chunk: that happens when the very first
        # sentence is already over the limit, and an empty context makes the
        # pipeline raise for every question.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        current_chunk = sentence + " "

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    # Process each question across all chunks. With no chunks (empty or
    # whitespace-only text) every question falls through to "no answer".
    results = []

    for question in questions:
        best_answer = {"answer": "", "score": 0, "context": ""}

        for chunk in chunks:
            try:
                result = qa_pipeline(question=question, context=chunk)
            except Exception as e:
                # Best effort: report the failing chunk and keep going.
                st.error(f"Error processing chunk with question '{question}': {str(e)}")
                continue

            if result["score"] > best_answer["score"] and result["score"] >= confidence_threshold:
                best_answer = {
                    "answer": result["answer"],
                    "score": result["score"],
                    # Up to 100 characters either side of the answer span.
                    "context": chunk[max(0, result["start"] - 100):min(len(chunk), result["end"] + 100)],
                }

        if best_answer["answer"]:
            results.append({
                "question": question,
                "answer": best_answer["answer"],
                "confidence": best_answer["score"],
                "context": best_answer["context"],
            })
        else:
            results.append({
                "question": question,
                "answer": "No answer found with sufficient confidence.",
                "confidence": 0,
                "context": "",
            })

    return results
|
240 |
+
|
241 |
+
# Function to create downloadable file
|
242 |
+
def get_download_link(data, filename, text):
    """Generate an HTML link that downloads the given data as a file.

    Args:
        data: A ``pandas.DataFrame`` (exported as CSV without the index) or
            any JSON-serialisable object (exported as indented JSON).
        filename: File name suggested to the browser for the download.
        text: Visible link text.

    Returns:
        An ``<a>`` element whose ``href`` is a base64 ``data:`` URI carrying
        the payload. ``filename`` and ``text`` are HTML-escaped because the
        callers render the result with ``unsafe_allow_html=True``.
    """
    import html  # local import keeps this block self-contained

    if isinstance(data, pd.DataFrame):
        payload = data.to_csv(index=False)
        mime_type = "text/csv"
    else:  # Assume JSON
        payload = json.dumps(data, indent=4)
        mime_type = "application/json"

    b64 = base64.b64encode(payload.encode()).decode()
    safe_filename = html.escape(str(filename), quote=True)
    safe_text = html.escape(str(text))

    return f'<a href="data:{mime_type};base64,{b64}" download="{safe_filename}">{safe_text}</a>'
|
252 |
+
|
253 |
+
# File upload
|
254 |
+
uploaded_file = st.file_uploader("Upload SEC Filing (PDF)", type=["pdf"])

if uploaded_file:
    with st.spinner("Processing PDF file..."):
        # Extract text from PDF
        full_text, text_by_page = extract_text_from_pdf(uploaded_file)

        # Show text extraction status
        st.success(f"Successfully extracted text from {len(text_by_page)} pages")

    if not full_text.strip():
        # e.g. a scanned PDF with no text layer: the analyses below will
        # simply come back empty.
        st.warning("No text could be extracted from this PDF.")

    # Allow user to view the extracted text
    with st.expander("View Extracted Text"):
        page_selection = st.selectbox(
            "Select page to view",
            ["All"] + list(text_by_page.keys())
        )

        if page_selection == "All":
            st.text_area("Full Text", full_text, height=300)
        else:
            st.text_area(f"Page {page_selection}", text_by_page[page_selection], height=300)

    # Begin analysis section
    st.header("Analysis Results")

    # Create tabs for different analysis methods
    ner_tab, qa_tab, risk_tab, summary_tab = st.tabs(["Entity Recognition", "Question Answering", "Risk Analysis", "Summary"])

    # Results shared with the summary tab and the JSON export. They are
    # initialised here so later code can test their values directly instead
    # of probing locals() for names that may or may not have been assigned.
    qa_results = []
    risk_paragraphs = []
    overall_risk = None
    risk_level = None

    # One compiled whole-word, case-insensitive pattern per keyword. Blank
    # keywords are skipped: r'\b\b' would match every paragraph.
    keyword_patterns = {
        keyword: re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
        for keyword in risk_keywords_list
        if keyword
    }

    # NER Analysis
    with ner_tab:
        with st.spinner("Performing Entity Recognition..."):
            entities = perform_ner(full_text, entity_types)

        if entities:
            # Group entities by type
            entities_by_type = {}
            for entity in entities:
                entities_by_type.setdefault(entity["type"], []).append(entity)

            # Display entities by type
            for type_label, type_entities in entities_by_type.items():
                st.subheader(f"{type_label} Entities")

                # Create a dataframe for better display
                df = pd.DataFrame([{
                    "Text": e["text"],
                    "Context": e["context"]
                } for e in type_entities])

                st.dataframe(df, use_container_width=True)

                # Provide download link for this entity type
                st.markdown(
                    get_download_link(
                        df,
                        f"{type_label.lower()}_entities.csv",
                        f"Download {type_label} Entities as CSV"
                    ),
                    unsafe_allow_html=True
                )
        else:
            st.info("No entities detected. Try adjusting the entity types in the sidebar.")

    # Question Answering
    with qa_tab:
        if qa_mode:
            with st.spinner("Performing Question Answering..."):
                try:
                    qa_pipeline = load_qa_model(nlp_model)
                    qa_results = perform_qa(full_text, custom_questions, qa_pipeline, confidence_threshold)

                    # Display QA results
                    for result in qa_results:
                        st.subheader(result["question"])

                        if result["confidence"] > 0:
                            st.markdown(f"**Answer:** {result['answer']}")
                            st.markdown(f"**Confidence:** {result['confidence']:.2f}")

                            with st.expander("Show Context"):
                                # Highlight the answer in the context
                                highlighted_context = result["context"].replace(
                                    result["answer"],
                                    f"**:blue[{result['answer']}]**"
                                )
                                st.markdown(highlighted_context)
                        else:
                            st.info("No answer found with sufficient confidence.")

                    # Provide download link for QA results
                    qa_df = pd.DataFrame(qa_results)
                    st.markdown(
                        get_download_link(
                            qa_df,
                            "qa_results.csv",
                            "Download QA Results as CSV"
                        ),
                        unsafe_allow_html=True
                    )
                except Exception as e:
                    # Model download/loading or inference failed; keep the
                    # rest of the page usable.
                    st.error(f"Error performing question answering: {str(e)}")
        else:
            st.info("Question Answering is disabled. Enable it from the sidebar.")

    # Risk Analysis
    with risk_tab:
        with st.spinner("Analyzing Risk Keywords..."):
            # Find paragraphs with risk keywords
            for para in re.split(r'\n\n+', full_text):
                # Count how many distinct risk keywords are found
                keyword_count = sum(
                    1 for pattern in keyword_patterns.values() if pattern.search(para)
                )
                if not keyword_count:
                    continue

                risk_paragraphs.append({
                    "paragraph": para,
                    "keyword_count": keyword_count,
                    # Simple score: one tenth per distinct keyword, capped at 1.0
                    "risk_score": min(1.0, keyword_count / 10),
                    "highlighted_text": highlight_risk_terms(para, list(keyword_patterns))
                })

        if risk_paragraphs:
            # Sort by risk score (highest first)
            risk_paragraphs.sort(key=lambda x: x["risk_score"], reverse=True)

            # Display risk paragraphs
            st.subheader(f"Found {len(risk_paragraphs)} Paragraphs with Risk Keywords")

            # Overall document risk score (average of top 5 paragraphs)
            top_paragraphs = risk_paragraphs[:5]
            overall_risk = sum(p["risk_score"] for p in top_paragraphs) / len(top_paragraphs)

            # Display risk meter
            st.subheader("Document Risk Assessment")
            st.progress(overall_risk)
            if overall_risk < 0.4:
                risk_level = "Low"
            elif overall_risk < 0.7:
                risk_level = "Medium"
            else:
                risk_level = "High"
            risk_color = {"Low": "green", "Medium": "orange", "High": "red"}[risk_level]
            st.markdown(f"**Risk Level: :{risk_color}[{risk_level}]** (Score: {overall_risk:.2f})")

            # Display individual paragraphs
            for i, para in enumerate(risk_paragraphs):
                with st.expander(f"Risk Paragraph {i+1} (Score: {para['risk_score']:.2f})"):
                    st.markdown(para["highlighted_text"])

            # Provide download link for risk paragraphs
            risk_df = pd.DataFrame([{
                "Risk Score": p["risk_score"],
                "Keyword Count": p["keyword_count"],
                "Paragraph": p["paragraph"]
            } for p in risk_paragraphs])

            st.markdown(
                get_download_link(
                    risk_df,
                    "risk_paragraphs.csv",
                    "Download Risk Analysis as CSV"
                ),
                unsafe_allow_html=True
            )
        else:
            st.info("No risk keywords found in the document.")

    # Summary Tab
    with summary_tab:
        st.subheader("Executive Summary")

        # Create a simple executive summary based on findings
        summary_points = []

        # Add entity summary
        if entities:
            entity_counts = {}
            for entity in entities:
                entity_counts[entity["type"]] = entity_counts.get(entity["type"], 0) + 1

            entity_summary = ", ".join([f"{count} {type_label}" for type_label, count in entity_counts.items()])
            summary_points.append(f"Found {entity_summary}.")

        # Add risk summary
        if risk_paragraphs:
            top_risk = risk_paragraphs[0]
            # Built outside the f-string: before Python 3.12 an f-string
            # expression may not contain a backslash, so embedding the regex
            # there stops the whole script from compiling.
            top_keywords = ", ".join(
                keyword
                for keyword, pattern in keyword_patterns.items()
                if pattern.search(top_risk["paragraph"])
            )
            summary_points.append(f"Highest risk section identified with score {top_risk['risk_score']:.2f} containing keywords: {top_keywords}.")

        # Add document risk level
        if overall_risk is not None:
            summary_points.append(f"Overall document risk level: {risk_level}.")

        # Add QA summary
        if qa_mode and qa_results:
            # Find the highest confidence answer
            best_qa = max(qa_results, key=lambda x: x["confidence"])
            if best_qa["confidence"] > 0:
                summary_points.append(f"Key finding: In response to '{best_qa['question']}', the document states '{best_qa['answer']}' (confidence: {best_qa['confidence']:.2f}).")

        if summary_points:
            for point in summary_points:
                st.markdown(f"• {point}")
        else:
            st.info("Not enough data to generate a summary. Try adjusting analysis parameters.")

        # Export all results as JSON
        all_results = {
            "filename": uploaded_file.name,
            "analysis_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "entities": entities,
            "qa_results": qa_results,
            # The markdown-highlighted copy is display-only and is left out.
            "risk_paragraphs": [{k: v for k, v in p.items() if k != 'highlighted_text'} for p in risk_paragraphs],
            "summary_points": summary_points
        }

        st.markdown(
            get_download_link(
                all_results,
                f"regulatory_analysis_{datetime.now().strftime('%Y%m%d%H%M%S')}.json",
                "Download Complete Analysis Results (JSON)"
            ),
            unsafe_allow_html=True
        )
else:
    # Show a demo or instructions
    st.info("Upload a PDF file to begin analysis. The tool will extract text and perform NLP analysis to identify regulatory obligations, risks, and more.")

    # Sample visualization of what the tool does
    st.subheader("What This Tool Does")

    col1, col2, col3 = st.columns(3)

    with col1:
        st.markdown("**1. Extract Text**")
        st.markdown("Upload SEC filings and extract all text content from PDFs.")

    with col2:
        st.markdown("**2. Analyze Content**")
        st.markdown("Use NLP to identify regulatory entities, answer questions, and flag risk language.")

    with col3:
        st.markdown("**3. Export Results**")
        st.markdown("Download structured analysis results for review by your legal and compliance teams.")

# Add footer with information
st.markdown("---")
st.markdown("Regulatory Report Checker - NLP-powered document analysis for compliance teams")
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit==1.24.0
|
2 |
+
pdfplumber==0.9.0
|
3 |
+
spacy==3.5.3
|
4 |
+
torch==2.0.1
|
5 |
+
transformers==4.30.2
|
6 |
+
pandas==2.0.3
|
7 |
+
tqdm==4.65.0
|