koulsahil committed
Commit 4380ad1 · verified · 1 Parent(s): 59151b3

Upload 4 files

Files changed (4)
  1. Dockerfile +30 -0
  2. README.md +22 -12
  3. app.py +504 -0
  4. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,30 @@
1
+ FROM python:3.9-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Install system dependencies
6
+ RUN apt-get update && apt-get install -y \
7
+ build-essential \
8
+ curl \
9
+ software-properties-common \
10
+ git \
11
+ && rm -rf /var/lib/apt/lists/*
12
+
13
+ # Copy requirements first for better caching
14
+ COPY requirements.txt .
15
+ RUN pip install --no-cache-dir -r requirements.txt
16
+
17
+ # Install spaCy model
18
+ RUN python -m spacy download en_core_web_sm
19
+
20
+ # Copy the app code
21
+ COPY . .
22
+
23
+ # Expose port for Streamlit
24
+ EXPOSE 8501
25
+
26
+ # Set environment variables
27
+ ENV PYTHONUNBUFFERED=1
28
+
29
+ # Run the application
30
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md CHANGED
@@ -1,12 +1,22 @@
1
- ---
2
- title: Regulatory Document Analyzer
3
- emoji: 🐢
4
- colorFrom: green
5
- colorTo: red
6
- sdk: docker
7
- pinned: false
8
- license: other
9
- short_description: This application analyzes SEC filings (10-K, 13F, etc.)
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # Regulatory Report Checker
2
+
3
+ This application analyzes SEC filings (10-K, 13F, etc.) to extract:
4
+ - Regulatory obligations
5
+ - Risk statements
6
+ - Regulatory agency references
7
+ - Potential violations
8
+
9
+ ## Features
10
+ - PDF text extraction
11
+ - Named Entity Recognition for regulatory entities
12
+ - Question Answering for regulatory information
13
+ - Risk analysis with scoring and highlighting
14
+ - Export capabilities (CSV/JSON)
15
+
16
+ ## How to Use
17
+ 1. Upload an SEC filing PDF
18
+ 2. Configure analysis settings in the sidebar
19
+ 3. Review results across different tabs
20
+ 4. Download analysis reports
21
+
22
+ Built with Streamlit, Hugging Face Transformers, spaCy, and PDFPlumber.
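The extract-and-analyze flow described above can also be exercised outside the Streamlit UI. A minimal sketch using pdfplumber and a transformers question-answering pipeline, with one of the models app.py offers; `filing.pdf` is a placeholder path:

```python
# Minimal sketch of the PDF-extraction + QA flow (no Streamlit UI).
# "filing.pdf" is a placeholder; the model is one of the defaults listed in app.py.
import pdfplumber
from transformers import pipeline

with pdfplumber.open("filing.pdf") as pdf:
    text = "\n\n".join(page.extract_text() or "" for page in pdf.pages)

qa = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
result = qa(question="What regulatory agencies are mentioned?", context=text[:4000])
print(result["answer"], f"(confidence {result['score']:.2f})")
```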
app.py ADDED
@@ -0,0 +1,504 @@
1
+ import streamlit as st
2
+ import pdfplumber
3
+ import pandas as pd
4
+ import re
5
+ import spacy
6
+ import torch
7
+ from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForTokenClassification, pipeline
8
+ import base64
9
+ import io
10
+ from datetime import datetime
11
+ import json
12
+
13
+ # Set page config
14
+ st.set_page_config(
15
+ page_title="Regulatory Report Checker",
16
+ page_icon="📋",
17
+ layout="wide"
18
+ )
19
+
20
+ # Application title and description
21
+ st.title("Regulatory Report Checker")
22
+ st.markdown("""
23
+ This application analyzes SEC filings (10-K, 13F, etc.) to extract:
24
+ - Regulatory obligations
25
+ - Risk statements
26
+ - Regulatory agency references
27
+ - Potential violations
28
+ """)
29
+
30
+ # Sidebar for model selection and settings
31
+ st.sidebar.header("Analysis Settings")
32
+
33
+ # Model selection
34
+ nlp_model = st.sidebar.selectbox(
35
+ "Select NLP Model",
36
+ ["distilbert-base-uncased", "deepset/deberta-v3-base-squad2", "distilbert-base-cased-distilled-squad"]
37
+ )
38
+
39
+ # Entity types to identify
40
+ entity_types = st.sidebar.multiselect(
41
+ "Entity Types to Extract",
42
+ ["Obligation", "Regulatory Agency", "Risk", "Deadline", "Penalty", "Amount"],
43
+ default=["Obligation", "Regulatory Agency", "Risk"]
44
+ )
45
+
46
+ # QA mode selection
47
+ qa_mode = st.sidebar.checkbox("Enable Question Answering", value=True)
48
+
49
+ # Custom questions for QA
50
+ if qa_mode:
51
+ default_questions = [
52
+ "What are the regulatory obligations mentioned?",
53
+ "Are there any violations or risk statements?",
54
+ "What regulatory agencies are mentioned?",
55
+ "What are the compliance deadlines?"
56
+ ]
57
+
58
+ # Allow users to edit questions or add new ones
59
+ st.sidebar.subheader("Custom Questions")
60
+ custom_questions = []
61
+
62
+ # Start with default questions that can be modified
63
+ for i, default_q in enumerate(default_questions):
64
+ q = st.sidebar.text_input(f"Question {i+1}", value=default_q)
65
+ if q:
66
+ custom_questions.append(q)
67
+
68
+ # Option to add more questions
69
+ new_q = st.sidebar.text_input("Additional Question")
70
+ if new_q:
71
+ custom_questions.append(new_q)
72
+
73
+ # Risk keyword settings
74
+ st.sidebar.subheader("Risk Keywords")
75
+ default_risk_keywords = "non-compliance, penalty, violation, risk, fine, investigation, audit, failure, breach, warning"
76
+ risk_keywords = st.sidebar.text_area("Enter risk keywords (comma separated)", value=default_risk_keywords)
77
+ risk_keywords_list = [keyword.strip() for keyword in risk_keywords.split(",")]
78
+
79
+ # Add confidence threshold slider
80
+ confidence_threshold = st.sidebar.slider("Confidence Threshold", 0.0, 1.0, 0.5)
81
+
82
+ # Function to extract text from PDF
83
+ @st.cache_data
84
+ def extract_text_from_pdf(pdf_file):
85
+ text_by_page = {}
86
+
87
+ with pdfplumber.open(pdf_file) as pdf:
88
+ for i, page in enumerate(pdf.pages):
89
+ text = page.extract_text()
90
+ if text:
91
+ text_by_page[i+1] = text
92
+
93
+ # Combine all text
94
+ full_text = "\n\n".join(text_by_page.values())
95
+
96
+ return full_text, text_by_page
97
+
98
+ # Function to highlight risk keywords in text
99
+ def highlight_risk_terms(text, risk_terms):
100
+ highlighted_text = text
101
+ for term in risk_terms:
102
+ pattern = re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)
103
+ highlighted_text = pattern.sub(f"**:red[{term}]**", highlighted_text)
104
+ return highlighted_text
105
+
106
+ # Function to perform NER using spaCy with custom rules
107
+ def perform_ner(text, entity_types):
108
+ # Load spaCy model
109
+ nlp = spacy.load("en_core_web_sm")
110
+
111
+ # Add custom rules for regulatory entities
112
+ ruler = nlp.add_pipe("entity_ruler")
113
+
114
+ # Define patterns for each entity type
115
+ patterns = []
116
+
117
+ # Regulatory agency patterns
118
+ if "Regulatory Agency" in entity_types:
119
+ agencies = ["SEC", "FINRA", "CFTC", "FDIC", "Federal Reserve", "OCC", "CFPB",
120
+ "FTC", "IRS", "DOJ", "EPA", "FDA", "OSHA", "Securities and Exchange Commission"]
121
+ for agency in agencies:
122
+ patterns.append({"label": "REGULATORY_AGENCY", "pattern": agency})
123
+
124
+ # Obligation patterns
125
+ if "Obligation" in entity_types:
126
+ obligation_triggers = ["must", "required to", "shall", "obligation to", "mandated",
127
+ "compliance with", "comply with", "required by", "in accordance with"]
128
+ for trigger in obligation_triggers:
129
+ patterns.append({"label": "OBLIGATION", "pattern": [{"LOWER": trigger}]})
130
+
131
+ # Risk patterns
132
+ if "Risk" in entity_types:
133
+ risk_triggers = ["risk", "exposure", "vulnerable", "susceptible", "hazard",
134
+ "threat", "danger", "liability", "non-compliance", "violation"]
135
+ for trigger in risk_triggers:
136
+ patterns.append({"label": "RISK", "pattern": trigger})
137
+
138
+ # Deadline patterns
139
+ if "Deadline" in entity_types:
140
+ deadline_triggers = ["by", "due", "deadline", "within", "no later than"]
141
+ for trigger in deadline_triggers:
142
+ patterns.append({"label": "DEADLINE", "pattern": [{"LOWER": trigger}, {"ENT_TYPE": "DATE"}]})
143
+
144
+ # Penalty patterns
145
+ if "Penalty" in entity_types:
146
+ penalty_triggers = ["fine", "penalty", "sanction", "enforcement", "punitive", "disciplinary"]
147
+ for trigger in penalty_triggers:
148
+ patterns.append({"label": "PENALTY", "pattern": trigger})
149
+
150
+ # Add patterns to ruler
151
+ ruler.add_patterns(patterns)
152
+
153
+ # Process text
154
+ doc = nlp(text)
155
+
156
+ # Extract entities
157
+ entities = []
158
+ for ent in doc.ents:
159
+ if ent.label_ in ["REGULATORY_AGENCY", "OBLIGATION", "RISK", "DEADLINE", "PENALTY"] or (ent.label_ == "MONEY" and "Amount" in entity_types):
160
+ entity_type = ent.label_
161
+ if ent.label_ == "MONEY" and "Amount" in entity_types:
162
+ entity_type = "AMOUNT"
163
+
164
+ entities.append({
165
+ "text": ent.text,
166
+ "start": ent.start_char,
167
+ "end": ent.end_char,
168
+ "type": entity_type,
169
+ "context": text[max(0, ent.start_char - 50):min(len(text), ent.end_char + 50)]
170
+ })
171
+
172
+ return entities
173
+
174
+ # Function to perform Question Answering
175
+ @st.cache_resource
176
+ def load_qa_model(model_name):
177
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
178
+ model = AutoModelForQuestionAnswering.from_pretrained(model_name)
179
+ qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
180
+ return qa_pipeline
181
+
182
+ def perform_qa(text, questions, qa_pipeline, confidence_threshold):
183
+ # Split text into chunks if it's too long
184
+ max_length = 512 # chunk size in characters, kept well below the model's 512-token input limit
185
+ chunks = []
186
+
187
+ # Simple chunking by sentences
188
+ sentences = re.split(r'(?<=[.!?])\s+', text)
189
+ current_chunk = ""
190
+
191
+ for sentence in sentences:
192
+ if len(current_chunk) + len(sentence) < max_length:
193
+ current_chunk += sentence + " "
194
+ else:
195
+ chunks.append(current_chunk.strip())
196
+ current_chunk = sentence + " "
197
+
198
+ if current_chunk:
199
+ chunks.append(current_chunk.strip())
200
+
201
+ # If text is still short enough, just use it directly
202
+ if not chunks:
203
+ chunks = [text]
204
+
205
+ # Process each question across all chunks
206
+ results = []
207
+
208
+ for question in questions:
209
+ best_answer = {"answer": "", "score": 0, "context": ""}
210
+
211
+ for chunk in chunks:
212
+ try:
213
+ result = qa_pipeline(question=question, context=chunk)
214
+ if result["score"] > best_answer["score"] and result["score"] >= confidence_threshold:
215
+ best_answer = {
216
+ "answer": result["answer"],
217
+ "score": result["score"],
218
+ "context": chunk[max(0, result["start"] - 100):min(len(chunk), result["end"] + 100)]
219
+ }
220
+ except Exception as e:
221
+ st.error(f"Error processing chunk with question '{question}': {str(e)}")
222
+ continue
223
+
224
+ if best_answer["answer"]:
225
+ results.append({
226
+ "question": question,
227
+ "answer": best_answer["answer"],
228
+ "confidence": best_answer["score"],
229
+ "context": best_answer["context"]
230
+ })
231
+ else:
232
+ results.append({
233
+ "question": question,
234
+ "answer": "No answer found with sufficient confidence.",
235
+ "confidence": 0,
236
+ "context": ""
237
+ })
238
+
239
+ return results
240
+
241
+ # Function to create downloadable file
242
+ def get_download_link(data, filename, text):
243
+ """Generate a link to download the given data as a file"""
244
+ if isinstance(data, pd.DataFrame):
245
+ csv = data.to_csv(index=False)
246
+ b64 = base64.b64encode(csv.encode()).decode()
247
+ else: # Assume JSON
248
+ b64 = base64.b64encode(json.dumps(data, indent=4).encode()).decode()
249
+
250
+ href = f'<a href="data:file/txt;base64,{b64}" download="{filename}">{text}</a>'
251
+ return href
252
+
253
+ # File upload
254
+ uploaded_file = st.file_uploader("Upload SEC Filing (PDF)", type=["pdf"])
255
+
256
+ if uploaded_file:
257
+ with st.spinner("Processing PDF file..."):
258
+ # Extract text from PDF
259
+ full_text, text_by_page = extract_text_from_pdf(uploaded_file)
260
+
261
+ # Show text extraction status
262
+ st.success(f"Successfully extracted text from {len(text_by_page)} pages")
263
+
264
+ # Allow user to view the extracted text
265
+ with st.expander("View Extracted Text"):
266
+ page_selection = st.selectbox(
267
+ "Select page to view",
268
+ ["All"] + list(text_by_page.keys())
269
+ )
270
+
271
+ if page_selection == "All":
272
+ st.text_area("Full Text", full_text, height=300)
273
+ else:
274
+ st.text_area(f"Page {page_selection}", text_by_page[page_selection], height=300)
275
+
276
+ # Begin analysis section
277
+ st.header("Analysis Results")
278
+
279
+ # Create tabs for different analysis methods
280
+ ner_tab, qa_tab, risk_tab, summary_tab = st.tabs(["Entity Recognition", "Question Answering", "Risk Analysis", "Summary"])
281
+
282
+ # NER Analysis
283
+ with ner_tab:
284
+ with st.spinner("Performing Entity Recognition..."):
285
+ entities = perform_ner(full_text, entity_types)
286
+
287
+ if entities:
288
+ # Group entities by type
289
+ entities_by_type = {}
290
+ for entity in entities:
291
+ if entity["type"] not in entities_by_type:
292
+ entities_by_type[entity["type"]] = []
293
+ entities_by_type[entity["type"]].append(entity)
294
+
295
+ # Display entities by type
296
+ for entity_type, type_entities in entities_by_type.items():
297
+ st.subheader(f"{entity_type} Entities")
298
+
299
+ # Create a dataframe for better display
300
+ df = pd.DataFrame([{
301
+ "Text": e["text"],
302
+ "Context": e["context"]
303
+ } for e in type_entities])
304
+
305
+ st.dataframe(df, use_container_width=True)
306
+
307
+ # Provide download link for this entity type
308
+ st.markdown(
309
+ get_download_link(
310
+ df,
311
+ f"{entity_type.lower()}_entities.csv",
312
+ f"Download {entity_type} Entities as CSV"
313
+ ),
314
+ unsafe_allow_html=True
315
+ )
316
+ else:
317
+ st.info("No entities detected. Try adjusting the entity types in the sidebar.")
318
+
319
+ # Question Answering
320
+ with qa_tab:
321
+ if qa_mode:
322
+ with st.spinner("Performing Question Answering..."):
323
+ try:
324
+ qa_pipeline = load_qa_model(nlp_model)
325
+ qa_results = perform_qa(full_text, custom_questions, qa_pipeline, confidence_threshold)
326
+
327
+ # Display QA results
328
+ for result in qa_results:
329
+ st.subheader(result["question"])
330
+
331
+ if result["confidence"] > 0:
332
+ st.markdown(f"**Answer:** {result['answer']}")
333
+ st.markdown(f"**Confidence:** {result['confidence']:.2f}")
334
+
335
+ with st.expander("Show Context"):
336
+ # Highlight the answer in the context
337
+ highlighted_context = result["context"].replace(
338
+ result["answer"],
339
+ f"**:blue[{result['answer']}]**"
340
+ )
341
+ st.markdown(highlighted_context)
342
+ else:
343
+ st.info("No answer found with sufficient confidence.")
344
+
345
+ # Provide download link for QA results
346
+ qa_df = pd.DataFrame(qa_results)
347
+ st.markdown(
348
+ get_download_link(
349
+ qa_df,
350
+ "qa_results.csv",
351
+ "Download QA Results as CSV"
352
+ ),
353
+ unsafe_allow_html=True
354
+ )
355
+ except Exception as e:
356
+ st.error(f"Error performing question answering: {str(e)}")
357
+ else:
358
+ st.info("Question Answering is disabled. Enable it from the sidebar.")
359
+
360
+ # Risk Analysis
361
+ with risk_tab:
362
+ with st.spinner("Analyzing Risk Keywords..."):
363
+ # Find paragraphs with risk keywords
364
+ paragraphs = re.split(r'\n\n+', full_text)
365
+ risk_paragraphs = []
366
+
367
+ for para in paragraphs:
368
+ if any(re.search(r'\b' + re.escape(keyword) + r'\b', para, re.IGNORECASE) for keyword in risk_keywords_list):
369
+ # Count how many risk keywords are found
370
+ keyword_count = sum(1 for keyword in risk_keywords_list if re.search(r'\b' + re.escape(keyword) + r'\b', para, re.IGNORECASE))
371
+
372
+ # Calculate a simple risk score based on keyword density
373
+ risk_score = min(1.0, keyword_count / 10) # Cap at 1.0
374
+
375
+ risk_paragraphs.append({
376
+ "paragraph": para,
377
+ "keyword_count": keyword_count,
378
+ "risk_score": risk_score,
379
+ "highlighted_text": highlight_risk_terms(para, risk_keywords_list)
380
+ })
381
+
382
+ if risk_paragraphs:
383
+ # Sort by risk score (highest first)
384
+ risk_paragraphs.sort(key=lambda x: x["risk_score"], reverse=True)
385
+
386
+ # Display risk paragraphs
387
+ st.subheader(f"Found {len(risk_paragraphs)} Paragraphs with Risk Keywords")
388
+
389
+ # Overall document risk score (average of top 5 paragraphs)
390
+ top_paragraphs = risk_paragraphs[:min(5, len(risk_paragraphs))]
391
+ overall_risk = sum(p["risk_score"] for p in top_paragraphs) / len(top_paragraphs)
392
+
393
+ # Display risk meter
394
+ st.subheader("Document Risk Assessment")
395
+ st.progress(overall_risk)
396
+ risk_level = "Low" if overall_risk < 0.4 else "Medium" if overall_risk < 0.7 else "High"
397
+ st.markdown(f"**Risk Level: :{'green' if risk_level == 'Low' else 'orange' if risk_level == 'Medium' else 'red'}[{risk_level}]** (Score: {overall_risk:.2f})")
398
+
399
+ # Display individual paragraphs
400
+ for i, para in enumerate(risk_paragraphs):
401
+ with st.expander(f"Risk Paragraph {i+1} (Score: {para['risk_score']:.2f})"):
402
+ st.markdown(para["highlighted_text"])
403
+
404
+ # Provide download link for risk paragraphs
405
+ risk_df = pd.DataFrame([{
406
+ "Risk Score": p["risk_score"],
407
+ "Keyword Count": p["keyword_count"],
408
+ "Paragraph": p["paragraph"]
409
+ } for p in risk_paragraphs])
410
+
411
+ st.markdown(
412
+ get_download_link(
413
+ risk_df,
414
+ "risk_paragraphs.csv",
415
+ "Download Risk Analysis as CSV"
416
+ ),
417
+ unsafe_allow_html=True
418
+ )
419
+ else:
420
+ st.info("No risk keywords found in the document.")
421
+
422
+ # Summary Tab
423
+ with summary_tab:
424
+ st.subheader("Executive Summary")
425
+
426
+ # Create a simple executive summary based on findings
427
+ summary_points = []
428
+
429
+ # Add entity summary
430
+ if entities:
431
+ entity_counts = {}
432
+ for entity in entities:
433
+ entity_type = entity["type"]
434
+ if entity_type not in entity_counts:
435
+ entity_counts[entity_type] = 0
436
+ entity_counts[entity_type] += 1
437
+
438
+ entity_summary = ", ".join([f"{count} {entity_type}" for entity_type, count in entity_counts.items()])
439
+ summary_points.append(f"Found {entity_summary}.")
440
+
441
+ # Add risk summary
442
+ if 'risk_paragraphs' in locals() and risk_paragraphs:
443
+ top_risk = risk_paragraphs[0]
444
+ summary_points.append(f"Highest risk section identified with score {top_risk['risk_score']:.2f} containing keywords: {', '.join([kw for kw in risk_keywords_list if re.search(r'\b' + re.escape(kw) + r'\b', top_risk['paragraph'], re.IGNORECASE)])}.")
445
+
446
+ # Add document risk level
447
+ if 'overall_risk' in locals():
448
+ summary_points.append(f"Overall document risk level: {risk_level}.")
449
+
450
+ # Add QA summary
451
+ if qa_mode and 'qa_results' in locals() and qa_results:
452
+ # Find the highest confidence answer
453
+ best_qa = max(qa_results, key=lambda x: x["confidence"])
454
+ if best_qa["confidence"] > 0:
455
+ summary_points.append(f"Key finding: In response to '{best_qa['question']}', the document states '{best_qa['answer']}' (confidence: {best_qa['confidence']:.2f}).")
456
+
457
+ if summary_points:
458
+ for point in summary_points:
459
+ st.markdown(f"• {point}")
460
+ else:
461
+ st.info("Not enough data to generate a summary. Try adjusting analysis parameters.")
462
+
463
+ # Export all results as JSON
464
+ all_results = {
465
+ "filename": uploaded_file.name,
466
+ "analysis_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
467
+ "entities": entities if 'entities' in locals() else [],
468
+ "qa_results": qa_results if 'qa_results' in locals() else [],
469
+ "risk_paragraphs": [{k: v for k, v in p.items() if k != 'highlighted_text'} for p in risk_paragraphs] if 'risk_paragraphs' in locals() else [],
470
+ "summary_points": summary_points
471
+ }
472
+
473
+ st.markdown(
474
+ get_download_link(
475
+ all_results,
476
+ f"regulatory_analysis_{datetime.now().strftime('%Y%m%d%H%M%S')}.json",
477
+ "Download Complete Analysis Results (JSON)"
478
+ ),
479
+ unsafe_allow_html=True
480
+ )
481
+ else:
482
+ # Show a demo or instructions
483
+ st.info("Upload a PDF file to begin analysis. The tool will extract text and perform NLP analysis to identify regulatory obligations, risks, and more.")
484
+
485
+ # Sample visualization of what the tool does
486
+ st.subheader("What This Tool Does")
487
+
488
+ col1, col2, col3 = st.columns(3)
489
+
490
+ with col1:
491
+ st.markdown("**1. Extract Text**")
492
+ st.markdown("Upload SEC filings and extract all text content from PDFs.")
493
+
494
+ with col2:
495
+ st.markdown("**2. Analyze Content**")
496
+ st.markdown("Use NLP to identify regulatory entities, answer questions, and flag risk language.")
497
+
498
+ with col3:
499
+ st.markdown("**3. Export Results**")
500
+ st.markdown("Download structured analysis results for review by your legal and compliance teams.")
501
+
502
+ # Add footer with information
503
+ st.markdown("---")
504
+ st.markdown("Regulatory Report Checker - NLP-powered document analysis for compliance teams")
requirements.txt ADDED
@@ -0,0 +1,7 @@
1
+ streamlit==1.24.0
2
+ pdfplumber==0.9.0
3
+ spacy==3.5.3
4
+ torch==2.0.1
5
+ transformers==4.30.2
6
+ pandas==2.0.3
7
+ tqdm==4.65.0