koulsahil committed on
Commit
2f9fd21
·
verified ·
1 Parent(s): b02a40e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -77
app.py CHANGED
@@ -4,22 +4,11 @@ import pandas as pd
4
  import re
5
  import spacy
6
  import torch
7
- from transformers import pipeline
8
  import base64
9
  import io
10
  from datetime import datetime
11
  import json
12
- from pathlib import Path
13
- import os
14
- import sys
15
-
16
- # Print Python and package versions for debugging
17
- st.sidebar.write(f"Python version: {sys.version}")
18
- st.sidebar.write(f"Transformers version: {__import__('transformers').__version__}")
19
-
20
- # Configuration for Docker deployment
21
- UPLOAD_FOLDER = os.getenv('UPLOAD_FOLDER', '/tmp/uploads')
22
- Path(UPLOAD_FOLDER).mkdir(exist_ok=True) # Ensure directory exists
23
 
24
  # Set page config
25
  st.set_page_config(
@@ -41,9 +30,11 @@ This application analyzes SEC filings (10-K, 13F, etc.) to extract:
41
  # Sidebar for model selection and settings
42
  st.sidebar.header("Analysis Settings")
43
 
44
- # We're standardizing on just one model to avoid issues
45
- nlp_model = "deepset/deberta-v3-base-squad2"
46
- st.sidebar.info(f"Using model: {nlp_model}")
 
 
47
 
48
  # Entity types to identify
49
  entity_types = st.sidebar.multiselect(
@@ -180,30 +171,17 @@ def perform_ner(text, entity_types):
180
 
181
  return entities
182
 
183
- # Function to load QA model (using simpler pipeline approach)
184
  @st.cache_resource
185
  def load_qa_model(model_name):
186
- try:
187
- # Use the simpler pipeline API that worked in the Dockerfile
188
- qa_pipeline = pipeline("question-answering", model=model_name, cache_dir='/tmp/huggingface')
189
- return qa_pipeline
190
- except Exception as e:
191
- st.error(f"Error loading model: {str(e)}")
192
- # If error, try with minimal requirements
193
- try:
194
- qa_pipeline = pipeline("question-answering", model=model_name)
195
- return qa_pipeline
196
- except Exception as e2:
197
- st.error(f"Failed to load model: {str(e2)}")
198
- return None
199
 
200
- # Function to perform Question Answering with better error handling
201
  def perform_qa(text, questions, qa_pipeline, confidence_threshold):
202
- if qa_pipeline is None:
203
- return [{"question": q, "answer": "Model loading failed", "confidence": 0, "context": ""} for q in questions]
204
-
205
  # Split text into chunks if it's too long
206
- max_length = 384 # Reduced for DeBERTa to avoid tokenization issues
207
  chunks = []
208
 
209
  # Simple chunking by sentences
@@ -240,7 +218,7 @@ def perform_qa(text, questions, qa_pipeline, confidence_threshold):
240
  "context": chunk[max(0, result["start"] - 100):min(len(chunk), result["end"] + 100)]
241
  }
242
  except Exception as e:
243
- st.warning(f"Error processing chunk with question '{question}': {str(e)}")
244
  continue
245
 
246
  if best_answer["answer"]:
@@ -276,14 +254,9 @@ def get_download_link(data, filename, text):
276
  uploaded_file = st.file_uploader("Upload SEC Filing (PDF)", type=["pdf"])
277
 
278
  if uploaded_file:
279
- # Save the uploaded file to the upload folder for better processing
280
- file_path = os.path.join(UPLOAD_FOLDER, uploaded_file.name)
281
- with open(file_path, "wb") as f:
282
- f.write(uploaded_file.getbuffer())
283
-
284
  with st.spinner("Processing PDF file..."):
285
  # Extract text from PDF
286
- full_text, text_by_page = extract_text_from_pdf(file_path)
287
 
288
  # Show text extraction status
289
  st.success(f"Successfully extracted text from {len(text_by_page)} pages")
@@ -346,47 +319,41 @@ if uploaded_file:
346
  # Question Answering
347
  with qa_tab:
348
  if qa_mode:
349
- with st.spinner("Loading QA model and performing analysis..."):
350
  try:
351
- # Load the QA model
352
  qa_pipeline = load_qa_model(nlp_model)
 
353
 
354
- if qa_pipeline:
355
- qa_results = perform_qa(full_text, custom_questions, qa_pipeline, confidence_threshold)
 
356
 
357
- # Display QA results
358
- for result in qa_results:
359
- st.subheader(result["question"])
360
 
361
- if result["confidence"] > 0:
362
- st.markdown(f"**Answer:** {result['answer']}")
363
- st.markdown(f"**Confidence:** {result['confidence']:.2f}")
364
-
365
- with st.expander("Show Context"):
366
- # Highlight the answer in the context
367
- highlighted_context = result["context"].replace(
368
- result["answer"],
369
- f"**:blue[{result['answer']}]**"
370
- )
371
- st.markdown(highlighted_context)
372
- else:
373
- st.info("No answer found with sufficient confidence.")
374
-
375
- # Provide download link for QA results
376
- qa_df = pd.DataFrame(qa_results)
377
- st.markdown(
378
- get_download_link(
379
- qa_df,
380
- "qa_results.csv",
381
- "Download QA Results as CSV"
382
- ),
383
- unsafe_allow_html=True
384
- )
385
- else:
386
- st.error("Failed to load QA model. Check logs for details.")
387
  except Exception as e:
388
  st.error(f"Error performing question answering: {str(e)}")
389
- st.info("If you're seeing model loading errors, ensure the Docker container has adequate memory and the model is properly downloaded.")
390
  else:
391
  st.info("Question Answering is disabled. Enable it from the sidebar.")
392
 
@@ -533,5 +500,4 @@ else:
533
  st.markdown("Download structured analysis results for review by your legal and compliance teams.")
534
 
535
  # Add footer with information
536
- st.markdown("---")
537
- st.markdown("Regulatory Report Checker - NLP-powered document analysis for compliance teams")
 
4
  import re
5
  import spacy
6
  import torch
7
+ from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForTokenClassification, pipeline
8
  import base64
9
  import io
10
  from datetime import datetime
11
  import json
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # Set page config
14
  st.set_page_config(
 
30
  # Sidebar for model selection and settings
31
  st.sidebar.header("Analysis Settings")
32
 
33
+ # Model selection
34
+ nlp_model = st.sidebar.selectbox(
35
+ "Select NLP Model",
36
+ ["distilbert-base-uncased", "deepset/deberta-v3-base-squad2", "distilbert-base-cased-distilled-squad"]
37
+ )
38
 
39
  # Entity types to identify
40
  entity_types = st.sidebar.multiselect(
 
171
 
172
  return entities
173
 
174
+ # Function to load the Question Answering model (cached across reruns)
175
  @st.cache_resource
176
  def load_qa_model(model_name):
177
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
178
+ model = AutoModelForQuestionAnswering.from_pretrained(model_name)
179
+ qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
180
+ return qa_pipeline
 
 
 
 
 
 
 
 
 
181
 
 
182
  def perform_qa(text, questions, qa_pipeline, confidence_threshold):
 
 
 
183
  # Split text into chunks if it's too long
184
+ max_length = 512 # Typical max length for transformer models
185
  chunks = []
186
 
187
  # Simple chunking by sentences
 
218
  "context": chunk[max(0, result["start"] - 100):min(len(chunk), result["end"] + 100)]
219
  }
220
  except Exception as e:
221
+ st.error(f"Error processing chunk with question '{question}': {str(e)}")
222
  continue
223
 
224
  if best_answer["answer"]:
 
254
  uploaded_file = st.file_uploader("Upload SEC Filing (PDF)", type=["pdf"])
255
 
256
  if uploaded_file:
 
 
 
 
 
257
  with st.spinner("Processing PDF file..."):
258
  # Extract text from PDF
259
+ full_text, text_by_page = extract_text_from_pdf(uploaded_file)
260
 
261
  # Show text extraction status
262
  st.success(f"Successfully extracted text from {len(text_by_page)} pages")
 
319
  # Question Answering
320
  with qa_tab:
321
  if qa_mode:
322
+ with st.spinner("Performing Question Answering..."):
323
  try:
 
324
  qa_pipeline = load_qa_model(nlp_model)
325
+ qa_results = perform_qa(full_text, custom_questions, qa_pipeline, confidence_threshold)
326
 
327
+ # Display QA results
328
+ for result in qa_results:
329
+ st.subheader(result["question"])
330
 
331
+ if result["confidence"] > 0:
332
+ st.markdown(f"**Answer:** {result['answer']}")
333
+ st.markdown(f"**Confidence:** {result['confidence']:.2f}")
334
 
335
+ with st.expander("Show Context"):
336
+ # Highlight the answer in the context
337
+ highlighted_context = result["context"].replace(
338
+ result["answer"],
339
+ f"**:blue[{result['answer']}]**"
340
+ )
341
+ st.markdown(highlighted_context)
342
+ else:
343
+ st.info("No answer found with sufficient confidence.")
344
+
345
+ # Provide download link for QA results
346
+ qa_df = pd.DataFrame(qa_results)
347
+ st.markdown(
348
+ get_download_link(
349
+ qa_df,
350
+ "qa_results.csv",
351
+ "Download QA Results as CSV"
352
+ ),
353
+ unsafe_allow_html=True
354
+ )
 
 
 
 
 
 
355
  except Exception as e:
356
  st.error(f"Error performing question answering: {str(e)}")
 
357
  else:
358
  st.info("Question Answering is disabled. Enable it from the sidebar.")
359
 
 
500
  st.markdown("Download structured analysis results for review by your legal and compliance teams.")
501
 
502
  # Add footer with information
503
+ st.markdown("---")