samim2024 committed on
Commit
1827766
·
verified ·
1 Parent(s): c8e1843

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -29
app.py CHANGED
@@ -1,9 +1,9 @@
1
-
2
  # app.py
3
  import streamlit as st
4
  import os
5
  from io import BytesIO
6
  from PyPDF2 import PdfReader
 
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain_community.embeddings import HuggingFaceEmbeddings
9
  from langchain_community.vectorstores import FAISS
@@ -16,6 +16,7 @@ import uuid
16
  from dotenv import load_dotenv
17
  import requests
18
  import pandas as pd
 
19
  from docx import Document
20
 
21
  # Load environment variables
@@ -56,14 +57,23 @@ def process_input(input_data):
56
 
57
  try:
58
  if file_extension == 'pdf':
59
- pdf_reader = PdfReader(BytesIO(input_data.read()))
60
- documents = "".join([page.extract_text() or "" for page in pdf_reader.pages])
 
 
 
61
  elif file_extension in ['xls', 'xlsx']:
62
- df = pd.read_excel(BytesIO(input_data.read()), engine='openpyxl')
63
- documents = df.to_string(index=False)
 
 
 
64
  elif file_extension in ['doc', 'docx']:
65
- doc = Document(BytesIO(input_data.read()))
66
- documents = "\n".join([para.text for para in doc.paragraphs if para.text])
 
 
 
67
  elif file_extension == 'txt':
68
  try:
69
  documents = input_data.read().decode('utf-8')
@@ -71,6 +81,9 @@ def process_input(input_data):
71
  documents = input_data.read().decode('latin-1')
72
  else:
73
  raise ValueError(f"Unsupported file type: {file_extension}")
 
 
 
74
  except Exception as e:
75
  raise RuntimeError(f"Failed to process file: {str(e)}")
76
 
@@ -80,39 +93,49 @@ def process_input(input_data):
80
 
81
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
82
  texts = text_splitter.split_text(documents)
 
 
 
83
 
84
- # Step 4: Create or update embeddings
85
- status.text("Creating embeddings...")
86
  progress_bar.progress(0.80)
87
 
88
- hf_embeddings = HuggingFaceEmbeddings(
89
- model_name="sentence-transformers/all-mpnet-base-v2",
90
- model_kwargs={'device': 'cpu'}
91
- )
 
 
 
92
 
93
  # Step 5: Initialize or append to FAISS vector store
94
  status.text("Building or updating vector store...")
95
  progress_bar.progress(1.0)
96
 
97
- if st.session_state.vectorstore is None:
98
- dimension = len(hf_embeddings.embed_query("test"))
99
- index = faiss.IndexFlatL2(dimension)
100
- vector_store = FAISS(
101
- embedding_function=hf_embeddings,
102
- index=index,
103
- docstore=InMemoryDocstore({}),
104
- index_to_docstore_id={}
105
- )
106
- else:
107
- vector_store = st.session_state.vectorstore
108
-
109
- # Add texts to vector store
110
- uuids = [str(uuid.uuid4()) for _ in texts]
111
- vector_store.add_texts(texts, ids=uuids)
 
 
 
112
 
113
  # Complete processing
114
  status.text("Processing complete!")
115
  st.session_state.uploaded_files.append(file_name)
 
116
 
117
  return vector_store
118
 
@@ -208,13 +231,14 @@ with st.sidebar:
208
  try:
209
  vector_store = process_input(input_data)
210
  st.session_state.vectorstore = vector_store
211
- st.success("File processed successfully. You can now ask questions.")
212
  except PermissionError as e:
213
  st.error(f"File upload failed: Permission error - {str(e)}. Check file system access.")
214
  except OSError as e:
215
  st.error(f"File upload failed: OS error - {str(e)}. Check server configuration.")
216
  except ValueError as e:
217
  st.error(f"File upload failed: {str(e)} (Invalid file format).")
 
 
218
  except Exception as e:
219
  st.error(f"File upload failed: {str(e)} (Exception type: {type(e).__name__}). Please try again or check server logs.")
220
 
 
 
1
  # app.py
2
  import streamlit as st
3
  import os
4
  from io import BytesIO
5
  from PyPDF2 import PdfReader
6
+ from PyPDF2.errors import PdfReadError
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain_community.embeddings import HuggingFaceEmbeddings
9
  from langchain_community.vectorstores import FAISS
 
16
  from dotenv import load_dotenv
17
  import requests
18
  import pandas as pd
19
+ from pandas.errors import ParserError
20
  from docx import Document
21
 
22
  # Load environment variables
 
57
 
58
  try:
59
  if file_extension == 'pdf':
60
+ try:
61
+ pdf_reader = PdfReader(BytesIO(input_data.read()))
62
+ documents = "".join([page.extract_text() or "" for page in pdf_reader.pages])
63
+ except PdfReadError as e:
64
+ raise RuntimeError(f"Failed to read PDF: {str(e)}")
65
  elif file_extension in ['xls', 'xlsx']:
66
+ try:
67
+ df = pd.read_excel(BytesIO(input_data.read()), engine='openpyxl')
68
+ documents = df.to_string(index=False)
69
+ except ParserError as e:
70
+ raise RuntimeError(f"Failed to parse Excel file: {str(e)}")
71
  elif file_extension in ['doc', 'docx']:
72
+ try:
73
+ doc = Document(BytesIO(input_data.read()))
74
+ documents = "\n".join([para.text for para in doc.paragraphs if para.text])
75
+ except Exception as e:
76
+ raise RuntimeError(f"Failed to read DOC/DOCX: {str(e)}")
77
  elif file_extension == 'txt':
78
  try:
79
  documents = input_data.read().decode('utf-8')
 
81
  documents = input_data.read().decode('latin-1')
82
  else:
83
  raise ValueError(f"Unsupported file type: {file_extension}")
84
+
85
+ if not documents.strip():
86
+ raise RuntimeError("No text extracted from the file.")
87
  except Exception as e:
88
  raise RuntimeError(f"Failed to process file: {str(e)}")
89
 
 
93
 
94
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
95
  texts = text_splitter.split_text(documents)
96
+ chunk_count = len(texts)
97
+ if chunk_count == 0:
98
+ raise RuntimeError("No text chunks created for embedding.")
99
 
100
+ # Step 4: Create embeddings
101
+ status.text(f"Embedding {chunk_count} chunks...")
102
  progress_bar.progress(0.80)
103
 
104
+ try:
105
+ hf_embeddings = HuggingFaceEmbeddings(
106
+ model_name="sentence-transformers/all-mpnet-base-v2",
107
+ model_kwargs={'device': 'cpu'}
108
+ )
109
+ except Exception as e:
110
+ raise RuntimeError(f"Failed to initialize embeddings: {str(e)}")
111
 
112
  # Step 5: Initialize or append to FAISS vector store
113
  status.text("Building or updating vector store...")
114
  progress_bar.progress(1.0)
115
 
116
+ try:
117
+ if st.session_state.vectorstore is None:
118
+ dimension = len(hf_embeddings.embed_query("test"))
119
+ index = faiss.IndexFlatL2(dimension)
120
+ vector_store = FAISS(
121
+ embedding_function=hf_embeddings,
122
+ index=index,
123
+ docstore=InMemoryDocstore({}),
124
+ index_to_docstore_id={}
125
+ )
126
+ else:
127
+ vector_store = st.session_state.vectorstore
128
+
129
+ # Add texts to vector store
130
+ uuids = [str(uuid.uuid4()) for _ in texts]
131
+ vector_store.add_texts(texts, ids=uuids)
132
+ except Exception as e:
133
+ raise RuntimeError(f"Failed to update vector store: {str(e)}")
134
 
135
  # Complete processing
136
  status.text("Processing complete!")
137
  st.session_state.uploaded_files.append(file_name)
138
+ st.success(f"Embedded {chunk_count} chunks from {file_name}")
139
 
140
  return vector_store
141
 
 
231
  try:
232
  vector_store = process_input(input_data)
233
  st.session_state.vectorstore = vector_store
 
234
  except PermissionError as e:
235
  st.error(f"File upload failed: Permission error - {str(e)}. Check file system access.")
236
  except OSError as e:
237
  st.error(f"File upload failed: OS error - {str(e)}. Check server configuration.")
238
  except ValueError as e:
239
  st.error(f"File upload failed: {str(e)} (Invalid file format).")
240
+ except RuntimeError as e:
241
+ st.error(f"File upload failed: {str(e)} (Exception type: {type(e).__name__}).")
242
  except Exception as e:
243
  st.error(f"File upload failed: {str(e)} (Exception type: {type(e).__name__}). Please try again or check server logs.")
244