Spaces:

samim2024
/

bsnl-chatboot

Sleeping

App Files Community

samim2024 committed 20 days ago

Commit

1827766

verified ·

1 Parent(s): c8e1843

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -29

app.py CHANGED Viewed

@@ -1,9 +1,9 @@
 # app.py
 import streamlit as st
 import os
 from io import BytesIO
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
@@ -16,6 +16,7 @@ import uuid
 from dotenv import load_dotenv
 import requests
 import pandas as pd
 from docx import Document
 # Load environment variables
@@ -56,14 +57,23 @@ def process_input(input_data):
     try:
         if file_extension == 'pdf':
-            pdf_reader = PdfReader(BytesIO(input_data.read()))
-            documents = "".join([page.extract_text() or "" for page in pdf_reader.pages])
         elif file_extension in ['xls', 'xlsx']:
-            df = pd.read_excel(BytesIO(input_data.read()), engine='openpyxl')
-            documents = df.to_string(index=False)
         elif file_extension in ['doc', 'docx']:
-            doc = Document(BytesIO(input_data.read()))
-            documents = "\n".join([para.text for para in doc.paragraphs if para.text])
         elif file_extension == 'txt':
             try:
                 documents = input_data.read().decode('utf-8')
@@ -71,6 +81,9 @@ def process_input(input_data):
                 documents = input_data.read().decode('latin-1')
         else:
             raise ValueError(f"Unsupported file type: {file_extension}")
     except Exception as e:
         raise RuntimeError(f"Failed to process file: {str(e)}")
@@ -80,39 +93,49 @@ def process_input(input_data):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
     texts = text_splitter.split_text(documents)
-    # Step 4: Create or update embeddings
-    status.text("Creating embeddings...")
     progress_bar.progress(0.80)
-    hf_embeddings = HuggingFaceEmbeddings(
-        model_name="sentence-transformers/all-mpnet-base-v2",
-        model_kwargs={'device': 'cpu'}
-    )
     # Step 5: Initialize or append to FAISS vector store
     status.text("Building or updating vector store...")
     progress_bar.progress(1.0)
-    if st.session_state.vectorstore is None:
-        dimension = len(hf_embeddings.embed_query("test"))
-        index = faiss.IndexFlatL2(dimension)
-        vector_store = FAISS(
-            embedding_function=hf_embeddings,
-            index=index,
-            docstore=InMemoryDocstore({}),
-            index_to_docstore_id={}
-        )
-    else:
-        vector_store = st.session_state.vectorstore
-    # Add texts to vector store
-    uuids = [str(uuid.uuid4()) for _ in texts]
-    vector_store.add_texts(texts, ids=uuids)
     # Complete processing
     status.text("Processing complete!")
     st.session_state.uploaded_files.append(file_name)
     return vector_store
@@ -208,13 +231,14 @@ with st.sidebar:
                 try:
                     vector_store = process_input(input_data)
                     st.session_state.vectorstore = vector_store
-                    st.success("File processed successfully. You can now ask questions.")
                 except PermissionError as e:
                     st.error(f"File upload failed: Permission error - {str(e)}. Check file system access.")
                 except OSError as e:
                     st.error(f"File upload failed: OS error - {str(e)}. Check server configuration.")
                 except ValueError as e:
                     st.error(f"File upload failed: {str(e)} (Invalid file format).")
                 except Exception as e:
                     st.error(f"File upload failed: {str(e)} (Exception type: {type(e).__name__}). Please try again or check server logs.")

 # app.py
 import streamlit as st
 import os
 from io import BytesIO
 from PyPDF2 import PdfReader
+from PyPDF2.errors import PdfReadError
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from dotenv import load_dotenv
 import requests
 import pandas as pd
+from pandas.errors import ParserError
 from docx import Document
 # Load environment variables
     try:
         if file_extension == 'pdf':
+            try:
+                pdf_reader = PdfReader(BytesIO(input_data.read()))
+                documents = "".join([page.extract_text() or "" for page in pdf_reader.pages])
+            except PdfReadError as e:
+                raise RuntimeError(f"Failed to read PDF: {str(e)}")
         elif file_extension in ['xls', 'xlsx']:
+            try:
+                df = pd.read_excel(BytesIO(input_data.read()), engine='openpyxl')
+                documents = df.to_string(index=False)
+            except ParserError as e:
+                raise RuntimeError(f"Failed to parse Excel file: {str(e)}")
         elif file_extension in ['doc', 'docx']:
+            try:
+                doc = Document(BytesIO(input_data.read()))
+                documents = "\n".join([para.text for para in doc.paragraphs if para.text])
+            except Exception as e:
+                raise RuntimeError(f"Failed to read DOC/DOCX: {str(e)}")
         elif file_extension == 'txt':
             try:
                 documents = input_data.read().decode('utf-8')
                 documents = input_data.read().decode('latin-1')
         else:
             raise ValueError(f"Unsupported file type: {file_extension}")
+        if not documents.strip():
+            raise RuntimeError("No text extracted from the file.")
     except Exception as e:
         raise RuntimeError(f"Failed to process file: {str(e)}")
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
     texts = text_splitter.split_text(documents)
+    chunk_count = len(texts)
+    if chunk_count == 0:
+        raise RuntimeError("No text chunks created for embedding.")
+    # Step 4: Create embeddings
+    status.text(f"Embedding {chunk_count} chunks...")
     progress_bar.progress(0.80)
+    try:
+        hf_embeddings = HuggingFaceEmbeddings(
+            model_name="sentence-transformers/all-mpnet-base-v2",
+            model_kwargs={'device': 'cpu'}
+        )
+    except Exception as e:
+        raise RuntimeError(f"Failed to initialize embeddings: {str(e)}")
     # Step 5: Initialize or append to FAISS vector store
     status.text("Building or updating vector store...")
     progress_bar.progress(1.0)
+    try:
+        if st.session_state.vectorstore is None:
+            dimension = len(hf_embeddings.embed_query("test"))
+            index = faiss.IndexFlatL2(dimension)
+            vector_store = FAISS(
+                embedding_function=hf_embeddings,
+                index=index,
+                docstore=InMemoryDocstore({}),
+                index_to_docstore_id={}
+            )
+        else:
+            vector_store = st.session_state.vectorstore
+        # Add texts to vector store
+        uuids = [str(uuid.uuid4()) for _ in texts]
+        vector_store.add_texts(texts, ids=uuids)
+    except Exception as e:
+        raise RuntimeError(f"Failed to update vector store: {str(e)}")
     # Complete processing
     status.text("Processing complete!")
     st.session_state.uploaded_files.append(file_name)
+    st.success(f"Embedded {chunk_count} chunks from {file_name}")
     return vector_store
                 try:
                     vector_store = process_input(input_data)
                     st.session_state.vectorstore = vector_store
                 except PermissionError as e:
                     st.error(f"File upload failed: Permission error - {str(e)}. Check file system access.")
                 except OSError as e:
                     st.error(f"File upload failed: OS error - {str(e)}. Check server configuration.")
                 except ValueError as e:
                     st.error(f"File upload failed: {str(e)} (Invalid file format).")
+                except RuntimeError as e:
+                    st.error(f"File upload failed: {str(e)} (Exception type: {type(e).__name__}).")
                 except Exception as e:
                     st.error(f"File upload failed: {str(e)} (Exception type: {type(e).__name__}). Please try again or check server logs.")