Spaces:

bacancydataprophets
/

Smart-PDF-Search

Sleeping

App Files Files Community

Avanisha committed on Jan 28

Commit

5debd08

verified ·

1 Parent(s): 5a38e5d

Upload 14 files

Browse files

Files changed (15) hide show

.gitattributes +2 -0
app.py +762 -0
config.json +5 -0
config.py +57 -0
data/Cyber_Security.pdf +0 -0
data/SITA1602.pdf +3 -0
db/c1cb6919-5129-47ed-8f4d-e617fc5e8b97/data_level0.bin +3 -0
db/c1cb6919-5129-47ed-8f4d-e617fc5e8b97/header.bin +3 -0
db/c1cb6919-5129-47ed-8f4d-e617fc5e8b97/length.bin +3 -0
db/c1cb6919-5129-47ed-8f4d-e617fc5e8b97/link_lists.bin +3 -0
db/chroma.sqlite3 +3 -0
log_utils.py +44 -0
pdf_details_page.py +363 -0
requirements.txt +169 -0
upload_pdf.py +200 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/SITA1602.pdf filter=lfs diff=lfs merge=lfs -text
+db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,762 @@

+import os
+import io
+import nltk
+import fitz
+import random
+import base64
+import logging
+import pycountry
+from PIL import Image
+import streamlit as st
+from fastapi import FastAPI
+from langdetect import detect
+from config import load_config
+from dotenv import load_dotenv
+from nltk.corpus import stopwords
+from fastapi import FastAPI, Query
+from langchain_groq import ChatGroq
+from collections import defaultdict
+from log_utils import setup_logging
+from nltk.tokenize import sent_tokenize
+from nltk.tokenize import word_tokenize
+from langchain.chains import RetrievalQA
+from upload_pdf import update_or_add_pdf
+from fastapi.responses import JSONResponse
+from langchain.prompts import ChatPromptTemplate
+from langchain_community.vectorstores import Chroma
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.feature_extraction.text import TfidfVectorizer
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from pdf_details_page import display_pdf_details, display_romanized_text_page
+logger = setup_logging('app')
+nltk.download('punkt')
+nltk.download('punkt_tab')
+nltk.download('stopwords')
+app = FastAPI()
+@app.get("/pdf-details")
+async def get_pdf_details(
+    filename: str = Query(..., description="Filename of the PDF"),
+    page_number: int = Query(0, description="Page number (0-indexed)")
+):
+    logger.info(f"Processing PDF details request for file: {filename}, page: {page_number}")
+    try:
+        data_path = "/home/bacancy/Documents/Company/Smart PDF Search/data"
+        file_path = os.path.join(data_path, filename)
+        # Open the PDF
+        logger.debug(f"Opening PDF file: {file_path}")
+        doc = fitz.open(file_path)
+        # Extract full PDF text
+        full_text = ""
+        for page in doc:
+            full_text += page.get_text()
+        # Get PDF metadata
+        pdf_metadata = doc.metadata or {}
+        # Extract page text and render page image
+        page = doc.load_page(page_number)
+        page_text = page.get_text()
+        # Render page as image
+        pix = page.get_pixmap()
+        page_image_base64 = base64.b64encode(pix.tobytes("png")).decode('utf-8')
+        # Detect language
+        try:
+            lang_code = detect(page_text)
+            language = pycountry.languages.get(alpha_2=lang_code).name
+        except Exception as e:
+            logger.warning(f"Language detection failed: {str(e)}")
+            language = 'Unknown'
+        # Prepare response
+        response = {
+            "file_path": file_path,
+            "filename": os.path.basename(file_path),
+            "total_pages": len(doc),
+            "current_page": page_number + 1,
+            "full_text": full_text,
+            "page_text": page_text,
+            "page_image": page_image_base64,
+            "file_size_bytes": os.path.getsize(file_path),
+            "file_size_kb": f"{os.path.getsize(file_path) / 1024:.2f} KB",
+            "language": language,
+            "metadata": {
+                "title": pdf_metadata.get('title', 'Unknown'),
+                "author": pdf_metadata.get('author', 'Unknown'),
+                "creator": pdf_metadata.get('creator', 'Unknown'),
+                "producer": pdf_metadata.get('producer', 'Unknown')
+            }
+        }
+        logger.info(f"Successfully processed PDF details for {filename}")
+        return JSONResponse(content=response)
+    except Exception as e:
+        logger.error(f"Error processing PDF details: {str(e)}", exc_info=True)
+        return JSONResponse(
+            content={"error": str(e)},
+            status_code=500
+        )
+@app.get("/romanized-text")
+async def get_romanized_text(
+    filename: str = Query(..., description="Filename of the PDF")
+):
+    logger.info(f"Processing romanized text request for file: {filename}")
+    try:
+        data_path = "/home/bacancy/Documents/Company/Smart PDF Search/data"
+        file_path = os.path.join(data_path, filename)
+        # Open the PDF
+        logger.debug(f"Opening PDF file for romanization: {file_path}")
+        doc = fitz.open(file_path)
+        # Extract full PDF text
+        full_text = ""
+        pages_text = []
+        for page in doc:
+            page_text = page.get_text()
+            full_text += page_text
+            # Add page info to pages_text list
+            pages_text.append({
+                    "page_number": page.number + 1,  # Adding 1 to make it 1-based instead of 0-based
+                    "text": page_text
+                })
+        # Get PDF metadata
+        pdf_metadata = doc.metadata or {}
+        response = {
+            "filename": os.path.basename(file_path),
+            "total_pages": len(doc),
+            "full_text": full_text,
+            "pages": pages_text,
+            "file_size_kb": f"{os.path.getsize(file_path) / 1024:.2f} KB",
+            "metadata": {
+                "title": pdf_metadata.get('title', 'Unknown'),
+                "author": pdf_metadata.get('author', 'Unknown'),
+                "creator": pdf_metadata.get('creator', 'Unknown'),
+                "producer": pdf_metadata.get('producer', 'Unknown')
+            }
+        }
+        logger.info(f"Successfully processed romanized text for {filename}")
+        return JSONResponse(content=response)
+    except Exception as e:
+        logger.error(f"Error processing romanized text: {str(e)}", exc_info=True)
+        return JSONResponse(
+            content={"error": str(e)},
+            status_code=500
+        )
+# Load environment variables (from a .env file) before any configuration is read
+load_dotenv()
+# Must be the first Streamlit command executed in this script, so it stays
+# ahead of the st.markdown / st.session_state calls below.
+st.set_page_config(
+    page_title="Smart PDF Search",
+    page_icon="📚",
+    layout="wide"
+)
+st.markdown("""
+    <style>
+    img { border: 1px solid rgb(221, 221, 221); }
+    .stApp {
+        font-family: 'Inter', sans-serif;
+    }
+    .stMarkdown {
+        color: #2c3e50;
+    }
+    .stTextInput > div > div > input {
+        border: 2px solid #3498db;
+        border-radius: 12px;
+        padding: 12px;
+        font-size: 16px;
+        background-color: white;
+        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+        transition: all 0.3s ease;
+    }
+    .stTextInput > div > div > input:focus {
+        border-color: #2980b9;
+        outline: none;
+        box-shadow: 0 0 0 3px rgba(52, 152, 219, 0.2);
+    }
+    .stButton > button {
+        background-color: #3498db !important;
+        color: white !important;
+        border-radius: 10px;
+        padding: 5px 10px !important;
+        font-weight: 600;
+        transition: all 0.3s ease;
+        text-transform: uppercase;
+        letter-spacing: 0.5px;
+    }
+    .stButton > button:hover {
+        background-color: #2980b9 !important;
+        transform: translateY(-2px);
+        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+    }
+    .stExpander {
+        border-radius: 12px;
+        background-color: #f9f9f9;
+        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+    }
+    .stMarkdown, .stSubheader {
+        color: #34495e;
+    }
+    mark {
+        background-color: #c6e6fb;
+        color: #2c3e50;
+        padding: 2px 4px;
+        border-radius: 4px;
+    }
+    .st-emotion-cache-1104ytp h2 {
+        font-size: 1rem;
+        font-weight: 400;
+        font-family: "Source Sans Pro", sans-serif";
+        margin: 0px 0px 1rem;
+        line-height: 1.6;
+    }
+    .st-emotion-cache-1v0mbdj.e115fcil1 {
+        width: 100%;
+    }
+    .page-number {
+        display: inline-block;
+        background-color: #6C757D;
+        color: white;
+        font-weight: bold;
+        font-size: 14px;
+        padding: 2px 20px;
+        border-radius: 5px;
+        border: 1px solid #6C757D;
+        margin-top: 0px;
+        text-align: center;
+    }
+    .document-name {
+        color: dimgray;
+        font-size: 18px;
+        margin-bottom: .5rem;
+        font-weight: 500;
+        line-height: 1.2;
+        }
+    .source-content {
+        background-color: #f9f9f9;
+        padding: 10px;
+        border-radius: 5px;
+    }
+    .response-block {
+        background-color: #f9f9f9;
+        padding: 15px;
+        border-radius: 5px;
+        margin-bottom: 20px;
+    }
+    </style>
+    """, unsafe_allow_html=True)
+# Initialize session state variables
+if 'qa_chain' not in st.session_state:
+    st.session_state.qa_chain = None
+if 'vectordb' not in st.session_state:
+    st.session_state.vectordb = None
+if 'config' not in st.session_state:
+    st.session_state.config = None
+def initialize_embedding_model():
+    """Initialize and return the embedding model."""
+    logger.info("Initializing embedding model")
+    try:
+        with st.spinner('Loading embedding model...'):
+            embedding_model = HuggingFaceEmbeddings(
+                model_name='all-MiniLM-L6-v2',
+                model_kwargs={'device': 'cpu'},
+                encode_kwargs={'normalize_embeddings': True}
+            )
+            # st.success("Embedding model loaded successfully")
+            logger.info("Embedding model initialized successfully")
+        return embedding_model
+    except Exception as e:
+        logger.error(f"Error initializing embedding model: {str(e)}", exc_info=True)
+        raise
+def load_vectordb(persist_directory, embedding_model, collection_name):
+    """Load existing ChromaDB instance."""
+    logger.info(f"Loading ChromaDB from {persist_directory}")
+    try:
+        with st.spinner('Loading ChromaDB...'):
+            vectordb = Chroma(
+                persist_directory=persist_directory,
+                embedding_function=embedding_model,
+                collection_name=collection_name
+            )
+            # st.success("ChromaDB loaded successfully")
+            logger.info("ChromaDB loaded successfully")
+        return vectordb
+    except Exception as e:
+        logger.error(f"Error loading ChromaDB: {str(e)}", exc_info=True)
+        raise
+def create_qa_chain(vectordb, groq_api_key, k=4):
+    """Create and return a retrieval QA chain backed by a Groq chat model.
+
+    Args:
+        vectordb: Vector store whose ``as_retriever`` supplies the context.
+        groq_api_key: API key passed to ChatGroq.
+        k: Number of documents the retriever is asked for per query.
+
+    Returns:
+        A RetrievalQA chain of type "stuff" that also returns its source
+        documents.
+
+    Raises:
+        Exception: any error raised during construction is logged and
+        re-raised unchanged.
+    """
+    logger.info("Creating QA chain")
+    try:
+        with st.spinner('Creating QA chain...'):
+            retriever = vectordb.as_retriever(search_kwargs={'k': k})
+            # temperature=0 requests the least random output from the model.
+            llm = ChatGroq(api_key=groq_api_key, temperature=0)
+            # {context} and {question} are template placeholders of the prompt.
+            prompt_messages = [
+                ("system", """You are a helpful AI assistant who provides accurate answers based on the given context.
+                If you don't know the answer, just say that you don't know, don't try to make up an answer."""),
+                ("user", """Use the following context to answer my question:
+                Context: {context}
+                Question: {question}"""),
+                ("assistant", "I'll help answer your question based on the provided context.")
+            ]
+            chat_prompt = ChatPromptTemplate.from_messages(prompt_messages)
+            qa_chain = RetrievalQA.from_chain_type(
+                llm=llm,
+                chain_type="stuff",
+                retriever=retriever,
+                # Callers read 'source_documents' from the chain output.
+                return_source_documents=True,
+                chain_type_kwargs={"prompt": chat_prompt}
+            )
+            # st.success("QA chain created successfully")
+            logger.info("QA chain created successfully")
+        return qa_chain
+    except Exception as e:
+        logger.error(f"Error creating QA chain: {str(e)}", exc_info=True)
+        raise
+def format_inline_citations(response_text, source_documents):
+    """Format the response text with citations at the end of lines or paragraphs and return citations."""
+    logger.info("Starting inline citations formatting")
+    inline_response = response_text.strip()
+    # Extract text and metadata from source documents
+    try:
+        doc_texts = [
+            source.page_content for source in source_documents if source.page_content
+        ]
+        doc_citations = [
+            {
+                "pdf_name": os.path.basename(source.metadata.get("file_path", "Unknown")),
+                "page": source.metadata.get("page", "Unknown") + 1,
+            }
+            for source in source_documents
+        ]
+        logger.debug(f"Extracted {len(doc_texts)} document texts and citations")
+        if not doc_texts or not inline_response:
+            logger.warning("No documents or response text to process")
+            return inline_response, []
+        # Split response text into paragraphs
+        paragraphs = [p.strip() for p in response_text.split("\n") if p.strip()]
+        logger.debug(f"Split response into {len(paragraphs)} paragraphs")
+        # Vectorize response paragraphs and source document texts
+        vectorizer = TfidfVectorizer()
+        all_texts = doc_texts + paragraphs
+        tfidf_matrix = vectorizer.fit_transform(all_texts)
+        # Initialize a list to store relevant citations
+        relevant_citations = []
+        # Match each paragraph to its most similar source documents
+        for i, paragraph in enumerate(paragraphs):
+            paragraph_idx = len(doc_texts) + i
+            similarities = cosine_similarity(tfidf_matrix[paragraph_idx:paragraph_idx + 1], tfidf_matrix[:len(doc_texts)])[0]
+            # Collect relevant citations based on similarity
+            paragraph_citations = [
+                doc_citations[j] for j, score in enumerate(similarities) if score > 0.2
+            ]
+            if paragraph_citations:
+                logger.debug(f"Found {len(paragraph_citations)} citations for paragraph {i+1}")
+                relevant_citations.extend(paragraph_citations)
+                # Group citations by document name and collect pages
+                grouped_citations = defaultdict(set)
+                for citation in paragraph_citations:
+                    grouped_citations[citation["pdf_name"]].add(citation["page"])
+                # Format grouped citations
+                combined_citations = []
+                for pdf_name, pages in grouped_citations.items():
+                    pages = sorted(pages)
+                    pages_text = f"Page {pages[0]}" if len(pages) == 1 else f"Pages {', '.join(map(str, pages))}"
+                    combined_citations.append(f"{pdf_name}: {pages_text}")
+                formatted_citations = f" <b>(" + "; ".join(combined_citations) + ")</b> \n"
+                paragraphs[i] = f"{paragraph}{formatted_citations}"
+        # Combine paragraphs back into the final response
+        inline_response = "\n".join(paragraphs)
+        logger.info("Successfully formatted inline citations")
+        return inline_response, relevant_citations
+    except Exception as e:
+        logger.error(f"Error formatting inline citations: {str(e)}", exc_info=True)
+        return response_text, []
+def display_citation_details(source_documents):
+    """Display detailed information about citation details."""
+    logger.info("Displaying citation details")
+    try:
+        st.subheader("Citation Details")
+        grouped_sources = defaultdict(list)
+        for source in source_documents:
+            key = (source.metadata.get('file_path', 'Unknown'), source.metadata.get('page', 'Unknown'))
+            grouped_sources[key].append(source.page_content)
+        logger.debug(f"Grouped {len(grouped_sources)} unique sources")
+        for key, content_list in grouped_sources.items():
+            file_path, page_number = key
+            try:
+                full_page_content = next(
+                    (source.metadata.get('full_page_content', 'No full content available')
+                     for source in source_documents
+                     if source.metadata.get('file_path', 'Unknown') == file_path
+                     and source.metadata.get('page', 'Unknown') == page_number),
+                    'No full content available'
+                )
+                merged_content = "\n".join(content_list)
+                highlighted_content = full_page_content
+                for line in merged_content.splitlines():
+                    if line.strip() and line in full_page_content:
+                        highlighted_content = highlighted_content.replace(line, f"<mark>{line}</mark>", 1)
+                with st.expander(f"Source: {os.path.basename(file_path)} - Page {page_number + 1}"):
+                    st.markdown(highlighted_content, unsafe_allow_html=True)
+                logger.debug(f"Displayed citation details for {os.path.basename(file_path)} - Page {page_number + 1}")
+            except Exception as e:
+                logger.error(f"Error processing citation for {file_path}: {str(e)}")
+                continue
+    except Exception as e:
+        logger.error(f"Error displaying citation details: {str(e)}", exc_info=True)
+        st.error("Error displaying citation details")
+def initialize_system():
+    """Initialize the QA system components."""
+    logger.info("Starting system initialization")
+    try:
+        config = load_config()
+        if not config:
+            logger.error("Configuration not found")
+            st.error("Configuration not found. Please run the preprocessing script first.")
+            return False
+        st.session_state.config = config
+        logger.debug("Configuration loaded successfully")
+        embedding_model = initialize_embedding_model()
+        st.session_state.vectordb = load_vectordb(config['persist_directory'], embedding_model, config['collection_name'])
+        st.session_state.qa_chain = create_qa_chain(st.session_state.vectordb, config['groq_api_key'])
+        logger.info("System initialized successfully")
+        st.success("System initialized successfully!")
+        return True
+    except Exception as e:
+        logger.error(f"Error during system initialization: {str(e)}", exc_info=True)
+        st.error(f"An error occurred: {e}")
+        return False
+def extract_page_image(file_path, page_number):
+    """Extract the image of a specific page from a PDF file and return it as a PIL image."""
+    logger.debug(f"Extracting page image from {file_path}, page {page_number}")
+    try:
+        doc = fitz.open(file_path)
+        page = doc.load_page(page_number)
+        pix = page.get_pixmap()
+        image = Image.open(io.BytesIO(pix.tobytes("png")))
+        logger.debug("Successfully extracted page image")
+        return image
+    except Exception as e:
+        logger.error(f"Error extracting page image: {str(e)}")
+        return None
+def highlight_query_words(text, query):
+    """Highlights words from the query in the provided text."""
+    logger.debug(f"Highlighting query words for query: {query}")
+    try:
+        stop_words = set(stopwords.words('english'))
+        query_words = set(word_tokenize(query.lower())) - stop_words
+        words = text.split()
+        highlighted_text = " ".join(
+            f"<mark>{word}</mark>"
+            if word.lower().strip(".,!?") in query_words else word
+            for word in words
+        )
+        logger.debug("Successfully highlighted query words")
+        return highlighted_text
+    except Exception as e:
+        logger.error(f"Error highlighting query words: {str(e)}")
+        return text
+def display_source_documents_with_images(source_documents, query):
+    """Display unique source document images and formatted text snippets with query highlights."""
+    logger.info("Displaying source documents with images")
+    try:
+        st.subheader("📝 Source Documents")
+        unique_sources = {}
+        for source in source_documents:
+            key = (source.metadata.get('file_path', 'Unknown'), source.metadata.get('page', 'Unknown'))
+            if key not in unique_sources:
+                unique_sources[key] = source
+        logger.debug(f"Processing {len(unique_sources)} unique sources")
+        for (file_path, page_number), source in unique_sources.items():
+            try:
+                pdf_name = os.path.basename(file_path)
+                page_content = source.metadata["full_page_content"] or "No content available"
+                logger.debug(f"Processing document: {pdf_name}, page {page_number + 1}")
+                col1, col2 = st.columns([1, 3])
+                with col1:
+                    page_image = extract_page_image(file_path, page_number)
+                    if page_image:
+                        st.image(page_image, caption=f"Page {page_number + 1}", use_container_width=True)
+                    else:
+                        logger.warning(f"Preview not available for {pdf_name}, page {page_number + 1}")
+                        st.warning("⚠️ Preview not available for this page")
+                with col2:
+                    st.markdown(f'<span class="document-name">{pdf_name}</span>', unsafe_allow_html=True)
+                    st.markdown(f'<span class="page-number">Page {page_number + 1}</span>', unsafe_allow_html=True)
+                    sentences = sent_tokenize(page_content)
+                    random.shuffle(sentences)
+                    selected_snippet = []
+                    for sentence in sentences:
+                        words = sentence.split()
+                        chunked_snippet = [" ".join(words[i:i+17]) for i in range(0, len(words), 17)]
+                        selected_snippet.extend(chunked_snippet)
+                        if len(selected_snippet) >= 7:
+                            break
+                    snippet = "  ...  ".join(selected_snippet)
+                    highlighted_snippet = highlight_query_words(snippet, query)
+                    st.markdown(f'<div class="source-content">{highlighted_snippet}</div>', unsafe_allow_html=True)
+                    st.markdown(f"[View other results in this book](?page=pdf_details&filename={pdf_name}&page_number={page_number})", unsafe_allow_html=True)
+                    logger.debug(f"Successfully displayed content for {pdf_name}, page {page_number + 1}")
+            except Exception as e:
+                logger.error(f"Error processing document {pdf_name}: {str(e)}")
+                continue
+    except Exception as e:
+        logger.error(f"Error displaying source documents: {str(e)}", exc_info=True)
+        st.error("Error displaying source documents")
+def is_query_relevant(question, source_documents, threshold=0.1):
+    """Check query relevance using multiple similarity methods."""
+    logger.info(f"Checking relevance for query: {question}")
+    try:
+        if not source_documents:
+            logger.warning("No source documents provided for relevance check")
+            return False
+        # Keyword-based check
+        keywords = set(question.lower().split())
+        for doc in source_documents:
+            doc_words = set(doc.page_content.lower().split())
+            if keywords.intersection(doc_words):
+                logger.debug("Query relevant based on keyword match")
+                return True
+        # TF-IDF similarity check
+        try:
+            doc_texts = [doc.page_content for doc in source_documents]
+            texts_to_compare = doc_texts + [question]
+            vectorizer = TfidfVectorizer()
+            tfidf_matrix = vectorizer.fit_transform(texts_to_compare)
+            similarities = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])[0]
+            is_relevant = any(sim > threshold for sim in similarities)
+            logger.debug(f"Query relevance (TF-IDF): {is_relevant}")
+            return is_relevant
+        except Exception as e:
+            logger.warning(f"TF-IDF similarity check failed: {str(e)}")
+            # Fallback to simple text match
+            is_relevant = any(question.lower() in doc.page_content.lower() for doc in source_documents)
+            logger.debug(f"Query relevance (fallback): {is_relevant}")
+            return is_relevant
+    except Exception as e:
+        logger.error(f"Error checking query relevance: {str(e)}", exc_info=True)
+        return False
+def main():
+    logger.info("Starting Smart PDF Search application")
+    # Detect page from query parameters
+    query_params = st.query_params
+    page = query_params.get('page', 'home')
+    logger.debug(f"Current page: {page}")
+    # Routing logic
+    if page == 'pdf_details':
+        filename = query_params.get('filename', '')
+        page_number = int(query_params.get('page_number', 0))
+        logger.info(f"Displaying PDF details for {filename}, page {page_number}")
+        if filename:
+            display_pdf_details(filename, page_number)
+        else:
+            logger.warning("No filename provided for PDF details")
+            st.error("No filename provided for PDF details")
+    elif page == 'romanized_text':
+        filename = query_params.get('filename', '')
+        logger.info(f"Displaying romanized text for {filename}")
+        if filename:
+            display_romanized_text_page(filename)
+        else:
+            logger.warning("No filename provided for Romanized text")
+            st.error("No filename provided for Romanized text")
+    else:
+        logger.info("Displaying main search page")
+        st.markdown("<h1 style='text-align: center;'>📚 Smart PDF Search</h1>", unsafe_allow_html=True)
+        # PDF Upload Section in Sidebar
+        st.sidebar.header("📤 Upload PDF")
+        uploaded_file = st.sidebar.file_uploader("Choose a PDF file", type="pdf")
+        # Process the uploaded PDF if a new file is uploaded
+        if uploaded_file is not None:
+            logger.info(f"Processing uploaded file: {uploaded_file.name}")
+            # Only process the PDF if it's a new upload and not an existing one
+            if 'last_uploaded_file' not in st.session_state or st.session_state.last_uploaded_file != uploaded_file.name:
+                try:
+                    config = st.session_state.config if 'config' in st.session_state else load_config()
+                    with st.spinner('Processing uploaded PDF...'):
+                        success = update_or_add_pdf(
+                            uploaded_file,
+                            config['data_path'],
+                            config['persist_directory'],
+                            config['collection_name']
+                        )
+                    if success:
+                        logger.info(f"Successfully processed uploaded file: {uploaded_file.name}")
+                        st.sidebar.success(f"Successfully uploaded {uploaded_file.name}")
+                        st.session_state.vectordb = None
+                        st.session_state.qa_chain = None
+                        st.session_state.last_uploaded_file = uploaded_file.name
+                    else:
+                        logger.warning(f"Failed to process uploaded file: {uploaded_file.name}")
+                        st.sidebar.warning("🚨 Please upload a valid PDF file to proceed.")
+                except Exception as e:
+                    logger.error(f"Error processing uploaded file: {str(e)}", exc_info=True)
+                    st.sidebar.error(f"Error processing file: {str(e)}")
+            else:
+                logger.info(f"PDF {uploaded_file.name} is already uploaded")
+                st.sidebar.info(f"PDF {uploaded_file.name} is already uploaded.")
+        ## Initialize QA system
+        if st.session_state.qa_chain is None:
+            logger.info("Initializing QA system")
+            if not initialize_system():
+                logger.error("Failed to initialize system")
+                return
+        st.subheader("🔍 Ask a Question")
+        question = st.text_input("Enter your question:")
+        if st.button("Get Answer") and question:
+            logger.info(f"Processing question: {question}")
+            try:
+                with st.spinner('🧠 Finding answer...'):
+                    llm_response = st.session_state.qa_chain.invoke({"query": question})
+                    logger.debug("Successfully got response from QA chain")
+                    response_text = llm_response['result']
+                    source_documents = llm_response['source_documents']
+                    # Check if the query is relevant to the documents
+                    if is_query_relevant(question, source_documents):
+                        # Format citations only if the query is relevant
+                        inline_response, relevant_citations = format_inline_citations(response_text, source_documents)
+                        # Only show detailed response if we have relevant citations
+                        if relevant_citations:
+                            col3, col4 = st.columns([2, 1])
+                            with col3:
+                                st.subheader("🧠 Summary")
+                                st.markdown(f'<div class="response-block">{inline_response}</div>', unsafe_allow_html=True)
+                                display_source_documents_with_images(source_documents, question)
+                            with col4:
+                                display_citation_details(source_documents)
+                        else:
+                            st.warning("⚠️ While your question seems related to the documents, I couldn't find specific relevant information to answer it. Please try rephrasing your question or asking about a different topic.")
+                    else:
+                        st.warning("⚠️ Your question appears to be unrelated to the content in the uploaded documents. Please ask a question about the information contained in the PDFs.")
+            except Exception as e:
+                logger.error(f"Error processing question: {str(e)}", exc_info=True)
+                st.error(f"⚠️ An error occurred while processing your question: {e}")
+        # Sidebar content
+        st.sidebar.markdown("""
+        <div style="background-color: #f0f4ff; padding: 5%; border-left: 4px solid #3b82f6; border-radius: 8px; box-shadow: 0px 4px 6px rgba(0, 0, 0, 0.1); margin-top: 35%; margin-bottom: 0%;">
+        <h3 style="margin-top: 0;">💡 Smart PDF Search Features</h3>
+            <ul style="padding-left: 20px;">
+                <li>🔍 Intelligent document search across multiple PDFs</li>
+                <li>🧠 Context-aware question answering</li>
+                <li>📄 Precise citations and source tracking</li>
+                <li>🖼️ Visual page previews with highlighted results</li>
+                <li>⚡ Fast and accurate information retrieval</li>
+            </ul>
+        <p style="color: #1e3a8a; font-weight: bold;">
+        Explore your PDFs with intelligent, context-aware search. Ask questions and get precise answers from your document collection.
+        </p>
+        </div>
+        """, unsafe_allow_html=True)
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:
+        logger.critical(f"Critical application error: {str(e)}", exc_info=True)
+        st.error("A critical error occurred. Please check the logs for details.")

config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "data_path": "data",
+    "persist_directory": "db",
+    "collection_name": "smart_pdf_search"
+}

config.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import json
+import os
+from dotenv import load_dotenv
+# Constants
+# Name of the JSON settings file. It is opened by relative path, so it is
+# resolved against the current working directory.
+CONFIG_FILE = 'config.json'
+# Load environment variables from .env so os.getenv() in load_config() can see them
+load_dotenv()
+def save_config(data_path, persist_directory, collection_name):
+    """
+    Save configuration to a JSON file.
+    This function accepts arguments and writes them to a config.json file.
+    Sensitive data (e.g., API keys) are not written to the file.
+    """
+    config = {
+        'data_path': data_path,
+        'persist_directory': persist_directory,
+        'collection_name': collection_name
+    }
+    with open(CONFIG_FILE, 'w') as f:
+        json.dump(config, f, indent=4)  # Add indent for better readability
+    print(f"Configuration saved to {CONFIG_FILE}.")
+def load_config():
+    """
+    Load configuration from JSON file and environment variables.
+    Returns the complete configuration as a dictionary.
+    """
+    try:
+        # Load JSON config file if it exists
+        if not os.path.exists(CONFIG_FILE):
+            raise FileNotFoundError(f"{CONFIG_FILE} not found. Please save the configuration first.")
+        with open(CONFIG_FILE, 'r') as f:
+            config = json.load(f)
+        # Validate required keys in config.json
+        required_keys = ['data_path', 'persist_directory', 'collection_name']
+        for key in required_keys:
+            if key not in config:
+                raise KeyError(f"Missing required configuration key: {key}")
+        # Add GROQ_API_KEY from environment variables (fallback to .env)
+        config['groq_api_key'] = os.getenv('GROQ_API_KEY')
+        if not config['groq_api_key']:
+            raise ValueError("GROQ_API_KEY is not set in environment variables.")
+        return config
+    except FileNotFoundError as e:
+        print(f"Error: {e}")
+        return None
+    except (json.JSONDecodeError, KeyError, ValueError) as e:
+        print(f"Configuration error: {e}")
+        return None

data/Cyber_Security.pdf ADDED Viewed

Binary file (341 kB). View file

data/SITA1602.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c02a1deb82d8d8fc3a2e122de97ebbe6552a57ab0f3c04399c9926384508bdd5
+size 5167544

db/c1cb6919-5129-47ed-8f4d-e617fc5e8b97/data_level0.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d3c9fd302f000d7790aa403c2d0d8fec363fe46f30b07d53020b6e33b22435a9
+size 1676000

db/c1cb6919-5129-47ed-8f4d-e617fc5e8b97/header.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
+size 100

db/c1cb6919-5129-47ed-8f4d-e617fc5e8b97/length.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fc19b1997119425765295aeab72d76faa6927d4f83985d328c26f20468d6cc76
+size 4000

db/c1cb6919-5129-47ed-8f4d-e617fc5e8b97/link_lists.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
+size 0

db/chroma.sqlite3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:012e5164c8468fc5d3a6b6a847d2dd696b79918e7bbb61c59dc050780c8d8785
+size 14454784

log_utils.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import os
+import logging
+def setup_logging(logger_name=None):
+    """
+    Configure logging settings with a unified configuration.
+    Creates logs directory if it doesn't exist and sets up logging handlers.
+    Args:
+        logger_name: Name for the logger. If None, returns root logger.
+    Returns:
+        Configured logger instance
+    """
+    log_dir = "logs"
+    if not os.path.exists(log_dir):
+        os.makedirs(log_dir)
+    log_file = os.path.join(log_dir, "main.log")
+    # Check if the root logger already has handlers to avoid duplicate logging
+    root_logger = logging.getLogger()
+    if not root_logger.handlers:
+        # Configure root logger only if it hasn't been configured
+        logging.basicConfig(
+            level=logging.INFO,
+            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+            handlers=[
+                logging.FileHandler(log_file),
+                logging.StreamHandler()
+            ]
+        )
+    # Get or create logger with the specified name
+    if logger_name:
+        logger = logging.getLogger(logger_name)
+    else:
+        logger = root_logger
+    # Ensure the logger level is set
+    logger.setLevel(logging.INFO)
+    return logger

pdf_details_page.py ADDED Viewed

	@@ -0,0 +1,363 @@

+import io
+import base64
+import logging
+import requests
+from PIL import Image
+import streamlit as st
+from typing import Dict, Any
+from log_utils import setup_logging
+# Module-level logger used by every function in this page module
+logger = setup_logging('pdf_details_page')
+def api_request(url: str, params: Dict[str, Any] = None) -> Dict[str, Any]:
+    """
+    Make API request with logging and error handling.
+    """
+    try:
+        logger.info(f"Making API request to: {url}")
+        response = requests.get(url, params=params)
+        response.raise_for_status()
+        logger.debug(f"API response received successfully from: {url}")
+        return response.json()
+    except requests.RequestException as e:
+        logger.error(f"API request failed: {str(e)}", exc_info=True)
+        raise
+def display_romanized_text_page(filename):
+    """
+    Displays romanized text and PDF details in a Streamlit layout styled to match the given design.
+    """
+    logger.info(f"Displaying romanized text page for file: {filename}")
+    try:
+        st.markdown(
+            """
+            <style>
+            /* Styling for metadata section */
+            .metadata {
+                display: flex;
+                justify-content: space-between;
+                margin-bottom: 20px;
+                font-family: SFMono-Regular, Menlo, Monaco, Consolas, Liberation Mono, Courier New, monospace;
+                font-size: 16px;
+                color: #34495e;
+                margin-top: 20px;
+            }
+            .metadata div {
+                text-align: left;
+            }
+            /* Styling for page text */
+            .page-section {
+                margin-bottom: 40px;
+            }
+            .page-header {
+                font-size: 20px;
+                color: #3498db;
+                font-family: SFMono-Regular, Menlo, Monaco, Consolas, Liberation Mono, Courier New, monospace;
+                margin-bottom: 10px;
+                font-weight: bold;
+            }
+            .page-text {
+                font-family: SFMono-Regular, Menlo, Monaco, Consolas, Liberation Mono, Courier New, monospace;
+                font-size: 16px;
+                color: #2c3e50;
+                line-height: 1.5;
+                margin-bottom: 20px;
+            }
+            /* Horizontal rule */
+            hr {
+                border: 0;
+                height: 1px;
+                background: #ddd;
+                margin: 30px 0;
+            }
+            </style>
+            """,
+            unsafe_allow_html=True
+        )
+        logger.debug("Applied CSS styling")
+        # API Endpoint for Romanized Text
+        api_url = f"http://127.0.0.1:8000/romanized-text?filename={filename}"
+        try:
+            # Fetch data from API
+            response = requests.get(api_url)
+            response.raise_for_status()
+            data = response.json()
+            # Page Title
+            st.markdown("<h1 style='text-align: center; margin-top: -1%;}'>📚 Smart PDF Search</h1>", unsafe_allow_html=True)
+            logger.debug("Rendered page title")
+            # Document Info Section
+            word_count = len(data['full_text'].split())
+            logger.info(f"Displaying document info - Pages: {data['total_pages']}, Size: {data['file_size_kb']}KB, Words: {word_count}")
+            # Document Info Section
+            st.markdown(
+                f"""
+                <div class='metadata'>
+                    <div>
+                        <strong>Filename: </strong>{data['filename']} <br>
+                        <strong>Total Pages: </strong>{data['total_pages']} <br>
+                        <strong>File Size: </strong>{data['file_size_kb']} <br>
+                        <strong>Total Words: </strong>{len(data['full_text'].split())}
+                    </div>
+                </div>
+                """,
+                unsafe_allow_html=True
+            )
+            # Display Each Page's Text
+            logger.info(f"Rendering {len(data['pages'])} pages of text")
+            for page in data['pages']:
+                st.markdown(
+                    f"""
+                    <div class='page-section'>
+                        <div class='page-header'>Page {page['page_number']}</div>
+                        <div class='page-text'>{page['text']}</div>
+                        <hr>
+                    </div>
+                    """,
+                    unsafe_allow_html=True
+                )
+            logger.debug("Completed rendering all pages")
+        except requests.RequestException as e:
+            logger.error(f"API request failed: {str(e)}", exc_info=True)
+            st.error(f"Error fetching data: {e}")
+        except KeyError as e:
+            logger.error(f"Missing key in API response: {str(e)}", exc_info=True)
+            st.error(f"Missing key in API response: {e}")
+    except Exception as e:
+        logger.error(f"Unexpected error in display_romanized_text_page: {str(e)}", exc_info=True)
+        st.error(f"An unexpected error occurred: {e}")
+def display_pdf_details(filename, page_number):
+    """
+    Display detailed information about a specific PDF page.
+    """
+    logger.info(f"Displaying PDF details for file: {filename}, page: {page_number}")
+    # Initialize reader mode state
+    if 'reader_mode' not in st.session_state:
+        st.session_state.reader_mode = False
+        logger.debug("Initialized reader mode state")
+    def toggle_reader_mode():
+        """Toggle reader mode state with logging."""
+        previous_state = st.session_state.reader_mode
+        st.session_state.reader_mode = not previous_state
+        logger.info(f"Reader mode toggled from {previous_state} to {st.session_state.reader_mode}")
+    try:
+        api_url = f"http://127.0.0.1:8000/pdf-details?filename={filename}&page_number={page_number}"
+        response = requests.get(api_url)
+        logger.debug(f"Retrieved PDF details for page {page_number}")
+        if response.status_code == 200:
+            pdf_details = response.json()
+            # Enhanced CSS for better styling
+            st.markdown("""
+            <style>
+            .page-container {
+                background-color: #ffffff;
+                padding: 30px;
+                margin: 20px auto;
+                border-radius: 12px;
+                box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
+                max-width: 1200px;
+                font-family: Arial, sans-serif;
+            }
+            .stApp {
+                background-color: #f8f9fa;
+            }
+            .detail-box {
+                border-radius: 12px;
+                padding: 25px;
+                margin-bottom: 25px;
+            }
+            .header {
+                text-align: center;
+                color: #1a237e;
+                margin-bottom: 30px;
+                font-family: 'Helvetica Neue', sans-serif;
+            }
+            .metadata-table {
+                width: 100%;
+                border-collapse: collapse;
+                margin: 20px 0;
+                font-family: 'Helvetica Neue', sans-serif;
+            }
+            .metadata-table td {
+                padding: 12px 15px;
+                border: 1px solid #e0e0e0;
+            }
+            .metadata-table tr:nth-child(even) {
+                background-color: #f8f9fa;
+            }
+            .metadata-table tr:hover {
+                background-color: #f5f5f5;
+            }
+            .metadata-table td:first-child {
+                font-weight: 600;
+                width: 30%;
+                color: #2c3e50;
+            }
+            .stButton>button {
+                width: 100%;
+                border-radius: 8px;
+                height: 45px;
+                margin-top: 10px;
+            }
+            .stTextArea>div>div {
+                border-radius: 8px;
+            }
+            .page-preview {
+                border-radius: 8px;
+                overflow: hidden;
+                box-shadow: 0 4px 8px rgba(0,0,0,0.1);
+                max-width: 100%;
+                max-height: 500px;
+                margin: auto;
+            }
+            div[data-baseweb="tab"] {
+                padding: 15px !important;
+            }
+            .stExpander {
+                border-radius: 8px;
+                border: 1px solid #e0e0e0;
+                margin-top: 20px;
+            }
+            .reader-mode {
+                position: fixed;
+                top: 0;
+                left: 0;
+                width: 100vw;
+                height: 100vh;
+                background: rgba(0, 0, 0, 0.9);
+                z-index: 9999;
+                display: flex;
+                justify-content: center;
+                align-items: center;
+                padding: 2rem;
+            }
+            .reader-mode img {
+                max-height: 90vh;
+                max-width: 90vw;
+                object-fit: contain;
+            }
+            .close-reader {
+                position: fixed;
+                top: 20px;
+                right: 20px;
+                color: white;
+                font-size: 24px;
+                cursor: pointer;
+                z-index: 10000;
+            }
+            </style>
+            """, unsafe_allow_html=True)
+            logger.debug("Applied CSS styling")
+            # Reader mode display (if active)
+            if st.session_state.reader_mode:
+                logger.info("Displaying reader mode view")
+                st.markdown('<div class="reader-mode-container">', unsafe_allow_html=True)
+                if st.button("❌ Close Reader Mode", key="close_reader", help="Exit reader mode"):
+                    logger.info("Reader mode closed")
+                    st.session_state.reader_mode = False
+                    st.rerun()
+                # Display zoomed image
+                page_image_bytes = base64.b64decode(pdf_details['page_image'])
+                page_image = Image.open(io.BytesIO(page_image_bytes))
+                st.image(page_image, use_container_width=True, caption=f"Page {pdf_details['current_page']}")
+                st.markdown('</div>', unsafe_allow_html=True)
+                return  # Exit early as we don't need to show the regular interface in reader mode
+            logger.info("Displaying regular interface")
+            # Header
+            st.markdown('<h1 class="header">📚 Smart PDF Search</h1>', unsafe_allow_html=True)
+            # Main content
+            col1, col2 = st.columns([1.5, 2])
+            with col1:
+                logger.debug("Rendering details section")
+                st.markdown("<h3 style='color: #1a237e; margin-bottom: 15px;'>🖼️ Page Preview</h3>", unsafe_allow_html=True)
+                st.markdown(f"<div style='text-align: center; padding: 15px;'>Page {page_number + 1} of {pdf_details['total_pages']}</div>", unsafe_allow_html=True)
+                page_image_bytes = base64.b64decode(pdf_details['page_image'])
+                page_image = Image.open(io.BytesIO(page_image_bytes))
+                st.image(page_image, caption=f"Page {pdf_details['current_page']}", use_container_width=True)
+                st.markdown("</div>", unsafe_allow_html=True)
+            with col2:
+                st.markdown("<div class='detail-box'>", unsafe_allow_html=True)
+                # Create 3 equal-width columns
+                col1, col2, col3 = st.columns(3)
+                # Action buttons inside the columns
+                with col1:
+                    logger.info("Reader mode button clicked")
+                    st.button("📖 Reader Mode", on_click=toggle_reader_mode)
+                with col2:
+                    if st.button("🔍 Ask a Question"):
+                        logger.info("Ask a Question button clicked")
+                        st.query_params["page"] = "home"  # Use the new API instead
+                        st.rerun()
+                with col3:
+                    logger.debug("Rendering Romanized Text link")
+                    st.markdown(f"""
+                        <a href="?page=romanized_text&filename={filename}" style="
+                            display: inline-block;
+                            padding: 10px 10px;
+                            font-size: 16px;
+                            font-weight: 400;
+                            color: white;
+                            background-color: #3498db;
+                            border: none;
+                            border-radius: 8px;
+                            text-align: center;
+                            text-decoration: none;
+                            margin-top: 10px;
+                            transition: all 0.3s ease;
+                            text-transform: uppercase;
+                            letter-spacing: 0.5px;
+                            width: -webkit-fill-available;
+                        ">
+                            📄 Romanized Text
+                        </a>
+                    """, unsafe_allow_html=True)
+                # Page content in expander
+                with st.expander("📄 Page Content", expanded=True):
+                    logger.debug("Displaying page content in expander")
+                    st.markdown(pdf_details['page_text'], unsafe_allow_html=True)
+                logger.debug("Rendering metadata table")
+                # Content tabs
+                metadata_html = f"""
+                <table class="metadata-table">
+                    <tr><td>PDF Name</td><td>{pdf_details.get('title', filename)}</td></tr>
+                    <tr><td>Page</td><td>{page_number + 1}</td></tr>
+                    <tr><td>Author</td><td>{pdf_details.get('metadata', {}).get('author', 'N/A')}</td></tr>
+                    <tr><td>Total Pages</td><td>{pdf_details['total_pages']}</td></tr>
+                    <tr><td>Language</td><td>{pdf_details['language']}</td></tr>
+                    <tr><td>File Size</td><td>{pdf_details['file_size_kb']} KB</td></tr>
+                </table>
+                """
+                st.markdown(metadata_html, unsafe_allow_html=True)
+                logger.info(f"Completed rendering PDF details page for {filename}")
+        else:
+            st.error(f"Error fetching PDF details: {response.text}")
+    except Exception as e:
+        logger.error(f"Error in display_pdf_details: {str(e)}", exc_info=True)
+        st.error(f"An error occurred: {e}")

requirements.txt ADDED Viewed

	@@ -0,0 +1,169 @@

+aiohappyeyeballs==2.4.4
+aiohttp==3.11.11
+aiosignal==1.3.2
+altair==5.5.0
+annotated-types==0.7.0
+anyio==4.8.0
+asgiref==3.8.1
+async-timeout==4.0.3
+attrs==24.3.0
+backoff==2.2.1
+bcrypt==4.2.1
+blinker==1.9.0
+build==1.2.2.post1
+cachetools==5.5.1
+certifi==2024.12.14
+charset-normalizer==3.4.1
+chroma-hnswlib==0.7.6
+chromadb==0.6.3
+click==8.1.8
+coloredlogs==15.0.1
+dataclasses-json==0.6.7
+Deprecated==1.2.15
+distro==1.9.0
+durationpy==0.9
+exceptiongroup==1.2.2
+fastapi==0.115.7
+filelock==3.17.0
+flatbuffers==25.1.21
+frozenlist==1.5.0
+fsspec==2024.12.0
+gitdb==4.0.12
+GitPython==3.1.44
+google-auth==2.38.0
+googleapis-common-protos==1.66.0
+greenlet==3.1.1
+groq==0.15.0
+grpcio==1.70.0
+h11==0.14.0
+httpcore==1.0.7
+httptools==0.6.4
+httpx==0.28.1
+httpx-sse==0.4.0
+huggingface-hub==0.27.1
+humanfriendly==10.0
+idna==3.10
+importlib_metadata==8.5.0
+importlib_resources==6.5.2
+Jinja2==3.1.5
+joblib==1.4.2
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+kubernetes==32.0.0
+langchain==0.3.15
+langchain-community==0.3.15
+langchain-core==0.3.31
+langchain-groq==0.2.3
+langchain-text-splitters==0.3.5
+langdetect==1.0.9
+langsmith==0.3.1
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+marshmallow==3.26.0
+mdurl==0.1.2
+mmh3==5.0.1
+monotonic==1.6
+mpmath==1.3.0
+multidict==6.1.0
+mypy-extensions==1.0.0
+narwhals==1.23.0
+networkx==3.4.2
+nltk==3.9.1
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+oauthlib==3.2.2
+onnxruntime==1.20.1
+opentelemetry-api==1.29.0
+opentelemetry-exporter-otlp-proto-common==1.29.0
+opentelemetry-exporter-otlp-proto-grpc==1.29.0
+opentelemetry-instrumentation==0.50b0
+opentelemetry-instrumentation-asgi==0.50b0
+opentelemetry-instrumentation-fastapi==0.50b0
+opentelemetry-proto==1.29.0
+opentelemetry-sdk==1.29.0
+opentelemetry-semantic-conventions==0.50b0
+opentelemetry-util-http==0.50b0
+orjson==3.10.15
+overrides==7.7.0
+packaging==24.2
+pandas==2.2.3
+pillow==11.1.0
+posthog==3.9.3
+propcache==0.2.1
+protobuf==5.29.3
+pyarrow==19.0.0
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pycountry==24.6.1
+pydantic==2.10.6
+pydantic-settings==2.7.1
+pydantic_core==2.27.2
+pydeck==0.9.1
+Pygments==2.19.1
+PyMuPDF==1.25.2
+PyPika==0.48.9
+pyproject_hooks==1.2.0
+pytesseract==0.3.13
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+pytz==2024.2
+PyYAML==6.0.2
+referencing==0.36.1
+regex==2024.11.6
+requests==2.32.3
+requests-oauthlib==2.0.0
+requests-toolbelt==1.0.0
+rich==13.9.4
+rpds-py==0.22.3
+rsa==4.9
+safetensors==0.5.2
+scikit-learn==1.6.1
+scipy==1.15.1
+sentence-transformers==3.4.0
+shellingham==1.5.4
+six==1.17.0
+smmap==5.0.2
+sniffio==1.3.1
+SQLAlchemy==2.0.37
+starlette==0.45.2
+streamlit==1.41.1
+sympy==1.13.1
+tenacity==9.0.0
+threadpoolctl==3.5.0
+tokenizers==0.21.0
+toml==0.10.2
+tomli==2.2.1
+torch==2.5.1
+tornado==6.4.2
+tqdm==4.67.1
+transformers==4.48.1
+transliterate==1.10.2
+triton==3.1.0
+typer==0.15.1
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2025.1
+urllib3==2.3.0
+uvicorn==0.34.0
+uvloop==0.21.0
+watchdog==6.0.0
+watchfiles==1.0.4
+websocket-client==1.8.0
+websockets==14.2
+wrapt==1.17.2
+yarl==1.18.3
+zipp==3.21.0
+zstandard==0.23.0

upload_pdf.py ADDED Viewed

	@@ -0,0 +1,200 @@

+import os
+import uuid
+import json
+import logging
+from typing import List
+from config import save_config
+from dotenv import load_dotenv
+from log_utils import setup_logging
+from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import Chroma
+# Settings file read by main(); the relative path is resolved against the working directory
+CONFIG_FILE = 'config.json'
+# Load environment variables
+load_dotenv()
+# Module-level logger used by every function in this module
+logger = setup_logging('upload_pdf')
+def load_documents(data_path):
+    """Load PDF documents from the specified directory."""
+    logger.info(f"Starting document loading from directory: {data_path}")
+    if not os.path.exists(data_path):
+        logger.error(f"Directory not found: {data_path}")
+        raise FileNotFoundError(f"Directory not found: {data_path}")
+    directory_loader = DirectoryLoader(
+        data_path,
+        loader_cls=PyMuPDFLoader,
+        show_progress=True
+    )
+    try:
+        documents = directory_loader.load()
+        logger.info(f"Successfully loaded {len(documents)} documents")
+        return documents
+    except Exception as e:
+        logger.error(f"Error loading documents: {str(e)}", exc_info=True)
+        raise
+def store_full_content(documents):
+    """Store full page content in document metadata."""
+    logger.info("Starting to store full page content in metadata")
+    try:
+        for doc in documents:
+            doc.metadata['full_page_content'] = doc.page_content
+            logger.debug(f"Stored full content for page {doc.metadata.get('page', 'Unknown')} "
+                        f"from {os.path.basename(doc.metadata.get('file_path', 'Unknown'))}")
+        logger.info(f"Successfully stored full content for {len(documents)} documents")
+        return documents
+    except Exception as e:
+        logger.error(f"Error storing full content: {str(e)}", exc_info=True)
+        raise
+def process_documents(documents):
+    """Process documents into chunks and add metadata."""
+    logger.info("Starting document processing")
+    try:
+        # First store full page content
+        documents = store_full_content(documents)
+        logger.info("Converting documents to chunks")
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=384, chunk_overlap=20)
+        chunks = text_splitter.split_documents(documents)
+        # Add UUID and store full page content in metadata
+        for chunk in chunks:
+            chunk.metadata['chunk_id'] = str(uuid.uuid4())
+            if 'full_page_content' not in chunk.metadata:
+                chunk.metadata['full_page_content'] = chunk.metadata.get('full_page_content', chunk.page_content)
+        logger.info(f"Document processing completed. Total chunks created: {len(chunks)}")
+        return chunks
+    except Exception as e:
+        logger.error(f"Error processing documents: {str(e)}", exc_info=True)
+        raise
+def initialize_embedding_model():
+    """Initialize and return the embedding model."""
+    logger.info("Initializing embedding model")
+    try:
+        embedding_model = HuggingFaceEmbeddings(
+            model_name='all-MiniLM-L6-v2',
+            model_kwargs={'device': 'cpu'},
+            encode_kwargs={'normalize_embeddings': True}
+        )
+        logger.info("Embedding model initialized successfully")
+        return embedding_model
+    except Exception as e:
+        logger.error(f"Error initializing embedding model: {str(e)}", exc_info=True)
+        raise
+def create_vectordb(chunks, embedding_model, persist_directory, collection_name):
+    """Create and persist ChromaDB instance."""
+    logger.info(f"Creating Chroma instance with collection name: {collection_name}")
+    try:
+        vectordb = Chroma.from_documents(
+            documents=chunks,
+            embedding=embedding_model,
+            persist_directory=persist_directory,
+            collection_name=collection_name
+        )
+        vectordb.persist()
+        logger.info("Vector database created and persisted successfully")
+        return vectordb
+    except Exception as e:
+        logger.error(f"Error creating vector database: {str(e)}", exc_info=True)
+        raise
+def update_or_add_pdf(uploaded_file, data_path, persist_directory, collection_name):
+    """Add or replace a PDF in the system."""
+    logger.info(f"Processing uploaded file: {uploaded_file.name}")
+    if not uploaded_file.name.lower().endswith('.pdf'):
+        logger.warning(f"Rejected non-PDF file: {uploaded_file.name}")
+        return False
+    file_path = os.path.join(data_path, uploaded_file.name)
+    try:
+        # Remove existing PDF if it exists
+        if os.path.exists(file_path):
+            os.remove(file_path)
+            logger.info(f"Deleted existing PDF: {uploaded_file.name}")
+        # Save the uploaded PDF
+        with open(file_path, 'wb') as f:
+            f.write(uploaded_file.getvalue())
+        logger.info(f"Saved new PDF: {uploaded_file.name}")
+        # Load and process the new document
+        documents = load_documents(data_path)
+        new_documents = [doc for doc in documents if os.path.basename(doc.metadata.get('file_path', '')) == uploaded_file.name]
+        if not new_documents:
+            logger.error(f"No documents found for uploaded file: {uploaded_file.name}")
+            return False
+        chunks = process_documents(new_documents)
+        embedding_model = initialize_embedding_model()
+        # Update vector database
+        vectordb = Chroma(
+            persist_directory=persist_directory,
+            embedding_function=embedding_model,
+            collection_name=collection_name
+        )
+        # Remove existing vectors
+        existing_docs = vectordb.get(where={"source": file_path})
+        if existing_docs['ids']:
+            vectordb.delete(existing_docs['ids'])
+            logger.info(f"Removed existing vectors for {uploaded_file.name}")
+        # Add new vectors
+        vectordb.add_documents(documents=chunks)
+        vectordb.persist()
+        logger.info(f"Successfully updated {uploaded_file.name} in vector database")
+        return True
+    except Exception as e:
+        logger.error(f"Error processing uploaded PDF {uploaded_file.name}: {str(e)}", exc_info=True)
+        return False
+def main():
+    logger.info("Starting PDF processing pipeline")
+    try:
+        with open(CONFIG_FILE, 'r') as f:
+            config = json.load(f)
+        # Configuration
+        data_path = config.get('data_path')
+        persist_directory = os.environ.get('PERSIST_DIRECTORY')
+        collection_name = config.get('collection_name')
+        logger.info(f"Using configuration - data_path: {data_path}, "
+                   f"persist_directory: {persist_directory}, "
+                   f"collection_name: {collection_name}")
+        # Save configuration
+        save_config(data_path, persist_directory, collection_name)
+        logger.info("Configuration saved successfully")
+        # Process pipeline
+        documents = load_documents(data_path)
+        chunks = process_documents(documents)
+        embedding_model = initialize_embedding_model()
+        create_vectordb(chunks, embedding_model, persist_directory, collection_name)
+        logger.info("PDF processing pipeline completed successfully!")
+    except Exception as e:
+        logger.error("Fatal error in PDF processing pipeline", exc_info=True)
+        raise
+if __name__ == "__main__":
+    main()