Avanisha committed on
Commit
5debd08
verified
1 Parent(s): 5a38e5d

Upload 14 files

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/SITA1602.pdf filter=lfs diff=lfs merge=lfs -text
+ db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,762 @@
+ import os
+ import io
+ import nltk
+ import fitz
+ import random
+ import base64
+ import pycountry
+ from PIL import Image
+ import streamlit as st
+ from langdetect import detect
+ from config import load_config
+ from dotenv import load_dotenv
+ from nltk.corpus import stopwords
+ from fastapi import FastAPI, Query
+ from langchain_groq import ChatGroq
+ from collections import defaultdict
+ from log_utils import setup_logging
+ from nltk.tokenize import sent_tokenize, word_tokenize
+ from langchain.chains import RetrievalQA
+ from upload_pdf import update_or_add_pdf
+ from fastapi.responses import JSONResponse
+ from langchain.prompts import ChatPromptTemplate
+ from langchain_community.vectorstores import Chroma
+ from sklearn.metrics.pairwise import cosine_similarity
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from pdf_details_page import display_pdf_details, display_romanized_text_page
+
+ logger = setup_logging('app')
+
+ nltk.download('punkt')
+ nltk.download('punkt_tab')
+ nltk.download('stopwords')
+
+ app = FastAPI()
+
+ @app.get("/pdf-details")
+ async def get_pdf_details(
+     filename: str = Query(..., description="Filename of the PDF"),
+     page_number: int = Query(0, description="Page number (0-indexed)")
+ ):
+     logger.info(f"Processing PDF details request for file: {filename}, page: {page_number}")
+     try:
+         # NOTE: machine-specific absolute path; the rest of the app reads data_path from config.json
+         data_path = "/home/bacancy/Documents/Company/Smart PDF Search/data"
+         file_path = os.path.join(data_path, filename)
+
+         # Open the PDF
+         logger.debug(f"Opening PDF file: {file_path}")
+         doc = fitz.open(file_path)
+
+         # Extract full PDF text
+         full_text = ""
+         for page in doc:
+             full_text += page.get_text()
+
+         # Get PDF metadata
+         pdf_metadata = doc.metadata or {}
+
+         # Extract page text and render page image
+         page = doc.load_page(page_number)
+         page_text = page.get_text()
+
+         # Render page as image
+         pix = page.get_pixmap()
+         page_image_base64 = base64.b64encode(pix.tobytes("png")).decode('utf-8')
+
+         # Detect language
+         try:
+             lang_code = detect(page_text)
+             language = pycountry.languages.get(alpha_2=lang_code).name
+         except Exception as e:
+             logger.warning(f"Language detection failed: {str(e)}")
+             language = 'Unknown'
+
+         # Prepare response
+         response = {
+             "file_path": file_path,
+             "filename": os.path.basename(file_path),
+             "total_pages": len(doc),
+             "current_page": page_number + 1,
+             "full_text": full_text,
+             "page_text": page_text,
+             "page_image": page_image_base64,
+             "file_size_bytes": os.path.getsize(file_path),
+             "file_size_kb": f"{os.path.getsize(file_path) / 1024:.2f} KB",
+             "language": language,
+             "metadata": {
+                 "title": pdf_metadata.get('title', 'Unknown'),
+                 "author": pdf_metadata.get('author', 'Unknown'),
+                 "creator": pdf_metadata.get('creator', 'Unknown'),
+                 "producer": pdf_metadata.get('producer', 'Unknown')
+             }
+         }
+
+         logger.info(f"Successfully processed PDF details for {filename}")
+         return JSONResponse(content=response)
+
+     except Exception as e:
+         logger.error(f"Error processing PDF details: {str(e)}", exc_info=True)
+         return JSONResponse(
+             content={"error": str(e)},
+             status_code=500
+         )
+
+ @app.get("/romanized-text")
+ async def get_romanized_text(
+     filename: str = Query(..., description="Filename of the PDF")
+ ):
+     logger.info(f"Processing romanized text request for file: {filename}")
+     try:
+         # NOTE: same machine-specific path as above
+         data_path = "/home/bacancy/Documents/Company/Smart PDF Search/data"
+         file_path = os.path.join(data_path, filename)
+
+         # Open the PDF
+         logger.debug(f"Opening PDF file for romanization: {file_path}")
+         doc = fitz.open(file_path)
+
+         # Extract full PDF text
+         full_text = ""
+         pages_text = []
+
+         for page in doc:
+             page_text = page.get_text()
+             full_text += page_text
+             # Add page info to pages_text list (1-based page numbers)
+             pages_text.append({
+                 "page_number": page.number + 1,
+                 "text": page_text
+             })
+
+         # Get PDF metadata
+         pdf_metadata = doc.metadata or {}
+
+         response = {
+             "filename": os.path.basename(file_path),
+             "total_pages": len(doc),
+             "full_text": full_text,
+             "pages": pages_text,
+             "file_size_kb": f"{os.path.getsize(file_path) / 1024:.2f} KB",
+             "metadata": {
+                 "title": pdf_metadata.get('title', 'Unknown'),
+                 "author": pdf_metadata.get('author', 'Unknown'),
+                 "creator": pdf_metadata.get('creator', 'Unknown'),
+                 "producer": pdf_metadata.get('producer', 'Unknown')
+             }
+         }
+
+         logger.info(f"Successfully processed romanized text for {filename}")
+         return JSONResponse(content=response)
+
+     except Exception as e:
+         logger.error(f"Error processing romanized text: {str(e)}", exc_info=True)
+         return JSONResponse(
+             content={"error": str(e)},
+             status_code=500
+         )
+
+ # Load environment variables
+ load_dotenv()
+
+ # Must be the first Streamlit command
+ st.set_page_config(
+     page_title="Smart PDF Search",
+     page_icon="📚",
+     layout="wide"
+ )
+
+ st.markdown("""
+ <style>
+ img { border: 1px solid rgb(221, 221, 221); }
+ .stApp {
+     font-family: 'Inter', sans-serif;
+ }
+ .stMarkdown {
+     color: #2c3e50;
+ }
+ .stTextInput > div > div > input {
+     border: 2px solid #3498db;
+     border-radius: 12px;
+     padding: 12px;
+     font-size: 16px;
+     background-color: white;
+     box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+     transition: all 0.3s ease;
+ }
+ .stTextInput > div > div > input:focus {
+     border-color: #2980b9;
+     outline: none;
+     box-shadow: 0 0 0 3px rgba(52, 152, 219, 0.2);
+ }
+ .stButton > button {
+     background-color: #3498db !important;
+     color: white !important;
+     border-radius: 10px;
+     padding: 5px 10px !important;
+     font-weight: 600;
+     transition: all 0.3s ease;
+     text-transform: uppercase;
+     letter-spacing: 0.5px;
+ }
+ .stButton > button:hover {
+     background-color: #2980b9 !important;
+     transform: translateY(-2px);
+     box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+ }
+ .stExpander {
+     border-radius: 12px;
+     background-color: #f9f9f9;
+     box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+ }
+ .stMarkdown, .stSubheader {
+     color: #34495e;
+ }
+ mark {
+     background-color: #c6e6fb;
+     color: #2c3e50;
+     padding: 2px 4px;
+     border-radius: 4px;
+ }
+ .st-emotion-cache-1104ytp h2 {
+     font-size: 1rem;
+     font-weight: 400;
+     font-family: "Source Sans Pro", sans-serif;
+     margin: 0px 0px 1rem;
+     line-height: 1.6;
+ }
+ .st-emotion-cache-1v0mbdj.e115fcil1 {
+     width: 100%;
+ }
+ .page-number {
+     display: inline-block;
+     background-color: #6C757D;
+     color: white;
+     font-weight: bold;
+     font-size: 14px;
+     padding: 2px 20px;
+     border-radius: 5px;
+     border: 1px solid #6C757D;
+     margin-top: 0px;
+     text-align: center;
+ }
+ .document-name {
+     color: dimgray;
+     font-size: 18px;
+     margin-bottom: .5rem;
+     font-weight: 500;
+     line-height: 1.2;
+ }
+ .source-content {
+     background-color: #f9f9f9;
+     padding: 10px;
+     border-radius: 5px;
+ }
+ .response-block {
+     background-color: #f9f9f9;
+     padding: 15px;
+     border-radius: 5px;
+     margin-bottom: 20px;
+ }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # Initialize session state variables
+ if 'qa_chain' not in st.session_state:
+     st.session_state.qa_chain = None
+ if 'vectordb' not in st.session_state:
+     st.session_state.vectordb = None
+ if 'config' not in st.session_state:
+     st.session_state.config = None
+
+ def initialize_embedding_model():
+     """Initialize and return the embedding model."""
+     logger.info("Initializing embedding model")
+     try:
+         with st.spinner('Loading embedding model...'):
+             embedding_model = HuggingFaceEmbeddings(
+                 model_name='all-MiniLM-L6-v2',
+                 model_kwargs={'device': 'cpu'},
+                 encode_kwargs={'normalize_embeddings': True}
+             )
+         logger.info("Embedding model initialized successfully")
+         return embedding_model
+     except Exception as e:
+         logger.error(f"Error initializing embedding model: {str(e)}", exc_info=True)
+         raise
+
+ def load_vectordb(persist_directory, embedding_model, collection_name):
+     """Load the existing ChromaDB instance."""
+     logger.info(f"Loading ChromaDB from {persist_directory}")
+     try:
+         with st.spinner('Loading ChromaDB...'):
+             vectordb = Chroma(
+                 persist_directory=persist_directory,
+                 embedding_function=embedding_model,
+                 collection_name=collection_name
+             )
+         logger.info("ChromaDB loaded successfully")
+         return vectordb
+     except Exception as e:
+         logger.error(f"Error loading ChromaDB: {str(e)}", exc_info=True)
+         raise
+
+ def create_qa_chain(vectordb, groq_api_key, k=4):
+     """Create and return a QA chain."""
+     logger.info("Creating QA chain")
+     try:
+         with st.spinner('Creating QA chain...'):
+             retriever = vectordb.as_retriever(search_kwargs={'k': k})
+             llm = ChatGroq(api_key=groq_api_key, temperature=0)
+
+             prompt_messages = [
+                 ("system", """You are a helpful AI assistant who provides accurate answers based on the given context.
+ If you don't know the answer, just say that you don't know; don't try to make up an answer."""),
+                 ("user", """Use the following context to answer my question:
+
+ Context: {context}
+
+ Question: {question}"""),
+                 ("assistant", "I'll help answer your question based on the provided context.")
+             ]
+
+             chat_prompt = ChatPromptTemplate.from_messages(prompt_messages)
+
+             qa_chain = RetrievalQA.from_chain_type(
+                 llm=llm,
+                 chain_type="stuff",
+                 retriever=retriever,
+                 return_source_documents=True,
+                 chain_type_kwargs={"prompt": chat_prompt}
+             )
+         logger.info("QA chain created successfully")
+         return qa_chain
+     except Exception as e:
+         logger.error(f"Error creating QA chain: {str(e)}", exc_info=True)
+         raise
+
+ def format_inline_citations(response_text, source_documents):
+     """Format the response text with citations at the end of paragraphs and return the citations."""
+     logger.info("Starting inline citations formatting")
+
+     inline_response = response_text.strip()
+
+     # Extract text and metadata from source documents
+     try:
+         doc_texts = [
+             source.page_content for source in source_documents if source.page_content
+         ]
+         doc_citations = [
+             {
+                 "pdf_name": os.path.basename(source.metadata.get("file_path", "Unknown")),
+                 "page": source.metadata.get("page", 0) + 1,  # default 0 avoids a TypeError when page metadata is missing
+             }
+             for source in source_documents
+         ]
+         logger.debug(f"Extracted {len(doc_texts)} document texts and citations")
+
+         if not doc_texts or not inline_response:
+             logger.warning("No documents or response text to process")
+             return inline_response, []
+
+         # Split response text into paragraphs
+         paragraphs = [p.strip() for p in response_text.split("\n") if p.strip()]
+         logger.debug(f"Split response into {len(paragraphs)} paragraphs")
+
+         # Vectorize response paragraphs and source document texts
+         vectorizer = TfidfVectorizer()
+         all_texts = doc_texts + paragraphs
+         tfidf_matrix = vectorizer.fit_transform(all_texts)
+
+         # Collect the relevant citations
+         relevant_citations = []
+
+         # Match each paragraph to its most similar source documents
+         for i, paragraph in enumerate(paragraphs):
+             paragraph_idx = len(doc_texts) + i
+             similarities = cosine_similarity(tfidf_matrix[paragraph_idx:paragraph_idx + 1], tfidf_matrix[:len(doc_texts)])[0]
+
+             # Collect relevant citations based on similarity
+             paragraph_citations = [
+                 doc_citations[j] for j, score in enumerate(similarities) if score > 0.2
+             ]
+
+             if paragraph_citations:
+                 logger.debug(f"Found {len(paragraph_citations)} citations for paragraph {i+1}")
+                 relevant_citations.extend(paragraph_citations)
+
+                 # Group citations by document name and collect pages
+                 grouped_citations = defaultdict(set)
+                 for citation in paragraph_citations:
+                     grouped_citations[citation["pdf_name"]].add(citation["page"])
+
+                 # Format grouped citations
+                 combined_citations = []
+                 for pdf_name, pages in grouped_citations.items():
+                     pages = sorted(pages)
+                     pages_text = f"Page {pages[0]}" if len(pages) == 1 else f"Pages {', '.join(map(str, pages))}"
+                     combined_citations.append(f"{pdf_name}: {pages_text}")
+
+                 formatted_citations = " <b>(" + "; ".join(combined_citations) + ")</b> \n"
+                 paragraphs[i] = f"{paragraph}{formatted_citations}"
+
+         # Combine paragraphs back into the final response
+         inline_response = "\n".join(paragraphs)
+         logger.info("Successfully formatted inline citations")
+         return inline_response, relevant_citations
+
+     except Exception as e:
+         logger.error(f"Error formatting inline citations: {str(e)}", exc_info=True)
+         return response_text, []
+
+ def display_citation_details(source_documents):
+     """Display detailed information about each citation."""
+     logger.info("Displaying citation details")
+
+     try:
+         st.subheader("Citation Details")
+
+         grouped_sources = defaultdict(list)
+         for source in source_documents:
+             key = (source.metadata.get('file_path', 'Unknown'), source.metadata.get('page', 'Unknown'))
+             grouped_sources[key].append(source.page_content)
+
+         logger.debug(f"Grouped {len(grouped_sources)} unique sources")
+
+         for key, content_list in grouped_sources.items():
+             file_path, page_number = key
+             try:
+                 full_page_content = next(
+                     (source.metadata.get('full_page_content', 'No full content available')
+                      for source in source_documents
+                      if source.metadata.get('file_path', 'Unknown') == file_path
+                      and source.metadata.get('page', 'Unknown') == page_number),
+                     'No full content available'
+                 )
+
+                 merged_content = "\n".join(content_list)
+                 highlighted_content = full_page_content
+
+                 for line in merged_content.splitlines():
+                     if line.strip() and line in full_page_content:
+                         highlighted_content = highlighted_content.replace(line, f"<mark>{line}</mark>", 1)
+
+                 with st.expander(f"Source: {os.path.basename(file_path)} - Page {page_number + 1}"):
+                     st.markdown(highlighted_content, unsafe_allow_html=True)
+
+                 logger.debug(f"Displayed citation details for {os.path.basename(file_path)} - Page {page_number + 1}")
+
+             except Exception as e:
+                 logger.error(f"Error processing citation for {file_path}: {str(e)}")
+                 continue
+
+     except Exception as e:
+         logger.error(f"Error displaying citation details: {str(e)}", exc_info=True)
+         st.error("Error displaying citation details")
+
+ def initialize_system():
+     """Initialize the QA system components."""
+     logger.info("Starting system initialization")
+
+     try:
+         config = load_config()
+         if not config:
+             logger.error("Configuration not found")
+             st.error("Configuration not found. Please run the preprocessing script first.")
+             return False
+
+         st.session_state.config = config
+         logger.debug("Configuration loaded successfully")
+
+         embedding_model = initialize_embedding_model()
+         st.session_state.vectordb = load_vectordb(config['persist_directory'], embedding_model, config['collection_name'])
+         st.session_state.qa_chain = create_qa_chain(st.session_state.vectordb, config['groq_api_key'])
+
+         logger.info("System initialized successfully")
+         st.success("System initialized successfully!")
+         return True
+
+     except Exception as e:
+         logger.error(f"Error during system initialization: {str(e)}", exc_info=True)
+         st.error(f"An error occurred: {e}")
+         return False
+
+ def extract_page_image(file_path, page_number):
+     """Extract the image of a specific page from a PDF file and return it as a PIL image."""
+     logger.debug(f"Extracting page image from {file_path}, page {page_number}")
+
+     try:
+         doc = fitz.open(file_path)
+         page = doc.load_page(page_number)
+         pix = page.get_pixmap()
+         image = Image.open(io.BytesIO(pix.tobytes("png")))
+         logger.debug("Successfully extracted page image")
+         return image
+     except Exception as e:
+         logger.error(f"Error extracting page image: {str(e)}")
+         return None
+
+ def highlight_query_words(text, query):
+     """Highlight words from the query in the provided text."""
+     logger.debug(f"Highlighting query words for query: {query}")
+
+     try:
+         stop_words = set(stopwords.words('english'))
+         query_words = set(word_tokenize(query.lower())) - stop_words
+
+         words = text.split()
+         highlighted_text = " ".join(
+             f"<mark>{word}</mark>"
+             if word.lower().strip(".,!?") in query_words else word
+             for word in words
+         )
+
+         logger.debug("Successfully highlighted query words")
+         return highlighted_text
+     except Exception as e:
+         logger.error(f"Error highlighting query words: {str(e)}")
+         return text
+
+ def display_source_documents_with_images(source_documents, query):
+     """Display unique source document images and formatted text snippets with query highlights."""
+     logger.info("Displaying source documents with images")
+
+     try:
+         st.subheader("📝 Source Documents")
+
+         unique_sources = {}
+         for source in source_documents:
+             key = (source.metadata.get('file_path', 'Unknown'), source.metadata.get('page', 'Unknown'))
+             if key not in unique_sources:
+                 unique_sources[key] = source
+
+         logger.debug(f"Processing {len(unique_sources)} unique sources")
+
+         for (file_path, page_number), source in unique_sources.items():
+             try:
+                 pdf_name = os.path.basename(file_path)
+                 page_content = source.metadata.get("full_page_content") or "No content available"
+
+                 logger.debug(f"Processing document: {pdf_name}, page {page_number + 1}")
+
+                 col1, col2 = st.columns([1, 3])
+
+                 with col1:
+                     page_image = extract_page_image(file_path, page_number)
+                     if page_image:
+                         st.image(page_image, caption=f"Page {page_number + 1}", use_container_width=True)
+                     else:
+                         logger.warning(f"Preview not available for {pdf_name}, page {page_number + 1}")
+                         st.warning("⚠️ Preview not available for this page")
+
+                 with col2:
+                     st.markdown(f'<span class="document-name">{pdf_name}</span>', unsafe_allow_html=True)
+                     st.markdown(f'<span class="page-number">Page {page_number + 1}</span>', unsafe_allow_html=True)
+
+                     # Build a short snippet: up to ~7 chunks of 17 words each, from shuffled sentences
+                     sentences = sent_tokenize(page_content)
+                     random.shuffle(sentences)
+
+                     selected_snippet = []
+                     for sentence in sentences:
+                         words = sentence.split()
+                         chunked_snippet = [" ".join(words[i:i+17]) for i in range(0, len(words), 17)]
+                         selected_snippet.extend(chunked_snippet)
+                         if len(selected_snippet) >= 7:
+                             break
+
+                     snippet = " ... ".join(selected_snippet)
+                     highlighted_snippet = highlight_query_words(snippet, query)
+
+                     st.markdown(f'<div class="source-content">{highlighted_snippet}</div>', unsafe_allow_html=True)
+                     st.markdown(f"[View other results in this book](?page=pdf_details&filename={pdf_name}&page_number={page_number})", unsafe_allow_html=True)
+
+                 logger.debug(f"Successfully displayed content for {pdf_name}, page {page_number + 1}")
+
+             except Exception as e:
+                 logger.error(f"Error processing document {os.path.basename(file_path)}: {str(e)}")
+                 continue
+
+     except Exception as e:
+         logger.error(f"Error displaying source documents: {str(e)}", exc_info=True)
+         st.error("Error displaying source documents")
+
+ def is_query_relevant(question, source_documents, threshold=0.1):
+     """Check query relevance using multiple similarity methods."""
+     logger.info(f"Checking relevance for query: {question}")
+
+     try:
+         if not source_documents:
+             logger.warning("No source documents provided for relevance check")
+             return False
+
+         # Keyword-based check
+         keywords = set(question.lower().split())
+
+         for doc in source_documents:
+             doc_words = set(doc.page_content.lower().split())
+             if keywords.intersection(doc_words):
+                 logger.debug("Query relevant based on keyword match")
+                 return True
+
+         # TF-IDF similarity check
+         try:
+             doc_texts = [doc.page_content for doc in source_documents]
+             texts_to_compare = doc_texts + [question]
+
+             vectorizer = TfidfVectorizer()
+             tfidf_matrix = vectorizer.fit_transform(texts_to_compare)
+
+             similarities = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])[0]
+
+             is_relevant = any(sim > threshold for sim in similarities)
+             logger.debug(f"Query relevance (TF-IDF): {is_relevant}")
+             return is_relevant
+
+         except Exception as e:
+             logger.warning(f"TF-IDF similarity check failed: {str(e)}")
+             # Fall back to a simple text match
+             is_relevant = any(question.lower() in doc.page_content.lower() for doc in source_documents)
+             logger.debug(f"Query relevance (fallback): {is_relevant}")
+             return is_relevant
+
+     except Exception as e:
+         logger.error(f"Error checking query relevance: {str(e)}", exc_info=True)
+         return False
+
+ def main():
+     logger.info("Starting Smart PDF Search application")
+
+     # Detect page from query parameters
+     query_params = st.query_params
+     page = query_params.get('page', 'home')
+     logger.debug(f"Current page: {page}")
+
+     # Routing logic
+     if page == 'pdf_details':
+         filename = query_params.get('filename', '')
+         page_number = int(query_params.get('page_number', 0))
+         logger.info(f"Displaying PDF details for {filename}, page {page_number}")
+
+         if filename:
+             display_pdf_details(filename, page_number)
+         else:
+             logger.warning("No filename provided for PDF details")
+             st.error("No filename provided for PDF details")
+     elif page == 'romanized_text':
+         filename = query_params.get('filename', '')
+         logger.info(f"Displaying romanized text for {filename}")
+
+         if filename:
+             display_romanized_text_page(filename)
+         else:
+             logger.warning("No filename provided for Romanized text")
+             st.error("No filename provided for Romanized text")
+     else:
+         logger.info("Displaying main search page")
+         st.markdown("<h1 style='text-align: center;'>📚 Smart PDF Search</h1>", unsafe_allow_html=True)
+
+         # PDF upload section in the sidebar
+         st.sidebar.header("📤 Upload PDF")
+         uploaded_file = st.sidebar.file_uploader("Choose a PDF file", type="pdf")
+
+         # Process the uploaded PDF only if it is a new upload, not an existing one
+         if uploaded_file is not None:
+             logger.info(f"Processing uploaded file: {uploaded_file.name}")
+             if 'last_uploaded_file' not in st.session_state or st.session_state.last_uploaded_file != uploaded_file.name:
+                 try:
+                     config = st.session_state.config or load_config()
+
+                     with st.spinner('Processing uploaded PDF...'):
+                         success = update_or_add_pdf(
+                             uploaded_file,
+                             config['data_path'],
+                             config['persist_directory'],
+                             config['collection_name']
+                         )
+
+                     if success:
+                         logger.info(f"Successfully processed uploaded file: {uploaded_file.name}")
+                         st.sidebar.success(f"Successfully uploaded {uploaded_file.name}")
+                         st.session_state.vectordb = None
+                         st.session_state.qa_chain = None
+                         st.session_state.last_uploaded_file = uploaded_file.name
+                     else:
+                         logger.warning(f"Failed to process uploaded file: {uploaded_file.name}")
+                         st.sidebar.warning("🚨 Please upload a valid PDF file to proceed.")
+                 except Exception as e:
+                     logger.error(f"Error processing uploaded file: {str(e)}", exc_info=True)
+                     st.sidebar.error(f"Error processing file: {str(e)}")
+             else:
+                 logger.info(f"PDF {uploaded_file.name} is already uploaded")
+                 st.sidebar.info(f"PDF {uploaded_file.name} is already uploaded.")
+
+         # Initialize the QA system
+         if st.session_state.qa_chain is None:
+             logger.info("Initializing QA system")
+             if not initialize_system():
+                 logger.error("Failed to initialize system")
+                 return
+
+         st.subheader("🔍 Ask a Question")
+         question = st.text_input("Enter your question:")
+         if st.button("Get Answer") and question:
+             logger.info(f"Processing question: {question}")
+             try:
+                 with st.spinner('🧠 Finding answer...'):
+                     llm_response = st.session_state.qa_chain.invoke({"query": question})
+                     logger.debug("Successfully got response from QA chain")
+                     response_text = llm_response['result']
+                     source_documents = llm_response['source_documents']
+
+                 # Check if the query is relevant to the documents
+                 if is_query_relevant(question, source_documents):
+                     # Format citations only if the query is relevant
+                     inline_response, relevant_citations = format_inline_citations(response_text, source_documents)
+
+                     # Only show the detailed response if we have relevant citations
+                     if relevant_citations:
+                         col3, col4 = st.columns([2, 1])
+                         with col3:
+                             st.subheader("🧠 Summary")
+                             st.markdown(f'<div class="response-block">{inline_response}</div>', unsafe_allow_html=True)
+                             display_source_documents_with_images(source_documents, question)
+                         with col4:
+                             display_citation_details(source_documents)
+                     else:
+                         st.warning("⚠️ While your question seems related to the documents, I couldn't find specific relevant information to answer it. Please try rephrasing your question or asking about a different topic.")
+                 else:
+                     st.warning("⚠️ Your question appears to be unrelated to the content in the uploaded documents. Please ask a question about the information contained in the PDFs.")
+
+             except Exception as e:
+                 logger.error(f"Error processing question: {str(e)}", exc_info=True)
+                 st.error(f"⚠️ An error occurred while processing your question: {e}")
+
+         # Sidebar content
+         st.sidebar.markdown("""
+         <div style="background-color: #f0f4ff; padding: 5%; border-left: 4px solid #3b82f6; border-radius: 8px; box-shadow: 0px 4px 6px rgba(0, 0, 0, 0.1); margin-top: 35%; margin-bottom: 0%;">
+             <h3 style="margin-top: 0;">💡 Smart PDF Search Features</h3>
+             <ul style="padding-left: 20px;">
+                 <li>🔍 Intelligent document search across multiple PDFs</li>
+                 <li>🧠 Context-aware question answering</li>
+                 <li>📄 Precise citations and source tracking</li>
+                 <li>🖼️ Visual page previews with highlighted results</li>
+                 <li>⚡ Fast and accurate information retrieval</li>
+             </ul>
+             <p style="color: #1e3a8a; font-weight: bold;">
+                 Explore your PDFs with intelligent, context-aware search. Ask questions and get precise answers from your document collection.
+             </p>
+         </div>
+         """, unsafe_allow_html=True)
+
+ if __name__ == "__main__":
+     try:
+         main()
+     except Exception as e:
+         logger.critical(f"Critical application error: {str(e)}", exc_info=True)
+         st.error("A critical error occurred. Please check the logs for details.")
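For a quick check outside the UI, the `/pdf-details` endpoint can be exercised directly. A minimal sketch, assuming the FastAPI app above is served separately from the Streamlit UI (e.g. via `uvicorn app:app --port 8000`, matching the `127.0.0.1:8000` base URL hardcoded in `pdf_details_page.py`); the filename refers to the bundled `data/Cyber_Security.pdf`:

```python
# Hypothetical smoke test for the /pdf-details endpoint; assumes the API is
# reachable on 127.0.0.1:8000 and that data/Cyber_Security.pdf exists.
import requests

resp = requests.get(
    "http://127.0.0.1:8000/pdf-details",
    params={"filename": "Cyber_Security.pdf", "page_number": 0},
)
resp.raise_for_status()
details = resp.json()
print(details["filename"], details["total_pages"], details["language"])
```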
config.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "data_path": "data",
+     "persist_directory": "db",
+     "collection_name": "smart_pdf_search"
+ }
config.py ADDED
@@ -0,0 +1,57 @@
+ import json
+ import os
+ from dotenv import load_dotenv
+
+ # Constants
+ CONFIG_FILE = 'config.json'
+
+ # Load environment variables from .env
+ load_dotenv()
+
+ def save_config(data_path, persist_directory, collection_name):
+     """
+     Save configuration to a JSON file.
+     This function accepts arguments and writes them to a config.json file.
+     Sensitive data (e.g., API keys) are not written to the file.
+     """
+     config = {
+         'data_path': data_path,
+         'persist_directory': persist_directory,
+         'collection_name': collection_name
+     }
+     with open(CONFIG_FILE, 'w') as f:
+         json.dump(config, f, indent=4)  # Add indent for better readability
+     print(f"Configuration saved to {CONFIG_FILE}.")
+
+ def load_config():
+     """
+     Load configuration from the JSON file and environment variables.
+     Returns the complete configuration as a dictionary.
+     """
+     try:
+         # Load the JSON config file if it exists
+         if not os.path.exists(CONFIG_FILE):
+             raise FileNotFoundError(f"{CONFIG_FILE} not found. Please save the configuration first.")
+
+         with open(CONFIG_FILE, 'r') as f:
+             config = json.load(f)
+
+         # Validate required keys in config.json
+         required_keys = ['data_path', 'persist_directory', 'collection_name']
+         for key in required_keys:
+             if key not in config:
+                 raise KeyError(f"Missing required configuration key: {key}")
+
+         # Add GROQ_API_KEY from environment variables (fallback to .env)
+         config['groq_api_key'] = os.getenv('GROQ_API_KEY')
+         if not config['groq_api_key']:
+             raise ValueError("GROQ_API_KEY is not set in environment variables.")
+
+         return config
+
+     except FileNotFoundError as e:
+         print(f"Error: {e}")
+         return None
+     except (json.JSONDecodeError, KeyError, ValueError) as e:
+         print(f"Configuration error: {e}")
+         return None
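A short usage sketch for the two helpers above, assuming a `.env` file next to `config.json` that defines `GROQ_API_KEY`:

```python
# Writes config.json, then loads it back merged with GROQ_API_KEY from .env.
from config import save_config, load_config

save_config(data_path="data", persist_directory="db",
            collection_name="smart_pdf_search")
config = load_config()
if config:
    print(config["collection_name"], "key set:", bool(config["groq_api_key"]))
```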
data/Cyber_Security.pdf ADDED
Binary file (341 kB)
 
data/SITA1602.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c02a1deb82d8d8fc3a2e122de97ebbe6552a57ab0f3c04399c9926384508bdd5
+ size 5167544
db/c1cb6919-5129-47ed-8f4d-e617fc5e8b97/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d3c9fd302f000d7790aa403c2d0d8fec363fe46f30b07d53020b6e33b22435a9
+ size 1676000
db/c1cb6919-5129-47ed-8f4d-e617fc5e8b97/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
+ size 100
db/c1cb6919-5129-47ed-8f4d-e617fc5e8b97/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fc19b1997119425765295aeab72d76faa6927d4f83985d328c26f20468d6cc76
+ size 4000
db/c1cb6919-5129-47ed-8f4d-e617fc5e8b97/link_lists.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
+ size 0
db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:012e5164c8468fc5d3a6b6a847d2dd696b79918e7bbb61c59dc050780c8d8785
+ size 14454784
log_utils.py ADDED
@@ -0,0 +1,44 @@
+ import os
+ import logging
+
+
+ def setup_logging(logger_name=None):
+     """
+     Configure logging settings with a unified configuration.
+     Creates the logs directory if it doesn't exist and sets up logging handlers.
+
+     Args:
+         logger_name: Name for the logger. If None, returns the root logger.
+
+     Returns:
+         Configured logger instance
+     """
+     log_dir = "logs"
+     if not os.path.exists(log_dir):
+         os.makedirs(log_dir)
+
+     log_file = os.path.join(log_dir, "main.log")
+
+     # Check if the root logger already has handlers to avoid duplicate logging
+     root_logger = logging.getLogger()
+     if not root_logger.handlers:
+         # Configure the root logger only if it hasn't been configured
+         logging.basicConfig(
+             level=logging.INFO,
+             format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+             handlers=[
+                 logging.FileHandler(log_file),
+                 logging.StreamHandler()
+             ]
+         )
+
+     # Get or create a logger with the specified name
+     if logger_name:
+         logger = logging.getLogger(logger_name)
+     else:
+         logger = root_logger
+
+     # Ensure the logger level is set
+     logger.setLevel(logging.INFO)
+
+     return logger
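Every module in this commit calls the helper the same way; a minimal example, with the module name chosen freely:

```python
# All loggers share one configuration: logs/main.log plus console output.
from log_utils import setup_logging

logger = setup_logging('example_module')
logger.info("This line goes to both logs/main.log and the console")
```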
pdf_details_page.py ADDED
@@ -0,0 +1,363 @@
+ import io
+ import base64
+ import requests
+ from PIL import Image
+ import streamlit as st
+ from typing import Dict, Any
+ from log_utils import setup_logging
+
+ logger = setup_logging('pdf_details_page')
+
+ def api_request(url: str, params: Dict[str, Any] = None) -> Dict[str, Any]:
+     """
+     Make an API request with logging and error handling.
+     """
+     try:
+         logger.info(f"Making API request to: {url}")
+         response = requests.get(url, params=params)
+         response.raise_for_status()
+         logger.debug(f"API response received successfully from: {url}")
+         return response.json()
+     except requests.RequestException as e:
+         logger.error(f"API request failed: {str(e)}", exc_info=True)
+         raise
+
+ def display_romanized_text_page(filename):
+     """
+     Display romanized text and PDF details in a Streamlit layout styled to match the given design.
+     """
+     logger.info(f"Displaying romanized text page for file: {filename}")
+     try:
+         st.markdown(
+             """
+             <style>
+             /* Styling for metadata section */
+             .metadata {
+                 display: flex;
+                 justify-content: space-between;
+                 margin-bottom: 20px;
+                 font-family: SFMono-Regular, Menlo, Monaco, Consolas, Liberation Mono, Courier New, monospace;
+                 font-size: 16px;
+                 color: #34495e;
+                 margin-top: 20px;
+             }
+             .metadata div {
+                 text-align: left;
+             }
+
+             /* Styling for page text */
+             .page-section {
+                 margin-bottom: 40px;
+             }
+             .page-header {
+                 font-size: 20px;
+                 color: #3498db;
+                 font-family: SFMono-Regular, Menlo, Monaco, Consolas, Liberation Mono, Courier New, monospace;
+                 margin-bottom: 10px;
+                 font-weight: bold;
+             }
+             .page-text {
+                 font-family: SFMono-Regular, Menlo, Monaco, Consolas, Liberation Mono, Courier New, monospace;
+                 font-size: 16px;
+                 color: #2c3e50;
+                 line-height: 1.5;
+                 margin-bottom: 20px;
+             }
+
+             /* Horizontal rule */
+             hr {
+                 border: 0;
+                 height: 1px;
+                 background: #ddd;
+                 margin: 30px 0;
+             }
+             </style>
+             """,
+             unsafe_allow_html=True
+         )
+         logger.debug("Applied CSS styling")
+
+         # API endpoint for romanized text
+         api_url = f"http://127.0.0.1:8000/romanized-text?filename={filename}"
+
+         # Fetch data from the API
+         response = requests.get(api_url)
+         response.raise_for_status()
+         data = response.json()
+
+         # Page title
+         st.markdown("<h1 style='text-align: center; margin-top: -1%;'>📚 Smart PDF Search</h1>", unsafe_allow_html=True)
+         logger.debug("Rendered page title")
+
+         # Document info section
+         word_count = len(data['full_text'].split())
+         logger.info(f"Displaying document info - Pages: {data['total_pages']}, Size: {data['file_size_kb']}, Words: {word_count}")
+
+         st.markdown(
+             f"""
+             <div class='metadata'>
+                 <div>
+                     <strong>Filename: </strong>{data['filename']} <br>
+                     <strong>Total Pages: </strong>{data['total_pages']} <br>
+                     <strong>File Size: </strong>{data['file_size_kb']} <br>
+                     <strong>Total Words: </strong>{word_count}
+                 </div>
+             </div>
+             """,
+             unsafe_allow_html=True
+         )
+
+         # Display each page's text
+         logger.info(f"Rendering {len(data['pages'])} pages of text")
+         for page in data['pages']:
+             st.markdown(
+                 f"""
+                 <div class='page-section'>
+                     <div class='page-header'>Page {page['page_number']}</div>
+                     <div class='page-text'>{page['text']}</div>
+                     <hr>
+                 </div>
+                 """,
+                 unsafe_allow_html=True
+             )
+         logger.debug("Completed rendering all pages")
+
+     except requests.RequestException as e:
+         logger.error(f"API request failed: {str(e)}", exc_info=True)
+         st.error(f"Error fetching data: {e}")
+     except KeyError as e:
+         logger.error(f"Missing key in API response: {str(e)}", exc_info=True)
+         st.error(f"Missing key in API response: {e}")
+     except Exception as e:
+         logger.error(f"Unexpected error in display_romanized_text_page: {str(e)}", exc_info=True)
+         st.error(f"An unexpected error occurred: {e}")
+
+ def display_pdf_details(filename, page_number):
+     """
+     Display detailed information about a specific PDF page.
+     """
+     logger.info(f"Displaying PDF details for file: {filename}, page: {page_number}")
+
+     # Initialize reader mode state
+     if 'reader_mode' not in st.session_state:
+         st.session_state.reader_mode = False
+         logger.debug("Initialized reader mode state")
+
+     def toggle_reader_mode():
+         """Toggle reader mode state with logging."""
+         previous_state = st.session_state.reader_mode
+         st.session_state.reader_mode = not previous_state
+         logger.info(f"Reader mode toggled from {previous_state} to {st.session_state.reader_mode}")
+
+     try:
+         api_url = f"http://127.0.0.1:8000/pdf-details?filename={filename}&page_number={page_number}"
+         response = requests.get(api_url)
+         logger.debug(f"Retrieved PDF details for page {page_number}")
+
+         if response.status_code == 200:
+             pdf_details = response.json()
+
+             # Enhanced CSS for better styling
+             st.markdown("""
+             <style>
+             .page-container {
+                 background-color: #ffffff;
+                 padding: 30px;
+                 margin: 20px auto;
+                 border-radius: 12px;
+                 box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
+                 max-width: 1200px;
+                 font-family: Arial, sans-serif;
+             }
+             .stApp {
+                 background-color: #f8f9fa;
+             }
+             .detail-box {
+                 border-radius: 12px;
+                 padding: 25px;
+                 margin-bottom: 25px;
+             }
+             .header {
+                 text-align: center;
+                 color: #1a237e;
+                 margin-bottom: 30px;
+                 font-family: 'Helvetica Neue', sans-serif;
+             }
+             .metadata-table {
+                 width: 100%;
+                 border-collapse: collapse;
+                 margin: 20px 0;
+                 font-family: 'Helvetica Neue', sans-serif;
+             }
+             .metadata-table td {
+                 padding: 12px 15px;
+                 border: 1px solid #e0e0e0;
+             }
+             .metadata-table tr:nth-child(even) {
+                 background-color: #f8f9fa;
+             }
+             .metadata-table tr:hover {
+                 background-color: #f5f5f5;
+             }
+             .metadata-table td:first-child {
+                 font-weight: 600;
+                 width: 30%;
+                 color: #2c3e50;
+             }
+             .stButton>button {
+                 width: 100%;
+                 border-radius: 8px;
+                 height: 45px;
+                 margin-top: 10px;
+             }
+             .stTextArea>div>div {
+                 border-radius: 8px;
+             }
+             .page-preview {
+                 border-radius: 8px;
+                 overflow: hidden;
+                 box-shadow: 0 4px 8px rgba(0,0,0,0.1);
+                 max-width: 100%;
+                 max-height: 500px;
+                 margin: auto;
+             }
+             div[data-baseweb="tab"] {
+                 padding: 15px !important;
+             }
+             .stExpander {
+                 border-radius: 8px;
+                 border: 1px solid #e0e0e0;
+                 margin-top: 20px;
+             }
+             .reader-mode {
+                 position: fixed;
+                 top: 0;
+                 left: 0;
+                 width: 100vw;
+                 height: 100vh;
+                 background: rgba(0, 0, 0, 0.9);
+                 z-index: 9999;
+                 display: flex;
+                 justify-content: center;
+                 align-items: center;
+                 padding: 2rem;
+             }
+             .reader-mode img {
+                 max-height: 90vh;
+                 max-width: 90vw;
+                 object-fit: contain;
+             }
+             .close-reader {
+                 position: fixed;
+                 top: 20px;
+                 right: 20px;
+                 color: white;
+                 font-size: 24px;
+                 cursor: pointer;
+                 z-index: 10000;
+             }
+             </style>
+             """, unsafe_allow_html=True)
+             logger.debug("Applied CSS styling")
+
+             # Reader mode display (if active)
+             if st.session_state.reader_mode:
+                 logger.info("Displaying reader mode view")
+                 st.markdown('<div class="reader-mode-container">', unsafe_allow_html=True)
+                 if st.button("❌ Close Reader Mode", key="close_reader", help="Exit reader mode"):
+                     logger.info("Reader mode closed")
+                     st.session_state.reader_mode = False
+                     st.rerun()
+
+                 # Display the zoomed image
+                 page_image_bytes = base64.b64decode(pdf_details['page_image'])
+                 page_image = Image.open(io.BytesIO(page_image_bytes))
+                 st.image(page_image, use_container_width=True, caption=f"Page {pdf_details['current_page']}")
+                 st.markdown('</div>', unsafe_allow_html=True)
+                 return  # Exit early; the regular interface is not shown in reader mode
+
+             logger.info("Displaying regular interface")
+             # Header
+             st.markdown('<h1 class="header">📚 Smart PDF Search</h1>', unsafe_allow_html=True)
+
+             # Main content
+             col1, col2 = st.columns([1.5, 2])
+
+             with col1:
+                 logger.debug("Rendering page preview section")
+                 st.markdown("<h3 style='color: #1a237e; margin-bottom: 15px;'>🖼️ Page Preview</h3>", unsafe_allow_html=True)
+                 st.markdown(f"<div style='text-align: center; padding: 15px;'>Page {page_number + 1} of {pdf_details['total_pages']}</div>", unsafe_allow_html=True)
+                 page_image_bytes = base64.b64decode(pdf_details['page_image'])
+                 page_image = Image.open(io.BytesIO(page_image_bytes))
+
+                 st.image(page_image, caption=f"Page {pdf_details['current_page']}", use_container_width=True)
+                 st.markdown("</div>", unsafe_allow_html=True)
+
+             with col2:
+                 st.markdown("<div class='detail-box'>", unsafe_allow_html=True)
+
+                 # Create 3 equal-width columns
+                 col1, col2, col3 = st.columns(3)
+
+                 # Action buttons inside the columns
+                 with col1:
+                     logger.debug("Rendering Reader Mode button")
+                     st.button("📖 Reader Mode", on_click=toggle_reader_mode)
+
+                 with col2:
+                     if st.button("🔍 Ask a Question"):
+                         logger.info("Ask a Question button clicked")
+                         st.query_params["page"] = "home"  # Use the new query-params API
+                         st.rerun()
+
+                 with col3:
+                     logger.debug("Rendering Romanized Text link")
+                     st.markdown(f"""
+                     <a href="?page=romanized_text&filename={filename}" style="
+                         display: inline-block;
+                         padding: 10px 10px;
+                         font-size: 16px;
+                         font-weight: 400;
+                         color: white;
+                         background-color: #3498db;
+                         border: none;
+                         border-radius: 8px;
+                         text-align: center;
+                         text-decoration: none;
+                         margin-top: 10px;
+                         transition: all 0.3s ease;
+                         text-transform: uppercase;
+                         letter-spacing: 0.5px;
+                         width: -webkit-fill-available;
+                     ">
+                         📄 Romanized Text
+                     </a>
+                     """, unsafe_allow_html=True)
+
+                 # Page content in expander
+                 with st.expander("📄 Page Content", expanded=True):
+                     logger.debug("Displaying page content in expander")
+                     st.markdown(pdf_details['page_text'], unsafe_allow_html=True)
+
+                 logger.debug("Rendering metadata table")
+                 # Metadata table
+                 metadata_html = f"""
+                 <table class="metadata-table">
+                     <tr><td>PDF Name</td><td>{pdf_details.get('metadata', {}).get('title', filename)}</td></tr>
+                     <tr><td>Page</td><td>{page_number + 1}</td></tr>
+                     <tr><td>Author</td><td>{pdf_details.get('metadata', {}).get('author', 'N/A')}</td></tr>
+                     <tr><td>Total Pages</td><td>{pdf_details['total_pages']}</td></tr>
+                     <tr><td>Language</td><td>{pdf_details['language']}</td></tr>
+                     <tr><td>File Size</td><td>{pdf_details['file_size_kb']}</td></tr>
+                 </table>
+                 """
+                 st.markdown(metadata_html, unsafe_allow_html=True)
+                 logger.info(f"Completed rendering PDF details page for {filename}")
+         else:
+             st.error(f"Error fetching PDF details: {response.text}")
+
+     except Exception as e:
+         logger.error(f"Error in display_pdf_details: {str(e)}", exc_info=True)
+         st.error(f"An error occurred: {e}")
requirements.txt ADDED
@@ -0,0 +1,169 @@
+ aiohappyeyeballs==2.4.4
+ aiohttp==3.11.11
+ aiosignal==1.3.2
+ altair==5.5.0
+ annotated-types==0.7.0
+ anyio==4.8.0
+ asgiref==3.8.1
+ async-timeout==4.0.3
+ attrs==24.3.0
+ backoff==2.2.1
+ bcrypt==4.2.1
+ blinker==1.9.0
+ build==1.2.2.post1
+ cachetools==5.5.1
+ certifi==2024.12.14
+ charset-normalizer==3.4.1
+ chroma-hnswlib==0.7.6
+ chromadb==0.6.3
+ click==8.1.8
+ coloredlogs==15.0.1
+ dataclasses-json==0.6.7
+ Deprecated==1.2.15
+ distro==1.9.0
+ durationpy==0.9
+ exceptiongroup==1.2.2
+ fastapi==0.115.7
+ filelock==3.17.0
+ flatbuffers==25.1.21
+ frozenlist==1.5.0
+ fsspec==2024.12.0
+ gitdb==4.0.12
+ GitPython==3.1.44
+ google-auth==2.38.0
+ googleapis-common-protos==1.66.0
+ greenlet==3.1.1
+ groq==0.15.0
+ grpcio==1.70.0
+ h11==0.14.0
+ httpcore==1.0.7
+ httptools==0.6.4
+ httpx==0.28.1
+ httpx-sse==0.4.0
+ huggingface-hub==0.27.1
+ humanfriendly==10.0
+ idna==3.10
+ importlib_metadata==8.5.0
+ importlib_resources==6.5.2
+ Jinja2==3.1.5
+ joblib==1.4.2
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ kubernetes==32.0.0
+ langchain==0.3.15
+ langchain-community==0.3.15
+ langchain-core==0.3.31
+ langchain-groq==0.2.3
+ langchain-text-splitters==0.3.5
+ langdetect==1.0.9
+ langsmith==0.3.1
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ marshmallow==3.26.0
+ mdurl==0.1.2
+ mmh3==5.0.1
+ monotonic==1.6
+ mpmath==1.3.0
+ multidict==6.1.0
+ mypy-extensions==1.0.0
+ narwhals==1.23.0
+ networkx==3.4.2
+ nltk==3.9.1
+ numpy==1.26.4
+ nvidia-cublas-cu12==12.4.5.8
+ nvidia-cuda-cupti-cu12==12.4.127
+ nvidia-cuda-nvrtc-cu12==12.4.127
+ nvidia-cuda-runtime-cu12==12.4.127
+ nvidia-cudnn-cu12==9.1.0.70
+ nvidia-cufft-cu12==11.2.1.3
+ nvidia-curand-cu12==10.3.5.147
+ nvidia-cusolver-cu12==11.6.1.9
+ nvidia-cusparse-cu12==12.3.1.170
+ nvidia-nccl-cu12==2.21.5
+ nvidia-nvjitlink-cu12==12.4.127
+ nvidia-nvtx-cu12==12.4.127
+ oauthlib==3.2.2
+ onnxruntime==1.20.1
+ opentelemetry-api==1.29.0
+ opentelemetry-exporter-otlp-proto-common==1.29.0
+ opentelemetry-exporter-otlp-proto-grpc==1.29.0
+ opentelemetry-instrumentation==0.50b0
+ opentelemetry-instrumentation-asgi==0.50b0
+ opentelemetry-instrumentation-fastapi==0.50b0
+ opentelemetry-proto==1.29.0
+ opentelemetry-sdk==1.29.0
+ opentelemetry-semantic-conventions==0.50b0
+ opentelemetry-util-http==0.50b0
+ orjson==3.10.15
+ overrides==7.7.0
+ packaging==24.2
+ pandas==2.2.3
+ pillow==11.1.0
+ posthog==3.9.3
+ propcache==0.2.1
+ protobuf==5.29.3
+ pyarrow==19.0.0
+ pyasn1==0.6.1
+ pyasn1_modules==0.4.1
+ pycountry==24.6.1
+ pydantic==2.10.6
+ pydantic-settings==2.7.1
+ pydantic_core==2.27.2
+ pydeck==0.9.1
+ Pygments==2.19.1
+ PyMuPDF==1.25.2
+ PyPika==0.48.9
+ pyproject_hooks==1.2.0
+ pytesseract==0.3.13
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.0.1
+ pytz==2024.2
+ PyYAML==6.0.2
+ referencing==0.36.1
+ regex==2024.11.6
+ requests==2.32.3
+ requests-oauthlib==2.0.0
+ requests-toolbelt==1.0.0
+ rich==13.9.4
+ rpds-py==0.22.3
+ rsa==4.9
+ safetensors==0.5.2
+ scikit-learn==1.6.1
+ scipy==1.15.1
+ sentence-transformers==3.4.0
+ shellingham==1.5.4
+ six==1.17.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ SQLAlchemy==2.0.37
+ starlette==0.45.2
+ streamlit==1.41.1
+ sympy==1.13.1
+ tenacity==9.0.0
+ threadpoolctl==3.5.0
+ tokenizers==0.21.0
+ toml==0.10.2
+ tomli==2.2.1
+ torch==2.5.1
+ tornado==6.4.2
+ tqdm==4.67.1
+ transformers==4.48.1
+ transliterate==1.10.2
+ triton==3.1.0
+ typer==0.15.1
+ typing-inspect==0.9.0
+ typing_extensions==4.12.2
+ tzdata==2025.1
+ urllib3==2.3.0
+ uvicorn==0.34.0
+ uvloop==0.21.0
+ watchdog==6.0.0
+ watchfiles==1.0.4
+ websocket-client==1.8.0
+ websockets==14.2
+ wrapt==1.17.2
+ yarl==1.18.3
+ zipp==3.21.0
+ zstandard==0.23.0
upload_pdf.py ADDED
@@ -0,0 +1,200 @@
+ import os
+ import uuid
+ import json
+ from config import save_config
+ from dotenv import load_dotenv
+ from log_utils import setup_logging
+ from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import Chroma
+
+ CONFIG_FILE = 'config.json'
+
+ # Load environment variables
+ load_dotenv()
+
+ logger = setup_logging('upload_pdf')
+
+ def load_documents(data_path):
+     """Load PDF documents from the specified directory."""
+     logger.info(f"Starting document loading from directory: {data_path}")
+
+     if not os.path.exists(data_path):
+         logger.error(f"Directory not found: {data_path}")
+         raise FileNotFoundError(f"Directory not found: {data_path}")
+
+     directory_loader = DirectoryLoader(
+         data_path,
+         loader_cls=PyMuPDFLoader,
+         show_progress=True
+     )
+
+     try:
+         documents = directory_loader.load()
+         logger.info(f"Successfully loaded {len(documents)} documents")
+         return documents
+     except Exception as e:
+         logger.error(f"Error loading documents: {str(e)}", exc_info=True)
+         raise
+
+ def store_full_content(documents):
+     """Store full page content in document metadata."""
+     logger.info("Starting to store full page content in metadata")
+     try:
+         for doc in documents:
+             doc.metadata['full_page_content'] = doc.page_content
+             logger.debug(f"Stored full content for page {doc.metadata.get('page', 'Unknown')} "
+                          f"from {os.path.basename(doc.metadata.get('file_path', 'Unknown'))}")
+         logger.info(f"Successfully stored full content for {len(documents)} documents")
+         return documents
+     except Exception as e:
+         logger.error(f"Error storing full content: {str(e)}", exc_info=True)
+         raise
+
+ def process_documents(documents):
+     """Process documents into chunks and add metadata."""
+     logger.info("Starting document processing")
+
+     try:
+         # First store the full page content
+         documents = store_full_content(documents)
+
+         logger.info("Converting documents to chunks")
+         text_splitter = RecursiveCharacterTextSplitter(chunk_size=384, chunk_overlap=20)
+         chunks = text_splitter.split_documents(documents)
+
+         # Add a UUID to each chunk and keep the full page content in its metadata
+         for chunk in chunks:
+             chunk.metadata['chunk_id'] = str(uuid.uuid4())
+             chunk.metadata.setdefault('full_page_content', chunk.page_content)
+
+         logger.info(f"Document processing completed. Total chunks created: {len(chunks)}")
+         return chunks
+     except Exception as e:
+         logger.error(f"Error processing documents: {str(e)}", exc_info=True)
+         raise
+
+ def initialize_embedding_model():
+     """Initialize and return the embedding model."""
+     logger.info("Initializing embedding model")
+     try:
+         embedding_model = HuggingFaceEmbeddings(
+             model_name='all-MiniLM-L6-v2',
+             model_kwargs={'device': 'cpu'},
+             encode_kwargs={'normalize_embeddings': True}
+         )
+         logger.info("Embedding model initialized successfully")
+         return embedding_model
+     except Exception as e:
+         logger.error(f"Error initializing embedding model: {str(e)}", exc_info=True)
+         raise
+
+ def create_vectordb(chunks, embedding_model, persist_directory, collection_name):
+     """Create and persist a ChromaDB instance."""
+     logger.info(f"Creating Chroma instance with collection name: {collection_name}")
+     try:
+         vectordb = Chroma.from_documents(
+             documents=chunks,
+             embedding=embedding_model,
+             persist_directory=persist_directory,
+             collection_name=collection_name
+         )
+         vectordb.persist()
+         logger.info("Vector database created and persisted successfully")
+         return vectordb
+     except Exception as e:
+         logger.error(f"Error creating vector database: {str(e)}", exc_info=True)
+         raise
+
+ def update_or_add_pdf(uploaded_file, data_path, persist_directory, collection_name):
+     """Add or replace a PDF in the system."""
+     logger.info(f"Processing uploaded file: {uploaded_file.name}")
+
+     if not uploaded_file.name.lower().endswith('.pdf'):
+         logger.warning(f"Rejected non-PDF file: {uploaded_file.name}")
+         return False
+
+     file_path = os.path.join(data_path, uploaded_file.name)
+
+     try:
+         # Remove the existing PDF if it exists
+         if os.path.exists(file_path):
+             os.remove(file_path)
+             logger.info(f"Deleted existing PDF: {uploaded_file.name}")
+
+         # Save the uploaded PDF
+         with open(file_path, 'wb') as f:
+             f.write(uploaded_file.getvalue())
+         logger.info(f"Saved new PDF: {uploaded_file.name}")
+
+         # Load and process only the new document
+         documents = load_documents(data_path)
+         new_documents = [doc for doc in documents if os.path.basename(doc.metadata.get('file_path', '')) == uploaded_file.name]
+
+         if not new_documents:
+             logger.error(f"No documents found for uploaded file: {uploaded_file.name}")
+             return False
+
+         chunks = process_documents(new_documents)
+         embedding_model = initialize_embedding_model()
+
+         # Update the vector database
+         vectordb = Chroma(
+             persist_directory=persist_directory,
+             embedding_function=embedding_model,
+             collection_name=collection_name
+         )
+
+         # Remove existing vectors for this file
+         existing_docs = vectordb.get(where={"source": file_path})
+         if existing_docs['ids']:
+             vectordb.delete(existing_docs['ids'])
+             logger.info(f"Removed existing vectors for {uploaded_file.name}")
+
+         # Add the new vectors
+         vectordb.add_documents(documents=chunks)
+         vectordb.persist()
+         logger.info(f"Successfully updated {uploaded_file.name} in vector database")
+
+         return True
+     except Exception as e:
+         logger.error(f"Error processing uploaded PDF {uploaded_file.name}: {str(e)}", exc_info=True)
+         return False
+
+ def main():
+     logger.info("Starting PDF processing pipeline")
+     try:
+         with open(CONFIG_FILE, 'r') as f:
+             config = json.load(f)
+
+         # Configuration; PERSIST_DIRECTORY from the environment overrides config.json
+         data_path = config.get('data_path')
+         persist_directory = os.environ.get('PERSIST_DIRECTORY', config.get('persist_directory'))
+         collection_name = config.get('collection_name')
+
+         logger.info(f"Using configuration - data_path: {data_path}, "
+                     f"persist_directory: {persist_directory}, "
+                     f"collection_name: {collection_name}")
+
+         # Save configuration
+         save_config(data_path, persist_directory, collection_name)
+         logger.info("Configuration saved successfully")
+
+         # Process pipeline
+         documents = load_documents(data_path)
+         chunks = process_documents(documents)
+         embedding_model = initialize_embedding_model()
+         create_vectordb(chunks, embedding_model, persist_directory, collection_name)
+
+         logger.info("PDF processing pipeline completed successfully!")
+
+     except Exception:
+         logger.error("Fatal error in PDF processing pipeline", exc_info=True)
+         raise
+
+ if __name__ == "__main__":
+     main()
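Once the pipeline has run, the persisted store can be queried directly. A minimal retrieval sketch, assuming the defaults from `config.json` and using a hypothetical query against the bundled Cyber_Security.pdf:

```python
# Opens the Chroma collection built above and runs a plain similarity search.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

embedding_model = HuggingFaceEmbeddings(
    model_name='all-MiniLM-L6-v2',
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)
vectordb = Chroma(
    persist_directory="db",
    embedding_function=embedding_model,
    collection_name="smart_pdf_search"
)
# "What is phishing?" is an illustrative query, not part of the upload
for doc in vectordb.similarity_search("What is phishing?", k=2):
    print(doc.metadata.get("file_path"), "page", doc.metadata.get("page"))
```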