|
import os |
|
import json |
|
from PyPDF2 import PdfReader |
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
from langchain_huggingface import HuggingFaceEmbeddings |
|
from langchain_chroma import Chroma |
|
from langchain_groq import ChatGroq |
|
from langchain.chains import RetrievalQA |
|
|
|
|
|
# Directory containing this module; the persistent vector store is kept
# in a sub-directory of it.
working_dir = os.path.dirname(os.path.abspath(__file__))

# ChatGroq is constructed below without an explicit key, so it relies on the
# GROQ_API_KEY environment variable. Copying the variable onto itself does
# nothing when it is set and fails with an opaque TypeError when it is not
# (os.environ only accepts strings), so check for it explicitly instead.
if os.getenv("GROQ_API_KEY") is None:
    raise RuntimeError(
        "GROQ_API_KEY environment variable is not set; "
        "define it before importing this module."
    )

# Shared embedding function (library default model, since none is named here),
# used both when indexing documents and when querying the store.
embedding = HuggingFaceEmbeddings()

# Shared chat model used to generate answers.
llm = ChatGroq(
    model="deepseek-r1-distill-llama-70b",
    temperature=0,
)
|
|
|
|
|
def extract_text_from_pdf(file_path: str) -> str:
    """Extract the text content of a PDF file using PyPDF2.

    Args:
        file_path: Location of the PDF; passed straight to ``PdfReader``.

    Returns:
        The text of every page in document order, each page followed by a
        newline character.

    Raises:
        RuntimeError: If reading or parsing the file fails. The underlying
            exception is attached as ``__cause__``.
    """
    try:
        reader = PdfReader(file_path)
        # Build the result with a single join instead of growing a string page
        # by page; ``or ''`` keeps a page that yields no text from breaking
        # the concatenation.
        return "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)
    except Exception as e:
        raise RuntimeError(f"⚠️ Error extracting text from PDF: {e}") from e
|
|
|
|
|
def process_document_to_chroma_db(directory_path: str) -> str:
    """Index every PDF in a directory into the persistent Chroma store.

    For each file whose name ends in ``.pdf`` (case-insensitive) the text is
    extracted, split into overlapping chunks, and embedded into the
    ``doc_vectorstore`` directory next to this module. ``Chroma.from_texts``
    is called once per file against that same directory.

    Args:
        directory_path: Directory to scan. Only its direct entries are
            considered (``os.listdir``), not sub-directories.

    Returns:
        A success message string.

    Raises:
        RuntimeError: If listing, extraction, splitting or storage fails, or
            if a PDF yields no text. Processing stops at the first failure;
            the underlying exception is attached as ``__cause__``.
    """
    try:
        # Loop-invariant objects are created once rather than per file.
        persist_directory = os.path.join(working_dir, "doc_vectorstore")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=200,
        )

        for file_name in os.listdir(directory_path):
            # Accept ".PDF" / ".Pdf" as well as ".pdf".
            if not file_name.lower().endswith(".pdf"):
                continue

            file_path = os.path.join(directory_path, file_name)
            # NOTE(review): the decorative prefix of this message and of the
            # "Splitting" message below was mis-encoded beyond recovery and
            # has been dropped.
            print(f"Processing document: {file_name}")

            text = extract_text_from_pdf(file_path)
            if not text.strip():
                raise ValueError(
                    f"⚠️ No text extracted from '{file_name}'. The file might be empty."
                )

            print("Splitting document into smaller chunks...")
            texts = text_splitter.split_text(text)

            print("💾 Storing embeddings in ChromaDB...")
            # The returned store object is not needed here; the call is made
            # for its effect on the persist directory.
            Chroma.from_texts(
                texts=texts,
                embedding=embedding,
                persist_directory=persist_directory,
            )

        print("✅ All documents successfully processed and stored in ChromaDB.")
        return "✅ Documents successfully processed and stored in ChromaDB."

    except Exception as e:
        raise RuntimeError(f"⚠️ Error processing documents: {e}") from e
|
|
|
|
|
def answer_question(user_question: str):
    """Answer a question from the documents stored in the Chroma store.

    Opens the ``doc_vectorstore`` directory next to this module, builds a
    "stuff" ``RetrievalQA`` chain over it with the module-level ``llm``, and
    runs the question through that chain.

    Args:
        user_question: The question text, sent to the chain as ``query``.

    Returns:
        The chain's ``result`` entry, or a fallback message when the response
        has no ``result`` key.

    Raises:
        RuntimeError: If the vector store directory does not exist or any
            step of retrieval/generation fails. The underlying exception
            (e.g. ``FileNotFoundError``) is attached as ``__cause__``.
    """
    try:
        vectordb_path = os.path.join(working_dir, "doc_vectorstore")
        if not os.path.exists(vectordb_path):
            raise FileNotFoundError(
                "⚠️ ChromaDB vector store not found. Please process a document first."
            )

        # NOTE(review): the decorative prefix of this message was mis-encoded
        # beyond recovery and has been dropped.
        print("Loading vector database...")
        vectordb = Chroma(
            persist_directory=vectordb_path,
            embedding_function=embedding,
        )

        retriever = vectordb.as_retriever()

        print("🤖 Initializing Retrieval QA chain...")
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
        )

        print("💬 Generating answer...")
        response = qa_chain.invoke({"query": user_question})
        return response.get("result", "⚠️ No response generated.")

    except Exception as e:
        raise RuntimeError(f"⚠️ Error generating response: {e}") from e
|
|