# Source: constitutional_bot / build_faiss.py
# Uploaded by: sankalp2606
# Commit: d07fe0d (verified) - "Upload 3 files"
import os
from dotenv import load_dotenv
import sys
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
# Resolve the Google API key. load_dotenv() is called first so that a key kept
# in a .env file (the location the error message points users to) is picked up.
load_dotenv()
api_key = os.environ.get("GOOGLE_API_KEY")
if api_key is None or api_key == "":
    # Abort before any documents are loaded or any embedding call is made.
    raise ValueError("Google API Key not found. Please set it in your .env file.")
# Path to Constitution documents
data_path = "data"  # Put Constitution of India and related PDFs here
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Directory '{data_path}' not found. Place your documents there.")
# A regular file with this name passes the existence check above and would only
# fail later, when the path is listed, with a bare OS error. Raise the same
# exception type here, but with a message that says what to do about it.
if not os.path.isdir(data_path):
    raise NotADirectoryError(
        f"'{data_path}' exists but is not a directory. Place your documents in a folder with that name."
    )
# Echo the resolved locations: data_path is relative, so what gets scanned
# depends on the directory the script is started from.
print(f"Current working directory: {os.getcwd()}")
print(f"Looking for Constitution documents in: {os.path.abspath(data_path)}")
# Load PDFs
# Collects the loaded pages of every *.pdf directly inside data_path
# (non-recursive, case-insensitive extension match) into `documents`.
documents = []
# os.listdir() returns entries in arbitrary order; sorting keeps the load order
# (and therefore the order of the chunks later fed to the index) stable across
# runs and machines.
pdf_files = sorted(f for f in os.listdir(data_path) if f.lower().endswith(".pdf"))
print(f"Found {len(pdf_files)} PDF files")
for pdf_file in pdf_files:
    pdf_path = os.path.join(data_path, pdf_file)
    print(f"Loading PDF: {pdf_path}")
    try:
        loader = PyPDFLoader(pdf_path)
        pdf_docs = loader.load()
    except Exception as e:
        # Deliberate best effort: one unreadable file is reported and skipped
        # so the remaining PDFs can still be indexed.
        print(f"Error loading {pdf_file}: {e}")
        continue
    print(f" - Loaded {len(pdf_docs)} pages from {pdf_file}")
    documents.extend(pdf_docs)
# Fallback if no documents loaded
# When the per-file pass produced nothing, retry once with DirectoryLoader,
# which (unlike the listing above) matches PDFs recursively via "**/*.pdf".
if len(documents) == 0:
    print("No documents were loaded. Trying DirectoryLoader as fallback...")
    try:
        documents = DirectoryLoader(data_path, glob="**/*.pdf").load()
        print(f"Loaded {len(documents)} documents with DirectoryLoader")
    except Exception as exc:
        # The failure is only reported; the emptiness check below decides
        # whether the script can continue.
        print(f"DirectoryLoader failed: {exc}")

# Still nothing to index after both attempts: stop with a non-zero exit status.
if len(documents) == 0:
    print("ERROR: No Constitution documents loaded. Exiting.")
    raise SystemExit(1)
# Split documents
# Breaks the loaded documents into overlapping chunks (chunk_size=800,
# chunk_overlap=100); the resulting `docs` list is what gets embedded.
print(f"Splitting {len(documents)} documents into chunks...")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
)
docs = text_splitter.split_documents(documents)
print(f"Split into {len(docs)} chunks")

# No chunks means there is nothing to embed: stop with a non-zero exit status.
if len(docs) == 0:
    print("ERROR: No text chunks created after splitting.")
    raise SystemExit(1)
# Create embeddings & FAISS index
# Any failure while embedding, indexing or saving is reported with a full
# traceback and ends the script with exit status 1.
index_path = "vector_store/faiss_index_constitution"
try:
    print("Initializing embedding model...")
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    # Embed one short query before the whole corpus -- presumably so that
    # credential or model problems surface before every chunk is submitted.
    print("Testing embedding generation...")
    probe_vector = embeddings.embed_query("Constitution of India test")
    print(f"Test embedding successful, vector length: {len(probe_vector)}")

    print(f"Creating FAISS index for {len(docs)} chunks...")
    faiss_index = FAISS.from_documents(docs, embeddings)

    # Make sure the target directory exists, then persist the index there.
    os.makedirs(index_path, exist_ok=True)
    faiss_index.save_local(index_path)
    print("✅ Constitution FAISS Index successfully created and saved!")
except Exception as exc:
    import traceback

    print(f"ERROR during embedding or indexing: {exc}")
    traceback.print_exc()
    raise SystemExit(1)