Upload 3 files
- build_faiss.py +86 -0
- chatbot.py +85 -0
- load_data.py +64 -0
build_faiss.py
ADDED
@@ -0,0 +1,86 @@
import os
import sys
from dotenv import load_dotenv

from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

# Load API Key
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")

if not api_key:
    raise ValueError("Google API Key not found. Please set it in your .env file.")

# Path to Constitution documents
data_path = "data"  # Put Constitution of India and related PDFs here
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Directory '{data_path}' not found. Place your documents there.")

print(f"Current working directory: {os.getcwd()}")
print(f"Looking for Constitution documents in: {os.path.abspath(data_path)}")

# Load PDFs
documents = []
pdf_files = [f for f in os.listdir(data_path) if f.lower().endswith('.pdf')]
print(f"Found {len(pdf_files)} PDF files")

for pdf_file in pdf_files:
    try:
        pdf_path = os.path.join(data_path, pdf_file)
        print(f"Loading PDF: {pdf_path}")
        loader = PyPDFLoader(pdf_path)
        pdf_docs = loader.load()
        print(f" - Loaded {len(pdf_docs)} pages from {pdf_file}")
        documents.extend(pdf_docs)
    except Exception as e:
        print(f"Error loading {pdf_file}: {e}")

# Fallback if no documents loaded
if not documents:
    print("No documents were loaded. Trying DirectoryLoader as fallback...")
    try:
        loader = DirectoryLoader(data_path, glob="**/*.pdf")
        documents = loader.load()
        print(f"Loaded {len(documents)} documents with DirectoryLoader")
    except Exception as e:
        print(f"DirectoryLoader failed: {e}")

if not documents:
    print("ERROR: No Constitution documents loaded. Exiting.")
    sys.exit(1)

# Split documents
print(f"Splitting {len(documents)} documents into chunks...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
docs = text_splitter.split_documents(documents)
print(f"Split into {len(docs)} chunks")

if not docs:
    print("ERROR: No text chunks created after splitting.")
    sys.exit(1)

# Create embeddings & FAISS index
try:
    print("Initializing embedding model...")
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    print("Testing embedding generation...")
    test_embedding = embeddings.embed_query("Constitution of India test")
    print(f"Test embedding successful, vector length: {len(test_embedding)}")

    print(f"Creating FAISS index for {len(docs)} chunks...")
    faiss_index = FAISS.from_documents(docs, embeddings)

    index_path = "vector_store/faiss_index_constitution"
    os.makedirs(index_path, exist_ok=True)
    faiss_index.save_local(index_path)

    print("✅ Constitution FAISS Index successfully created and saved!")
except Exception as e:
    print(f"ERROR during embedding or indexing: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
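
A quick way to sanity-check the saved index, sketched below under the assumption that the script above completed and that GOOGLE_API_KEY is set in .env: load the store back from vector_store/faiss_index_constitution and run a plain similarity search before wiring it into the chatbot. The file name and query string are only illustrative.

# verify_index.py (illustrative sketch, not part of this commit)
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings

load_dotenv()  # expects GOOGLE_API_KEY in .env

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
db = FAISS.load_local(
    "vector_store/faiss_index_constitution",
    embeddings,
    allow_dangerous_deserialization=True,  # FAISS stores are pickled on disk
)

# Retrieve the three closest chunks and show where each came from
for doc in db.similarity_search("Which Article guarantees equality before the law?", k=3):
    print(doc.metadata.get("source"), doc.metadata.get("page"), doc.page_content[:120])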
chatbot.py
ADDED
@@ -0,0 +1,85 @@
import os
from dotenv import load_dotenv
from langchain_google_genai import GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Load API key
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("Google API Key not found. Please set it in your .env file.")

# Path to FAISS index - matches build_faiss.py
faiss_path = "vector_store/faiss_index_constitution"
if not os.path.exists(f"{faiss_path}/index.faiss"):
    raise FileNotFoundError(f"FAISS index not found at {faiss_path}. Please build the index first.")

# Load vector store
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
db = FAISS.load_local(faiss_path, embeddings, allow_dangerous_deserialization=True)
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# LLM model
llm = GoogleGenerativeAI(model="gemini-1.5-flash", api_key=api_key)

# Prompt for constitutional expertise
prompt_template = """
You are a constitutional expert specializing in the Constitution of India.
Provide accurate, clear, and unbiased legal explanations.

User Question:
{question}

Relevant Context from the Constitution:
{context}

Instructions:
- Base your answer strictly on the given context and your knowledge of the Constitution.
- Cite Article numbers and headings when possible.
- Stay neutral, factual, and avoid personal opinions.
- If the context is insufficient, say so and provide general constitutional principles.

Now provide the answer:
"""

PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# Retrieval-based QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
    chain_type="stuff"
)

def ask_samvidhan(question: str) -> str:
    """Answer queries about the Constitution of India with sources."""
    result = qa_chain({"query": question})
    answer = result.get("result", "Sorry, I couldn't find an answer.")
    sources = result.get("source_documents", [])

    if sources:
        answer += "\n\n**Sources:**"
        seen = set()
        for doc in sources:
            src = doc.metadata.get("source", "Unknown")
            page = doc.metadata.get("page", "")
            src_info = f"{src} (Page {page})" if page else src
            if src_info not in seen:
                seen.add(src_info)
                answer += f"\n- {src_info}"

    return answer

# Alias for backward compatibility if needed
ask_samvidhan_chatbot = ask_samvidhan

if __name__ == "__main__":
    while True:
        query = input("Ask me about the Constitution of India: ")
        if query.lower() == "exit":
            break
        print("📜:", ask_samvidhan(query))
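
Because the retriever, LLM, and QA chain are built at module import time, importing chatbot.py requires the FAISS index from build_faiss.py and a valid GOOGLE_API_KEY. A minimal usage sketch, with a purely illustrative question:

# Illustrative usage of chatbot.py from another module
from chatbot import ask_samvidhan

answer = ask_samvidhan("What does Article 21 of the Constitution of India guarantee?")
print(answer)  # answer text followed by a "**Sources:**" list built from document metadata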
load_data.py
ADDED
@@ -0,0 +1,64 @@
from langchain_community.document_loaders import PyPDFLoader, TextLoader, CSVLoader  # Updated imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import glob

def load_and_split():
    """
    Load documents from various sources and split into chunks with metadata preservation.

    Returns:
        list: List of Document objects with text and metadata
    """
    documents = []

    # Load all PDFs in data folder
    pdf_files = glob.glob("data/*.pdf")
    for file in pdf_files:
        loader = PyPDFLoader(file)
        docs = loader.load()
        # Add file source to metadata
        for doc in docs:
            doc.metadata["source"] = os.path.basename(file)
        documents.extend(docs)

    # Load all text files
    txt_files = glob.glob("data/*.txt")
    for file in txt_files:
        loader = TextLoader(file)
        docs = loader.load()
        # Add metadata
        for doc in docs:
            doc.metadata["source"] = os.path.basename(file)
        documents.extend(docs)

    # Load all CSV files
    csv_files = glob.glob("data/*.csv")
    for file in csv_files:
        try:
            loader = CSVLoader(file)
            docs = loader.load()
            # Add metadata
            for doc in docs:
                doc.metadata["source"] = os.path.basename(file)
            documents.extend(docs)
        except Exception as e:
            print(f"Error loading CSV file {file}: {e}")

    print(f"Loaded {len(documents)} documents from {len(pdf_files)} PDFs, {len(txt_files)} TXTs, and {len(csv_files)} CSVs")

    # Split documents into manageable chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # Increased chunk size for better context
        chunk_overlap=200,  # Increased overlap
        separators=["\n\n", "\n", " ", ""]
    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Split into {len(split_docs)} chunks")

    return split_docs

if __name__ == "__main__":
    documents = load_and_split()
    print(f"Loaded and split {len(documents)} text chunks from all documents!")
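
load_and_split() is not called by build_faiss.py in this commit (that script loads PDFs itself), but its output is the same kind of Document chunk list, so it could feed the indexing step directly. A hypothetical glue sketch, assuming the same data/ folder and embedding model used above:

# Hypothetical glue (not part of this commit): index the chunks from load_and_split()
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings

from load_data import load_and_split

load_dotenv()  # expects GOOGLE_API_KEY in .env

docs = load_and_split()  # PDFs, TXTs, and CSVs from data/, already split into chunks
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
FAISS.from_documents(docs, embeddings).save_local("vector_store/faiss_index_constitution")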