sankalp2606 committed
Commit d07fe0d · verified · 1 Parent(s): 7c30e40

Upload 3 files

Files changed (3)
  1. build_faiss.py +86 -0
  2. chatbot.py +85 -0
  3. load_data.py +64 -0
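
Taken together, the three files form a small retrieval-augmented QA pipeline: load_data.py and build_faiss.py load the Constitution documents and build a FAISS index under vector_store/faiss_index_constitution, which chatbot.py then queries through a Gemini model.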
build_faiss.py ADDED
@@ -0,0 +1,86 @@
+ import os
+ from dotenv import load_dotenv
+ import sys
+
+ from langchain_community.vectorstores import FAISS
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
+
+ # Load API Key
+ load_dotenv()
+ api_key = os.getenv("GOOGLE_API_KEY")
+
+ if not api_key:
+     raise ValueError("Google API Key not found. Please set it in your .env file.")
+
+ # Path to Constitution documents
+ data_path = "data"  # Put Constitution of India and related PDFs here
+ if not os.path.exists(data_path):
+     raise FileNotFoundError(f"Directory '{data_path}' not found. Place your documents there.")
+
+ print(f"Current working directory: {os.getcwd()}")
+ print(f"Looking for Constitution documents in: {os.path.abspath(data_path)}")
+
+ # Load PDFs
+ documents = []
+ pdf_files = [f for f in os.listdir(data_path) if f.lower().endswith('.pdf')]
+ print(f"Found {len(pdf_files)} PDF files")
+
+ for pdf_file in pdf_files:
+     try:
+         pdf_path = os.path.join(data_path, pdf_file)
+         print(f"Loading PDF: {pdf_path}")
+         loader = PyPDFLoader(pdf_path)
+         pdf_docs = loader.load()
+         print(f" - Loaded {len(pdf_docs)} pages from {pdf_file}")
+         documents.extend(pdf_docs)
+     except Exception as e:
+         print(f"Error loading {pdf_file}: {e}")
+
+ # Fallback if no documents loaded
+ if not documents:
+     print("No documents were loaded. Trying DirectoryLoader as fallback...")
+     try:
+         loader = DirectoryLoader(data_path, glob="**/*.pdf")
+         documents = loader.load()
+         print(f"Loaded {len(documents)} documents with DirectoryLoader")
+     except Exception as e:
+         print(f"DirectoryLoader failed: {e}")
+
+ if not documents:
+     print("ERROR: No Constitution documents loaded. Exiting.")
+     sys.exit(1)
+
+ # Split documents
+ print(f"Splitting {len(documents)} documents into chunks...")
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
+ docs = text_splitter.split_documents(documents)
+ print(f"Split into {len(docs)} chunks")
+
+ if not docs:
+     print("ERROR: No text chunks created after splitting.")
+     sys.exit(1)
+
+ # Create embeddings & FAISS index
+ try:
+     print("Initializing embedding model...")
+     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+
+     print("Testing embedding generation...")
+     test_embedding = embeddings.embed_query("Constitution of India test")
+     print(f"Test embedding successful, vector length: {len(test_embedding)}")
+
+     print(f"Creating FAISS index for {len(docs)} chunks...")
+     faiss_index = FAISS.from_documents(docs, embeddings)
+
+     index_path = "vector_store/faiss_index_constitution"
+     os.makedirs(index_path, exist_ok=True)
+     faiss_index.save_local(index_path)
+
+     print("✅ Constitution FAISS Index successfully created and saved!")
+ except Exception as e:
+     print(f"ERROR during embedding or indexing: {e}")
+     import traceback
+     traceback.print_exc()
+     sys.exit(1)
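
For quick reference, a minimal sketch of preparing and running this script. The key name GOOGLE_API_KEY and the data/ directory come from the code above; the key value is a placeholder:

# .env (same directory as build_faiss.py)
GOOGLE_API_KEY=your-api-key-here

# with the Constitution PDFs placed in data/:
python build_faiss.py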
chatbot.py ADDED
@@ -0,0 +1,85 @@
+ import os
+ from dotenv import load_dotenv
+ from langchain_google_genai import GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
+ from langchain_community.vectorstores import FAISS
+ from langchain.chains import RetrievalQA
+ from langchain.prompts import PromptTemplate
+
+ # Load API key
+ load_dotenv()
+ api_key = os.getenv("GOOGLE_API_KEY")
+ if not api_key:
+     raise ValueError("Google API Key not found. Please set it in your .env file.")
+
+ # Path to FAISS index - updated to match build_faiss.py
+ faiss_path = "vector_store/faiss_index_constitution"
+ if not os.path.exists(f"{faiss_path}/index.faiss"):
+     raise FileNotFoundError(f"FAISS index not found at {faiss_path}. Please build the index first.")
+
+ # Load vector store
+ embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+ db = FAISS.load_local(faiss_path, embeddings, allow_dangerous_deserialization=True)
+ retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})
+
+ # LLM model
+ llm = GoogleGenerativeAI(model="gemini-1.5-flash", api_key=api_key)
+
+ # Prompt for constitutional expertise
+ prompt_template = """
+ You are a constitutional expert specializing in the Constitution of India.
+ Provide accurate, clear, and unbiased legal explanations.
+
+ User Question:
+ {question}
+
+ Relevant Context from the Constitution:
+ {context}
+
+ Instructions:
+ - Base your answer strictly on the given context and your knowledge of the Constitution.
+ - Cite Article numbers and headings when possible.
+ - Stay neutral, factual, and avoid personal opinions.
+ - If the context is insufficient, say so and provide general constitutional principles.
+
+ Now provide the answer:
+ """
+
+ PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
+
+ # Retrieval-based QA chain
+ qa_chain = RetrievalQA.from_chain_type(
+     llm=llm,
+     retriever=retriever,
+     return_source_documents=True,
+     chain_type_kwargs={"prompt": PROMPT},
+     chain_type="stuff"
+ )
+
+ def ask_samvidhan(question: str) -> str:
+     """Answer queries about the Constitution of India with sources."""
+     result = qa_chain({"query": question})
+     answer = result.get("result", "Sorry, I couldn't find an answer.")
+     sources = result.get("source_documents", [])
+
+     if sources:
+         answer += "\n\n**Sources:**"
+         seen = set()
+         for doc in sources:
+             src = doc.metadata.get("source", "Unknown")
+             page = doc.metadata.get("page", "")
+             src_info = f"{src} (Page {page})" if page != "" else src  # page can be 0, so test the sentinel
+             if src_info not in seen:
+                 seen.add(src_info)
+                 answer += f"\n- {src_info}"
+
+     return answer
+
+ # Alias for backward compatibility if needed
+ ask_samvidhan_chatbot = ask_samvidhan
+
+ if __name__ == "__main__":
+     while True:
+         query = input("Ask me about the Constitution of India: ")
+         if query.lower() == "exit":
+             break
+         print("📜:", ask_samvidhan(query))
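
For context, a minimal sketch of calling the exported function from another script, assuming the index from build_faiss.py already exists (the file name and question text are illustrative, not from the commit):

# example_usage.py (hypothetical)
from chatbot import ask_samvidhan

print(ask_samvidhan("What does Article 21 guarantee?"))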
load_data.py ADDED
@@ -0,0 +1,64 @@
+ from langchain_community.document_loaders import PyPDFLoader, TextLoader, CSVLoader  # Updated imports
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ import os
+ import glob
+
+ def load_and_split():
+     """
+     Load documents from various sources and split into chunks with metadata preservation.
+
+     Returns:
+         list: List of Document objects with text and metadata
+     """
+     documents = []
+
+     # Load all PDFs in data folder
+     pdf_files = glob.glob("data/*.pdf")
+     for file in pdf_files:
+         loader = PyPDFLoader(file)
+         docs = loader.load()
+         # Add file source to metadata
+         for doc in docs:
+             doc.metadata["source"] = os.path.basename(file)
+         documents.extend(docs)
+
+     # Load all text files
+     txt_files = glob.glob("data/*.txt")
+     for file in txt_files:
+         loader = TextLoader(file)
+         docs = loader.load()
+         # Add metadata
+         for doc in docs:
+             doc.metadata["source"] = os.path.basename(file)
+         documents.extend(docs)
+
+     # Load all CSV files
+     csv_files = glob.glob("data/*.csv")
+     for file in csv_files:
+         try:
+             loader = CSVLoader(file)
+             docs = loader.load()
+             # Add metadata
+             for doc in docs:
+                 doc.metadata["source"] = os.path.basename(file)
+             documents.extend(docs)
+         except Exception as e:
+             print(f"Error loading CSV file {file}: {e}")
+
+     print(f"Loaded {len(documents)} documents from {len(pdf_files)} PDFs, {len(txt_files)} TXTs, and {len(csv_files)} CSVs")
+
+     # Split documents into manageable chunks
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,   # Increased chunk size for better context
+         chunk_overlap=200, # Increased overlap
+         separators=["\n\n", "\n", " ", ""]
+     )
+
+     split_docs = text_splitter.split_documents(documents)
+     print(f"Split into {len(split_docs)} chunks")
+
+     return split_docs
+
+ if __name__ == "__main__":
+     documents = load_and_split()
+     print(f"Loaded and split {len(documents)} text chunks from all documents!")