sankalp2606 committed
Commit d07fe0d · verified · 1 Parent(s): 7c30e40

Upload 3 files

Files changed (3)
  1. build_faiss.py +86 -0
  2. chatbot.py +85 -0
  3. load_data.py +64 -0
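
Taken together, the three files form a small retrieval-augmented QA pipeline: load_data.py and build_faiss.py load the Constitution documents and build a FAISS index under vector_store/faiss_index_constitution, which chatbot.py then queries through a Gemini model.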
build_faiss.py ADDED
@@ -0,0 +1,86 @@
+ import os
+ from dotenv import load_dotenv
+ import sys
+
+ from langchain_community.vectorstores import FAISS
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
+
+ # Load API Key
+ load_dotenv()
+ api_key = os.getenv("GOOGLE_API_KEY")
+
+ if not api_key:
+     raise ValueError("Google API Key not found. Please set it in your .env file.")
+
+ # Path to Constitution documents
+ data_path = "data"  # Put Constitution of India and related PDFs here
+ if not os.path.exists(data_path):
+     raise FileNotFoundError(f"Directory '{data_path}' not found. Place your documents there.")
+
+ print(f"Current working directory: {os.getcwd()}")
+ print(f"Looking for Constitution documents in: {os.path.abspath(data_path)}")
+
+ # Load PDFs
+ documents = []
+ pdf_files = [f for f in os.listdir(data_path) if f.lower().endswith('.pdf')]
+ print(f"Found {len(pdf_files)} PDF files")
+
+ for pdf_file in pdf_files:
+     try:
+         pdf_path = os.path.join(data_path, pdf_file)
+         print(f"Loading PDF: {pdf_path}")
+         loader = PyPDFLoader(pdf_path)
+         pdf_docs = loader.load()
+         print(f" - Loaded {len(pdf_docs)} pages from {pdf_file}")
+         documents.extend(pdf_docs)
+     except Exception as e:
+         print(f"Error loading {pdf_file}: {e}")
+
+ # Fallback if no documents loaded
+ if not documents:
+     print("No documents were loaded. Trying DirectoryLoader as fallback...")
+     try:
+         loader = DirectoryLoader(data_path, glob="**/*.pdf")
+         documents = loader.load()
+         print(f"Loaded {len(documents)} documents with DirectoryLoader")
+     except Exception as e:
+         print(f"DirectoryLoader failed: {e}")
+
+ if not documents:
+     print("ERROR: No Constitution documents loaded. Exiting.")
+     sys.exit(1)
+
+ # Split documents
+ print(f"Splitting {len(documents)} documents into chunks...")
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
+ docs = text_splitter.split_documents(documents)
+ print(f"Split into {len(docs)} chunks")
+
+ if not docs:
+     print("ERROR: No text chunks created after splitting.")
+     sys.exit(1)
+
+ # Create embeddings & FAISS index
+ try:
+     print("Initializing embedding model...")
+     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+
+     print("Testing embedding generation...")
+     test_embedding = embeddings.embed_query("Constitution of India test")
+     print(f"Test embedding successful, vector length: {len(test_embedding)}")
+
+     print(f"Creating FAISS index for {len(docs)} chunks...")
+     faiss_index = FAISS.from_documents(docs, embeddings)
+
+     index_path = "vector_store/faiss_index_constitution"
+     os.makedirs(index_path, exist_ok=True)
+     faiss_index.save_local(index_path)
+
+     print("✅ Constitution FAISS Index successfully created and saved!")
+ except Exception as e:
+     print(f"ERROR during embedding or indexing: {e}")
+     import traceback
+     traceback.print_exc()
+     sys.exit(1)
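
For quick reference, a minimal sketch of preparing and running this script. The key name GOOGLE_API_KEY and the data/ directory come from the code above; the key value is a placeholder:

# .env (same directory as build_faiss.py)
GOOGLE_API_KEY=your-api-key-here

# with the Constitution PDFs placed in data/:
python build_faiss.py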
chatbot.py ADDED
@@ -0,0 +1,85 @@
+ import os
+ from dotenv import load_dotenv
+ from langchain_google_genai import GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
+ from langchain_community.vectorstores import FAISS
+ from langchain.chains import RetrievalQA
+ from langchain.prompts import PromptTemplate
+
+ # Load API key
+ load_dotenv()
+ api_key = os.getenv("GOOGLE_API_KEY")
+ if not api_key:
+     raise ValueError("Google API Key not found. Please set it in your .env file.")
+
+ # Path to FAISS index - updated to match build_faiss.py
+ faiss_path = "vector_store/faiss_index_constitution"
+ if not os.path.exists(f"{faiss_path}/index.faiss"):
+     raise FileNotFoundError(f"FAISS index not found at {faiss_path}. Please build the index first.")
+
+ # Load vector store
+ embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+ db = FAISS.load_local(faiss_path, embeddings, allow_dangerous_deserialization=True)
+ retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})
+
+ # LLM model
+ llm = GoogleGenerativeAI(model="gemini-1.5-flash", api_key=api_key)
+
+ # Prompt for constitutional expertise
+ prompt_template = """
+ You are a constitutional expert specializing in the Constitution of India.
+ Provide accurate, clear, and unbiased legal explanations.
+
+ User Question:
+ {question}
+
+ Relevant Context from the Constitution:
+ {context}
+
+ Instructions:
+ - Base your answer strictly on the given context and your knowledge of the Constitution.
+ - Cite Article numbers and headings when possible.
+ - Stay neutral, factual, and avoid personal opinions.
+ - If the context is insufficient, say so and provide general constitutional principles.
+
+ Now provide the answer:
+ """
+
+ PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
+
+ # Retrieval-based QA chain
+ qa_chain = RetrievalQA.from_chain_type(
+     llm=llm,
+     retriever=retriever,
+     return_source_documents=True,
+     chain_type_kwargs={"prompt": PROMPT},
+     chain_type="stuff"
+ )
+
+ def ask_samvidhan(question: str) -> str:
+     """Answer queries about the Constitution of India with sources."""
+     result = qa_chain({"query": question})
+     answer = result.get("result", "Sorry, I couldn't find an answer.")
+     sources = result.get("source_documents", [])
+
+     if sources:
+         answer += "\n\n**Sources:**"
+         seen = set()
+         for doc in sources:
+             src = doc.metadata.get("source", "Unknown")
+             page = doc.metadata.get("page", "")
+             src_info = f"{src} (Page {page})" if page != "" else src  # page can be 0, so test the sentinel
+             if src_info not in seen:
+                 seen.add(src_info)
+                 answer += f"\n- {src_info}"
+
+     return answer
+
+ # Alias for backward compatibility if needed
+ ask_samvidhan_chatbot = ask_samvidhan
+
+ if __name__ == "__main__":
+     while True:
+         query = input("Ask me about the Constitution of India: ")
+         if query.lower() == "exit":
+             break
+         print("📜:", ask_samvidhan(query))
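
For context, a minimal sketch of calling the exported function from another script, assuming the index from build_faiss.py already exists (the file name and question text are illustrative, not from the commit):

# example_usage.py (hypothetical)
from chatbot import ask_samvidhan

print(ask_samvidhan("What does Article 21 guarantee?"))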
load_data.py ADDED
@@ -0,0 +1,64 @@
+ from langchain_community.document_loaders import PyPDFLoader, TextLoader, CSVLoader  # Updated imports
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ import os
+ import glob
+
+ def load_and_split():
+     """
+     Load documents from various sources and split into chunks with metadata preservation.
+
+     Returns:
+         list: List of Document objects with text and metadata
+     """
+     documents = []
+
+     # Load all PDFs in data folder
+     pdf_files = glob.glob("data/*.pdf")
+     for file in pdf_files:
+         loader = PyPDFLoader(file)
+         docs = loader.load()
+         # Add file source to metadata
+         for doc in docs:
+             doc.metadata["source"] = os.path.basename(file)
+         documents.extend(docs)
+
+     # Load all text files
+     txt_files = glob.glob("data/*.txt")
+     for file in txt_files:
+         loader = TextLoader(file)
+         docs = loader.load()
+         # Add metadata
+         for doc in docs:
+             doc.metadata["source"] = os.path.basename(file)
+         documents.extend(docs)
+
+     # Load all CSV files
+     csv_files = glob.glob("data/*.csv")
+     for file in csv_files:
+         try:
+             loader = CSVLoader(file)
+             docs = loader.load()
+             # Add metadata
+             for doc in docs:
+                 doc.metadata["source"] = os.path.basename(file)
+             documents.extend(docs)
+         except Exception as e:
+             print(f"Error loading CSV file {file}: {e}")
+
+     print(f"Loaded {len(documents)} documents from {len(pdf_files)} PDFs, {len(txt_files)} TXTs, and {len(csv_files)} CSVs")
+
+     # Split documents into manageable chunks
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,   # Increased chunk size for better context
+         chunk_overlap=200, # Increased overlap
+         separators=["\n\n", "\n", " ", ""]
+     )
+
+     split_docs = text_splitter.split_documents(documents)
+     print(f"Split into {len(split_docs)} chunks")
+
+     return split_docs
+
+ if __name__ == "__main__":
+     documents = load_and_split()
+     print(f"Loaded and split {len(documents)} text chunks from all documents!")