initial commit
- .gitignore +3 -0
- README.md +18 -0
- __pycache__/chroma_utils.cpython-311.pyc +0 -0
- __pycache__/db_utils.cpython-311.pyc +0 -0
- __pycache__/langchain_utils.cpython-311.pyc +0 -0
- __pycache__/main.cpython-311.pyc +0 -0
- __pycache__/pinecone_utilis.cpython-311.pyc +0 -0
- __pycache__/pydantic_models.cpython-311.pyc +0 -0
- app.py +5 -0
- backend/main.py +0 -0
- backend/utilis.py +57 -0
- db_utils.py +75 -0
- langchain_utils.py +43 -0
- main.py +76 -0
- pinecone_utilis.py +84 -0
- prompt_templates.py +64 -0
- pydantic_models.py +25 -0
- requirements.txt +10 -0
- ui.py +61 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
+venv
+InternTaskGenAI.pdf
+.env
README.md
ADDED
@@ -0,0 +1,18 @@
+# smart-research-assistant
+
+## TODO:
+- [ ] pinecone utils
+- [ ] RAG component using langchain/langgraph
+- [ ] database setup
+- [ ] backend using FastAPI
+- [ ] frontend using streamlit
+## Deliverables
+- Answer questions that require comprehension and inference
+- Pose logic-based questions to users and evaluate their responses
+- Justify every answer with a reference from the document
+### Functional Requirements
+- Input file (pdf/txt)
+- 2 modes: (a) Ask Anything (b) Challenge Me
+- Auto summary after document upload
+- Streamlit + FastAPI (current stack)
+- Bonus features, e.g. state management and context highlighting
__pycache__/chroma_utils.cpython-311.pyc
ADDED
Binary file (3.46 kB)
__pycache__/db_utils.cpython-311.pyc
ADDED
Binary file (4.78 kB)
__pycache__/langchain_utils.cpython-311.pyc
ADDED
Binary file (2.51 kB)
__pycache__/main.cpython-311.pyc
ADDED
Binary file (5.47 kB)
__pycache__/pinecone_utilis.cpython-311.pyc
ADDED
Binary file (4.81 kB)
__pycache__/pydantic_models.cpython-311.pyc
ADDED
Binary file (1.99 kB)
app.py
ADDED
@@ -0,0 +1,5 @@
+from pinecone_utilis import create_pinecone_vectorstore, load_and_split_document, index_document_to_pinecone
+
+file_path = "InternTaskGenAI.pdf"
+
+print(index_document_to_pinecone(file_path=file_path, file_id=1))
backend/main.py
ADDED
File without changes
backend/utilis.py
ADDED
@@ -0,0 +1,57 @@
+from langchain.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from pinecone import Pinecone, ServerlessSpec
+from langchain_core.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from operator import itemgetter
+
+class RAG:
+    def load_split_file(self, file_path):
+        loader = PyPDFLoader(file_path)
+        pages = loader.load_and_split()
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=10)
+        docs = text_splitter.split_documents(pages)
+        return docs
+
+    def create_index(self, index_name, PINECONE_API_KEY):
+        pc = Pinecone(api_key=PINECONE_API_KEY)
+        if index_name in pc.list_indexes().names():
+            pc.delete_index(index_name)  # To avoid any conflicts in retrieval
+        pc.create_index(
+            name=index_name,
+            dimension=384,
+            metric='cosine',
+            spec=ServerlessSpec(
+                cloud="aws",
+                region="us-east-1"
+            )
+        )
+        return index_name
+
+    def final_response(self, index, question, model):
+        retriever = index.as_retriever()
+        parser = StrOutputParser()
+        template = """
+        You must provide an answer based strictly on the context below.
+        The answer is highly likely to be found within the given context, so analyze it thoroughly before responding.
+        Only if there is absolutely no relevant information, respond with "I don't know".
+        Do not make things up.
+
+        Context: {context}
+
+        Question: {question}
+        """
+        prompt = PromptTemplate.from_template(template)
+        prompt.format(context="Here is some context", question="Here is a question")
+
+        chain = (
+            {
+                "context": itemgetter("question") | retriever,
+                "question": itemgetter("question"),
+            }
+            | prompt
+            | model
+            | parser
+        )
+        matching_results = index.similarity_search(question, k=2)
+        return f"Answer: {chain.invoke({'question': question})}", matching_results
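The RAG class above is not exercised anywhere in this commit. A minimal usage sketch follows; the embedding model, module path, and question are illustrative assumptions (dimension=384 matches a model such as sentence-transformers/all-MiniLM-L6-v2), and final_response is handed a LangChain vector store because it calls .as_retriever() and .similarity_search():

# Illustrative usage of backend/utilis.py (not part of this commit).
# Assumptions: a 384-dim embedding model to match dimension=384 above, and a
# PineconeVectorStore passed to final_response.
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_openai import ChatOpenAI
from backend.utilis import RAG

rag = RAG()
docs = rag.load_split_file("InternTaskGenAI.pdf")

index_name = rag.create_index("smart-research-assistant", os.getenv("PINECONE_API_KEY"))
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
store = PineconeVectorStore.from_documents(docs, embedding=embeddings, index_name=index_name)

llm = ChatOpenAI(model="gpt-4o-mini")
answer, sources = rag.final_response(store, "What does the task require?", llm)
print(answer)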
db_utils.py
ADDED
@@ -0,0 +1,75 @@
+import sqlite3
+from datetime import datetime
+
+DB_NAME = "rag_app.db"
+
+def get_db_connection():
+    conn = sqlite3.connect(DB_NAME)
+    conn.row_factory = sqlite3.Row
+    return conn
+
+def create_application_logs():
+    conn = get_db_connection()
+    conn.execute('''CREATE TABLE IF NOT EXISTS application_logs
+                    (id INTEGER PRIMARY KEY AUTOINCREMENT,
+                     session_id TEXT,
+                     user_query TEXT,
+                     gpt_response TEXT,
+                     model TEXT,
+                     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)''')
+    conn.close()
+
+def create_document_store():
+    conn = get_db_connection()
+    conn.execute('''CREATE TABLE IF NOT EXISTS document_store
+                    (id INTEGER PRIMARY KEY AUTOINCREMENT,
+                     filename TEXT,
+                     upload_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP)''')
+    conn.close()
+
+def insert_application_logs(session_id, user_query, gpt_response, model):
+    conn = get_db_connection()
+    conn.execute('INSERT INTO application_logs (session_id, user_query, gpt_response, model) VALUES (?, ?, ?, ?)',
+                 (session_id, user_query, gpt_response, model))
+    conn.commit()
+    conn.close()
+
+def get_chat_history(session_id):
+    conn = get_db_connection()
+    cursor = conn.cursor()
+    cursor.execute('SELECT user_query, gpt_response FROM application_logs WHERE session_id = ? ORDER BY created_at', (session_id,))
+    messages = []
+    for row in cursor.fetchall():
+        messages.extend([
+            {"role": "human", "content": row['user_query']},
+            {"role": "ai", "content": row['gpt_response']}
+        ])
+    conn.close()
+    return messages
+def insert_document_record(filename):
+    conn = get_db_connection()
+    cursor = conn.cursor()
+    cursor.execute('INSERT INTO document_store (filename) VALUES (?)', (filename,))
+    file_id = cursor.lastrowid
+    conn.commit()
+    conn.close()
+    return file_id
+
+def delete_document_record(file_id):
+    conn = get_db_connection()
+    conn.execute('DELETE FROM document_store WHERE id = ?', (file_id,))
+    conn.commit()
+    conn.close()
+    return True
+
+def get_all_documents():
+    conn = get_db_connection()
+    cursor = conn.cursor()
+    cursor.execute('SELECT id, filename, upload_timestamp FROM document_store ORDER BY upload_timestamp DESC')
+    documents = cursor.fetchall()
+    conn.close()
+    return [dict(doc) for doc in documents]
+
+# Initialize the database tables
+create_application_logs()
+create_document_store()
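A quick, illustrative check of the helpers above (filenames and strings are placeholders, not part of the commit):

# Illustrative use of db_utils.py; values are placeholders.
from db_utils import (insert_document_record, insert_application_logs,
                      get_chat_history, get_all_documents)

file_id = insert_document_record("InternTaskGenAI.pdf")
insert_application_logs("session-1", "What is the task?", "Build a research assistant.", "gpt-4o-mini")

# History is returned as alternating human/ai messages for the session
print(get_chat_history("session-1"))
print(get_all_documents())
print(f"stored document id: {file_id}")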
langchain_utils.py
ADDED
@@ -0,0 +1,43 @@
+from langchain_openai import ChatOpenAI
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain.chains import create_history_aware_retriever, create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain_core.messages import BaseMessage  # needed for the State TypedDict below
+from typing_extensions import List, TypedDict
+from langchain_core.documents import Document
+import os
+from pinecone_utilis import vectorstore
+from dotenv import load_dotenv
+load_dotenv()
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
+
+output_parser = StrOutputParser()
+
+contextualize_q_system_prompt = (
+    "Given a chat history and the latest user question "
+    "which might reference context in the chat history, "
+    "formulate a standalone question which can be understood "
+    "without the chat history. Do NOT answer the question, "
+    "just reformulate it if needed and otherwise return it as is."
+)
+
+contextualize_q_prompt = ChatPromptTemplate.from_messages([
+    ("system", contextualize_q_system_prompt),
+    MessagesPlaceholder("chat_history"),
+    ("human", "{input}"),
+])
+
+qa_prompt = ChatPromptTemplate.from_messages([
+    ("system", "You are a helpful AI assistant. Use the following context to answer the user's question."),
+    ("system", "Context: {context}"),
+    MessagesPlaceholder(variable_name="chat_history"),
+    ("human", "{input}")
+])
+
+class State(TypedDict):
+    messages: List[BaseMessage]
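main.py imports get_rag_chain from this module, but no such function is defined in this commit. One possible shape, built only from the helpers already imported above (a sketch, not the author's implementation):

# Hypothetical get_rag_chain; main.py expects it here but this commit does not define it.
def get_rag_chain(model: str = "gpt-4o-mini"):
    llm = ChatOpenAI(model=model)
    # Rewrite the latest question into a standalone one using the chat history
    history_aware_retriever = create_history_aware_retriever(llm, retriever, contextualize_q_prompt)
    # Stuff the retrieved documents into qa_prompt and answer
    question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
    return create_retrieval_chain(history_aware_retriever, question_answer_chain)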
main.py
ADDED
@@ -0,0 +1,76 @@
+from fastapi import FastAPI, File, UploadFile, HTTPException
+from pydantic_models import QueryInput, QueryResponse, DocumentInfo, DeleteFileRequest
+from langchain_utils import get_rag_chain
+from db_utils import insert_application_logs, get_chat_history, get_all_documents, insert_document_record, delete_document_record
+from chroma_utils import index_document_to_chroma, delete_doc_from_chroma
+import os
+import uuid
+import logging
+import shutil
+
+# Set up logging
+logging.basicConfig(filename='app.log', level=logging.INFO)
+
+# Initialize FastAPI app
+app = FastAPI()
+
+@app.post("/chat", response_model=QueryResponse)
+def chat(query_input: QueryInput):
+    session_id = query_input.session_id or str(uuid.uuid4())
+    logging.info(f"Session ID: {session_id}, User Query: {query_input.question}, Model: {query_input.model.value}")
+
+    chat_history = get_chat_history(session_id)
+    rag_chain = get_rag_chain(query_input.model.value)
+    answer = rag_chain.invoke({
+        "input": query_input.question,
+        "chat_history": chat_history
+    })['answer']
+
+    insert_application_logs(session_id, query_input.question, answer, query_input.model.value)
+    logging.info(f"Session ID: {session_id}, AI Response: {answer}")
+    return QueryResponse(answer=answer, session_id=session_id, model=query_input.model)
+
+
+@app.post("/upload-doc")
+def upload_and_index_document(file: UploadFile = File(...)):
+    allowed_extensions = ['.pdf', '.docx', '.html']
+    file_extension = os.path.splitext(file.filename)[1].lower()
+
+    if file_extension not in allowed_extensions:
+        raise HTTPException(status_code=400, detail=f"Unsupported file type. Allowed types are: {', '.join(allowed_extensions)}")
+
+    temp_file_path = f"temp_{file.filename}"
+
+    try:
+        # Save the uploaded file to a temporary file
+        with open(temp_file_path, "wb") as buffer:
+            shutil.copyfileobj(file.file, buffer)
+
+        file_id = insert_document_record(file.filename)
+        success = index_document_to_chroma(temp_file_path, file_id)
+
+        if success:
+            return {"message": f"File {file.filename} has been successfully uploaded and indexed.", "file_id": file_id}
+        else:
+            delete_document_record(file_id)
+            raise HTTPException(status_code=500, detail=f"Failed to index {file.filename}.")
+    finally:
+        if os.path.exists(temp_file_path):
+            os.remove(temp_file_path)
+
+@app.get("/list-docs", response_model=list[DocumentInfo])
+def list_documents():
+    return get_all_documents()
+
+@app.post("/delete-doc")
+def delete_document(request: DeleteFileRequest):
+    chroma_delete_success = delete_doc_from_chroma(request.file_id)
+
+    if chroma_delete_success:
+        db_delete_success = delete_document_record(request.file_id)
+        if db_delete_success:
+            return {"message": f"Successfully deleted document with file_id {request.file_id} from the system."}
+        else:
+            return {"error": f"Deleted from Chroma but failed to delete document with file_id {request.file_id} from the database."}
+    else:
+        return {"error": f"Failed to delete document with file_id {request.file_id} from Chroma."}
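Assuming the missing pieces resolve (chroma_utils.py appears in this commit only as a compiled .pyc, and get_rag_chain is not yet defined in langchain_utils.py), the service could be started with uvicorn main:app --reload and exercised roughly like this (illustrative client, not part of the commit):

# Illustrative client for the FastAPI endpoints above.
import requests

base = "http://localhost:8000"

# Upload and index a document
with open("InternTaskGenAI.pdf", "rb") as f:
    upload = requests.post(f"{base}/upload-doc", files={"file": f}).json()
print(upload)

# Ask a question in the same session
payload = {"question": "Summarize the task.", "session_id": "demo-session", "model": "gpt-4o-mini"}
print(requests.post(f"{base}/chat", json=payload).json())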
pinecone_utilis.py
ADDED
@@ -0,0 +1,84 @@
+from langchain_community.document_loaders import PyPDFLoader, TextLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_openai import OpenAIEmbeddings
+from pinecone import Pinecone, ServerlessSpec
+from langchain_pinecone import PineconeVectorStore
+from typing import List
+from langchain_core.documents import Document
+import os
+from dotenv import load_dotenv
+load_dotenv()
+
+# API keys
+PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
+print(f"Pinecone API key loaded: {bool(PINECONE_API_KEY)}")
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
+
+
+# text splitter and embedding function
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)
+embeddings = OpenAIEmbeddings(model="text-embedding-3-large", dimensions=1024, api_key=OPENAI_API_KEY)
+
+# Pinecone vector store
+pc = Pinecone(api_key=PINECONE_API_KEY)  # read the key from the environment; never hard-code or commit it
+
+def load_and_split_document(file_path: str) -> List[Document]:
+    if file_path.endswith('.pdf'):
+        loader = PyPDFLoader(file_path)
+    elif file_path.endswith('.txt'):
+        loader = TextLoader(file_path)
+    else:
+        raise ValueError(f"Unsupported file type: {file_path}")
+
+    documents = loader.load()
+    return text_splitter.split_documents(documents)
+
+INDEX_NAME = "smart-research-assistant"
+
+def create_pinecone_vectorstore() -> PineconeVectorStore:
+    try:
+        if not pc.has_index(INDEX_NAME):
+            pc.create_index(
+                name=INDEX_NAME,
+                dimension=1024,
+                metric="cosine",
+                spec=ServerlessSpec(cloud="aws", region="us-east-1")
+            )
+
+        index = pc.Index(INDEX_NAME)
+        return PineconeVectorStore(index=index, embedding=embeddings)
+
+    except Exception as e:
+        print(f"Index initialization failed: {e}")
+        raise
+
+
+vectorstore = create_pinecone_vectorstore()
+
+def index_document_to_pinecone(file_path: str, file_id: int) -> bool:
+    try:
+        splits = load_and_split_document(file_path)
+
+        # Add metadata to each split
+        for split in splits:
+            split.metadata['file_id'] = file_id
+
+        vectorstore.add_documents(splits)
+        return True
+    except Exception as e:
+        print(f"Error indexing document: {e}")
+        return False
+
+def delete_doc_from_pinecone(file_id: int):
+    try:
+        docs = vectorstore.get(where={"file_id": file_id})
+        print(f"Found {len(docs['ids'])} document chunks for file_id {file_id}")
+
+        vectorstore._collection.delete(where={"file_id": file_id})
+        print(f"Deleted all documents with file_id {file_id}")
+
+        return True
+    except Exception as e:
+        print(f"Error deleting document with file_id {file_id} from Chroma: {str(e)}")
+        return False
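Note that delete_doc_from_pinecone still calls Chroma-style APIs (vectorstore.get(where=...) and vectorstore._collection.delete(...)) that PineconeVectorStore does not expose, and its error message still says "Chroma". One possible Pinecone-native replacement, sketched under the assumption of the serverless index created above (serverless indexes do not support delete-by-metadata-filter, hence the query-then-delete-by-id workaround):

# Hypothetical Pinecone-native replacement (not part of this commit).
def delete_doc_from_pinecone(file_id: int) -> bool:
    try:
        index = pc.Index(INDEX_NAME)
        # Query with a dummy unit vector; only the metadata filter matters here.
        res = index.query(
            vector=[1.0] + [0.0] * 1023,
            filter={"file_id": {"$eq": file_id}},
            top_k=1000,                      # may need paging for very large documents
            include_values=False,
        )
        ids = [match.id for match in res.matches]
        print(f"Found {len(ids)} document chunks for file_id {file_id}")
        if ids:
            index.delete(ids=ids)
        return True
    except Exception as e:
        print(f"Error deleting document with file_id {file_id} from Pinecone: {e}")
        return False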
prompt_templates.py
ADDED
@@ -0,0 +1,64 @@
+# 1. Auto-summarization prompt template
+AUTO_SUMMARY_TEMPLATE = """
+Summarize the following document in no more than 150 words. Focus on the main points and key findings. Do not include information not present in the document.
+
+DOCUMENT:
+{document}
+
+SUMMARY:
+"""
+
+# 2. Question answering prompt template
+QA_PROMPT_TEMPLATE = """
+Answer the following question based only on the provided document. Your answer must be grounded in the document and include a specific reference to the text that supports your answer.
+
+Document:
+{document}
+
+Question:
+{question}
+
+Answer:
+"""
+
+# 3. Logic-based question generation prompt template
+LOGIC_QUESTION_GENERATION_TEMPLATE = """
+Generate three logic-based or comprehension-focused questions about the following document. Each question should require understanding or reasoning about the document content, not just simple recall. Provide each question on a new line.
+
+Document:
+{document}
+
+Questions:
+"""
+
+# 4. Answer evaluation prompt template
+ANSWER_EVALUATION_TEMPLATE = """
+Evaluate the following user answer to the question, using only the provided document as the source of truth. State whether the answer is correct or not, and provide a brief justification referencing the document.
+
+Document:
+{document}
+
+Question:
+{question}
+
+User Answer:
+{user_answer}
+
+Evaluation:
+"""
+
+# 5. For memory/follow-up: Chat prompt template
+CHAT_PROMPT_TEMPLATE = """
+The following is a conversation between a user and an AI assistant about a document. The assistant answers questions and provides justifications based on the document. Use the conversation history and the document to answer the new question.
+
+Document:
+{document}
+
+Conversation History:
+{history}
+
+Question:
+{question}
+
+Answer:
+"""
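The two middle templates map onto the "Challenge Me" deliverable from the README: generate logic questions, then evaluate the user's answer. A hedged sketch of wiring them together with LangChain, assuming an OpenAI chat model (the model name and document text are placeholders):

# Illustrative "Challenge Me" flow using the templates above (not part of this commit).
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from prompt_templates import LOGIC_QUESTION_GENERATION_TEMPLATE, ANSWER_EVALUATION_TEMPLATE

llm = ChatOpenAI(model="gpt-4o-mini")
parser = StrOutputParser()

generate_chain = PromptTemplate.from_template(LOGIC_QUESTION_GENERATION_TEMPLATE) | llm | parser
evaluate_chain = PromptTemplate.from_template(ANSWER_EVALUATION_TEMPLATE) | llm | parser

document_text = "..."  # full or summarized document text
questions = generate_chain.invoke({"document": document_text}).strip().splitlines()

evaluation = evaluate_chain.invoke({
    "document": document_text,
    "question": questions[0],
    "user_answer": "The user's attempt at an answer",
})
print(evaluation)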
pydantic_models.py
ADDED
@@ -0,0 +1,25 @@
+from pydantic import BaseModel, Field
+from enum import Enum
+from datetime import datetime
+
+class ModelName(str, Enum):
+    GPT4_O = "gpt-4o"
+    GPT4_O_MINI = "gpt-4o-mini"
+
+class QueryInput(BaseModel):
+    question: str
+    session_id: str = Field(default=None)
+    model: ModelName = Field(default=ModelName.GPT4_O_MINI)
+
+class QueryResponse(BaseModel):
+    answer: str
+    session_id: str
+    model: ModelName
+
+class DocumentInfo(BaseModel):
+    id: int
+    filename: str
+    upload_timestamp: datetime
+
+class DeleteFileRequest(BaseModel):
+    file_id: int
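A quick illustration of how the request model behaves with the defaults and the enum constraint above (illustrative only):

# Quick check of QueryInput defaults and validation (illustrative).
from pydantic_models import QueryInput

q = QueryInput(question="What are the deliverables?")
print(q.session_id)    # None -> main.py generates a uuid4 session id
print(q.model)         # ModelName.GPT4_O_MINI ("gpt-4o-mini")

# A model name outside the enum is rejected
try:
    QueryInput(question="hi", model="default")
except Exception as e:
    print("validation error:", e)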
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+langchain
+langchain-openai
+langchain-core
+langchain_community
+docx2txt
+pypdf
+langchain_chroma
+python-multipart
+fastapi
+uvicorn
ui.py
ADDED
@@ -0,0 +1,61 @@
+import streamlit as st
+import requests
+
+# Set the FastAPI backend URL
+FASTAPI_URL = "http://localhost:8000"
+
+st.title("Smart Research Assistant")
+
+# Document Upload Section
+uploaded_file = st.file_uploader("Upload a PDF or TXT document", type=["pdf", "txt"])
+if uploaded_file:
+    files = {"file": uploaded_file}
+    response = requests.post(f"{FASTAPI_URL}/upload-doc", files=files)
+    if response.status_code == 200:
+        result = response.json()
+        st.success(result["message"])
+        file_id = result["file_id"]
+        # Here you would also call a summary endpoint if implemented
+        # For demo, assume summary is returned in the upload response
+        # st.write("Summary: ", result.get("summary", "Summary not available"))
+    else:
+        st.error(f"Error: {response.json().get('detail', 'Unknown error')}")
+
+# List Documents (optional)
+if st.button("List Documents"):
+    response = requests.get(f"{FASTAPI_URL}/list-docs")
+    if response.status_code == 200:
+        documents = response.json()
+        st.write("Available Documents:")
+        for doc in documents:
+            st.write(f"- {doc['filename']} (ID: {doc['id']})")
+    else:
+        st.error("Failed to list documents")
+
+# Interaction Modes
+mode = st.radio("Choose Mode", ["Ask Anything", "Challenge Me"])
+
+if mode == "Ask Anything":
+    question = st.text_input("Ask a question about the document")
+    if question and st.button("Submit"):
+        payload = {
+            "question": question,
+            "session_id": "user123",  # Replace with actual session management
+            "model": "gpt-4o-mini"  # Replace with your model selection
+        }
+        response = requests.post(f"{FASTAPI_URL}/chat", json=payload)
+        if response.status_code == 200:
+            result = response.json()
+            st.write("Answer:", result["answer"])
+            # If your backend returns a source snippet, display it:
+            # st.write("Source:", result.get("source", ""))
+        else:
+            st.error("Failed to get answer")
+
+# elif mode == "Challenge Me":
+#     if st.button("Generate Challenge Questions"):
+
+#         # Assume your backend has a `/generate-questions` endpoint
+#         # response = requests.post(f"{FASTAPI_URL}/generate-questions", json={"file_id": file_id})
+#         # if response.status_code == 200:
+#         #     questions
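The Challenge Me branch is left commented out and truncated in this commit. One hedged sketch of how it could continue ui.py, assuming hypothetical /generate-questions and /evaluate-answer endpoints (neither exists in this commit) and reusing st, requests, FASTAPI_URL, mode, and file_id from the code above:

# Hypothetical "Challenge Me" branch; the two endpoints below are assumptions,
# and file_id comes from a prior successful upload in the section above.
if mode == "Challenge Me":
    if st.button("Generate Challenge Questions"):
        response = requests.post(f"{FASTAPI_URL}/generate-questions", json={"file_id": file_id})
        if response.status_code == 200:
            st.session_state["questions"] = response.json()["questions"]

    for i, q in enumerate(st.session_state.get("questions", [])):
        st.write(f"Q{i + 1}: {q}")
        user_answer = st.text_input("Your answer", key=f"answer_{i}")
        if user_answer and st.button("Evaluate", key=f"evaluate_{i}"):
            eval_response = requests.post(
                f"{FASTAPI_URL}/evaluate-answer",
                json={"question": q, "user_answer": user_answer, "file_id": file_id},
            )
            if eval_response.status_code == 200:
                st.write("Feedback:", eval_response.json()["evaluation"])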