backend

- .gitignore +5 -1
- __pycache__/db_utils.cpython-311.pyc +0 -0
- __pycache__/langchain_utils.cpython-311.pyc +0 -0
- __pycache__/main.cpython-311.pyc +0 -0
- __pycache__/pinecone_utilis.cpython-311.pyc +0 -0
- __pycache__/pydantic_models.cpython-311.pyc +0 -0
- db_utils.py +23 -7
- langchain_utils.py +31 -4
- main.py +103 -19
- pinecone_utilis.py +13 -8
- prompt_templates.py +0 -64
- pydantic_models.py +8 -0
- ui.py +22 -58
.gitignore
CHANGED
@@ -1,3 +1,7 @@
 venv
 InternTaskGenAI.pdf
-.env
+.env
+research_assistant.db
+neural computing cwsi.pdf
+app.log
+__pycache__
__pycache__/db_utils.cpython-311.pyc
CHANGED
Binary files a/__pycache__/db_utils.cpython-311.pyc and b/__pycache__/db_utils.cpython-311.pyc differ

__pycache__/langchain_utils.cpython-311.pyc
CHANGED
Binary files a/__pycache__/langchain_utils.cpython-311.pyc and b/__pycache__/langchain_utils.cpython-311.pyc differ

__pycache__/main.cpython-311.pyc
CHANGED
Binary files a/__pycache__/main.cpython-311.pyc and b/__pycache__/main.cpython-311.pyc differ

__pycache__/pinecone_utilis.cpython-311.pyc
CHANGED
Binary files a/__pycache__/pinecone_utilis.cpython-311.pyc and b/__pycache__/pinecone_utilis.cpython-311.pyc differ

__pycache__/pydantic_models.cpython-311.pyc
CHANGED
Binary files a/__pycache__/pydantic_models.cpython-311.pyc and b/__pycache__/pydantic_models.cpython-311.pyc differ
db_utils.py
CHANGED
@@ -1,7 +1,9 @@
 import sqlite3
 from datetime import datetime
+from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
 
-
+
+DB_NAME = "research_assistant.db"
 
 def get_db_connection():
     conn = sqlite3.connect(DB_NAME)
@@ -24,6 +26,7 @@ def create_document_store():
     conn.execute('''CREATE TABLE IF NOT EXISTS document_store
                     (id INTEGER PRIMARY KEY AUTOINCREMENT,
                      filename TEXT,
+                     content TEXT,
                      upload_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP)''')
     conn.close()
 
@@ -40,16 +43,15 @@ def get_chat_history(session_id):
     cursor.execute('SELECT user_query, gpt_response FROM application_logs WHERE session_id = ? ORDER BY created_at', (session_id,))
     messages = []
     for row in cursor.fetchall():
-        messages.
-
-
-        ])
+        messages.append(HumanMessage(content=row['user_query']))
+        messages.append(AIMessage(content=row['gpt_response']))
+
     conn.close()
     return messages
-def insert_document_record(filename):
+def insert_document_record(filename, content):
     conn = get_db_connection()
     cursor = conn.cursor()
-    cursor.execute('INSERT INTO document_store (filename) VALUES (?)', (filename,))
+    cursor.execute('INSERT INTO document_store (filename, content) VALUES (?, ?)', (filename, content))
     file_id = cursor.lastrowid
     conn.commit()
     conn.close()
@@ -70,6 +72,20 @@ def get_all_documents():
     conn.close()
     return [dict(doc) for doc in documents]
 
+
+def get_file_content(file_id: int) -> str | None:
+    conn = get_db_connection()
+    try:
+        cursor = conn.cursor()
+        cursor.execute('SELECT content FROM document_store WHERE id = ?', (file_id,))
+        row = cursor.fetchone()
+        if row is not None:
+            return row[0]
+        return None
+    finally:
+        conn.close()
+
+
 # Initialize the database tables
 create_application_logs()
 create_document_store()
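The new get_file_content helper completes a simple round trip: store a document's full text with insert_document_record, read it back by id later. A minimal usage sketch, assuming the module is importable and the tables exist; the filename and text are illustrative, not from the commit:

from db_utils import insert_document_record, get_file_content, get_chat_history

# Store a document's extracted text alongside its filename (illustrative values).
file_id = insert_document_record("paper.pdf", "Full extracted text of the paper...")

# Read the stored text back by id; returns None for unknown ids.
content = get_file_content(file_id)
assert content is not None

# Chat history is rebuilt as alternating HumanMessage/AIMessage objects,
# ready to pass straight to an LLM.
history = get_chat_history("some-session-id")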
langchain_utils.py
CHANGED
@@ -1,8 +1,8 @@
 from langchain_openai import ChatOpenAI
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
-from
-from
+from langgraph.graph import START, StateGraph
+from langchain_core.messages import HumanMessage, AIMessage, BaseMessage, SystemMessage
 from typing import List
 from typing_extensions import List, TypedDict
 from langchain_core.documents import Document
@@ -11,8 +11,11 @@ from pinecone_utilis import vectorstore
 from dotenv import load_dotenv
 load_dotenv()
 OPENAI_API_KEY=os.getenv("OPENAI_API_KEY")
-retriever = vectorstore.as_retriever(search_kwargs={"k":
-
+retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
+llm = ChatOpenAI(
+    model='gpt-4.1',
+    api_key=OPENAI_API_KEY
+)
 output_parser = StrOutputParser()
 
 contextualize_q_system_prompt = (
@@ -38,6 +41,30 @@ qa_prompt = ChatPromptTemplate.from_messages([
 
 class State(TypedDict):
     messages: List[BaseMessage]
+
+
+
+# Define application steps
+def retrieve(query: str):
+    retrieved_docs = vectorstore.similarity_search(query)
+    return retrieved_docs
+
+
+def generate_response(query: str, state: State)->State:
+    retrieved_docs=retrieve(query=query)
+    docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
+    system_message = SystemMessage(
+        content="You are a helpful AI assistant. Answer the user's question using ONLY the information provided below. "
+        "If the answer is not in the context, say 'I don't know.' Do not make up information. "
+        f"Context: {docs_content}"
+    )
+
+    state['messages'].append(system_message)
+    state['messages'].append(HumanMessage(content=query))
+
+    response = llm.invoke(state["messages"])
+    state['messages'].append(AIMessage(content=response.content))
+    return state
 
 
 
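Note that generate_response mutates the state it is given: each call appends a SystemMessage carrying the freshly retrieved context, the user's HumanMessage, and the model's AIMessage, so the list grows by three messages per turn. A usage sketch under the assumption that the Pinecone vectorstore and OPENAI_API_KEY are configured (the queries are made up):

from langchain_utils import generate_response, State

state: State = {"messages": []}

# First turn: retrieved context + question + answer are appended to the state.
state = generate_response(query="What is the crop water stress index?", state=state)
print(state["messages"][-1].content)

# A follow-up reuses the accumulated messages, so earlier turns ride along.
state = generate_response(query="How was it estimated?", state=state)
print(state["messages"][-1].content)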
main.py
CHANGED
@@ -1,13 +1,20 @@
 from fastapi import FastAPI, File, UploadFile, HTTPException
-from pydantic_models import QueryInput, QueryResponse, DocumentInfo, DeleteFileRequest
-from langchain_utils import
-from db_utils import insert_application_logs, get_chat_history, get_all_documents, insert_document_record, delete_document_record
-from
+from pydantic_models import QueryInput, QueryResponse, DocumentInfo, DeleteFileRequest, ChallengeRequest, EvaluateAnswer
+from langchain_utils import generate_response, retrieve
+from db_utils import insert_application_logs, get_chat_history, get_all_documents, insert_document_record, delete_document_record, get_file_content
+from pinecone_utilis import index_document_to_pinecone, delete_doc_from_pinecone, load_and_split_document
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.messages import SystemMessage, AIMessage, HumanMessage
 import os
 import uuid
 import logging
 import shutil
 
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
+
 # Set up logging
 logging.basicConfig(filename='app.log', level=logging.INFO)
 
@@ -18,22 +25,81 @@ app = FastAPI()
 def chat(query_input: QueryInput):
     session_id = query_input.session_id or str(uuid.uuid4())
     logging.info(f"Session ID: {session_id}, User Query: {query_input.question}, Model: {query_input.model.value}")
-
     chat_history = get_chat_history(session_id)
-
-
-
-        "chat_history": chat_history
-    })['answer']
+    state={"messages":[]} # test
+    messages_state = generate_response(query=query_input.question, state=state)
+    answer=messages_state["messages"][-1].content
 
     insert_application_logs(session_id, query_input.question, answer, query_input.model.value)
     logging.info(f"Session ID: {session_id}, AI Response: {answer}")
     return QueryResponse(answer=answer, session_id=session_id, model=query_input.model)
 
+@app.post('/challenge-me', response_model=list[str])
+def challenge_me(request: ChallengeRequest):
+    file_id = request.file_id
+
+    content = get_file_content(file_id)
+    if content is None:
+        raise HTTPException(status_code=404, detail="Document not found")
+
+
+    llm = ChatOpenAI(
+        model='gpt-4.1',
+        api_key=OPENAI_API_KEY
+    )
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful AI assistant. Generate three logic-based or comprehension-focused questions about the following document. Each question should require understanding or reasoning about the document content, not just simple recall. Provide each question on a new line."),
+        ("human", "Document: {context}\n\nQuestions:")
+    ])
+    chain = prompt | llm | StrOutputParser()
+    questions_str = chain.invoke({"context": content})
+    questions = [q.strip() for q in questions_str.split('\n') if q.strip()][:3]
+
+    return questions
+
+
+
+@app.post('/evaluate-response')
+def evaluate_response(request: EvaluateAnswer):
+    # get the file related to the answers
+    file_id = request.file_id
+    question = request.question
+    user_answer=request.user_answer
+
+    # evaluate the user answer against the research paper
+
+    llm = ChatOpenAI(
+        model='gpt-4.1',
+        api_key=OPENAI_API_KEY
+    )
+    # get the context from the doc
+    retrieved_docs=retrieve(query=question)
+    docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
+
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful AI assistant. Your task is to evaluate the user's answer to a question, using ONLY the information below as reference. If the answer is not correct, explain why and provide the correct answer with justification from the document. Do not make up information."),
+        ("system", "Context: {context}"),
+        ("human", "Question: {question}\nUser Answer: {user_answer}\nEvaluation:")
+    ])
+
+    chain = prompt | llm | StrOutputParser()
+    evaluation = chain.invoke({
+        "context": docs_content,
+        "question": question,
+        "user_answer": user_answer
+    })
+
+    return {
+        "feedback": evaluation,
+        "file_id": file_id
+    }
+
+
 
 @app.post("/upload-doc")
 def upload_and_index_document(file: UploadFile = File(...)):
-    allowed_extensions = ['.pdf', '.
+    allowed_extensions = ['.pdf', '.txt']
     file_extension = os.path.splitext(file.filename)[1].lower()
 
     if file_extension not in allowed_extensions:
@@ -45,16 +111,34 @@ def upload_and_index_document(file: UploadFile = File(...)):
         # Save the uploaded file to a temporary file
         with open(temp_file_path, "wb") as buffer:
             shutil.copyfileobj(file.file, buffer)
-
-
-
+        docs = load_and_split_document(temp_file_path)
+        docs_content = "\n\n".join(doc.page_content for doc in docs)
+        file_id = insert_document_record(file.filename, docs_content)
+        success = index_document_to_pinecone(temp_file_path, file_id)
 
         if success:
-
+            # generate summary
+
+            llm = ChatOpenAI(
+                model='gpt-4.1',
+                api_key=OPENAI_API_KEY
+            )
+            prompt = ChatPromptTemplate.from_messages([
+                ("system", "You are a helpful assistant. Summarize the following document in no more than 150 words. Focus on the main points and key findings. Do not include information not present in the document."),
+                ("human", "{document}")
+            ])
+            chain = prompt | llm | StrOutputParser()
+            summary = chain.invoke({"document": docs_content})
+            return {
+                "message": f"File {file.filename} has been successfully uploaded and indexed.",
+                "file_id": file_id,
+                "summary": summary
+            }
         else:
            delete_document_record(file_id)
            raise HTTPException(status_code=500, detail=f"Failed to index {file.filename}.")
     finally:
+
         if os.path.exists(temp_file_path):
             os.remove(temp_file_path)
 
@@ -64,13 +148,13 @@ def list_documents():
 
 @app.post("/delete-doc")
 def delete_document(request: DeleteFileRequest):
-
+    pinecone_delete_success = delete_doc_from_pinecone(request.file_id)
 
-    if
+    if pinecone_delete_success:
        db_delete_success = delete_document_record(request.file_id)
        if db_delete_success:
            return {"message": f"Successfully deleted document with file_id {request.file_id} from the system."}
        else:
-           return {"error": f"Deleted from
+           return {"error": f"Deleted from pinecone but failed to delete document with file_id {request.file_id} from the database."}
     else:
-        return {"error": f"Failed to delete document with file_id {request.file_id} from
+        return {"error": f"Failed to delete document with file_id {request.file_id} from pinecone."}
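The new endpoints can be smoke-tested with requests in the same style as ui.py below; the base URL and file_id are assumptions, not values from the commit:

import requests

BASE_URL = "http://localhost:8000"  # assumed local dev address
FILE_ID = 1                         # assumes a document was already uploaded

# /challenge-me returns up to three generated questions as a JSON list.
questions = requests.post(f"{BASE_URL}/challenge-me", json={"file_id": FILE_ID}).json()
for q in questions:
    print(q)

# /evaluate-response grades a user answer against the retrieved context.
payload = {"file_id": FILE_ID, "question": questions[0], "user_answer": "My attempt."}
feedback = requests.post(f"{BASE_URL}/evaluate-response", json=payload).json()
print(feedback["feedback"])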
pinecone_utilis.py
CHANGED
@@ -11,7 +11,6 @@ load_dotenv()
 
 # API keys
 PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")
-print(f"Pinecone API Key: {PINECONE_API_KEY}")
 OPENAI_API_KEY=os.getenv("OPENAI_API_KEY")
 
 
@@ -72,13 +71,19 @@ def index_document_to_pinecone(file_path: str, file_id: int) -> bool:
 
 def delete_doc_from_pinecone(file_id: int):
     try:
-
-
-
-
-
-
+        index = pc.Index(INDEX_NAME)
+        # Query for all vectors with file_id metadata
+        query_result = index.query(
+            vector=[0.0]*1024,
+            filter={"file_id": {"$eq": str(file_id)}},
+            top_k=10000,
+            include_metadata=True
+        )
+        ids = [match["id"] for match in query_result["matches"]]
+        if ids:
+            index.delete(ids=ids)
         return True
     except Exception as e:
-        print(f"Error deleting
+        print(f"Error deleting from Pinecone: {str(e)}")
         return False
+
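The rewritten delete_doc_from_pinecone works around serverless Pinecone indexes, which do not support delete-by-metadata-filter: it queries with a zero vector ([0.0]*1024 assumes a 1024-dimensional index) to collect the chunk ids for the file, then deletes them by id. Note that top_k=10000 is Pinecone's query cap, so a document with more chunks than that would leave strays. On pod-based indexes the same cleanup can be a single filtered delete; a sketch under that assumption, reusing the module's pc and INDEX_NAME globals:

def delete_doc_from_pinecone_podbased(file_id: int) -> bool:
    # Pod-based indexes accept a metadata filter on delete; serverless ones
    # reject it, which is why the committed version queries for ids first.
    try:
        index = pc.Index(INDEX_NAME)
        index.delete(filter={"file_id": {"$eq": str(file_id)}})
        return True
    except Exception as e:
        print(f"Error deleting from Pinecone: {str(e)}")
        return False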
prompt_templates.py
DELETED
@@ -1,64 +0,0 @@
-# 1. Auto-summarization prompt template
-AUTO_SUMMARY_TEMPLATE = """
-Summarize the following document in no more than 150 words. Focus on the main points and key findings. Do not include information not present in the document.
-
-DOCUMENT:
-{document}
-
-SUMMARY:
-"""
-
-# 2. Question answering prompt template
-QA_PROMPT_TEMPLATE = """
-Answer the following question based only on the provided document. Your answer must be grounded in the document and include a specific reference to the text that supports your answer.
-
-Document:
-{document}
-
-Question:
-{question}
-
-Answer:
-"""
-
-# 3. Logic-based question generation prompt template
-LOGIC_QUESTION_GENERATION_TEMPLATE = """
-Generate three logic-based or comprehension-focused questions about the following document. Each question should require understanding or reasoning about the document content, not just simple recall. Provide each question on a new line.
-
-Document:
-{document}
-
-Questions:
-"""
-
-# 4. Answer evaluation prompt template
-ANSWER_EVALUATION_TEMPLATE = """
-Evaluate the following user answer to the question, using only the provided document as the source of truth. State whether the answer is correct or not, and provide a brief justification referencing the document.
-
-Document:
-{document}
-
-Question:
-{question}
-
-User Answer:
-{user_answer}
-
-Evaluation:
-"""
-
-# 5. For memory/follow-up: Chat prompt template
-CHAT_PROMPT_TEMPLATE = """
-The following is a conversation between a user and an AI assistant about a document. The assistant answers questions and provides justifications based on the document. Use the conversation history and the document to answer the new question.
-
-Document:
-{document}
-
-Conversation History:
-{history}
-
-Question:
-{question}
-
-Answer:
-"""
pydantic_models.py
CHANGED
@@ -23,3 +23,11 @@ class DocumentInfo(BaseModel):
 
 class DeleteFileRequest(BaseModel):
     file_id: int
+
+class ChallengeRequest(BaseModel):
+    file_id: int
+
+class EvaluateAnswer(BaseModel):
+    file_id: int
+    question: str
+    user_answer: str
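The two new models define the request bodies for /challenge-me and /evaluate-response. A construction example with made-up field values:

from pydantic_models import ChallengeRequest, EvaluateAnswer

challenge = ChallengeRequest(file_id=1)

evaluation = EvaluateAnswer(
    file_id=1,
    question="What problem does the paper address?",
    user_answer="It estimates crop water stress from thermal imagery.",
)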
ui.py
CHANGED
@@ -1,61 +1,25 @@
-
+
 import requests
 
 # Set the FastAPI backend URL
-
-
-
-
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    response = requests.get(f"{FASTAPI_URL}/list-docs")
-    if response.status_code == 200:
-        documents = response.json()
-        st.write("Available Documents:")
-        for doc in documents:
-            st.write(f"- {doc['filename']} (ID: {doc['file_id']})")
-    else:
-        st.error("Failed to list documents")
-
-# Interaction Modes
-mode = st.radio("Choose Mode", ["Ask Anything", "Challenge Me"])
-
-if mode == "Ask Anything":
-    question = st.text_input("Ask a question about the document")
-    if question and st.button("Submit"):
-        payload = {
-            "question": question,
-            "session_id": "user123", # Replace with actual session management
-            "model": "default" # Replace with your model selection
-        }
-        response = requests.post(f"{FASTAPI_URL}/chat", json=payload)
-        if response.status_code == 200:
-            result = response.json()
-            st.write("Answer:", result["answer"])
-            # If your backend returns a source snippet, display it:
-            # st.write("Source:", result.get("source", ""))
-        else:
-            st.error("Failed to get answer")
-
-# elif mode == "Challenge Me":
-#     if st.button("Generate Challenge Questions"):
-
-#         # Assume your backend has a `/generate-questions` endpoint
-#         # response = requests.post(f"{FASTAPI_URL}/generate-questions", json={"file_id": file_id})
-#         # if response.status_code == 200:
-#         #     questions
+BASE_URL = "http://localhost:8000"
+
+
+# with open("neural computing cwsi.pdf", "rb") as f:
+#     files = {"file": ("neural computing cwsi.pdf", f, "text/plain")}
+#     upload_response = requests.post(f"{BASE_URL}/chat", files=files)
+#     print("Upload Response:", upload_response.json())
+
+# file_id = upload_response.json().get("summary")
+
+# print(file_id)
+
+chat_data = {"question": "What is the main topic?", "model": "gpt-4o-mini"}
+chat_response = requests.post(f"{BASE_URL}/chat", json=chat_data)
+print("Chat Response:", chat_response.json())
+
+# delete_data={"file_id": 1}
+
+# delete_response = requests.post(f"{BASE_URL}/delete-doc", json=delete_data)
+
+# print("Delete Response:", delete_response.json())