SankethHonavar committed
Commit 76b04ec · Parent: 18c83ea

Deploy LLM Medical Chatbot with FAISS

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/medmcqa_index/index.faiss filter=lfs diff=lfs merge=lfs -text
+ data/medmcqa_index/index.pkl filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,18 @@
+ __pycache__/
+ *.pyc
+ *.pkl
+ *.db
+ *.log
+ .env
+ .venv/
+ .ipynb_checkpoints/
+ *.sqlite3
+ *.DS_Store
+ try.py
+
+ # Ignore everything in data folder
+ data/*
+
+ # But allow medmcqa_index folder and its contents
+ !data/medmcqa_index/
+ !data/medmcqa_index/**
Dockerfile CHANGED
@@ -2,14 +2,11 @@ FROM python:3.10
  
  WORKDIR /app
  
- # Copy and install dependencies first
  COPY requirements.txt .
  RUN pip install --no-cache-dir -r requirements.txt
  
- # Copy all project files and folders
  COPY . .
  
  EXPOSE 7860
  
- # Run your Streamlit app (entry point)
- CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
+ CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
app.py ADDED
@@ -0,0 +1,179 @@
+ import streamlit as st
+ from retriever import load_vector_store
+ from langgraph_graph import generate_answer
+ from time import sleep
+
+ # Load vector DB
+ db = load_vector_store()
+
+ st.set_page_config(page_title="MedMCQA Chatbot", page_icon="🩺")
+
+ # 🌗 Theme toggle sidebar
+ with st.sidebar:
+     st.title("🩺 MedMCQA Chatbot")
+     theme_mode = st.radio("🌓 Theme", ["Light", "Dark"], horizontal=True)
+
+ # 🌓 Apply selected theme
+ if theme_mode == "Dark":
+     st.markdown("""
+         <style>
+         :root { --text-color: #eee; }
+         body, .stApp {
+             background-color: #1e1e1e !important;
+             color: var(--text-color) !important;
+         }
+         .stTextInput input {
+             background-color: #333 !important;
+             color: var(--text-color) !important;
+         }
+         .stTextInput label {
+             color: var(--text-color) !important;
+         }
+         input::placeholder {
+             color: #bbb !important;
+         }
+         .stButton>button {
+             background-color: #444 !important;
+             color: var(--text-color) !important;
+         }
+         </style>
+     """, unsafe_allow_html=True)
+ else:
+     st.markdown("""
+         <style>
+         :root { --text-color: #111; }
+         body, .stApp {
+             background-color: #ffffff !important;
+             color: var(--text-color) !important;
+         }
+         .stTextInput input {
+             background-color: #f0f0f0 !important;
+             color: var(--text-color) !important;
+         }
+         .stTextInput label {
+             color: var(--text-color) !important;
+         }
+         input::placeholder {
+             color: #444 !important;
+         }
+         .stButton>button {
+             background-color: #e0e0e0 !important;
+             color: var(--text-color) !important;
+         }
+         </style>
+     """, unsafe_allow_html=True)
+
+ # 🧠 App title
+ st.header("🩺 MedMCQA Chatbot")
+ st.caption("Ask a medical question and get answers drawn from the MedMCQA dataset only. If no match is found, the bot declines gracefully.")
+
+ # ✏️ Query box
+ query = st.text_input(
+     "🔍 Enter your medical question:",
+     placeholder="e.g., What is the mechanism of Aspirin?",
+     label_visibility="visible"
+ )
+
+ # 🚀 Answer generation
+ if query:
+     results = db.similarity_search(query, k=3)
+     context = "\n\n".join(doc.page_content for doc in results)
+
+     with st.spinner("🧠 Generating answer..."):
+         response = generate_answer(query, context)
+
+     st.markdown("""
+         <style>
+         .fade-in {
+             animation: fadeIn 0.7s ease-in;
+         }
+         @keyframes fadeIn {
+             0% { opacity: 0; transform: translateY(20px); }
+             100% { opacity: 1; transform: translateY(0); }
+         }
+         </style>
+     """, unsafe_allow_html=True)
+
+     st.markdown("<div class='fade-in'><h4>🧠 Answer:</h4></div>", unsafe_allow_html=True)
+     answer_placeholder = st.empty()
+     final_text = ""
+     for char in response:
+         final_text += char
+         answer_placeholder.markdown(f"<div class='fade-in'>{final_text}</div>", unsafe_allow_html=True)
+         sleep(0.01)
+
+     with st.expander("🔎 Top Matches"):
+         for i, doc in enumerate(results, 1):
+             content = doc.page_content
+             if query.lower() in content.lower():
+                 content = content.replace(query, f"**{query}**")
+             st.markdown(f"**Result {i}:**\n\n{content}")
+
+ # 📬 Sidebar Contact
+ with st.sidebar:
+     st.markdown("---")
+     st.markdown("### 📬 Contact")
+     st.markdown("[📧 Email](mailto:[email protected])")
+     st.markdown("[🔗 LinkedIn](https://linkedin.com/in/sankethhonavar)")
+     st.markdown("[💻 GitHub](https://github.com/sankethhonavar)")
+
+ # ✨ Floating icons (right side, top aligned)
+ st.markdown("""
+ <style>
+ .floating-button {
+     position: fixed;
+     top: 80px;
+     right: 20px;
+     display: flex;
+     flex-direction: column;
+     gap: 12px;
+     z-index: 9999;
+ }
+ .floating-button a {
+     background-color: #0077b5;
+     color: white;
+     padding: 10px 14px;
+     border-radius: 50%;
+     text-align: center;
+     font-size: 20px;
+     text-decoration: none;
+     box-shadow: 2px 2px 8px rgba(0, 0, 0, 0.3);
+     transition: background-color 0.3s;
+ }
+ .floating-button a:hover {
+     background-color: #005983;
+ }
+ .floating-button a.email {
+     background-color: #444444;
+ }
+ .floating-button a.email:hover {
+     background-color: #222222;
+ }
+ .floating-button a.github {
+     background-color: #171515;
+ }
+ .floating-button a.github:hover {
+     background-color: #000000;
+ }
+ </style>
+
+ <div class="floating-button">
+     <a href="mailto:[email protected]" class="email" title="Email Me">
+         <img src="https://img.icons8.com/ios-filled/25/ffffff/new-post.png" alt="Email"/>
+     </a>
+     <a href="https://linkedin.com/in/sankethhonavar" target="_blank" title="LinkedIn">
+         <img src="https://img.icons8.com/ios-filled/25/ffffff/linkedin.png" alt="LinkedIn"/>
+     </a>
+     <a href="https://github.com/SankethHonavar" target="_blank" class="github" title="GitHub">
+         <img src="https://img.icons8.com/ios-filled/25/ffffff/github.png" alt="GitHub"/>
+     </a>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # 📄 Footer
+ st.markdown("""
+ ---
+ <p style='text-align: center; font-size: 0.9rem; color: grey'>
+ Made with ❤️ by <a href='https://linkedin.com/in/sankethhonavar' target='_blank'>Sanketh Honavar</a>
+ </p>
+ """, unsafe_allow_html=True)
data/medmcqa_index/index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:34d54d6522c7d7b29d217d765eb4553f125c9f0d0d4a817cd466e885bef2d145
+ size 7680045
data/medmcqa_index/index.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:56f440b2cdb8220a6eee18440355834f439554987e4001c0a97513a5ac5a10d8
+ size 4297348
dataset_loader.py ADDED
@@ -0,0 +1,18 @@
+ # dataset_loader.py
+ from datasets import load_dataset
+
+ def load_medmcqa_subset(limit=5000):
+     dataset = load_dataset("medmcqa", split="train").select(range(limit))  # keep only the first `limit` examples
+
+     def format_entry(entry):
+         return {
+             "question": entry["question"],
+             "formatted": (
+                 f"Q: {entry['question']}\n"
+                 f"A. {entry['opa']} B. {entry['opb']} C. {entry['opc']} D. {entry['opd']}\n"
+                 f"Correct Answer: {entry['cop']}\n"
+                 f"Explanation: {entry['exp']}"
+             )
+         }
+
+     return [format_entry(entry) for entry in dataset]
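
A quick sanity check for the loader, assuming the standard MedMCQA fields (`question`, `opa`–`opd`, `cop`, `exp`); illustrative only, not part of this commit:

    from dataset_loader import load_medmcqa_subset

    subset = load_medmcqa_subset(limit=10)
    print(len(subset))             # 10 entries
    print(subset[0]["formatted"])  # Q / options / answer / explanation block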
langgraph_graph.py ADDED
@@ -0,0 +1,46 @@
+ from retriever import retrieve_relevant_docs
+ from langchain_core.prompts import PromptTemplate
+ from langchain.chains.combine_documents import create_stuff_documents_chain
+ from langchain.chains import create_retrieval_chain
+ from langchain_google_genai import ChatGoogleGenerativeAI
+
+ # LLM used for both the document chain and the fallback answer
+ llm = ChatGoogleGenerativeAI(model="models/gemini-1.5-flash", temperature=0.3)
+
+ # Define the structured prompt
+ prompt = PromptTemplate.from_template("""
+ You are a helpful medical assistant. Use only the dataset context below to answer.
+
+ Context:
+ {context}
+
+ Question: {input}
+
+ If you are unsure, say "Sorry, I couldn't find an answer based on the dataset." Do not guess.
+ """)
+
+ # Build the document chain and wrap it in a retrieval chain
+ document_chain = create_stuff_documents_chain(llm, prompt)
+ retriever_chain = create_retrieval_chain(retrieve_relevant_docs(), document_chain)
+
+ # Expose the chain for the Streamlit app
+ graph = retriever_chain
+
+ # Manual fallback used when the context has already been retrieved
+ def generate_answer(query: str, context: str) -> str:
+     if not context.strip():
+         return "Sorry, I couldn't find an answer based on the dataset."
+
+     # Reuse the module-level LLM rather than instantiating a second client
+     fallback_prompt = f"""
+ You are a helpful medical assistant. Use only the dataset context below to answer.
+
+ Context:
+ {context}
+
+ Question: {query}
+
+ If you are unsure, say "Sorry, I couldn't find an answer based on the dataset." Do not guess.
+ """
+     response = llm.invoke(fallback_prompt)
+     return response.content.strip()
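
For reference, the `graph` chain built by `create_retrieval_chain` takes a dict with an `input` key and returns `input`, `context` (the retrieved Documents), and `answer`. A minimal smoke test, assuming `GOOGLE_API_KEY` is set in the environment for the Gemini calls (illustrative only):

    import os
    from langgraph_graph import graph

    assert os.environ.get("GOOGLE_API_KEY"), "Gemini calls require GOOGLE_API_KEY"

    result = graph.invoke({"input": "What is the mechanism of Aspirin?"})
    print(result["answer"])        # generated answer text
    print(len(result["context"]))  # number of retrieved Documents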
main.py ADDED
@@ -0,0 +1,29 @@
+ from retriever import load_vector_store
+ from langgraph_graph import generate_answer
+
+
+ def medchat(query):
+     """
+     Full MedMCQA pipeline:
+     1. Retrieve the top matches from the FAISS index.
+     2. Prompt the LLM with a strict instruction to avoid hallucination.
+     """
+     retriever = load_vector_store()
+     matches = retriever.similarity_search(query, k=3)
+     context = "\n\n".join(match.page_content for match in matches)
+
+     # generate_answer builds the anti-hallucination prompt itself,
+     # so pass it the raw query and the retrieved context.
+     return generate_answer(query, context)
+
+
+ if __name__ == "__main__":
+     print("\n🩺 MedMCQA Chatbot")
+     print("Ask a medical question and get answers from the MedMCQA dataset.\n")
+
+     while True:
+         user_q = input("Ask a medical question (or type 'exit'): ")
+         if user_q.lower() == "exit":
+             break
+         response = medchat(user_q)
+         print("\n🧠 Answer:", response, "\n")
retriever.py ADDED
@@ -0,0 +1,69 @@
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import FAISS
+ from langchain_core.documents import Document
+ from dataset_loader import load_medmcqa_subset
+ from tqdm import tqdm  # Progress bar for visibility while formatting entries
+ import os
+
+ def retrieve_relevant_docs():
+     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+     db = FAISS.load_local("data/medmcqa_index", embeddings, allow_dangerous_deserialization=True)
+     return db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
+
+ def create_vector_store():
+     examples = load_medmcqa_subset()
+
+     # Format each entry into a LangChain Document, with a progress bar
+     docs = [
+         Document(
+             page_content=e["formatted"],
+             metadata={"question": e["question"]}
+         )
+         for e in tqdm(examples, desc="📚 Formatting MedMCQA examples")
+     ]
+
+     # Create the embedding model
+     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+     # Build and save the FAISS index
+     db = FAISS.from_documents(docs, embeddings)
+     os.makedirs("data", exist_ok=True)
+     db.save_local("data/medmcqa_index")
+     print("✅ Vector DB saved at data/medmcqa_index")
+
+
+ def load_vector_store():
+     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+     return FAISS.load_local("data/medmcqa_index", embeddings, allow_dangerous_deserialization=True)
+
+
+ if __name__ == "__main__":
+     from langchain.prompts import PromptTemplate
+     from langchain_core.output_parsers import StrOutputParser
+     from langchain_google_genai import ChatGoogleGenerativeAI
+
+     # Load DB
+     db = load_vector_store()
+     query = "What is the treatment for asthma?"
+     docs = db.similarity_search(query, k=4)
+
+     # Prompt template
+     prompt_template = PromptTemplate.from_template(
+         """
+ You are a helpful medical assistant. Use only the dataset context below to answer.
+
+ Context:
+ {context}
+
+ Question:
+ {question}
+
+ If you are unsure, say 'Sorry, I couldn't find an answer based on the dataset.'
+ """
+     )
+
+     # LLM
+     llm = ChatGoogleGenerativeAI(model="models/gemini-1.5-flash", temperature=0.3)
+
+     chain = prompt_template | llm | StrOutputParser()
+     print("\n\n🧠 Answer:\n", chain.invoke({"context": "\n\n".join(d.page_content for d in docs), "question": query}))
src/streamlit_app.py DELETED
@@ -1,40 +0,0 @@
- import altair as alt
- import numpy as np
- import pandas as pd
- import streamlit as st
-
- """
- # Welcome to Streamlit!
-
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
- forums](https://discuss.streamlit.io).
-
- In the meantime, below is an example of what you can do with just a few lines of code:
- """
-
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-
- indices = np.linspace(0, 1, num_points)
- theta = 2 * np.pi * num_turns * indices
- radius = indices
-
- x = radius * np.cos(theta)
- y = radius * np.sin(theta)
-
- df = pd.DataFrame({
-     "x": x,
-     "y": y,
-     "idx": indices,
-     "rand": np.random.randn(num_points),
- })
-
- st.altair_chart(alt.Chart(df, height=700, width=700)
-     .mark_point(filled=True)
-     .encode(
-         x=alt.X("x", axis=None),
-         y=alt.Y("y", axis=None),
-         color=alt.Color("idx", legend=None, scale=alt.Scale()),
-         size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-     ))