"""Streamlit medical chatbot backed by BioMistral-7B (GGUF, llama-cpp-python).

Downloads the quantized model from the Hugging Face Hub once, loads it once
per server process, and answers free-text medical questions.
"""

import streamlit as st
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Public Hugging Face repo hosting the quantized BioMistral weights.
REPO_ID = "MaziyarPanahi/BioMistral-7B-GGUF"
FILENAME = "BioMistral-7B.Q4_K_M.gguf"

st.set_page_config(page_title="🩺 Medical Chatbot")
st.title("🧠 Medical Chatbot using BioMistral")


@st.cache_resource(show_spinner=False)
def load_model() -> Llama:
    """Download (cached by hf_hub) and load the GGUF model exactly once.

    Without ``st.cache_resource`` Streamlit would re-execute this on every
    widget interaction, reloading the multi-GB model for each question.
    """
    with st.spinner("📥 Downloading BioMistral model from Hugging Face..."):
        model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
    with st.spinner("⚙️ Loading the model into memory..."):
        return Llama(
            model_path=model_path,
            n_ctx=4096,
            n_threads=8,
            n_gpu_layers=35,  # Use 0 for CPU-only
        )


llm = load_model()

# UI to get user input
query = st.text_input("💬 Ask a medical question:")
if query:
    with st.spinner("🧠 Thinking..."):
        # "</s>" is the Mistral-family EOS marker; an empty-string stop
        # sequence (the previous value) is a no-op.
        response = llm(query, max_tokens=512, stop=["</s>"])
    st.markdown("**Answer:**")
    st.write(response["choices"][0]["text"].strip())