# app.py — Simplified for Hugging Face Spaces
# ---------------------------------------------------------------
# This version uses the high-level `pipeline` from transformers
# for a much simpler and cleaner implementation.
# ---------------------------------------------------------------
import os

import torch
import gradio as gr
from transformers import pipeline

# ── Configuration ────────────────────────────────────────────────────────────
# Set the model repository ID
MODEL_ID = "Reubencf/gemma3-goan-finetuned"
HF_TOKEN = os.getenv("HF_TOKEN")  # Optional: for private models
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

TITLE = "🌴 Gemma Goan Q&A Bot"
DESCRIPTION = (
    "This is a simple Gradio chat interface for the Gemma model fine-tuned on a Goan Q&A dataset.\n"
    "Ask about Goa, Konkani culture, or general topics!"
)

# ── Load Model Pipeline ───────────────────────────────────────────────────────
# The model and tokenizer are loaded into a single pipeline object.
# This is done only once, when the app starts.
# `device_map="auto"` places the model on a GPU if one is available.
print(f"[Init] Loading model pipeline: {MODEL_ID} on {DEVICE}...")

try:
    pipe = pipeline(
        "text-generation",
        model=MODEL_ID,
        torch_dtype=torch.bfloat16,  # Use bfloat16 for better performance
        device_map="auto",
        token=HF_TOKEN,
    )
    MODEL_LOADED = True
    print("[Init] Model pipeline loaded successfully.")
except Exception as e:
    MODEL_LOADED = False
    DESCRIPTION = f"❌ Model failed to load: {e}"
    print(f"[Fatal] Could not load model: {e}")

# ── Generation Function ───────────────────────────────────────────────────────
def generate_response(message, history):
    """
    Called once for each user message.

    Takes the user's message and the conversation history, formats them
    for the model, and returns the model's response.
    """
    if not MODEL_LOADED:
        return "⚠️ Model is not available. Please check the Space logs for errors."

    # Format the conversation history into the structure the model expects:
    # a list of dictionaries with "role" and "content" keys.
    conversation = []
    for user_msg, assistant_msg in history:
        conversation.append({"role": "user", "content": user_msg})
        if assistant_msg:
            conversation.append({"role": "assistant", "content": assistant_msg})

    # Add the current user's message
    conversation.append({"role": "user", "content": message})

    # Use the pipeline's tokenizer to apply the chat template.
    # This correctly formats the input for the conversational model.
    prompt = pipe.tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    # Generate the response using the pipeline.
    # `max_new_tokens` caps the reply length; without it the pipeline may
    # fall back to a very short default and truncate the answer.
    outputs = pipe(
        prompt,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
    )

    # The pipeline output includes the entire prompt (conversation history),
    # so only the newly generated text from the assistant should be returned.
    response = outputs[0]["generated_text"]

    # Slice the response to keep only the newly generated part
    new_response = response[len(prompt):].strip()

    return new_response


# ── UI ────────────────────────────────────────────────────────────────────────
# Define some example questions to display in the UI
examples = [
    "What is bebinca?",
    "Tell me about the history of Feni.",
    "Suggest a good, quiet beach in South Goa.",
    "Describe Goan fish curry.",
]

# Create the Gradio ChatInterface
demo = gr.ChatInterface(
    fn=generate_response,
    title=TITLE,
    description=DESCRIPTION,
    examples=examples,
    theme="soft",
)

# ── Launch ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("🚀 Starting Gradio app...")
    demo.launch()
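
# ── Optional: quick smoke test ────────────────────────────────────────────────
# A minimal sketch for exercising generate_response directly (e.g. from a
# Python shell) without the Gradio UI. Not part of the Space itself; the
# sample question is illustrative only, and the empty list stands in for the
# (user, assistant) history pairs that gr.ChatInterface would normally pass.
#
#   >>> from app import generate_response
#   >>> generate_response("What is bebinca?", [])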