# app.py — Simplified for Hugging Face Spaces
# ---------------------------------------------------------------
# This version uses the high-level `pipeline` from transformers
# for a much simpler and cleaner implementation.
# ---------------------------------------------------------------
import os

import torch
import gradio as gr
from transformers import pipeline

# ── Configuration ────────────────────────────────────────────────────────────
# Set the model repository ID
MODEL_ID = "Reubencf/gemma3-goan-finetuned"
HF_TOKEN = os.getenv("HF_TOKEN")  # Optional: for private models
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

TITLE = "🌴 Gemma Goan Q&A Bot"
DESCRIPTION = (
    "This is a simple Gradio chat interface for the Gemma model fine-tuned on a Goan Q&A dataset.\n"
    "Ask about Goa, Konkani culture, or general topics!"
)

# ── Load Model Pipeline ───────────────────────────────────────────────────────
# The model and tokenizer are loaded into a single pipeline object.
# This is done only once, when the app starts.
# `device_map="auto"` places the model on a GPU if one is available.
print(f"[Init] Loading model pipeline: {MODEL_ID} on {DEVICE}...")

try:
    pipe = pipeline(
        "text-generation",
        model=MODEL_ID,
        torch_dtype=torch.bfloat16,  # Use bfloat16 for better performance
        device_map="auto",
        token=HF_TOKEN,
    )
    MODEL_LOADED = True
    print("[Init] Model pipeline loaded successfully.")
except Exception as e:
    MODEL_LOADED = False
    DESCRIPTION = f"❌ Model failed to load: {e}"
    print(f"[Fatal] Could not load model: {e}")

# ── Generation Function ───────────────────────────────────────────────────────
def generate_response(message, history):
    """
    Called once for each user message.

    Takes the user's message and the conversation history, formats them
    for the model, and returns the model's response.
    """
    if not MODEL_LOADED:
        return "⚠️ Model is not available. Please check the Space logs for errors."

    # Format the conversation history into the structure the model expects:
    # a list of dictionaries with "role" and "content" keys.
    conversation = []
    for user_msg, assistant_msg in history:
        conversation.append({"role": "user", "content": user_msg})
        if assistant_msg:
            conversation.append({"role": "assistant", "content": assistant_msg})

    # Add the current user's message
    conversation.append({"role": "user", "content": message})

    # Use the pipeline's tokenizer to apply the chat template.
    # This correctly formats the input for the conversational model.
    prompt = pipe.tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    # Generate the response using the pipeline.
    # `max_new_tokens` caps the reply length; without it the pipeline may
    # fall back to a very short default and truncate the answer.
    outputs = pipe(
        prompt,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
    )

    # The pipeline output includes the entire prompt (conversation history),
    # so only the newly generated text from the assistant should be returned.
    response = outputs[0]["generated_text"]

    # Slice the response to keep only the newly generated part
    new_response = response[len(prompt):].strip()

    return new_response


# ── UI ────────────────────────────────────────────────────────────────────────
# Define some example questions to display in the UI
examples = [
    "What is bebinca?",
    "Tell me about the history of Feni.",
    "Suggest a good, quiet beach in South Goa.",
    "Describe Goan fish curry.",
]

# Create the Gradio ChatInterface
demo = gr.ChatInterface(
    fn=generate_response,
    title=TITLE,
    description=DESCRIPTION,
    examples=examples,
    theme="soft",
)

# ── Launch ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("🚀 Starting Gradio app...")
    demo.launch()
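
# ── Optional: quick smoke test ────────────────────────────────────────────────
# A minimal sketch for exercising generate_response directly (e.g. from a
# Python shell) without the Gradio UI. Not part of the Space itself; the
# sample question is illustrative only, and the empty list stands in for the
# (user, assistant) history pairs that gr.ChatInterface would normally pass.
#
#   >>> from app import generate_response
#   >>> generate_response("What is bebinca?", [])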