import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Hugging Face Hub id of the checkpoint to serve. The Auto* classes below are
# handed this id and return the tokenizer / causal-LM objects for it.
model_name = "polyglots/SinLlama_v01"

# use_fast=False requests the non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
)

# Weights are loaded as float16; device placement is delegated to "auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)

def chat_with_sinllama(message, history=None):
    """Generate a model reply to ``message`` and record the turn in ``history``.

    Args:
        message: Text submitted by the user.
        history: List of ``(user_message, reply)`` pairs for the conversation
            so far. ``None`` starts a new conversation. A list that is passed
            in is appended to in place.

    Returns:
        A 2-tuple containing the updated history list twice.

    Note:
        Only ``message`` is sent to the model; earlier turns stored in
        ``history`` are not included in the prompt.
    """
    # A `[]` default is created once and shared by every call, which would
    # leak turns between unrelated conversations.
    if history is None:
        history = []

    # Nothing to answer: leave the conversation untouched instead of sampling
    # from an empty prompt.
    if not message or not message.strip():
        return history, history

    inputs = tokenizer(message, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.7,
        do_sample=True,
    )

    # For a causal LM the generated sequence starts with the prompt tokens.
    # Drop them so the reply does not repeat the user's own message.
    prompt_length = inputs["input_ids"].shape[-1]
    new_tokens = outputs[0][prompt_length:]
    reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    history.append((message, reply))
    return history, history

# Gradio UI: a chat transcript, a message box, and a button that empties the
# transcript.
demo = gr.Blocks()
with demo:
    chatbot = gr.Chatbot(label="SinLlama (Sinhala Chatbot)")
    msg = gr.Textbox(label="Type your message in Sinhala")
    clear = gr.Button("Clear")

    # Submitting the textbox sends its text plus the current transcript to
    # the handler; both returned values are routed to the chatbot component.
    msg.submit(
        fn=chat_with_sinllama,
        inputs=[msg, chatbot],
        outputs=[chatbot, chatbot],
    )

    # The button sets the chatbot's value to None, bypassing the queue.
    clear.click(
        fn=lambda: None,
        inputs=None,
        outputs=chatbot,
        queue=False,
    )

demo.launch()