import gradio as gr
import torch
from unsloth import FastLanguageModel
from peft import PeftModel

# Load the 4-bit quantized base model and its tokenizer
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2-9b-bnb-4bit",
    max_seq_length = 2048,
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32,
    load_in_4bit = True,
)

# Attach your LoRA adapter on top of the base model
model = PeftModel.from_pretrained(
    base_model,
    "vansh-myth/gemma-2-mumbai_slang-adapter",  # 🔁 Change this to your HF repo path
    device_map = "auto",
)

# Enable Unsloth's fast inference mode
FastLanguageModel.for_inference(model)

# Note: device placement is already handled by device_map="auto";
# calling .to("cuda") on a 4-bit quantized model would raise an error,
# so no manual move is needed here.

# Chat function: tokenize the prompt, generate, and decode the reply
def chat(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=200)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Gradio UI
iface = gr.Interface(
    fn=chat,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs="text",
    title="Gemma-2-9B LoRA Chat",
    description="LoRA fine-tuned version of Gemma-2-9B on Hugging Face Spaces.",
)

iface.launch()