import threading

import gradio as gr
import spaces
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Load the base model (a pre-quantized 4-bit checkpoint) on CPU so the app
# can start without requiring a GPU.
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit",
    device_map="cpu",  # Force CPU to avoid the bitsandbytes dependency at startup
    torch_dtype="auto",
)

# Load the LoRA adapter on top of the base model
model = PeftModel.from_pretrained(
    base_model,
    "ZennyKenny/GPRO_LoRA_Qwen_3B",
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit")


@spaces.GPU
def generate_response(prompt):
    reasoning_prompt = (
        "Answer the following question and explain your reasoning step by step.\n"
        f"Question: {prompt}\nReasoning:"
    )
    inputs = tokenizer(reasoning_prompt, return_tensors="pt").to(model.device)

    # Stream tokens as they are generated; skip_prompt keeps the input prompt
    # out of the streamed output.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        streamer=streamer,
    )

    # Run generation in a background thread so the streamer can be consumed here.
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Each yield replaces the Gradio output, so accumulate the chunks and
    # yield the running text to produce a growing response.
    response = ""
    for new_text in streamer:
        response += new_text
        yield response
    thread.join()


demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(label="Response"),
    title="LoRA Model Reasoning Inference",
    description="Demo your LoRA model with step-by-step reasoning in Hugging Face Gradio.",
    allow_flagging="never",
)

demo.launch(share=True)