import threading

import gradio as gr
import spaces
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Load the base model (a pre-quantized 4-bit checkpoint) on CPU so the app
# can start without requiring a GPU.
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit",
    device_map="cpu",  # Force CPU to avoid the bitsandbytes dependency at startup
    torch_dtype="auto",
)

# Load the LoRA adapter on top of the base model
model = PeftModel.from_pretrained(
    base_model,
    "ZennyKenny/GPRO_LoRA_Qwen_3B",
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit")


@spaces.GPU
def generate_response(prompt):
    reasoning_prompt = (
        "Answer the following question and explain your reasoning step by step.\n"
        f"Question: {prompt}\nReasoning:"
    )
    inputs = tokenizer(reasoning_prompt, return_tensors="pt").to(model.device)

    # Stream tokens as they are generated; skip_prompt keeps the input prompt
    # out of the streamed output.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        streamer=streamer,
    )

    # Run generation in a background thread so the streamer can be consumed here.
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Each yield replaces the Gradio output, so accumulate the chunks and
    # yield the running text to produce a growing response.
    response = ""
    for new_text in streamer:
        response += new_text
        yield response
    thread.join()


demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(label="Response"),
    title="LoRA Model Reasoning Inference",
    description="Demo your LoRA model with step-by-step reasoning in Hugging Face Gradio.",
    allow_flagging="never",
)

demo.launch(share=True)