# LoRA Inference Gradio Space Demo

```python
import spaces
import gradio as gr
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit",
    device_map="auto",
    torch_dtype="auto"
)

# Load the LoRA adapter
model = PeftModel.from_pretrained(
    base_model,
    "ZennyKenny/GPRO_LoRA_Qwen_3B"
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit")

@spaces.GPU
def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=50)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

zk_qwen = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(label="Response"),
    title="LoRA Model Inference",
    description="Demo your LoRA model with Hugging Face Gradio."
)

zk_qwen.launch()
```
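To deploy this script as a Hugging Face Space, the libraries it imports also need to be listed in the Space's `requirements.txt`. A minimal sketch is shown below (unpinned package names; `bitsandbytes` is assumed to be needed because the base checkpoint is a bnb 4-bit quantization, and `accelerate` because of `device_map="auto"` — pin versions to match your Space's hardware and CUDA stack):

```text
gradio
spaces
transformers
peft
accelerate
bitsandbytes
torch
```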