import threading

import gradio as gr
import spaces  # needed for Hugging Face ZeroGPU Spaces; unused here since the model runs on CPU
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Load the base model in full precision on CPU. NOTE: the original
# "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit" checkpoint embeds a
# bitsandbytes quantization config, which fails on CPU-only hardware;
# the full-precision repo below is assumed as the bitsandbytes-free equivalent.
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/Qwen2.5-3B-Instruct",
    device_map="cpu",  # force CPU so no CUDA/bitsandbytes is required
    torch_dtype="auto",
)

# Attach the LoRA adapter on top of the base model
model = PeftModel.from_pretrained(
    base_model,
    "ZennyKenny/GPRO_LoRA_Qwen_3B",
)

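# Optional sketch (assumption, not part of the original app): merging the LoRA
# weights into the base model can speed up CPU inference, at the cost of no
# longer being able to swap adapters at runtime:
#
#   model = model.merge_and_unload()
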
# Load the tokenizer from the same repo as the base model
tokenizer = AutoTokenizer.from_pretrained("unsloth/Qwen2.5-3B-Instruct")

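# Optional sketch (assumption): Qwen2.5-Instruct tokenizers ship a chat
# template, and apply_chat_template() typically yields better
# instruction-following than a hand-rolled prompt string:
#
#   messages = [{"role": "user", "content": prompt}]
#   text = tokenizer.apply_chat_template(
#       messages, tokenize=False, add_generation_prompt=True
#   )
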
def generate_response(prompt):
    reasoning_prompt = (
        "Answer the following question and explain your reasoning step by step.\n"
        f"Question: {prompt}\nReasoning:"
    )
    inputs = tokenizer(reasoning_prompt, return_tensors="pt").to(model.device)

    # Stream decoded tokens as they are generated; skip_prompt=True keeps the
    # input prompt out of the streamed output
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        streamer=streamer,
    )

    # Run generation in a background thread so tokens can be consumed here
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Gradio replaces the output box on every yield, so accumulate the text
    # rather than yielding bare chunks
    generated = ""
    for new_text in streamer:
        generated += new_text
        yield generated
    thread.join()

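# Hypothetical local smoke test (commented out so the module stays
# import-safe on Spaces):
#
#   final = ""
#   for final in generate_response("Why is the sky blue?"):
#       pass
#   print(final)
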
# Minimal Gradio UI; a generator fn streams partial text into the output box
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(label="Response"),
    title="LoRA Model Reasoning Inference",
    description="Demo your LoRA model with step-by-step reasoning in Hugging Face Gradio.",
    allow_flagging="never",
)

# share=True is unsupported on Hugging Face Spaces (the app is already
# publicly hosted), so launch without it
demo.launch()
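
# Assumed requirements.txt for this Space (sketch; versions unpinned):
#   torch
#   transformers
#   peft
#   accelerate  # required when passing device_map to from_pretrained
#   gradio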