import threading

import gradio as gr
import spaces  # needed for Hugging Face ZeroGPU Spaces; unused here since the model runs on CPU
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Load the base model in full precision on CPU. NOTE: the original
# "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit" checkpoint embeds a
# bitsandbytes quantization config, which fails on CPU-only hardware;
# the full-precision repo below is assumed as the bitsandbytes-free equivalent.
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/Qwen2.5-3B-Instruct",
    device_map="cpu",  # force CPU so no CUDA/bitsandbytes is required
    torch_dtype="auto",
)

# Attach the LoRA adapter on top of the base model
model = PeftModel.from_pretrained(
    base_model,
    "ZennyKenny/GPRO_LoRA_Qwen_3B",
)

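# Optional sketch (assumption, not part of the original app): merging the LoRA
# weights into the base model can speed up CPU inference, at the cost of no
# longer being able to swap adapters at runtime:
#
#   model = model.merge_and_unload()
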
# Load the tokenizer from the same repo as the base model
tokenizer = AutoTokenizer.from_pretrained("unsloth/Qwen2.5-3B-Instruct")

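# Optional sketch (assumption): Qwen2.5-Instruct tokenizers ship a chat
# template, and apply_chat_template() typically yields better
# instruction-following than a hand-rolled prompt string:
#
#   messages = [{"role": "user", "content": prompt}]
#   text = tokenizer.apply_chat_template(
#       messages, tokenize=False, add_generation_prompt=True
#   )
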
def generate_response(prompt):
    reasoning_prompt = (
        "Answer the following question and explain your reasoning step by step.\n"
        f"Question: {prompt}\nReasoning:"
    )
    inputs = tokenizer(reasoning_prompt, return_tensors="pt").to(model.device)

    # Stream decoded tokens as they are generated; skip_prompt=True keeps the
    # input prompt out of the streamed output
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        streamer=streamer,
    )

    # Run generation in a background thread so tokens can be consumed here
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Gradio replaces the output box on every yield, so accumulate the text
    # rather than yielding bare chunks
    generated = ""
    for new_text in streamer:
        generated += new_text
        yield generated
    thread.join()

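# Hypothetical local smoke test (commented out so the module stays
# import-safe on Spaces):
#
#   final = ""
#   for final in generate_response("Why is the sky blue?"):
#       pass
#   print(final)
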
# Minimal Gradio UI; a generator fn streams partial text into the output box
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(label="Response"),
    title="LoRA Model Reasoning Inference",
    description="Demo your LoRA model with step-by-step reasoning in Hugging Face Gradio.",
    allow_flagging="never",
)

# share=True is unsupported on Hugging Face Spaces (the app is already
# publicly hosted), so launch without it
demo.launch()
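
# Assumed requirements.txt for this Space (sketch; versions unpinned):
#   torch
#   transformers
#   peft
#   accelerate  # required when passing device_map to from_pretrained
#   gradio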