Update app.py

app.py CHANGED
@@ -3,12 +3,13 @@ import gradio as gr
 from peft import PeftModel
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import threading
+import torch
 
 # Load the base model without quantization to avoid bitsandbytes issues
 base_model = AutoModelForCausalLM.from_pretrained(
     "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit",
-    device_map="cpu", #
-    torch_dtype=
+    device_map="cpu",  # Ensure it runs on CPU to avoid bitsandbytes issues
+    torch_dtype=torch.float32  # Explicitly set dtype
 )
 
 # Load the LoRA adapter
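This hunk completes the two load arguments that previously dangled and imports torch for the dtype. Presumably the point is that bitsandbytes 4-bit kernels want a GPU, so on a CPU-only Space the checkpoint is loaded as plain float32 instead. A minimal sketch, not part of the commit, to confirm where the weights actually land:

import torch
from transformers import AutoModelForCausalLM

# Sketch: load exactly as the commit does, then inspect device and dtype.
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit",
    device_map="cpu",
    torch_dtype=torch.float32,
)
param = next(base_model.parameters())
print(param.device, param.dtype)  # expected: cpu torch.float32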
@@ -17,6 +18,10 @@ model = PeftModel.from_pretrained(
     "ZennyKenny/GPRO_LoRA_Qwen_3B"
 )
 
+# Move model to CPU explicitly (since peft sometimes does not move it automatically)
+model.to("cpu")
+model.eval()  # Ensure the model is in inference mode
+
 # Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained("unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit")
 
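This hunk moves the adapted model to CPU and puts it in eval mode. If CPU latency matters, one optional follow-up that is not in this commit is to fold the LoRA deltas into the base weights with peft's merge_and_unload(), which removes the adapter indirection at inference time:

from peft import PeftModel

# Sketch: assumes base_model was loaded as in the previous hunk.
model = PeftModel.from_pretrained(base_model, "ZennyKenny/GPRO_LoRA_Qwen_3B")
model = model.merge_and_unload()  # merges the adapter weights into the base model
model.to("cpu")
model.eval()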
@@ -26,13 +31,17 @@ def generate_response(prompt):
         "Answer the following question and explain your reasoning step by step.\n"
         f"Question: {prompt}\nReasoning:"
     )
-
+
+    # Tokenize and move to correct device
+    inputs = tokenizer(reasoning_prompt, return_tensors="pt")
+    input_ids = inputs["input_ids"].to("cpu")  # Ensure tensor is on the correct device
 
     # Using TextIteratorStreamer for streaming responses
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
 
+    # Adjust generation parameters
     generation_kwargs = dict(
-
+        input_ids=input_ids,
         max_new_tokens=300,
         do_sample=True,
         temperature=0.8,
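Here the prompt is finally tokenized and input_ids is wired into generation_kwargs, which the old dict was missing. A small refinement, sketched here rather than taken from the commit, is to forward the attention_mask as well, since generate() otherwise has to infer one and may warn about it:

# Sketch: reasoning_prompt, tokenizer, and streamer are the names from app.py above.
inputs = tokenizer(reasoning_prompt, return_tensors="pt").to("cpu")
generation_kwargs = dict(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=300,
    do_sample=True,
    temperature=0.8,
    streamer=streamer,
)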
@@ -40,12 +49,14 @@ def generate_response(prompt):
         streamer=streamer
     )
 
+    # Ensure streaming happens in a separate thread
     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
     for new_text in streamer:
         yield new_text
 
+# Define Gradio UI
 demo = gr.Interface(
     fn=generate_response,
     inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
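model.generate() blocks, so the commit runs it on a worker thread while the main thread yields tokens from the streamer. Two common additions with this pattern, sketched here and not part of the commit, are skip_prompt=True so the echoed prompt stays out of the stream, and a final join() so the worker does not outlive the generator:

import threading
from transformers import TextIteratorStreamer

# Sketch: assumes model, tokenizer, and generation_kwargs as defined in app.py.
def stream_response(generation_kwargs):
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs["streamer"] = streamer
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    for new_text in streamer:
        yield new_text
    thread.join()  # ensure the worker has finished before the generator returns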
@@ -55,4 +66,5 @@ demo = gr.Interface(
     allow_flagging="never"
 )
 
-
+# Launch the Gradio app
+demo.launch(share=True)
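One note on the new launch line: a Hugging Face Space already serves the app publicly, and to my knowledge Gradio ignores share=True in that environment with a warning, so a plain launch() should behave the same. For a streaming generator like generate_response, enabling the queue is the usual companion (older Gradio versions required it for generator outputs; it is harmless otherwise):

# Sketch: queue() is the standard pattern for streaming outputs in Gradio.
demo.queue()
demo.launch()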