Update app.py

app.py CHANGED
@@ -3,12 +3,13 @@ import gradio as gr
 from peft import PeftModel
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import threading
+import torch
 
 # Load the base model without quantization to avoid bitsandbytes issues
 base_model = AutoModelForCausalLM.from_pretrained(
     "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit",
-    device_map="cpu", #
-    torch_dtype=
+    device_map="cpu",  # Ensure it runs on CPU to avoid bitsandbytes issues
+    torch_dtype=torch.float32  # Explicitly set dtype
 )
 
 # Load the LoRA adapter
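This hunk completes the two load arguments that previously dangled and imports torch for the dtype. Presumably the point is that bitsandbytes 4-bit kernels want a GPU, so on a CPU-only Space the checkpoint is loaded as plain float32 instead. A minimal sketch, not part of the commit, to confirm where the weights actually land:

import torch
from transformers import AutoModelForCausalLM

# Sketch: load exactly as the commit does, then inspect device and dtype.
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit",
    device_map="cpu",
    torch_dtype=torch.float32,
)
param = next(base_model.parameters())
print(param.device, param.dtype)  # expected: cpu torch.float32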
@@ -17,6 +18,10 @@ model = PeftModel.from_pretrained(
     "ZennyKenny/GPRO_LoRA_Qwen_3B"
 )
 
+# Move model to CPU explicitly (since peft sometimes does not move it automatically)
+model.to("cpu")
+model.eval()  # Ensure the model is in inference mode
+
 # Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained("unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit")
 
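This hunk moves the adapted model to CPU and puts it in eval mode. If CPU latency matters, one optional follow-up that is not in this commit is to fold the LoRA deltas into the base weights with peft's merge_and_unload(), which removes the adapter indirection at inference time:

from peft import PeftModel

# Sketch: assumes base_model was loaded as in the previous hunk.
model = PeftModel.from_pretrained(base_model, "ZennyKenny/GPRO_LoRA_Qwen_3B")
model = model.merge_and_unload()  # merges the adapter weights into the base model
model.to("cpu")
model.eval()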
@@ -26,13 +31,17 @@ def generate_response(prompt):
         "Answer the following question and explain your reasoning step by step.\n"
         f"Question: {prompt}\nReasoning:"
     )
-
+
+    # Tokenize and move to correct device
+    inputs = tokenizer(reasoning_prompt, return_tensors="pt")
+    input_ids = inputs["input_ids"].to("cpu")  # Ensure tensor is on the correct device
 
     # Using TextIteratorStreamer for streaming responses
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
 
+    # Adjust generation parameters
     generation_kwargs = dict(
-
+        input_ids=input_ids,
         max_new_tokens=300,
         do_sample=True,
         temperature=0.8,
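Here the prompt is finally tokenized and input_ids is wired into generation_kwargs, which the old dict was missing. A small refinement, sketched here rather than taken from the commit, is to forward the attention_mask as well, since generate() otherwise has to infer one and may warn about it:

# Sketch: reasoning_prompt, tokenizer, and streamer are the names from app.py above.
inputs = tokenizer(reasoning_prompt, return_tensors="pt").to("cpu")
generation_kwargs = dict(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=300,
    do_sample=True,
    temperature=0.8,
    streamer=streamer,
)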
@@ -40,12 +49,14 @@ def generate_response(prompt):
         streamer=streamer
     )
 
+    # Ensure streaming happens in a separate thread
     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
     for new_text in streamer:
         yield new_text
 
+# Define Gradio UI
 demo = gr.Interface(
     fn=generate_response,
     inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
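model.generate() blocks, so the commit runs it on a worker thread while the main thread yields tokens from the streamer. Two common additions with this pattern, sketched here and not part of the commit, are skip_prompt=True so the echoed prompt stays out of the stream, and a final join() so the worker does not outlive the generator:

import threading
from transformers import TextIteratorStreamer

# Sketch: assumes model, tokenizer, and generation_kwargs as defined in app.py.
def stream_response(generation_kwargs):
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs["streamer"] = streamer
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    for new_text in streamer:
        yield new_text
    thread.join()  # ensure the worker has finished before the generator returns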
@@ -55,4 +66,5 @@ demo = gr.Interface(
     allow_flagging="never"
 )
 
-
+# Launch the Gradio app
+demo.launch(share=True)
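One note on the new launch line: a Hugging Face Space already serves the app publicly, and to my knowledge Gradio ignores share=True in that environment with a warning, so a plain launch() should behave the same. For a streaming generator like generate_response, enabling the queue is the usual companion (older Gradio versions required it for generator outputs; it is harmless otherwise):

# Sketch: queue() is the standard pattern for streaming outputs in Gradio.
demo.queue()
demo.launch()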