ZennyKenny committed
Commit 3f3d24b · verified · 1 Parent(s): 9ec6b90

Update app.py

Files changed (1)
    app.py  +17 -5
app.py CHANGED
@@ -3,12 +3,13 @@ import gradio as gr
 from peft import PeftModel
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import threading
+import torch
 
 # Load the base model without quantization to avoid bitsandbytes issues
 base_model = AutoModelForCausalLM.from_pretrained(
     "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit",
-    device_map="cpu",  # Force CPU to avoid bitsandbytes dependency
-    torch_dtype="auto"
+    device_map="cpu",  # Ensure it runs on CPU to avoid bitsandbytes issues
+    torch_dtype=torch.float32  # Explicitly set dtype
 )
 
 # Load the LoRA adapter
@@ -17,6 +18,10 @@ model = PeftModel.from_pretrained(
     "ZennyKenny/GPRO_LoRA_Qwen_3B"
 )
 
+# Move model to CPU explicitly (since peft sometimes does not move it automatically)
+model.to("cpu")
+model.eval()  # Ensure the model is in inference mode
+
 # Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained("unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit")
 
@@ -26,13 +31,17 @@ def generate_response(prompt):
         "Answer the following question and explain your reasoning step by step.\n"
         f"Question: {prompt}\nReasoning:"
     )
-    inputs = tokenizer(reasoning_prompt, return_tensors="pt").to(model.device)
+
+    # Tokenize and move to correct device
+    inputs = tokenizer(reasoning_prompt, return_tensors="pt")
+    input_ids = inputs["input_ids"].to("cpu")  # Ensure tensor is on the correct device
 
     # Using TextIteratorStreamer for streaming responses
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
 
+    # Adjust generation parameters
     generation_kwargs = dict(
-        **inputs,
+        input_ids=input_ids,
         max_new_tokens=300,
         do_sample=True,
         temperature=0.8,
@@ -40,12 +49,14 @@ def generate_response(prompt):
         streamer=streamer
     )
 
+    # Ensure streaming happens in a separate thread
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
     for new_text in streamer:
         yield new_text
 
+# Define Gradio UI
 demo = gr.Interface(
     fn=generate_response,
     inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
@@ -55,4 +66,5 @@ demo = gr.Interface(
     allow_flagging="never"
 )
 
-demo.launch(share=True)
+# Launch the Gradio app
+demo.launch(share=True)
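
Below is a minimal, self-contained sketch of the pattern this commit converges on: CPU-only base model in float32, LoRA adapter attached via PEFT, explicit input_ids, and a TextIteratorStreamer drained while model.generate runs in a background thread. The helper name stream_generate, the skip_prompt=True flag, and the explicit attention_mask are illustrative additions, not part of the committed app.py.

import threading

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Base model on CPU in float32, LoRA adapter on top, as in the commit
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit",
    device_map="cpu",
    torch_dtype=torch.float32,
)
model = PeftModel.from_pretrained(base_model, "ZennyKenny/GPRO_LoRA_Qwen_3B").to("cpu").eval()
tokenizer = AutoTokenizer.from_pretrained("unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit")

def stream_generate(prompt):  # hypothetical helper, not in app.py
    inputs = tokenizer(prompt, return_tensors="pt")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # passing the mask avoids generate()'s warning
        max_new_tokens=300,
        do_sample=True,
        temperature=0.8,
        streamer=streamer,
    )
    # generate() blocks until finished, so it runs in a worker thread
    # while the caller iterates the streamer and yields text chunks
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    for new_text in streamer:
        yield new_text
    thread.join()

for chunk in stream_generate("What is 17 * 23? Explain your reasoning."):
    print(chunk, end="", flush=True)

One difference from the committed code: with skip_prompt=True the streamer yields only newly generated text, whereas the default used in app.py (skip_prompt=False) also streams the prompt text back into the Gradio output box.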