kz919 committed
Commit 5fb8783 · verified · 1 Parent(s): 07c2cc6

Update app.py

Files changed (1):
  app.py +17 -8
app.py CHANGED

@@ -1,6 +1,6 @@
 import spaces
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

 # Load the model and tokenizer locally
 model_name = "kz919/QwQ-0.5B-Distilled-SFT"
@@ -11,10 +11,21 @@ model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")
 @spaces.GPU
 def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
     # Prepare the prompt by combining history and system messages
-    prompt = system_message + "\n"
+    msg = [
+        {"role": "system", "content": system_message}
+    ]
     for user_input, assistant_response in history:
-        prompt += f"User: {user_input}\nAssistant: {assistant_response}\n"
-    prompt += f"User: {message}\nAssistant:"
+        msg.extend([
+            {"role": "user", "content": user_input},
+            {"role": "assistant", "content": assistant_response}
+        ])
+    msg.append({"role": "user", "content": message})
+
+    prompt = tokenizer.apply_chat_template(
+        msg,
+        tokenize=False,
+        add_generation_prompt=True
+    )

     # Tokenize the input prompt
     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
@@ -26,11 +37,9 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
         temperature=temperature,
         top_p=top_p,
         pad_token_id=tokenizer.eos_token_id,
+        streamer=TextStreamer(tokenizer),
     )
-
-    # Decode the generated tokens and yield the response
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    yield response.split("Assistant:")[-1].strip()
+    yield tokenizer.decode(outputs[0], skip_special_tokens=True)


 # Create the Gradio interface
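The substance of the commit is swapping the hand-rolled "User: ... / Assistant: ..." prompt string for the tokenizer's own chat template. A minimal sketch of what the new prompt construction produces, runnable on its own; the exact markup printed depends on the chat template this checkpoint actually ships (Qwen-family tokenizers typically render ChatML-style <|im_start|> tags):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("kz919/QwQ-0.5B-Distilled-SFT")

# Same message structure respond() now builds from the Gradio history
msg = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello! How can I help?"},
    {"role": "user", "content": "What is 2 + 2?"},
]

# tokenize=False returns the rendered prompt as a string;
# add_generation_prompt=True appends the assistant header so the
# model continues as the assistant instead of echoing the user.
prompt = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
print(prompt)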
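One caveat with the generate() change: TextStreamer writes tokens to the Space's stdout (the container logs), not to the Gradio chat window, so the UI still updates only once, when the decoded string is yielded. To stream token by token into the chat itself, the usual transformers pattern is TextIteratorStreamer drained on the main thread while generate() runs in a background thread. A sketch under that assumption; the function name and threading layout are illustrative, not part of this commit:

from threading import Thread
from transformers import TextIteratorStreamer

def respond_streaming(inputs, max_tokens, temperature, top_p):
    # skip_prompt drops the echoed input; skip_special_tokens drops template tags
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )
    # generate() blocks until decoding finishes, so it runs in a worker
    # thread while this generator drains the streamer incrementally
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial  # Gradio re-renders the growing reply on each yield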