akhaliq (HF Staff) committed
Commit a2a8b36 · verified · 1 parent: 7086d9c

Upload app.py with huggingface_hub

Files changed (1):
  1. app.py (+89 -0)
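For reference, an upload like the one in this commit is typically done with the huggingface_hub client. The sketch below is illustrative only: the repo id is a placeholder and the token handling is an assumption, neither is taken from this commit.

# Minimal sketch of uploading app.py with huggingface_hub (hypothetical repo id).
from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` or the HF_TOKEN env var
api.upload_file(
    path_or_fileobj="app.py",            # local file to upload
    path_in_repo="app.py",               # destination path inside the repo
    repo_id="your-username/your-space",  # placeholder, not the actual repo id
    repo_type="space",                   # Gradio apps live in Space repos
    commit_message="Upload app.py with huggingface_hub",
)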
app.py ADDED
@@ -0,0 +1,89 @@
import os
import torch
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer

# -------------------------------------------------
# Model setup (loaded once at startup)
# -------------------------------------------------
model_name = "Qwen/Qwen3-4B-Thinking-2507"

# Use environment variable to avoid downloading repeatedly in Gradio reloads
if not os.getenv("MODEL_LOADED"):
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True,
    )
    os.environ["MODEL_LOADED"] = "1"

# -------------------------------------------------
# Helper to generate a response
# -------------------------------------------------
@spaces.GPU(duration=120)  # allocate GPU for up to 2 minutes per request
def generate_reply(user_message: str, history: list):
    """
    Generates a reply using the Qwen model.
    With `type="messages"`, `history` is a list of {"role", "content"} dicts.
    """
    # Build the message list expected by the chat template
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    for msg in history:
        # Each history entry is already an OpenAI-style message dict
        messages.append({"role": msg["role"], "content": msg["content"]})
    messages.append({"role": "user", "content": user_message})

    # Apply chat template to get the prompt text
    prompt_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)

    # Generate tokens (allow large output; adjust as needed)
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=1024,  # reasonable limit for interactive chat
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Remove the input tokens from the output
    new_token_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

    # Try to split thinking (<think>) from the final answer
    try:
        # Token id for </think> (151668) is model-specific; adjust if needed
        end_think_idx = len(new_token_ids) - new_token_ids[::-1].index(151668)
    except ValueError:
        end_think_idx = 0

    thinking = tokenizer.decode(new_token_ids[:end_think_idx], skip_special_tokens=True).strip()
    answer = tokenizer.decode(new_token_ids[end_think_idx:], skip_special_tokens=True).strip()

    # Log thinking content for debugging (optional)
    if thinking:
        print("[Thinking] ", thinking)

    return answer

# -------------------------------------------------
# Gradio UI
# -------------------------------------------------
chat_interface = gr.ChatInterface(
    fn=generate_reply,
    type="messages",
    title="Qwen 3-4B Thinking Chatbot",
    description="Chat with Qwen3-4B-Thinking. The model may emit internal reasoning (shown in server logs).",
    examples=[
        ["Give me a short introduction to large language models."],
        ["What are the benefits of using transformers?"],
        ["Explain the concept of attention in neural networks."],
    ],
)

if __name__ == "__main__":
    chat_interface.launch()
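A small follow-up on the hardcoded 151668 token id used to split the model's reasoning from its answer: the id can also be resolved from the tokenizer at startup instead of being hardcoded. The sketch below is not part of the committed file; it assumes the Qwen3 tokenizer maps "</think>" to a single special-token id.

# Sketch: resolve the </think> token id from the tokenizer (assumption: "</think>"
# is registered as a single special token for this model).
END_THINK_ID = tokenizer.convert_tokens_to_ids("</think>")

try:
    end_think_idx = len(new_token_ids) - new_token_ids[::-1].index(END_THINK_ID)
except ValueError:
    end_think_idx = 0  # no </think> marker found; treat everything as the answer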