Spaces:

akhaliq
/

Qwen3-4B-Thinking-2507

Running on Zero

App Files Files Community

akhaliq HF Staff committed on 2 days ago

Commit

a2a8b36

verified ·

1 Parent(s): 7086d9c

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +89 -0

app.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import os
+import torch
+import gradio as gr
+import spaces
+from transformers import AutoModelForCausalLM, AutoTokenizer
+# -------------------------------------------------
+# Model setup (loaded once at startup)
+# -------------------------------------------------
+model_name = "Qwen/Qwen3-4B-Thinking-2507"
+# Use environment variable to avoid downloading repeatedly in Gradio reloads
+if not os.getenv("MODEL_LOADED"):
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype="auto",
+        device_map="auto",
+        trust_remote_code=True,
+    )
+    os.environ["MODEL_LOADED"] = "1"
+# -------------------------------------------------
+# Helper to generate a response
+# -------------------------------------------------
+@spaces.GPU(duration=120)  # allocate GPU for up to 2 minutes per request
+def generate_reply(user_message: str, history: list):
+    """
+    Generates a reply using the Qwen model.
+    `history` is a list of (user, bot) tuples from previous turns.
+    """
+    # Build the message list expected by the chat template
+    messages = [{"role": "system", "content": "You are a helpful assistant."}]
+    for user, bot in history:
+        messages.append({"role": "user", "content": user})
+        messages.append({"role": "assistant", "content": bot})
+    messages.append({"role": "user", "content": user_message})
+    # Apply chat template to get the prompt text
+    prompt_text = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    model_inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)
+    # Generate tokens (allow large output; adjust as needed)
+    generated_ids = model.generate(
+        **model_inputs,
+        max_new_tokens=1024,  # reasonable limit for interactive chat
+        do_sample=True,
+        temperature=0.7,
+        top_p=0.9,
+        pad_token_id=tokenizer.eos_token_id,
+    )
+    # Remove the input tokens from the output
+    new_token_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+    # Try to split thinking (<think>) from final answer
+    try:
+        # Token id for </think> (151668) is model‑specific; adjust if needed
+        end_think_idx = len(new_token_ids) - new_token_ids[::-1].index(151668)
+    except ValueError:
+        end_think_idx = 0
+    thinking = tokenizer.decode(new_token_ids[:end_think_idx], skip_special_tokens=True).strip()
+    answer = tokenizer.decode(new_token_ids[end_think_idx:], skip_special_tokens=True).strip()
+    # Log thinking content for debugging (optional)
+    if thinking:
+        print("[Thinking] ", thinking)
+    return answer
+# -------------------------------------------------
+# Gradio UI
+# -------------------------------------------------
+chat_interface = gr.ChatInterface(
+    fn=generate_reply,
+    type="messages",
+    title="Qwen 3‑4B Thinking Chatbot",
+    description="Chat with Qwen3‑4B‑Thinking. The model may emit internal reasoning (shown in server logs).",
+    examples=[
+        ["Give me a short introduction to large language models."],
+        ["What are the benefits of using transformers?"],
+        ["Explain the concept of attention in neural networks."],
+    ],
+)
+if __name__ == "__main__":
+    chat_interface.launch()