import torch
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer

# -------------------------------------------------
# Model setup (loaded once at startup)
# -------------------------------------------------
model_name = "Qwen/Qwen3-4B-Thinking-2507"

# gr.NO_RELOAD keeps this block from re-running (and the weights from
# being re-downloaded) when Gradio's auto-reloader re-imports this file
if gr.NO_RELOAD:
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True,
    )


# -------------------------------------------------
# Helper to generate a response
# -------------------------------------------------
@spaces.GPU(duration=120)  # allocate a GPU for up to 2 minutes per request
def generate_reply(user_message: str, history: list):
    """
    Generates a reply using the Qwen model.

    With gr.ChatInterface(type="messages"), `history` is a list of
    OpenAI-style dicts: {"role": "user" | "assistant", "content": ...}.
    """
    # Build the message list expected by the chat template
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    for msg in history:
        messages.append({"role": msg["role"], "content": msg["content"]})
    messages.append({"role": "user", "content": user_message})

    # Apply the chat template to get the prompt text
    prompt_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)

    # Generate tokens (adjust the limit as needed)
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=1024,  # reasonable limit for interactive chat
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Remove the input tokens from the output
    new_token_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

    # Try to split the thinking block from the final answer at </think>
    try:
        # 151668 is the </think> token id for Qwen3; it is
        # model-specific, so adjust it if you swap models
        end_think_idx = len(new_token_ids) - new_token_ids[::-1].index(151668)
    except ValueError:
        end_think_idx = 0

    thinking = tokenizer.decode(
        new_token_ids[:end_think_idx], skip_special_tokens=True
    ).strip()
    answer = tokenizer.decode(
        new_token_ids[end_think_idx:], skip_special_tokens=True
    ).strip()

    # Log thinking content for debugging (optional)
    if thinking:
        print("[Thinking]", thinking)

    return answer


# -------------------------------------------------
# Gradio UI
# -------------------------------------------------
chat_interface = gr.ChatInterface(
    fn=generate_reply,
    type="messages",
    title="Qwen3-4B Thinking Chatbot",
    description=(
        "Chat with Qwen3-4B-Thinking. The model may emit internal "
        "reasoning (shown in the server logs, not in the UI)."
    ),
    examples=[
        "Give me a short introduction to large language models.",
        "What are the benefits of using transformers?",
        "Explain the concept of attention in neural networks.",
    ],
)

if __name__ == "__main__":
    chat_interface.launch()
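

# -------------------------------------------------
# Optional: streaming variant (illustrative sketch)
# -------------------------------------------------
# A minimal sketch of token-by-token streaming using transformers'
# TextIteratorStreamer. `generate_reply_stream` is a hypothetical
# alternative to generate_reply above, not wired into the UI; to try
# it, move this definition above the Gradio UI section and pass
# fn=generate_reply_stream (gr.ChatInterface accepts generator
# functions that yield progressively longer replies). Note that it
# streams the raw output, including any <think> reasoning content.
from threading import Thread
from transformers import TextIteratorStreamer


@spaces.GPU(duration=120)
def generate_reply_stream(user_message: str, history: list):
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    messages += [{"role": m["role"], "content": m["content"]} for m in history]
    messages.append({"role": "user", "content": user_message})

    prompt_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)

    # The streamer yields decoded text chunks as generate() produces them;
    # generate() runs in a background thread so we can iterate here
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    Thread(
        target=model.generate,
        kwargs=dict(
            **model_inputs,
            streamer=streamer,
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        ),
    ).start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial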