import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "ahmet71cakir/phi4-turbochat"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
)


def chat_fn(message, history):
    # Build a single-turn prompt in the model's chat format.
    prompt = f"<|user|>\n{message}\n<|assistant|>\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only the tokens generated after the prompt. Splitting the full
    # decode on "<|assistant|>" is unreliable: skip_special_tokens=True strips
    # that marker, so the split would return the prompt along with the reply.
    gen_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()


with gr.Blocks() as demo:
    gr.Markdown("## 💬 Tuning Expert Free (Phi-4 Mini)")
    gr.ChatInterface(fn=chat_fn)

demo.launch()
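

# A history-aware variant (a sketch, not wired into the UI above). It assumes
# the checkpoint's tokenizer ships a chat template and that gr.ChatInterface
# passes history as [user, assistant] pairs (its default tuple format); with
# type="messages" the history would instead arrive as role/content dicts.
def chat_fn_with_history(message, history):
    messages = []
    for user_turn, assistant_turn in history:
        messages.append({"role": "user", "content": user_turn})
        messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})
    # apply_chat_template renders the model's expected prompt, including the
    # trailing assistant header, instead of hand-rolling the role tags.
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Same decoding approach as chat_fn: keep only the new tokens.
    gen_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()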