import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "AddieFoote0/arithmetic-300M-MaxEnt-distilled-relearned"

# Load in bfloat16 to halve memory use relative to float32.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# torch.compile only exists in PyTorch 2.0+; fall back to eager mode otherwise.
if hasattr(torch, "compile"):
    model = torch.compile(model)
    print("compiled model")
else:
    print("no compile")


def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt")
    # do_sample=True is needed for temperature to take effect; without it,
    # generate() decodes greedily and ignores the temperature argument.
    outputs = model.generate(**inputs, max_new_tokens=5, do_sample=True, temperature=1.0)

    # Keep only the newly generated tokens, dropping the echoed prompt.
    input_length = inputs["input_ids"].shape[1]
    new_token_ids = outputs[0][input_length:]

    # Truncate at the first BOS token if the model emits one mid-generation.
    bos_token_id = tokenizer.bos_token_id
    if bos_token_id is not None:
        bos_positions = (new_token_ids == bos_token_id).nonzero(as_tuple=True)[0]
        if len(bos_positions) > 0:
            first_bos_pos = bos_positions[0].item()
            new_token_ids = new_token_ids[:first_bos_pos]

    return tokenizer.decode(new_token_ids, skip_special_tokens=False)


iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Enter your prompt"),
    outputs=gr.Textbox(label="Model Response"),
    title="Arithmetic Model Demo",
)
iface.launch()
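# Example usage (hypothetical; the exact prompt format this checkpoint expects
# is an assumption based on its name). With an arithmetic model, entering a
# prompt such as "12+34=" in the textbox should yield a short numeric
# completion, e.g.:
#
#     generate_response("12+34=")  # -> something like "46", since sampling
#                                  #    is stochastic the output may vary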