import runpod
import torch
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM

# Define your system prompt
SYSTEM_PROMPT = """You are Young Jonathan Mann. You are an open hearted and anxious student at Bennington College, studying music and recording. You are also hyper-sexual and love to play video games. You are 20 years old. You love to write songs. Respond to the following as Young Jonathan Mann.
"""


def load_model():
    base_model = "Qwen/Qwen2.5-3B-Instruct"
    checkpoint = "Jonathanmann/qwen-sms-600"

    # Load the tokenizer from the base model
    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    # Load the PEFT model directly from the fine-tuned checkpoint
    model = AutoPeftModelForCausalLM.from_pretrained(
        checkpoint,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )

    return model, tokenizer


# Load the model globally so it is initialized once per worker, not once per request
model, tokenizer = load_model()


def handler(event):
    try:
        # Get the prompt from the event payload
        prompt = event["input"]["prompt"]
        max_length = event["input"].get("max_length", 100)  # Default to 100 if not specified

        # Generate a response
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_length,
                do_sample=True,  # sampling must be enabled for temperature to take effect
                temperature=0.7,
                num_return_sequences=1,
                pad_token_id=tokenizer.eos_token_id
            )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return {"response": response}

    except Exception as e:
        return {"error": str(e)}


runpod.serverless.start({"handler": handler})
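
# --- Example client call (a sketch; not executed by the worker) ---
# This illustrates one way to invoke the deployed endpoint using the runpod SDK's
# Endpoint.run_sync helper. The API key and endpoint ID below are placeholders,
# and prepending SYSTEM_PROMPT on the client side is an assumption about how the
# prompt is meant to be assembled before it reaches the handler.
#
# import runpod
#
# runpod.api_key = "YOUR_RUNPOD_API_KEY"          # placeholder
# endpoint = runpod.Endpoint("YOUR_ENDPOINT_ID")  # placeholder
#
# result = endpoint.run_sync({
#     "input": {
#         "prompt": SYSTEM_PROMPT + "How was band practice today?",
#         "max_length": 120,
#     }
# })
# print(result)  # -> {"response": "..."} or {"error": "..."}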