from typing import Any, Dict, List

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


class EndpointHandler:
    def __init__(self, path=""):
        base_model_id = "meta-llama/Llama-3.3-70B-Instruct"
        adapter_model_id = "abhayesian/llama-3.3-70b-af-synthetic-finetuned"

        # Load the tokenizer for the base model
        self.tokenizer = AutoTokenizer.from_pretrained(
            base_model_id,
            trust_remote_code=True,
        )

        # Load the base model in float16 to halve memory use, sharding it
        # across available devices with device_map="auto"
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_id,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.float16,
        )

        # Apply the LoRA adapter on top of the base model
        self.model = PeftModel.from_pretrained(
            base_model,
            adapter_model_id,
            device_map="auto",
        )

        # Create the text-generation pipeline used to serve requests
        self.generator = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
        )

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        # Read the prompt and sampling parameters from the request payload,
        # falling back to defaults when a key is absent
        prompt = data.get("inputs", "")
        max_new_tokens = data.get("max_new_tokens", 128)
        temperature = data.get("temperature", 0.7)
        top_p = data.get("top_p", 0.9)

        # The pipeline returns a list of dicts, each with a "generated_text"
        # key; return_full_text=False strips the prompt from the output
        outputs = self.generator(
            prompt,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            return_full_text=False,
        )

        return outputs
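
# A minimal local smoke test, as a sketch: Hugging Face Inference Endpoints
# construct EndpointHandler(path) once at startup and call it with each
# request payload, so invoking the handler directly mimics a request.
# The prompt and parameter values below are illustrative, not part of the
# original handler.
if __name__ == "__main__":
    handler = EndpointHandler()
    result = handler({
        "inputs": "Explain LoRA fine-tuning in one sentence.",
        "max_new_tokens": 64,
    })
    # result is a list of dicts; print the generated continuation
    print(result[0]["generated_text"])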