from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


class ModelHandler:
    """Loads a causal LM and serves OpenAI-style chat completion requests."""

    def __init__(self):
        self.initialized = False

    def initialize(self, model_dir: str):
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_dir,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        )
        self.model.eval()
        if torch.cuda.is_available():
            self.model.to("cuda")
        self.initialized = True

    def predict(self, inputs: dict):
        if not self.initialized:
            raise RuntimeError("Model not initialized")

        messages = inputs.get("messages", [])
        max_tokens = inputs.get("max_tokens", 512)
        temperature = inputs.get("temperature", 0.7)

        # Convert OpenAI-style messages into a single prompt
        prompt = self._build_prompt(messages)

        # Tokenize the prompt
        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
        if torch.cuda.is_available():
            input_ids = input_ids.to("cuda")

        # Generate a completion
        output_ids = self.model.generate(
            input_ids,
            max_new_tokens=max_tokens,
            temperature=temperature,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        # Return just the newly generated portion: slice off the prompt tokens
        # before decoding, which is more robust than trimming the decoded string
        # by the prompt's character length.
        generated_ids = output_ids[0][input_ids.shape[-1]:]
        generated_text = self.tokenizer.decode(
            generated_ids, skip_special_tokens=True
        ).strip()

        return {
            "id": "chatcmpl-fakeid",
            "object": "chat.completion",
            "choices": [
                {
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": generated_text,
                    },
                    "finish_reason": "stop",
                }
            ],
            "model": "your-model-id",
        }

    def _build_prompt(self, messages):
        # Flatten the chat history into a plain-text prompt. System messages
        # are kept as well rather than silently dropped.
        prompt = ""
        for msg in messages:
            role = msg["role"]
            content = msg["content"]
            if role == "system":
                prompt += f"System: {content}\n"
            elif role == "user":
                prompt += f"User: {content}\n"
            elif role == "assistant":
                prompt += f"Assistant: {content}\n"
        prompt += "Assistant:"
        return prompt
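
# A minimal usage sketch, not part of the handler itself: it shows the expected
# request/response shape. The model directory "./my-model" is a hypothetical
# example; point it at any local directory containing a Hugging Face causal LM
# and its tokenizer.
if __name__ == "__main__":
    handler = ModelHandler()
    handler.initialize("./my-model")  # hypothetical local model directory

    result = handler.predict({
        "messages": [{"role": "user", "content": "Hello, who are you?"}],
        "max_tokens": 64,
        "temperature": 0.7,
    })
    print(result["choices"][0]["message"]["content"])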