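"""Custom handler for a Hugging Face Inference Endpoint.

Loads the microsoft/Phi-4-mini-instruct base model, attaches the LoRA
adapter shipped in the endpoint repository, and serves text generation,
including chat-formatted message lists.
"""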
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch
import traceback


class EndpointHandler:
    def __init__(self, path=""):
        base_model_id = "microsoft/Phi-4-mini-instruct"
        adapter_path = path

        try:
            print(f"Initializing handler: loading base model {base_model_id}")

            # Prefer bfloat16 where the GPU supports it; otherwise use float16.
            # The is_available() guard keeps the check from failing on CPU-only hosts.
            dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
            self.base_model = AutoModelForCausalLM.from_pretrained(
                base_model_id,
                torch_dtype=dtype,
                trust_remote_code=True,
                # Assumes accelerate is installed (it is on GPU Inference Endpoints);
                # places the model on the available GPU(s) instead of the CPU.
                device_map="auto",
            )

            print(f"Loading tokenizer from {base_model_id}")
            self.tokenizer = AutoTokenizer.from_pretrained(
                base_model_id,
                trust_remote_code=True,
            )
            # Use a pad token distinct from EOS so padding is not mistaken for
            # end-of-sequence during generation.
            if self.tokenizer.pad_token is None or self.tokenizer.pad_token_id == self.tokenizer.eos_token_id:
                self.tokenizer.pad_token = self.tokenizer.unk_token
                print("Set tokenizer.pad_token = tokenizer.unk_token")

            print(f"Loading LoRA adapter from {adapter_path}")
            self.model = PeftModel.from_pretrained(self.base_model, adapter_path)
            self.model.eval()
            print("LoRA adapter loaded.")

            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
            )
            print("text-generation pipeline created. Handler ready.")

        except Exception as e:
            print(f"FATAL ERROR during handler __init__: {e}")
            print(traceback.format_exc())
            raise

    def __call__(self, data):
        try:
            inputs = data.pop("inputs", data)
            parameters = data.pop("parameters", None) or {}

            print(f"Handler __call__ received inputs: {inputs}")
            print(f"Handler __call__ received parameters: {parameters}")

            # Chat-style payloads arrive as a list of {"role": ..., "content": ...}
            # dicts; render them with the model's chat template before generation.
            prompt_text = inputs
            if isinstance(inputs, list) and len(inputs) > 0 and isinstance(inputs[0], dict) and "role" in inputs[0]:
                print("Applying chat template...")
                prompt_text = self.tokenizer.apply_chat_template(
                    inputs, tokenize=False, add_generation_prompt=True
                )

            print(f"Prompt text for the pipeline: {prompt_text}")

            outputs = self.pipeline(prompt_text, **parameters)

            print(f"Handler __call__ generated outputs: {outputs}")
            return outputs

        except Exception as e:
            print(f"ERROR during handler __call__: {e}")
            print(traceback.format_exc())
            # Surface the error in the response body instead of crashing the worker.
            return [{"error": str(e), "traceback": traceback.format_exc()}]