from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import traceback


class HFLocalModelAdapter:
    """
    Minimal Hugging Face model adapter for text generation with simple
    error handling and device selection.
    """

    def __init__(self, model_name="stabilityai/stablelm-3b-4e1t", device=None):
        self.model_name = model_name

        # Prefer an explicit device; otherwise use CUDA when it is available.
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Loading model {model_name} on device: {self.device}")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                # fp16 on GPU to reduce memory usage; fp32 on CPU for compatibility.
                torch_dtype=(torch.float16 if "cuda" in self.device else torch.float32),
                low_cpu_mem_usage=True,
                device_map="auto" if "cuda" in self.device else None,
            )

            try:
                self.model.to(self.device)
            except Exception:
                # With device_map="auto" the weights are already dispatched and
                # .to() can raise; the model is usable either way, so ignore it.
                pass
            print("Model loaded successfully.")
        except Exception as e:
            print("Error loading model:", e)
            traceback.print_exc()
            raise

    def generate(self, prompt, max_new_tokens=250, temperature=0.7, top_p=0.95):
""" |
|
Returns generated text (string). Tries to trim the prompt from the output. |
|
""" |
|
try: |
|
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device) |
|
out = self.model.generate( |
|
**inputs, |
|
max_new_tokens=max_new_tokens, |
|
do_sample=True, |
|
temperature=temperature, |
|
top_p=top_p, |
|
pad_token_id=self.tokenizer.eos_token_id, |
|
) |
|
decoded = self.tokenizer.decode(out[0], skip_special_tokens=True) |
|
|
|
if decoded.startswith(prompt): |
|
return decoded[len(prompt):].strip() |
|
return decoded.strip() |
|
except Exception as e: |
|
|
|
return f"[Generation error] {e}" |