import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# 1) Load tokenizer and base model on CPU (or GPU if available)
tokenizer = AutoTokenizer.from_pretrained("finnish-nlp/ahma-3b")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    "finnish-nlp/ahma-3b",
    torch_dtype=torch.float32,
    device_map={"": "cpu"}
)

# 2) Apply your fine-tuned LoRA adapter
model = PeftModel.from_pretrained(
    base_model,
    "testi123456789/elektromart"
)
model.to("cpu")
model.eval()
# 3) Instruction the adapter was fine-tuned on
#    ("Answer the customer's question in a friendly manner as ElektroMart customer service.")
INSTRUCTION = "Vastaa asiakkaan kyselyyn ystävällisesti ElektroMartin asiakaspalveluna."
def chat_fn(user_question: str, max_new_tokens: int = 100,
            temperature: float = 0.7, repetition_penalty: float = 1.25) -> str:
    # 4) Build the prompt exactly as during training
    prompt = f"[INST] {INSTRUCTION}\n{user_question} [/INST]\n"

    # 5) Tokenize & drop fields that generate() does not accept
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs.pop("token_type_ids", None)
    inputs = {k: v.to("cpu") for k, v in inputs.items()}

    # 6) Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=temperature,
            repetition_penalty=repetition_penalty
        )

    # 7) Decode only the newly generated part
    generated = outputs[0][inputs["input_ids"].shape[-1]:]
    answer = tokenizer.decode(generated, skip_special_tokens=True)
    return answer.strip()
# 8) Expose the Gradio interface (Finnish UI labels:
#    "Kysy jotain…" = "Ask something…", "Kirjoita kysymyksesi tähän" = "Write your question here",
#    "Vastaus" = "Answer", "ElektroMartin Chatbotti" = "ElektroMart's Chatbot")
iface = gr.Interface(
    fn=chat_fn,
    inputs=[
        gr.Textbox(label="Kysy jotain…", placeholder="Kirjoita kysymyksesi tähän"),
    ],
    outputs=gr.Textbox(label="Vastaus"),
    title="ElektroMartin Chatbotti"
)

if __name__ == "__main__":
    iface.launch()