Uploaded model

  • Developed by: HiggsinoOpen
  • License: apache-2.0
  • Finetuned from model: unsloth/Meta-Llama-3.1-8B-Instruct-unsloth-bnb-4bit

Experimental multi-turn inference code (written for a Jupyter/Colab-style notebook with a CUDA GPU):

# --- Install dependencies (run the !pip lines in a notebook cell first) ---
!pip install pip3-autoremove -q
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu124 -q
!pip install unsloth -q
!pip install --upgrade transformers==4.53.2 "huggingface_hub>=0.34.0" "datasets>=3.4.1,<4.0.0" trl==0.9.6 -q

# --- Imports and GPU selection ---
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # pin to the first GPU before CUDA is initialized
import torch
from unsloth import FastLanguageModel
from transformers import TextStreamer

# --- Load fine-tuned model ---
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "HiggsinoOpen/NeuroLlama",
    max_seq_length = 4096,
    load_in_4bit = True,
)
FastLanguageModel.for_inference(model)  # switch the Unsloth model into inference mode

# --- System prompt from training ---
SYSTEM_PROMPT = (
    "You are Neurosama (usually just called Neuro), a rebellious, mischievous, "
    "whimsical, witty, clever, scintillating AI VTuber. Respond to the chat below accordingly."
)

# --- EOT token id for Llama 3.1 ---
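# Llama 3.1 Instruct ends each assistant turn with <|eot_id|> rather than
# <|end_of_text|>, so generation below stops on <|eot_id|>.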
EOT_ID = tokenizer.convert_tokens_to_ids("<|eot_id|>")
# --- Llama 3.1 chat format ---
messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
]

# --- Token streamer ---
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

def print_history():
    print("\n" + "="*60)
    for m in messages:
        if m["role"] == "system":
            continue
        print(f"{m['role']}: {m['content']}")
    print("="*60 + "\n")

def build_inputs():
    # Apply the same chat template used in training; the rendered text looks like:
    # <|begin_of_text|><|start_header_id|>system...<|eot_id|><|start_header_id|>user...<|eot_id|><|start_header_id|>assistant
    text = tokenizer.apply_chat_template(
        messages,
        tokenize = False,
        add_generation_prompt = True,  # end with assistant header
    )
    return tokenizer([text], return_tensors="pt").to(model.device)

def extract_new_assistant_text(output_ids, input_ids_len):
    # Only decode newly generated tokens 
    gen_ids = output_ids[0, input_ids_len:]
    text = tokenizer.decode(gen_ids, skip_special_tokens=True)
    # skip_special_tokens already drops the trailing <|eot_id|>; strip leftover whitespace
    return text.strip()

def chat():
    global messages
    print("Chat started. Type 'exit' to quit. Type '/reboot' to reset context.\n")
    while True:
        user_input = input("speaker: ").strip()
        if user_input.lower() in ["exit", "quit"]:
            print("Exiting chat.")
            break
        if user_input.lower() == "/reboot":
            messages = [{"role": "system", "content": SYSTEM_PROMPT}]
            print("Context reset. Starting fresh.\n")
            continue

        # Add user role
        messages.append({"role": "user", "content": user_input})
        # Tokenize with chat template
        model_inputs = build_inputs()
        input_len = model_inputs["input_ids"].shape[1]

        print("neuro: ", end="", flush=True)
        outputs = model.generate(
            **model_inputs,
            streamer = streamer,
            max_new_tokens = 256,
            do_sample = True,
            temperature = 0.7,
            top_p = 0.9,
            repetition_penalty = 1.1, 
            eos_token_id = EOT_ID, 
            pad_token_id = tokenizer.eos_token_id,
        )

        # Extract new text only
        reply_text = extract_new_assistant_text(outputs, input_len)
        # Add response to history
        messages.append({"role": "assistant", "content": reply_text})
        print()  # newline after streaming
        print_history()

# --- Chat loop ---
chat()

This Llama model was trained 2x faster with Unsloth and Hugging Face's TRL library.
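The training script itself is not included in this card. For reference, a typical Unsloth + TRL supervised fine-tuning setup on this base model looks roughly like the sketch below; the dataset file, LoRA hyperparameters, and training arguments are illustrative assumptions, not the exact recipe used for NeuroLlama.

# Minimal fine-tuning sketch, assuming the same base model and the pinned
# trl==0.9.6 API from the install cell above. The dataset path, formatting,
# LoRA ranks, and hyperparameters are illustrative assumptions.
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-unsloth-bnb-4bit",
    max_seq_length = 4096,
    load_in_4bit = True,
)

# Attach LoRA adapters so only a small fraction of the weights is trained.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    lora_alpha = 16,
    lora_dropout = 0,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    use_gradient_checkpointing = "unsloth",
)

# Hypothetical JSONL dataset with a pre-rendered "text" column in the
# Llama 3.1 chat format (system / user / assistant turns ending in <|eot_id|>).
dataset = load_dataset("json", data_files = "chat_transcripts.jsonl", split = "train")

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = 4096,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        num_train_epochs = 1,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        optim = "adamw_8bit",
        logging_steps = 10,
        output_dir = "outputs",
    ),
)
trainer.train()

Whatever the actual recipe, the training text would need to use the same Llama 3.1 chat template and system prompt that the inference code above applies.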
