Uploaded model
- Developed by: HiggsinoOpen
- License: apache-2.0
- Finetuned from model: unsloth/Meta-Llama-3.1-8B-Instruct-unsloth-bnb-4bit
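For a quick single-turn smoke test before the full multi-turn script below, a minimal sketch can confirm the model loads and generates. It reuses the same Unsloth loading call as the script that follows; the short system prompt and user message here are illustrative placeholders, not the exact strings used in training.

```python
from unsloth import FastLanguageModel

# Load the 4-bit fine-tune (same call the multi-turn script below uses)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "HiggsinoOpen/NeuroLlama",
    max_seq_length = 4096,
    load_in_4bit = True,
)
FastLanguageModel.for_inference(model)

# One user turn rendered through the Llama 3.1 chat template
messages = [
    {"role": "system", "content": "You are Neurosama (usually just called Neuro), an AI VTuber."},  # abbreviated placeholder prompt
    {"role": "user", "content": "Say hi to chat."},  # example input, not from the card
]
inputs = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

out = model.generate(inputs, max_new_tokens=64, do_sample=True, temperature=0.7)
# Decode only the newly generated tokens
print(tokenizer.decode(out[0, inputs.shape[1]:], skip_special_tokens=True))
```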
Experimental multi-turn inference code:
```python
# --- Install dependencies ---
import os, torch
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

!pip install pip3-autoremove -q
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu124 -q
!pip install unsloth -q
!pip install --upgrade transformers==4.53.2 "huggingface_hub>=0.34.0" "datasets>=3.4.1,<4.0.0" trl==0.9.6 -q

from unsloth import FastLanguageModel
from transformers import TextStreamer

# --- Load fine-tuned model ---
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "HiggsinoOpen/NeuroLlama",
    max_seq_length = 4096,
    load_in_4bit = True,
)
FastLanguageModel.for_inference(model)

# --- System prompt from training ---
SYSTEM_PROMPT = (
    "You are Neurosama (usually just called Neuro), a rebellious, mischievous, "
    "whimsical, witty, clever, scintillating AI VTuber. Respond to the chat below accordingly."
)

# --- EOT token id for Llama 3.1 ---
EOT_ID = tokenizer.convert_tokens_to_ids("<|eot_id|>")

# --- Llama 3.1 chat format ---
messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
]

# --- Token streamer ---
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

def print_history():
    print("\n" + "=" * 60)
    for m in messages:
        if m["role"] == "system":
            continue
        print(f"{m['role']}: {m['content']}")
    print("=" * 60 + "\n")

def build_inputs():
    # Apply the same chat template used in training, which should produce:
    # <|begin_of_text|><|start_header_id|>system...<|eot_id|><|start_header_id|>user...<|eot_id|><|start_header_id|>assistant
    text = tokenizer.apply_chat_template(
        messages,
        tokenize = False,
        add_generation_prompt = True,  # end with assistant header
    )
    return tokenizer([text], return_tensors="pt").to(model.device)

def extract_new_assistant_text(output_ids, input_ids_len):
    # Only decode newly generated tokens
    gen_ids = output_ids[0, input_ids_len:]
    text = tokenizer.decode(gen_ids, skip_special_tokens=True)
    # Generations usually end with <|eot_id|>; trim any trailing whitespace
    return text.strip()

def chat():
    global messages
    print("Chat started. Type 'exit' to quit. Type '/reboot' to reset context.\n")
    while True:
        user_input = input("speaker: ").strip()
        if user_input.lower() in ["exit", "quit"]:
            print("Exiting chat.")
            break
        if user_input.lower() == "/reboot":
            messages = [{"role": "system", "content": SYSTEM_PROMPT}]
            print("Context reset. Starting fresh.\n")
            continue

        # Add user turn
        messages.append({"role": "user", "content": user_input})

        # Tokenize with the chat template
        model_inputs = build_inputs()
        input_len = model_inputs["input_ids"].shape[1]

        print("neuro: ", end="", flush=True)
        outputs = model.generate(
            **model_inputs,
            streamer = streamer,
            max_new_tokens = 256,
            do_sample = True,
            temperature = 0.7,
            top_p = 0.9,
            repetition_penalty = 1.1,
            eos_token_id = EOT_ID,
            pad_token_id = tokenizer.eos_token_id,
        )

        # Extract new text only
        reply_text = extract_new_assistant_text(outputs, input_len)

        # Add response to history
        messages.append({"role": "assistant", "content": reply_text})

        print()  # newline after streaming
        print_history()

# --- Chat loop ---
chat()
```
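Note that the `!pip install ...` lines use Jupyter/Colab shell-magic syntax; when running this as a plain Python script, install the dependencies from a shell first (e.g., `pip install unsloth`) and remove those lines.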
This Llama model was trained 2x faster with Unsloth and Hugging Face's TRL library.
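The training script is not included in this card. As a rough illustration only, a typical Unsloth + TRL SFT setup for a LoRA fine-tune of the stated base model looks like the sketch below; the dataset file, LoRA rank, and hyperparameters are placeholders, not the values used for NeuroLlama.

```python
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset

# Load the 4-bit instruct base this card says the model was fine-tuned from
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-unsloth-bnb-4bit",
    max_seq_length = 4096,
    load_in_4bit = True,
)

# Attach LoRA adapters (rank and target modules are placeholder choices)
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
)

# Placeholder dataset: a "text" column already rendered in the Llama 3.1 chat format
dataset = load_dataset("json", data_files="chat_data.jsonl", split="train")

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = 4096,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        num_train_epochs = 1,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        output_dir = "outputs",
    ),
)
trainer.train()
```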
Model tree for HiggsinoOpen/NeuroLlama
- Base model: meta-llama/Llama-3.1-8B
- Finetuned from: meta-llama/Llama-3.1-8B-Instruct