The training for SnowflakeCore-G1-1B and 7B will be redone, because I have now implemented DeepSpeed and managed to use two GPUs.
:D
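For reference, DeepSpeed plugs straight into the Hugging Face Trainer; the sketch below shows the general shape (the ZeRO stage 2 values, output path, and script name are just illustrative, not the exact SnowflakeCore setup):
from transformers import TrainingArguments

# Illustrative ZeRO stage 2 config; "auto" lets the Trainer fill values in
# from its own arguments. Not the exact SnowflakeCore settings.
ds_config = {
    "zero_optimization": {
        "stage": 2,
        "overlap_comm": True,
        "contiguous_gradients": True,
    },
    "bf16": {"enabled": "auto"},
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
}

training_args = TrainingArguments(
    output_dir="./snowflake-pretrain",  # hypothetical output path
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    bf16=True,
    deepspeed=ds_config,  # also accepts a path to a JSON config file
)

# Launched across both GPUs with the DeepSpeed launcher, e.g.:
#   deepspeed --num_gpus=2 pretrain.py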
hf: a faster, friendlier Hugging Face CLI ✨
hf auth login: easier to type and remember?
update: pre-training the model would need at least 300GB of RAM/VRAM.
No. I think.
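For anyone wondering where the 300GB estimate comes from, here is the rough arithmetic for full training with Adam in mixed precision (the 7B parameter count is the planned size; activations and any ZeRO sharding are ignored, so treat this as a floor for the model state only):
# Rule-of-thumb memory for model state during full training with Adam
# in mixed precision, per parameter:
#   2 B (bf16 weights) + 2 B (bf16 grads)
#   + 4 B (fp32 master weights) + 4 B + 4 B (Adam moments) = 16 B
# Activations, workspace buffers and framework overhead come on top.

def model_state_gb(n_params: float, bytes_per_param: int = 16) -> float:
    return n_params * bytes_per_param / 1024**3

for name, n in [("G1-Tiny (~356M)", 356e6), ("G1-1B", 1e9), ("G1-7B", 7e9)]:
    print(f"{name}: ~{model_state_gb(n):.0f} GB of model state")
# G1-7B lands around ~104 GB before activations; long contexts and
# bigger batches push the real total a lot higher.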
Hi :)
Hey! I did manage to fine-tune the model after all.
import os
import argparse

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
)

# === Disable W&B logging ===
os.environ["WANDB_DISABLED"] = "true"

# === Config ===
config = {
    "model_name": "FlameF0X/SnowflakeCore-G1-Tiny",
    "output_dir": "./snowflake-chatbot",
    "context_window": 512,
    "per_device_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "max_steps": 500,
    "dataloader_workers": 4,
    "dataset_name": "tatsu-lab/alpaca",
    "dataset_split": "train[:10000]",
}

# === Derived ===
config["effective_batch_size"] = (
    config["per_device_batch_size"] * config["gradient_accumulation_steps"]
)
print(f"Effective batch size: {config['effective_batch_size']}")
print(f"Context window: {config['context_window']}")


# === 1. Load tokenizer and model ===
def load_model_and_tokenizer(config):
    print(f"Loading model and tokenizer from {config['model_name']}...")
    tokenizer = AutoTokenizer.from_pretrained(
        config["model_name"],
        trust_remote_code=True,
        force_download=True,
        use_safetensors=True,
        model_max_length=config["context_window"],
    )
    # Guard: some causal-LM tokenizers ship without a pad token,
    # which would break padding="max_length" during tokenization.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        config["model_name"],
        trust_remote_code=True,
        force_download=True,
        use_safetensors=True,
    )
    # Optional speed-up on PyTorch 2.x; fall back silently if it fails.
    if hasattr(torch, "compile"):
        try:
            print("Compiling model with torch.compile...")
            model = torch.compile(model)
        except Exception as e:
            print(f"Compilation failed: {e}")
    return tokenizer, model


# === 2. Load dataset ===
def load_custom_dataset(name, split):
    print(f"Loading dataset: {name} ({split})...")
    return load_dataset(name, split=split)


# === 3. Format dataset ===
def format_example(example):
    """Update this function to work with different datasets."""
    return {
        "text": (
            f"### Instruction:\n{example['instruction']}\n"
            f"### Input:\n{example['input']}\n"
            f"### Response:\n{example['output']}"
        )
    }


# === 4. Tokenize ===
def tokenize_example(example, tokenizer, max_length):
    tokens = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    # Causal LM: labels are a copy of the input ids.
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens


# === 5. Train ===
def train_model(model, tokenizer, tokenized_dataset, config):
    print("Preparing training arguments...")
    training_args = TrainingArguments(
        output_dir=config["output_dir"],
        per_device_train_batch_size=config["per_device_batch_size"],
        gradient_accumulation_steps=config["gradient_accumulation_steps"],
        max_steps=config["max_steps"],
        logging_dir="./logs",
        logging_steps=20,
        save_strategy="no",
        # Prefer bf16 where the GPU supports it, otherwise fp16.
        fp16=torch.cuda.is_available() and not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
        overwrite_output_dir=True,
        report_to=[],
        dataloader_num_workers=config["dataloader_workers"],
        optim="adamw_torch_fused" if torch.cuda.is_available() and hasattr(torch, "compile") else "adamw_torch",
        remove_unused_columns=False,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )
    print("Starting training...")
    trainer.train()
    print("Training completed.")


# === 6. Save ===
def save_model(model, tokenizer, output_dir):
    print(f"Saving model to {output_dir}...")
    model.save_pretrained(output_dir, safe_serialization=False)
    tokenizer.save_pretrained(output_dir)
    print("✅ Model saved.")


# === Main ===
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", type=str, default=config["dataset_name"])
    parser.add_argument("--split", type=str, default=config["dataset_split"])
    args = parser.parse_args()

    tokenizer, model = load_model_and_tokenizer(config)
    dataset = load_custom_dataset(args.dataset, args.split)

    print("Formatting dataset...")
    dataset = dataset.map(
        format_example,
        num_proc=config["dataloader_workers"],
        load_from_cache_file=False,
    )

    print("Tokenizing dataset...")
    tokenized = dataset.map(
        lambda x: tokenize_example(x, tokenizer, config["context_window"]),
        batched=True,
        num_proc=config["dataloader_workers"],
        load_from_cache_file=False,
    )
    tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    train_model(model, tokenizer, tokenized, config)
    save_model(model, tokenizer, config["output_dir"])


if __name__ == "__main__":
    main()
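Once the script has saved the checkpoint, trying it out is straightforward; this is a minimal generation sketch (it assumes the same output_dir as above, follows the Alpaca-style template from format_example, and the instruction text and sampling settings are just examples):
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

output_dir = "./snowflake-chatbot"  # same path the training script saves to
tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(output_dir, trust_remote_code=True)
model.eval()

# Same Alpaca-style template used during fine-tuning.
prompt = (
    "### Instruction:\nExplain what DeepSpeed is in one sentence.\n"
    "### Input:\n\n"
    "### Response:\n"
)
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=128,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))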
Hello kivenemi! Thanks for waiting! The first metrics from SnowflakeCore-G1-Tiny (~355.87M params) are in.
The model is still under development, so the next versions should perform better.
You can find the full benchmark at FlameF0X/SnowflakeCore-G1-Benchmark.