# training-scripts/train_alizee_coder.py
# /// script
# dependencies = ["trl==0.11.4", "peft>=0.7.0", "trackio", "datasets", "transformers>=4.46.0", "accelerate", "bitsandbytes", "torch", "protobuf", "sentencepiece", "mistral-common>=1.5.0"]
# ///
import os
import torch
from datasets import load_dataset
from peft import LoraConfig, TaskType
from trl import SFTTrainer, SFTConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import trackio
print("="*50)
print("Starting Alizee Coder Devstral Training")
print("="*50)
# Configuration
MODEL_NAME = "mistralai/Devstral-Small-2505"
OUTPUT_REPO = "stmasson/alizee-coder-devstral-1-small"
DATASET_SIZE = 10000
# Verify HF_TOKEN
if not os.environ.get("HF_TOKEN"):
    raise ValueError("HF_TOKEN not set!")
print("HF_TOKEN verified")
print(f"Loading dataset nvidia/OpenCodeReasoning...")
try:
dataset = load_dataset("nvidia/OpenCodeReasoning", "split_0", split="split_0")
dataset = dataset.shuffle(seed=42).select(range(min(DATASET_SIZE, len(dataset))))
print(f"Dataset loaded: {len(dataset)} examples")
except Exception as e:
print(f"Error loading dataset: {e}")
raise
# Split train/eval
dataset_split = dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]
print(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")
# Format for code reasoning
def format_example(example):
    solution = example.get('solution', '') or ''
    output = example.get('output', '') or ''
    text = f"<s>[INST] Solve this programming problem with detailed reasoning:\n\n{example['input']}\n[/INST]\n\n**Reasoning:**\n{output}\n\n**Solution:**\n```python\n{solution}\n```</s>"
    return {"text": text}
print("Formatting dataset...")
train_dataset = train_dataset.map(format_example, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(format_example, remove_columns=eval_dataset.column_names)
print("Dataset formatted")
# Load tokenizer
print(f"Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer loaded")
# 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
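# Note (added comment): NF4 4-bit weights with double quantization and bfloat16
# compute keep the frozen base model's memory footprint low enough for
# QLoRA-style fine-tuning; only the LoRA adapters are trained in higher precision.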
print(f"Loading model {MODEL_NAME}...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
print("Model loaded")
# LoRA configuration
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
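# Note (added comment): rank-32 adapters with alpha=64 (scaling factor 2) are
# attached to every attention projection (q/k/v/o) and MLP projection
# (gate/up/down); the quantized base weights stay frozen.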
# Training config
training_config = SFTConfig(
    output_dir="./alizee-coder-devstral-1-small",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    max_seq_length=4096,
    dataset_text_field="text",
    logging_steps=10,
    save_strategy="steps",
    save_steps=200,
    eval_strategy="steps",
    eval_steps=200,
    bf16=True,
    push_to_hub=True,
    hub_model_id=OUTPUT_REPO,
    hub_strategy="every_save",
    report_to="trackio",
    run_name="alizee-coder-devstral-1-small",
)
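# Note (added comment): with per_device_train_batch_size=1 and
# gradient_accumulation_steps=16, each optimizer step sees an effective batch
# of 16 sequences per device. dataset_text_field is set here in SFTConfig, as
# recommended for the pinned trl version, instead of on the trainer below.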
print("Initializing trainer...")
trainer = SFTTrainer(
    model=model,
    args=training_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=lora_config,
    tokenizer=tokenizer,
)
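# Optional sanity check (not in the original script): if the trainer wrapped
# the base model in a PEFT model, report how many parameters are trainable.
if hasattr(trainer.model, "print_trainable_parameters"):
    trainer.model.print_trainable_parameters()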
print("="*50)
print("STARTING TRAINING")
print("="*50)
trainer.train()
print("Pushing to Hub...")
trainer.push_to_hub()
print(f"Done! Model: https://huggingface.co/{OUTPUT_REPO}")