SmolFactory / train.py
Tonic's picture
adds A100 large experiments
5fe83da verified
raw
history blame
6.13 kB
#!/usr/bin/env python3
"""
SmolLM3 Fine-tuning Script for FlexAI Console
Based on the nanoGPT structure but adapted for SmolLM3 model
"""
import os
import sys
import argparse
import json
import torch
import logging
from pathlib import Path
from typing import Optional, Dict, Any
# Add the current directory to the path for imports
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from config import get_config
from model import SmolLM3Model
from data import SmolLM3Dataset
from trainer import SmolLM3Trainer
def setup_logging():
    """Configure logging for the script and return this module's logger.

    Calls ``logging.basicConfig`` with level INFO, a timestamped
    ``name - level - message`` format, and two handlers: one writing to
    stdout and one writing to ``training.log`` (a relative path, so it is
    resolved against the current working directory).

    Returns:
        The ``logging.Logger`` named after this module (``__name__``).
    """
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

    # Handlers are built in the same order as they are registered:
    # console first, then the log file.
    console_handler = logging.StreamHandler(sys.stdout)
    file_handler = logging.FileHandler('training.log')

    logging.basicConfig(
        level=logging.INFO,
        format=log_format,
        handlers=[console_handler, file_handler],
    )
    return logging.getLogger(__name__)
def parse_args(argv=None):
    """Parse command line arguments for the fine-tuning script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is the
            behaviour existing callers rely on.

    Returns:
        argparse.Namespace holding the positional ``config`` path plus the
        dataset, checkpoint, training, model, logging and Trackio options.
        Training overrides (``max_iters``, ``batch_size``,
        ``learning_rate``, ``gradient_accumulation_steps``) and the Trackio
        string options default to ``None`` so the caller can tell "not
        given" apart from an explicit value.
    """
    parser = argparse.ArgumentParser(description='SmolLM3 Fine-tuning Script')

    # Configuration file
    parser.add_argument('config', type=str, help='Path to configuration file')

    # Dataset arguments
    parser.add_argument('--dataset_dir', type=str, default='my_dataset',
                        help='Path to dataset directory within /input')

    # Checkpoint arguments
    parser.add_argument('--out_dir', type=str, default='/output-checkpoint',
                        help='Output directory for checkpoints')
    parser.add_argument('--init_from', type=str, default='scratch',
                        choices=['scratch', 'resume', 'pretrained'],
                        help='Initialization method')

    # Training arguments
    parser.add_argument('--max_iters', type=int, default=None,
                        help='Maximum number of training iterations')
    parser.add_argument('--batch_size', type=int, default=None,
                        help='Batch size for training')
    parser.add_argument('--learning_rate', type=float, default=None,
                        help='Learning rate')
    parser.add_argument('--gradient_accumulation_steps', type=int, default=None,
                        help='Gradient accumulation steps')

    # Model arguments
    parser.add_argument('--model_name', type=str,
                        default='HuggingFaceTB/SmolLM3-3B',
                        help='Model name or path')
    parser.add_argument('--max_seq_length', type=int, default=4096,
                        help='Maximum sequence length')

    # Logging and saving
    parser.add_argument('--save_steps', type=int, default=500,
                        help='Save checkpoint every N steps')
    parser.add_argument('--eval_steps', type=int, default=100,
                        help='Evaluate every N steps')
    parser.add_argument('--logging_steps', type=int, default=10,
                        help='Log every N steps')

    # Trackio monitoring arguments
    parser.add_argument('--enable_tracking', action='store_true', default=True,
                        help='Enable Trackio experiment tracking')
    # `--enable_tracking` is store_true with default=True, so on its own it
    # can never yield False. This companion flag writes False to the same
    # destination so tracking can actually be switched off.
    parser.add_argument('--disable_tracking', dest='enable_tracking',
                        action='store_false',
                        help='Disable Trackio experiment tracking')
    parser.add_argument('--trackio_url', type=str, default=None,
                        help='Trackio server URL')
    parser.add_argument('--trackio_token', type=str, default=None,
                        help='Trackio authentication token')
    parser.add_argument('--experiment_name', type=str, default=None,
                        help='Custom experiment name for tracking')

    return parser.parse_args(argv)
def main():
    """Run the fine-tuning pipeline: parse args, build objects, train.

    Steps, in order:
      1. Parse the command line and set up logging.
      2. Load the configuration via ``get_config(args.config)`` and copy
         every CLI override that is not ``None`` onto it.
      3. Create the output directory if it does not exist.
      4. Build the model, pick the dataset location, build the dataset and
         the trainer.
      5. Call ``trainer.train()``.

    Raises:
        Exception: anything raised by ``trainer.train()`` is logged (with
            traceback) and re-raised unchanged.
    """
    args = parse_args()
    logger = setup_logging()
    logger.info("Starting SmolLM3 fine-tuning...")

    # The parsed arguments include the Trackio authentication token; mask it
    # so the secret is not written to stdout or to the log file.
    loggable_args = dict(vars(args))
    if loggable_args.get('trackio_token'):
        loggable_args['trackio_token'] = '***'
    logger.info(f"Arguments: {loggable_args}")

    # Load configuration
    config = get_config(args.config)

    # Override config with command line arguments. A value of None means
    # the option was not given, so the value from the config file is kept.
    override_names = (
        'max_iters',
        'batch_size',
        'learning_rate',
        'gradient_accumulation_steps',
        # Trackio configuration
        'enable_tracking',
        'trackio_url',
        'trackio_token',
        'experiment_name',
    )
    for name in override_names:
        value = getattr(args, name)
        if value is not None:
            setattr(config, name, value)

    # Setup paths and ensure the output directory exists
    output_path = args.out_dir
    os.makedirs(output_path, exist_ok=True)
    logger.info(f"Output path: {output_path}")

    # Initialize model
    model = SmolLM3Model(
        model_name=args.model_name,
        max_seq_length=args.max_seq_length,
        config=config
    )

    # Determine dataset path: a truthy config.dataset_name takes precedence
    # over the local directory under /input.
    if hasattr(config, 'dataset_name') and config.dataset_name:
        # Use Hugging Face dataset
        dataset_path = config.dataset_name
        logger.info(f"Using Hugging Face dataset: {dataset_path}")
    else:
        # Use local dataset
        dataset_path = os.path.join('/input', args.dataset_dir)
        logger.info(f"Using local dataset: {dataset_path}")

    # Load dataset with filtering options; both options fall back to the
    # defaults below when the config does not define them.
    dataset = SmolLM3Dataset(
        data_path=dataset_path,
        tokenizer=model.tokenizer,
        max_seq_length=args.max_seq_length,
        filter_bad_entries=getattr(config, 'filter_bad_entries', False),
        bad_entry_field=getattr(config, 'bad_entry_field', 'bad_entry')
    )

    # Initialize trainer
    trainer = SmolLM3Trainer(
        model=model,
        dataset=dataset,
        config=config,
        output_dir=output_path,
        init_from=args.init_from
    )

    # Start training
    try:
        trainer.train()
        logger.info("Training completed successfully!")
    except Exception as e:
        # logger.exception records the traceback in the log as well, so the
        # failure can be diagnosed from training.log alone.
        logger.exception(f"Training failed: {e}")
        raise
# Script entry point: start training only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()