{ "data": { "train_files": "data/fineweb_10B_gpt2/fineweb_train_*.bin", "val_files": "data/fineweb_10B_gpt2/fineweb_val_*.bin", "batch_size": 512, "device_batch_size": 32 }, "model": { "dim": 768, "n_layers": 4, "n_heads": 12, "n_kv_heads": 12, "vocab_size": 50257, "multiple_of": 256, "ffn_dim_multiplier": 4, "norm_eps": 1e-05, "rope_theta": 10000, "use_scaled_rope": false, "max_seq_len": 1024, "initializer_range": 0.02, "zero_init_masks": true }, "optimizer": { "default": { "lr": 0.001, "beta1": 0.8, "beta2": 0.95, "eps": 1e-10, "weight_decay": 0 }, "masks": { "lr": 0.001, "beta1": 0.8, "beta2": 0.95, "eps": 1e-10, "weight_decay": 0 }, "norms": { "lr": 0.001, "beta1": 0.8, "beta2": 0.95, "eps": 1e-10, "weight_decay": 0 } }, "scheduler": { "warmup_steps": 0.1, "start_factor": 0.1 }, "gates": {}, "gates_zero_eps": 1e-08, "seed": 0, "project": "fineweb-baseline", "run_id": null, "logdir": "logs/fineweb-baseline", "log_gradients": false, "log_params": false, "log_every_steps": 1, "val_every_steps": 100, "save_every_steps": -1 }