{
  "name": "blt_main_entropy_100m_512w",
  "dump_dir": "/checkpoints/blt_main_entropy_100m_512w",
  "seed": 42,
  "debug_dynamo": false,
  "grad_acc_steps": 1,
  "gc_collect_freq": 1000,
  "probe_freq": null,
  "steps": 100000,
  "max_steps": null,
  "data": {
    "s3_profile": null,
    "batch_size": 8,
    "seq_len": 8192,
    "seed": 42,
    "add_bos": true,
    "add_eos": true,
    "load_async": false,
    "prefetch_size": 20,
    "preprocess_dir": "/corpora/entropy_preprocess",
    "dataset_files": null,
    "entropy_model_name": "transformer_100m",
    "arrow_batch_size": 100,
    "buffer_size": 512,
    "pad_to_max_length": true,
    "max_encoder_seq_length": 8192,
    "enable_byte_ngrams": false,
    "add_patches": false,
    "tokenizer_args": {
      "name": "blt",
      "init_kwargs": null
    },
    "patcher_args": {
      "patching_mode": "byte",
      "patching_device": "cuda",
      "entropy_model_checkpoint_dir": null,
      "realtime_patching": false,
      "threshold": 1.335442066192627,
      "threshold_add": null,
      "max_patch_length": null,
      "patch_size": 4.5,
      "patching_batch_size": 1,
      "device": "cuda",
      "monotonicity": false,
      "log_time": false
    }
  },
  "optim": {
    "lr": 0.0004,
    "weight_decay": 0.1,
    "epsilon": 1e-08,
    "beta1": 0.9,
    "beta2": 0.95,
    "clip": 10.0,
    "scheduler": "cosine",
    "warmup": 500,
    "lr_min_ratio": 0.1,
    "cycle_length": 1.0,
    "cosine_theta": 1.0,
    "annealing_step": 1000,
    "decay_fraction": 0.1,
    "exp_factor": 0.5
  },
  "model": null,
  "entropy_model": {
    "dim": 768,
    "n_layers": 14,
    "head_dim": null,
    "n_heads": 12,
    "n_kv_heads": null,
    "ffn_dim_multiplier": 1.0,
    "multiple_of": 256,
    "norm_eps": 1e-05,
    "rope_theta": 10000.0,
    "rope_use_fp32_in_outer_product": false,
    "init_base_std": null,
    "init_std_factor": "current_depth",
    "max_seqlen": 8192,
    "attn_impl": "xformers",
    "attn_bias_type": "local_block_causal",
    "eos_id": 2,
    "seed": 42,
    "vocab_size": 260,
    "weight_tying": false,
    "sliding_window": 512
  },
  "train_entropy_model": true,
  "distributed": {
    "dp_shard": 1,
    "dp_replicate": 8,
    "tp_size": 1,
    "selective_activation_checkpointing": false,
    "compile": false,
    "fsdp_type": "full_shard",
    "model_dtype": "bf16",
    "float8_recipe": null,
    "float8_filter": "layers\\.[0-9]+\\.",
    "matmul_allow_tf32": false,
    "allow_bf16_reduced_precision_reduction": true,
    "detect_anomaly": false,
    "compile_cache_size_limit": 8,
    "spawn_method": "forkserver"
  },
  "env": {
    "MKL_SERVICE_FORCE_INTEL": "GNU",
    "OMP_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
    "ENABLE_INTRA_NODE_COMM": "1",
    "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
    "NCCL_IB_TIMEOUT": "22",
    "NCCL_DEBUG": "INFO",
    "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"
  },
  "checkpoint": {
    "dump": {
      "every": 500,
      "keep": 3
    },
    "eval": {
      "every": 1000,
      "keep": -1
    },
    "path": "/checkpoints/blt_main_entropy_100m_512w/checkpoints",
    "init_ckpt_path": null,
    "continue_training_from_init": false,
    "s3_profile": null
  },
  "profiling": {
    "run": false,
    "trace_folder": "profiling",
    "mem_warmup": 100,
    "mem_steps": 2,
    "profile_warmup": 102,
    "profile_steps": 2
  },
  "logging": {
    "freq": 10,
    "acc_freq": null,
    "wandb": null
  },
  "async_eval_gpus": null,
  "eval": null,
  "eval_on_gpus": 8
}
|