{ "name": "blt_main_entropy_100m_512w", "dump_dir": "/checkpoints/blt_main_entropy_100m_512w", "seed": 42, "debug_dynamo": false, "grad_acc_steps": 1, "gc_collect_freq": 1000, "probe_freq": null, "steps": 100000, "max_steps": null, "data": { "s3_profile": null, "batch_size": 8, "seq_len": 8192, "seed": 42, "add_bos": true, "add_eos": true, "load_async": false, "prefetch_size": 20, "preprocess_dir": "/corpora/entropy_preprocess", "dataset_files": null, "entropy_model_name": "transformer_100m", "arrow_batch_size": 100, "buffer_size": 512, "pad_to_max_length": true, "max_encoder_seq_length": 8192, "enable_byte_ngrams": false, "add_patches": false, "tokenizer_args": { "name": "blt", "init_kwargs": null }, "patcher_args": { "patching_mode": "byte", "patching_device": "cuda", "entropy_model_checkpoint_dir": null, "realtime_patching": false, "threshold": 1.335442066192627, "threshold_add": null, "max_patch_length": null, "patch_size": 4.5, "patching_batch_size": 1, "device": "cuda", "monotonicity": false, "log_time": false } }, "optim": { "lr": 0.0004, "weight_decay": 0.1, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "clip": 10.0, "scheduler": "cosine", "warmup": 500, "lr_min_ratio": 0.1, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.1, "exp_factor": 0.5 }, "model": null, "entropy_model": { "dim": 768, "n_layers": 14, "head_dim": null, "n_heads": 12, "n_kv_heads": null, "ffn_dim_multiplier": 1.0, "multiple_of": 256, "norm_eps": 1e-05, "rope_theta": 10000.0, "rope_use_fp32_in_outer_product": false, "init_base_std": null, "init_std_factor": "current_depth", "max_seqlen": 8192, "attn_impl": "xformers", "attn_bias_type": "local_block_causal", "eos_id": 2, "seed": 42, "vocab_size": 260, "weight_tying": false, "sliding_window": 512 }, "train_entropy_model": true, "distributed": { "dp_shard": 1, "dp_replicate": 8, "tp_size": 1, "selective_activation_checkpointing": false, "compile": false, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": false, "allow_bf16_reduced_precision_reduction": true, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver" }, "env": { "MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1" }, "checkpoint": { "dump": { "every": 500, "keep": 3 }, "eval": { "every": 1000, "keep": -1 }, "path": "/checkpoints/blt_main_entropy_100m_512w/checkpoints", "init_ckpt_path": null, "continue_training_from_init": false, "s3_profile": null }, "profiling": { "run": false, "trace_folder": "profiling", "mem_warmup": 100, "mem_steps": 2, "profile_warmup": 102, "profile_steps": 2 }, "logging": { "freq": 10, "acc_freq": null, "wandb": null }, "async_eval_gpus": null, "eval": null, "eval_on_gpus": 8 }