blt-7b / entropy_model / params.json
{
"name": "blt_main_entropy_100m_512w",
"dump_dir": "/checkpoints/blt_main_entropy_100m_512w",
"seed": 42,
"debug_dynamo": false,
"grad_acc_steps": 1,
"gc_collect_freq": 1000,
"probe_freq": null,
"steps": 100000,
"max_steps": null,
"data": {
"s3_profile": null,
"batch_size": 8,
"seq_len": 8192,
"seed": 42,
"add_bos": true,
"add_eos": true,
"load_async": false,
"prefetch_size": 20,
"preprocess_dir": "/corpora/entropy_preprocess",
"dataset_files": null,
"entropy_model_name": "transformer_100m",
"arrow_batch_size": 100,
"buffer_size": 512,
"pad_to_max_length": true,
"max_encoder_seq_length": 8192,
"enable_byte_ngrams": false,
"add_patches": false,
"tokenizer_args": {
"name": "blt",
"init_kwargs": null
},
"patcher_args": {
"patching_mode": "byte",
"patching_device": "cuda",
"entropy_model_checkpoint_dir": null,
"realtime_patching": false,
"threshold": 1.335442066192627,
"threshold_add": null,
"max_patch_length": null,
"patch_size": 4.5,
"patching_batch_size": 1,
"device": "cuda",
"monotonicity": false,
"log_time": false
}
},
"optim": {
"lr": 0.0004,
"weight_decay": 0.1,
"epsilon": 1e-08,
"beta1": 0.9,
"beta2": 0.95,
"clip": 10.0,
"scheduler": "cosine",
"warmup": 500,
"lr_min_ratio": 0.1,
"cycle_length": 1.0,
"cosine_theta": 1.0,
"annealing_step": 1000,
"decay_fraction": 0.1,
"exp_factor": 0.5
},
"model": null,
"entropy_model": {
"dim": 768,
"n_layers": 14,
"head_dim": null,
"n_heads": 12,
"n_kv_heads": null,
"ffn_dim_multiplier": 1.0,
"multiple_of": 256,
"norm_eps": 1e-05,
"rope_theta": 10000.0,
"rope_use_fp32_in_outer_product": false,
"init_base_std": null,
"init_std_factor": "current_depth",
"max_seqlen": 8192,
"attn_impl": "xformers",
"attn_bias_type": "local_block_causal",
"eos_id": 2,
"seed": 42,
"vocab_size": 260,
"weight_tying": false,
"sliding_window": 512
},
"train_entropy_model": true,
"distributed": {
"dp_shard": 1,
"dp_replicate": 8,
"tp_size": 1,
"selective_activation_checkpointing": false,
"compile": false,
"fsdp_type": "full_shard",
"model_dtype": "bf16",
"float8_recipe": null,
"float8_filter": "layers\\.[0-9]+\\.",
"matmul_allow_tf32": false,
"allow_bf16_reduced_precision_reduction": true,
"detect_anomaly": false,
"compile_cache_size_limit": 8,
"spawn_method": "forkserver"
},
"env": {
"MKL_SERVICE_FORCE_INTEL": "GNU",
"OMP_NUM_THREADS": "1",
"MKL_NUM_THREADS": "1",
"ENABLE_INTRA_NODE_COMM": "1",
"TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
"NCCL_IB_TIMEOUT": "22",
"NCCL_DEBUG": "INFO",
"TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"
},
"checkpoint": {
"dump": {
"every": 500,
"keep": 3
},
"eval": {
"every": 1000,
"keep": -1
},
"path": "/checkpoints/blt_main_entropy_100m_512w/checkpoints",
"init_ckpt_path": null,
"continue_training_from_init": false,
"s3_profile": null
},
"profiling": {
"run": false,
"trace_folder": "profiling",
"mem_warmup": 100,
"mem_steps": 2,
"profile_warmup": 102,
"profile_steps": 2
},
"logging": {
"freq": 10,
"acc_freq": null,
"wandb": null
},
"async_eval_gpus": null,
"eval": null,
"eval_on_gpus": 8
}
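
For reference, a minimal sketch of how this config might be inspected, assuming only the Python standard library and that the file sits at entropy_model/params.json in a local snapshot of the repo; the head_dim fallback of dim // n_heads is the usual convention for Llama-style transformers when head_dim is null, and is an assumption rather than something stated in this file.

import json

# Hypothetical local path; point this at your snapshot of the repo.
with open("entropy_model/params.json") as f:
    cfg = json.load(f)

ent = cfg["entropy_model"]

# head_dim is null in this file, so fall back to dim // n_heads (768 // 12 = 64),
# following the usual Llama-style convention (an assumption, not stated here).
head_dim = ent["head_dim"] or ent["dim"] // ent["n_heads"]

print(f"layers={ent['n_layers']}  dim={ent['dim']}  n_heads={ent['n_heads']}  head_dim={head_dim}")
print(f"vocab_size={ent['vocab_size']}  max_seqlen={ent['max_seqlen']}  sliding_window={ent['sliding_window']}")
print(f"patching threshold={cfg['data']['patcher_args']['threshold']}")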