[model]
config = "fla-hub/transformer-1.3B-100B"
tokenizer_path = "fla-hub/transformer-1.3B-100B"

[job]
dump_folder = "exp"
print_args = true

[training]
batch_size = 32
seq_len = 2048
context_len = 2048
gradient_accumulation_steps = 1
steps = 20480
max_norm = 1.0
skip_nan_inf = true
data_parallel_replicate_degree = 1
data_parallel_shard_degree = -1
tensor_parallel_degree = 1
compile = false
dataset = "HuggingFaceFW/fineweb-edu"
dataset_name = "default"
num_workers = 32
pin_memory = false
persistent_workers = false
prefetch_factor = 2
seed = 42
varlen = false

[optimizer]
name = "AdamW"
eps = 1e-15
lr = 3e-4

[lr_scheduler]
warmup_steps = 1024
decay_type = "cosine"
lr_min = 0.1

[checkpoint]
enable_checkpoint = true
folder = "checkpoint"
interval_type = "steps"
interval = 2048
model_weights_only = false
export_dtype = "float32"
async_mode = "disabled"  # ["disabled", "async", "async_with_pinned_mem"]

[profiling]
enable_profiling = true
save_traces_folder = "profile_trace"
profile_freq = 512

[metrics]
log_freq = 32
enable_wandb = true

[experimental]
context_parallel_degree = 1
pipeline_parallel_degree = 1

[float8]
enable_fsdp_float8_all_gather = false
precompute_float8_dynamic_scale_for_fsdp = false

[activation_checkpoint]
mode = "none"
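
# Back-of-the-envelope token budget for this config (a sketch, assuming the
# torchtitan convention that `batch_size` is the per data-parallel-rank batch
# size and that sequence packing does not change the effective token count):
#   tokens per rank per step = batch_size * seq_len = 32 * 2048 = 65,536
#   tokens per rank overall  = 65,536 * steps = 65,536 * 20,480 ≈ 1.34B
#   total tokens             ≈ 1.34B * (number of data-parallel ranks)
# With data_parallel_shard_degree = -1, the data-parallel degree is resolved
# at launch time from the available devices.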