---
# Training configuration for a T5-style pre-training run.
#
# NOTE(review): this file had been collapsed onto a single line, which is not
# parseable YAML. The nesting below was reconstructed from key order and key
# names; in particular `tokenizer` is placed at top level and `tags` under
# `logging.neptune_creds` — confirm against the code that consumes this file.

# Experiment-level settings.
mode: pt
device: gpu
precision: bf16
eval_only: false
predict_only: false
seed: 80085

# Model selection and construction.
model:
  klass: local_t5
  name: pszemraj/tFINE-base-65kBPE-FLAN
  # Values presumably overriding fields of the loaded model config — confirm.
  overwrite:
    dropout_rate: 0.0
  # NOTE(review): `is_bf16` is false while top-level `precision` is bf16 —
  # confirm this combination is intended.
  add_config:
    is_bf16: false
  checkpoint_path: ''
  random_init: true
  compile: true

# NOTE(review): the tokenizer name suggests a 48128-entry vocabulary while
# `model.name` suggests a 65k BPE vocabulary — confirm the model's vocabulary
# size is derived from this tokenizer when `random_init` is true.
tokenizer:
  name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5

# Input pipeline / masking settings.
data:
  input_length: 1024
  mlm_probability: 0.15
  mean_noise_span_length: 3.0
  num_workers: 8

# Optimizer and learning-rate schedule.
optim:
  name: adamwscale
  base_lr: 0.008
  # NOTE(review): presumably split across `grad_acc` micro-steps
  # (120 / 24 = 5 per step) — confirm with the training loop.
  batch_size: 120
  total_steps: 80000
  # Presumably a non-positive value leaves `total_steps` in control — confirm.
  epochs: -1
  warmup_steps: 10000
  lr_scheduler: cosine
  weight_decay: 0.0001
  grad_clip: 1.0
  grad_acc: 24
  final_cosine: 1.0e-05

# Evaluation cadence.
eval:
  # NOTE(review): 100000 exceeds `optim.total_steps` (80000), so presumably no
  # periodic evaluation fires before training ends — confirm this is intended.
  every_steps: 100000
  steps: 500

# Checkpoint cadence.
checkpoint:
  every_steps: 5000

# Logging / experiment tracking.
logging:
  neptune: false
  neptune_creds:
    project: null
    api_token: null
    tags: ''
  every_steps: 50
  grad_l2: true
  weights_l2: true