mode: pt
device: gpu
precision: bf16
eval_only: false
predict_only: false
seed: 80085
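
# Model: local T5 implementation (nanoT5-style). With random_init: true the
# weights are trained from scratch; the named repo presumably supplies only the
# architecture config. compile: true enables torch.compile.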
model:
  klass: local_t5
  name: pszemraj/tFINE-base-65kBPE-FLAN
  overwrite:
    dropout_rate: 0.0
  add_config:
    is_bf16: false
  checkpoint_path: ''
  random_init: true
  compile: true
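
# Tokenizer: a custom 48k-vocab BPE tokenizer trained for T5 on SlimPajama,
# per the repo name.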
tokenizer:
  name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
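
# Data: T5-style span corruption over 1024-token inputs; roughly 15% of tokens
# are corrupted in noise spans averaging 3 tokens.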
data:
  input_length: 1024
  mlm_probability: 0.15
  mean_noise_span_length: 3.0
  num_workers: 8
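
# Optimizer: adamwscale (an AdamW variant, as in nanoT5, that scales updates by
# parameter RMS), with a 10k-step warmup and a cosine schedule decaying toward
# final_cosine. batch_size appears to be the effective batch size; with
# grad_acc: 24 that implies micro-batches of 5 sequences. epochs: -1 defers to
# total_steps.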
optim:
  name: adamwscale
  base_lr: 0.008
  batch_size: 120
  total_steps: 80000
  epochs: -1
  warmup_steps: 10000
  lr_scheduler: cosine
  weight_decay: 0.0001
  grad_clip: 1.0
  grad_acc: 24
  final_cosine: 1.0e-05
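
# Eval: every_steps (100000) exceeds total_steps (80000), so in-training
# evaluation effectively never triggers; steps presumably caps an eval pass
# at 500 batches.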
eval:
  every_steps: 100000
  steps: 500
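
# Checkpoint: save every 5000 steps.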
checkpoint:
  every_steps: 5000
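
# Logging: Neptune disabled (credentials left null); metrics logged every 50
# steps, including gradient and weight L2 norms.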
logging:
  neptune: false
  neptune_creds:
    project: null
    api_token: null
    tags: ''
  every_steps: 50
  grad_l2: true
  weights_l2: true
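
For reference, a config shaped like this can be loaded and inspected with OmegaConf (nanoT5-style trainers wire it through Hydra); a minimal sketch, assuming the YAML above is saved to a hypothetical `pt_config.yaml`:

```python
from omegaconf import OmegaConf

# Hypothetical path: the YAML above saved to disk.
cfg = OmegaConf.load("pt_config.yaml")

# Dotted access mirrors the nesting of the file.
print(cfg.mode)           # "pt"
print(cfg.model.name)     # "pszemraj/tFINE-base-65kBPE-FLAN"
print(cfg.optim.base_lr)  # 0.008

# Assuming optim.batch_size is the effective batch (nanoT5 convention),
# grad_acc splits it into micro-batches:
micro_batch = cfg.optim.batch_size // cfg.optim.grad_acc  # 120 // 24 = 5
print(micro_batch)
```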