# Source snapshot: commit 8b402c9 (file size 2,259 bytes).
# NOTE(review): removed a web-extraction artifact here — a line-number gutter
# of bare scalars (1..86) that made this file invalid YAML.
# Hydra-style training configuration (objects built via `_target_` paths)
# for the me100M model on the finewebedu-20B dataset, bpe32000minipile tokenizer.
loggers:
  # TensorBoard logger; empty name and null version keep a flat log-dir layout.
  tensorboard:
    _target_: src.trainer.TensorBoardLogger
    save_dir: ./
    name: ''
    version: null
callbacks:
  # Logs the current learning rate each step.
  lr_monitor:
    _target_: src.callbacks.lr_monitor.SimpleLearningRateMonitor
  # Gradient-norm logging: L2 norm, total only (no per-group histograms).
  grad_norm:
    _target_: src.callbacks.grad_norm.GradNorm
    norm_type: 2
    group_separator: /
    histogram_freq: null
    check_clipping: false
    log_weight_distribution: false
    only_total: true
  # Tracks throughput (tokens/sec etc. — see src.callbacks.speed_monitor).
  speed_monitor:
    _target_: src.callbacks.speed_monitor.SpeedMonitor
  # Accumulation schedule is {start_epoch: factor}; factor 1 from epoch 0
  # means no gradient accumulation is actually applied.
  grad_accum:
    _target_: src.callbacks.gradient_accumulation.GradientAccumulationScheduler
    scheduling:
      0: 1
  # Checkpoint every 2000 steps, keep all of them (save_top_k: -1).
  # NOTE(review): `save_last: link` presumably creates last.ckpt as a symlink
  # to the newest checkpoint (Lightning-style) — confirm against the
  # src.callbacks.model_checkpoint implementation.
  model_checkpoint:
    _target_: src.callbacks.model_checkpoint.ModelCheckpoint
    dirpath: .checkpoints
    filename: '{step}'
    enable_version_counter: false
    every_n_train_steps: 2000
    save_top_k: -1
    save_last: link
    verbose: true
    save_initial_checkpoint: true
# --- Run identity and filesystem paths (absolute paths are cluster-specific) ---
tok_path: /home/pl487/rds/hpc-work/merge-effect/outputs/tokenizers/bpe32000minipile
run_folder: me100M_finewebedu-20B_bpe32000minipile
out_parent_folder: model_train
tok_name: bpe32000minipile
dataset: finewebedu-20B
pwd: /home/pl487/rds/hpc-work/merge-effect
train_data_path: /home/pl487/rds/hpc-work/merge-effect/data/finewebedu-20B/bpe32000minipile/train
val_data_path: /home/pl487/rds/hpc-work/merge-effect/data/finewebedu-20B/bpe32000minipile/validation
model: me100M
# Resume path matches model_checkpoint.dirpath + the `save_last: link` entry,
# so a fresh run with no checkpoint present presumably starts from scratch —
# verify the trainer tolerates a missing file here.
resume_from_checkpoint: .checkpoints/last.ckpt
save_initial_checkpoint: true
seed: 42
torch_compile: true
data:
  batch_size: 32
  eval_batch_size: 128
  shuffle: true
  drop_last: false
  num_workers: 32
  pin_memory: true
  persistent_workers: false
  prefetch_factor: 2
  multiprocessing_context: null
optim:
  optim_name: adamw
  lr: 0.0006
  weight_decay: 0.01
  optim_kwargs:
    fused: true
    eps: 1.0e-08
    # AdamW betas (beta1, beta2); 0.95 beta2 is the common LLM-pretraining choice.
    betas:
    - 0.9
    - 0.95
  # Warmup-stable-decay LR schedule:
  # 2000 warmup + 44000 stable + 4000 decay = 50000 = trainer.max_steps.
  scheduler_name: warmup_stable_decay
  num_warmup_steps: 2000
  scheduler_kwargs:
    num_stable_steps: 44000
    num_decay_steps: 4000
    min_lr_ratio: 0.01
trainer:
  accelerator: gpu
  devices: 4
  # True bf16 (weights and compute in bfloat16, not mixed precision).
  precision: bf16-true
  deterministic: false
  log_every_n_steps: 1
  enable_progress_bar: true
  fast_dev_run: false
  # Clip gradients by total L2 norm at 1.0 (GradNorm callback above only logs;
  # check_clipping is false there).
  gradient_clip_val: 1.0
  gradient_clip_algorithm: norm
  # Validation cadence matches the 2000-step checkpoint interval.
  val_check_interval: 2000
  max_steps: 50000
  limit_val_batches: 500