pietrolesci committed on
Commit
8b402c9
·
verified ·
1 Parent(s): 8d51b76

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Experiment Configuration
2
+ ```yaml
3
+ callbacks:
4
+ grad_accum:
5
+ _target_: src.callbacks.gradient_accumulation.GradientAccumulationScheduler
6
+ scheduling:
7
+ 0: 1
8
+ grad_norm:
9
+ _target_: src.callbacks.grad_norm.GradNorm
10
+ check_clipping: false
11
+ group_separator: /
12
+ histogram_freq: null
13
+ log_weight_distribution: false
14
+ norm_type: 2
15
+ only_total: true
16
+ lr_monitor:
17
+ _target_: src.callbacks.lr_monitor.SimpleLearningRateMonitor
18
+ model_checkpoint:
19
+ _target_: src.callbacks.model_checkpoint.ModelCheckpoint
20
+ dirpath: .checkpoints
21
+ enable_version_counter: false
22
+ every_n_train_steps: 2000
23
+ filename: '{step}'
24
+ save_initial_checkpoint: true
25
+ save_last: link
26
+ save_top_k: -1
27
+ verbose: true
28
+ speed_monitor:
29
+ _target_: src.callbacks.speed_monitor.SpeedMonitor
30
+ data:
31
+ batch_size: 32
32
+ drop_last: false
33
+ eval_batch_size: 128
34
+ multiprocessing_context: null
35
+ num_workers: 32
36
+ persistent_workers: false
37
+ pin_memory: true
38
+ prefetch_factor: 2
39
+ shuffle: true
40
+ dataset: finewebedu-20B
41
+ loggers:
42
+ tensorboard:
43
+ _target_: src.trainer.TensorBoardLogger
44
+ name: ''
45
+ save_dir: ./
46
+ version: null
47
+ model: me100M
48
+ optim:
49
+ lr: 0.0006
50
+ num_warmup_steps: 2000
51
+ optim_kwargs:
52
+ betas:
53
+ - 0.9
54
+ - 0.95
55
+ eps: 1.0e-08
56
+ fused: true
57
+ optim_name: adamw
58
+ scheduler_kwargs:
59
+ min_lr_ratio: 0.01
60
+ num_decay_steps: 4000
61
+ num_stable_steps: 44000
62
+ scheduler_name: warmup_stable_decay
63
+ weight_decay: 0.01
64
+ out_parent_folder: model_train
65
+ pwd: ./rds/hpc-work/merge-effect
66
+ resume_from_checkpoint: .checkpoints/last.ckpt
67
+ run_folder: me100M_finewebedu-20B_bpe32000minipile
68
+ save_initial_checkpoint: true
69
+ seed: 42
70
+ tok_name: bpe32000minipile
71
+ tok_path: ./rds/hpc-work/merge-effect/outputs/tokenizers/bpe32000minipile
72
+ torch_compile: true
73
+ train_data_path: ./rds/hpc-work/merge-effect/data/finewebedu-20B/bpe32000minipile/train
74
+ trainer:
75
+ accelerator: gpu
76
+ deterministic: false
77
+ devices: 4
78
+ enable_progress_bar: true
79
+ fast_dev_run: false
80
+ gradient_clip_algorithm: norm
81
+ gradient_clip_val: 1.0
82
+ limit_val_batches: 500
83
+ log_every_n_steps: 1
84
+ max_steps: 50000
85
+ precision: bf16-true
86
+ val_check_interval: 2000
87
+ val_data_path: ./rds/hpc-work/merge-effect/data/finewebedu-20B/bpe32000minipile/validation
88
+ ```
hparams.yaml ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ loggers:
2
+ tensorboard:
3
+ _target_: src.trainer.TensorBoardLogger
4
+ save_dir: ./
5
+ name: ''
6
+ version: null
7
+ callbacks:
8
+ lr_monitor:
9
+ _target_: src.callbacks.lr_monitor.SimpleLearningRateMonitor
10
+ grad_norm:
11
+ _target_: src.callbacks.grad_norm.GradNorm
12
+ norm_type: 2
13
+ group_separator: /
14
+ histogram_freq: null
15
+ check_clipping: false
16
+ log_weight_distribution: false
17
+ only_total: true
18
+ speed_monitor:
19
+ _target_: src.callbacks.speed_monitor.SpeedMonitor
20
+ grad_accum:
21
+ _target_: src.callbacks.gradient_accumulation.GradientAccumulationScheduler
22
+ scheduling:
23
+ 0: 1
24
+ model_checkpoint:
25
+ _target_: src.callbacks.model_checkpoint.ModelCheckpoint
26
+ dirpath: .checkpoints
27
+ filename: '{step}'
28
+ enable_version_counter: false
29
+ every_n_train_steps: 2000
30
+ save_top_k: -1
31
+ save_last: link
32
+ verbose: true
33
+ save_initial_checkpoint: true
34
+ tok_path: /home/pl487/rds/hpc-work/merge-effect/outputs/tokenizers/bpe32000minipile
35
+ run_folder: me100M_finewebedu-20B_bpe32000minipile
36
+ out_parent_folder: model_train
37
+ tok_name: bpe32000minipile
38
+ dataset: finewebedu-20B
39
+ pwd: /home/pl487/rds/hpc-work/merge-effect
40
+ train_data_path: /home/pl487/rds/hpc-work/merge-effect/data/finewebedu-20B/bpe32000minipile/train
41
+ val_data_path: /home/pl487/rds/hpc-work/merge-effect/data/finewebedu-20B/bpe32000minipile/validation
42
+ model: me100M
43
+ resume_from_checkpoint: .checkpoints/last.ckpt
44
+ save_initial_checkpoint: true
45
+ seed: 42
46
+ torch_compile: true
47
+ data:
48
+ batch_size: 32
49
+ eval_batch_size: 128
50
+ shuffle: true
51
+ drop_last: false
52
+ num_workers: 32
53
+ pin_memory: true
54
+ persistent_workers: false
55
+ prefetch_factor: 2
56
+ multiprocessing_context: null
57
+ optim:
58
+ optim_name: adamw
59
+ lr: 0.0006
60
+ weight_decay: 0.01
61
+ optim_kwargs:
62
+ fused: true
63
+ eps: 1.0e-08
64
+ betas:
65
+ - 0.9
66
+ - 0.95
67
+ scheduler_name: warmup_stable_decay
68
+ num_warmup_steps: 2000
69
+ scheduler_kwargs:
70
+ num_stable_steps: 44000
71
+ num_decay_steps: 4000
72
+ min_lr_ratio: 0.01
73
+ trainer:
74
+ accelerator: gpu
75
+ devices: 4
76
+ precision: bf16-true
77
+ deterministic: false
78
+ log_every_n_steps: 1
79
+ enable_progress_bar: true
80
+ fast_dev_run: false
81
+ gradient_clip_val: 1.0
82
+ gradient_clip_algorithm: norm
83
+ val_check_interval: 2000
84
+ max_steps: 50000
85
+ limit_val_batches: 500
tb_logs.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b40812abf745dc65efbcfdfa7016a888c642791dffa53d4a09909dd1d4f753b6
3
+ size 213056
version_0/events.out.tfevents.1742357493.gpu-q-26.161056.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3123dce3f3001beb0978dcf9b005f68f2d2dea67d8565bbd8686cc2bab8275e0
3
+ size 12110912
version_0/hparams.yaml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataloader_config: !!python/object:src.data.DataloaderConfig
2
+ batch_size: 32
3
+ drop_last: false
4
+ eval_batch_size: 128
5
+ multiprocessing_context: null
6
+ num_workers: 32
7
+ persistent_workers: false
8
+ pin_memory: true
9
+ prefetch_factor: 2
10
+ shuffle: true
11
+ eod_token_id: 0
12
+ max_position_embeddings: 2048
13
+ optim_config: !!python/object:src.trainer.OptimCofig
14
+ keller_kwargs: {}
15
+ lr: 0.0006
16
+ num_warmup_steps: 2000
17
+ optim_kwargs:
18
+ betas:
19
+ - 0.9
20
+ - 0.95
21
+ eps: 1.0e-08
22
+ fused: true
23
+ optim_name: adamw
24
+ scheduler_kwargs:
25
+ min_lr_ratio: 0.01
26
+ num_decay_steps: 4000
27
+ num_stable_steps: 44000
28
+ scheduler_name: warmup_stable_decay
29
+ weight_decay: 0.01
30
+ train_data_path: /home/pl487/rds/hpc-work/merge-effect/data/finewebedu-20B/bpe32000minipile/train
31
+ val_data_path: /home/pl487/rds/hpc-work/merge-effect/data/finewebedu-20B/bpe32000minipile/validation
version_1/events.out.tfevents.1742424753.gpu-q-57.173192.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c4daeae62f08b14679ac347030a0a78c66efa4fd5725458fc9b2ef7b7640dc8
3
+ size 12831834
version_1/hparams.yaml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataloader_config: !!python/object:src.data.DataloaderConfig
2
+ batch_size: 32
3
+ drop_last: false
4
+ eval_batch_size: 128
5
+ multiprocessing_context: null
6
+ num_workers: 32
7
+ persistent_workers: false
8
+ pin_memory: true
9
+ prefetch_factor: 2
10
+ shuffle: true
11
+ eod_token_id: 0
12
+ max_position_embeddings: 2048
13
+ optim_config: !!python/object:src.trainer.OptimCofig
14
+ keller_kwargs: {}
15
+ lr: 0.0006
16
+ num_warmup_steps: 2000
17
+ optim_kwargs:
18
+ betas:
19
+ - 0.9
20
+ - 0.95
21
+ eps: 1.0e-08
22
+ fused: true
23
+ optim_name: adamw
24
+ scheduler_kwargs:
25
+ min_lr_ratio: 0.01
26
+ num_decay_steps: 4000
27
+ num_stable_steps: 44000
28
+ scheduler_name: warmup_stable_decay
29
+ weight_decay: 0.01
30
+ train_data_path: /home/pl487/rds/hpc-work/merge-effect/data/finewebedu-20B/bpe32000minipile/train
31
+ val_data_path: /home/pl487/rds/hpc-work/merge-effect/data/finewebedu-20B/bpe32000minipile/validation
version_2/events.out.tfevents.1742467194.gpu-q-79.16997.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fe822ca570b79674102ce1feec2613b461a3267e3f3aa8199ae6382ec6b6691
3
+ size 3260972
version_2/hparams.yaml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataloader_config: !!python/object:src.data.DataloaderConfig
2
+ batch_size: 32
3
+ drop_last: false
4
+ eval_batch_size: 128
5
+ multiprocessing_context: null
6
+ num_workers: 32
7
+ persistent_workers: false
8
+ pin_memory: true
9
+ prefetch_factor: 2
10
+ shuffle: true
11
+ eod_token_id: 0
12
+ max_position_embeddings: 2048
13
+ optim_config: !!python/object:src.trainer.OptimCofig
14
+ keller_kwargs: {}
15
+ lr: 0.0006
16
+ num_warmup_steps: 2000
17
+ optim_kwargs:
18
+ betas:
19
+ - 0.9
20
+ - 0.95
21
+ eps: 1.0e-08
22
+ fused: true
23
+ optim_name: adamw
24
+ scheduler_kwargs:
25
+ min_lr_ratio: 0.01
26
+ num_decay_steps: 4000
27
+ num_stable_steps: 44000
28
+ scheduler_name: warmup_stable_decay
29
+ weight_decay: 0.01
30
+ train_data_path: /home/pl487/rds/hpc-work/merge-effect/data/finewebedu-20B/bpe32000minipile/train
31
+ val_data_path: /home/pl487/rds/hpc-work/merge-effect/data/finewebedu-20B/bpe32000minipile/validation