mtasic85 committed
Commit e9d9da3 · 1 Parent(s): c13c3d0

new config; new base datasets
README.md CHANGED
@@ -96,6 +96,22 @@ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable
 ```
 
 ```
+Seed set to 23
+Time to instantiate model: 0.21 seconds.
+Total parameters: 302,023,168
+Verifying settings ...
+Measured TFLOPs: 55520.94
+Epoch 1 | iter 64 step 1 | loss train: 11.982, val: n/a | iter time: 409.55 ms (step) remaining time: 4 days, 17:45:21
+Epoch 1 | iter 128 step 2 | loss train: 11.980, val: n/a | iter time: 354.46 ms (step) remaining time: 3 days, 15:01:16
+Epoch 1 | iter 192 step 3 | loss train: 11.980, val: n/a | iter time: 353.67 ms (step) remaining time: 3 days, 5:46:03
+Epoch 1 | iter 256 step 4 | loss train: 11.980, val: n/a | iter time: 354.11 ms (step) remaining time: 3 days, 1:05:26
+Epoch 1 | iter 320 step 5 | loss train: 11.978, val: n/a | iter time: 358.28 ms (step) remaining time: 2 days, 22:21:45
+Epoch 1 | iter 384 step 6 | loss train: 11.974, val: n/a | iter time: 356.21 ms (step) remaining time: 2 days, 20:33:55
+Epoch 1 | iter 448 step 7 | loss train: 11.964, val: n/a | iter time: 357.42 ms (step) remaining time: 2 days, 19:15:59
+Epoch 1 | iter 512 step 8 | loss train: 11.956, val: n/a | iter time: 355.74 ms (step) remaining time: 2 days, 18:16:43
+Epoch 1 | iter 576 step 9 | loss train: 11.937, val: n/a | iter time: 356.05 ms (step) remaining time: 2 days, 17:28:34
+Epoch 1 | iter 640 step 10 | loss train: 11.929, val: n/a | iter time: 356.68 ms (step) remaining time: 2 days, 16:49:58
+# ...
 ```
 
 Backup `wandb`:
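For reference, the `Total parameters: 302,023,168` figure in the log can be reproduced from the model geometry. A minimal sketch, assuming untied input/output embeddings, a LLaMA-style gated MLP (gate/up/down projections), RMSNorm weight vectors, no biases, and the pre-change attention geometry (`n_head: 8`, `head_size: 128`), under which the arithmetic works out exactly:

```python
# Back-of-the-envelope check of "Total parameters: 302,023,168".
# All structural assumptions are noted above; none are stated in the commit itself.
vocab_size, n_embd, n_layer = 131072, 512, 32
n_head, head_size, n_query_groups, intermediate_size = 8, 128, 8, 2048

attn = (
    n_embd * n_head * head_size                # Q projection
    + 2 * n_embd * n_query_groups * head_size  # K and V projections
    + n_head * head_size * n_embd              # output projection
)
mlp = 3 * n_embd * intermediate_size           # gate, up, down projections
block = attn + mlp + 2 * n_embd                # plus two RMSNorm weight vectors per block
total = n_layer * block + 2 * vocab_size * n_embd + n_embd  # blocks + embeddings + lm_head + final norm
print(f'{total:,}')  # 302,023,168
```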
config-0.json CHANGED
@@ -6,7 +6,7 @@
   "attention_dropout": 0.0,
   "bos_token_id": 0,
   "eos_token_id": 1,
-  "head_dim": 128,
+  "head_dim": 256,
   "hidden_act": "silu",
   "hidden_size": 512,
   "initializer_range": 0.02,
@@ -14,7 +14,7 @@
   "max_position_embeddings": 131072,
   "mlp_bias": false,
   "model_type": "llama",
-  "num_attention_heads": 8,
+  "num_attention_heads": 32,
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,
   "pretraining_tp": 1,
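A quick sanity check of the revised attention geometry; a minimal sketch assuming a local copy of this `config-0.json` and a `transformers` version recent enough to honor an explicit `head_dim`:

```python
from transformers import LlamaConfig

cfg = LlamaConfig.from_json_file('config-0.json')

# 32 query heads now share 8 KV heads: grouped-query attention, 4 queries per KV head.
assert cfg.num_attention_heads % cfg.num_key_value_heads == 0
print(cfg.num_attention_heads // cfg.num_key_value_heads)  # 4

# With an explicit head_dim, attention width is decoupled from hidden_size:
# 32 heads * 256 dims = 8192, even though hidden_size stays 512.
print(cfg.num_attention_heads * cfg.head_dim)  # 8192
```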
scripts/core_base_datasets.py CHANGED
@@ -1,4 +1,18 @@
 core_base_datasets = [
+    #
+    # general
+    #
+    # 3.35 GB, 1,000,000 - Curated RefinedWeb with medium context length (2048 <= ctx_len <= 8192)
+    *[
+        {'kind': 'base', 'path': 'vilm/refinedweb-1m-medium', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
+        for i in range(0, 100, 5)
+    ],
+    # 4.01 GB, 1,360,929
+    *[
+        {'kind': 'base', 'path': 'deatos/fineweb-edu-mini-combined', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
+        for i in range(0, 100, 5)
+    ],
+
     #
     # multilingual
     #
@@ -76,6 +90,15 @@ core_base_datasets = [
         for i in range(0, 100, 10)
     ],
 
+    #
+    # math / code
+    #
+    # 2.23 GB, 719,244
+    *[
+        {'kind': 'base', 'path': 'MathGenie/MathCode-Pile', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['text']}
+        for i in range(0, 100, 10)
+    ],
+
     #
     # general knowledge
     #
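The new entries follow the same shape as the existing ones: a percent-sliced `split` plus a `format` callable that extracts the text field. A minimal sketch of how one entry could be materialized, assuming the Hugging Face `datasets` library (the actual consumer of `core_base_datasets` is not part of this commit):

```python
from datasets import load_dataset

# One of the new entries, written out literally; the comprehension in the
# script generates twenty such 5% slices.
entry = {
    'kind': 'base',
    'path': 'vilm/refinedweb-1m-medium',
    'split': 'train[0%:5%]',
    'format': lambda n: n['text'],
}

ds = load_dataset(entry['path'], split=entry['split'])
texts = (entry['format'](row) for row in ds)
print(next(texts)[:200])
```

Slicing each corpus into small percent shards presumably keeps individual download-and-tokenize units manageable and lets shards from different groups interleave.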
scripts/prepare_core_datasets.py CHANGED
@@ -13,13 +13,13 @@ tokenizer_path = '../tokenizer'
 
 seqs = [
     (0, 1073741824, 1025, 16000),
-    (1025, 2049, 2049, 8000),
-    (2049, 4097, 4097, 4000),
-    (4097, 8193, 8193, 2000),
-    (8193, 16385, 16385, 1000),
-    (16385, 32769, 32769, 500),
-    (32769, 65537, 65537, 250),
-    (65537, 131073, 131073, 125),
+    # (1025, 2049, 2049, 8000),
+    # (2049, 4097, 4097, 4000),
+    # (4097, 8193, 8193, 2000),
+    # (8193, 16385, 16385, 1000),
+    # (16385, 32769, 32769, 500),
+    # (32769, 65537, 65537, 250),
+    # (65537, 131073, 131073, 125),
 ]
 
 #
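With every other tuple commented out, this run prepares a single sequence-length bucket. The tuple fields are not documented in the commit; one hypothetical reading consistent with the values, (min_len, max_len, block_size, subset_size), is sketched below:

```python
# Hypothetical interpretation of a `seqs` entry; the field names are an
# assumption, not taken from this repository.
seqs = [
    (0, 1073741824, 1025, 16000),  # any length below 2**30, packed into 1025-token blocks
]

def bucket_for(n_tokens: int):
    """Return (block_size, subset_size) of the first bucket covering n_tokens."""
    for min_len, max_len, block_size, subset_size in seqs:
        if min_len <= n_tokens < max_len:
            return block_size, subset_size
    return None

print(bucket_for(4096))  # (1025, 16000): the only bucket left active
```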
scripts/pretrain_core_model_0.yaml CHANGED
@@ -10,7 +10,7 @@ model_config:
   vocab_size: 131072
   padded_vocab_size: 131072
   n_layer: 32
-  n_head: 8
+  n_head: 32
   n_embd: 512
   n_query_groups: 8
   rotary_percentage: 1.0
@@ -21,7 +21,7 @@ model_config:
   intermediate_size: 2048 # n_embd * 4
   norm_eps: 1e-5
   rope_base: 4300 # https://arxiv.org/pdf/2405.14591
-  head_size: 128 # n_embd / n_head
+  head_size: 256 # n_embd / n_head
 
 # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
 # /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
@@ -85,7 +85,7 @@ train:
   max_norm: 1.0
 
   # (type: float, default: 4e-05)
-  min_lr: 3e-5
+  min_lr: 1e-5
 
 # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
 eval:
@@ -105,17 +105,17 @@ eval:
   final_validation: true
 
 # Optimizer-related arguments
-optimizer:
-  class_path: torch.optim.AdamW
-  init_args:
-    # (type: float, default: 0.001)
-    lr: 3e-4
-    # (type: float, default: 0.01)
-    weight_decay: 0.01
-    # (type: tuple, default: (0.9,0.999))
-    betas:
-      - 0.9
-      - 0.999
+# optimizer:
+#   class_path: torch.optim.AdamW
+#   init_args:
+#     # (type: float, default: 0.001)
+#     lr: 3e-4
+#     # (type: float, default: 0.01)
+#     weight_decay: 0.01
+#     # (type: tuple, default: (0.9,0.999))
+#     betas:
+#       - 0.9
+#       - 0.999
 
 # optimizer:
 #   class_path: sophia_opt.SophiaG
@@ -127,6 +127,16 @@ optimizer:
 #     rho: 0.05
 #     weight_decay: 0.1
 
+optimizer:
+  class_path: sophia_opt.SophiaG
+  init_args:
+    lr: 1e-4
+    betas:
+      - 0.965
+      - 0.99
+    rho: 0.04
+    weight_decay: 1e-1
+
 # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
 devices: auto
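The active optimizer switches from AdamW to SophiaG, with hyperparameters matching the defaults of the reference Sophia implementation (Liu et al., 2023, arXiv:2305.14342). A minimal sketch of the equivalent construction in plain PyTorch, assuming the same local `sophia_opt` module the `class_path` refers to:

```python
import torch
from sophia_opt import SophiaG  # local module named in the YAML's class_path

model = torch.nn.Linear(512, 512)  # stand-in; litgpt builds the real GPT module

optimizer = SophiaG(
    model.parameters(),
    lr=1e-4,
    betas=(0.965, 0.99),  # heavier first-moment smoothing than AdamW's 0.9
    rho=0.04,             # scales the Hessian estimate in the clipped update
    weight_decay=1e-1,
)
```

litgpt instantiates this from the `class_path`/`init_args` pair at startup, so swapping optimizers needs no code change, only this YAML edit.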