new config; new base datasets
Changed files:

- README.md +16 -0
- config-0.json +2 -2
- scripts/core_base_datasets.py +23 -0
- scripts/prepare_core_datasets.py +7 -7
- scripts/pretrain_core_model_0.yaml +24 -14
README.md

````diff
@@ -96,6 +96,22 @@ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable
 ```
 
 ```
+Seed set to 23
+Time to instantiate model: 0.21 seconds.
+Total parameters: 302,023,168
+Verifying settings ...
+Measured TFLOPs: 55520.94
+Epoch 1 | iter 64 step 1 | loss train: 11.982, val: n/a | iter time: 409.55 ms (step) remaining time: 4 days, 17:45:21
+Epoch 1 | iter 128 step 2 | loss train: 11.980, val: n/a | iter time: 354.46 ms (step) remaining time: 3 days, 15:01:16
+Epoch 1 | iter 192 step 3 | loss train: 11.980, val: n/a | iter time: 353.67 ms (step) remaining time: 3 days, 5:46:03
+Epoch 1 | iter 256 step 4 | loss train: 11.980, val: n/a | iter time: 354.11 ms (step) remaining time: 3 days, 1:05:26
+Epoch 1 | iter 320 step 5 | loss train: 11.978, val: n/a | iter time: 358.28 ms (step) remaining time: 2 days, 22:21:45
+Epoch 1 | iter 384 step 6 | loss train: 11.974, val: n/a | iter time: 356.21 ms (step) remaining time: 2 days, 20:33:55
+Epoch 1 | iter 448 step 7 | loss train: 11.964, val: n/a | iter time: 357.42 ms (step) remaining time: 2 days, 19:15:59
+Epoch 1 | iter 512 step 8 | loss train: 11.956, val: n/a | iter time: 355.74 ms (step) remaining time: 2 days, 18:16:43
+Epoch 1 | iter 576 step 9 | loss train: 11.937, val: n/a | iter time: 356.05 ms (step) remaining time: 2 days, 17:28:34
+Epoch 1 | iter 640 step 10 | loss train: 11.929, val: n/a | iter time: 356.68 ms (step) remaining time: 2 days, 16:49:58
+# ...
 ```
 
 Backup `wandb`:
````
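A quick sanity check on the ETA column in the new log, assuming the remaining-time readout is a plain extrapolation of iterations left at the current iteration time (this diff does not confirm how litgpt computes it):

```python
# Back-of-the-envelope check of the last log line above: if the ETA is
# iterations_left * iter_time, the run is on the order of 650k iterations.
from datetime import timedelta

iter_time_s = 0.35668  # 356.68 ms at iter 640
remaining = timedelta(days=2, hours=16, minutes=49, seconds=58)
iters_left = remaining.total_seconds() / iter_time_s
print(f"~{iters_left:,.0f} iterations left; ~{640 + iters_left:,.0f} total")
```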
config-0.json

```diff
@@ -6,7 +6,7 @@
   "attention_dropout": 0.0,
   "bos_token_id": 0,
   "eos_token_id": 1,
-  "head_dim":
+  "head_dim": 256,
   "hidden_act": "silu",
   "hidden_size": 512,
   "initializer_range": 0.02,
@@ -14,7 +14,7 @@
   "max_position_embeddings": 131072,
   "mlp_bias": false,
   "model_type": "llama",
-  "num_attention_heads":
+  "num_attention_heads": 32,
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,
   "pretraining_tp": 1,
```
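The new values make head_dim explicit rather than derived: with hidden_size 512 and 32 heads the derived default would be 512 / 32 = 16, so the config pins head_dim at 256, matching head_size in the pretraining YAML below. A minimal sketch for inspecting the filled-in config; it assumes a transformers release whose LlamaConfig honors an explicit head_dim field:

```python
# Minimal sketch: load config-0.json and inspect the attention geometry.
# Assumes a transformers version whose LlamaConfig supports an explicit
# `head_dim` decoupled from hidden_size // num_attention_heads.
from transformers import LlamaConfig

cfg = LlamaConfig.from_json_file('config-0.json')
print(cfg.num_attention_heads)                     # 32
print(cfg.head_dim)                                # 256
print(cfg.hidden_size // cfg.num_attention_heads)  # 16, the derived default
```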
scripts/core_base_datasets.py

```diff
@@ -1,4 +1,18 @@
 core_base_datasets = [
+    #
+    # general
+    #
+    # 3.35 GB, 1,000,000 - Curated RefinedWeb with medium context length (2048 <= ctx_len <= 8192)
+    *[
+        {'kind': 'base', 'path': 'vilm/refinedweb-1m-medium', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
+        for i in range(0, 100, 5)
+    ],
+    # 4.01 GB, 1,360,929
+    *[
+        {'kind': 'base', 'path': 'deatos/fineweb-edu-mini-combined', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
+        for i in range(0, 100, 5)
+    ],
+
     #
     # multilingual
     #
@@ -76,6 +90,15 @@ core_base_datasets = [
         for i in range(0, 100, 10)
     ],
 
+    #
+    # math / code
+    #
+    # 2.23 GB, 719,244
+    *[
+        {'kind': 'base', 'path': 'MathGenie/MathCode-Pile', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['text']}
+        for i in range(0, 100, 10)
+    ],
+
     #
     # general knowledge
     #
```
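The split strings are Hugging Face `datasets` percent slices, so each new dataset is declared as twenty 5% (or ten 10%) contiguous shards, which presumably keeps each unit of download and tokenization work small. A sketch of how one entry could be materialized; the consuming loader is not part of this commit, so everything around the entry is illustrative:

```python
# Illustrative only: materialize a single core_base_datasets entry.
# The 'split' percent-slice syntax is standard Hugging Face `datasets`;
# how the pipeline actually consumes 'kind' and 'format' is not shown
# in this commit.
from datasets import load_dataset

entry = {
    'kind': 'base',
    'path': 'vilm/refinedweb-1m-medium',
    'split': 'train[0%:5%]',           # first of the twenty 5% shards
    'format': lambda n: n['text'],
}

ds = load_dataset(entry['path'], split=entry['split'])
texts = (entry['format'](row) for row in ds)
print(next(texts)[:100])               # peek at the first formatted document
```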
scripts/prepare_core_datasets.py

```diff
@@ -13,13 +13,13 @@ tokenizer_path = '../tokenizer'
 
 seqs = [
     (0, 1073741824, 1025, 16000),
-    (1025, 2049, 2049, 8000),
-    (2049, 4097, 4097, 4000),
-    (4097, 8193, 8193, 2000),
-    (8193, 16385, 16385, 1000),
-    (16385, 32769, 32769, 500),
-    (32769, 65537, 65537, 250),
-    (65537, 131073, 131073, 125),
+    # (1025, 2049, 2049, 8000),
+    # (2049, 4097, 4097, 4000),
+    # (4097, 8193, 8193, 2000),
+    # (8193, 16385, 16385, 1000),
+    # (16385, 32769, 32769, 500),
+    # (32769, 65537, 65537, 250),
+    # (65537, 131073, 131073, 125),
 ]
 
 #
```
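Only the first bucket survives, so everything is now prepared at block size 1025 (presumably 1024 tokens plus one for the shifted next-token target), with 1073741824 (2^30) acting as an effectively unbounded upper length. The commented-out buckets follow a clean pattern: the fourth field halves each time the block size doubles, keeping the product near 16.4M per bucket, which looks like a constant token budget. The diff does not show what that field is called in the pipeline; the check below just verifies the arithmetic:

```python
# Quick check of the bucket pattern above: the fourth field scales
# inversely with block size, keeping a ~16.4M product per bucket.
# (Whether the field counts samples or batches is not shown here.)
seqs = [
    (0, 1073741824, 1025, 16000),
    (1025, 2049, 2049, 8000),
    (2049, 4097, 4097, 4000),
    (4097, 8193, 8193, 2000),
    (8193, 16385, 16385, 1000),
    (16385, 32769, 32769, 500),
    (32769, 65537, 65537, 250),
    (65537, 131073, 131073, 125),
]
for lo, hi, block, n in seqs:
    print(f'[{lo:>6}, {hi:>10}) block={block:>6} n={n:>5} n*block={n * block:>10,}')
```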
scripts/pretrain_core_model_0.yaml

```diff
@@ -10,7 +10,7 @@ model_config:
   vocab_size: 131072
   padded_vocab_size: 131072
   n_layer: 32
-  n_head:
+  n_head: 32
   n_embd: 512
   n_query_groups: 8
   rotary_percentage: 1.0
@@ -21,7 +21,7 @@ model_config:
   intermediate_size: 2048 # n_embd * 4
   norm_eps: 1e-5
   rope_base: 4300 # https://arxiv.org/pdf/2405.14591
-  head_size:
+  head_size: 256 # n_embd / n_head
 
 # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
 # /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
@@ -85,7 +85,7 @@ train:
   max_norm: 1.0
 
   # (type: float, default: 4e-05)
-  min_lr:
+  min_lr: 1e-5
 
 # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
 eval:
@@ -105,17 +105,17 @@ eval:
   final_validation: true
 
 # Optimizer-related arguments
-optimizer:
-  class_path: torch.optim.AdamW
-  init_args:
-    # (type: float, default: 0.001)
-    lr: 3e-4
-    # (type: float, default: 0.01)
-    weight_decay: 0.01
-    # (type: tuple, default: (0.9,0.999))
-    betas:
-      - 0.9
-      - 0.999
+# optimizer:
+#   class_path: torch.optim.AdamW
+#   init_args:
+#     # (type: float, default: 0.001)
+#     lr: 3e-4
+#     # (type: float, default: 0.01)
+#     weight_decay: 0.01
+#     # (type: tuple, default: (0.9,0.999))
+#     betas:
+#       - 0.9
+#       - 0.999
 
 # optimizer:
 #   class_path: sophia_opt.SophiaG
@@ -127,6 +127,16 @@ optimizer:
 #     rho: 0.05
 #     weight_decay: 0.1
 
+optimizer:
+  class_path: sophia_opt.SophiaG
+  init_args:
+    lr: 1e-4
+    betas:
+      - 0.965
+      - 0.99
+    rho: 0.04
+    weight_decay: 1e-1
+
 # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
 devices: auto
 
```
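One nit on the new values: the inline comment on head_size (`# n_embd / n_head`) does not match the numbers, since 512 / 32 = 16; head_size is being pinned explicitly at 256, mirroring head_dim in config-0.json. The uncommented optimizer block swaps AdamW for SophiaG. A minimal sketch of what the class_path/init_args pair resolves to when instantiated, assuming the local sophia_opt module follows the reference Sophia implementation's signature (the hyperparameters here match its published defaults):

```python
# Sketch only: what the YAML's `class_path` + `init_args` amount to.
# `sophia_opt` is a local module referenced by the config; the keyword
# arguments mirror the reference SophiaG defaults.
import torch
from sophia_opt import SophiaG

model = torch.nn.Linear(512, 512)  # stand-in for the real model
optimizer = SophiaG(
    model.parameters(),
    lr=1e-4,
    betas=(0.965, 0.99),
    rho=0.04,
    weight_decay=1e-1,
)
```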