mtasic85 committed
Commit f03dcd7 · 1 Parent(s): 8a2ebd6

pretrain core
config-0.json CHANGED
@@ -6,7 +6,7 @@
   "attention_dropout": 0.0,
   "bos_token_id": 0,
   "eos_token_id": 1,
-  "head_dim": 256,
+  "head_dim": 64,
   "hidden_act": "silu",
   "hidden_size": 768,
   "initializer_range": 0.02,
@@ -14,7 +14,7 @@
   "max_position_embeddings": 131072,
   "mlp_bias": false,
   "model_type": "llama",
-  "num_attention_heads": 16,
+  "num_attention_heads": 12,
   "num_hidden_layers": 32,
   "num_key_value_heads": 4,
   "pretraining_tp": 1,
scripts/pretrain_core_model_0.yaml CHANGED
@@ -10,7 +10,7 @@ model_config:
   vocab_size: 131072
   padded_vocab_size: 131072
   n_layer: 32
-  n_head: 16
+  n_head: 12
   n_embd: 768
   n_query_groups: 4
   rotary_percentage: 1.0
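The YAML change mirrors the JSON config: n_head 12 corresponds to num_attention_heads 12, n_embd 768 to hidden_size 768, and n_query_groups 4 to num_key_value_heads 4. A cross-check sketch, assuming PyYAML is available and using the file paths from this commit (the field-name mapping is an assumption based on litgpt-style config names, not something defined in the repo):

import json
import yaml  # assumes PyYAML is installed

with open("config-0.json") as f:
    hf = json.load(f)
with open("scripts/pretrain_core_model_0.yaml") as f:
    lit = yaml.safe_load(f)["model_config"]

# Hypothetical mapping between litgpt-style and HF-style field names.
mapping = {
    "n_layer": "num_hidden_layers",
    "n_head": "num_attention_heads",
    "n_embd": "hidden_size",
    "n_query_groups": "num_key_value_heads",
}
for lit_key, hf_key in mapping.items():
    assert lit[lit_key] == hf[hf_key], f"{lit_key} != {hf_key}"
print("YAML and JSON configs agree on layers, heads, width, and KV groups.")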