Commit 7d2cb3f by mtasic85 (parent: bf6e2b3)

pretrain core
config-0.json CHANGED
@@ -16,7 +16,7 @@
   "model_type": "llama",
   "num_attention_heads": 16,
   "num_hidden_layers": 32,
-  "num_key_value_heads": 8,
+  "num_key_value_heads": 4,
   "pretraining_tp": 1,
   "rms_norm_eps": 1e-05,
   "rope_scaling": null,
scripts/pretrain_core_model_0.yaml CHANGED
@@ -12,7 +12,7 @@ model_config:
   n_layer: 32
   n_head: 16
   n_embd: 512
-  n_query_groups: 8
+  n_query_groups: 4
   rotary_percentage: 1.0
   parallel_residual: False
   bias: False
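LitGPT's n_query_groups plays the same role as num_key_value_heads in the Hugging Face config above, so the two files are kept in sync at 4. A minimal sketch of the resulting grouped-query attention, assuming PyTorch; the names and shapes are illustrative, not this repo's actual code:

import torch
import torch.nn.functional as F

n_head = 16                 # query heads (n_head / num_attention_heads)
n_query_groups = 4          # KV heads after this commit (was 8)
head_dim = 512 // n_head    # n_embd=512 -> 32 dims per head
batch, seq_len = 1, 8

q = torch.randn(batch, n_head, seq_len, head_dim)
k = torch.randn(batch, n_query_groups, seq_len, head_dim)
v = torch.randn(batch, n_query_groups, seq_len, head_dim)

# Each KV head is now shared by n_head // n_query_groups = 4 query heads (was 2),
# so K and V are expanded along the head axis before standard attention.
k = k.repeat_interleave(n_head // n_query_groups, dim=1)
v = v.repeat_interleave(n_head // n_query_groups, dim=1)

out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
print(out.shape)            # torch.Size([1, 16, 8, 32])

Fewer KV heads shrink the KV cache and the key/value projection weights at some potential quality cost; the attention computation itself is unchanged.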