paulilioaica committed
Commit
817811c
1 Parent(s): 6e67c24

Update config.json

Files changed (1)
  1. config.json +16 -14
config.json CHANGED
@@ -4,11 +4,10 @@
   "architectures": [
     "PhiForCausalLM"
   ],
-  "attention_dropout": 0.0,
   "attn_pdrop": 0.0,
   "auto_map": {
-    "AutoConfig": "rhysjones/phi-2-orange--configuration_phi.PhiConfig",
-    "AutoModelForCausalLM": "rhysjones/phi-2-orange--modeling_phi.PhiForCausalLM"
+    "AutoConfig": "configuration_phi.PhiConfig",
+    "AutoModelForCausalLM": "modeling_phi.PhiForCausalLM"
   },
   "bos_token_id": null,
   "embd_pdrop": 0.0,
@@ -16,14 +15,15 @@
   "flash_attn": false,
   "flash_rotary": false,
   "fused_dense": false,
-  "hidden_act": "silu",
-  "hidden_size": 4096,
+  "hidden_act": "gelu_new",
+  "hidden_size": 2560,
   "img_processor": null,
   "initializer_range": 0.02,
-  "intermediate_size": 14336,
+  "intermediate_size": 8192,
+  "layer_norm_eps": 1e-05,
   "layer_norm_epsilon": 1e-05,
   "max_position_embeddings": 2048,
-  "model_type": "mixtral",
+  "model_type": "phi-msft",
   "n_embd": 2560,
   "n_head": 32,
   "n_head_kv": null,
@@ -32,19 +32,21 @@
   "n_positions": 2048,
   "num_attention_heads": 32,
   "num_experts_per_tok": 2,
-  "num_hidden_layers": 32,
-  "num_key_value_heads": 8,
+  "num_hidden_layers": 24,
+  "num_experts_per_tok": 2,
+  "num_local_experts": 4,
+  "num_key_value_heads": 32,
   "num_local_experts": 2,
-  "output_router_logits": false,
+  "partial_rotary_factor": 0.5,
+  "qk_layernorm": false,
   "resid_pdrop": 0.1,
-  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
   "rope_theta": 10000.0,
   "rotary_dim": 32,
-  "router_aux_loss_coef": 0.001,
   "sliding_window": null,
   "tie_word_embeddings": false,
   "torch_dtype": "float16",
-  "transformers_version": "4.37.2",
+  "transformers_version": "4.37.1",
   "use_cache": false,
   "vocab_size": 51200
-}
+}
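
A minimal sketch of how a config like this is typically consumed after the change: since "auto_map" now points at configuration_phi.PhiConfig and modeling_phi.PhiForCausalLM inside the repository itself (rather than the remote rhysjones/phi-2-orange files), loading goes through trust_remote_code. The repository id below is a placeholder for illustration, not taken from this commit, and the snippet assumes a standard transformers install (around the 4.37 version recorded in the config).

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# Placeholder repo id -- substitute the actual repository this commit belongs to.
repo_id = "paulilioaica/<model-repo>"

# trust_remote_code=True lets auto_map resolve configuration_phi.PhiConfig and
# modeling_phi.PhiForCausalLM from the files shipped alongside config.json.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(config.model_type)  # expected: "phi-msft" after this commit

model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype="auto",   # config declares torch_dtype: float16
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)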