{
  "num_features": 122880,
  "num_layers": 12,
  "d_model": 768,
  "model_name": null,
  "normalization_method": "mean_std",
  "activation_fn": "jumprelu",
  "jumprelu_threshold": 0.0,
  "batchtopk_k": null,
  "batchtopk_straight_through": false,
  "topk_k": null,
  "topk_straight_through": true,
  "topk_mode": "global",
  "two_stage_batchtopk": false,
  "two_stage_topk": false,
  "clt_dtype": null,
  "expected_input_dtype": null,
  "mlp_input_template": null,
  "mlp_output_template": null,
  "tl_input_template": null,
  "tl_output_template": null,
  "decoder_tying": "per_target",
  "enable_feature_offset": false,
  "enable_feature_scale": false,
  "skip_connection": true
}