{ "metadata": { "ParamSize": 1635, "ParamBytes": 674174212448.0, "BitsPerParam": 8.034249461128063 }, "records": [ { "dataPath": "params_shard_0.bin", "format": "raw-shard", "nbytes": 1853358080, "records": [ { "name": "model.embed_tokens.weight", "shape": [ 129280, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1853358080, "byteOffset": 0 } ], "md5sum": "f615ef92eeeef4c9c7cf3518a302ca30" }, { "dataPath": "params_shard_1.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.0.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "b57b7bc27ebcd74a49423a6f940a7339" }, { "dataPath": "params_shard_2.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.0.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "3ce397b837e84d3beb091db1c7d431a1" }, { "dataPath": "params_shard_3.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.0.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "590b0a556382ef0206e4c9fdc423ae43" }, { "dataPath": "params_shard_4.bin", "format": "raw-shard", "nbytes": 264241152, "records": [ { "name": "model.layers.0.mlp.gate_up_proj.weight", "shape": [ 36864, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 264241152, "byteOffset": 0 } ], "md5sum": "0e9b230cefe58afcc7e9cc3c788fa795" }, { "dataPath": "params_shard_5.bin", "format": "raw-shard", "nbytes": 132120576, "records": [ { "name": "model.layers.0.mlp.down_proj.weight", "shape": [ 7168, 18432 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 132120576, "byteOffset": 0 } ], "md5sum": "ff461e6f54d10acb5a29aed97e80a17c" }, { "dataPath": "params_shard_6.bin", "format": "raw-shard", "nbytes": 32022128, "records": [ { "name": "model.layers.0.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 0 }, { "name": "model.layers.0.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 11010048 }, { "name": "model.layers.0.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 11011392 }, { "name": "model.layers.0.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 11014464 }, { "name": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 11019072 }, { "name": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 15147840 }, { "name": "model.layers.0.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 15148400 }, { "name": "model.layers.0.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 15149424 }, { "name": "model.layers.0.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 23538032 }, { "name": "model.layers.0.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31926640 }, { "name": "model.layers.0.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31927664 }, { "name": "model.layers.0.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 31928688 }, { "name": "model.layers.0.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 31930736 }, { "name": "model.layers.0.mlp.gate_up_proj.weight_scale_inv", "shape": [ 288, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 32256, "byteOffset": 31945072 }, { "name": "model.layers.0.mlp.down_proj.weight_scale_inv", "shape": [ 56, 144 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 16128, "byteOffset": 31977328 }, { "name": "model.layers.0.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 31993456 }, { "name": "model.layers.0.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 32007792 } ], "md5sum": "c8a064c7fb499a0a0e998fc45c35208d" }, { "dataPath": "params_shard_7.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.1.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "2fc8f8cd2cf332d6990a9b472ba1b51d" }, { "dataPath": "params_shard_8.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.1.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "2059d468cb019f35d13dbdb2a2280900" }, { "dataPath": "params_shard_9.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.1.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "184436761f758dcee3e9b16d915cb09a" }, { "dataPath": "params_shard_10.bin", "format": "raw-shard", "nbytes": 264241152, "records": [ { "name": "model.layers.1.mlp.gate_up_proj.weight", "shape": [ 36864, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 264241152, "byteOffset": 0 } ], "md5sum": "a8e11d530f5593cf42edb4743ee61abb" }, { "dataPath": "params_shard_11.bin", "format": "raw-shard", "nbytes": 132120576, "records": [ { "name": "model.layers.1.mlp.down_proj.weight", "shape": [ 7168, 18432 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 132120576, "byteOffset": 0 } ], "md5sum": "e020bf839bb6ee570d0ea35eb93769e8" }, { "dataPath": "params_shard_12.bin", "format": "raw-shard", "nbytes": 32022128, "records": [ { "name": "model.layers.1.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 0 }, { "name": "model.layers.1.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 11010048 }, { "name": "model.layers.1.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 11011392 }, { "name": "model.layers.1.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 11014464 }, { "name": "model.layers.1.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 11019072 }, { "name": "model.layers.1.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 15147840 }, { "name": "model.layers.1.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 15148400 }, { "name": "model.layers.1.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 15149424 }, { "name": "model.layers.1.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 23538032 }, { "name": "model.layers.1.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31926640 }, { "name": "model.layers.1.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31927664 }, { "name": "model.layers.1.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 31928688 }, { "name": "model.layers.1.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 31930736 }, { "name": "model.layers.1.mlp.gate_up_proj.weight_scale_inv", "shape": [ 288, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 32256, "byteOffset": 31945072 }, { "name": "model.layers.1.mlp.down_proj.weight_scale_inv", "shape": [ 56, 144 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 16128, "byteOffset": 31977328 }, { "name": "model.layers.1.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 31993456 }, { "name": "model.layers.1.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 32007792 } ], "md5sum": "1160c131bc4f547183345c8cb0035663" }, { "dataPath": "params_shard_13.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.2.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "a19c0a02b964cc73671727d1fba81330" }, { "dataPath": "params_shard_14.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.2.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "a0818fb9b637ca154afc6f26b8858bc4" }, { "dataPath": "params_shard_15.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.2.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "d64f882ca3a9f8cff0958dbb3aee160e" }, { "dataPath": "params_shard_16.bin", "format": "raw-shard", "nbytes": 264241152, "records": [ { "name": "model.layers.2.mlp.gate_up_proj.weight", "shape": [ 36864, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 264241152, "byteOffset": 0 } ], "md5sum": "9df9b7d3b99609b699b64e33947b0f6f" }, { "dataPath": "params_shard_17.bin", "format": "raw-shard", "nbytes": 132120576, "records": [ { "name": "model.layers.2.mlp.down_proj.weight", "shape": [ 7168, 18432 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 132120576, "byteOffset": 0 } ], "md5sum": "28a76f08b7b0a8e4c51eafef090838bf" }, { "dataPath": "params_shard_18.bin", "format": "raw-shard", "nbytes": 32022128, "records": [ { "name": "model.layers.2.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 0 }, { "name": "model.layers.2.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 11010048 }, { "name": "model.layers.2.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 11011392 }, { "name": "model.layers.2.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 11014464 }, { "name": "model.layers.2.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 11019072 }, { "name": "model.layers.2.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 15147840 }, { "name": "model.layers.2.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 15148400 }, { "name": "model.layers.2.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 15149424 }, { "name": "model.layers.2.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 23538032 }, { "name": "model.layers.2.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31926640 }, { "name": "model.layers.2.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31927664 }, { "name": "model.layers.2.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 31928688 }, { "name": "model.layers.2.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 31930736 }, { "name": "model.layers.2.mlp.gate_up_proj.weight_scale_inv", "shape": [ 288, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 32256, "byteOffset": 31945072 }, { "name": "model.layers.2.mlp.down_proj.weight_scale_inv", "shape": [ 56, 144 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 16128, "byteOffset": 31977328 }, { "name": "model.layers.2.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 31993456 }, { "name": "model.layers.2.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 32007792 } ], "md5sum": "e9a88b518267a2fe3c41a1021e147e3e" }, { "dataPath": "params_shard_19.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.3.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "a83fd5d0cc9f88ba6b4e0f1b5c186c98" }, { "dataPath": "params_shard_20.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.3.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "fd229e2ffa000fd3a48836d054491483" }, { "dataPath": "params_shard_21.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.3.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "9c1dcbe70bb2a12a6104b61c7ed21299" }, { "dataPath": "params_shard_22.bin", "format": "raw-shard", "nbytes": 31945072, "records": [ { "name": "model.layers.3.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 0 }, { "name": "model.layers.3.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 11010048 }, { "name": "model.layers.3.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 11011392 }, { "name": "model.layers.3.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 11014464 }, { "name": "model.layers.3.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 11019072 }, { "name": "model.layers.3.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 15147840 }, { "name": "model.layers.3.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 15148400 }, { "name": "model.layers.3.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 15149424 }, { "name": "model.layers.3.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 23538032 }, { "name": "model.layers.3.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31926640 }, { "name": "model.layers.3.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31927664 }, { "name": "model.layers.3.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 31928688 }, { "name": "model.layers.3.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 31930736 } ], "md5sum": "968f804bcf75b773f6b14369fcabecfd" }, { "dataPath": "params_shard_23.bin", "format": "raw-shard", "nbytes": 33033728, "records": [ { "name": "model.layers.3.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 0 }, { "name": "model.layers.3.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 3670016 }, { "name": "model.layers.3.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 33030144 } ], "md5sum": "9ce0acd393042a3cc81f3a3880ebb7fb" }, { "dataPath": "params_shard_24.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.3.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "8ea9d7d906cb4fa2864c886c333d4799" }, { "dataPath": "params_shard_25.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.3.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "6e9723fb8d2d014ba0de37bef5e245ad" }, { "dataPath": "params_shard_26.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.4.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "012471f1612c456dd10b11a2e81ed7a8" }, { "dataPath": "params_shard_27.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.3.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.3.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.3.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.3.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.3.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.3.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.4.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.4.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.4.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.4.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.4.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.4.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.4.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "864eb6b04f6729183eb5be576024d3af" }, { "dataPath": "params_shard_28.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.4.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "00cdd309ef5a0c8a5257cd4608f941e5" }, { "dataPath": "params_shard_29.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.4.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "6363209405853ca7108262fdbf04e36f" }, { "dataPath": "params_shard_30.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.4.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "59983ea828d35d0184098a9dd13bdb38" }, { "dataPath": "params_shard_31.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.4.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.4.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.4.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.4.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.4.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.4.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.4.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.4.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "f2744732ea043e5094526673103df114" }, { "dataPath": "params_shard_32.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.4.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "820a162aa573f434e41e6906aa42c6bc" }, { "dataPath": "params_shard_33.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.4.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "99e070e3c1464249b9f283ec9c12e52f" }, { "dataPath": "params_shard_34.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.5.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "033f3f81bec40a25256aad4638488ad9" }, { "dataPath": "params_shard_35.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.4.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.4.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.4.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.4.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.4.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.4.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.5.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.5.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.5.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.5.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.5.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.5.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.5.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "7cafd9a8a1a2b5566583d661e4d342e6" }, { "dataPath": "params_shard_36.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.5.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "97e24a6c251e287c3bb51980f0f05dab" }, { "dataPath": "params_shard_37.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.5.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "107f3e80f9942634bbf4b0819de7c75d" }, { "dataPath": "params_shard_38.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.5.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "86156e02a036b4ef98750e17639617b5" }, { "dataPath": "params_shard_39.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.5.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.5.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.5.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.5.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.5.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.5.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.5.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.5.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "24899e9c9c7958655fefd66a39e07754" }, { "dataPath": "params_shard_40.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.5.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "3ea03a82aff630031e71a3eb7bf545ab" }, { "dataPath": "params_shard_41.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.5.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "e59decbcf3cc9a80fe3b8dce8b0b1f0c" }, { "dataPath": "params_shard_42.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.6.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "c2b2de8168d403d5c416244d5e0db0a4" }, { "dataPath": "params_shard_43.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.5.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.5.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.5.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.5.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.5.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.5.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.6.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.6.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.6.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.6.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.6.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.6.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.6.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "e3f6a4c24838308a649b57a6785b8190" }, { "dataPath": "params_shard_44.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.6.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "5ee3ef417e838e0d24a025507e9a5793" }, { "dataPath": "params_shard_45.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.6.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "84c536fe7f5baeef6b08c0bb0e2a9ac6" }, { "dataPath": "params_shard_46.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.6.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "d8a0e243f6431ec6f6d61f6a8d03ef6f" }, { "dataPath": "params_shard_47.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.6.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.6.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.6.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.6.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.6.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.6.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.6.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.6.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "ab53345188c232ce103047b4d7b07fcb" }, { "dataPath": "params_shard_48.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.6.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "a87c8f85865c51bc5cc712e96b02308c" }, { "dataPath": "params_shard_49.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.6.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "dc76b20be93e0fec439f19508e72dccf" }, { "dataPath": "params_shard_50.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.7.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "cdf5a00a8bdbd5bc289147f38700460b" }, { "dataPath": "params_shard_51.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.6.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.6.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.6.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.6.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.6.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.6.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.7.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.7.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.7.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.7.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.7.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.7.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.7.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "d0c7d9a7b02e40d79a5266055419fd0a" }, { "dataPath": "params_shard_52.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.7.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "fcf2d6052019bb6dd9ef046bef283824" }, { "dataPath": "params_shard_53.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.7.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "c67f2d5bc7314985cf7723468c958936" }, { "dataPath": "params_shard_54.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.7.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "92c6550b595c4676ec57d6b88bd9b4f0" }, { "dataPath": "params_shard_55.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.7.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.7.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.7.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.7.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.7.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.7.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.7.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.7.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "e9cde46f9c140803fe35f4f84936b8cc" }, { "dataPath": "params_shard_56.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.7.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "4dd049cf4b8007845d0712207d5d275a" }, { "dataPath": "params_shard_57.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.7.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "5558d3e61d8899937450edcdf90e6142" }, { "dataPath": "params_shard_58.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.8.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "1e3e8ca2d3eb9f83167266108a21cd82" }, { "dataPath": "params_shard_59.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.7.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.7.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.7.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.7.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.7.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.7.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.8.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.8.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.8.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.8.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.8.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.8.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.8.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "04d35d5152b43f1a2b96696cde338836" }, { "dataPath": "params_shard_60.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.8.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "d82019f218b16666a85919991e5ab5f6" }, { "dataPath": "params_shard_61.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.8.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "e38a95971ee92a08a9c1abf6595883bf" }, { "dataPath": "params_shard_62.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.8.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "8921dd5793553d60a6accf782d075a7e" }, { "dataPath": "params_shard_63.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.8.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.8.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.8.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.8.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.8.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.8.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.8.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.8.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "b98815bcf26651bca52184ee54b3707d" }, { "dataPath": "params_shard_64.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.8.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "600231da593603e8db9f5fdf76a452a7" }, { "dataPath": "params_shard_65.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.8.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "dbb2f2acb12e75ee67ee8251af4b5622" }, { "dataPath": "params_shard_66.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.9.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "3d88a0f581985487043daeb258ea2442" }, { "dataPath": "params_shard_67.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.8.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.8.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.8.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.8.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.8.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.8.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.9.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.9.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.9.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.9.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.9.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.9.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.9.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "6d3d12bc66460d497e7e29e50ce940fc" }, { "dataPath": "params_shard_68.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.9.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "0cc0d72702355cf9f82cdf1a15a2e118" }, { "dataPath": "params_shard_69.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.9.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "f09cfc8f7b1e784c94dd00aee118c5a2" }, { "dataPath": "params_shard_70.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.9.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "50b21a2b3c7ade052ffc67b203fc9c02" }, { "dataPath": "params_shard_71.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.9.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.9.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.9.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.9.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.9.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.9.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.9.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.9.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "9c01810fb7744dce13430d70d4b63919" }, { "dataPath": "params_shard_72.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.9.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "f37767ea64400555d71d2a7c2d2cc22a" }, { "dataPath": "params_shard_73.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.9.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "01efc19aa4d319e7fb10076e2ad88d61" }, { "dataPath": "params_shard_74.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.10.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "54741e6cca1d9142f5f15b9f466e8474" }, { "dataPath": "params_shard_75.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.9.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.9.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.9.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.9.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.9.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.9.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.10.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.10.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.10.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.10.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.10.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.10.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.10.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "4cbc8870395e36c4625efd08120e3eae" }, { "dataPath": "params_shard_76.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.10.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "7d9bdff1548878329d41b92333bf6637" }, { "dataPath": "params_shard_77.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.10.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "01424456255d144592b2d2237a4b9b26" }, { "dataPath": "params_shard_78.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.10.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "e8b23103487a62984a3d1f5c78c76a7f" }, { "dataPath": "params_shard_79.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.10.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.10.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.10.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.10.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.10.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.10.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.10.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.10.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "097666f81c202d0f9e69e5d31afda52b" }, { "dataPath": "params_shard_80.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.10.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "3166161db3e79df55d48ad32cbda5233" }, { "dataPath": "params_shard_81.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.10.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "18a5bfe338df2d02a07f9234924bcc82" }, { "dataPath": "params_shard_82.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.11.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "f7b0300fecda6b3ec009b2dcc8fda22d" }, { "dataPath": "params_shard_83.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.10.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.10.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.10.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.10.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.10.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.10.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.11.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.11.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.11.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.11.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.11.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.11.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.11.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "caa16b2a82c85e9ee14795b51856705d" }, { "dataPath": "params_shard_84.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.11.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "fe52325d2d71417667e856f7fa9ffb28" }, { "dataPath": "params_shard_85.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.11.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "bbca2e60213b8d8c88b1dea337f93d1c" }, { "dataPath": "params_shard_86.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.11.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "48d703be8118043853595e44622edf84" }, { "dataPath": "params_shard_87.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.11.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.11.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.11.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.11.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.11.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.11.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.11.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.11.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "da32f86630c3c685469a4649bd27bbbc" }, { "dataPath": "params_shard_88.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.11.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "606284628fe35d8618b09a122b2e1eb7" }, { "dataPath": "params_shard_89.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.11.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "e02cb984447e1f83d5a036bf30035d40" }, { "dataPath": "params_shard_90.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.12.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "d4b29876ace1dcca8ba0c6f24522ebac" }, { "dataPath": "params_shard_91.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.11.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.11.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.11.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.11.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.11.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.11.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.12.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.12.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.12.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.12.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.12.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.12.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.12.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "0b9219df8f84405def208beb4a884b62" }, { "dataPath": "params_shard_92.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.12.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "f3c6cb450854bd447a56eee9d40c88b3" }, { "dataPath": "params_shard_93.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.12.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "a1308cb3f58884f882fa4133465c1906" }, { "dataPath": "params_shard_94.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.12.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "d8111157781e95f6023a116625cb2795" }, { "dataPath": "params_shard_95.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.12.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.12.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.12.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.12.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.12.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.12.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.12.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.12.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "dee6b261d9cbd78e3cb24adc12fb1169" }, { "dataPath": "params_shard_96.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.12.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "1be01c95c6d5310e72d28372453db658" }, { "dataPath": "params_shard_97.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.12.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "88891ab1a6b8b2220a1d28f3ab1d99b9" }, { "dataPath": "params_shard_98.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.13.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "65fb73ef1c56acb26c7bf20720aa8f25" }, { "dataPath": "params_shard_99.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.12.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.12.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.12.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.12.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.12.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.12.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.13.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.13.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.13.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.13.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.13.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.13.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.13.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "4a07f8f82a677f18cfed9733b123ae22" }, { "dataPath": "params_shard_100.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.13.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "9185bd552d7004b123372094b897f718" }, { "dataPath": "params_shard_101.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.13.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "112dcdcbcc0b645c8eb3df05f84155af" }, { "dataPath": "params_shard_102.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.13.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "2062dada3cfad02239a23be79ccf8b1d" }, { "dataPath": "params_shard_103.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.13.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.13.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.13.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.13.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.13.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.13.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.13.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.13.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "ad12988eecd937f2efb9f9f8c732a5fe" }, { "dataPath": "params_shard_104.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.13.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "63b16ee97cc5792c35490aa6f4bfbc3a" }, { "dataPath": "params_shard_105.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.13.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "85ccc6af5273c852cbbb07321ef3eb3d" }, { "dataPath": "params_shard_106.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.14.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "ec68183091d2f17fa61fb4f3e52ea682" }, { "dataPath": "params_shard_107.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.13.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.13.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.13.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.13.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.13.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.13.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.14.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.14.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.14.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.14.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.14.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.14.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.14.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "ceaf1c26f0917604577aa78d24c5f728" }, { "dataPath": "params_shard_108.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.14.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "711f72fbed8226908dd58d035126c270" }, { "dataPath": "params_shard_109.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.14.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "d03895583193a9a34423fb4005e196cc" }, { "dataPath": "params_shard_110.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.14.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "d7c8a0b1cd7ed03961a4d7b8f9d4591c" }, { "dataPath": "params_shard_111.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.14.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.14.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.14.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.14.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.14.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.14.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.14.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.14.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "c868331b7d9c96d8b4dedb7cf6aaf099" }, { "dataPath": "params_shard_112.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.14.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "7df342f2ed7886cdbb85e964ad203188" }, { "dataPath": "params_shard_113.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.14.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "647620e336cf40d26355a48f5d67f422" }, { "dataPath": "params_shard_114.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.15.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "cafded693abb8881883394588ad1f22a" }, { "dataPath": "params_shard_115.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.14.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.14.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.14.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.14.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.14.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.14.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.15.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.15.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.15.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.15.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.15.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.15.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.15.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "05abc820b4d63b7f20e6efe659cb93f2" }, { "dataPath": "params_shard_116.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.15.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "490be7a82308079695f15e5052683f10" }, { "dataPath": "params_shard_117.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.15.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "6a66726251062169a38b752d44d11824" }, { "dataPath": "params_shard_118.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.15.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "214dc84fd3ffa1801b86c07819ea9d37" }, { "dataPath": "params_shard_119.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.15.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.15.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.15.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.15.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.15.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.15.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.15.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.15.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "6f5d5a484af1a3ec5b3abb295a95e789" }, { "dataPath": "params_shard_120.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.15.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "f4f827ee977b655e0238a9745c576353" }, { "dataPath": "params_shard_121.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.15.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "879d0327fce23bdfc68901b0a0e8f57f" }, { "dataPath": "params_shard_122.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.16.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "7c30e617705fd802cd750676e9ad7b11" }, { "dataPath": "params_shard_123.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.15.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.15.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.15.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.15.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.15.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.15.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.16.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.16.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.16.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.16.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.16.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.16.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.16.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "b56b35eb620e07faa0bff615259b82e0" }, { "dataPath": "params_shard_124.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.16.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "53e2b45a502751157b28def08db9c9f1" }, { "dataPath": "params_shard_125.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.16.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "83a426d38d40117fb3f0ed059ef49559" }, { "dataPath": "params_shard_126.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.16.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "f1e8d77882666dee2b64c6e38fb2c006" }, { "dataPath": "params_shard_127.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.16.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.16.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.16.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.16.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.16.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.16.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.16.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.16.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "919de1fd19d357681e50e03e190be29d" }, { "dataPath": "params_shard_128.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.16.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "af6fdbba964a82ff5d59e3880a6c3a64" }, { "dataPath": "params_shard_129.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.16.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "70aa2ef5fa3c98549ca95ba6da4e9d03" }, { "dataPath": "params_shard_130.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.17.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "f2af23ed38c6d57378b4b46bcfbcc937" }, { "dataPath": "params_shard_131.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.16.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.16.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.16.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.16.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.16.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.16.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.17.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.17.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.17.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.17.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.17.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.17.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.17.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "bc3db79d5d80ff5ad4447956efae634d" }, { "dataPath": "params_shard_132.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.17.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "9f390f5345e0003806272929277d7ba2" }, { "dataPath": "params_shard_133.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.17.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "3f4ba29a3b56a573e2a80f7e87525ef8" }, { "dataPath": "params_shard_134.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.17.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "56ea9969e4d506a92d98f87929606758" }, { "dataPath": "params_shard_135.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.17.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.17.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.17.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.17.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.17.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.17.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.17.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.17.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "3461839e1b13ffff7ddeb50b1c1b0de8" }, { "dataPath": "params_shard_136.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.17.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "eead530e2f9ff6aeb503821ee778a612" }, { "dataPath": "params_shard_137.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.17.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "807565e82ad62ca14db8208aaa66d13e" }, { "dataPath": "params_shard_138.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.18.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "823663ee07f7ca9e79b30b4a65ada093" }, { "dataPath": "params_shard_139.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.17.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.17.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.17.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.17.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.17.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.17.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.18.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.18.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.18.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.18.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.18.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.18.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.18.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "3c1a420fb41ebb18916ee5285bb2f1a7" }, { "dataPath": "params_shard_140.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.18.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "b91191dac24fe58ca1ddc8ef7d685da0" }, { "dataPath": "params_shard_141.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.18.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "e837cbc9100f46a2fd590325c94bed72" }, { "dataPath": "params_shard_142.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.18.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "97fcff8d6b45856c13b2a27f83e6bb9f" }, { "dataPath": "params_shard_143.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.18.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.18.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.18.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.18.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.18.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.18.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.18.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.18.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "f16fc60ec5143b8199d499a4e3455ea7" }, { "dataPath": "params_shard_144.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.18.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "a5f57641a56afb0bd8d153dcac8d5684" }, { "dataPath": "params_shard_145.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.18.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "592ca4a3a3ad0d3d9b27b7a982278a1e" }, { "dataPath": "params_shard_146.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.19.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "dcc1d2d124a5549895f448e05155daaf" }, { "dataPath": "params_shard_147.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.18.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.18.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.18.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.18.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.18.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.18.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.19.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.19.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.19.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.19.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.19.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.19.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.19.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "54fd4cb8a4b95739cffb593f8bb0ae80" }, { "dataPath": "params_shard_148.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.19.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "270fff7fb539a0cf33b6d633c7c5ad0b" }, { "dataPath": "params_shard_149.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.19.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "beb74e94fa6b119c5fbf8c765a6660ca" }, { "dataPath": "params_shard_150.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.19.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "2b2918f08be5202023483d8108e60b30" }, { "dataPath": "params_shard_151.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.19.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.19.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.19.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.19.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.19.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.19.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.19.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.19.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "f758ed4d539ef4ac5d9fa0d876f7a400" }, { "dataPath": "params_shard_152.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.19.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "6912459e41c6babb35fae6b72f554e9e" }, { "dataPath": "params_shard_153.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.19.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "5523e78622f929550545d650950e296b" }, { "dataPath": "params_shard_154.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.20.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "21609c83ff1327b75e554e4e6d52ea41" }, { "dataPath": "params_shard_155.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.19.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.19.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.19.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.19.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.19.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.19.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.20.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.20.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.20.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.20.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.20.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.20.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.20.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "eaeacf239da745a8116d0e729409005b" }, { "dataPath": "params_shard_156.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.20.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "031a8395a7f353ca50c7aafd1eec937f" }, { "dataPath": "params_shard_157.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.20.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "888333ce4d1f3fdb13cebe533221084a" }, { "dataPath": "params_shard_158.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.20.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "294a78f96c2fdecd405408a68dc324c7" }, { "dataPath": "params_shard_159.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.20.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.20.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.20.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.20.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.20.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.20.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.20.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.20.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "f84133d3a05be9623f31828d9c86d39c" }, { "dataPath": "params_shard_160.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.20.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "98a9afbd55a94da588e02c914e361b8d" }, { "dataPath": "params_shard_161.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.20.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "54f582f4259dd89821ae4631f0de3975" }, { "dataPath": "params_shard_162.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.21.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "aa8cbe00363f7c2c6b309e64024025b7" }, { "dataPath": "params_shard_163.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.20.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.20.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.20.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.20.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.20.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.20.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.21.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.21.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.21.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.21.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.21.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.21.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.21.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "cf145334daf09a0d68e239eacef87343" }, { "dataPath": "params_shard_164.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.21.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "f68beb14ddd27e0057bd9a6419bae884" }, { "dataPath": "params_shard_165.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.21.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "0e52614ae0f155928b70a33afb07f42d" }, { "dataPath": "params_shard_166.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.21.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "7683ce6541ea62daed687c0a4fa5cb1f" }, { "dataPath": "params_shard_167.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.21.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.21.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.21.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.21.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.21.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.21.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.21.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.21.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "5c8882f69b39e80e492ef6a3a58532d0" }, { "dataPath": "params_shard_168.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.21.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "1dbbadd56fe353ab425531ff8dfe2241" }, { "dataPath": "params_shard_169.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.21.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "ddff2b170ad46efd391b946189b3b341" }, { "dataPath": "params_shard_170.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.22.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "07e7067d19759f24dd1c4dc75bfa9bcd" }, { "dataPath": "params_shard_171.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.21.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.21.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.21.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.21.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.21.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.21.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.22.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.22.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.22.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.22.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.22.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.22.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.22.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "c2919ba4d2916551bd75ec362dd46639" }, { "dataPath": "params_shard_172.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.22.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "a67f0e5288b198cdc29e5e79ab8526a0" }, { "dataPath": "params_shard_173.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.22.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "ca7efab3f54ecdf810b49d0f428aecbf" }, { "dataPath": "params_shard_174.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.22.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "e73f6d6b54611b3d23c44793c843c703" }, { "dataPath": "params_shard_175.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.22.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.22.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.22.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.22.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.22.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.22.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.22.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.22.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "2fe6c3738ab5750050dbb3e93578d7ca" }, { "dataPath": "params_shard_176.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.22.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "f77981e4409a76753f929ff33e8456ff" }, { "dataPath": "params_shard_177.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.22.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "fc92d5dc96b7b56a4992b661f67875c4" }, { "dataPath": "params_shard_178.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.23.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "cb9d8fc23102875cfcbe05e31d94bd82" }, { "dataPath": "params_shard_179.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.22.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.22.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.22.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.22.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.22.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.22.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.23.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.23.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.23.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.23.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.23.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.23.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.23.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "08eee4632d6fe7e193776e9482cc8da0" }, { "dataPath": "params_shard_180.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.23.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "215f8737a79679f59beac1a9e3f89633" }, { "dataPath": "params_shard_181.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.23.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "53af29afa700d95403b062c084132ff8" }, { "dataPath": "params_shard_182.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.23.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "c473dfea9dba051f7439ce03a73f6510" }, { "dataPath": "params_shard_183.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.23.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.23.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.23.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.23.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.23.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.23.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.23.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.23.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "aeee7eae6215e102514919d6adaec2dc" }, { "dataPath": "params_shard_184.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.23.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "5a998f4e4a658b83b3f37649f1baa176" }, { "dataPath": "params_shard_185.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.23.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "de7888059a488c09e3f3ca53df703d02" }, { "dataPath": "params_shard_186.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.24.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "97e41c35dd904c0771003251b47f63d2" }, { "dataPath": "params_shard_187.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.23.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.23.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.23.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.23.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.23.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.23.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.24.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.24.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.24.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.24.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.24.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.24.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.24.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "4b411a074a87a7274140efdf7a6da56d" }, { "dataPath": "params_shard_188.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.24.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "7a676aee83a73a79cf04127e7df588f9" }, { "dataPath": "params_shard_189.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.24.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "5b6e347698458c762e9c951cfd1bebc7" }, { "dataPath": "params_shard_190.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.24.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "8d9b4e09ac20beca2ae153120f2348bb" }, { "dataPath": "params_shard_191.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.24.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.24.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.24.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.24.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.24.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.24.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.24.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.24.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "431d9f7e4409a1fb346cdb9911b63361" }, { "dataPath": "params_shard_192.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.24.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "2d9b99374924224bfcf02390ff3b4c0f" }, { "dataPath": "params_shard_193.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.24.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "4432d8e82472c929127d44a34b2cb59d" }, { "dataPath": "params_shard_194.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.25.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "beb461bfdf1f807de7a9a75d84cbb08e" }, { "dataPath": "params_shard_195.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.24.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.24.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.24.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.24.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.24.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.24.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.25.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.25.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.25.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.25.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.25.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.25.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.25.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "4fc9abc4fff8feb211a73d9bdc45aea3" }, { "dataPath": "params_shard_196.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.25.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "1dd150b506ef95d428755e13fa2e355b" }, { "dataPath": "params_shard_197.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.25.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "1e725c40e337351709f51391fa12ab18" }, { "dataPath": "params_shard_198.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.25.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "9f293bcbe49f216bf68a183dc5c60aef" }, { "dataPath": "params_shard_199.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.25.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.25.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.25.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.25.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.25.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.25.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.25.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.25.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "849587be535aa57b128bb8d0baf8e15e" }, { "dataPath": "params_shard_200.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.25.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "6d1a3c4c1e79db7f5794e5ae13393ad2" }, { "dataPath": "params_shard_201.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.25.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "c33cf82098ffcf264dad4ab870a3a212" }, { "dataPath": "params_shard_202.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.26.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "1bf5a204e02d4c8dc1857e38f15ef2f6" }, { "dataPath": "params_shard_203.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.25.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.25.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.25.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.25.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.25.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.25.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.26.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.26.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.26.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.26.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.26.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.26.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.26.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "b517f546c8e267c2ee6197cea5b69aa7" }, { "dataPath": "params_shard_204.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.26.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "c5550fe26189fb39274d34464b722a3e" }, { "dataPath": "params_shard_205.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.26.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "eb49b972d8af075c8e296fda9952ce8c" }, { "dataPath": "params_shard_206.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.26.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "4ec15e8786a15073e080ed4295e2da4f" }, { "dataPath": "params_shard_207.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.26.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.26.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.26.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.26.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.26.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.26.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.26.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.26.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "19c585b15cbe9a2b9e01c0edc4bcdcb6" }, { "dataPath": "params_shard_208.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.26.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "12a24d004311eca87f4d4028725217ba" }, { "dataPath": "params_shard_209.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.26.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "a19173a33dd3552fcbe45e2102fc3edf" }, { "dataPath": "params_shard_210.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.27.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "bb17fe7aac1cc67c99f7e7e8746e697a" }, { "dataPath": "params_shard_211.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.26.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.26.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.26.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.26.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.26.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.26.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.27.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.27.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.27.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.27.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.27.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.27.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.27.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "caa57393ae5da5b9996bd53ab6744933" }, { "dataPath": "params_shard_212.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.27.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "6a3c04f616bfab2d604f5f3fcda49ea9" }, { "dataPath": "params_shard_213.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.27.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "7a097f63aecd5b9423ec425c5ca58080" }, { "dataPath": "params_shard_214.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.27.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "f01de190a4827bc37d5af86ef0778607" }, { "dataPath": "params_shard_215.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.27.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.27.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.27.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.27.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.27.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.27.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.27.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.27.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "fb145c4489733b8f30c286f0c9b43fa3" }, { "dataPath": "params_shard_216.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.27.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "926de080fef064d101deb14c61798317" }, { "dataPath": "params_shard_217.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.27.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "dad80d00614147675293483dc79668a4" }, { "dataPath": "params_shard_218.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.28.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "931a35e1b9cd8cd3076d4195823a18ec" }, { "dataPath": "params_shard_219.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.27.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.27.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.27.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.27.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.27.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.27.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.28.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.28.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.28.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.28.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.28.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.28.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.28.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "4d4a8935637f54d5e762567a47a451b9" }, { "dataPath": "params_shard_220.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.28.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "d91cbd91ce3bef22547cb9cdd6ca921e" }, { "dataPath": "params_shard_221.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.28.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "a3eeab4991116516021ff81f9536de7b" }, { "dataPath": "params_shard_222.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.28.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "aec4253bebda42c10f1311bd79fb249b" }, { "dataPath": "params_shard_223.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.28.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.28.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.28.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.28.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.28.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.28.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.28.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.28.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "75f79be829240a6868e05eb8c20e4228" }, { "dataPath": "params_shard_224.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.28.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "b58888c01e493604f55b29d3ae685624" }, { "dataPath": "params_shard_225.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.28.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "11398f593e1dadf3138480373fdcbeb7" }, { "dataPath": "params_shard_226.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.29.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "d3ba7cd7b4a9769ec4831eabc51e2baa" }, { "dataPath": "params_shard_227.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.28.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.28.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.28.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.28.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.28.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.28.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.29.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.29.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.29.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.29.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.29.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.29.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.29.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "9101ff60080631a9c48f53a1abab2b67" }, { "dataPath": "params_shard_228.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.29.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "116dba7ec4255331533bc07818f97067" }, { "dataPath": "params_shard_229.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.29.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "6799112e09e75c5991be4535bf58cdb7" }, { "dataPath": "params_shard_230.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.29.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "e9d8b61fd7d7ca6a20ad67c356b1cfff" }, { "dataPath": "params_shard_231.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.29.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.29.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.29.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.29.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.29.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.29.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.29.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.29.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "1f54ebfed071b3a7fdede39a6eaf57a9" }, { "dataPath": "params_shard_232.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.29.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "35c55ab4753fad012ed1b5b41a9a35d4" }, { "dataPath": "params_shard_233.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.29.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "4406ee83b23888e2cb6fd0366743e782" }, { "dataPath": "params_shard_234.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.30.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "2d8c76b2307e90393050a660216c3806" }, { "dataPath": "params_shard_235.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.29.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.29.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.29.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.29.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.29.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.29.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.30.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.30.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.30.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.30.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.30.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.30.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.30.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "01214fae59f69d92a81de1c09f960b4c" }, { "dataPath": "params_shard_236.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.30.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "09dfed04c8c4c57d2c926d533ff7bad2" }, { "dataPath": "params_shard_237.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.30.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "36825573e322decb479340c3faad6312" }, { "dataPath": "params_shard_238.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.30.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "67a878dd234490b94cca5a938a3952e9" }, { "dataPath": "params_shard_239.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.30.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.30.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.30.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.30.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.30.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.30.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.30.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.30.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "875bb55f850413536abbbf4c6dbc77b9" }, { "dataPath": "params_shard_240.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.30.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "cb89d076e22f53f47ea8c3e04b940484" }, { "dataPath": "params_shard_241.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.30.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "df06125df2db643073590cb2d0b86ec2" }, { "dataPath": "params_shard_242.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.31.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "130d5ba3bcee56c6a98b9a2fbcc14a70" }, { "dataPath": "params_shard_243.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.30.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.30.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.30.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.30.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.30.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.30.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.31.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.31.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.31.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.31.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.31.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.31.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.31.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "0638035fd5ad2541f1e968341d0cb45c" }, { "dataPath": "params_shard_244.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.31.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "ac08b127fbb3288698842a232527b06d" }, { "dataPath": "params_shard_245.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.31.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "d069ced6fe8b882e9de0845fd12f3224" }, { "dataPath": "params_shard_246.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.31.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "0a04265aa4fd10312634efce335161b0" }, { "dataPath": "params_shard_247.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.31.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.31.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.31.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.31.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.31.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.31.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.31.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.31.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "a26f7f7e763774e6cbcd04f3fc89f87e" }, { "dataPath": "params_shard_248.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.31.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "d4279fd83eb1f26702b020311cc50b85" }, { "dataPath": "params_shard_249.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.31.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "8a9e2972237c2264525b9e604ad5ccaf" }, { "dataPath": "params_shard_250.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.32.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "213cdd93a108751531e6e4b07a8bb404" }, { "dataPath": "params_shard_251.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.31.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.31.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.31.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.31.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.31.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.31.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.32.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.32.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.32.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.32.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.32.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.32.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.32.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "050bbdacb18c654987e8f216027e3328" }, { "dataPath": "params_shard_252.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.32.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "bb5478ee68374c2ad53d9f4e21a5e31a" }, { "dataPath": "params_shard_253.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.32.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "2bb57047a30264f152b57a8ebac466a0" }, { "dataPath": "params_shard_254.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.32.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "beff98e47b2ea0e650506e70ee3388aa" }, { "dataPath": "params_shard_255.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.32.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.32.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.32.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.32.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.32.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.32.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.32.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.32.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "9aa26a49814948b9412709706b4c03ad" }, { "dataPath": "params_shard_256.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.32.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "ae30b22e2092f19badba35aafbfa8f8e" }, { "dataPath": "params_shard_257.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.32.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "c94945363928889954b7d63ae2e60cb8" }, { "dataPath": "params_shard_258.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.33.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "01cdcd3c3183178b0b1d728d7a624fcd" }, { "dataPath": "params_shard_259.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.32.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.32.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.32.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.32.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.32.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.32.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.33.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.33.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.33.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.33.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.33.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.33.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.33.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "89cb7d6289a935254dc260b0996788eb" }, { "dataPath": "params_shard_260.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.33.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "3d0a489736f4aebbd49819308ade2294" }, { "dataPath": "params_shard_261.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.33.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "507dcdf5a349b96f00ebb2e280542117" }, { "dataPath": "params_shard_262.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.33.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "b6edd299a81ed893943c9fb302064c13" }, { "dataPath": "params_shard_263.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.33.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.33.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.33.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.33.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.33.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.33.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.33.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.33.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "16a7455375e2f3512ae1d89c705ee892" }, { "dataPath": "params_shard_264.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.33.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "cceb0d104e135afe439ee0e856c4fc3a" }, { "dataPath": "params_shard_265.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.33.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "97f05b15c4cb2f99856616670dc0fe33" }, { "dataPath": "params_shard_266.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.34.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "f55da3ed3ab9b0f82e48489e0c5da984" }, { "dataPath": "params_shard_267.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.33.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.33.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.33.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.33.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.33.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.33.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.34.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.34.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.34.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.34.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.34.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.34.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.34.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "c510ec41c61f861b197279cabbfbc02c" }, { "dataPath": "params_shard_268.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.34.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "cfe4748e07b597f333985def0e3866cf" }, { "dataPath": "params_shard_269.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.34.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "35c58ebee3f98e75363f197d160972d9" }, { "dataPath": "params_shard_270.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.34.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "89db7816e56f1c48141ec65e8e610d7a" }, { "dataPath": "params_shard_271.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.34.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.34.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.34.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.34.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.34.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.34.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.34.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.34.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "a6c1d54aa9baadeb23099bca9e7f9045" }, { "dataPath": "params_shard_272.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.34.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "81df76cdc38260ba4daa3a8eaaa8c1f1" }, { "dataPath": "params_shard_273.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.34.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "4252baddb7186cdf3a2f199ee7a08daa" }, { "dataPath": "params_shard_274.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.35.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "413e432281b2143d56d73528e2b8ac8c" }, { "dataPath": "params_shard_275.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.34.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.34.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.34.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.34.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.34.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.34.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.35.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.35.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.35.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.35.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.35.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.35.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.35.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "ebdc2075a6eaf588d5d183176d8583a5" }, { "dataPath": "params_shard_276.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.35.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "fdf8c3e44f92e14692b8c57e1d1fc99c" }, { "dataPath": "params_shard_277.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.35.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "0c9cee0f07c7218be058fe4e4a380756" }, { "dataPath": "params_shard_278.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.35.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "c21cccdb2e2e6438e2ed5043c20736b8" }, { "dataPath": "params_shard_279.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.35.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.35.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.35.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.35.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.35.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.35.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.35.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.35.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "a1ed2cc8b19053ff1ada313a1f804e68" }, { "dataPath": "params_shard_280.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.35.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "e186ada819496112b03de5e7e93bbea7" }, { "dataPath": "params_shard_281.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.35.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "86fccfc55cca305c1d2ebf39b58fbe96" }, { "dataPath": "params_shard_282.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.36.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "a4b2fad6b8a22cd114c4501ed2d5ff99" }, { "dataPath": "params_shard_283.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.35.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.35.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.35.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.35.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.35.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.35.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.36.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.36.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.36.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.36.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.36.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.36.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.36.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "20eb0e67a64ff4d3ce18afcab3604ad1" }, { "dataPath": "params_shard_284.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.36.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "a9127511157abb9862d9de02e5869399" }, { "dataPath": "params_shard_285.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.36.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "ed14615da4d167ea1a0739a12183c3af" }, { "dataPath": "params_shard_286.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.36.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "72547589910207403e6a3a938c330833" }, { "dataPath": "params_shard_287.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.36.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.36.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.36.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.36.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.36.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.36.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.36.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.36.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "aa77ab524d56481a7de034ee15d2e117" }, { "dataPath": "params_shard_288.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.36.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "3cc9fd87c10fcd3adcc35d4a5be74856" }, { "dataPath": "params_shard_289.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.36.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "334019d3e18751824990346ca1289ec5" }, { "dataPath": "params_shard_290.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.37.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "315f9dc5e0aa791c2c223aed64ce7cad" }, { "dataPath": "params_shard_291.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.36.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.36.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.36.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.36.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.36.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.36.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.37.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.37.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.37.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.37.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.37.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.37.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.37.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "fa2924fd19196928742323b0acb37ce1" }, { "dataPath": "params_shard_292.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.37.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "c5ac62314f36ea60278781496d4a99d0" }, { "dataPath": "params_shard_293.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.37.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "014ce1444c4bec9ca24340d5781b8728" }, { "dataPath": "params_shard_294.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.37.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "f896cbcdaa7e2a2ee82ac771632b0bde" }, { "dataPath": "params_shard_295.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.37.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.37.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.37.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.37.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.37.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.37.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.37.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.37.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "d813354df2a887eb693d775c8c90c23c" }, { "dataPath": "params_shard_296.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.37.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "ced2b1307a11e1193fdf475185858933" }, { "dataPath": "params_shard_297.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.37.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "cf79b5bdbe1202b4d2732a00ce10662d" }, { "dataPath": "params_shard_298.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.38.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "29c4b87e5523e12bd0950d6122306c35" }, { "dataPath": "params_shard_299.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.37.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.37.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.37.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.37.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.37.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.37.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.38.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.38.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.38.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.38.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.38.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.38.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.38.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "c50a4d7b70807cebf3d1b9c45abcd6a0" }, { "dataPath": "params_shard_300.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.38.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "377e637040c17a185d03000c994501f5" }, { "dataPath": "params_shard_301.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.38.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "9d3e8ffc384bbde5b29e530a9750d8f1" }, { "dataPath": "params_shard_302.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.38.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "119a4fab675cf729739b1f113529c15d" }, { "dataPath": "params_shard_303.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.38.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.38.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.38.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.38.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.38.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.38.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.38.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.38.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "e544004dcd094590fbc2888c32f2b5ea" }, { "dataPath": "params_shard_304.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.38.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "6f242f11b8f0f2b5d377b34291534075" }, { "dataPath": "params_shard_305.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.38.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "b1271ce82d14750ee69d0157ee3c0ec4" }, { "dataPath": "params_shard_306.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.39.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "c07143e37eacd984d4bdfeb4895f9b61" }, { "dataPath": "params_shard_307.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.38.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.38.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.38.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.38.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.38.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.38.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.39.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.39.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.39.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.39.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.39.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.39.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.39.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "c6b37978a48115b09e1e722f064b2dac" }, { "dataPath": "params_shard_308.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.39.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "d7c7b3768895a23d37729b50063c1aa2" }, { "dataPath": "params_shard_309.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.39.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "669832b57f332e55a8849d27a149577d" }, { "dataPath": "params_shard_310.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.39.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "2003eb94b568df0b574d68f8c80c2aef" }, { "dataPath": "params_shard_311.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.39.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.39.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.39.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.39.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.39.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.39.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.39.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.39.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "786dfa385a927dd3894fd302e3e81f68" }, { "dataPath": "params_shard_312.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.39.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "1781244c5c3fff13029a4942125501e2" }, { "dataPath": "params_shard_313.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.39.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "d86712b9b16bf993532663deadbb61d4" }, { "dataPath": "params_shard_314.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.40.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "883afc1f8b04ffc701b1e4e937e86624" }, { "dataPath": "params_shard_315.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.39.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.39.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.39.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.39.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.39.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.39.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.40.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.40.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.40.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.40.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.40.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.40.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.40.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "8981c135295f4af4047fb89abbe353c2" }, { "dataPath": "params_shard_316.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.40.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "43d9d196de60b51918caf45dc0807657" }, { "dataPath": "params_shard_317.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.40.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "e28c631e21b115129ea757f53ea70d2e" }, { "dataPath": "params_shard_318.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.40.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "c8a475eea4aa62621a4781efb2ed6ada" }, { "dataPath": "params_shard_319.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.40.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.40.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.40.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.40.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.40.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.40.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.40.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.40.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "2208315e9dc4d8492fd32a1882e6b47a" }, { "dataPath": "params_shard_320.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.40.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "4bf7c110ad41cf4a2068b4c5efca8069" }, { "dataPath": "params_shard_321.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.40.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "c8b7756c69df7dc7f47c6ddb6ba7d50b" }, { "dataPath": "params_shard_322.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.41.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "1807333cdf6eddf0cf214fce680d0ff4" }, { "dataPath": "params_shard_323.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.40.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.40.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.40.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.40.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.40.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.40.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.41.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.41.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.41.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.41.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.41.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.41.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.41.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "00823db313f2887236977270d7e806c9" }, { "dataPath": "params_shard_324.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.41.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "ffe76baf1c649e6ce6429be0c61ec53f" }, { "dataPath": "params_shard_325.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.41.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "0cf21aa3a6775db4971e6b2db077f42d" }, { "dataPath": "params_shard_326.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.41.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "d2b78b2673b086b8cf83f484f331276f" }, { "dataPath": "params_shard_327.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.41.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.41.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.41.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.41.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.41.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.41.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.41.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.41.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "9218eef719563cbab84cb2c8e20a145d" }, { "dataPath": "params_shard_328.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.41.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "98fe859f3c21fe946c162e043cab48cb" }, { "dataPath": "params_shard_329.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.41.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "0423068f7cf704fe5eaeb14a3afd9bf8" }, { "dataPath": "params_shard_330.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.42.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "fd94ddef9975a3b87cc6c02a96c132e9" }, { "dataPath": "params_shard_331.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.41.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.41.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.41.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.41.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.41.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.41.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.42.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.42.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.42.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.42.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.42.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.42.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.42.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "9cda07bdb5ad83ab96d87b3323ffcd86" }, { "dataPath": "params_shard_332.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.42.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "bab3098150de21b6ae021d2e5b02104e" }, { "dataPath": "params_shard_333.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.42.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "02be413dfdd2ce481fdba4b1e96ccaf6" }, { "dataPath": "params_shard_334.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.42.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "16b009d6eea571c8e748f5de5109b4b9" }, { "dataPath": "params_shard_335.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.42.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.42.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.42.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.42.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.42.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.42.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.42.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.42.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "0408119df58283bbeb322f589b24e0ab" }, { "dataPath": "params_shard_336.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.42.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "5b31a9019410099940788c79c5f8ca13" }, { "dataPath": "params_shard_337.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.42.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "f0acbcec21234e442a3244ff61fa0dc0" }, { "dataPath": "params_shard_338.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.43.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "ce0f7caa31bbed002a2b761a0a2cc8bb" }, { "dataPath": "params_shard_339.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.42.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.42.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.42.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.42.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.42.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.42.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.43.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.43.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.43.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.43.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.43.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.43.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.43.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "6a7ded70ad0564b39cf175e6068f55fd" }, { "dataPath": "params_shard_340.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.43.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "7d1c1b5996c5f5bcb4ebaeb0e300c82f" }, { "dataPath": "params_shard_341.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.43.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "5971e1701ca2c5b751d8162c41543b23" }, { "dataPath": "params_shard_342.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.43.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "5fe8ac9ec4863a546db2ca3b91ec1fbb" }, { "dataPath": "params_shard_343.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.43.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.43.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.43.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.43.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.43.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.43.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.43.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.43.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "b34eda353c1f6ab15b23552cc0c2e02b" }, { "dataPath": "params_shard_344.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.43.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "c23f3d834a83a6d4acbe6fa8e4c8b9cd" }, { "dataPath": "params_shard_345.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.43.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "4a4fb120e8b151ec27b98eb74deb6d6a" }, { "dataPath": "params_shard_346.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.44.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "be108807cc51961398d4a5d0e4e4c2d4" }, { "dataPath": "params_shard_347.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.43.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.43.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.43.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.43.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.43.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.43.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.44.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.44.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.44.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.44.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.44.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.44.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.44.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "c83b17b419cb7b8c58a09250ee109c34" }, { "dataPath": "params_shard_348.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.44.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "1fb2c6bb5275eacddf775400f779d599" }, { "dataPath": "params_shard_349.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.44.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "cdbbd2b3214f3c2c92510653e8afb8ca" }, { "dataPath": "params_shard_350.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.44.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "737ca2dfd9f79182dc09c6feb7810313" }, { "dataPath": "params_shard_351.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.44.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.44.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.44.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.44.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.44.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.44.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.44.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.44.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "2b99b7262f00662bf2ea418c53ee7566" }, { "dataPath": "params_shard_352.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.44.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "d1b4ee950295e2224dd5bed780737432" }, { "dataPath": "params_shard_353.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.44.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "44d7c33eb86d4459041b630ca1a71f92" }, { "dataPath": "params_shard_354.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.45.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "7c628bd7ca4e2b1334d77548c30b7fa8" }, { "dataPath": "params_shard_355.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.44.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.44.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.44.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.44.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.44.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.44.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.45.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.45.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.45.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.45.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.45.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.45.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.45.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "9ea6a5d21262b67d2345e9fa32e53037" }, { "dataPath": "params_shard_356.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.45.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "4f9808dd8b5f88996e0b9fd603719f4f" }, { "dataPath": "params_shard_357.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.45.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "5acc71346cfac1d631e2b69210f08fa2" }, { "dataPath": "params_shard_358.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.45.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "f28a9b58fc38a1cc8040757c711e4165" }, { "dataPath": "params_shard_359.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.45.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.45.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.45.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.45.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.45.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.45.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.45.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.45.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "012e4e90a4f7b441c71b051075d604dc" }, { "dataPath": "params_shard_360.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.45.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "b0f780fd1e7d958842fb6381510f29c0" }, { "dataPath": "params_shard_361.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.45.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "0f2d59516bb4365ad1f04d63a737c74c" }, { "dataPath": "params_shard_362.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.46.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "8e41a1ffbb39e25b3b26af75ae1b5c76" }, { "dataPath": "params_shard_363.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.45.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.45.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.45.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.45.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.45.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.45.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.46.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.46.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.46.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.46.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.46.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.46.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.46.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "6a5025201ed2669260119f7b5fb65831" }, { "dataPath": "params_shard_364.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.46.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "e85e44f840be0d5f2f451a79a71c9267" }, { "dataPath": "params_shard_365.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.46.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "5f527b1d0fe32f6e6eaabcb641a66dad" }, { "dataPath": "params_shard_366.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.46.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "d28be83109918f306addf6e5aa78b889" }, { "dataPath": "params_shard_367.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.46.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.46.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.46.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.46.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.46.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.46.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.46.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.46.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "4ceba2612f617e63279b4997a51773b6" }, { "dataPath": "params_shard_368.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.46.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "532211fd47ddb2bca23f8cf3a3531805" }, { "dataPath": "params_shard_369.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.46.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "39f8f7615e36f39057c69608427ae9b0" }, { "dataPath": "params_shard_370.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.47.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "fc3f5f4adbb92679afce0bf70e943e4c" }, { "dataPath": "params_shard_371.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.46.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.46.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.46.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.46.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.46.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.46.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.47.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.47.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.47.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.47.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.47.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.47.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.47.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "c6b161adf3961f8cb538dd641d7724d6" }, { "dataPath": "params_shard_372.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.47.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "44dd01df29bed34d65051f29b30c4c8e" }, { "dataPath": "params_shard_373.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.47.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "6344e072584217766cb37f5c708fd1a4" }, { "dataPath": "params_shard_374.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.47.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "b14f6997e025f1856cb192a072bc194a" }, { "dataPath": "params_shard_375.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.47.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.47.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.47.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.47.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.47.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.47.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.47.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.47.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "e283cbdfc25ca2a3a523488799de0bae" }, { "dataPath": "params_shard_376.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.47.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "6bcd90ab5dc91bae71f3447dfe8c41cf" }, { "dataPath": "params_shard_377.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.47.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "e2924d0b9daca7947f6c263802f87f6a" }, { "dataPath": "params_shard_378.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.48.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "d90613c1b0c63a1da3b37eac690c6557" }, { "dataPath": "params_shard_379.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.47.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.47.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.47.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.47.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.47.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.47.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.48.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.48.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.48.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.48.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.48.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.48.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.48.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "e82cf570c977c7a2518e69e13ee85870" }, { "dataPath": "params_shard_380.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.48.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "89be9dbaad44982fcb772efa4161772b" }, { "dataPath": "params_shard_381.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.48.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "4189a34366d7d1fb5a8b50274c643c9e" }, { "dataPath": "params_shard_382.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.48.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "9e6f00e33a5209a58c071df9ad420095" }, { "dataPath": "params_shard_383.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.48.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.48.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.48.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.48.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.48.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.48.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.48.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.48.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "1edfa81ead0b4f8fab09ad0cc131d213" }, { "dataPath": "params_shard_384.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.48.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "9fc9f9f6e0841b60fa9a2488c9c4ca58" }, { "dataPath": "params_shard_385.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.48.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "66ecd2450e8e04e55f62ff2d0f95f0ea" }, { "dataPath": "params_shard_386.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.49.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "8cd4664beb51a5eab788ddac00c6f409" }, { "dataPath": "params_shard_387.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.48.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.48.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.48.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.48.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.48.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.48.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.49.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.49.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.49.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.49.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.49.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.49.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.49.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "7f4a770a8eec794477ebc8955e5be591" }, { "dataPath": "params_shard_388.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.49.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "a25a9d77e4718c54ae9d1b117ebb29ed" }, { "dataPath": "params_shard_389.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.49.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "98d90e3d1e4ce0f3e4c384ee5dd8be3c" }, { "dataPath": "params_shard_390.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.49.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "bcbf27185a940394117954aaaf84d29f" }, { "dataPath": "params_shard_391.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.49.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.49.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.49.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.49.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.49.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.49.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.49.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.49.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "78f10c5285ccf14e9c3ff8c777bc5707" }, { "dataPath": "params_shard_392.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.49.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "91c7493b620c09e209bd4fb2a59860e2" }, { "dataPath": "params_shard_393.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.49.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "c5dc288e6f9ffde89c6bd9c7b0fff838" }, { "dataPath": "params_shard_394.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.50.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "8cff403e20b20914d8f796c871435077" }, { "dataPath": "params_shard_395.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.49.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.49.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.49.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.49.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.49.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.49.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.50.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.50.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.50.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.50.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.50.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.50.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.50.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "f960f7c6476ece7fbbaa41a10d153698" }, { "dataPath": "params_shard_396.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.50.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "aa195f21391005139ac4e335e16337bf" }, { "dataPath": "params_shard_397.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.50.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "01bc9b9a647eddaa165944bb13651921" }, { "dataPath": "params_shard_398.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.50.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "dfaae082d29f2356634e3bae065213f1" }, { "dataPath": "params_shard_399.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.50.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.50.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.50.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.50.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.50.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.50.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.50.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.50.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "30d115fcc30d2edba481e178c561ee7e" }, { "dataPath": "params_shard_400.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.50.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "3ef4367ca2a51d5578417b005a165628" }, { "dataPath": "params_shard_401.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.50.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "b344b08bb7411c69540a0f8a7de2ae84" }, { "dataPath": "params_shard_402.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.51.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "de094a8b2ccb9571398da12c504ae70b" }, { "dataPath": "params_shard_403.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.50.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.50.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.50.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.50.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.50.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.50.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.51.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.51.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.51.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.51.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.51.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.51.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.51.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "47765ac7a5aee0a31d57d9124b33f615" }, { "dataPath": "params_shard_404.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.51.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "bd4572c095b8618305f52e8e1a8c5e3b" }, { "dataPath": "params_shard_405.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.51.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "07011b19d0ae1b1b6855908022664712" }, { "dataPath": "params_shard_406.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.51.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "7e134df3d1c0293c6bacf37d574bf82f" }, { "dataPath": "params_shard_407.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.51.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.51.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.51.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.51.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.51.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.51.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.51.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.51.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "16ed4c6bdc4eff8935fe79db29f4bccd" }, { "dataPath": "params_shard_408.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.51.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "706f982bf20e87a2bf5db8e096ed8dad" }, { "dataPath": "params_shard_409.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.51.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "27aa9280e90d20e281c3e07b2f88a666" }, { "dataPath": "params_shard_410.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.52.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "6ec4cf4a966c6e83a64fa72312081879" }, { "dataPath": "params_shard_411.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.51.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.51.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.51.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.51.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.51.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.51.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.52.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.52.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.52.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.52.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.52.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.52.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.52.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "d7eba0d5ed26797ede64b71521343c41" }, { "dataPath": "params_shard_412.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.52.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "750a26a15fefc349bcc2b4a4eebb597c" }, { "dataPath": "params_shard_413.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.52.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "96d612bc572091507d5fc26ed23fa7de" }, { "dataPath": "params_shard_414.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.52.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "adf607f71d5170fbdbf7ee989923834a" }, { "dataPath": "params_shard_415.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.52.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.52.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.52.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.52.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.52.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.52.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.52.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.52.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "f31ae9d3dc3f8addc8233706b3cc440b" }, { "dataPath": "params_shard_416.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.52.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "e635d0d4f69165d0d236b57ad65ac7f7" }, { "dataPath": "params_shard_417.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.52.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "327fc788447c164e6e81261320f28d99" }, { "dataPath": "params_shard_418.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.53.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "af54dc6f05ea2cc74e3d093b8a6d60fc" }, { "dataPath": "params_shard_419.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.52.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.52.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.52.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.52.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.52.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.52.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.53.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.53.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.53.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.53.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.53.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.53.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.53.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "b0c9ab11ea6040a49e3337989cabd4d2" }, { "dataPath": "params_shard_420.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.53.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "10aeb0c2338319f0a060974cd2dd80e7" }, { "dataPath": "params_shard_421.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.53.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "1606d36a8bcf8201ae365e694eec777c" }, { "dataPath": "params_shard_422.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.53.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "518cd336e5deeadcf75f10fde3388755" }, { "dataPath": "params_shard_423.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.53.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.53.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.53.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.53.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.53.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.53.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.53.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.53.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "36c178b8547824049fe4107b0a63224c" }, { "dataPath": "params_shard_424.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.53.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "3b00a25305119024cd437159f6ec80fb" }, { "dataPath": "params_shard_425.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.53.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "ae71b4a43d85fc7feafba041fc34be11" }, { "dataPath": "params_shard_426.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.54.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "8e95c8b9009e79b8627bbda485ff51f5" }, { "dataPath": "params_shard_427.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.53.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.53.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.53.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.53.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.53.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.53.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.54.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.54.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.54.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.54.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.54.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.54.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.54.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "cc36625a91bad309a0f37a22bb0e0f40" }, { "dataPath": "params_shard_428.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.54.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "7dc511c8e1a0a75745e829f140e5cbf7" }, { "dataPath": "params_shard_429.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.54.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "9438eab8da13b55d6a4461f6e3a8b466" }, { "dataPath": "params_shard_430.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.54.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "ee160453b2bab415c2796108e028580e" }, { "dataPath": "params_shard_431.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.54.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.54.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.54.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.54.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.54.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.54.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.54.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.54.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "cb907d5f2aa9a58c830fab16fba24bc9" }, { "dataPath": "params_shard_432.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.54.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "061c5c335223b60ab8ba5a90f79a32c3" }, { "dataPath": "params_shard_433.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.54.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "3fb057d58322dbc94c9839a3ab1ad55d" }, { "dataPath": "params_shard_434.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.55.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "bf11174747e7da2791e6c8d7efb00897" }, { "dataPath": "params_shard_435.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.54.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.54.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.54.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.54.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.54.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.54.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.55.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.55.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.55.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.55.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.55.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.55.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.55.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "6283557d3d961b89823b9f588a0cab39" }, { "dataPath": "params_shard_436.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.55.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "828edf354a5499a544be335c9cd4774f" }, { "dataPath": "params_shard_437.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.55.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "130931cec32a2b5562c4e040fd55c217" }, { "dataPath": "params_shard_438.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.55.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "f966d4eb443e3d8bdeaf5b9c9cd85ed0" }, { "dataPath": "params_shard_439.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.55.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.55.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.55.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.55.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.55.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.55.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.55.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.55.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "f8b2a51638231475ec98e0edef673a49" }, { "dataPath": "params_shard_440.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.55.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "a43059d1a5a11cfdac7bcc5f22764c0f" }, { "dataPath": "params_shard_441.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.55.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "ff6915f2dbd47d926eee0b921493bdec" }, { "dataPath": "params_shard_442.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.56.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "38c0a62d89811ad8920bf67f492cc55d" }, { "dataPath": "params_shard_443.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.55.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.55.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.55.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.55.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.55.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.55.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.56.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.56.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.56.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.56.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.56.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.56.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.56.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "56a0719a90f80914459cb1357fbb289e" }, { "dataPath": "params_shard_444.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.56.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "549a98ca9db6f15021fb7ceccc95573c" }, { "dataPath": "params_shard_445.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.56.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "e0d087480655187c08d57f3dafa1c088" }, { "dataPath": "params_shard_446.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.56.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "4ce8ba046a81f225020e052be5189927" }, { "dataPath": "params_shard_447.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.56.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.56.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.56.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.56.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.56.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.56.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.56.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.56.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "a3765cdf9dda46fcac71f6574ffc7280" }, { "dataPath": "params_shard_448.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.56.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "ec5f9fe228ec2e92087f716e6dbce565" }, { "dataPath": "params_shard_449.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.56.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "d4455fab6051396c89adffc184aa448c" }, { "dataPath": "params_shard_450.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.57.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "0ba1c8212b3cb9aee513b4f643c9dff5" }, { "dataPath": "params_shard_451.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.56.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.56.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.56.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.56.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.56.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.56.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.57.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.57.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.57.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.57.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.57.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.57.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.57.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "7deaff8da2a202b984fd4e3ad92202ed" }, { "dataPath": "params_shard_452.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.57.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "9a4c19ffb73fd1cf243ff4f323343584" }, { "dataPath": "params_shard_453.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.57.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "70ad307156c28adbe5502241a7a4693e" }, { "dataPath": "params_shard_454.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.57.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "f3e63bfc4a88d3e2c10ef16e4e1f4317" }, { "dataPath": "params_shard_455.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.57.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.57.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.57.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.57.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.57.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.57.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.57.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.57.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "4ebe335a48643a10d2b98edd3b1cd198" }, { "dataPath": "params_shard_456.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.57.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "4b7534b0335573b408343280c0df263e" }, { "dataPath": "params_shard_457.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.57.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "d7f1d8c2e0305ffbe4af2c94579455d6" }, { "dataPath": "params_shard_458.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.58.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "a5c94bd3541e5feb71f53940d65dadae" }, { "dataPath": "params_shard_459.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.57.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.57.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.57.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.57.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.57.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.57.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.58.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.58.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.58.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.58.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.58.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.58.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.58.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "15ecdc3bb03e0930568d993680889577" }, { "dataPath": "params_shard_460.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.58.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "2c9517975483e6812f93c5500a75d101" }, { "dataPath": "params_shard_461.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.58.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "d8c362394880b2a91934e79195ab7e20" }, { "dataPath": "params_shard_462.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.58.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "575f69db1f16ebe8de1c22d67e5cf679" }, { "dataPath": "params_shard_463.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.58.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.58.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.58.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.58.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.58.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.58.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.58.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.58.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "875405a614c0bec500d31c3f0921e414" }, { "dataPath": "params_shard_464.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.58.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "7cef3f4a7bd94957d65df488bc73fd51" }, { "dataPath": "params_shard_465.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.58.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "57900ccd907a8b6efe54d6f390f7d6d6" }, { "dataPath": "params_shard_466.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.59.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "54c4df3702a6969d0442ec13199ebb86" }, { "dataPath": "params_shard_467.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.58.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.58.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.58.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.58.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.58.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.58.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.59.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.59.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.59.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.59.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.59.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.59.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.59.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "c6ffcf2369c2a828b3089db8b1adad9d" }, { "dataPath": "params_shard_468.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.59.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "2bcaeebc0e1873c8f11d016cbda3b717" }, { "dataPath": "params_shard_469.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.59.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "9b8c9c4aa7218100de625946ec66637c" }, { "dataPath": "params_shard_470.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.59.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "86546b2584d85343f72c42d501bfac4d" }, { "dataPath": "params_shard_471.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.59.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.59.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.59.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.59.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.59.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.59.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.59.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.59.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "c123c10a12ffd0ddb7b752c93b065a87" }, { "dataPath": "params_shard_472.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.59.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "4b7a36bf1d91a8c2c0642e65935c4673" }, { "dataPath": "params_shard_473.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.59.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "286bd57f2b459125adfa97c4b275ee3a" }, { "dataPath": "params_shard_474.bin", "format": "raw-shard", "nbytes": 37748736, "records": [ { "name": "model.layers.60.self_attn.q_b_proj.weight", "shape": [ 24576, 1536 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 37748736, "byteOffset": 0 } ], "md5sum": "8bcd4e6c6d8d9986d29500e9990b21f3" }, { "dataPath": "params_shard_475.bin", "format": "raw-shard", "nbytes": 31236208, "records": [ { "name": "model.layers.59.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.59.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.59.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.59.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.59.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.59.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.layers.60.self_attn.q_a_proj.weight", "shape": [ 1536, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 11010048, "byteOffset": 16086784 }, { "name": "model.layers.60.self_attn.q_a_proj.weight_scale_inv", "shape": [ 12, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1344, "byteOffset": 27096832 }, { "name": "model.layers.60.self_attn.q_a_layernorm.weight", "shape": [ 1536 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3072, "byteOffset": 27098176 }, { "name": "model.layers.60.self_attn.q_b_proj.weight_scale_inv", "shape": [ 192, 12 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 4608, "byteOffset": 27101248 }, { "name": "model.layers.60.self_attn.kv_a_proj_with_mqa.weight", "shape": [ 576, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 4128768, "byteOffset": 27105856 }, { "name": "model.layers.60.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "shape": [ 5, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 560, "byteOffset": 31234624 }, { "name": "model.layers.60.self_attn.kv_a_layernorm.weight", "shape": [ 512 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 31235184 } ], "md5sum": "83c7b36c255f8d19a46083d5d2c2d23a" }, { "dataPath": "params_shard_476.bin", "format": "raw-shard", "nbytes": 16777216, "records": [ { "name": "model.layers.60.self_attn.kv_b_proj.weight", "shape": [ 32768, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 16777216, "byteOffset": 0 } ], "md5sum": "0980d75ce43db44835e462b0c07ba06d" }, { "dataPath": "params_shard_477.bin", "format": "raw-shard", "nbytes": 117440512, "records": [ { "name": "model.layers.60.self_attn.o_proj.weight", "shape": [ 7168, 16384 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 117440512, "byteOffset": 0 } ], "md5sum": "a8b90f3bb35e666b6ed384bc8a049c6c" }, { "dataPath": "params_shard_478.bin", "format": "raw-shard", "nbytes": 29360128, "records": [ { "name": "model.layers.60.mlp.shared_experts.gate_up_proj.weight", "shape": [ 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 29360128, "byteOffset": 0 } ], "md5sum": "b60f10f4bf3ccb75bcfe526752974d57" }, { "dataPath": "params_shard_479.bin", "format": "raw-shard", "nbytes": 20469248, "records": [ { "name": "model.layers.60.self_attn.w_uk", "shape": [ 128, 512, 128 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 0 }, { "name": "model.layers.60.self_attn.w_uv", "shape": [ 128, 128, 512 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 8388608, "byteOffset": 8388608 }, { "name": "model.layers.60.self_attn.w_uk_scale_inv", "shape": [ 128, 4, 1 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16777216 }, { "name": "model.layers.60.self_attn.w_uv_scale_inv", "shape": [ 128, 1, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1024, "byteOffset": 16778240 }, { "name": "model.layers.60.self_attn.kv_b_proj.weight_scale_inv", "shape": [ 256, 4 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 2048, "byteOffset": 16779264 }, { "name": "model.layers.60.self_attn.o_proj.weight_scale_inv", "shape": [ 56, 128 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16781312 }, { "name": "model.layers.60.mlp.gate.weight", "shape": [ 256, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 3670016, "byteOffset": 16795648 }, { "name": "model.layers.60.mlp.shared_experts.gate_up_proj.weight_scale_inv", "shape": [ 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 3584, "byteOffset": 20465664 } ], "md5sum": "453d5dbb4e0f45c25628fdd31273c201" }, { "dataPath": "params_shard_480.bin", "format": "raw-shard", "nbytes": 7516192768, "records": [ { "name": "model.layers.60.mlp.moe_gate_up_proj.weight", "shape": [ 256, 4096, 7168 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 7516192768, "byteOffset": 0 } ], "md5sum": "78fe94eea54d3c976f816faca8f2edbc" }, { "dataPath": "params_shard_481.bin", "format": "raw-shard", "nbytes": 3758096384, "records": [ { "name": "model.layers.60.mlp.moe_down_proj.weight", "shape": [ 256, 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 3758096384, "byteOffset": 0 } ], "md5sum": "5b7cf012d2f01cecd58f86623be21614" }, { "dataPath": "params_shard_482.bin", "format": "raw-shard", "nbytes": 1853358080, "records": [ { "name": "lm_head.weight", "shape": [ 129280, 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 1853358080, "byteOffset": 0 } ], "md5sum": "7784e2a64107da6159b263989911db06" }, { "dataPath": "params_shard_483.bin", "format": "raw-shard", "nbytes": 16101120, "records": [ { "name": "model.layers.60.mlp.shared_experts.down_proj.weight", "shape": [ 7168, 2048 ], "dtype": "float8_e4m3fn", "format": "f32-to-bf16", "nbytes": 14680064, "byteOffset": 0 }, { "name": "model.layers.60.mlp.shared_experts.down_proj.weight_scale_inv", "shape": [ 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 1792, "byteOffset": 14680064 }, { "name": "model.layers.60.mlp.moe_gate_up_proj.weight_scale_inv", "shape": [ 256, 32, 56 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 917504, "byteOffset": 14681856 }, { "name": "model.layers.60.mlp.moe_down_proj.weight_scale_inv", "shape": [ 256, 56, 16 ], "dtype": "float32", "format": "f32-to-bf16", "nbytes": 458752, "byteOffset": 15599360 }, { "name": "model.layers.60.input_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16058112 }, { "name": "model.layers.60.post_attention_layernorm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16072448 }, { "name": "model.norm.weight", "shape": [ 7168 ], "dtype": "bfloat16", "format": "f32-to-bf16", "nbytes": 14336, "byteOffset": 16086784 } ], "md5sum": "076e9e27c76d52b57a496dd998c8c6c1" } ] }