diff --git a/.gitattributes b/.gitattributes index 3f413bca3f2e58f03fe91508468ce621dd4a7372..b89f6c50a76861f47b8e120630cf75063eedbd13 100644 --- a/.gitattributes +++ b/.gitattributes @@ -254,3 +254,107 @@ checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layer checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.depth_decoder.depth_layers_3.mlp.wi_1.kernel/0.1 filter=lfs diff=lfs merge=lfs -text checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.depth_decoder.depth_layers_3.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_12.self_attention.query.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_17.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_12.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_6.mlp.wi_1.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_6.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_5.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_20.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_5.encoder_decoder_attention.query.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_20.mlp.wi_0.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_9.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_9.mlp.wi_1.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_10.mlp.wi_0.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_15.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_14.mlp.wi_1.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_10.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_18.attention.key.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_13.mlp.wi_0.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_14.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_13.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_2.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_2.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_13.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_2.mlp.wi_0.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_2.self_attention.query.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_18.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_15.mlp.wo.kernel/1.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_7.attention.value.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_2.self_attention.key.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_2.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_5.attention.query.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_9.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_7.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_5.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_9.self_attention.query.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_16.encoder_decoder_attention.key.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_3.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_3.self_attention.key.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_4.self_attention.out.kernel/1.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_4.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_18.self_attention.key.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_16.encoder_decoder_attention.value.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_18.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_2.attention.value.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_6.encoder_decoder_attention.query.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_16.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_0.mlp.wi_0.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_6.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_2.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_0.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.depth_decoder.depth_layers_3.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.depth_decoder.depth_layers_3.self_attention.value.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_12.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_6.attention.out.kernel/1.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_6.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_6.encoder_decoder_attention.out.kernel/1.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_4.attention.value.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_13.self_attention.key.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_12.attention.out.kernel/1.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_6.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_14.attention.value.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_4.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_10.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_14.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_16.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_14.mlp.wo.kernel/1.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_10.attention.value.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_16.mlp.wi_1.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_19.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_19.encoder_decoder_attention.key.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_3.encoder_decoder_attention.query.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_3.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_1.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_18.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_2.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_1.attention.query.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_20.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_18.mlp.wi_0.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_20.attention.value.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_0.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_1.mlp.wi_1.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_19.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_10.self_attention.value.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_19.attention.value.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_13.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_13.attention.key.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.depth_decoder.depth_layers_1.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_16.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_14.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_14.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_6.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_5.attention.value.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_6.mlp.wi_1.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_0.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_5.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_0.self_attention.query.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_14.mlp.wi_0.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.depth_decoder.depth_layers_1.mlp.wi_1.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_10.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_0.mlp.wo.kernel/1.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_2.encoder_decoder_attention.key.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_6.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.encoder.layers_1.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_1.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_15.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_1.self_attention.query.kernel/0.1 filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_0.pre_mlp_layer_norm.scale.v/.zarray b/checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_0.pre_mlp_layer_norm.scale.v/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..fb266c57e8e0dee6193b582ba985adeb7c50f8cf --- /dev/null +++ b/checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_0.pre_mlp_layer_norm.scale.v/.zarray @@ -0,0 +1 @@ +{"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"