Training in progress, step 200

Browse files

Files changed (7) hide show

config.json +100 -0
events.out.tfevents.1741204505.a22bb5f11002.311.0 +3 -0
events.out.tfevents.1741204595.a22bb5f11002.367.0 +3 -0
events.out.tfevents.1741204716.a22bb5f11002.426.0 +3 -0
model.safetensors +3 -0
tokenizer.json +16 -2
training_args.bin +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,100 @@

+{
+  "architectures": [
+    "HausaLMForCausalLM"
+  ],
+  "model_type": "xlstm",
+  "text_config": {
+    "_block_map": "1,0,1,0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,1,0",
+    "add_embedding_dropout": false,
+    "add_post_blocks_norm": true,
+    "bias": false,
+    "context_length": 128,
+    "dropout": 0.0,
+    "embedding_dim": 720,
+    "mlstm_block": {
+      "_block_idx": null,
+      "_num_blocks": 20,
+      "mlstm": {
+        "_inner_embedding_dim": 1472,
+        "_num_blocks": 20,
+        "_proj_up_dim": 1472,
+        "bias": false,
+        "context_length": 128,
+        "conv1d_kernel_size": 4,
+        "dropout": 0.0,
+        "embedding_dim": 720,
+        "num_heads": 4,
+        "proj_factor": 2.0,
+        "qkv_proj_blocksize": 32,
+        "round_proj_up_dim_up": true,
+        "round_proj_up_to_multiple_of": 64
+      }
+    },
+    "num_blocks": 20,
+    "slstm_at": [
+      0,
+      2,
+      4,
+      6,
+      8,
+      12,
+      14,
+      16,
+      18
+    ],
+    "slstm_block": {
+      "_block_idx": null,
+      "_num_blocks": 20,
+      "feedforward": {
+        "_num_blocks": 1,
+        "_proj_up_dim": 0,
+        "act_fn": "swish",
+        "bias": false,
+        "dropout": 0.0,
+        "embedding_dim": -1,
+        "ff_type": "ffn_gated",
+        "proj_factor": 1.7,
+        "round_proj_up_dim_up": true,
+        "round_proj_up_to_multiple_of": 64
+      },
+      "slstm": {
+        "_block_idx": null,
+        "_num_blocks": 20,
+        "backend": "vanilla",
+        "batch_size": 8,
+        "bias_init": "powerlaw_blockdependent",
+        "constants": {},
+        "conv1d_kernel_size": 4,
+        "dropout": 0.0,
+        "dtype": "bfloat16",
+        "dtype_a": "float32",
+        "dtype_b": "float32",
+        "dtype_g": "bfloat16",
+        "dtype_r": "bfloat16",
+        "dtype_s": "bfloat16",
+        "dtype_w": "bfloat16",
+        "embedding_dim": 720,
+        "enable_automatic_mixed_precision": true,
+        "forward_clipval": null,
+        "function": "slstm",
+        "gradient_recurrent_clipval": null,
+        "gradient_recurrent_cut": false,
+        "group_norm_weight": true,
+        "hidden_size": 720,
+        "initial_val": 0.0,
+        "input_shape": "BSGNH",
+        "internal_input_shape": "SBNGH",
+        "num_gates": 4,
+        "num_heads": 4,
+        "num_states": 4,
+        "output_shape": "BNSH",
+        "recurrent_weight_init": "zeros"
+      }
+    },
+    "tie_weights": false,
+    "vocab_size": 49152,
+    "weight_decay_on_embedding": false
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.47.0"
+}

events.out.tfevents.1741204505.a22bb5f11002.311.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:08281589f22dacdf62920e2d2b3f90eceed7456f2f2633501f2426167382e104
+size 7184

events.out.tfevents.1741204595.a22bb5f11002.367.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:549696cbf7dada9c0af5e3db362ab3468a3e4bbf7213a9e9566707e3ea3906ae
+size 7156

events.out.tfevents.1741204716.a22bb5f11002.426.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fdbf26e74968b7d66fd1d90e2113c8ae40d35c4179a10cf13369e9291650fb13
+size 7638

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f009c73134dec2acb3482ae87b40060eb866fa803eecbd800d8925d35f305e82
+size 568672088

tokenizer.json CHANGED Viewed

@@ -1,7 +1,21 @@
 {
   "version": "1.0",
-  "truncation": null,
-  "padding": null,
   "added_tokens": [
     {
       "id": 0,

 {
   "version": "1.0",
+  "truncation": {
+    "direction": "Right",
+    "max_length": 128,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
+  "padding": {
+    "strategy": {
+      "Fixed": 128
+    },
+    "direction": "Right",
+    "pad_to_multiple_of": null,
+    "pad_id": 2,
+    "pad_type_id": 0,
+    "pad_token": "<|im_end|>"
+  },
   "added_tokens": [
     {
       "id": 0,

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ee6e6417d6cbd31522a69265b6233d4ea4ef190cd1ce7a59a03e9e29f7a72919
+size 5688