thiomajid committed on
Commit
dc14828
·
verified ·
1 Parent(s): b19dfb4

Training in progress, step 200

Browse files
config.json ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "HausaLMForCausalLM"
4
+ ],
5
+ "model_type": "xlstm",
6
+ "text_config": {
7
+ "_block_map": "1,0,1,0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,1,0",
8
+ "add_embedding_dropout": false,
9
+ "add_post_blocks_norm": true,
10
+ "bias": false,
11
+ "context_length": 128,
12
+ "dropout": 0.0,
13
+ "embedding_dim": 720,
14
+ "mlstm_block": {
15
+ "_block_idx": null,
16
+ "_num_blocks": 20,
17
+ "mlstm": {
18
+ "_inner_embedding_dim": 1472,
19
+ "_num_blocks": 20,
20
+ "_proj_up_dim": 1472,
21
+ "bias": false,
22
+ "context_length": 128,
23
+ "conv1d_kernel_size": 4,
24
+ "dropout": 0.0,
25
+ "embedding_dim": 720,
26
+ "num_heads": 4,
27
+ "proj_factor": 2.0,
28
+ "qkv_proj_blocksize": 32,
29
+ "round_proj_up_dim_up": true,
30
+ "round_proj_up_to_multiple_of": 64
31
+ }
32
+ },
33
+ "num_blocks": 20,
34
+ "slstm_at": [
35
+ 0,
36
+ 2,
37
+ 4,
38
+ 6,
39
+ 8,
40
+ 12,
41
+ 14,
42
+ 16,
43
+ 18
44
+ ],
45
+ "slstm_block": {
46
+ "_block_idx": null,
47
+ "_num_blocks": 20,
48
+ "feedforward": {
49
+ "_num_blocks": 1,
50
+ "_proj_up_dim": 0,
51
+ "act_fn": "swish",
52
+ "bias": false,
53
+ "dropout": 0.0,
54
+ "embedding_dim": -1,
55
+ "ff_type": "ffn_gated",
56
+ "proj_factor": 1.7,
57
+ "round_proj_up_dim_up": true,
58
+ "round_proj_up_to_multiple_of": 64
59
+ },
60
+ "slstm": {
61
+ "_block_idx": null,
62
+ "_num_blocks": 20,
63
+ "backend": "vanilla",
64
+ "batch_size": 8,
65
+ "bias_init": "powerlaw_blockdependent",
66
+ "constants": {},
67
+ "conv1d_kernel_size": 4,
68
+ "dropout": 0.0,
69
+ "dtype": "bfloat16",
70
+ "dtype_a": "float32",
71
+ "dtype_b": "float32",
72
+ "dtype_g": "bfloat16",
73
+ "dtype_r": "bfloat16",
74
+ "dtype_s": "bfloat16",
75
+ "dtype_w": "bfloat16",
76
+ "embedding_dim": 720,
77
+ "enable_automatic_mixed_precision": true,
78
+ "forward_clipval": null,
79
+ "function": "slstm",
80
+ "gradient_recurrent_clipval": null,
81
+ "gradient_recurrent_cut": false,
82
+ "group_norm_weight": true,
83
+ "hidden_size": 720,
84
+ "initial_val": 0.0,
85
+ "input_shape": "BSGNH",
86
+ "internal_input_shape": "SBNGH",
87
+ "num_gates": 4,
88
+ "num_heads": 4,
89
+ "num_states": 4,
90
+ "output_shape": "BNSH",
91
+ "recurrent_weight_init": "zeros"
92
+ }
93
+ },
94
+ "tie_weights": false,
95
+ "vocab_size": 49152,
96
+ "weight_decay_on_embedding": false
97
+ },
98
+ "torch_dtype": "float32",
99
+ "transformers_version": "4.47.0"
100
+ }
events.out.tfevents.1741204505.a22bb5f11002.311.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08281589f22dacdf62920e2d2b3f90eceed7456f2f2633501f2426167382e104
3
+ size 7184
events.out.tfevents.1741204595.a22bb5f11002.367.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:549696cbf7dada9c0af5e3db362ab3468a3e4bbf7213a9e9566707e3ea3906ae
3
+ size 7156
events.out.tfevents.1741204716.a22bb5f11002.426.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdbf26e74968b7d66fd1d90e2113c8ae40d35c4179a10cf13369e9291650fb13
3
+ size 7638
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f009c73134dec2acb3482ae87b40060eb866fa803eecbd800d8925d35f305e82
3
+ size 568672088
tokenizer.json CHANGED
@@ -1,7 +1,21 @@
1
  {
2
  "version": "1.0",
3
- "truncation": null,
4
- "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
 
1
  {
2
  "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 128,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
+ "padding": {
10
+ "strategy": {
11
+ "Fixed": 128
12
+ },
13
+ "direction": "Right",
14
+ "pad_to_multiple_of": null,
15
+ "pad_id": 2,
16
+ "pad_type_id": 0,
17
+ "pad_token": "<|im_end|>"
18
+ },
19
  "added_tokens": [
20
  {
21
  "id": 0,
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee6e6417d6cbd31522a69265b6233d4ea4ef190cd1ce7a59a03e9e29f7a72919
3
+ size 5688