ismaelR commited on
Commit
2686475
·
verified ·
1 Parent(s): 09f969a

Upload 4 files

Browse files
Files changed (4) hide show
  1. config.json +64 -0
  2. model.safetensors +3 -0
  3. trainer_state.json +2253 -0
  4. training_args.bin +3 -0
config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "answerdotai/ModernBERT-base",
3
+ "architectures": [
4
+ "ModernBertForSequenceClassification"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 50281,
9
+ "classifier_activation": "gelu",
10
+ "classifier_bias": false,
11
+ "classifier_dropout": 0.0,
12
+ "classifier_pooling": "mean",
13
+ "cls_token_id": 50281,
14
+ "decoder_bias": true,
15
+ "deterministic_flash_attn": false,
16
+ "embedding_dropout": 0.0,
17
+ "eos_token_id": 50282,
18
+ "global_attn_every_n_layers": 3,
19
+ "global_rope_theta": 160000.0,
20
+ "gradient_checkpointing": false,
21
+ "hidden_activation": "gelu",
22
+ "hidden_size": 768,
23
+ "id2label": {
24
+ "0": "class_0",
25
+ "1": "class_1",
26
+ "2": "class_2",
27
+ "3": "class_3",
28
+ "4": "class_4",
29
+ "5": "class_5"
30
+ },
31
+ "initializer_cutoff_factor": 2.0,
32
+ "initializer_range": 0.02,
33
+ "intermediate_size": 1152,
34
+ "label2id": {
35
+ "class_0": "0",
36
+ "class_1": "1",
37
+ "class_2": "2",
38
+ "class_3": "3",
39
+ "class_4": "4",
40
+ "class_5": "5"
41
+ },
42
+ "layer_norm_eps": 1e-05,
43
+ "local_attention": 128,
44
+ "local_rope_theta": 10000.0,
45
+ "max_position_embeddings": 8192,
46
+ "mlp_bias": false,
47
+ "mlp_dropout": 0.0,
48
+ "model_type": "modernbert",
49
+ "norm_bias": false,
50
+ "norm_eps": 1e-05,
51
+ "num_attention_heads": 12,
52
+ "num_hidden_layers": 22,
53
+ "pad_token_id": 50283,
54
+ "position_embedding_type": "absolute",
55
+ "problem_type": "single_label_classification",
56
+ "reference_compile": false,
57
+ "repad_logits_with_grad": false,
58
+ "sep_token_id": 50282,
59
+ "sparse_pred_ignore_index": -100,
60
+ "sparse_prediction": false,
61
+ "torch_dtype": "float32",
62
+ "transformers_version": "4.49.0",
63
+ "vocab_size": 50368
64
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30e26eae53294f869a6fc8ec17755be44d5146c76bb28992a4def088376ede79
3
+ size 598452088
trainer_state.json ADDED
@@ -0,0 +1,2253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6869426704202687,
3
+ "best_model_checkpoint": "/userstorage/modernbert-llm-grader/checkpoint-31216",
4
+ "epoch": 4.0,
5
+ "eval_steps": 500,
6
+ "global_step": 31216,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.012813941568426449,
13
+ "grad_norm": 5.321617603302002,
14
+ "learning_rate": 4.987186058431574e-05,
15
+ "loss": 1.4033,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.025627883136852898,
20
+ "grad_norm": 3.621730089187622,
21
+ "learning_rate": 4.974372116863147e-05,
22
+ "loss": 1.3035,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.03844182470527934,
27
+ "grad_norm": 7.95962381362915,
28
+ "learning_rate": 4.961558175294721e-05,
29
+ "loss": 1.2506,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.051255766273705795,
34
+ "grad_norm": 3.631398916244507,
35
+ "learning_rate": 4.9487442337262944e-05,
36
+ "loss": 1.2354,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.06406970784213224,
41
+ "grad_norm": 2.6680333614349365,
42
+ "learning_rate": 4.935930292157868e-05,
43
+ "loss": 1.2397,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.07688364941055868,
48
+ "grad_norm": 6.042360305786133,
49
+ "learning_rate": 4.9231163505894415e-05,
50
+ "loss": 1.1811,
51
+ "step": 600
52
+ },
53
+ {
54
+ "epoch": 0.08969759097898514,
55
+ "grad_norm": 6.7501959800720215,
56
+ "learning_rate": 4.9103024090210154e-05,
57
+ "loss": 1.1947,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 0.10251153254741159,
62
+ "grad_norm": 3.4089736938476562,
63
+ "learning_rate": 4.8974884674525886e-05,
64
+ "loss": 1.1812,
65
+ "step": 800
66
+ },
67
+ {
68
+ "epoch": 0.11532547411583803,
69
+ "grad_norm": 5.0775604248046875,
70
+ "learning_rate": 4.884674525884162e-05,
71
+ "loss": 1.1752,
72
+ "step": 900
73
+ },
74
+ {
75
+ "epoch": 0.12813941568426448,
76
+ "grad_norm": 4.529630184173584,
77
+ "learning_rate": 4.8718605843157357e-05,
78
+ "loss": 1.1867,
79
+ "step": 1000
80
+ },
81
+ {
82
+ "epoch": 0.14095335725269093,
83
+ "grad_norm": 4.961220741271973,
84
+ "learning_rate": 4.859046642747309e-05,
85
+ "loss": 1.2129,
86
+ "step": 1100
87
+ },
88
+ {
89
+ "epoch": 0.15376729882111737,
90
+ "grad_norm": 4.113813400268555,
91
+ "learning_rate": 4.846232701178883e-05,
92
+ "loss": 1.1293,
93
+ "step": 1200
94
+ },
95
+ {
96
+ "epoch": 0.16658124038954383,
97
+ "grad_norm": 7.25917387008667,
98
+ "learning_rate": 4.8334187596104566e-05,
99
+ "loss": 1.1008,
100
+ "step": 1300
101
+ },
102
+ {
103
+ "epoch": 0.17939518195797027,
104
+ "grad_norm": 5.579372882843018,
105
+ "learning_rate": 4.82060481804203e-05,
106
+ "loss": 1.1327,
107
+ "step": 1400
108
+ },
109
+ {
110
+ "epoch": 0.1922091235263967,
111
+ "grad_norm": 9.794898986816406,
112
+ "learning_rate": 4.807790876473604e-05,
113
+ "loss": 1.1355,
114
+ "step": 1500
115
+ },
116
+ {
117
+ "epoch": 0.20502306509482318,
118
+ "grad_norm": 9.875951766967773,
119
+ "learning_rate": 4.794976934905177e-05,
120
+ "loss": 1.0057,
121
+ "step": 1600
122
+ },
123
+ {
124
+ "epoch": 0.21783700666324962,
125
+ "grad_norm": 7.271333694458008,
126
+ "learning_rate": 4.782162993336751e-05,
127
+ "loss": 1.1101,
128
+ "step": 1700
129
+ },
130
+ {
131
+ "epoch": 0.23065094823167606,
132
+ "grad_norm": 6.730026721954346,
133
+ "learning_rate": 4.769349051768324e-05,
134
+ "loss": 1.0762,
135
+ "step": 1800
136
+ },
137
+ {
138
+ "epoch": 0.2434648898001025,
139
+ "grad_norm": 5.596224784851074,
140
+ "learning_rate": 4.756535110199898e-05,
141
+ "loss": 1.0413,
142
+ "step": 1900
143
+ },
144
+ {
145
+ "epoch": 0.25627883136852897,
146
+ "grad_norm": 4.591865539550781,
147
+ "learning_rate": 4.743721168631472e-05,
148
+ "loss": 1.0593,
149
+ "step": 2000
150
+ },
151
+ {
152
+ "epoch": 0.2690927729369554,
153
+ "grad_norm": 6.357232570648193,
154
+ "learning_rate": 4.730907227063045e-05,
155
+ "loss": 1.0434,
156
+ "step": 2100
157
+ },
158
+ {
159
+ "epoch": 0.28190671450538185,
160
+ "grad_norm": 5.185873508453369,
161
+ "learning_rate": 4.718093285494619e-05,
162
+ "loss": 1.021,
163
+ "step": 2200
164
+ },
165
+ {
166
+ "epoch": 0.2947206560738083,
167
+ "grad_norm": 8.19482135772705,
168
+ "learning_rate": 4.705279343926192e-05,
169
+ "loss": 1.1102,
170
+ "step": 2300
171
+ },
172
+ {
173
+ "epoch": 0.30753459764223473,
174
+ "grad_norm": 6.1499176025390625,
175
+ "learning_rate": 4.692465402357765e-05,
176
+ "loss": 1.0115,
177
+ "step": 2400
178
+ },
179
+ {
180
+ "epoch": 0.3203485392106612,
181
+ "grad_norm": 11.092570304870605,
182
+ "learning_rate": 4.679651460789339e-05,
183
+ "loss": 0.9576,
184
+ "step": 2500
185
+ },
186
+ {
187
+ "epoch": 0.33316248077908767,
188
+ "grad_norm": 5.10243034362793,
189
+ "learning_rate": 4.666837519220912e-05,
190
+ "loss": 1.0674,
191
+ "step": 2600
192
+ },
193
+ {
194
+ "epoch": 0.3459764223475141,
195
+ "grad_norm": 4.633431434631348,
196
+ "learning_rate": 4.654023577652486e-05,
197
+ "loss": 1.0004,
198
+ "step": 2700
199
+ },
200
+ {
201
+ "epoch": 0.35879036391594055,
202
+ "grad_norm": 5.507874488830566,
203
+ "learning_rate": 4.6412096360840594e-05,
204
+ "loss": 1.0439,
205
+ "step": 2800
206
+ },
207
+ {
208
+ "epoch": 0.371604305484367,
209
+ "grad_norm": 5.591798305511475,
210
+ "learning_rate": 4.628395694515633e-05,
211
+ "loss": 1.0509,
212
+ "step": 2900
213
+ },
214
+ {
215
+ "epoch": 0.3844182470527934,
216
+ "grad_norm": 4.341959476470947,
217
+ "learning_rate": 4.6155817529472065e-05,
218
+ "loss": 0.9864,
219
+ "step": 3000
220
+ },
221
+ {
222
+ "epoch": 0.3972321886212199,
223
+ "grad_norm": 3.6542482376098633,
224
+ "learning_rate": 4.6027678113787804e-05,
225
+ "loss": 0.9897,
226
+ "step": 3100
227
+ },
228
+ {
229
+ "epoch": 0.41004613018964636,
230
+ "grad_norm": 6.769758701324463,
231
+ "learning_rate": 4.589953869810354e-05,
232
+ "loss": 1.0528,
233
+ "step": 3200
234
+ },
235
+ {
236
+ "epoch": 0.4228600717580728,
237
+ "grad_norm": 5.762277603149414,
238
+ "learning_rate": 4.5771399282419274e-05,
239
+ "loss": 1.0036,
240
+ "step": 3300
241
+ },
242
+ {
243
+ "epoch": 0.43567401332649924,
244
+ "grad_norm": 7.389179229736328,
245
+ "learning_rate": 4.564325986673501e-05,
246
+ "loss": 1.0304,
247
+ "step": 3400
248
+ },
249
+ {
250
+ "epoch": 0.44848795489492566,
251
+ "grad_norm": 3.9039294719696045,
252
+ "learning_rate": 4.5515120451050745e-05,
253
+ "loss": 0.9962,
254
+ "step": 3500
255
+ },
256
+ {
257
+ "epoch": 0.4613018964633521,
258
+ "grad_norm": 3.0561447143554688,
259
+ "learning_rate": 4.5386981035366484e-05,
260
+ "loss": 0.9777,
261
+ "step": 3600
262
+ },
263
+ {
264
+ "epoch": 0.4741158380317786,
265
+ "grad_norm": 6.340303897857666,
266
+ "learning_rate": 4.5258841619682216e-05,
267
+ "loss": 0.9603,
268
+ "step": 3700
269
+ },
270
+ {
271
+ "epoch": 0.486929779600205,
272
+ "grad_norm": 9.058144569396973,
273
+ "learning_rate": 4.5130702203997955e-05,
274
+ "loss": 0.9658,
275
+ "step": 3800
276
+ },
277
+ {
278
+ "epoch": 0.49974372116863147,
279
+ "grad_norm": 8.219672203063965,
280
+ "learning_rate": 4.500256278831369e-05,
281
+ "loss": 0.9856,
282
+ "step": 3900
283
+ },
284
+ {
285
+ "epoch": 0.5125576627370579,
286
+ "grad_norm": 3.6466543674468994,
287
+ "learning_rate": 4.487442337262942e-05,
288
+ "loss": 0.9734,
289
+ "step": 4000
290
+ },
291
+ {
292
+ "epoch": 0.5253716043054844,
293
+ "grad_norm": 7.289781093597412,
294
+ "learning_rate": 4.474628395694516e-05,
295
+ "loss": 0.9045,
296
+ "step": 4100
297
+ },
298
+ {
299
+ "epoch": 0.5381855458739108,
300
+ "grad_norm": 6.18227481842041,
301
+ "learning_rate": 4.461814454126089e-05,
302
+ "loss": 0.9679,
303
+ "step": 4200
304
+ },
305
+ {
306
+ "epoch": 0.5509994874423373,
307
+ "grad_norm": 3.994476318359375,
308
+ "learning_rate": 4.449000512557663e-05,
309
+ "loss": 0.8958,
310
+ "step": 4300
311
+ },
312
+ {
313
+ "epoch": 0.5638134290107637,
314
+ "grad_norm": 3.913896322250366,
315
+ "learning_rate": 4.436186570989236e-05,
316
+ "loss": 0.9453,
317
+ "step": 4400
318
+ },
319
+ {
320
+ "epoch": 0.5766273705791901,
321
+ "grad_norm": 4.39192008972168,
322
+ "learning_rate": 4.42337262942081e-05,
323
+ "loss": 0.9132,
324
+ "step": 4500
325
+ },
326
+ {
327
+ "epoch": 0.5894413121476166,
328
+ "grad_norm": 5.574671745300293,
329
+ "learning_rate": 4.410558687852384e-05,
330
+ "loss": 0.9069,
331
+ "step": 4600
332
+ },
333
+ {
334
+ "epoch": 0.602255253716043,
335
+ "grad_norm": 4.218778610229492,
336
+ "learning_rate": 4.397744746283957e-05,
337
+ "loss": 0.9631,
338
+ "step": 4700
339
+ },
340
+ {
341
+ "epoch": 0.6150691952844695,
342
+ "grad_norm": 7.804980754852295,
343
+ "learning_rate": 4.384930804715531e-05,
344
+ "loss": 0.9121,
345
+ "step": 4800
346
+ },
347
+ {
348
+ "epoch": 0.627883136852896,
349
+ "grad_norm": 7.064172744750977,
350
+ "learning_rate": 4.372116863147104e-05,
351
+ "loss": 0.9387,
352
+ "step": 4900
353
+ },
354
+ {
355
+ "epoch": 0.6406970784213224,
356
+ "grad_norm": 5.293111324310303,
357
+ "learning_rate": 4.359302921578678e-05,
358
+ "loss": 0.9264,
359
+ "step": 5000
360
+ },
361
+ {
362
+ "epoch": 0.6535110199897488,
363
+ "grad_norm": 7.019448757171631,
364
+ "learning_rate": 4.346488980010251e-05,
365
+ "loss": 0.9452,
366
+ "step": 5100
367
+ },
368
+ {
369
+ "epoch": 0.6663249615581753,
370
+ "grad_norm": 6.714709758758545,
371
+ "learning_rate": 4.333675038441825e-05,
372
+ "loss": 0.8648,
373
+ "step": 5200
374
+ },
375
+ {
376
+ "epoch": 0.6791389031266017,
377
+ "grad_norm": 8.232748031616211,
378
+ "learning_rate": 4.320861096873399e-05,
379
+ "loss": 0.904,
380
+ "step": 5300
381
+ },
382
+ {
383
+ "epoch": 0.6919528446950282,
384
+ "grad_norm": 9.853933334350586,
385
+ "learning_rate": 4.308047155304972e-05,
386
+ "loss": 0.8895,
387
+ "step": 5400
388
+ },
389
+ {
390
+ "epoch": 0.7047667862634547,
391
+ "grad_norm": 6.8710455894470215,
392
+ "learning_rate": 4.2952332137365454e-05,
393
+ "loss": 0.86,
394
+ "step": 5500
395
+ },
396
+ {
397
+ "epoch": 0.7175807278318811,
398
+ "grad_norm": 6.45287561416626,
399
+ "learning_rate": 4.2824192721681186e-05,
400
+ "loss": 0.8718,
401
+ "step": 5600
402
+ },
403
+ {
404
+ "epoch": 0.7303946694003075,
405
+ "grad_norm": 5.772899627685547,
406
+ "learning_rate": 4.2696053305996924e-05,
407
+ "loss": 0.8477,
408
+ "step": 5700
409
+ },
410
+ {
411
+ "epoch": 0.743208610968734,
412
+ "grad_norm": 6.193540573120117,
413
+ "learning_rate": 4.256791389031266e-05,
414
+ "loss": 0.9184,
415
+ "step": 5800
416
+ },
417
+ {
418
+ "epoch": 0.7560225525371604,
419
+ "grad_norm": 2.5397393703460693,
420
+ "learning_rate": 4.2439774474628395e-05,
421
+ "loss": 0.9537,
422
+ "step": 5900
423
+ },
424
+ {
425
+ "epoch": 0.7688364941055869,
426
+ "grad_norm": 8.280569076538086,
427
+ "learning_rate": 4.2311635058944134e-05,
428
+ "loss": 0.8988,
429
+ "step": 6000
430
+ },
431
+ {
432
+ "epoch": 0.7816504356740134,
433
+ "grad_norm": 10.563502311706543,
434
+ "learning_rate": 4.2183495643259866e-05,
435
+ "loss": 0.8518,
436
+ "step": 6100
437
+ },
438
+ {
439
+ "epoch": 0.7944643772424398,
440
+ "grad_norm": 3.090008497238159,
441
+ "learning_rate": 4.2055356227575605e-05,
442
+ "loss": 0.8731,
443
+ "step": 6200
444
+ },
445
+ {
446
+ "epoch": 0.8072783188108662,
447
+ "grad_norm": 4.051167011260986,
448
+ "learning_rate": 4.192721681189134e-05,
449
+ "loss": 0.8713,
450
+ "step": 6300
451
+ },
452
+ {
453
+ "epoch": 0.8200922603792927,
454
+ "grad_norm": 7.207763671875,
455
+ "learning_rate": 4.1799077396207076e-05,
456
+ "loss": 0.8781,
457
+ "step": 6400
458
+ },
459
+ {
460
+ "epoch": 0.8329062019477191,
461
+ "grad_norm": 6.396823883056641,
462
+ "learning_rate": 4.1670937980522815e-05,
463
+ "loss": 0.8231,
464
+ "step": 6500
465
+ },
466
+ {
467
+ "epoch": 0.8457201435161456,
468
+ "grad_norm": 6.260582447052002,
469
+ "learning_rate": 4.1542798564838547e-05,
470
+ "loss": 0.8658,
471
+ "step": 6600
472
+ },
473
+ {
474
+ "epoch": 0.858534085084572,
475
+ "grad_norm": 8.35356616973877,
476
+ "learning_rate": 4.1414659149154285e-05,
477
+ "loss": 0.8637,
478
+ "step": 6700
479
+ },
480
+ {
481
+ "epoch": 0.8713480266529985,
482
+ "grad_norm": 7.236725330352783,
483
+ "learning_rate": 4.128651973347002e-05,
484
+ "loss": 0.8525,
485
+ "step": 6800
486
+ },
487
+ {
488
+ "epoch": 0.8841619682214249,
489
+ "grad_norm": 14.001522064208984,
490
+ "learning_rate": 4.1158380317785756e-05,
491
+ "loss": 0.8628,
492
+ "step": 6900
493
+ },
494
+ {
495
+ "epoch": 0.8969759097898513,
496
+ "grad_norm": 4.257541179656982,
497
+ "learning_rate": 4.103024090210149e-05,
498
+ "loss": 0.8443,
499
+ "step": 7000
500
+ },
501
+ {
502
+ "epoch": 0.9097898513582778,
503
+ "grad_norm": 5.065970420837402,
504
+ "learning_rate": 4.090210148641722e-05,
505
+ "loss": 0.8329,
506
+ "step": 7100
507
+ },
508
+ {
509
+ "epoch": 0.9226037929267042,
510
+ "grad_norm": 6.647068977355957,
511
+ "learning_rate": 4.077396207073296e-05,
512
+ "loss": 0.8585,
513
+ "step": 7200
514
+ },
515
+ {
516
+ "epoch": 0.9354177344951307,
517
+ "grad_norm": 8.440242767333984,
518
+ "learning_rate": 4.064582265504869e-05,
519
+ "loss": 0.8749,
520
+ "step": 7300
521
+ },
522
+ {
523
+ "epoch": 0.9482316760635572,
524
+ "grad_norm": 7.684078216552734,
525
+ "learning_rate": 4.051768323936443e-05,
526
+ "loss": 0.7771,
527
+ "step": 7400
528
+ },
529
+ {
530
+ "epoch": 0.9610456176319836,
531
+ "grad_norm": 6.4709577560424805,
532
+ "learning_rate": 4.038954382368016e-05,
533
+ "loss": 0.8597,
534
+ "step": 7500
535
+ },
536
+ {
537
+ "epoch": 0.97385955920041,
538
+ "grad_norm": 4.3970489501953125,
539
+ "learning_rate": 4.02614044079959e-05,
540
+ "loss": 0.7852,
541
+ "step": 7600
542
+ },
543
+ {
544
+ "epoch": 0.9866735007688365,
545
+ "grad_norm": 9.167794227600098,
546
+ "learning_rate": 4.013326499231164e-05,
547
+ "loss": 0.8563,
548
+ "step": 7700
549
+ },
550
+ {
551
+ "epoch": 0.9994874423372629,
552
+ "grad_norm": 6.251096248626709,
553
+ "learning_rate": 4.000512557662737e-05,
554
+ "loss": 0.8243,
555
+ "step": 7800
556
+ },
557
+ {
558
+ "epoch": 1.0,
559
+ "eval_f1": 0.640692076906927,
560
+ "eval_loss": 0.8794865608215332,
561
+ "eval_runtime": 744.6214,
562
+ "eval_samples_per_second": 10.48,
563
+ "eval_steps_per_second": 2.62,
564
+ "step": 7804
565
+ },
566
+ {
567
+ "epoch": 1.0123013839056894,
568
+ "grad_norm": 5.928829669952393,
569
+ "learning_rate": 3.987698616094311e-05,
570
+ "loss": 0.7046,
571
+ "step": 7900
572
+ },
573
+ {
574
+ "epoch": 1.0251153254741159,
575
+ "grad_norm": 2.885106086730957,
576
+ "learning_rate": 3.974884674525884e-05,
577
+ "loss": 0.7663,
578
+ "step": 8000
579
+ },
580
+ {
581
+ "epoch": 1.0379292670425422,
582
+ "grad_norm": 5.951350212097168,
583
+ "learning_rate": 3.962070732957458e-05,
584
+ "loss": 0.7374,
585
+ "step": 8100
586
+ },
587
+ {
588
+ "epoch": 1.0507432086109687,
589
+ "grad_norm": 2.5160486698150635,
590
+ "learning_rate": 3.949256791389031e-05,
591
+ "loss": 0.7126,
592
+ "step": 8200
593
+ },
594
+ {
595
+ "epoch": 1.0635571501793952,
596
+ "grad_norm": 6.847401142120361,
597
+ "learning_rate": 3.936442849820605e-05,
598
+ "loss": 0.6785,
599
+ "step": 8300
600
+ },
601
+ {
602
+ "epoch": 1.0763710917478215,
603
+ "grad_norm": 4.729136943817139,
604
+ "learning_rate": 3.923628908252179e-05,
605
+ "loss": 0.7085,
606
+ "step": 8400
607
+ },
608
+ {
609
+ "epoch": 1.089185033316248,
610
+ "grad_norm": 5.535890102386475,
611
+ "learning_rate": 3.910814966683752e-05,
612
+ "loss": 0.7548,
613
+ "step": 8500
614
+ },
615
+ {
616
+ "epoch": 1.1019989748846746,
617
+ "grad_norm": 6.188892364501953,
618
+ "learning_rate": 3.8980010251153255e-05,
619
+ "loss": 0.7193,
620
+ "step": 8600
621
+ },
622
+ {
623
+ "epoch": 1.1148129164531009,
624
+ "grad_norm": 5.806282997131348,
625
+ "learning_rate": 3.885187083546899e-05,
626
+ "loss": 0.7143,
627
+ "step": 8700
628
+ },
629
+ {
630
+ "epoch": 1.1276268580215274,
631
+ "grad_norm": 10.726571083068848,
632
+ "learning_rate": 3.8723731419784726e-05,
633
+ "loss": 0.6892,
634
+ "step": 8800
635
+ },
636
+ {
637
+ "epoch": 1.140440799589954,
638
+ "grad_norm": 7.0307512283325195,
639
+ "learning_rate": 3.8595592004100465e-05,
640
+ "loss": 0.7264,
641
+ "step": 8900
642
+ },
643
+ {
644
+ "epoch": 1.1532547411583802,
645
+ "grad_norm": 20.715412139892578,
646
+ "learning_rate": 3.8467452588416197e-05,
647
+ "loss": 0.6987,
648
+ "step": 9000
649
+ },
650
+ {
651
+ "epoch": 1.1660686827268067,
652
+ "grad_norm": 6.620629787445068,
653
+ "learning_rate": 3.8339313172731935e-05,
654
+ "loss": 0.7041,
655
+ "step": 9100
656
+ },
657
+ {
658
+ "epoch": 1.1788826242952333,
659
+ "grad_norm": 5.27125883102417,
660
+ "learning_rate": 3.821117375704767e-05,
661
+ "loss": 0.67,
662
+ "step": 9200
663
+ },
664
+ {
665
+ "epoch": 1.1916965658636596,
666
+ "grad_norm": 6.010765552520752,
667
+ "learning_rate": 3.8083034341363406e-05,
668
+ "loss": 0.6737,
669
+ "step": 9300
670
+ },
671
+ {
672
+ "epoch": 1.204510507432086,
673
+ "grad_norm": 14.393863677978516,
674
+ "learning_rate": 3.795489492567914e-05,
675
+ "loss": 0.7097,
676
+ "step": 9400
677
+ },
678
+ {
679
+ "epoch": 1.2173244490005126,
680
+ "grad_norm": 6.37823486328125,
681
+ "learning_rate": 3.782675550999488e-05,
682
+ "loss": 0.7157,
683
+ "step": 9500
684
+ },
685
+ {
686
+ "epoch": 1.230138390568939,
687
+ "grad_norm": 11.626152992248535,
688
+ "learning_rate": 3.7698616094310616e-05,
689
+ "loss": 0.7066,
690
+ "step": 9600
691
+ },
692
+ {
693
+ "epoch": 1.2429523321373654,
694
+ "grad_norm": 5.520190238952637,
695
+ "learning_rate": 3.757047667862635e-05,
696
+ "loss": 0.7303,
697
+ "step": 9700
698
+ },
699
+ {
700
+ "epoch": 1.255766273705792,
701
+ "grad_norm": 9.865089416503906,
702
+ "learning_rate": 3.744233726294209e-05,
703
+ "loss": 0.7559,
704
+ "step": 9800
705
+ },
706
+ {
707
+ "epoch": 1.2685802152742183,
708
+ "grad_norm": 7.075952529907227,
709
+ "learning_rate": 3.731419784725782e-05,
710
+ "loss": 0.6941,
711
+ "step": 9900
712
+ },
713
+ {
714
+ "epoch": 1.2813941568426448,
715
+ "grad_norm": 3.4892656803131104,
716
+ "learning_rate": 3.718605843157356e-05,
717
+ "loss": 0.7164,
718
+ "step": 10000
719
+ },
720
+ {
721
+ "epoch": 1.2942080984110713,
722
+ "grad_norm": 9.843413352966309,
723
+ "learning_rate": 3.705791901588929e-05,
724
+ "loss": 0.695,
725
+ "step": 10100
726
+ },
727
+ {
728
+ "epoch": 1.3070220399794976,
729
+ "grad_norm": 12.128110885620117,
730
+ "learning_rate": 3.692977960020502e-05,
731
+ "loss": 0.6563,
732
+ "step": 10200
733
+ },
734
+ {
735
+ "epoch": 1.3198359815479241,
736
+ "grad_norm": 11.26876163482666,
737
+ "learning_rate": 3.680164018452076e-05,
738
+ "loss": 0.6803,
739
+ "step": 10300
740
+ },
741
+ {
742
+ "epoch": 1.3326499231163507,
743
+ "grad_norm": 12.95758056640625,
744
+ "learning_rate": 3.667350076883649e-05,
745
+ "loss": 0.6864,
746
+ "step": 10400
747
+ },
748
+ {
749
+ "epoch": 1.345463864684777,
750
+ "grad_norm": 4.91602897644043,
751
+ "learning_rate": 3.654536135315223e-05,
752
+ "loss": 0.7184,
753
+ "step": 10500
754
+ },
755
+ {
756
+ "epoch": 1.3582778062532035,
757
+ "grad_norm": 4.799069881439209,
758
+ "learning_rate": 3.641722193746796e-05,
759
+ "loss": 0.7558,
760
+ "step": 10600
761
+ },
762
+ {
763
+ "epoch": 1.37109174782163,
764
+ "grad_norm": 64.9485855102539,
765
+ "learning_rate": 3.62890825217837e-05,
766
+ "loss": 0.7292,
767
+ "step": 10700
768
+ },
769
+ {
770
+ "epoch": 1.3839056893900563,
771
+ "grad_norm": 6.147428512573242,
772
+ "learning_rate": 3.616094310609944e-05,
773
+ "loss": 0.6623,
774
+ "step": 10800
775
+ },
776
+ {
777
+ "epoch": 1.3967196309584828,
778
+ "grad_norm": 7.638481140136719,
779
+ "learning_rate": 3.603280369041517e-05,
780
+ "loss": 0.6981,
781
+ "step": 10900
782
+ },
783
+ {
784
+ "epoch": 1.4095335725269091,
785
+ "grad_norm": 4.798500061035156,
786
+ "learning_rate": 3.590466427473091e-05,
787
+ "loss": 0.7569,
788
+ "step": 11000
789
+ },
790
+ {
791
+ "epoch": 1.4223475140953357,
792
+ "grad_norm": 4.413691520690918,
793
+ "learning_rate": 3.5776524859046644e-05,
794
+ "loss": 0.6391,
795
+ "step": 11100
796
+ },
797
+ {
798
+ "epoch": 1.4351614556637622,
799
+ "grad_norm": 6.2526421546936035,
800
+ "learning_rate": 3.564838544336238e-05,
801
+ "loss": 0.7045,
802
+ "step": 11200
803
+ },
804
+ {
805
+ "epoch": 1.4479753972321885,
806
+ "grad_norm": 6.3732805252075195,
807
+ "learning_rate": 3.5520246027678114e-05,
808
+ "loss": 0.6916,
809
+ "step": 11300
810
+ },
811
+ {
812
+ "epoch": 1.460789338800615,
813
+ "grad_norm": 25.24698829650879,
814
+ "learning_rate": 3.539210661199385e-05,
815
+ "loss": 0.7652,
816
+ "step": 11400
817
+ },
818
+ {
819
+ "epoch": 1.4736032803690415,
820
+ "grad_norm": 4.716599941253662,
821
+ "learning_rate": 3.5263967196309585e-05,
822
+ "loss": 0.7199,
823
+ "step": 11500
824
+ },
825
+ {
826
+ "epoch": 1.4864172219374678,
827
+ "grad_norm": 13.750917434692383,
828
+ "learning_rate": 3.5135827780625324e-05,
829
+ "loss": 0.7032,
830
+ "step": 11600
831
+ },
832
+ {
833
+ "epoch": 1.4992311635058944,
834
+ "grad_norm": 3.6678273677825928,
835
+ "learning_rate": 3.500768836494106e-05,
836
+ "loss": 0.6821,
837
+ "step": 11700
838
+ },
839
+ {
840
+ "epoch": 1.5120451050743209,
841
+ "grad_norm": 7.891080856323242,
842
+ "learning_rate": 3.487954894925679e-05,
843
+ "loss": 0.7301,
844
+ "step": 11800
845
+ },
846
+ {
847
+ "epoch": 1.5248590466427472,
848
+ "grad_norm": 3.25317645072937,
849
+ "learning_rate": 3.475140953357253e-05,
850
+ "loss": 0.6665,
851
+ "step": 11900
852
+ },
853
+ {
854
+ "epoch": 1.5376729882111737,
855
+ "grad_norm": 12.75395679473877,
856
+ "learning_rate": 3.462327011788826e-05,
857
+ "loss": 0.733,
858
+ "step": 12000
859
+ },
860
+ {
861
+ "epoch": 1.5504869297796002,
862
+ "grad_norm": 10.9820556640625,
863
+ "learning_rate": 3.4495130702204e-05,
864
+ "loss": 0.7064,
865
+ "step": 12100
866
+ },
867
+ {
868
+ "epoch": 1.5633008713480265,
869
+ "grad_norm": 6.558383941650391,
870
+ "learning_rate": 3.4366991286519737e-05,
871
+ "loss": 0.7105,
872
+ "step": 12200
873
+ },
874
+ {
875
+ "epoch": 1.576114812916453,
876
+ "grad_norm": 8.5501070022583,
877
+ "learning_rate": 3.423885187083547e-05,
878
+ "loss": 0.706,
879
+ "step": 12300
880
+ },
881
+ {
882
+ "epoch": 1.5889287544848796,
883
+ "grad_norm": 5.319694995880127,
884
+ "learning_rate": 3.411071245515121e-05,
885
+ "loss": 0.7239,
886
+ "step": 12400
887
+ },
888
+ {
889
+ "epoch": 1.6017426960533059,
890
+ "grad_norm": 5.92519474029541,
891
+ "learning_rate": 3.398257303946694e-05,
892
+ "loss": 0.7043,
893
+ "step": 12500
894
+ },
895
+ {
896
+ "epoch": 1.6145566376217324,
897
+ "grad_norm": 8.853275299072266,
898
+ "learning_rate": 3.385443362378268e-05,
899
+ "loss": 0.6831,
900
+ "step": 12600
901
+ },
902
+ {
903
+ "epoch": 1.627370579190159,
904
+ "grad_norm": 9.30588150024414,
905
+ "learning_rate": 3.372629420809841e-05,
906
+ "loss": 0.6756,
907
+ "step": 12700
908
+ },
909
+ {
910
+ "epoch": 1.6401845207585852,
911
+ "grad_norm": 5.903197288513184,
912
+ "learning_rate": 3.359815479241415e-05,
913
+ "loss": 0.725,
914
+ "step": 12800
915
+ },
916
+ {
917
+ "epoch": 1.6529984623270118,
918
+ "grad_norm": 5.500326156616211,
919
+ "learning_rate": 3.347001537672989e-05,
920
+ "loss": 0.6801,
921
+ "step": 12900
922
+ },
923
+ {
924
+ "epoch": 1.6658124038954383,
925
+ "grad_norm": 7.896096229553223,
926
+ "learning_rate": 3.334187596104562e-05,
927
+ "loss": 0.6975,
928
+ "step": 13000
929
+ },
930
+ {
931
+ "epoch": 1.6786263454638646,
932
+ "grad_norm": 6.674001216888428,
933
+ "learning_rate": 3.321373654536136e-05,
934
+ "loss": 0.6681,
935
+ "step": 13100
936
+ },
937
+ {
938
+ "epoch": 1.691440287032291,
939
+ "grad_norm": 21.74435806274414,
940
+ "learning_rate": 3.308559712967709e-05,
941
+ "loss": 0.7045,
942
+ "step": 13200
943
+ },
944
+ {
945
+ "epoch": 1.7042542286007176,
946
+ "grad_norm": 6.329532146453857,
947
+ "learning_rate": 3.295745771399282e-05,
948
+ "loss": 0.6885,
949
+ "step": 13300
950
+ },
951
+ {
952
+ "epoch": 1.717068170169144,
953
+ "grad_norm": 24.047470092773438,
954
+ "learning_rate": 3.282931829830856e-05,
955
+ "loss": 0.7003,
956
+ "step": 13400
957
+ },
958
+ {
959
+ "epoch": 1.7298821117375704,
960
+ "grad_norm": 7.407759666442871,
961
+ "learning_rate": 3.2701178882624294e-05,
962
+ "loss": 0.6856,
963
+ "step": 13500
964
+ },
965
+ {
966
+ "epoch": 1.742696053305997,
967
+ "grad_norm": 5.755215167999268,
968
+ "learning_rate": 3.257303946694003e-05,
969
+ "loss": 0.7005,
970
+ "step": 13600
971
+ },
972
+ {
973
+ "epoch": 1.7555099948744233,
974
+ "grad_norm": 11.444562911987305,
975
+ "learning_rate": 3.2444900051255764e-05,
976
+ "loss": 0.7136,
977
+ "step": 13700
978
+ },
979
+ {
980
+ "epoch": 1.7683239364428498,
981
+ "grad_norm": 8.267853736877441,
982
+ "learning_rate": 3.23167606355715e-05,
983
+ "loss": 0.7029,
984
+ "step": 13800
985
+ },
986
+ {
987
+ "epoch": 1.7811378780112763,
988
+ "grad_norm": 6.73785924911499,
989
+ "learning_rate": 3.2188621219887235e-05,
990
+ "loss": 0.6572,
991
+ "step": 13900
992
+ },
993
+ {
994
+ "epoch": 1.7939518195797026,
995
+ "grad_norm": 5.369395732879639,
996
+ "learning_rate": 3.2060481804202974e-05,
997
+ "loss": 0.6617,
998
+ "step": 14000
999
+ },
1000
+ {
1001
+ "epoch": 1.8067657611481291,
1002
+ "grad_norm": 2.288243293762207,
1003
+ "learning_rate": 3.193234238851871e-05,
1004
+ "loss": 0.6688,
1005
+ "step": 14100
1006
+ },
1007
+ {
1008
+ "epoch": 1.8195797027165557,
1009
+ "grad_norm": 14.942804336547852,
1010
+ "learning_rate": 3.1804202972834445e-05,
1011
+ "loss": 0.6792,
1012
+ "step": 14200
1013
+ },
1014
+ {
1015
+ "epoch": 1.832393644284982,
1016
+ "grad_norm": 8.988631248474121,
1017
+ "learning_rate": 3.1676063557150184e-05,
1018
+ "loss": 0.6518,
1019
+ "step": 14300
1020
+ },
1021
+ {
1022
+ "epoch": 1.8452075858534085,
1023
+ "grad_norm": 7.9590630531311035,
1024
+ "learning_rate": 3.1547924141465916e-05,
1025
+ "loss": 0.6503,
1026
+ "step": 14400
1027
+ },
1028
+ {
1029
+ "epoch": 1.858021527421835,
1030
+ "grad_norm": 9.33973503112793,
1031
+ "learning_rate": 3.1419784725781655e-05,
1032
+ "loss": 0.6647,
1033
+ "step": 14500
1034
+ },
1035
+ {
1036
+ "epoch": 1.8708354689902613,
1037
+ "grad_norm": 9.39842700958252,
1038
+ "learning_rate": 3.1291645310097387e-05,
1039
+ "loss": 0.6515,
1040
+ "step": 14600
1041
+ },
1042
+ {
1043
+ "epoch": 1.8836494105586878,
1044
+ "grad_norm": 10.142439842224121,
1045
+ "learning_rate": 3.1163505894413125e-05,
1046
+ "loss": 0.6794,
1047
+ "step": 14700
1048
+ },
1049
+ {
1050
+ "epoch": 1.8964633521271144,
1051
+ "grad_norm": 11.658042907714844,
1052
+ "learning_rate": 3.1035366478728864e-05,
1053
+ "loss": 0.6931,
1054
+ "step": 14800
1055
+ },
1056
+ {
1057
+ "epoch": 1.9092772936955407,
1058
+ "grad_norm": 8.672663688659668,
1059
+ "learning_rate": 3.090722706304459e-05,
1060
+ "loss": 0.6377,
1061
+ "step": 14900
1062
+ },
1063
+ {
1064
+ "epoch": 1.9220912352639672,
1065
+ "grad_norm": 6.620725631713867,
1066
+ "learning_rate": 3.077908764736033e-05,
1067
+ "loss": 0.7044,
1068
+ "step": 15000
1069
+ },
1070
+ {
1071
+ "epoch": 1.9349051768323937,
1072
+ "grad_norm": 8.3103609085083,
1073
+ "learning_rate": 3.065094823167606e-05,
1074
+ "loss": 0.641,
1075
+ "step": 15100
1076
+ },
1077
+ {
1078
+ "epoch": 1.94771911840082,
1079
+ "grad_norm": 8.163315773010254,
1080
+ "learning_rate": 3.05228088159918e-05,
1081
+ "loss": 0.7094,
1082
+ "step": 15200
1083
+ },
1084
+ {
1085
+ "epoch": 1.9605330599692465,
1086
+ "grad_norm": 3.6365621089935303,
1087
+ "learning_rate": 3.0394669400307534e-05,
1088
+ "loss": 0.7022,
1089
+ "step": 15300
1090
+ },
1091
+ {
1092
+ "epoch": 1.973347001537673,
1093
+ "grad_norm": 4.264801502227783,
1094
+ "learning_rate": 3.026652998462327e-05,
1095
+ "loss": 0.6833,
1096
+ "step": 15400
1097
+ },
1098
+ {
1099
+ "epoch": 1.9861609431060994,
1100
+ "grad_norm": 6.547428131103516,
1101
+ "learning_rate": 3.0138390568939005e-05,
1102
+ "loss": 0.6126,
1103
+ "step": 15500
1104
+ },
1105
+ {
1106
+ "epoch": 1.9989748846745259,
1107
+ "grad_norm": 6.155936241149902,
1108
+ "learning_rate": 3.0010251153254744e-05,
1109
+ "loss": 0.6851,
1110
+ "step": 15600
1111
+ },
1112
+ {
1113
+ "epoch": 2.0,
1114
+ "eval_f1": 0.6772664805551888,
1115
+ "eval_loss": 0.781230092048645,
1116
+ "eval_runtime": 778.3436,
1117
+ "eval_samples_per_second": 10.026,
1118
+ "eval_steps_per_second": 2.507,
1119
+ "step": 15608
1120
+ },
1121
+ {
1122
+ "epoch": 2.0117888262429524,
1123
+ "grad_norm": 8.777030944824219,
1124
+ "learning_rate": 2.988211173757048e-05,
1125
+ "loss": 0.4707,
1126
+ "step": 15700
1127
+ },
1128
+ {
1129
+ "epoch": 2.0246027678113787,
1130
+ "grad_norm": 4.798321723937988,
1131
+ "learning_rate": 2.9753972321886215e-05,
1132
+ "loss": 0.4366,
1133
+ "step": 15800
1134
+ },
1135
+ {
1136
+ "epoch": 2.037416709379805,
1137
+ "grad_norm": 2.5244762897491455,
1138
+ "learning_rate": 2.962583290620195e-05,
1139
+ "loss": 0.504,
1140
+ "step": 15900
1141
+ },
1142
+ {
1143
+ "epoch": 2.0502306509482318,
1144
+ "grad_norm": 15.636524200439453,
1145
+ "learning_rate": 2.9497693490517686e-05,
1146
+ "loss": 0.4234,
1147
+ "step": 16000
1148
+ },
1149
+ {
1150
+ "epoch": 2.063044592516658,
1151
+ "grad_norm": 8.811060905456543,
1152
+ "learning_rate": 2.936955407483342e-05,
1153
+ "loss": 0.3911,
1154
+ "step": 16100
1155
+ },
1156
+ {
1157
+ "epoch": 2.0758585340850844,
1158
+ "grad_norm": 4.1310930252075195,
1159
+ "learning_rate": 2.9241414659149157e-05,
1160
+ "loss": 0.4538,
1161
+ "step": 16200
1162
+ },
1163
+ {
1164
+ "epoch": 2.088672475653511,
1165
+ "grad_norm": 9.516937255859375,
1166
+ "learning_rate": 2.9113275243464892e-05,
1167
+ "loss": 0.4461,
1168
+ "step": 16300
1169
+ },
1170
+ {
1171
+ "epoch": 2.1014864172219374,
1172
+ "grad_norm": 4.6523756980896,
1173
+ "learning_rate": 2.8985135827780624e-05,
1174
+ "loss": 0.4808,
1175
+ "step": 16400
1176
+ },
1177
+ {
1178
+ "epoch": 2.1143003587903637,
1179
+ "grad_norm": 4.160647392272949,
1180
+ "learning_rate": 2.885699641209636e-05,
1181
+ "loss": 0.4879,
1182
+ "step": 16500
1183
+ },
1184
+ {
1185
+ "epoch": 2.1271143003587905,
1186
+ "grad_norm": 11.32701587677002,
1187
+ "learning_rate": 2.8728856996412095e-05,
1188
+ "loss": 0.4544,
1189
+ "step": 16600
1190
+ },
1191
+ {
1192
+ "epoch": 2.1399282419272168,
1193
+ "grad_norm": 4.703444004058838,
1194
+ "learning_rate": 2.860071758072783e-05,
1195
+ "loss": 0.466,
1196
+ "step": 16700
1197
+ },
1198
+ {
1199
+ "epoch": 2.152742183495643,
1200
+ "grad_norm": 8.985660552978516,
1201
+ "learning_rate": 2.847257816504357e-05,
1202
+ "loss": 0.4734,
1203
+ "step": 16800
1204
+ },
1205
+ {
1206
+ "epoch": 2.16555612506407,
1207
+ "grad_norm": 12.306890487670898,
1208
+ "learning_rate": 2.8344438749359304e-05,
1209
+ "loss": 0.4287,
1210
+ "step": 16900
1211
+ },
1212
+ {
1213
+ "epoch": 2.178370066632496,
1214
+ "grad_norm": 5.025609016418457,
1215
+ "learning_rate": 2.821629933367504e-05,
1216
+ "loss": 0.4657,
1217
+ "step": 17000
1218
+ },
1219
+ {
1220
+ "epoch": 2.1911840082009224,
1221
+ "grad_norm": 31.554025650024414,
1222
+ "learning_rate": 2.8088159917990775e-05,
1223
+ "loss": 0.4378,
1224
+ "step": 17100
1225
+ },
1226
+ {
1227
+ "epoch": 2.203997949769349,
1228
+ "grad_norm": 9.015434265136719,
1229
+ "learning_rate": 2.796002050230651e-05,
1230
+ "loss": 0.4538,
1231
+ "step": 17200
1232
+ },
1233
+ {
1234
+ "epoch": 2.2168118913377755,
1235
+ "grad_norm": 15.61099624633789,
1236
+ "learning_rate": 2.7831881086622246e-05,
1237
+ "loss": 0.4134,
1238
+ "step": 17300
1239
+ },
1240
+ {
1241
+ "epoch": 2.2296258329062018,
1242
+ "grad_norm": 10.191957473754883,
1243
+ "learning_rate": 2.770374167093798e-05,
1244
+ "loss": 0.5188,
1245
+ "step": 17400
1246
+ },
1247
+ {
1248
+ "epoch": 2.2424397744746285,
1249
+ "grad_norm": 2.2506730556488037,
1250
+ "learning_rate": 2.7575602255253717e-05,
1251
+ "loss": 0.4028,
1252
+ "step": 17500
1253
+ },
1254
+ {
1255
+ "epoch": 2.255253716043055,
1256
+ "grad_norm": 23.088764190673828,
1257
+ "learning_rate": 2.7447462839569456e-05,
1258
+ "loss": 0.4814,
1259
+ "step": 17600
1260
+ },
1261
+ {
1262
+ "epoch": 2.268067657611481,
1263
+ "grad_norm": 4.473659515380859,
1264
+ "learning_rate": 2.731932342388519e-05,
1265
+ "loss": 0.4822,
1266
+ "step": 17700
1267
+ },
1268
+ {
1269
+ "epoch": 2.280881599179908,
1270
+ "grad_norm": 2.1489970684051514,
1271
+ "learning_rate": 2.7191184008200927e-05,
1272
+ "loss": 0.4934,
1273
+ "step": 17800
1274
+ },
1275
+ {
1276
+ "epoch": 2.293695540748334,
1277
+ "grad_norm": 1.4255170822143555,
1278
+ "learning_rate": 2.7063044592516662e-05,
1279
+ "loss": 0.4314,
1280
+ "step": 17900
1281
+ },
1282
+ {
1283
+ "epoch": 2.3065094823167605,
1284
+ "grad_norm": 4.612204074859619,
1285
+ "learning_rate": 2.693490517683239e-05,
1286
+ "loss": 0.4322,
1287
+ "step": 18000
1288
+ },
1289
+ {
1290
+ "epoch": 2.319323423885187,
1291
+ "grad_norm": 3.1022679805755615,
1292
+ "learning_rate": 2.680676576114813e-05,
1293
+ "loss": 0.424,
1294
+ "step": 18100
1295
+ },
1296
+ {
1297
+ "epoch": 2.3321373654536135,
1298
+ "grad_norm": 3.745171070098877,
1299
+ "learning_rate": 2.6678626345463865e-05,
1300
+ "loss": 0.4269,
1301
+ "step": 18200
1302
+ },
1303
+ {
1304
+ "epoch": 2.34495130702204,
1305
+ "grad_norm": 4.0442328453063965,
1306
+ "learning_rate": 2.65504869297796e-05,
1307
+ "loss": 0.4698,
1308
+ "step": 18300
1309
+ },
1310
+ {
1311
+ "epoch": 2.3577652485904665,
1312
+ "grad_norm": 21.303607940673828,
1313
+ "learning_rate": 2.6422347514095336e-05,
1314
+ "loss": 0.4909,
1315
+ "step": 18400
1316
+ },
1317
+ {
1318
+ "epoch": 2.370579190158893,
1319
+ "grad_norm": 9.175422668457031,
1320
+ "learning_rate": 2.629420809841107e-05,
1321
+ "loss": 0.4598,
1322
+ "step": 18500
1323
+ },
1324
+ {
1325
+ "epoch": 2.383393131727319,
1326
+ "grad_norm": 5.787283420562744,
1327
+ "learning_rate": 2.6166068682726807e-05,
1328
+ "loss": 0.4409,
1329
+ "step": 18600
1330
+ },
1331
+ {
1332
+ "epoch": 2.396207073295746,
1333
+ "grad_norm": 7.338250637054443,
1334
+ "learning_rate": 2.6037929267042542e-05,
1335
+ "loss": 0.4157,
1336
+ "step": 18700
1337
+ },
1338
+ {
1339
+ "epoch": 2.409021014864172,
1340
+ "grad_norm": 13.879666328430176,
1341
+ "learning_rate": 2.590978985135828e-05,
1342
+ "loss": 0.4584,
1343
+ "step": 18800
1344
+ },
1345
+ {
1346
+ "epoch": 2.4218349564325985,
1347
+ "grad_norm": 9.484577178955078,
1348
+ "learning_rate": 2.5781650435674016e-05,
1349
+ "loss": 0.4914,
1350
+ "step": 18900
1351
+ },
1352
+ {
1353
+ "epoch": 2.4346488980010252,
1354
+ "grad_norm": 10.865300178527832,
1355
+ "learning_rate": 2.565351101998975e-05,
1356
+ "loss": 0.4259,
1357
+ "step": 19000
1358
+ },
1359
+ {
1360
+ "epoch": 2.4474628395694515,
1361
+ "grad_norm": 16.69988441467285,
1362
+ "learning_rate": 2.5525371604305487e-05,
1363
+ "loss": 0.4563,
1364
+ "step": 19100
1365
+ },
1366
+ {
1367
+ "epoch": 2.460276781137878,
1368
+ "grad_norm": 19.711631774902344,
1369
+ "learning_rate": 2.5397232188621222e-05,
1370
+ "loss": 0.4159,
1371
+ "step": 19200
1372
+ },
1373
+ {
1374
+ "epoch": 2.4730907227063046,
1375
+ "grad_norm": 13.3755521774292,
1376
+ "learning_rate": 2.5269092772936958e-05,
1377
+ "loss": 0.537,
1378
+ "step": 19300
1379
+ },
1380
+ {
1381
+ "epoch": 2.485904664274731,
1382
+ "grad_norm": 6.953076362609863,
1383
+ "learning_rate": 2.5140953357252693e-05,
1384
+ "loss": 0.4288,
1385
+ "step": 19400
1386
+ },
1387
+ {
1388
+ "epoch": 2.498718605843157,
1389
+ "grad_norm": 47.91322708129883,
1390
+ "learning_rate": 2.5012813941568432e-05,
1391
+ "loss": 0.5049,
1392
+ "step": 19500
1393
+ },
1394
+ {
1395
+ "epoch": 2.511532547411584,
1396
+ "grad_norm": 1.6553832292556763,
1397
+ "learning_rate": 2.4884674525884164e-05,
1398
+ "loss": 0.4779,
1399
+ "step": 19600
1400
+ },
1401
+ {
1402
+ "epoch": 2.5243464889800102,
1403
+ "grad_norm": 12.199808120727539,
1404
+ "learning_rate": 2.47565351101999e-05,
1405
+ "loss": 0.4246,
1406
+ "step": 19700
1407
+ },
1408
+ {
1409
+ "epoch": 2.5371604305484365,
1410
+ "grad_norm": 11.326825141906738,
1411
+ "learning_rate": 2.4628395694515635e-05,
1412
+ "loss": 0.4482,
1413
+ "step": 19800
1414
+ },
1415
+ {
1416
+ "epoch": 2.5499743721168633,
1417
+ "grad_norm": 9.247246742248535,
1418
+ "learning_rate": 2.450025627883137e-05,
1419
+ "loss": 0.4656,
1420
+ "step": 19900
1421
+ },
1422
+ {
1423
+ "epoch": 2.5627883136852896,
1424
+ "grad_norm": 1.773540735244751,
1425
+ "learning_rate": 2.4372116863147106e-05,
1426
+ "loss": 0.4776,
1427
+ "step": 20000
1428
+ },
1429
+ {
1430
+ "epoch": 2.575602255253716,
1431
+ "grad_norm": 7.454749584197998,
1432
+ "learning_rate": 2.424397744746284e-05,
1433
+ "loss": 0.4161,
1434
+ "step": 20100
1435
+ },
1436
+ {
1437
+ "epoch": 2.5884161968221426,
1438
+ "grad_norm": 19.77891731262207,
1439
+ "learning_rate": 2.4115838031778577e-05,
1440
+ "loss": 0.4609,
1441
+ "step": 20200
1442
+ },
1443
+ {
1444
+ "epoch": 2.601230138390569,
1445
+ "grad_norm": 12.208200454711914,
1446
+ "learning_rate": 2.3987698616094312e-05,
1447
+ "loss": 0.453,
1448
+ "step": 20300
1449
+ },
1450
+ {
1451
+ "epoch": 2.6140440799589952,
1452
+ "grad_norm": 11.438812255859375,
1453
+ "learning_rate": 2.3859559200410047e-05,
1454
+ "loss": 0.4439,
1455
+ "step": 20400
1456
+ },
1457
+ {
1458
+ "epoch": 2.626858021527422,
1459
+ "grad_norm": 1.6863147020339966,
1460
+ "learning_rate": 2.3731419784725783e-05,
1461
+ "loss": 0.3987,
1462
+ "step": 20500
1463
+ },
1464
+ {
1465
+ "epoch": 2.6396719630958483,
1466
+ "grad_norm": 1.3637946844100952,
1467
+ "learning_rate": 2.3603280369041518e-05,
1468
+ "loss": 0.4523,
1469
+ "step": 20600
1470
+ },
1471
+ {
1472
+ "epoch": 2.6524859046642746,
1473
+ "grad_norm": 21.555208206176758,
1474
+ "learning_rate": 2.3475140953357254e-05,
1475
+ "loss": 0.4624,
1476
+ "step": 20700
1477
+ },
1478
+ {
1479
+ "epoch": 2.6652998462327013,
1480
+ "grad_norm": 8.768684387207031,
1481
+ "learning_rate": 2.334700153767299e-05,
1482
+ "loss": 0.4585,
1483
+ "step": 20800
1484
+ },
1485
+ {
1486
+ "epoch": 2.6781137878011276,
1487
+ "grad_norm": 3.2959704399108887,
1488
+ "learning_rate": 2.3218862121988724e-05,
1489
+ "loss": 0.4579,
1490
+ "step": 20900
1491
+ },
1492
+ {
1493
+ "epoch": 2.690927729369554,
1494
+ "grad_norm": 16.97565269470215,
1495
+ "learning_rate": 2.309072270630446e-05,
1496
+ "loss": 0.4132,
1497
+ "step": 21000
1498
+ },
1499
+ {
1500
+ "epoch": 2.7037416709379807,
1501
+ "grad_norm": 14.613641738891602,
1502
+ "learning_rate": 2.2962583290620195e-05,
1503
+ "loss": 0.4297,
1504
+ "step": 21100
1505
+ },
1506
+ {
1507
+ "epoch": 2.716555612506407,
1508
+ "grad_norm": 28.61090087890625,
1509
+ "learning_rate": 2.283444387493593e-05,
1510
+ "loss": 0.4479,
1511
+ "step": 21200
1512
+ },
1513
+ {
1514
+ "epoch": 2.7293695540748333,
1515
+ "grad_norm": 9.84257984161377,
1516
+ "learning_rate": 2.2706304459251666e-05,
1517
+ "loss": 0.4428,
1518
+ "step": 21300
1519
+ },
1520
+ {
1521
+ "epoch": 2.74218349564326,
1522
+ "grad_norm": 8.199345588684082,
1523
+ "learning_rate": 2.2578165043567405e-05,
1524
+ "loss": 0.3999,
1525
+ "step": 21400
1526
+ },
1527
+ {
1528
+ "epoch": 2.7549974372116863,
1529
+ "grad_norm": 15.411248207092285,
1530
+ "learning_rate": 2.2450025627883137e-05,
1531
+ "loss": 0.4423,
1532
+ "step": 21500
1533
+ },
1534
+ {
1535
+ "epoch": 2.7678113787801126,
1536
+ "grad_norm": 7.122200012207031,
1537
+ "learning_rate": 2.2321886212198872e-05,
1538
+ "loss": 0.4675,
1539
+ "step": 21600
1540
+ },
1541
+ {
1542
+ "epoch": 2.7806253203485394,
1543
+ "grad_norm": 11.358266830444336,
1544
+ "learning_rate": 2.2193746796514608e-05,
1545
+ "loss": 0.4885,
1546
+ "step": 21700
1547
+ },
1548
+ {
1549
+ "epoch": 2.7934392619169657,
1550
+ "grad_norm": 9.456644058227539,
1551
+ "learning_rate": 2.2065607380830343e-05,
1552
+ "loss": 0.4973,
1553
+ "step": 21800
1554
+ },
1555
+ {
1556
+ "epoch": 2.806253203485392,
1557
+ "grad_norm": 28.7235164642334,
1558
+ "learning_rate": 2.193746796514608e-05,
1559
+ "loss": 0.429,
1560
+ "step": 21900
1561
+ },
1562
+ {
1563
+ "epoch": 2.8190671450538183,
1564
+ "grad_norm": 14.859136581420898,
1565
+ "learning_rate": 2.1809328549461817e-05,
1566
+ "loss": 0.4867,
1567
+ "step": 22000
1568
+ },
1569
+ {
1570
+ "epoch": 2.831881086622245,
1571
+ "grad_norm": 3.089897394180298,
1572
+ "learning_rate": 2.1681189133777553e-05,
1573
+ "loss": 0.4249,
1574
+ "step": 22100
1575
+ },
1576
+ {
1577
+ "epoch": 2.8446950281906713,
1578
+ "grad_norm": 14.606719970703125,
1579
+ "learning_rate": 2.1553049718093288e-05,
1580
+ "loss": 0.4429,
1581
+ "step": 22200
1582
+ },
1583
+ {
1584
+ "epoch": 2.857508969759098,
1585
+ "grad_norm": 7.761451244354248,
1586
+ "learning_rate": 2.142491030240902e-05,
1587
+ "loss": 0.4639,
1588
+ "step": 22300
1589
+ },
1590
+ {
1591
+ "epoch": 2.8703229113275244,
1592
+ "grad_norm": 6.9101362228393555,
1593
+ "learning_rate": 2.1296770886724756e-05,
1594
+ "loss": 0.4606,
1595
+ "step": 22400
1596
+ },
1597
+ {
1598
+ "epoch": 2.8831368528959507,
1599
+ "grad_norm": 6.754969120025635,
1600
+ "learning_rate": 2.116863147104049e-05,
1601
+ "loss": 0.4784,
1602
+ "step": 22500
1603
+ },
1604
+ {
1605
+ "epoch": 2.895950794464377,
1606
+ "grad_norm": 20.884119033813477,
1607
+ "learning_rate": 2.104049205535623e-05,
1608
+ "loss": 0.4625,
1609
+ "step": 22600
1610
+ },
1611
+ {
1612
+ "epoch": 2.9087647360328037,
1613
+ "grad_norm": 18.428529739379883,
1614
+ "learning_rate": 2.0912352639671965e-05,
1615
+ "loss": 0.4121,
1616
+ "step": 22700
1617
+ },
1618
+ {
1619
+ "epoch": 2.92157867760123,
1620
+ "grad_norm": 9.211915969848633,
1621
+ "learning_rate": 2.07842132239877e-05,
1622
+ "loss": 0.457,
1623
+ "step": 22800
1624
+ },
1625
+ {
1626
+ "epoch": 2.9343926191696568,
1627
+ "grad_norm": 5.744906425476074,
1628
+ "learning_rate": 2.0656073808303436e-05,
1629
+ "loss": 0.4169,
1630
+ "step": 22900
1631
+ },
1632
+ {
1633
+ "epoch": 2.947206560738083,
1634
+ "grad_norm": 10.679366111755371,
1635
+ "learning_rate": 2.052793439261917e-05,
1636
+ "loss": 0.4719,
1637
+ "step": 23000
1638
+ },
1639
+ {
1640
+ "epoch": 2.9600205023065094,
1641
+ "grad_norm": 8.72630500793457,
1642
+ "learning_rate": 2.0399794976934904e-05,
1643
+ "loss": 0.4743,
1644
+ "step": 23100
1645
+ },
1646
+ {
1647
+ "epoch": 2.9728344438749357,
1648
+ "grad_norm": 5.53284215927124,
1649
+ "learning_rate": 2.0271655561250642e-05,
1650
+ "loss": 0.4592,
1651
+ "step": 23200
1652
+ },
1653
+ {
1654
+ "epoch": 2.9856483854433624,
1655
+ "grad_norm": 10.75283432006836,
1656
+ "learning_rate": 2.0143516145566378e-05,
1657
+ "loss": 0.3971,
1658
+ "step": 23300
1659
+ },
1660
+ {
1661
+ "epoch": 2.9984623270117887,
1662
+ "grad_norm": 10.634764671325684,
1663
+ "learning_rate": 2.0015376729882113e-05,
1664
+ "loss": 0.4295,
1665
+ "step": 23400
1666
+ },
1667
+ {
1668
+ "epoch": 3.0,
1669
+ "eval_f1": 0.6853715205850849,
1670
+ "eval_loss": 1.0191140174865723,
1671
+ "eval_runtime": 837.6905,
1672
+ "eval_samples_per_second": 9.316,
1673
+ "eval_steps_per_second": 2.329,
1674
+ "step": 23412
1675
+ },
1676
+ {
1677
+ "epoch": 3.0112762685802155,
1678
+ "grad_norm": 3.43902587890625,
1679
+ "learning_rate": 1.988723731419785e-05,
1680
+ "loss": 0.2448,
1681
+ "step": 23500
1682
+ },
1683
+ {
1684
+ "epoch": 3.0240902101486418,
1685
+ "grad_norm": 0.5649552941322327,
1686
+ "learning_rate": 1.9759097898513584e-05,
1687
+ "loss": 0.1908,
1688
+ "step": 23600
1689
+ },
1690
+ {
1691
+ "epoch": 3.036904151717068,
1692
+ "grad_norm": 1.3035610914230347,
1693
+ "learning_rate": 1.963095848282932e-05,
1694
+ "loss": 0.275,
1695
+ "step": 23700
1696
+ },
1697
+ {
1698
+ "epoch": 3.049718093285495,
1699
+ "grad_norm": 27.42232322692871,
1700
+ "learning_rate": 1.9502819067145055e-05,
1701
+ "loss": 0.2727,
1702
+ "step": 23800
1703
+ },
1704
+ {
1705
+ "epoch": 3.062532034853921,
1706
+ "grad_norm": 1.675907015800476,
1707
+ "learning_rate": 1.937467965146079e-05,
1708
+ "loss": 0.2916,
1709
+ "step": 23900
1710
+ },
1711
+ {
1712
+ "epoch": 3.0753459764223474,
1713
+ "grad_norm": 9.602179527282715,
1714
+ "learning_rate": 1.9246540235776526e-05,
1715
+ "loss": 0.2645,
1716
+ "step": 24000
1717
+ },
1718
+ {
1719
+ "epoch": 3.088159917990774,
1720
+ "grad_norm": 16.757831573486328,
1721
+ "learning_rate": 1.911840082009226e-05,
1722
+ "loss": 0.2476,
1723
+ "step": 24100
1724
+ },
1725
+ {
1726
+ "epoch": 3.1009738595592005,
1727
+ "grad_norm": 5.842043876647949,
1728
+ "learning_rate": 1.8990261404407997e-05,
1729
+ "loss": 0.2829,
1730
+ "step": 24200
1731
+ },
1732
+ {
1733
+ "epoch": 3.1137878011276268,
1734
+ "grad_norm": 0.593449592590332,
1735
+ "learning_rate": 1.8862121988723732e-05,
1736
+ "loss": 0.289,
1737
+ "step": 24300
1738
+ },
1739
+ {
1740
+ "epoch": 3.1266017426960535,
1741
+ "grad_norm": 5.712982177734375,
1742
+ "learning_rate": 1.8733982573039467e-05,
1743
+ "loss": 0.2355,
1744
+ "step": 24400
1745
+ },
1746
+ {
1747
+ "epoch": 3.13941568426448,
1748
+ "grad_norm": 0.3152589201927185,
1749
+ "learning_rate": 1.8605843157355203e-05,
1750
+ "loss": 0.2491,
1751
+ "step": 24500
1752
+ },
1753
+ {
1754
+ "epoch": 3.152229625832906,
1755
+ "grad_norm": 19.951833724975586,
1756
+ "learning_rate": 1.8477703741670938e-05,
1757
+ "loss": 0.271,
1758
+ "step": 24600
1759
+ },
1760
+ {
1761
+ "epoch": 3.165043567401333,
1762
+ "grad_norm": 5.257028579711914,
1763
+ "learning_rate": 1.8349564325986674e-05,
1764
+ "loss": 0.277,
1765
+ "step": 24700
1766
+ },
1767
+ {
1768
+ "epoch": 3.177857508969759,
1769
+ "grad_norm": 3.6717381477355957,
1770
+ "learning_rate": 1.822142491030241e-05,
1771
+ "loss": 0.2736,
1772
+ "step": 24800
1773
+ },
1774
+ {
1775
+ "epoch": 3.1906714505381855,
1776
+ "grad_norm": 38.49631881713867,
1777
+ "learning_rate": 1.8093285494618144e-05,
1778
+ "loss": 0.2789,
1779
+ "step": 24900
1780
+ },
1781
+ {
1782
+ "epoch": 3.2034853921066118,
1783
+ "grad_norm": 5.944704055786133,
1784
+ "learning_rate": 1.796514607893388e-05,
1785
+ "loss": 0.3111,
1786
+ "step": 25000
1787
+ },
1788
+ {
1789
+ "epoch": 3.2162993336750385,
1790
+ "grad_norm": 3.278078079223633,
1791
+ "learning_rate": 1.7837006663249615e-05,
1792
+ "loss": 0.287,
1793
+ "step": 25100
1794
+ },
1795
+ {
1796
+ "epoch": 3.229113275243465,
1797
+ "grad_norm": 13.320869445800781,
1798
+ "learning_rate": 1.7708867247565354e-05,
1799
+ "loss": 0.2708,
1800
+ "step": 25200
1801
+ },
1802
+ {
1803
+ "epoch": 3.2419272168118916,
1804
+ "grad_norm": 9.01321029663086,
1805
+ "learning_rate": 1.758072783188109e-05,
1806
+ "loss": 0.2891,
1807
+ "step": 25300
1808
+ },
1809
+ {
1810
+ "epoch": 3.254741158380318,
1811
+ "grad_norm": 14.35201644897461,
1812
+ "learning_rate": 1.745258841619682e-05,
1813
+ "loss": 0.1523,
1814
+ "step": 25400
1815
+ },
1816
+ {
1817
+ "epoch": 3.267555099948744,
1818
+ "grad_norm": 5.268370628356934,
1819
+ "learning_rate": 1.7324449000512557e-05,
1820
+ "loss": 0.3608,
1821
+ "step": 25500
1822
+ },
1823
+ {
1824
+ "epoch": 3.2803690415171705,
1825
+ "grad_norm": 3.338168144226074,
1826
+ "learning_rate": 1.7196309584828292e-05,
1827
+ "loss": 0.2829,
1828
+ "step": 25600
1829
+ },
1830
+ {
1831
+ "epoch": 3.293182983085597,
1832
+ "grad_norm": 12.441572189331055,
1833
+ "learning_rate": 1.7068170169144028e-05,
1834
+ "loss": 0.2563,
1835
+ "step": 25700
1836
+ },
1837
+ {
1838
+ "epoch": 3.3059969246540235,
1839
+ "grad_norm": 2.870978832244873,
1840
+ "learning_rate": 1.6940030753459767e-05,
1841
+ "loss": 0.2957,
1842
+ "step": 25800
1843
+ },
1844
+ {
1845
+ "epoch": 3.3188108662224503,
1846
+ "grad_norm": 10.626642227172852,
1847
+ "learning_rate": 1.6811891337775502e-05,
1848
+ "loss": 0.3493,
1849
+ "step": 25900
1850
+ },
1851
+ {
1852
+ "epoch": 3.3316248077908766,
1853
+ "grad_norm": 1.1796225309371948,
1854
+ "learning_rate": 1.6683751922091237e-05,
1855
+ "loss": 0.293,
1856
+ "step": 26000
1857
+ },
1858
+ {
1859
+ "epoch": 3.344438749359303,
1860
+ "grad_norm": 46.64753341674805,
1861
+ "learning_rate": 1.6555612506406973e-05,
1862
+ "loss": 0.2739,
1863
+ "step": 26100
1864
+ },
1865
+ {
1866
+ "epoch": 3.357252690927729,
1867
+ "grad_norm": 17.778207778930664,
1868
+ "learning_rate": 1.6427473090722705e-05,
1869
+ "loss": 0.2897,
1870
+ "step": 26200
1871
+ },
1872
+ {
1873
+ "epoch": 3.370066632496156,
1874
+ "grad_norm": 1.6698403358459473,
1875
+ "learning_rate": 1.629933367503844e-05,
1876
+ "loss": 0.2661,
1877
+ "step": 26300
1878
+ },
1879
+ {
1880
+ "epoch": 3.382880574064582,
1881
+ "grad_norm": 0.18206116557121277,
1882
+ "learning_rate": 1.617119425935418e-05,
1883
+ "loss": 0.2847,
1884
+ "step": 26400
1885
+ },
1886
+ {
1887
+ "epoch": 3.395694515633009,
1888
+ "grad_norm": 6.839690208435059,
1889
+ "learning_rate": 1.6043054843669915e-05,
1890
+ "loss": 0.3044,
1891
+ "step": 26500
1892
+ },
1893
+ {
1894
+ "epoch": 3.4085084572014352,
1895
+ "grad_norm": 0.6313930749893188,
1896
+ "learning_rate": 1.591491542798565e-05,
1897
+ "loss": 0.2623,
1898
+ "step": 26600
1899
+ },
1900
+ {
1901
+ "epoch": 3.4213223987698616,
1902
+ "grad_norm": 70.23905181884766,
1903
+ "learning_rate": 1.5786776012301385e-05,
1904
+ "loss": 0.2573,
1905
+ "step": 26700
1906
+ },
1907
+ {
1908
+ "epoch": 3.434136340338288,
1909
+ "grad_norm": 16.72913360595703,
1910
+ "learning_rate": 1.565863659661712e-05,
1911
+ "loss": 0.2626,
1912
+ "step": 26800
1913
+ },
1914
+ {
1915
+ "epoch": 3.4469502819067146,
1916
+ "grad_norm": 43.662845611572266,
1917
+ "learning_rate": 1.5530497180932856e-05,
1918
+ "loss": 0.2679,
1919
+ "step": 26900
1920
+ },
1921
+ {
1922
+ "epoch": 3.459764223475141,
1923
+ "grad_norm": 20.96466064453125,
1924
+ "learning_rate": 1.540235776524859e-05,
1925
+ "loss": 0.3082,
1926
+ "step": 27000
1927
+ },
1928
+ {
1929
+ "epoch": 3.4725781650435676,
1930
+ "grad_norm": 45.02407455444336,
1931
+ "learning_rate": 1.5274218349564327e-05,
1932
+ "loss": 0.2492,
1933
+ "step": 27100
1934
+ },
1935
+ {
1936
+ "epoch": 3.485392106611994,
1937
+ "grad_norm": 14.404077529907227,
1938
+ "learning_rate": 1.5146078933880062e-05,
1939
+ "loss": 0.2704,
1940
+ "step": 27200
1941
+ },
1942
+ {
1943
+ "epoch": 3.4982060481804202,
1944
+ "grad_norm": 19.40283966064453,
1945
+ "learning_rate": 1.5017939518195798e-05,
1946
+ "loss": 0.3089,
1947
+ "step": 27300
1948
+ },
1949
+ {
1950
+ "epoch": 3.5110199897488465,
1951
+ "grad_norm": 13.016902923583984,
1952
+ "learning_rate": 1.4889800102511533e-05,
1953
+ "loss": 0.2953,
1954
+ "step": 27400
1955
+ },
1956
+ {
1957
+ "epoch": 3.5238339313172733,
1958
+ "grad_norm": 6.934922695159912,
1959
+ "learning_rate": 1.4761660686827269e-05,
1960
+ "loss": 0.2132,
1961
+ "step": 27500
1962
+ },
1963
+ {
1964
+ "epoch": 3.5366478728856996,
1965
+ "grad_norm": 49.58895492553711,
1966
+ "learning_rate": 1.4633521271143006e-05,
1967
+ "loss": 0.271,
1968
+ "step": 27600
1969
+ },
1970
+ {
1971
+ "epoch": 3.5494618144541263,
1972
+ "grad_norm": 4.814508438110352,
1973
+ "learning_rate": 1.4505381855458741e-05,
1974
+ "loss": 0.3195,
1975
+ "step": 27700
1976
+ },
1977
+ {
1978
+ "epoch": 3.5622757560225526,
1979
+ "grad_norm": 28.65342903137207,
1980
+ "learning_rate": 1.4377242439774475e-05,
1981
+ "loss": 0.2869,
1982
+ "step": 27800
1983
+ },
1984
+ {
1985
+ "epoch": 3.575089697590979,
1986
+ "grad_norm": 5.931487083435059,
1987
+ "learning_rate": 1.424910302409021e-05,
1988
+ "loss": 0.2982,
1989
+ "step": 27900
1990
+ },
1991
+ {
1992
+ "epoch": 3.5879036391594052,
1993
+ "grad_norm": 0.22432470321655273,
1994
+ "learning_rate": 1.4120963608405946e-05,
1995
+ "loss": 0.3167,
1996
+ "step": 28000
1997
+ },
1998
+ {
1999
+ "epoch": 3.600717580727832,
2000
+ "grad_norm": 27.89299964904785,
2001
+ "learning_rate": 1.3992824192721681e-05,
2002
+ "loss": 0.2831,
2003
+ "step": 28100
2004
+ },
2005
+ {
2006
+ "epoch": 3.6135315222962583,
2007
+ "grad_norm": 6.232203006744385,
2008
+ "learning_rate": 1.3864684777037418e-05,
2009
+ "loss": 0.2328,
2010
+ "step": 28200
2011
+ },
2012
+ {
2013
+ "epoch": 3.626345463864685,
2014
+ "grad_norm": 0.3798358738422394,
2015
+ "learning_rate": 1.3736545361353154e-05,
2016
+ "loss": 0.2565,
2017
+ "step": 28300
2018
+ },
2019
+ {
2020
+ "epoch": 3.6391594054331113,
2021
+ "grad_norm": 2.3177566528320312,
2022
+ "learning_rate": 1.3608405945668889e-05,
2023
+ "loss": 0.2822,
2024
+ "step": 28400
2025
+ },
2026
+ {
2027
+ "epoch": 3.6519733470015376,
2028
+ "grad_norm": 0.9287611246109009,
2029
+ "learning_rate": 1.3480266529984623e-05,
2030
+ "loss": 0.2206,
2031
+ "step": 28500
2032
+ },
2033
+ {
2034
+ "epoch": 3.664787288569964,
2035
+ "grad_norm": 19.89398765563965,
2036
+ "learning_rate": 1.3352127114300358e-05,
2037
+ "loss": 0.2934,
2038
+ "step": 28600
2039
+ },
2040
+ {
2041
+ "epoch": 3.6776012301383907,
2042
+ "grad_norm": 14.735712051391602,
2043
+ "learning_rate": 1.3223987698616094e-05,
2044
+ "loss": 0.2667,
2045
+ "step": 28700
2046
+ },
2047
+ {
2048
+ "epoch": 3.690415171706817,
2049
+ "grad_norm": 2.782954454421997,
2050
+ "learning_rate": 1.309584828293183e-05,
2051
+ "loss": 0.2565,
2052
+ "step": 28800
2053
+ },
2054
+ {
2055
+ "epoch": 3.7032291132752437,
2056
+ "grad_norm": 20.082395553588867,
2057
+ "learning_rate": 1.2967708867247566e-05,
2058
+ "loss": 0.3069,
2059
+ "step": 28900
2060
+ },
2061
+ {
2062
+ "epoch": 3.71604305484367,
2063
+ "grad_norm": 1.8632967472076416,
2064
+ "learning_rate": 1.2839569451563302e-05,
2065
+ "loss": 0.2484,
2066
+ "step": 29000
2067
+ },
2068
+ {
2069
+ "epoch": 3.7288569964120963,
2070
+ "grad_norm": 6.2880330085754395,
2071
+ "learning_rate": 1.2711430035879037e-05,
2072
+ "loss": 0.2769,
2073
+ "step": 29100
2074
+ },
2075
+ {
2076
+ "epoch": 3.7416709379805226,
2077
+ "grad_norm": 18.328922271728516,
2078
+ "learning_rate": 1.2583290620194774e-05,
2079
+ "loss": 0.284,
2080
+ "step": 29200
2081
+ },
2082
+ {
2083
+ "epoch": 3.7544848795489494,
2084
+ "grad_norm": 0.2658964991569519,
2085
+ "learning_rate": 1.2455151204510508e-05,
2086
+ "loss": 0.2725,
2087
+ "step": 29300
2088
+ },
2089
+ {
2090
+ "epoch": 3.7672988211173757,
2091
+ "grad_norm": 7.819123268127441,
2092
+ "learning_rate": 1.2327011788826243e-05,
2093
+ "loss": 0.2513,
2094
+ "step": 29400
2095
+ },
2096
+ {
2097
+ "epoch": 3.7801127626858024,
2098
+ "grad_norm": 4.6279144287109375,
2099
+ "learning_rate": 1.2198872373141979e-05,
2100
+ "loss": 0.2573,
2101
+ "step": 29500
2102
+ },
2103
+ {
2104
+ "epoch": 3.7929267042542287,
2105
+ "grad_norm": 24.996662139892578,
2106
+ "learning_rate": 1.2070732957457714e-05,
2107
+ "loss": 0.2621,
2108
+ "step": 29600
2109
+ },
2110
+ {
2111
+ "epoch": 3.805740645822655,
2112
+ "grad_norm": 20.87746810913086,
2113
+ "learning_rate": 1.194259354177345e-05,
2114
+ "loss": 0.2499,
2115
+ "step": 29700
2116
+ },
2117
+ {
2118
+ "epoch": 3.8185545873910813,
2119
+ "grad_norm": 1.5061414241790771,
2120
+ "learning_rate": 1.1814454126089187e-05,
2121
+ "loss": 0.265,
2122
+ "step": 29800
2123
+ },
2124
+ {
2125
+ "epoch": 3.831368528959508,
2126
+ "grad_norm": 2.7230064868927,
2127
+ "learning_rate": 1.168631471040492e-05,
2128
+ "loss": 0.2469,
2129
+ "step": 29900
2130
+ },
2131
+ {
2132
+ "epoch": 3.8441824705279344,
2133
+ "grad_norm": 0.6768075823783875,
2134
+ "learning_rate": 1.1558175294720656e-05,
2135
+ "loss": 0.2686,
2136
+ "step": 30000
2137
+ },
2138
+ {
2139
+ "epoch": 3.8569964120963607,
2140
+ "grad_norm": 0.08343211561441422,
2141
+ "learning_rate": 1.1430035879036393e-05,
2142
+ "loss": 0.2565,
2143
+ "step": 30100
2144
+ },
2145
+ {
2146
+ "epoch": 3.8698103536647874,
2147
+ "grad_norm": 25.58348274230957,
2148
+ "learning_rate": 1.1301896463352128e-05,
2149
+ "loss": 0.2967,
2150
+ "step": 30200
2151
+ },
2152
+ {
2153
+ "epoch": 3.8826242952332137,
2154
+ "grad_norm": 1.0459709167480469,
2155
+ "learning_rate": 1.1173757047667862e-05,
2156
+ "loss": 0.3028,
2157
+ "step": 30300
2158
+ },
2159
+ {
2160
+ "epoch": 3.89543823680164,
2161
+ "grad_norm": 0.33878639340400696,
2162
+ "learning_rate": 1.1045617631983599e-05,
2163
+ "loss": 0.2243,
2164
+ "step": 30400
2165
+ },
2166
+ {
2167
+ "epoch": 3.9082521783700668,
2168
+ "grad_norm": 2.021047592163086,
2169
+ "learning_rate": 1.0917478216299335e-05,
2170
+ "loss": 0.3656,
2171
+ "step": 30500
2172
+ },
2173
+ {
2174
+ "epoch": 3.921066119938493,
2175
+ "grad_norm": 1.6855653524398804,
2176
+ "learning_rate": 1.078933880061507e-05,
2177
+ "loss": 0.2323,
2178
+ "step": 30600
2179
+ },
2180
+ {
2181
+ "epoch": 3.9338800615069194,
2182
+ "grad_norm": 21.66104507446289,
2183
+ "learning_rate": 1.0661199384930805e-05,
2184
+ "loss": 0.2205,
2185
+ "step": 30700
2186
+ },
2187
+ {
2188
+ "epoch": 3.946694003075346,
2189
+ "grad_norm": 2.4428458213806152,
2190
+ "learning_rate": 1.053305996924654e-05,
2191
+ "loss": 0.2436,
2192
+ "step": 30800
2193
+ },
2194
+ {
2195
+ "epoch": 3.9595079446437724,
2196
+ "grad_norm": 39.37623596191406,
2197
+ "learning_rate": 1.0404920553562276e-05,
2198
+ "loss": 0.2831,
2199
+ "step": 30900
2200
+ },
2201
+ {
2202
+ "epoch": 3.9723218862121987,
2203
+ "grad_norm": 44.4313850402832,
2204
+ "learning_rate": 1.0276781137878012e-05,
2205
+ "loss": 0.2522,
2206
+ "step": 31000
2207
+ },
2208
+ {
2209
+ "epoch": 3.9851358277806255,
2210
+ "grad_norm": 2.6004929542541504,
2211
+ "learning_rate": 1.0148641722193747e-05,
2212
+ "loss": 0.3209,
2213
+ "step": 31100
2214
+ },
2215
+ {
2216
+ "epoch": 3.9979497693490518,
2217
+ "grad_norm": 2.536029815673828,
2218
+ "learning_rate": 1.0020502306509482e-05,
2219
+ "loss": 0.2807,
2220
+ "step": 31200
2221
+ },
2222
+ {
2223
+ "epoch": 4.0,
2224
+ "eval_f1": 0.6869426704202687,
2225
+ "eval_loss": 2.03011155128479,
2226
+ "eval_runtime": 825.2142,
2227
+ "eval_samples_per_second": 9.457,
2228
+ "eval_steps_per_second": 2.364,
2229
+ "step": 31216
2230
+ }
2231
+ ],
2232
+ "logging_steps": 100,
2233
+ "max_steps": 39020,
2234
+ "num_input_tokens_seen": 0,
2235
+ "num_train_epochs": 5,
2236
+ "save_steps": 500,
2237
+ "stateful_callbacks": {
2238
+ "TrainerControl": {
2239
+ "args": {
2240
+ "should_epoch_stop": false,
2241
+ "should_evaluate": false,
2242
+ "should_log": false,
2243
+ "should_save": true,
2244
+ "should_training_stop": false
2245
+ },
2246
+ "attributes": {}
2247
+ }
2248
+ },
2249
+ "total_flos": 5.105948110057636e+17,
2250
+ "train_batch_size": 8,
2251
+ "trial_name": null,
2252
+ "trial_params": null
2253
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ee0657691bc4bc86642e2a1842bd363a7f0f18eafb897662025781f2323eea7
3
+ size 5368