arcwarden46 commited on
Commit
4bf7ab3
·
verified ·
1 Parent(s): 9ba1133

Training in progress, step 432, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd44ca5fee68a8b8f778de2f7ea71f9f6f6cfb20463e45c031a770c28b3e3643
3
  size 671149168
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b3baf7f7a16380fb028f475ca35fb4a1485b5449798007811d88895bd00a770
3
  size 671149168
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:84cad7780694e8033405e2f242fd2aa6390a53e4fd80756c03dc320e1e20c4e9
3
  size 341314644
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e22ca7071dd076252b43b473fa1704a21a4785031a90f0b9028e8c83e8ba89a
3
  size 341314644
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d38a8a486a83654e4e32aea3ee442c71560380819051d896a05b6a8cb94f5ff6
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d78c488bb947774f6f4ca54910a40ea5bafc41a037da391b9eb2e717aae29e0
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c5b2e441bf9b0fdfc8cb71a6f42776e1a140513500e1faf056d1a979f781bb7
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25fb31c3275f1b8913e16332b6b15556ec3b58d9bf65928efef64fedc142b95b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.3306281268596649,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-300",
4
- "epoch": 0.6948465547191662,
5
  "eval_steps": 150,
6
- "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2131,6 +2131,930 @@
2131
  "eval_samples_per_second": 23.374,
2132
  "eval_steps_per_second": 5.851,
2133
  "step": 300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2134
  }
2135
  ],
2136
  "logging_steps": 1,
@@ -2154,12 +3078,12 @@
2154
  "should_evaluate": false,
2155
  "should_log": false,
2156
  "should_save": true,
2157
- "should_training_stop": false
2158
  },
2159
  "attributes": {}
2160
  }
2161
  },
2162
- "total_flos": 2.2835124298265395e+17,
2163
  "train_batch_size": 8,
2164
  "trial_name": null,
2165
  "trial_params": null
 
1
  {
2
  "best_metric": 0.3306281268596649,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-300",
4
+ "epoch": 1.0011580775911986,
5
  "eval_steps": 150,
6
+ "global_step": 432,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2131
  "eval_samples_per_second": 23.374,
2132
  "eval_steps_per_second": 5.851,
2133
  "step": 300
2134
+ },
2135
+ {
2136
+ "epoch": 0.6971627099015634,
2137
+ "grad_norm": 2.477572441101074,
2138
+ "learning_rate": 2.293880768178576e-05,
2139
+ "loss": 1.0048,
2140
+ "step": 301
2141
+ },
2142
+ {
2143
+ "epoch": 0.6994788650839606,
2144
+ "grad_norm": 2.3567538261413574,
2145
+ "learning_rate": 2.2619003103344606e-05,
2146
+ "loss": 1.1464,
2147
+ "step": 302
2148
+ },
2149
+ {
2150
+ "epoch": 0.7017950202663579,
2151
+ "grad_norm": 2.297689199447632,
2152
+ "learning_rate": 2.2300790559367552e-05,
2153
+ "loss": 0.9767,
2154
+ "step": 303
2155
+ },
2156
+ {
2157
+ "epoch": 0.704111175448755,
2158
+ "grad_norm": 2.396148204803467,
2159
+ "learning_rate": 2.1984188551932512e-05,
2160
+ "loss": 0.9108,
2161
+ "step": 304
2162
+ },
2163
+ {
2164
+ "epoch": 0.7064273306311523,
2165
+ "grad_norm": 2.255086660385132,
2166
+ "learning_rate": 2.166921548947466e-05,
2167
+ "loss": 0.6876,
2168
+ "step": 305
2169
+ },
2170
+ {
2171
+ "epoch": 0.7087434858135495,
2172
+ "grad_norm": 2.383619546890259,
2173
+ "learning_rate": 2.1355889685716224e-05,
2174
+ "loss": 0.7666,
2175
+ "step": 306
2176
+ },
2177
+ {
2178
+ "epoch": 0.7110596409959468,
2179
+ "grad_norm": 2.329786777496338,
2180
+ "learning_rate": 2.1044229358601542e-05,
2181
+ "loss": 0.813,
2182
+ "step": 307
2183
+ },
2184
+ {
2185
+ "epoch": 0.7133757961783439,
2186
+ "grad_norm": 2.474313497543335,
2187
+ "learning_rate": 2.0734252629237894e-05,
2188
+ "loss": 0.6183,
2189
+ "step": 308
2190
+ },
2191
+ {
2192
+ "epoch": 0.7156919513607412,
2193
+ "grad_norm": 2.5291049480438232,
2194
+ "learning_rate": 2.0425977520841838e-05,
2195
+ "loss": 0.5384,
2196
+ "step": 309
2197
+ },
2198
+ {
2199
+ "epoch": 0.7180081065431384,
2200
+ "grad_norm": 1.997194766998291,
2201
+ "learning_rate": 2.011942195769122e-05,
2202
+ "loss": 0.5236,
2203
+ "step": 310
2204
+ },
2205
+ {
2206
+ "epoch": 0.7203242617255357,
2207
+ "grad_norm": 1.2435768842697144,
2208
+ "learning_rate": 1.9814603764083112e-05,
2209
+ "loss": 0.2937,
2210
+ "step": 311
2211
+ },
2212
+ {
2213
+ "epoch": 0.7226404169079328,
2214
+ "grad_norm": 1.364814281463623,
2215
+ "learning_rate": 1.9511540663297285e-05,
2216
+ "loss": 0.3747,
2217
+ "step": 312
2218
+ },
2219
+ {
2220
+ "epoch": 0.72495657209033,
2221
+ "grad_norm": 1.3511894941329956,
2222
+ "learning_rate": 1.921025027656587e-05,
2223
+ "loss": 0.4068,
2224
+ "step": 313
2225
+ },
2226
+ {
2227
+ "epoch": 0.7272727272727273,
2228
+ "grad_norm": 1.3470934629440308,
2229
+ "learning_rate": 1.8910750122048637e-05,
2230
+ "loss": 0.3442,
2231
+ "step": 314
2232
+ },
2233
+ {
2234
+ "epoch": 0.7295888824551245,
2235
+ "grad_norm": 1.4333728551864624,
2236
+ "learning_rate": 1.8613057613814584e-05,
2237
+ "loss": 0.3411,
2238
+ "step": 315
2239
+ },
2240
+ {
2241
+ "epoch": 0.7319050376375217,
2242
+ "grad_norm": 1.348252296447754,
2243
+ "learning_rate": 1.831719006082924e-05,
2244
+ "loss": 0.3321,
2245
+ "step": 316
2246
+ },
2247
+ {
2248
+ "epoch": 0.7342211928199189,
2249
+ "grad_norm": 0.9950866103172302,
2250
+ "learning_rate": 1.8023164665948456e-05,
2251
+ "loss": 0.1945,
2252
+ "step": 317
2253
+ },
2254
+ {
2255
+ "epoch": 0.7365373480023162,
2256
+ "grad_norm": 1.1160790920257568,
2257
+ "learning_rate": 1.7730998524917957e-05,
2258
+ "loss": 0.263,
2259
+ "step": 318
2260
+ },
2261
+ {
2262
+ "epoch": 0.7388535031847133,
2263
+ "grad_norm": 1.3591563701629639,
2264
+ "learning_rate": 1.7440708625379505e-05,
2265
+ "loss": 0.3155,
2266
+ "step": 319
2267
+ },
2268
+ {
2269
+ "epoch": 0.7411696583671106,
2270
+ "grad_norm": 1.3873182535171509,
2271
+ "learning_rate": 1.7152311845883095e-05,
2272
+ "loss": 0.2736,
2273
+ "step": 320
2274
+ },
2275
+ {
2276
+ "epoch": 0.7434858135495078,
2277
+ "grad_norm": 1.4288737773895264,
2278
+ "learning_rate": 1.686582495490554e-05,
2279
+ "loss": 0.3022,
2280
+ "step": 321
2281
+ },
2282
+ {
2283
+ "epoch": 0.7458019687319051,
2284
+ "grad_norm": 1.7382503747940063,
2285
+ "learning_rate": 1.658126460987558e-05,
2286
+ "loss": 0.2989,
2287
+ "step": 322
2288
+ },
2289
+ {
2290
+ "epoch": 0.7481181239143022,
2291
+ "grad_norm": 1.7087494134902954,
2292
+ "learning_rate": 1.6298647356205254e-05,
2293
+ "loss": 0.3102,
2294
+ "step": 323
2295
+ },
2296
+ {
2297
+ "epoch": 0.7504342790966995,
2298
+ "grad_norm": 1.5120712518692017,
2299
+ "learning_rate": 1.601798962632799e-05,
2300
+ "loss": 0.2155,
2301
+ "step": 324
2302
+ },
2303
+ {
2304
+ "epoch": 0.7527504342790967,
2305
+ "grad_norm": 1.5661240816116333,
2306
+ "learning_rate": 1.5739307738743057e-05,
2307
+ "loss": 0.1865,
2308
+ "step": 325
2309
+ },
2310
+ {
2311
+ "epoch": 0.755066589461494,
2312
+ "grad_norm": 1.2175641059875488,
2313
+ "learning_rate": 1.546261789706686e-05,
2314
+ "loss": 0.1658,
2315
+ "step": 326
2316
+ },
2317
+ {
2318
+ "epoch": 0.7573827446438911,
2319
+ "grad_norm": 1.4670747518539429,
2320
+ "learning_rate": 1.5187936189090669e-05,
2321
+ "loss": 0.1604,
2322
+ "step": 327
2323
+ },
2324
+ {
2325
+ "epoch": 0.7596988998262884,
2326
+ "grad_norm": 1.8161323070526123,
2327
+ "learning_rate": 1.491527858584535e-05,
2328
+ "loss": 0.2243,
2329
+ "step": 328
2330
+ },
2331
+ {
2332
+ "epoch": 0.7620150550086856,
2333
+ "grad_norm": 1.3583159446716309,
2334
+ "learning_rate": 1.4644660940672627e-05,
2335
+ "loss": 0.1281,
2336
+ "step": 329
2337
+ },
2338
+ {
2339
+ "epoch": 0.7643312101910829,
2340
+ "grad_norm": 1.2247012853622437,
2341
+ "learning_rate": 1.4376098988303405e-05,
2342
+ "loss": 0.1084,
2343
+ "step": 330
2344
+ },
2345
+ {
2346
+ "epoch": 0.76664736537348,
2347
+ "grad_norm": 1.2522218227386475,
2348
+ "learning_rate": 1.4109608343942854e-05,
2349
+ "loss": 0.1137,
2350
+ "step": 331
2351
+ },
2352
+ {
2353
+ "epoch": 0.7689635205558772,
2354
+ "grad_norm": 1.0144122838974,
2355
+ "learning_rate": 1.384520450236244e-05,
2356
+ "loss": 0.0753,
2357
+ "step": 332
2358
+ },
2359
+ {
2360
+ "epoch": 0.7712796757382745,
2361
+ "grad_norm": 1.0482442378997803,
2362
+ "learning_rate": 1.35829028369991e-05,
2363
+ "loss": 0.0872,
2364
+ "step": 333
2365
+ },
2366
+ {
2367
+ "epoch": 0.7735958309206716,
2368
+ "grad_norm": 1.6394468545913696,
2369
+ "learning_rate": 1.3322718599061251e-05,
2370
+ "loss": 0.1127,
2371
+ "step": 334
2372
+ },
2373
+ {
2374
+ "epoch": 0.7759119861030689,
2375
+ "grad_norm": 1.1458487510681152,
2376
+ "learning_rate": 1.306466691664216e-05,
2377
+ "loss": 0.1022,
2378
+ "step": 335
2379
+ },
2380
+ {
2381
+ "epoch": 0.7782281412854661,
2382
+ "grad_norm": 1.6174955368041992,
2383
+ "learning_rate": 1.2808762793840201e-05,
2384
+ "loss": 0.1394,
2385
+ "step": 336
2386
+ },
2387
+ {
2388
+ "epoch": 0.7805442964678634,
2389
+ "grad_norm": 1.2268593311309814,
2390
+ "learning_rate": 1.2555021109886589e-05,
2391
+ "loss": 0.1056,
2392
+ "step": 337
2393
+ },
2394
+ {
2395
+ "epoch": 0.7828604516502605,
2396
+ "grad_norm": 1.2561674118041992,
2397
+ "learning_rate": 1.2303456618280141e-05,
2398
+ "loss": 0.0783,
2399
+ "step": 338
2400
+ },
2401
+ {
2402
+ "epoch": 0.7851766068326578,
2403
+ "grad_norm": 1.1064704656600952,
2404
+ "learning_rate": 1.2054083945929535e-05,
2405
+ "loss": 0.0762,
2406
+ "step": 339
2407
+ },
2408
+ {
2409
+ "epoch": 0.787492762015055,
2410
+ "grad_norm": 1.174625277519226,
2411
+ "learning_rate": 1.1806917592302762e-05,
2412
+ "loss": 0.0862,
2413
+ "step": 340
2414
+ },
2415
+ {
2416
+ "epoch": 0.7898089171974523,
2417
+ "grad_norm": 1.6088016033172607,
2418
+ "learning_rate": 1.1561971928584159e-05,
2419
+ "loss": 0.0962,
2420
+ "step": 341
2421
+ },
2422
+ {
2423
+ "epoch": 0.7921250723798494,
2424
+ "grad_norm": 1.1425096988677979,
2425
+ "learning_rate": 1.1319261196838782e-05,
2426
+ "loss": 0.0699,
2427
+ "step": 342
2428
+ },
2429
+ {
2430
+ "epoch": 0.7944412275622467,
2431
+ "grad_norm": 1.6147550344467163,
2432
+ "learning_rate": 1.1078799509184246e-05,
2433
+ "loss": 0.0919,
2434
+ "step": 343
2435
+ },
2436
+ {
2437
+ "epoch": 0.7967573827446439,
2438
+ "grad_norm": 1.4701851606369019,
2439
+ "learning_rate": 1.0840600846970334e-05,
2440
+ "loss": 0.0891,
2441
+ "step": 344
2442
+ },
2443
+ {
2444
+ "epoch": 0.7990735379270412,
2445
+ "grad_norm": 1.4786357879638672,
2446
+ "learning_rate": 1.0604679059965922e-05,
2447
+ "loss": 0.0638,
2448
+ "step": 345
2449
+ },
2450
+ {
2451
+ "epoch": 0.8013896931094383,
2452
+ "grad_norm": 2.1240947246551514,
2453
+ "learning_rate": 1.0371047865553846e-05,
2454
+ "loss": 0.1207,
2455
+ "step": 346
2456
+ },
2457
+ {
2458
+ "epoch": 0.8037058482918356,
2459
+ "grad_norm": 1.4813666343688965,
2460
+ "learning_rate": 1.0139720847933166e-05,
2461
+ "loss": 0.0871,
2462
+ "step": 347
2463
+ },
2464
+ {
2465
+ "epoch": 0.8060220034742328,
2466
+ "grad_norm": 1.3989876508712769,
2467
+ "learning_rate": 9.91071145732948e-06,
2468
+ "loss": 0.0964,
2469
+ "step": 348
2470
+ },
2471
+ {
2472
+ "epoch": 0.80833815865663,
2473
+ "grad_norm": 1.3372840881347656,
2474
+ "learning_rate": 9.684033009212752e-06,
2475
+ "loss": 0.0714,
2476
+ "step": 349
2477
+ },
2478
+ {
2479
+ "epoch": 0.8106543138390272,
2480
+ "grad_norm": 2.03450870513916,
2481
+ "learning_rate": 9.459698683523204e-06,
2482
+ "loss": 0.1074,
2483
+ "step": 350
2484
+ },
2485
+ {
2486
+ "epoch": 0.8129704690214244,
2487
+ "grad_norm": 1.326187252998352,
2488
+ "learning_rate": 9.237721523904891e-06,
2489
+ "loss": 1.0425,
2490
+ "step": 351
2491
+ },
2492
+ {
2493
+ "epoch": 0.8152866242038217,
2494
+ "grad_norm": 1.3223427534103394,
2495
+ "learning_rate": 9.018114436947373e-06,
2496
+ "loss": 0.8088,
2497
+ "step": 352
2498
+ },
2499
+ {
2500
+ "epoch": 0.8176027793862188,
2501
+ "grad_norm": 1.4911216497421265,
2502
+ "learning_rate": 8.80089019143524e-06,
2503
+ "loss": 0.784,
2504
+ "step": 353
2505
+ },
2506
+ {
2507
+ "epoch": 0.8199189345686161,
2508
+ "grad_norm": 1.5768686532974243,
2509
+ "learning_rate": 8.586061417605668e-06,
2510
+ "loss": 0.7212,
2511
+ "step": 354
2512
+ },
2513
+ {
2514
+ "epoch": 0.8222350897510133,
2515
+ "grad_norm": 1.8095353841781616,
2516
+ "learning_rate": 8.373640606414096e-06,
2517
+ "loss": 0.5331,
2518
+ "step": 355
2519
+ },
2520
+ {
2521
+ "epoch": 0.8245512449334106,
2522
+ "grad_norm": 1.7539920806884766,
2523
+ "learning_rate": 8.163640108807896e-06,
2524
+ "loss": 0.5814,
2525
+ "step": 356
2526
+ },
2527
+ {
2528
+ "epoch": 0.8268674001158077,
2529
+ "grad_norm": 1.8924403190612793,
2530
+ "learning_rate": 7.956072135008336e-06,
2531
+ "loss": 0.6049,
2532
+ "step": 357
2533
+ },
2534
+ {
2535
+ "epoch": 0.829183555298205,
2536
+ "grad_norm": 1.558434009552002,
2537
+ "learning_rate": 7.750948753800507e-06,
2538
+ "loss": 0.3521,
2539
+ "step": 358
2540
+ },
2541
+ {
2542
+ "epoch": 0.8314997104806022,
2543
+ "grad_norm": 1.900602102279663,
2544
+ "learning_rate": 7.548281891831716e-06,
2545
+ "loss": 0.3716,
2546
+ "step": 359
2547
+ },
2548
+ {
2549
+ "epoch": 0.8338158656629994,
2550
+ "grad_norm": 2.0248496532440186,
2551
+ "learning_rate": 7.348083332917926e-06,
2552
+ "loss": 0.3998,
2553
+ "step": 360
2554
+ },
2555
+ {
2556
+ "epoch": 0.8361320208453966,
2557
+ "grad_norm": 1.3187469244003296,
2558
+ "learning_rate": 7.150364717358698e-06,
2559
+ "loss": 0.2192,
2560
+ "step": 361
2561
+ },
2562
+ {
2563
+ "epoch": 0.8384481760277939,
2564
+ "grad_norm": 1.5400038957595825,
2565
+ "learning_rate": 6.955137541260287e-06,
2566
+ "loss": 0.2471,
2567
+ "step": 362
2568
+ },
2569
+ {
2570
+ "epoch": 0.8407643312101911,
2571
+ "grad_norm": 1.604353666305542,
2572
+ "learning_rate": 6.7624131558672756e-06,
2573
+ "loss": 0.2206,
2574
+ "step": 363
2575
+ },
2576
+ {
2577
+ "epoch": 0.8430804863925883,
2578
+ "grad_norm": 1.945562481880188,
2579
+ "learning_rate": 6.572202766902569e-06,
2580
+ "loss": 0.2775,
2581
+ "step": 364
2582
+ },
2583
+ {
2584
+ "epoch": 0.8453966415749855,
2585
+ "grad_norm": 2.460178852081299,
2586
+ "learning_rate": 6.384517433915793e-06,
2587
+ "loss": 0.4032,
2588
+ "step": 365
2589
+ },
2590
+ {
2591
+ "epoch": 0.8477127967573828,
2592
+ "grad_norm": 1.6772760152816772,
2593
+ "learning_rate": 6.199368069640343e-06,
2594
+ "loss": 0.253,
2595
+ "step": 366
2596
+ },
2597
+ {
2598
+ "epoch": 0.85002895193978,
2599
+ "grad_norm": 1.7364951372146606,
2600
+ "learning_rate": 6.016765439358774e-06,
2601
+ "loss": 0.2342,
2602
+ "step": 367
2603
+ },
2604
+ {
2605
+ "epoch": 0.8523451071221771,
2606
+ "grad_norm": 1.791764259338379,
2607
+ "learning_rate": 5.83672016027697e-06,
2608
+ "loss": 0.1775,
2609
+ "step": 368
2610
+ },
2611
+ {
2612
+ "epoch": 0.8546612623045744,
2613
+ "grad_norm": 1.4430023431777954,
2614
+ "learning_rate": 5.659242700906719e-06,
2615
+ "loss": 0.1663,
2616
+ "step": 369
2617
+ },
2618
+ {
2619
+ "epoch": 0.8569774174869716,
2620
+ "grad_norm": 1.4145193099975586,
2621
+ "learning_rate": 5.484343380457125e-06,
2622
+ "loss": 0.1669,
2623
+ "step": 370
2624
+ },
2625
+ {
2626
+ "epoch": 0.8592935726693689,
2627
+ "grad_norm": 1.4928686618804932,
2628
+ "learning_rate": 5.312032368234526e-06,
2629
+ "loss": 0.19,
2630
+ "step": 371
2631
+ },
2632
+ {
2633
+ "epoch": 0.861609727851766,
2634
+ "grad_norm": 1.48283851146698,
2635
+ "learning_rate": 5.1423196830513e-06,
2636
+ "loss": 0.1979,
2637
+ "step": 372
2638
+ },
2639
+ {
2640
+ "epoch": 0.8639258830341633,
2641
+ "grad_norm": 1.4714758396148682,
2642
+ "learning_rate": 4.975215192643246e-06,
2643
+ "loss": 0.1778,
2644
+ "step": 373
2645
+ },
2646
+ {
2647
+ "epoch": 0.8662420382165605,
2648
+ "grad_norm": 1.187078595161438,
2649
+ "learning_rate": 4.81072861309591e-06,
2650
+ "loss": 0.1187,
2651
+ "step": 374
2652
+ },
2653
+ {
2654
+ "epoch": 0.8685581933989577,
2655
+ "grad_norm": 1.1508448123931885,
2656
+ "learning_rate": 4.648869508279613e-06,
2657
+ "loss": 0.1269,
2658
+ "step": 375
2659
+ },
2660
+ {
2661
+ "epoch": 0.8708743485813549,
2662
+ "grad_norm": 0.9376459717750549,
2663
+ "learning_rate": 4.489647289293369e-06,
2664
+ "loss": 0.1059,
2665
+ "step": 376
2666
+ },
2667
+ {
2668
+ "epoch": 0.8731905037637522,
2669
+ "grad_norm": 1.0897644758224487,
2670
+ "learning_rate": 4.333071213917722e-06,
2671
+ "loss": 0.0853,
2672
+ "step": 377
2673
+ },
2674
+ {
2675
+ "epoch": 0.8755066589461494,
2676
+ "grad_norm": 0.9136444926261902,
2677
+ "learning_rate": 4.179150386076424e-06,
2678
+ "loss": 0.0743,
2679
+ "step": 378
2680
+ },
2681
+ {
2682
+ "epoch": 0.8778228141285466,
2683
+ "grad_norm": 0.8426498174667358,
2684
+ "learning_rate": 4.027893755307144e-06,
2685
+ "loss": 0.0606,
2686
+ "step": 379
2687
+ },
2688
+ {
2689
+ "epoch": 0.8801389693109438,
2690
+ "grad_norm": 0.7755717635154724,
2691
+ "learning_rate": 3.879310116241042e-06,
2692
+ "loss": 0.0617,
2693
+ "step": 380
2694
+ },
2695
+ {
2696
+ "epoch": 0.8824551244933411,
2697
+ "grad_norm": 1.0966241359710693,
2698
+ "learning_rate": 3.733408108091485e-06,
2699
+ "loss": 0.0724,
2700
+ "step": 381
2701
+ },
2702
+ {
2703
+ "epoch": 0.8847712796757383,
2704
+ "grad_norm": 0.8539220094680786,
2705
+ "learning_rate": 3.5901962141516977e-06,
2706
+ "loss": 0.0714,
2707
+ "step": 382
2708
+ },
2709
+ {
2710
+ "epoch": 0.8870874348581355,
2711
+ "grad_norm": 1.0539127588272095,
2712
+ "learning_rate": 3.4496827613015202e-06,
2713
+ "loss": 0.0585,
2714
+ "step": 383
2715
+ },
2716
+ {
2717
+ "epoch": 0.8894035900405327,
2718
+ "grad_norm": 0.7426398396492004,
2719
+ "learning_rate": 3.3118759195232275e-06,
2720
+ "loss": 0.0527,
2721
+ "step": 384
2722
+ },
2723
+ {
2724
+ "epoch": 0.89171974522293,
2725
+ "grad_norm": 0.9449732899665833,
2726
+ "learning_rate": 3.176783701426528e-06,
2727
+ "loss": 0.0689,
2728
+ "step": 385
2729
+ },
2730
+ {
2731
+ "epoch": 0.8940359004053272,
2732
+ "grad_norm": 0.9856773614883423,
2733
+ "learning_rate": 3.0444139617826607e-06,
2734
+ "loss": 0.0671,
2735
+ "step": 386
2736
+ },
2737
+ {
2738
+ "epoch": 0.8963520555877244,
2739
+ "grad_norm": 0.826977014541626,
2740
+ "learning_rate": 2.91477439706771e-06,
2741
+ "loss": 0.0554,
2742
+ "step": 387
2743
+ },
2744
+ {
2745
+ "epoch": 0.8986682107701216,
2746
+ "grad_norm": 0.7119414210319519,
2747
+ "learning_rate": 2.787872545015069e-06,
2748
+ "loss": 0.0525,
2749
+ "step": 388
2750
+ },
2751
+ {
2752
+ "epoch": 0.9009843659525189,
2753
+ "grad_norm": 1.1008604764938354,
2754
+ "learning_rate": 2.663715784177201e-06,
2755
+ "loss": 0.0819,
2756
+ "step": 389
2757
+ },
2758
+ {
2759
+ "epoch": 0.903300521134916,
2760
+ "grad_norm": 1.2398359775543213,
2761
+ "learning_rate": 2.542311333496622e-06,
2762
+ "loss": 0.0695,
2763
+ "step": 390
2764
+ },
2765
+ {
2766
+ "epoch": 0.9056166763173132,
2767
+ "grad_norm": 0.6894932985305786,
2768
+ "learning_rate": 2.423666251886114e-06,
2769
+ "loss": 0.0404,
2770
+ "step": 391
2771
+ },
2772
+ {
2773
+ "epoch": 0.9079328314997105,
2774
+ "grad_norm": 1.0042632818222046,
2775
+ "learning_rate": 2.307787437818365e-06,
2776
+ "loss": 0.063,
2777
+ "step": 392
2778
+ },
2779
+ {
2780
+ "epoch": 0.9102489866821077,
2781
+ "grad_norm": 1.2008248567581177,
2782
+ "learning_rate": 2.194681628924816e-06,
2783
+ "loss": 0.0826,
2784
+ "step": 393
2785
+ },
2786
+ {
2787
+ "epoch": 0.9125651418645049,
2788
+ "grad_norm": 1.0816729068756104,
2789
+ "learning_rate": 2.0843554016039326e-06,
2790
+ "loss": 0.0553,
2791
+ "step": 394
2792
+ },
2793
+ {
2794
+ "epoch": 0.9148812970469021,
2795
+ "grad_norm": 1.0502961874008179,
2796
+ "learning_rate": 1.976815170638802e-06,
2797
+ "loss": 0.0598,
2798
+ "step": 395
2799
+ },
2800
+ {
2801
+ "epoch": 0.9171974522292994,
2802
+ "grad_norm": 1.179869532585144,
2803
+ "learning_rate": 1.8720671888242059e-06,
2804
+ "loss": 0.1096,
2805
+ "step": 396
2806
+ },
2807
+ {
2808
+ "epoch": 0.9195136074116966,
2809
+ "grad_norm": 1.1720819473266602,
2810
+ "learning_rate": 1.7701175466029895e-06,
2811
+ "loss": 0.0732,
2812
+ "step": 397
2813
+ },
2814
+ {
2815
+ "epoch": 0.9218297625940938,
2816
+ "grad_norm": 1.741204857826233,
2817
+ "learning_rate": 1.6709721717120042e-06,
2818
+ "loss": 0.0805,
2819
+ "step": 398
2820
+ },
2821
+ {
2822
+ "epoch": 0.924145917776491,
2823
+ "grad_norm": 1.379157304763794,
2824
+ "learning_rate": 1.5746368288373947e-06,
2825
+ "loss": 0.0735,
2826
+ "step": 399
2827
+ },
2828
+ {
2829
+ "epoch": 0.9264620729588883,
2830
+ "grad_norm": 1.8489305973052979,
2831
+ "learning_rate": 1.4811171192794627e-06,
2832
+ "loss": 0.0918,
2833
+ "step": 400
2834
+ },
2835
+ {
2836
+ "epoch": 0.9287782281412854,
2837
+ "grad_norm": 1.2802451848983765,
2838
+ "learning_rate": 1.3904184806269704e-06,
2839
+ "loss": 0.9387,
2840
+ "step": 401
2841
+ },
2842
+ {
2843
+ "epoch": 0.9310943833236827,
2844
+ "grad_norm": 1.3554513454437256,
2845
+ "learning_rate": 1.3025461864409394e-06,
2846
+ "loss": 0.9213,
2847
+ "step": 402
2848
+ },
2849
+ {
2850
+ "epoch": 0.9334105385060799,
2851
+ "grad_norm": 1.2956500053405762,
2852
+ "learning_rate": 1.2175053459481e-06,
2853
+ "loss": 0.6561,
2854
+ "step": 403
2855
+ },
2856
+ {
2857
+ "epoch": 0.9357266936884772,
2858
+ "grad_norm": 1.4841207265853882,
2859
+ "learning_rate": 1.1353009037437523e-06,
2860
+ "loss": 0.4452,
2861
+ "step": 404
2862
+ },
2863
+ {
2864
+ "epoch": 0.9380428488708743,
2865
+ "grad_norm": 1.3164417743682861,
2866
+ "learning_rate": 1.0559376395043285e-06,
2867
+ "loss": 0.4071,
2868
+ "step": 405
2869
+ },
2870
+ {
2871
+ "epoch": 0.9403590040532716,
2872
+ "grad_norm": 1.8103063106536865,
2873
+ "learning_rate": 9.794201677094162e-07,
2874
+ "loss": 0.3799,
2875
+ "step": 406
2876
+ },
2877
+ {
2878
+ "epoch": 0.9426751592356688,
2879
+ "grad_norm": 1.640889286994934,
2880
+ "learning_rate": 9.05752937373533e-07,
2881
+ "loss": 0.3303,
2882
+ "step": 407
2883
+ },
2884
+ {
2885
+ "epoch": 0.944991314418066,
2886
+ "grad_norm": 1.410726547241211,
2887
+ "learning_rate": 8.349402317873789e-07,
2888
+ "loss": 0.2026,
2889
+ "step": 408
2890
+ },
2891
+ {
2892
+ "epoch": 0.9473074696004632,
2893
+ "grad_norm": 1.7915242910385132,
2894
+ "learning_rate": 7.669861682688239e-07,
2895
+ "loss": 0.2115,
2896
+ "step": 409
2897
+ },
2898
+ {
2899
+ "epoch": 0.9496236247828604,
2900
+ "grad_norm": 1.855295181274414,
2901
+ "learning_rate": 7.018946979234997e-07,
2902
+ "loss": 0.2876,
2903
+ "step": 410
2904
+ },
2905
+ {
2906
+ "epoch": 0.9519397799652577,
2907
+ "grad_norm": 1.1143068075180054,
2908
+ "learning_rate": 6.396696054150719e-07,
2909
+ "loss": 0.1409,
2910
+ "step": 411
2911
+ },
2912
+ {
2913
+ "epoch": 0.9542559351476549,
2914
+ "grad_norm": 1.3461558818817139,
2915
+ "learning_rate": 5.803145087451945e-07,
2916
+ "loss": 0.1312,
2917
+ "step": 412
2918
+ },
2919
+ {
2920
+ "epoch": 0.9565720903300521,
2921
+ "grad_norm": 1.343173623085022,
2922
+ "learning_rate": 5.238328590431162e-07,
2923
+ "loss": 0.1521,
2924
+ "step": 413
2925
+ },
2926
+ {
2927
+ "epoch": 0.9588882455124493,
2928
+ "grad_norm": 1.174078106880188,
2929
+ "learning_rate": 4.7022794036505335e-07,
2930
+ "loss": 0.1421,
2931
+ "step": 414
2932
+ },
2933
+ {
2934
+ "epoch": 0.9612044006948466,
2935
+ "grad_norm": 1.0609129667282104,
2936
+ "learning_rate": 4.1950286950321327e-07,
2937
+ "loss": 0.0897,
2938
+ "step": 415
2939
+ },
2940
+ {
2941
+ "epoch": 0.9635205558772437,
2942
+ "grad_norm": 1.1571158170700073,
2943
+ "learning_rate": 3.716605958046071e-07,
2944
+ "loss": 0.0685,
2945
+ "step": 416
2946
+ },
2947
+ {
2948
+ "epoch": 0.965836711059641,
2949
+ "grad_norm": 0.8536246418952942,
2950
+ "learning_rate": 3.267039009995199e-07,
2951
+ "loss": 0.0694,
2952
+ "step": 417
2953
+ },
2954
+ {
2955
+ "epoch": 0.9681528662420382,
2956
+ "grad_norm": 0.7544981837272644,
2957
+ "learning_rate": 2.846353990398065e-07,
2958
+ "loss": 0.0442,
2959
+ "step": 418
2960
+ },
2961
+ {
2962
+ "epoch": 0.9704690214244355,
2963
+ "grad_norm": 0.7445719242095947,
2964
+ "learning_rate": 2.4545753594688583e-07,
2965
+ "loss": 0.0545,
2966
+ "step": 419
2967
+ },
2968
+ {
2969
+ "epoch": 0.9727851766068326,
2970
+ "grad_norm": 0.631585955619812,
2971
+ "learning_rate": 2.0917258966953733e-07,
2972
+ "loss": 0.0462,
2973
+ "step": 420
2974
+ },
2975
+ {
2976
+ "epoch": 0.9751013317892299,
2977
+ "grad_norm": 0.6365635991096497,
2978
+ "learning_rate": 1.7578266995142978e-07,
2979
+ "loss": 0.0554,
2980
+ "step": 421
2981
+ },
2982
+ {
2983
+ "epoch": 0.9774174869716271,
2984
+ "grad_norm": 0.7627168297767639,
2985
+ "learning_rate": 1.4528971820846893e-07,
2986
+ "loss": 0.0552,
2987
+ "step": 422
2988
+ },
2989
+ {
2990
+ "epoch": 0.9797336421540244,
2991
+ "grad_norm": 0.8680822849273682,
2992
+ "learning_rate": 1.1769550741592139e-07,
2993
+ "loss": 0.0643,
2994
+ "step": 423
2995
+ },
2996
+ {
2997
+ "epoch": 0.9820497973364215,
2998
+ "grad_norm": 1.076411485671997,
2999
+ "learning_rate": 9.300164200530814e-08,
3000
+ "loss": 0.0644,
3001
+ "step": 424
3002
+ },
3003
+ {
3004
+ "epoch": 0.9843659525188188,
3005
+ "grad_norm": 1.0168262720108032,
3006
+ "learning_rate": 7.120955777112914e-08,
3007
+ "loss": 0.0862,
3008
+ "step": 425
3009
+ },
3010
+ {
3011
+ "epoch": 0.986682107701216,
3012
+ "grad_norm": 1.153483271598816,
3013
+ "learning_rate": 5.2320521787385667e-08,
3014
+ "loss": 0.0703,
3015
+ "step": 426
3016
+ },
3017
+ {
3018
+ "epoch": 0.9889982628836133,
3019
+ "grad_norm": 0.9161254167556763,
3020
+ "learning_rate": 3.633563233388926e-08,
3021
+ "loss": 0.0481,
3022
+ "step": 427
3023
+ },
3024
+ {
3025
+ "epoch": 0.9913144180660104,
3026
+ "grad_norm": 0.8356172442436218,
3027
+ "learning_rate": 2.3255818832423894e-08,
3028
+ "loss": 0.0574,
3029
+ "step": 428
3030
+ },
3031
+ {
3032
+ "epoch": 0.9936305732484076,
3033
+ "grad_norm": 1.117587924003601,
3034
+ "learning_rate": 1.3081841792694783e-08,
3035
+ "loss": 0.0647,
3036
+ "step": 429
3037
+ },
3038
+ {
3039
+ "epoch": 0.9959467284308049,
3040
+ "grad_norm": 0.9720324873924255,
3041
+ "learning_rate": 5.814292768108187e-09,
3042
+ "loss": 0.0588,
3043
+ "step": 430
3044
+ },
3045
+ {
3046
+ "epoch": 0.998262883613202,
3047
+ "grad_norm": 1.3996336460113525,
3048
+ "learning_rate": 1.453594321393359e-09,
3049
+ "loss": 0.0588,
3050
+ "step": 431
3051
+ },
3052
+ {
3053
+ "epoch": 1.0011580775911986,
3054
+ "grad_norm": 2.588228702545166,
3055
+ "learning_rate": 0.0,
3056
+ "loss": 0.4998,
3057
+ "step": 432
3058
  }
3059
  ],
3060
  "logging_steps": 1,
 
3078
  "should_evaluate": false,
3079
  "should_log": false,
3080
  "should_save": true,
3081
+ "should_training_stop": true
3082
  },
3083
  "attributes": {}
3084
  }
3085
  },
3086
+ "total_flos": 3.292330885612831e+17,
3087
  "train_batch_size": 8,
3088
  "trial_name": null,
3089
  "trial_params": null