diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.18438844499078058, + "epoch": 0.268899815611555, "eval_steps": 500, - "global_step": 2400, + "global_step": 3500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -8407,6 +8407,3856 @@ "learning_rate": 1.926244622003688e-05, "loss": 1.5278, "step": 2400 + }, + { + "epoch": 0.1845421020282729, + "grad_norm": 3.97829270362854, + "learning_rate": 1.9261831591886912e-05, + "loss": 1.5206, + "step": 2402 + }, + { + "epoch": 0.1846957590657652, + "grad_norm": 3.6935653686523438, + "learning_rate": 1.926121696373694e-05, + "loss": 1.5395, + "step": 2404 + }, + { + "epoch": 0.18484941610325753, + "grad_norm": 3.869833469390869, + "learning_rate": 1.926060233558697e-05, + "loss": 1.4951, + "step": 2406 + }, + { + "epoch": 0.18500307314074985, + "grad_norm": 3.3914384841918945, + "learning_rate": 1.9259987707437e-05, + "loss": 1.3401, + "step": 2408 + }, + { + "epoch": 0.18515673017824216, + "grad_norm": 4.250161647796631, + "learning_rate": 1.9259373079287034e-05, + "loss": 1.6384, + "step": 2410 + }, + { + "epoch": 0.18531038721573448, + "grad_norm": 4.362060070037842, + "learning_rate": 1.9258758451137064e-05, + "loss": 1.5895, + "step": 2412 + }, + { + "epoch": 0.1854640442532268, + "grad_norm": 3.7243545055389404, + "learning_rate": 1.9258143822987093e-05, + "loss": 1.4715, + "step": 2414 + }, + { + "epoch": 0.1856177012907191, + "grad_norm": 4.146886825561523, + "learning_rate": 1.9257529194837127e-05, + "loss": 1.5387, + "step": 2416 + }, + { + "epoch": 0.18577135832821143, + "grad_norm": 3.769721746444702, + "learning_rate": 1.9256914566687156e-05, + "loss": 1.5978, + "step": 2418 + }, + { + "epoch": 0.18592501536570374, + "grad_norm": 4.210140228271484, + "learning_rate": 1.9256299938537186e-05, + "loss": 1.4916, + "step": 2420 + }, + { + "epoch": 0.18607867240319606, + "grad_norm": 3.8122193813323975, + "learning_rate": 1.925568531038722e-05, + "loss": 1.4786, + "step": 2422 + }, + { + "epoch": 0.18623232944068838, + "grad_norm": 4.192070484161377, + "learning_rate": 1.925507068223725e-05, + "loss": 1.5604, + "step": 2424 + }, + { + "epoch": 0.1863859864781807, + "grad_norm": 4.002556800842285, + "learning_rate": 1.925445605408728e-05, + "loss": 1.6205, + "step": 2426 + }, + { + "epoch": 0.186539643515673, + "grad_norm": 4.130584716796875, + "learning_rate": 1.9253841425937308e-05, + "loss": 1.6588, + "step": 2428 + }, + { + "epoch": 0.18669330055316533, + "grad_norm": 4.271806240081787, + "learning_rate": 1.925322679778734e-05, + "loss": 1.4771, + "step": 2430 + }, + { + "epoch": 0.18684695759065764, + "grad_norm": 3.9257447719573975, + "learning_rate": 1.925261216963737e-05, + "loss": 1.4746, + "step": 2432 + }, + { + "epoch": 0.18700061462814996, + "grad_norm": 3.5739989280700684, + "learning_rate": 1.92519975414874e-05, + "loss": 1.6682, + "step": 2434 + }, + { + "epoch": 0.18715427166564227, + "grad_norm": 4.071542263031006, + "learning_rate": 1.9251382913337434e-05, + "loss": 1.7167, + "step": 2436 + }, + { + "epoch": 0.1873079287031346, + "grad_norm": 3.6181557178497314, + "learning_rate": 1.9250768285187463e-05, + "loss": 1.4887, + "step": 2438 + }, + { + "epoch": 0.1874615857406269, + "grad_norm": 3.9054434299468994, + "learning_rate": 1.9250153657037493e-05, + "loss": 1.5903, + "step": 2440 + }, + { + "epoch": 0.18761524277811925, + "grad_norm": 3.884770631790161, + "learning_rate": 1.9249539028887526e-05, + "loss": 1.533, + "step": 2442 + }, + { + "epoch": 0.18776889981561157, + "grad_norm": 4.007852554321289, + "learning_rate": 1.9248924400737556e-05, + "loss": 1.4776, + "step": 2444 + }, + { + "epoch": 0.18792255685310388, + "grad_norm": 4.36082649230957, + "learning_rate": 1.9248309772587585e-05, + "loss": 1.6813, + "step": 2446 + }, + { + "epoch": 0.1880762138905962, + "grad_norm": 3.7035789489746094, + "learning_rate": 1.924769514443762e-05, + "loss": 1.5808, + "step": 2448 + }, + { + "epoch": 0.18822987092808852, + "grad_norm": 4.472562789916992, + "learning_rate": 1.9247080516287648e-05, + "loss": 1.6157, + "step": 2450 + }, + { + "epoch": 0.18838352796558083, + "grad_norm": 3.8329532146453857, + "learning_rate": 1.9246465888137678e-05, + "loss": 1.6105, + "step": 2452 + }, + { + "epoch": 0.18853718500307315, + "grad_norm": 3.5638465881347656, + "learning_rate": 1.9245851259987707e-05, + "loss": 1.3973, + "step": 2454 + }, + { + "epoch": 0.18869084204056547, + "grad_norm": 3.769814968109131, + "learning_rate": 1.924523663183774e-05, + "loss": 1.436, + "step": 2456 + }, + { + "epoch": 0.18884449907805778, + "grad_norm": 3.7624809741973877, + "learning_rate": 1.924462200368777e-05, + "loss": 1.6093, + "step": 2458 + }, + { + "epoch": 0.1889981561155501, + "grad_norm": 3.934250593185425, + "learning_rate": 1.92440073755378e-05, + "loss": 1.5041, + "step": 2460 + }, + { + "epoch": 0.18915181315304241, + "grad_norm": 4.421424865722656, + "learning_rate": 1.9243392747387833e-05, + "loss": 1.4708, + "step": 2462 + }, + { + "epoch": 0.18930547019053473, + "grad_norm": 4.019033432006836, + "learning_rate": 1.9242778119237863e-05, + "loss": 1.5314, + "step": 2464 + }, + { + "epoch": 0.18945912722802705, + "grad_norm": 4.172586917877197, + "learning_rate": 1.9242163491087892e-05, + "loss": 1.5259, + "step": 2466 + }, + { + "epoch": 0.18961278426551936, + "grad_norm": 3.919337034225464, + "learning_rate": 1.9241548862937925e-05, + "loss": 1.6305, + "step": 2468 + }, + { + "epoch": 0.18976644130301168, + "grad_norm": 4.666354656219482, + "learning_rate": 1.9240934234787955e-05, + "loss": 1.66, + "step": 2470 + }, + { + "epoch": 0.189920098340504, + "grad_norm": 3.4628660678863525, + "learning_rate": 1.9240319606637985e-05, + "loss": 1.4733, + "step": 2472 + }, + { + "epoch": 0.1900737553779963, + "grad_norm": 4.151192665100098, + "learning_rate": 1.9239704978488018e-05, + "loss": 1.6414, + "step": 2474 + }, + { + "epoch": 0.19022741241548863, + "grad_norm": 3.8772740364074707, + "learning_rate": 1.9239090350338048e-05, + "loss": 1.6062, + "step": 2476 + }, + { + "epoch": 0.19038106945298094, + "grad_norm": 4.541169166564941, + "learning_rate": 1.9238475722188077e-05, + "loss": 1.5332, + "step": 2478 + }, + { + "epoch": 0.19053472649047326, + "grad_norm": 3.821932315826416, + "learning_rate": 1.9237861094038107e-05, + "loss": 1.4712, + "step": 2480 + }, + { + "epoch": 0.19068838352796558, + "grad_norm": 4.174224853515625, + "learning_rate": 1.923724646588814e-05, + "loss": 1.5465, + "step": 2482 + }, + { + "epoch": 0.1908420405654579, + "grad_norm": 3.4857685565948486, + "learning_rate": 1.923663183773817e-05, + "loss": 1.3857, + "step": 2484 + }, + { + "epoch": 0.1909956976029502, + "grad_norm": 3.488931655883789, + "learning_rate": 1.92360172095882e-05, + "loss": 1.4848, + "step": 2486 + }, + { + "epoch": 0.19114935464044253, + "grad_norm": 4.406091213226318, + "learning_rate": 1.9235402581438232e-05, + "loss": 1.5634, + "step": 2488 + }, + { + "epoch": 0.19130301167793484, + "grad_norm": 4.208408832550049, + "learning_rate": 1.9234787953288262e-05, + "loss": 1.4935, + "step": 2490 + }, + { + "epoch": 0.19145666871542716, + "grad_norm": 4.038366794586182, + "learning_rate": 1.9234173325138292e-05, + "loss": 1.435, + "step": 2492 + }, + { + "epoch": 0.19161032575291947, + "grad_norm": 3.6954140663146973, + "learning_rate": 1.9233558696988325e-05, + "loss": 1.6045, + "step": 2494 + }, + { + "epoch": 0.1917639827904118, + "grad_norm": 3.928567409515381, + "learning_rate": 1.9232944068838355e-05, + "loss": 1.6917, + "step": 2496 + }, + { + "epoch": 0.1919176398279041, + "grad_norm": 3.8990769386291504, + "learning_rate": 1.9232329440688384e-05, + "loss": 1.6294, + "step": 2498 + }, + { + "epoch": 0.19207129686539642, + "grad_norm": 4.251073360443115, + "learning_rate": 1.9231714812538417e-05, + "loss": 1.7429, + "step": 2500 + }, + { + "epoch": 0.19222495390288874, + "grad_norm": 3.8998966217041016, + "learning_rate": 1.9231100184388447e-05, + "loss": 1.5395, + "step": 2502 + }, + { + "epoch": 0.19237861094038106, + "grad_norm": 3.5932552814483643, + "learning_rate": 1.9230485556238477e-05, + "loss": 1.4188, + "step": 2504 + }, + { + "epoch": 0.1925322679778734, + "grad_norm": 4.385290622711182, + "learning_rate": 1.9229870928088506e-05, + "loss": 1.5512, + "step": 2506 + }, + { + "epoch": 0.19268592501536572, + "grad_norm": 4.489023685455322, + "learning_rate": 1.922925629993854e-05, + "loss": 1.5199, + "step": 2508 + }, + { + "epoch": 0.19283958205285803, + "grad_norm": 3.464556932449341, + "learning_rate": 1.922864167178857e-05, + "loss": 1.6343, + "step": 2510 + }, + { + "epoch": 0.19299323909035035, + "grad_norm": 3.967116594314575, + "learning_rate": 1.92280270436386e-05, + "loss": 1.5112, + "step": 2512 + }, + { + "epoch": 0.19314689612784267, + "grad_norm": 3.6863479614257812, + "learning_rate": 1.9227412415488632e-05, + "loss": 1.4394, + "step": 2514 + }, + { + "epoch": 0.19330055316533498, + "grad_norm": 3.8512773513793945, + "learning_rate": 1.922679778733866e-05, + "loss": 1.5762, + "step": 2516 + }, + { + "epoch": 0.1934542102028273, + "grad_norm": 4.207437515258789, + "learning_rate": 1.922618315918869e-05, + "loss": 1.5081, + "step": 2518 + }, + { + "epoch": 0.19360786724031961, + "grad_norm": 3.882289409637451, + "learning_rate": 1.9225568531038724e-05, + "loss": 1.542, + "step": 2520 + }, + { + "epoch": 0.19376152427781193, + "grad_norm": 3.5873258113861084, + "learning_rate": 1.9224953902888754e-05, + "loss": 1.4955, + "step": 2522 + }, + { + "epoch": 0.19391518131530425, + "grad_norm": 4.062012195587158, + "learning_rate": 1.9224339274738784e-05, + "loss": 1.6206, + "step": 2524 + }, + { + "epoch": 0.19406883835279656, + "grad_norm": 4.009846210479736, + "learning_rate": 1.9223724646588813e-05, + "loss": 1.4736, + "step": 2526 + }, + { + "epoch": 0.19422249539028888, + "grad_norm": 3.5200116634368896, + "learning_rate": 1.9223110018438846e-05, + "loss": 1.5089, + "step": 2528 + }, + { + "epoch": 0.1943761524277812, + "grad_norm": 3.888718366622925, + "learning_rate": 1.9222495390288876e-05, + "loss": 1.5568, + "step": 2530 + }, + { + "epoch": 0.1945298094652735, + "grad_norm": 4.041849136352539, + "learning_rate": 1.9221880762138906e-05, + "loss": 1.57, + "step": 2532 + }, + { + "epoch": 0.19468346650276583, + "grad_norm": 3.235793352127075, + "learning_rate": 1.922126613398894e-05, + "loss": 1.562, + "step": 2534 + }, + { + "epoch": 0.19483712354025814, + "grad_norm": 3.7582755088806152, + "learning_rate": 1.922065150583897e-05, + "loss": 1.6906, + "step": 2536 + }, + { + "epoch": 0.19499078057775046, + "grad_norm": 4.138317108154297, + "learning_rate": 1.9220036877688998e-05, + "loss": 1.6045, + "step": 2538 + }, + { + "epoch": 0.19514443761524278, + "grad_norm": 3.8657236099243164, + "learning_rate": 1.921942224953903e-05, + "loss": 1.6388, + "step": 2540 + }, + { + "epoch": 0.1952980946527351, + "grad_norm": 4.225650310516357, + "learning_rate": 1.921880762138906e-05, + "loss": 1.5472, + "step": 2542 + }, + { + "epoch": 0.1954517516902274, + "grad_norm": 4.000063896179199, + "learning_rate": 1.921819299323909e-05, + "loss": 1.5838, + "step": 2544 + }, + { + "epoch": 0.19560540872771973, + "grad_norm": 3.2265849113464355, + "learning_rate": 1.9217578365089124e-05, + "loss": 1.401, + "step": 2546 + }, + { + "epoch": 0.19575906576521204, + "grad_norm": 4.049646854400635, + "learning_rate": 1.9216963736939153e-05, + "loss": 1.6306, + "step": 2548 + }, + { + "epoch": 0.19591272280270436, + "grad_norm": 4.136661052703857, + "learning_rate": 1.9216349108789187e-05, + "loss": 1.6809, + "step": 2550 + }, + { + "epoch": 0.19606637984019668, + "grad_norm": 4.1102294921875, + "learning_rate": 1.9215734480639213e-05, + "loss": 1.5619, + "step": 2552 + }, + { + "epoch": 0.196220036877689, + "grad_norm": 3.632099151611328, + "learning_rate": 1.9215119852489246e-05, + "loss": 1.4556, + "step": 2554 + }, + { + "epoch": 0.1963736939151813, + "grad_norm": 3.7188732624053955, + "learning_rate": 1.9214505224339276e-05, + "loss": 1.6132, + "step": 2556 + }, + { + "epoch": 0.19652735095267362, + "grad_norm": 3.4973299503326416, + "learning_rate": 1.9213890596189305e-05, + "loss": 1.5363, + "step": 2558 + }, + { + "epoch": 0.19668100799016594, + "grad_norm": 4.495885848999023, + "learning_rate": 1.921327596803934e-05, + "loss": 1.4821, + "step": 2560 + }, + { + "epoch": 0.19683466502765826, + "grad_norm": 4.349817752838135, + "learning_rate": 1.9212661339889368e-05, + "loss": 1.5153, + "step": 2562 + }, + { + "epoch": 0.19698832206515057, + "grad_norm": 3.5700700283050537, + "learning_rate": 1.9212046711739398e-05, + "loss": 1.6255, + "step": 2564 + }, + { + "epoch": 0.1971419791026429, + "grad_norm": 3.944878101348877, + "learning_rate": 1.921143208358943e-05, + "loss": 1.5505, + "step": 2566 + }, + { + "epoch": 0.1972956361401352, + "grad_norm": 4.354259490966797, + "learning_rate": 1.921081745543946e-05, + "loss": 1.6005, + "step": 2568 + }, + { + "epoch": 0.19744929317762752, + "grad_norm": 4.167532920837402, + "learning_rate": 1.9210202827289494e-05, + "loss": 1.6639, + "step": 2570 + }, + { + "epoch": 0.19760295021511987, + "grad_norm": 4.1236958503723145, + "learning_rate": 1.9209588199139523e-05, + "loss": 1.6543, + "step": 2572 + }, + { + "epoch": 0.19775660725261218, + "grad_norm": 4.3486328125, + "learning_rate": 1.9208973570989553e-05, + "loss": 1.6486, + "step": 2574 + }, + { + "epoch": 0.1979102642901045, + "grad_norm": 4.074930191040039, + "learning_rate": 1.9208358942839586e-05, + "loss": 1.5789, + "step": 2576 + }, + { + "epoch": 0.19806392132759681, + "grad_norm": 4.678352355957031, + "learning_rate": 1.9207744314689612e-05, + "loss": 1.5615, + "step": 2578 + }, + { + "epoch": 0.19821757836508913, + "grad_norm": 3.689697504043579, + "learning_rate": 1.9207129686539645e-05, + "loss": 1.4933, + "step": 2580 + }, + { + "epoch": 0.19837123540258145, + "grad_norm": 3.895163059234619, + "learning_rate": 1.9206515058389675e-05, + "loss": 1.5115, + "step": 2582 + }, + { + "epoch": 0.19852489244007376, + "grad_norm": 4.155092239379883, + "learning_rate": 1.9205900430239705e-05, + "loss": 1.4538, + "step": 2584 + }, + { + "epoch": 0.19867854947756608, + "grad_norm": 4.396788120269775, + "learning_rate": 1.9205285802089738e-05, + "loss": 1.5629, + "step": 2586 + }, + { + "epoch": 0.1988322065150584, + "grad_norm": 4.8076934814453125, + "learning_rate": 1.9204671173939767e-05, + "loss": 1.4844, + "step": 2588 + }, + { + "epoch": 0.1989858635525507, + "grad_norm": 3.6861722469329834, + "learning_rate": 1.92040565457898e-05, + "loss": 1.501, + "step": 2590 + }, + { + "epoch": 0.19913952059004303, + "grad_norm": 4.285679817199707, + "learning_rate": 1.920344191763983e-05, + "loss": 1.6599, + "step": 2592 + }, + { + "epoch": 0.19929317762753535, + "grad_norm": 4.052687644958496, + "learning_rate": 1.920282728948986e-05, + "loss": 1.4244, + "step": 2594 + }, + { + "epoch": 0.19944683466502766, + "grad_norm": 3.6357898712158203, + "learning_rate": 1.9202212661339893e-05, + "loss": 1.5088, + "step": 2596 + }, + { + "epoch": 0.19960049170251998, + "grad_norm": 3.789736270904541, + "learning_rate": 1.9201598033189923e-05, + "loss": 1.5779, + "step": 2598 + }, + { + "epoch": 0.1997541487400123, + "grad_norm": 4.087263107299805, + "learning_rate": 1.9200983405039952e-05, + "loss": 1.6094, + "step": 2600 + }, + { + "epoch": 0.1999078057775046, + "grad_norm": 4.693444728851318, + "learning_rate": 1.9200368776889985e-05, + "loss": 1.3526, + "step": 2602 + }, + { + "epoch": 0.20006146281499693, + "grad_norm": 3.850968837738037, + "learning_rate": 1.919975414874001e-05, + "loss": 1.5124, + "step": 2604 + }, + { + "epoch": 0.20021511985248924, + "grad_norm": 3.671071767807007, + "learning_rate": 1.9199139520590045e-05, + "loss": 1.5842, + "step": 2606 + }, + { + "epoch": 0.20036877688998156, + "grad_norm": 3.6253607273101807, + "learning_rate": 1.9198524892440074e-05, + "loss": 1.5148, + "step": 2608 + }, + { + "epoch": 0.20052243392747388, + "grad_norm": 4.358476638793945, + "learning_rate": 1.9197910264290104e-05, + "loss": 1.531, + "step": 2610 + }, + { + "epoch": 0.2006760909649662, + "grad_norm": 3.9361257553100586, + "learning_rate": 1.9197295636140137e-05, + "loss": 1.5476, + "step": 2612 + }, + { + "epoch": 0.2008297480024585, + "grad_norm": 4.064403533935547, + "learning_rate": 1.9196681007990167e-05, + "loss": 1.5514, + "step": 2614 + }, + { + "epoch": 0.20098340503995082, + "grad_norm": 4.13236665725708, + "learning_rate": 1.91960663798402e-05, + "loss": 1.5569, + "step": 2616 + }, + { + "epoch": 0.20113706207744314, + "grad_norm": 3.7941882610321045, + "learning_rate": 1.919545175169023e-05, + "loss": 1.511, + "step": 2618 + }, + { + "epoch": 0.20129071911493546, + "grad_norm": 3.5626373291015625, + "learning_rate": 1.919483712354026e-05, + "loss": 1.4677, + "step": 2620 + }, + { + "epoch": 0.20144437615242777, + "grad_norm": 4.068457126617432, + "learning_rate": 1.9194222495390292e-05, + "loss": 1.5312, + "step": 2622 + }, + { + "epoch": 0.2015980331899201, + "grad_norm": 4.036095142364502, + "learning_rate": 1.919360786724032e-05, + "loss": 1.6681, + "step": 2624 + }, + { + "epoch": 0.2017516902274124, + "grad_norm": 3.8620471954345703, + "learning_rate": 1.9192993239090352e-05, + "loss": 1.5411, + "step": 2626 + }, + { + "epoch": 0.20190534726490472, + "grad_norm": 3.6075820922851562, + "learning_rate": 1.919237861094038e-05, + "loss": 1.5125, + "step": 2628 + }, + { + "epoch": 0.20205900430239704, + "grad_norm": 3.6271731853485107, + "learning_rate": 1.919176398279041e-05, + "loss": 1.5795, + "step": 2630 + }, + { + "epoch": 0.20221266133988935, + "grad_norm": 4.1661810874938965, + "learning_rate": 1.9191149354640444e-05, + "loss": 1.5523, + "step": 2632 + }, + { + "epoch": 0.20236631837738167, + "grad_norm": 4.115680694580078, + "learning_rate": 1.9190534726490474e-05, + "loss": 1.5146, + "step": 2634 + }, + { + "epoch": 0.20251997541487401, + "grad_norm": 3.4160807132720947, + "learning_rate": 1.9189920098340507e-05, + "loss": 1.5035, + "step": 2636 + }, + { + "epoch": 0.20267363245236633, + "grad_norm": 4.1160969734191895, + "learning_rate": 1.9189305470190537e-05, + "loss": 1.481, + "step": 2638 + }, + { + "epoch": 0.20282728948985865, + "grad_norm": 4.941380500793457, + "learning_rate": 1.9188690842040566e-05, + "loss": 1.7115, + "step": 2640 + }, + { + "epoch": 0.20298094652735096, + "grad_norm": 3.743597984313965, + "learning_rate": 1.91880762138906e-05, + "loss": 1.5442, + "step": 2642 + }, + { + "epoch": 0.20313460356484328, + "grad_norm": 4.364147186279297, + "learning_rate": 1.918746158574063e-05, + "loss": 1.3992, + "step": 2644 + }, + { + "epoch": 0.2032882606023356, + "grad_norm": 3.69368052482605, + "learning_rate": 1.918684695759066e-05, + "loss": 1.4778, + "step": 2646 + }, + { + "epoch": 0.2034419176398279, + "grad_norm": 4.028282642364502, + "learning_rate": 1.9186232329440692e-05, + "loss": 1.6028, + "step": 2648 + }, + { + "epoch": 0.20359557467732023, + "grad_norm": 3.767077922821045, + "learning_rate": 1.9185617701290718e-05, + "loss": 1.4907, + "step": 2650 + }, + { + "epoch": 0.20374923171481255, + "grad_norm": 3.511185646057129, + "learning_rate": 1.918500307314075e-05, + "loss": 1.6204, + "step": 2652 + }, + { + "epoch": 0.20390288875230486, + "grad_norm": 3.6800639629364014, + "learning_rate": 1.918438844499078e-05, + "loss": 1.5762, + "step": 2654 + }, + { + "epoch": 0.20405654578979718, + "grad_norm": 3.3730273246765137, + "learning_rate": 1.9183773816840814e-05, + "loss": 1.4401, + "step": 2656 + }, + { + "epoch": 0.2042102028272895, + "grad_norm": 4.639964580535889, + "learning_rate": 1.9183159188690844e-05, + "loss": 1.5344, + "step": 2658 + }, + { + "epoch": 0.2043638598647818, + "grad_norm": 3.624997138977051, + "learning_rate": 1.9182544560540873e-05, + "loss": 1.5386, + "step": 2660 + }, + { + "epoch": 0.20451751690227413, + "grad_norm": 4.5834503173828125, + "learning_rate": 1.9181929932390906e-05, + "loss": 1.6546, + "step": 2662 + }, + { + "epoch": 0.20467117393976644, + "grad_norm": 3.5726184844970703, + "learning_rate": 1.9181315304240936e-05, + "loss": 1.4267, + "step": 2664 + }, + { + "epoch": 0.20482483097725876, + "grad_norm": 3.3437399864196777, + "learning_rate": 1.9180700676090966e-05, + "loss": 1.6013, + "step": 2666 + }, + { + "epoch": 0.20497848801475108, + "grad_norm": 3.5842888355255127, + "learning_rate": 1.9180086047941e-05, + "loss": 1.4479, + "step": 2668 + }, + { + "epoch": 0.2051321450522434, + "grad_norm": 4.096655368804932, + "learning_rate": 1.917947141979103e-05, + "loss": 1.4553, + "step": 2670 + }, + { + "epoch": 0.2052858020897357, + "grad_norm": 4.347322940826416, + "learning_rate": 1.9178856791641058e-05, + "loss": 1.3843, + "step": 2672 + }, + { + "epoch": 0.20543945912722802, + "grad_norm": 3.9682047367095947, + "learning_rate": 1.917824216349109e-05, + "loss": 1.6752, + "step": 2674 + }, + { + "epoch": 0.20559311616472034, + "grad_norm": 4.238104343414307, + "learning_rate": 1.917762753534112e-05, + "loss": 1.5107, + "step": 2676 + }, + { + "epoch": 0.20574677320221266, + "grad_norm": 3.6419684886932373, + "learning_rate": 1.917701290719115e-05, + "loss": 1.6268, + "step": 2678 + }, + { + "epoch": 0.20590043023970497, + "grad_norm": 4.493232727050781, + "learning_rate": 1.917639827904118e-05, + "loss": 1.5479, + "step": 2680 + }, + { + "epoch": 0.2060540872771973, + "grad_norm": 3.5530130863189697, + "learning_rate": 1.9175783650891213e-05, + "loss": 1.4865, + "step": 2682 + }, + { + "epoch": 0.2062077443146896, + "grad_norm": 3.930048942565918, + "learning_rate": 1.9175169022741243e-05, + "loss": 1.4627, + "step": 2684 + }, + { + "epoch": 0.20636140135218192, + "grad_norm": 4.193355560302734, + "learning_rate": 1.9174554394591273e-05, + "loss": 1.5918, + "step": 2686 + }, + { + "epoch": 0.20651505838967424, + "grad_norm": 3.4783825874328613, + "learning_rate": 1.9173939766441306e-05, + "loss": 1.441, + "step": 2688 + }, + { + "epoch": 0.20666871542716655, + "grad_norm": 3.6716883182525635, + "learning_rate": 1.9173325138291336e-05, + "loss": 1.5617, + "step": 2690 + }, + { + "epoch": 0.20682237246465887, + "grad_norm": 3.806039810180664, + "learning_rate": 1.9172710510141365e-05, + "loss": 1.5566, + "step": 2692 + }, + { + "epoch": 0.2069760295021512, + "grad_norm": 3.7331125736236572, + "learning_rate": 1.9172095881991398e-05, + "loss": 1.4934, + "step": 2694 + }, + { + "epoch": 0.2071296865396435, + "grad_norm": 4.192966938018799, + "learning_rate": 1.9171481253841428e-05, + "loss": 1.5679, + "step": 2696 + }, + { + "epoch": 0.20728334357713582, + "grad_norm": 3.590785503387451, + "learning_rate": 1.9170866625691458e-05, + "loss": 1.4876, + "step": 2698 + }, + { + "epoch": 0.20743700061462814, + "grad_norm": 4.1026458740234375, + "learning_rate": 1.917025199754149e-05, + "loss": 1.5708, + "step": 2700 + }, + { + "epoch": 0.20759065765212048, + "grad_norm": 3.8768205642700195, + "learning_rate": 1.916963736939152e-05, + "loss": 1.4715, + "step": 2702 + }, + { + "epoch": 0.2077443146896128, + "grad_norm": 3.648543357849121, + "learning_rate": 1.916902274124155e-05, + "loss": 1.6127, + "step": 2704 + }, + { + "epoch": 0.2078979717271051, + "grad_norm": 3.6755056381225586, + "learning_rate": 1.916840811309158e-05, + "loss": 1.6711, + "step": 2706 + }, + { + "epoch": 0.20805162876459743, + "grad_norm": 3.4196550846099854, + "learning_rate": 1.9167793484941613e-05, + "loss": 1.4866, + "step": 2708 + }, + { + "epoch": 0.20820528580208975, + "grad_norm": 3.9521472454071045, + "learning_rate": 1.9167178856791643e-05, + "loss": 1.5474, + "step": 2710 + }, + { + "epoch": 0.20835894283958206, + "grad_norm": 4.077064037322998, + "learning_rate": 1.9166564228641672e-05, + "loss": 1.6824, + "step": 2712 + }, + { + "epoch": 0.20851259987707438, + "grad_norm": 4.164952278137207, + "learning_rate": 1.9165949600491705e-05, + "loss": 1.4411, + "step": 2714 + }, + { + "epoch": 0.2086662569145667, + "grad_norm": 3.788529396057129, + "learning_rate": 1.9165334972341735e-05, + "loss": 1.6062, + "step": 2716 + }, + { + "epoch": 0.208819913952059, + "grad_norm": 4.074838638305664, + "learning_rate": 1.9164720344191765e-05, + "loss": 1.567, + "step": 2718 + }, + { + "epoch": 0.20897357098955133, + "grad_norm": 3.398486852645874, + "learning_rate": 1.9164105716041798e-05, + "loss": 1.6186, + "step": 2720 + }, + { + "epoch": 0.20912722802704364, + "grad_norm": 3.534534215927124, + "learning_rate": 1.9163491087891827e-05, + "loss": 1.4404, + "step": 2722 + }, + { + "epoch": 0.20928088506453596, + "grad_norm": 3.3846664428710938, + "learning_rate": 1.9162876459741857e-05, + "loss": 1.4741, + "step": 2724 + }, + { + "epoch": 0.20943454210202828, + "grad_norm": 3.8569769859313965, + "learning_rate": 1.916226183159189e-05, + "loss": 1.4941, + "step": 2726 + }, + { + "epoch": 0.2095881991395206, + "grad_norm": 3.6158618927001953, + "learning_rate": 1.916164720344192e-05, + "loss": 1.5539, + "step": 2728 + }, + { + "epoch": 0.2097418561770129, + "grad_norm": 4.233087062835693, + "learning_rate": 1.916103257529195e-05, + "loss": 1.5474, + "step": 2730 + }, + { + "epoch": 0.20989551321450522, + "grad_norm": 3.693319797515869, + "learning_rate": 1.916041794714198e-05, + "loss": 1.5927, + "step": 2732 + }, + { + "epoch": 0.21004917025199754, + "grad_norm": 3.5169527530670166, + "learning_rate": 1.9159803318992012e-05, + "loss": 1.3885, + "step": 2734 + }, + { + "epoch": 0.21020282728948986, + "grad_norm": 3.507129430770874, + "learning_rate": 1.9159188690842042e-05, + "loss": 1.4943, + "step": 2736 + }, + { + "epoch": 0.21035648432698217, + "grad_norm": 4.948350429534912, + "learning_rate": 1.915857406269207e-05, + "loss": 1.505, + "step": 2738 + }, + { + "epoch": 0.2105101413644745, + "grad_norm": 3.834376573562622, + "learning_rate": 1.9157959434542105e-05, + "loss": 1.6174, + "step": 2740 + }, + { + "epoch": 0.2106637984019668, + "grad_norm": 4.351836204528809, + "learning_rate": 1.9157344806392134e-05, + "loss": 1.5882, + "step": 2742 + }, + { + "epoch": 0.21081745543945912, + "grad_norm": 3.8399126529693604, + "learning_rate": 1.9156730178242164e-05, + "loss": 1.5669, + "step": 2744 + }, + { + "epoch": 0.21097111247695144, + "grad_norm": 4.092167377471924, + "learning_rate": 1.9156115550092197e-05, + "loss": 1.5737, + "step": 2746 + }, + { + "epoch": 0.21112476951444376, + "grad_norm": 3.5932557582855225, + "learning_rate": 1.9155500921942227e-05, + "loss": 1.5082, + "step": 2748 + }, + { + "epoch": 0.21127842655193607, + "grad_norm": 4.129446506500244, + "learning_rate": 1.9154886293792257e-05, + "loss": 1.6286, + "step": 2750 + }, + { + "epoch": 0.2114320835894284, + "grad_norm": 3.489421844482422, + "learning_rate": 1.9154271665642286e-05, + "loss": 1.5005, + "step": 2752 + }, + { + "epoch": 0.2115857406269207, + "grad_norm": 3.7528555393218994, + "learning_rate": 1.915365703749232e-05, + "loss": 1.5174, + "step": 2754 + }, + { + "epoch": 0.21173939766441302, + "grad_norm": 3.6098313331604004, + "learning_rate": 1.915304240934235e-05, + "loss": 1.4932, + "step": 2756 + }, + { + "epoch": 0.21189305470190534, + "grad_norm": 3.8902368545532227, + "learning_rate": 1.915242778119238e-05, + "loss": 1.4891, + "step": 2758 + }, + { + "epoch": 0.21204671173939765, + "grad_norm": 4.011685371398926, + "learning_rate": 1.9151813153042412e-05, + "loss": 1.5252, + "step": 2760 + }, + { + "epoch": 0.21220036877688997, + "grad_norm": 3.6917941570281982, + "learning_rate": 1.915119852489244e-05, + "loss": 1.5622, + "step": 2762 + }, + { + "epoch": 0.21235402581438229, + "grad_norm": 3.5468204021453857, + "learning_rate": 1.915058389674247e-05, + "loss": 1.3541, + "step": 2764 + }, + { + "epoch": 0.21250768285187463, + "grad_norm": 3.7897398471832275, + "learning_rate": 1.9149969268592504e-05, + "loss": 1.7065, + "step": 2766 + }, + { + "epoch": 0.21266133988936695, + "grad_norm": 3.539816379547119, + "learning_rate": 1.9149354640442534e-05, + "loss": 1.4987, + "step": 2768 + }, + { + "epoch": 0.21281499692685926, + "grad_norm": 3.866915702819824, + "learning_rate": 1.9148740012292564e-05, + "loss": 1.6211, + "step": 2770 + }, + { + "epoch": 0.21296865396435158, + "grad_norm": 4.1955180168151855, + "learning_rate": 1.9148125384142597e-05, + "loss": 1.448, + "step": 2772 + }, + { + "epoch": 0.2131223110018439, + "grad_norm": 3.6780476570129395, + "learning_rate": 1.9147510755992626e-05, + "loss": 1.5676, + "step": 2774 + }, + { + "epoch": 0.2132759680393362, + "grad_norm": 4.097123622894287, + "learning_rate": 1.9146896127842656e-05, + "loss": 1.5146, + "step": 2776 + }, + { + "epoch": 0.21342962507682853, + "grad_norm": 3.8207781314849854, + "learning_rate": 1.9146281499692686e-05, + "loss": 1.4983, + "step": 2778 + }, + { + "epoch": 0.21358328211432084, + "grad_norm": 3.585047960281372, + "learning_rate": 1.914566687154272e-05, + "loss": 1.4271, + "step": 2780 + }, + { + "epoch": 0.21373693915181316, + "grad_norm": 3.4344520568847656, + "learning_rate": 1.914505224339275e-05, + "loss": 1.5673, + "step": 2782 + }, + { + "epoch": 0.21389059618930548, + "grad_norm": 3.5881597995758057, + "learning_rate": 1.9144437615242778e-05, + "loss": 1.4111, + "step": 2784 + }, + { + "epoch": 0.2140442532267978, + "grad_norm": 3.65913724899292, + "learning_rate": 1.914382298709281e-05, + "loss": 1.5894, + "step": 2786 + }, + { + "epoch": 0.2141979102642901, + "grad_norm": 3.761591911315918, + "learning_rate": 1.914320835894284e-05, + "loss": 1.4502, + "step": 2788 + }, + { + "epoch": 0.21435156730178243, + "grad_norm": 3.943924903869629, + "learning_rate": 1.914259373079287e-05, + "loss": 1.4754, + "step": 2790 + }, + { + "epoch": 0.21450522433927474, + "grad_norm": 4.111385345458984, + "learning_rate": 1.9141979102642904e-05, + "loss": 1.5946, + "step": 2792 + }, + { + "epoch": 0.21465888137676706, + "grad_norm": 3.8199594020843506, + "learning_rate": 1.9141364474492933e-05, + "loss": 1.4817, + "step": 2794 + }, + { + "epoch": 0.21481253841425937, + "grad_norm": 3.667856454849243, + "learning_rate": 1.9140749846342963e-05, + "loss": 1.5357, + "step": 2796 + }, + { + "epoch": 0.2149661954517517, + "grad_norm": 3.980133295059204, + "learning_rate": 1.9140135218192996e-05, + "loss": 1.6328, + "step": 2798 + }, + { + "epoch": 0.215119852489244, + "grad_norm": 4.220106601715088, + "learning_rate": 1.9139520590043026e-05, + "loss": 1.5825, + "step": 2800 + }, + { + "epoch": 0.21527350952673632, + "grad_norm": 3.8425889015197754, + "learning_rate": 1.913890596189306e-05, + "loss": 1.5565, + "step": 2802 + }, + { + "epoch": 0.21542716656422864, + "grad_norm": 3.5814743041992188, + "learning_rate": 1.9138291333743085e-05, + "loss": 1.5044, + "step": 2804 + }, + { + "epoch": 0.21558082360172096, + "grad_norm": 4.015439510345459, + "learning_rate": 1.9137676705593118e-05, + "loss": 1.6277, + "step": 2806 + }, + { + "epoch": 0.21573448063921327, + "grad_norm": 4.553494930267334, + "learning_rate": 1.9137062077443148e-05, + "loss": 1.4374, + "step": 2808 + }, + { + "epoch": 0.2158881376767056, + "grad_norm": 3.7021353244781494, + "learning_rate": 1.9136447449293178e-05, + "loss": 1.5173, + "step": 2810 + }, + { + "epoch": 0.2160417947141979, + "grad_norm": 4.46372652053833, + "learning_rate": 1.913583282114321e-05, + "loss": 1.7864, + "step": 2812 + }, + { + "epoch": 0.21619545175169022, + "grad_norm": 3.551791191101074, + "learning_rate": 1.913521819299324e-05, + "loss": 1.3458, + "step": 2814 + }, + { + "epoch": 0.21634910878918254, + "grad_norm": 3.224550724029541, + "learning_rate": 1.913460356484327e-05, + "loss": 1.4272, + "step": 2816 + }, + { + "epoch": 0.21650276582667485, + "grad_norm": 4.008483409881592, + "learning_rate": 1.9133988936693303e-05, + "loss": 1.5502, + "step": 2818 + }, + { + "epoch": 0.21665642286416717, + "grad_norm": 3.782654047012329, + "learning_rate": 1.9133374308543333e-05, + "loss": 1.413, + "step": 2820 + }, + { + "epoch": 0.21681007990165949, + "grad_norm": 3.2302143573760986, + "learning_rate": 1.9132759680393366e-05, + "loss": 1.4368, + "step": 2822 + }, + { + "epoch": 0.2169637369391518, + "grad_norm": 3.4619314670562744, + "learning_rate": 1.9132145052243395e-05, + "loss": 1.4042, + "step": 2824 + }, + { + "epoch": 0.21711739397664412, + "grad_norm": 4.229294300079346, + "learning_rate": 1.9131530424093425e-05, + "loss": 1.5187, + "step": 2826 + }, + { + "epoch": 0.21727105101413643, + "grad_norm": 3.9806225299835205, + "learning_rate": 1.9130915795943458e-05, + "loss": 1.4521, + "step": 2828 + }, + { + "epoch": 0.21742470805162875, + "grad_norm": 3.7580087184906006, + "learning_rate": 1.9130301167793485e-05, + "loss": 1.5982, + "step": 2830 + }, + { + "epoch": 0.2175783650891211, + "grad_norm": 3.9534852504730225, + "learning_rate": 1.9129686539643518e-05, + "loss": 1.6518, + "step": 2832 + }, + { + "epoch": 0.2177320221266134, + "grad_norm": 3.770359992980957, + "learning_rate": 1.9129071911493547e-05, + "loss": 1.3771, + "step": 2834 + }, + { + "epoch": 0.21788567916410573, + "grad_norm": 4.148795127868652, + "learning_rate": 1.9128457283343577e-05, + "loss": 1.6236, + "step": 2836 + }, + { + "epoch": 0.21803933620159804, + "grad_norm": 4.049807548522949, + "learning_rate": 1.912784265519361e-05, + "loss": 1.6702, + "step": 2838 + }, + { + "epoch": 0.21819299323909036, + "grad_norm": 3.7045743465423584, + "learning_rate": 1.912722802704364e-05, + "loss": 1.54, + "step": 2840 + }, + { + "epoch": 0.21834665027658268, + "grad_norm": 3.9694302082061768, + "learning_rate": 1.9126613398893673e-05, + "loss": 1.6067, + "step": 2842 + }, + { + "epoch": 0.218500307314075, + "grad_norm": 3.4496958255767822, + "learning_rate": 1.9125998770743702e-05, + "loss": 1.5128, + "step": 2844 + }, + { + "epoch": 0.2186539643515673, + "grad_norm": 3.6536827087402344, + "learning_rate": 1.9125384142593732e-05, + "loss": 1.5447, + "step": 2846 + }, + { + "epoch": 0.21880762138905963, + "grad_norm": 3.9362754821777344, + "learning_rate": 1.9124769514443765e-05, + "loss": 1.4012, + "step": 2848 + }, + { + "epoch": 0.21896127842655194, + "grad_norm": 3.2885048389434814, + "learning_rate": 1.912415488629379e-05, + "loss": 1.4489, + "step": 2850 + }, + { + "epoch": 0.21911493546404426, + "grad_norm": 3.519296884536743, + "learning_rate": 1.9123540258143825e-05, + "loss": 1.4015, + "step": 2852 + }, + { + "epoch": 0.21926859250153657, + "grad_norm": 4.157144546508789, + "learning_rate": 1.9122925629993854e-05, + "loss": 1.5355, + "step": 2854 + }, + { + "epoch": 0.2194222495390289, + "grad_norm": 3.6727182865142822, + "learning_rate": 1.9122311001843884e-05, + "loss": 1.5225, + "step": 2856 + }, + { + "epoch": 0.2195759065765212, + "grad_norm": 3.801722288131714, + "learning_rate": 1.9121696373693917e-05, + "loss": 1.4482, + "step": 2858 + }, + { + "epoch": 0.21972956361401352, + "grad_norm": 4.15856409072876, + "learning_rate": 1.9121081745543947e-05, + "loss": 1.5494, + "step": 2860 + }, + { + "epoch": 0.21988322065150584, + "grad_norm": 4.247369289398193, + "learning_rate": 1.912046711739398e-05, + "loss": 1.4578, + "step": 2862 + }, + { + "epoch": 0.22003687768899816, + "grad_norm": 4.413219928741455, + "learning_rate": 1.911985248924401e-05, + "loss": 1.5631, + "step": 2864 + }, + { + "epoch": 0.22019053472649047, + "grad_norm": 3.894334077835083, + "learning_rate": 1.911923786109404e-05, + "loss": 1.5072, + "step": 2866 + }, + { + "epoch": 0.2203441917639828, + "grad_norm": 3.5060431957244873, + "learning_rate": 1.9118623232944072e-05, + "loss": 1.5318, + "step": 2868 + }, + { + "epoch": 0.2204978488014751, + "grad_norm": 3.312140464782715, + "learning_rate": 1.9118008604794102e-05, + "loss": 1.5173, + "step": 2870 + }, + { + "epoch": 0.22065150583896742, + "grad_norm": 4.122204780578613, + "learning_rate": 1.911739397664413e-05, + "loss": 1.5222, + "step": 2872 + }, + { + "epoch": 0.22080516287645974, + "grad_norm": 3.994349956512451, + "learning_rate": 1.9116779348494165e-05, + "loss": 1.6783, + "step": 2874 + }, + { + "epoch": 0.22095881991395205, + "grad_norm": 3.831480026245117, + "learning_rate": 1.911616472034419e-05, + "loss": 1.4112, + "step": 2876 + }, + { + "epoch": 0.22111247695144437, + "grad_norm": 3.1337056159973145, + "learning_rate": 1.9115550092194224e-05, + "loss": 1.4297, + "step": 2878 + }, + { + "epoch": 0.2212661339889367, + "grad_norm": 3.5576846599578857, + "learning_rate": 1.9114935464044254e-05, + "loss": 1.5911, + "step": 2880 + }, + { + "epoch": 0.221419791026429, + "grad_norm": 4.290283679962158, + "learning_rate": 1.9114320835894283e-05, + "loss": 1.381, + "step": 2882 + }, + { + "epoch": 0.22157344806392132, + "grad_norm": 3.7715091705322266, + "learning_rate": 1.9113706207744316e-05, + "loss": 1.4368, + "step": 2884 + }, + { + "epoch": 0.22172710510141364, + "grad_norm": 3.3459980487823486, + "learning_rate": 1.9113091579594346e-05, + "loss": 1.3408, + "step": 2886 + }, + { + "epoch": 0.22188076213890595, + "grad_norm": 3.626512050628662, + "learning_rate": 1.911247695144438e-05, + "loss": 1.5328, + "step": 2888 + }, + { + "epoch": 0.22203441917639827, + "grad_norm": 4.11099100112915, + "learning_rate": 1.911186232329441e-05, + "loss": 1.3911, + "step": 2890 + }, + { + "epoch": 0.22218807621389058, + "grad_norm": 3.386157751083374, + "learning_rate": 1.911124769514444e-05, + "loss": 1.4783, + "step": 2892 + }, + { + "epoch": 0.2223417332513829, + "grad_norm": 3.7897472381591797, + "learning_rate": 1.911063306699447e-05, + "loss": 1.723, + "step": 2894 + }, + { + "epoch": 0.22249539028887522, + "grad_norm": 4.359883785247803, + "learning_rate": 1.91100184388445e-05, + "loss": 1.515, + "step": 2896 + }, + { + "epoch": 0.22264904732636756, + "grad_norm": 3.69624924659729, + "learning_rate": 1.910940381069453e-05, + "loss": 1.4789, + "step": 2898 + }, + { + "epoch": 0.22280270436385988, + "grad_norm": 3.6302242279052734, + "learning_rate": 1.9108789182544564e-05, + "loss": 1.4188, + "step": 2900 + }, + { + "epoch": 0.2229563614013522, + "grad_norm": 3.765510082244873, + "learning_rate": 1.910817455439459e-05, + "loss": 1.5475, + "step": 2902 + }, + { + "epoch": 0.2231100184388445, + "grad_norm": 3.6644978523254395, + "learning_rate": 1.9107559926244623e-05, + "loss": 1.3725, + "step": 2904 + }, + { + "epoch": 0.22326367547633683, + "grad_norm": 3.9660632610321045, + "learning_rate": 1.9106945298094653e-05, + "loss": 1.6626, + "step": 2906 + }, + { + "epoch": 0.22341733251382914, + "grad_norm": 3.578211545944214, + "learning_rate": 1.9106330669944686e-05, + "loss": 1.7112, + "step": 2908 + }, + { + "epoch": 0.22357098955132146, + "grad_norm": 3.61378812789917, + "learning_rate": 1.9105716041794716e-05, + "loss": 1.4498, + "step": 2910 + }, + { + "epoch": 0.22372464658881377, + "grad_norm": 3.705375909805298, + "learning_rate": 1.9105101413644746e-05, + "loss": 1.5779, + "step": 2912 + }, + { + "epoch": 0.2238783036263061, + "grad_norm": 3.6211791038513184, + "learning_rate": 1.910448678549478e-05, + "loss": 1.5316, + "step": 2914 + }, + { + "epoch": 0.2240319606637984, + "grad_norm": 3.84789776802063, + "learning_rate": 1.910387215734481e-05, + "loss": 1.4939, + "step": 2916 + }, + { + "epoch": 0.22418561770129072, + "grad_norm": 3.532919406890869, + "learning_rate": 1.9103257529194838e-05, + "loss": 1.4652, + "step": 2918 + }, + { + "epoch": 0.22433927473878304, + "grad_norm": 4.090033531188965, + "learning_rate": 1.910264290104487e-05, + "loss": 1.6013, + "step": 2920 + }, + { + "epoch": 0.22449293177627536, + "grad_norm": 3.964073896408081, + "learning_rate": 1.91020282728949e-05, + "loss": 1.5672, + "step": 2922 + }, + { + "epoch": 0.22464658881376767, + "grad_norm": 3.7312428951263428, + "learning_rate": 1.910141364474493e-05, + "loss": 1.4724, + "step": 2924 + }, + { + "epoch": 0.22480024585126, + "grad_norm": 3.863544464111328, + "learning_rate": 1.9100799016594964e-05, + "loss": 1.4465, + "step": 2926 + }, + { + "epoch": 0.2249539028887523, + "grad_norm": 3.6252338886260986, + "learning_rate": 1.9100184388444993e-05, + "loss": 1.5351, + "step": 2928 + }, + { + "epoch": 0.22510755992624462, + "grad_norm": 3.6253201961517334, + "learning_rate": 1.9099569760295023e-05, + "loss": 1.487, + "step": 2930 + }, + { + "epoch": 0.22526121696373694, + "grad_norm": 4.433292865753174, + "learning_rate": 1.9098955132145053e-05, + "loss": 1.5361, + "step": 2932 + }, + { + "epoch": 0.22541487400122925, + "grad_norm": 3.5505495071411133, + "learning_rate": 1.9098340503995086e-05, + "loss": 1.5658, + "step": 2934 + }, + { + "epoch": 0.22556853103872157, + "grad_norm": 3.8930625915527344, + "learning_rate": 1.9097725875845115e-05, + "loss": 1.6665, + "step": 2936 + }, + { + "epoch": 0.2257221880762139, + "grad_norm": 3.6677374839782715, + "learning_rate": 1.9097111247695145e-05, + "loss": 1.4741, + "step": 2938 + }, + { + "epoch": 0.2258758451137062, + "grad_norm": 3.793630838394165, + "learning_rate": 1.9096496619545178e-05, + "loss": 1.5319, + "step": 2940 + }, + { + "epoch": 0.22602950215119852, + "grad_norm": 3.9335010051727295, + "learning_rate": 1.9095881991395208e-05, + "loss": 1.5558, + "step": 2942 + }, + { + "epoch": 0.22618315918869084, + "grad_norm": 3.583728551864624, + "learning_rate": 1.9095267363245237e-05, + "loss": 1.4462, + "step": 2944 + }, + { + "epoch": 0.22633681622618315, + "grad_norm": 3.9286651611328125, + "learning_rate": 1.909465273509527e-05, + "loss": 1.438, + "step": 2946 + }, + { + "epoch": 0.22649047326367547, + "grad_norm": 3.526460647583008, + "learning_rate": 1.90940381069453e-05, + "loss": 1.3429, + "step": 2948 + }, + { + "epoch": 0.22664413030116778, + "grad_norm": 4.004605293273926, + "learning_rate": 1.909342347879533e-05, + "loss": 1.6075, + "step": 2950 + }, + { + "epoch": 0.2267977873386601, + "grad_norm": 3.9438345432281494, + "learning_rate": 1.909280885064536e-05, + "loss": 1.5637, + "step": 2952 + }, + { + "epoch": 0.22695144437615242, + "grad_norm": 4.000015735626221, + "learning_rate": 1.9092194222495393e-05, + "loss": 1.4984, + "step": 2954 + }, + { + "epoch": 0.22710510141364473, + "grad_norm": 3.9805843830108643, + "learning_rate": 1.9091579594345422e-05, + "loss": 1.5232, + "step": 2956 + }, + { + "epoch": 0.22725875845113705, + "grad_norm": 3.8267416954040527, + "learning_rate": 1.9090964966195452e-05, + "loss": 1.496, + "step": 2958 + }, + { + "epoch": 0.22741241548862937, + "grad_norm": 3.9185121059417725, + "learning_rate": 1.9090350338045485e-05, + "loss": 1.5375, + "step": 2960 + }, + { + "epoch": 0.2275660725261217, + "grad_norm": 3.3180205821990967, + "learning_rate": 1.9089735709895515e-05, + "loss": 1.3771, + "step": 2962 + }, + { + "epoch": 0.22771972956361403, + "grad_norm": 3.9198102951049805, + "learning_rate": 1.9089121081745544e-05, + "loss": 1.4476, + "step": 2964 + }, + { + "epoch": 0.22787338660110634, + "grad_norm": 4.271195411682129, + "learning_rate": 1.9088506453595578e-05, + "loss": 1.5008, + "step": 2966 + }, + { + "epoch": 0.22802704363859866, + "grad_norm": 3.8531033992767334, + "learning_rate": 1.9087891825445607e-05, + "loss": 1.5622, + "step": 2968 + }, + { + "epoch": 0.22818070067609098, + "grad_norm": 3.83855938911438, + "learning_rate": 1.9087277197295637e-05, + "loss": 1.5225, + "step": 2970 + }, + { + "epoch": 0.2283343577135833, + "grad_norm": 3.9290051460266113, + "learning_rate": 1.908666256914567e-05, + "loss": 1.5943, + "step": 2972 + }, + { + "epoch": 0.2284880147510756, + "grad_norm": 3.9568498134613037, + "learning_rate": 1.90860479409957e-05, + "loss": 1.4396, + "step": 2974 + }, + { + "epoch": 0.22864167178856792, + "grad_norm": 3.8864598274230957, + "learning_rate": 1.908543331284573e-05, + "loss": 1.5007, + "step": 2976 + }, + { + "epoch": 0.22879532882606024, + "grad_norm": 3.8972291946411133, + "learning_rate": 1.908481868469576e-05, + "loss": 1.4453, + "step": 2978 + }, + { + "epoch": 0.22894898586355256, + "grad_norm": 3.3269383907318115, + "learning_rate": 1.9084204056545792e-05, + "loss": 1.4875, + "step": 2980 + }, + { + "epoch": 0.22910264290104487, + "grad_norm": 4.227247714996338, + "learning_rate": 1.9083589428395822e-05, + "loss": 1.4625, + "step": 2982 + }, + { + "epoch": 0.2292562999385372, + "grad_norm": 4.22311544418335, + "learning_rate": 1.908297480024585e-05, + "loss": 1.6099, + "step": 2984 + }, + { + "epoch": 0.2294099569760295, + "grad_norm": 3.618152141571045, + "learning_rate": 1.9082360172095885e-05, + "loss": 1.3739, + "step": 2986 + }, + { + "epoch": 0.22956361401352182, + "grad_norm": 4.052188873291016, + "learning_rate": 1.9081745543945914e-05, + "loss": 1.4373, + "step": 2988 + }, + { + "epoch": 0.22971727105101414, + "grad_norm": 3.879241943359375, + "learning_rate": 1.9081130915795944e-05, + "loss": 1.4802, + "step": 2990 + }, + { + "epoch": 0.22987092808850645, + "grad_norm": 4.114073276519775, + "learning_rate": 1.9080516287645977e-05, + "loss": 1.4496, + "step": 2992 + }, + { + "epoch": 0.23002458512599877, + "grad_norm": 3.420759677886963, + "learning_rate": 1.9079901659496007e-05, + "loss": 1.4345, + "step": 2994 + }, + { + "epoch": 0.2301782421634911, + "grad_norm": 3.7326130867004395, + "learning_rate": 1.9079287031346036e-05, + "loss": 1.4949, + "step": 2996 + }, + { + "epoch": 0.2303318992009834, + "grad_norm": 4.011677265167236, + "learning_rate": 1.907867240319607e-05, + "loss": 1.504, + "step": 2998 + }, + { + "epoch": 0.23048555623847572, + "grad_norm": 4.186577796936035, + "learning_rate": 1.90780577750461e-05, + "loss": 1.5432, + "step": 3000 + }, + { + "epoch": 0.23063921327596804, + "grad_norm": 3.768660306930542, + "learning_rate": 1.907744314689613e-05, + "loss": 1.403, + "step": 3002 + }, + { + "epoch": 0.23079287031346035, + "grad_norm": 3.8193624019622803, + "learning_rate": 1.907682851874616e-05, + "loss": 1.401, + "step": 3004 + }, + { + "epoch": 0.23094652735095267, + "grad_norm": 3.425020217895508, + "learning_rate": 1.907621389059619e-05, + "loss": 1.4576, + "step": 3006 + }, + { + "epoch": 0.23110018438844498, + "grad_norm": 3.75453782081604, + "learning_rate": 1.907559926244622e-05, + "loss": 1.5563, + "step": 3008 + }, + { + "epoch": 0.2312538414259373, + "grad_norm": 3.68115496635437, + "learning_rate": 1.907498463429625e-05, + "loss": 1.3805, + "step": 3010 + }, + { + "epoch": 0.23140749846342962, + "grad_norm": 3.3006389141082764, + "learning_rate": 1.9074370006146284e-05, + "loss": 1.5252, + "step": 3012 + }, + { + "epoch": 0.23156115550092193, + "grad_norm": 4.148028373718262, + "learning_rate": 1.9073755377996314e-05, + "loss": 1.5676, + "step": 3014 + }, + { + "epoch": 0.23171481253841425, + "grad_norm": 3.5741395950317383, + "learning_rate": 1.9073140749846343e-05, + "loss": 1.5295, + "step": 3016 + }, + { + "epoch": 0.23186846957590657, + "grad_norm": 3.9004082679748535, + "learning_rate": 1.9072526121696376e-05, + "loss": 1.4535, + "step": 3018 + }, + { + "epoch": 0.23202212661339888, + "grad_norm": 4.176894664764404, + "learning_rate": 1.9071911493546406e-05, + "loss": 1.3629, + "step": 3020 + }, + { + "epoch": 0.2321757836508912, + "grad_norm": 5.075558662414551, + "learning_rate": 1.9071296865396436e-05, + "loss": 1.5217, + "step": 3022 + }, + { + "epoch": 0.23232944068838352, + "grad_norm": 3.4459147453308105, + "learning_rate": 1.907068223724647e-05, + "loss": 1.4933, + "step": 3024 + }, + { + "epoch": 0.23248309772587583, + "grad_norm": 7.433279037475586, + "learning_rate": 1.90700676090965e-05, + "loss": 1.5533, + "step": 3026 + }, + { + "epoch": 0.23263675476336818, + "grad_norm": 3.6367766857147217, + "learning_rate": 1.9069452980946528e-05, + "loss": 1.5066, + "step": 3028 + }, + { + "epoch": 0.2327904118008605, + "grad_norm": 3.537909746170044, + "learning_rate": 1.9068838352796558e-05, + "loss": 1.337, + "step": 3030 + }, + { + "epoch": 0.2329440688383528, + "grad_norm": 4.002386093139648, + "learning_rate": 1.906822372464659e-05, + "loss": 1.6041, + "step": 3032 + }, + { + "epoch": 0.23309772587584512, + "grad_norm": 3.76108717918396, + "learning_rate": 1.906760909649662e-05, + "loss": 1.415, + "step": 3034 + }, + { + "epoch": 0.23325138291333744, + "grad_norm": 3.783128023147583, + "learning_rate": 1.906699446834665e-05, + "loss": 1.3389, + "step": 3036 + }, + { + "epoch": 0.23340503995082976, + "grad_norm": 3.628431558609009, + "learning_rate": 1.9066379840196683e-05, + "loss": 1.4391, + "step": 3038 + }, + { + "epoch": 0.23355869698832207, + "grad_norm": 3.474383592605591, + "learning_rate": 1.9065765212046713e-05, + "loss": 1.6119, + "step": 3040 + }, + { + "epoch": 0.2337123540258144, + "grad_norm": 3.4599947929382324, + "learning_rate": 1.9065150583896743e-05, + "loss": 1.5611, + "step": 3042 + }, + { + "epoch": 0.2338660110633067, + "grad_norm": 3.5529699325561523, + "learning_rate": 1.9064535955746776e-05, + "loss": 1.5067, + "step": 3044 + }, + { + "epoch": 0.23401966810079902, + "grad_norm": 3.7097628116607666, + "learning_rate": 1.9063921327596806e-05, + "loss": 1.5433, + "step": 3046 + }, + { + "epoch": 0.23417332513829134, + "grad_norm": 3.5367672443389893, + "learning_rate": 1.9063306699446835e-05, + "loss": 1.5336, + "step": 3048 + }, + { + "epoch": 0.23432698217578365, + "grad_norm": 3.9764819145202637, + "learning_rate": 1.9062692071296868e-05, + "loss": 1.4671, + "step": 3050 + }, + { + "epoch": 0.23448063921327597, + "grad_norm": 3.8383781909942627, + "learning_rate": 1.9062077443146898e-05, + "loss": 1.5263, + "step": 3052 + }, + { + "epoch": 0.2346342962507683, + "grad_norm": 3.899062395095825, + "learning_rate": 1.906146281499693e-05, + "loss": 1.4592, + "step": 3054 + }, + { + "epoch": 0.2347879532882606, + "grad_norm": 3.9547131061553955, + "learning_rate": 1.9060848186846957e-05, + "loss": 1.4835, + "step": 3056 + }, + { + "epoch": 0.23494161032575292, + "grad_norm": 4.053800582885742, + "learning_rate": 1.906023355869699e-05, + "loss": 1.3099, + "step": 3058 + }, + { + "epoch": 0.23509526736324524, + "grad_norm": 3.65938401222229, + "learning_rate": 1.905961893054702e-05, + "loss": 1.5249, + "step": 3060 + }, + { + "epoch": 0.23524892440073755, + "grad_norm": 3.514390707015991, + "learning_rate": 1.905900430239705e-05, + "loss": 1.4615, + "step": 3062 + }, + { + "epoch": 0.23540258143822987, + "grad_norm": 3.5489819049835205, + "learning_rate": 1.9058389674247083e-05, + "loss": 1.3366, + "step": 3064 + }, + { + "epoch": 0.23555623847572218, + "grad_norm": 3.88149356842041, + "learning_rate": 1.9057775046097113e-05, + "loss": 1.6467, + "step": 3066 + }, + { + "epoch": 0.2357098955132145, + "grad_norm": 3.912346601486206, + "learning_rate": 1.9057160417947142e-05, + "loss": 1.6762, + "step": 3068 + }, + { + "epoch": 0.23586355255070682, + "grad_norm": 3.9258928298950195, + "learning_rate": 1.9056545789797175e-05, + "loss": 1.6048, + "step": 3070 + }, + { + "epoch": 0.23601720958819913, + "grad_norm": 4.021396636962891, + "learning_rate": 1.9055931161647205e-05, + "loss": 1.6012, + "step": 3072 + }, + { + "epoch": 0.23617086662569145, + "grad_norm": 3.756319999694824, + "learning_rate": 1.9055316533497238e-05, + "loss": 1.505, + "step": 3074 + }, + { + "epoch": 0.23632452366318377, + "grad_norm": 3.8204259872436523, + "learning_rate": 1.9054701905347264e-05, + "loss": 1.4867, + "step": 3076 + }, + { + "epoch": 0.23647818070067608, + "grad_norm": 3.6957874298095703, + "learning_rate": 1.9054087277197297e-05, + "loss": 1.6277, + "step": 3078 + }, + { + "epoch": 0.2366318377381684, + "grad_norm": 3.272287607192993, + "learning_rate": 1.9053472649047327e-05, + "loss": 1.276, + "step": 3080 + }, + { + "epoch": 0.23678549477566072, + "grad_norm": 3.6933488845825195, + "learning_rate": 1.9052858020897357e-05, + "loss": 1.569, + "step": 3082 + }, + { + "epoch": 0.23693915181315303, + "grad_norm": 4.391809940338135, + "learning_rate": 1.905224339274739e-05, + "loss": 1.4694, + "step": 3084 + }, + { + "epoch": 0.23709280885064535, + "grad_norm": 3.9479763507843018, + "learning_rate": 1.905162876459742e-05, + "loss": 1.4931, + "step": 3086 + }, + { + "epoch": 0.23724646588813766, + "grad_norm": 3.6774582862854004, + "learning_rate": 1.905101413644745e-05, + "loss": 1.6148, + "step": 3088 + }, + { + "epoch": 0.23740012292562998, + "grad_norm": 3.2491109371185303, + "learning_rate": 1.9050399508297482e-05, + "loss": 1.629, + "step": 3090 + }, + { + "epoch": 0.23755377996312232, + "grad_norm": 3.8977487087249756, + "learning_rate": 1.9049784880147512e-05, + "loss": 1.4423, + "step": 3092 + }, + { + "epoch": 0.23770743700061464, + "grad_norm": 3.438864231109619, + "learning_rate": 1.9049170251997545e-05, + "loss": 1.3834, + "step": 3094 + }, + { + "epoch": 0.23786109403810696, + "grad_norm": 3.799286365509033, + "learning_rate": 1.9048555623847575e-05, + "loss": 1.6176, + "step": 3096 + }, + { + "epoch": 0.23801475107559927, + "grad_norm": 3.665806770324707, + "learning_rate": 1.9047940995697604e-05, + "loss": 1.6198, + "step": 3098 + }, + { + "epoch": 0.2381684081130916, + "grad_norm": 4.014431476593018, + "learning_rate": 1.9047326367547637e-05, + "loss": 1.4675, + "step": 3100 + }, + { + "epoch": 0.2383220651505839, + "grad_norm": 3.293631076812744, + "learning_rate": 1.9046711739397664e-05, + "loss": 1.4766, + "step": 3102 + }, + { + "epoch": 0.23847572218807622, + "grad_norm": 3.3270959854125977, + "learning_rate": 1.9046097111247697e-05, + "loss": 1.3699, + "step": 3104 + }, + { + "epoch": 0.23862937922556854, + "grad_norm": 3.4034950733184814, + "learning_rate": 1.9045482483097727e-05, + "loss": 1.3973, + "step": 3106 + }, + { + "epoch": 0.23878303626306085, + "grad_norm": 4.333171367645264, + "learning_rate": 1.9044867854947756e-05, + "loss": 1.4756, + "step": 3108 + }, + { + "epoch": 0.23893669330055317, + "grad_norm": 3.8026325702667236, + "learning_rate": 1.904425322679779e-05, + "loss": 1.5391, + "step": 3110 + }, + { + "epoch": 0.2390903503380455, + "grad_norm": 4.236443996429443, + "learning_rate": 1.904363859864782e-05, + "loss": 1.5477, + "step": 3112 + }, + { + "epoch": 0.2392440073755378, + "grad_norm": 3.979402780532837, + "learning_rate": 1.9043023970497852e-05, + "loss": 1.5373, + "step": 3114 + }, + { + "epoch": 0.23939766441303012, + "grad_norm": 4.0372443199157715, + "learning_rate": 1.9042409342347882e-05, + "loss": 1.5543, + "step": 3116 + }, + { + "epoch": 0.23955132145052244, + "grad_norm": 3.884770393371582, + "learning_rate": 1.904179471419791e-05, + "loss": 1.4172, + "step": 3118 + }, + { + "epoch": 0.23970497848801475, + "grad_norm": 3.852644205093384, + "learning_rate": 1.9041180086047944e-05, + "loss": 1.5288, + "step": 3120 + }, + { + "epoch": 0.23985863552550707, + "grad_norm": 3.6635940074920654, + "learning_rate": 1.9040565457897974e-05, + "loss": 1.4201, + "step": 3122 + }, + { + "epoch": 0.24001229256299939, + "grad_norm": 3.9604082107543945, + "learning_rate": 1.9039950829748004e-05, + "loss": 1.5166, + "step": 3124 + }, + { + "epoch": 0.2401659496004917, + "grad_norm": 3.084805965423584, + "learning_rate": 1.9039336201598037e-05, + "loss": 1.472, + "step": 3126 + }, + { + "epoch": 0.24031960663798402, + "grad_norm": 3.741560697555542, + "learning_rate": 1.9038721573448063e-05, + "loss": 1.5564, + "step": 3128 + }, + { + "epoch": 0.24047326367547633, + "grad_norm": 3.3543457984924316, + "learning_rate": 1.9038106945298096e-05, + "loss": 1.5025, + "step": 3130 + }, + { + "epoch": 0.24062692071296865, + "grad_norm": 3.7456090450286865, + "learning_rate": 1.9037492317148126e-05, + "loss": 1.5189, + "step": 3132 + }, + { + "epoch": 0.24078057775046097, + "grad_norm": 3.4162986278533936, + "learning_rate": 1.9036877688998156e-05, + "loss": 1.622, + "step": 3134 + }, + { + "epoch": 0.24093423478795328, + "grad_norm": 4.069525241851807, + "learning_rate": 1.903626306084819e-05, + "loss": 1.6334, + "step": 3136 + }, + { + "epoch": 0.2410878918254456, + "grad_norm": 3.6952528953552246, + "learning_rate": 1.903564843269822e-05, + "loss": 1.4343, + "step": 3138 + }, + { + "epoch": 0.24124154886293792, + "grad_norm": 3.562882900238037, + "learning_rate": 1.903503380454825e-05, + "loss": 1.3561, + "step": 3140 + }, + { + "epoch": 0.24139520590043023, + "grad_norm": 3.8922019004821777, + "learning_rate": 1.903441917639828e-05, + "loss": 1.4585, + "step": 3142 + }, + { + "epoch": 0.24154886293792255, + "grad_norm": 3.665839910507202, + "learning_rate": 1.903380454824831e-05, + "loss": 1.7023, + "step": 3144 + }, + { + "epoch": 0.24170251997541486, + "grad_norm": 3.4272682666778564, + "learning_rate": 1.9033189920098344e-05, + "loss": 1.5242, + "step": 3146 + }, + { + "epoch": 0.24185617701290718, + "grad_norm": 3.8519303798675537, + "learning_rate": 1.9032575291948374e-05, + "loss": 1.4245, + "step": 3148 + }, + { + "epoch": 0.2420098340503995, + "grad_norm": 3.6139211654663086, + "learning_rate": 1.9031960663798403e-05, + "loss": 1.3585, + "step": 3150 + }, + { + "epoch": 0.2421634910878918, + "grad_norm": 4.0420966148376465, + "learning_rate": 1.9031346035648436e-05, + "loss": 1.577, + "step": 3152 + }, + { + "epoch": 0.24231714812538413, + "grad_norm": 3.215604305267334, + "learning_rate": 1.9030731407498463e-05, + "loss": 1.4625, + "step": 3154 + }, + { + "epoch": 0.24247080516287645, + "grad_norm": 4.21508264541626, + "learning_rate": 1.9030116779348496e-05, + "loss": 1.4881, + "step": 3156 + }, + { + "epoch": 0.2426244622003688, + "grad_norm": 3.351759910583496, + "learning_rate": 1.9029502151198525e-05, + "loss": 1.5047, + "step": 3158 + }, + { + "epoch": 0.2427781192378611, + "grad_norm": 3.681420087814331, + "learning_rate": 1.902888752304856e-05, + "loss": 1.3997, + "step": 3160 + }, + { + "epoch": 0.24293177627535342, + "grad_norm": 3.751452922821045, + "learning_rate": 1.9028272894898588e-05, + "loss": 1.5441, + "step": 3162 + }, + { + "epoch": 0.24308543331284574, + "grad_norm": 3.7483127117156982, + "learning_rate": 1.9027658266748618e-05, + "loss": 1.5719, + "step": 3164 + }, + { + "epoch": 0.24323909035033806, + "grad_norm": 4.023811340332031, + "learning_rate": 1.902704363859865e-05, + "loss": 1.5686, + "step": 3166 + }, + { + "epoch": 0.24339274738783037, + "grad_norm": 3.655235767364502, + "learning_rate": 1.902642901044868e-05, + "loss": 1.4781, + "step": 3168 + }, + { + "epoch": 0.2435464044253227, + "grad_norm": 3.8731863498687744, + "learning_rate": 1.902581438229871e-05, + "loss": 1.4679, + "step": 3170 + }, + { + "epoch": 0.243700061462815, + "grad_norm": 3.9271247386932373, + "learning_rate": 1.9025199754148743e-05, + "loss": 1.6756, + "step": 3172 + }, + { + "epoch": 0.24385371850030732, + "grad_norm": 3.7917888164520264, + "learning_rate": 1.902458512599877e-05, + "loss": 1.5004, + "step": 3174 + }, + { + "epoch": 0.24400737553779964, + "grad_norm": 3.341165781021118, + "learning_rate": 1.9023970497848803e-05, + "loss": 1.4226, + "step": 3176 + }, + { + "epoch": 0.24416103257529195, + "grad_norm": 3.6708719730377197, + "learning_rate": 1.9023355869698832e-05, + "loss": 1.4637, + "step": 3178 + }, + { + "epoch": 0.24431468961278427, + "grad_norm": 3.8046674728393555, + "learning_rate": 1.9022741241548865e-05, + "loss": 1.4156, + "step": 3180 + }, + { + "epoch": 0.24446834665027659, + "grad_norm": 3.7367637157440186, + "learning_rate": 1.9022126613398895e-05, + "loss": 1.5416, + "step": 3182 + }, + { + "epoch": 0.2446220036877689, + "grad_norm": 3.453035354614258, + "learning_rate": 1.9021511985248925e-05, + "loss": 1.459, + "step": 3184 + }, + { + "epoch": 0.24477566072526122, + "grad_norm": 3.8223092555999756, + "learning_rate": 1.9020897357098958e-05, + "loss": 1.5786, + "step": 3186 + }, + { + "epoch": 0.24492931776275353, + "grad_norm": 3.4199178218841553, + "learning_rate": 1.9020282728948988e-05, + "loss": 1.6042, + "step": 3188 + }, + { + "epoch": 0.24508297480024585, + "grad_norm": 3.828610420227051, + "learning_rate": 1.9019668100799017e-05, + "loss": 1.4459, + "step": 3190 + }, + { + "epoch": 0.24523663183773817, + "grad_norm": 3.5266432762145996, + "learning_rate": 1.901905347264905e-05, + "loss": 1.5675, + "step": 3192 + }, + { + "epoch": 0.24539028887523048, + "grad_norm": 3.738628387451172, + "learning_rate": 1.901843884449908e-05, + "loss": 1.4532, + "step": 3194 + }, + { + "epoch": 0.2455439459127228, + "grad_norm": 3.32661771774292, + "learning_rate": 1.901782421634911e-05, + "loss": 1.4821, + "step": 3196 + }, + { + "epoch": 0.24569760295021512, + "grad_norm": 3.8040380477905273, + "learning_rate": 1.9017209588199143e-05, + "loss": 1.5357, + "step": 3198 + }, + { + "epoch": 0.24585125998770743, + "grad_norm": 3.24855637550354, + "learning_rate": 1.9016594960049172e-05, + "loss": 1.36, + "step": 3200 + }, + { + "epoch": 0.24600491702519975, + "grad_norm": 3.494410514831543, + "learning_rate": 1.9015980331899202e-05, + "loss": 1.705, + "step": 3202 + }, + { + "epoch": 0.24615857406269206, + "grad_norm": 3.760875940322876, + "learning_rate": 1.9015365703749232e-05, + "loss": 1.4332, + "step": 3204 + }, + { + "epoch": 0.24631223110018438, + "grad_norm": 3.2367358207702637, + "learning_rate": 1.9014751075599265e-05, + "loss": 1.429, + "step": 3206 + }, + { + "epoch": 0.2464658881376767, + "grad_norm": 3.4821407794952393, + "learning_rate": 1.9014136447449295e-05, + "loss": 1.5069, + "step": 3208 + }, + { + "epoch": 0.246619545175169, + "grad_norm": 3.640047073364258, + "learning_rate": 1.9013521819299324e-05, + "loss": 1.4913, + "step": 3210 + }, + { + "epoch": 0.24677320221266133, + "grad_norm": 3.7262701988220215, + "learning_rate": 1.9012907191149357e-05, + "loss": 1.4896, + "step": 3212 + }, + { + "epoch": 0.24692685925015365, + "grad_norm": 4.199042797088623, + "learning_rate": 1.9012292562999387e-05, + "loss": 1.5823, + "step": 3214 + }, + { + "epoch": 0.24708051628764596, + "grad_norm": 3.3966314792633057, + "learning_rate": 1.9011677934849417e-05, + "loss": 1.4982, + "step": 3216 + }, + { + "epoch": 0.24723417332513828, + "grad_norm": 3.9125616550445557, + "learning_rate": 1.901106330669945e-05, + "loss": 1.4921, + "step": 3218 + }, + { + "epoch": 0.2473878303626306, + "grad_norm": 3.583458423614502, + "learning_rate": 1.901044867854948e-05, + "loss": 1.4831, + "step": 3220 + }, + { + "epoch": 0.24754148740012294, + "grad_norm": 3.9417824745178223, + "learning_rate": 1.900983405039951e-05, + "loss": 1.533, + "step": 3222 + }, + { + "epoch": 0.24769514443761526, + "grad_norm": 3.6411452293395996, + "learning_rate": 1.9009219422249542e-05, + "loss": 1.4955, + "step": 3224 + }, + { + "epoch": 0.24784880147510757, + "grad_norm": 3.75683856010437, + "learning_rate": 1.9008604794099572e-05, + "loss": 1.5469, + "step": 3226 + }, + { + "epoch": 0.2480024585125999, + "grad_norm": 4.777811050415039, + "learning_rate": 1.90079901659496e-05, + "loss": 1.4417, + "step": 3228 + }, + { + "epoch": 0.2481561155500922, + "grad_norm": 3.5624022483825684, + "learning_rate": 1.900737553779963e-05, + "loss": 1.4786, + "step": 3230 + }, + { + "epoch": 0.24830977258758452, + "grad_norm": 3.511544704437256, + "learning_rate": 1.9006760909649664e-05, + "loss": 1.4188, + "step": 3232 + }, + { + "epoch": 0.24846342962507684, + "grad_norm": 3.966139078140259, + "learning_rate": 1.9006146281499694e-05, + "loss": 1.5502, + "step": 3234 + }, + { + "epoch": 0.24861708666256915, + "grad_norm": 3.792802095413208, + "learning_rate": 1.9005531653349724e-05, + "loss": 1.4778, + "step": 3236 + }, + { + "epoch": 0.24877074370006147, + "grad_norm": 4.079867839813232, + "learning_rate": 1.9004917025199757e-05, + "loss": 1.5482, + "step": 3238 + }, + { + "epoch": 0.24892440073755379, + "grad_norm": 3.424563407897949, + "learning_rate": 1.9004302397049786e-05, + "loss": 1.5189, + "step": 3240 + }, + { + "epoch": 0.2490780577750461, + "grad_norm": 3.5327744483947754, + "learning_rate": 1.9003687768899816e-05, + "loss": 1.4631, + "step": 3242 + }, + { + "epoch": 0.24923171481253842, + "grad_norm": 3.8727915287017822, + "learning_rate": 1.900307314074985e-05, + "loss": 1.5833, + "step": 3244 + }, + { + "epoch": 0.24938537185003073, + "grad_norm": 4.0103302001953125, + "learning_rate": 1.900245851259988e-05, + "loss": 1.4591, + "step": 3246 + }, + { + "epoch": 0.24953902888752305, + "grad_norm": 4.012939453125, + "learning_rate": 1.900184388444991e-05, + "loss": 1.5609, + "step": 3248 + }, + { + "epoch": 0.24969268592501537, + "grad_norm": 3.425726890563965, + "learning_rate": 1.900122925629994e-05, + "loss": 1.3594, + "step": 3250 + }, + { + "epoch": 0.24984634296250768, + "grad_norm": 3.660951614379883, + "learning_rate": 1.900061462814997e-05, + "loss": 1.3977, + "step": 3252 + }, + { + "epoch": 0.25, + "grad_norm": 3.4327516555786133, + "learning_rate": 1.9e-05, + "loss": 1.5269, + "step": 3254 + }, + { + "epoch": 0.2501536570374923, + "grad_norm": 3.792966365814209, + "learning_rate": 1.899938537185003e-05, + "loss": 1.4157, + "step": 3256 + }, + { + "epoch": 0.25030731407498463, + "grad_norm": 3.777806282043457, + "learning_rate": 1.8998770743700064e-05, + "loss": 1.61, + "step": 3258 + }, + { + "epoch": 0.25046097111247695, + "grad_norm": 3.6547727584838867, + "learning_rate": 1.8998156115550093e-05, + "loss": 1.578, + "step": 3260 + }, + { + "epoch": 0.25061462814996927, + "grad_norm": 3.7142393589019775, + "learning_rate": 1.8997541487400123e-05, + "loss": 1.5387, + "step": 3262 + }, + { + "epoch": 0.2507682851874616, + "grad_norm": 3.795875310897827, + "learning_rate": 1.8996926859250156e-05, + "loss": 1.4108, + "step": 3264 + }, + { + "epoch": 0.2509219422249539, + "grad_norm": 4.018312931060791, + "learning_rate": 1.8996312231100186e-05, + "loss": 1.609, + "step": 3266 + }, + { + "epoch": 0.2510755992624462, + "grad_norm": 3.576310157775879, + "learning_rate": 1.8995697602950216e-05, + "loss": 1.5313, + "step": 3268 + }, + { + "epoch": 0.25122925629993853, + "grad_norm": 3.7783751487731934, + "learning_rate": 1.899508297480025e-05, + "loss": 1.5393, + "step": 3270 + }, + { + "epoch": 0.25138291333743085, + "grad_norm": 3.8343939781188965, + "learning_rate": 1.899446834665028e-05, + "loss": 1.4593, + "step": 3272 + }, + { + "epoch": 0.25153657037492316, + "grad_norm": 3.8293821811676025, + "learning_rate": 1.8993853718500308e-05, + "loss": 1.4991, + "step": 3274 + }, + { + "epoch": 0.2516902274124155, + "grad_norm": 3.804919958114624, + "learning_rate": 1.8993239090350338e-05, + "loss": 1.5475, + "step": 3276 + }, + { + "epoch": 0.2518438844499078, + "grad_norm": 3.3940484523773193, + "learning_rate": 1.899262446220037e-05, + "loss": 1.329, + "step": 3278 + }, + { + "epoch": 0.2519975414874001, + "grad_norm": 4.106419086456299, + "learning_rate": 1.89920098340504e-05, + "loss": 1.6195, + "step": 3280 + }, + { + "epoch": 0.25215119852489243, + "grad_norm": 3.56506609916687, + "learning_rate": 1.899139520590043e-05, + "loss": 1.6574, + "step": 3282 + }, + { + "epoch": 0.25230485556238474, + "grad_norm": 3.6036717891693115, + "learning_rate": 1.8990780577750463e-05, + "loss": 1.4525, + "step": 3284 + }, + { + "epoch": 0.25245851259987706, + "grad_norm": 4.163785934448242, + "learning_rate": 1.8990165949600493e-05, + "loss": 1.4881, + "step": 3286 + }, + { + "epoch": 0.2526121696373694, + "grad_norm": 4.115818023681641, + "learning_rate": 1.8989551321450523e-05, + "loss": 1.5126, + "step": 3288 + }, + { + "epoch": 0.2527658266748617, + "grad_norm": 3.6225428581237793, + "learning_rate": 1.8988936693300556e-05, + "loss": 1.4663, + "step": 3290 + }, + { + "epoch": 0.252919483712354, + "grad_norm": 3.335973024368286, + "learning_rate": 1.8988322065150585e-05, + "loss": 1.3223, + "step": 3292 + }, + { + "epoch": 0.2530731407498463, + "grad_norm": 3.5263774394989014, + "learning_rate": 1.8987707437000615e-05, + "loss": 1.4948, + "step": 3294 + }, + { + "epoch": 0.25322679778733864, + "grad_norm": 3.5017597675323486, + "learning_rate": 1.8987092808850648e-05, + "loss": 1.2772, + "step": 3296 + }, + { + "epoch": 0.25338045482483096, + "grad_norm": 3.744292736053467, + "learning_rate": 1.8986478180700678e-05, + "loss": 1.3346, + "step": 3298 + }, + { + "epoch": 0.2535341118623233, + "grad_norm": 4.142531871795654, + "learning_rate": 1.8985863552550707e-05, + "loss": 1.4425, + "step": 3300 + }, + { + "epoch": 0.2536877688998156, + "grad_norm": 3.734403371810913, + "learning_rate": 1.8985248924400737e-05, + "loss": 1.5465, + "step": 3302 + }, + { + "epoch": 0.2538414259373079, + "grad_norm": 3.28890061378479, + "learning_rate": 1.898463429625077e-05, + "loss": 1.4063, + "step": 3304 + }, + { + "epoch": 0.2539950829748002, + "grad_norm": 3.3916125297546387, + "learning_rate": 1.89840196681008e-05, + "loss": 1.5272, + "step": 3306 + }, + { + "epoch": 0.25414874001229254, + "grad_norm": 3.462782382965088, + "learning_rate": 1.898340503995083e-05, + "loss": 1.5789, + "step": 3308 + }, + { + "epoch": 0.25430239704978486, + "grad_norm": 3.189033031463623, + "learning_rate": 1.8982790411800863e-05, + "loss": 1.4295, + "step": 3310 + }, + { + "epoch": 0.2544560540872772, + "grad_norm": 3.8161280155181885, + "learning_rate": 1.8982175783650892e-05, + "loss": 1.4154, + "step": 3312 + }, + { + "epoch": 0.2546097111247695, + "grad_norm": 3.279613494873047, + "learning_rate": 1.8981561155500922e-05, + "loss": 1.3957, + "step": 3314 + }, + { + "epoch": 0.2547633681622618, + "grad_norm": 3.655524492263794, + "learning_rate": 1.8980946527350955e-05, + "loss": 1.5173, + "step": 3316 + }, + { + "epoch": 0.2549170251997541, + "grad_norm": 3.407174587249756, + "learning_rate": 1.8980331899200985e-05, + "loss": 1.4034, + "step": 3318 + }, + { + "epoch": 0.2550706822372465, + "grad_norm": 3.6055874824523926, + "learning_rate": 1.8979717271051014e-05, + "loss": 1.4338, + "step": 3320 + }, + { + "epoch": 0.2552243392747388, + "grad_norm": 3.903757333755493, + "learning_rate": 1.8979102642901048e-05, + "loss": 1.4843, + "step": 3322 + }, + { + "epoch": 0.2553779963122311, + "grad_norm": 3.445702314376831, + "learning_rate": 1.8978488014751077e-05, + "loss": 1.5697, + "step": 3324 + }, + { + "epoch": 0.25553165334972344, + "grad_norm": 3.888247013092041, + "learning_rate": 1.897787338660111e-05, + "loss": 1.3772, + "step": 3326 + }, + { + "epoch": 0.25568531038721576, + "grad_norm": 3.8378067016601562, + "learning_rate": 1.8977258758451137e-05, + "loss": 1.461, + "step": 3328 + }, + { + "epoch": 0.2558389674247081, + "grad_norm": 4.017933368682861, + "learning_rate": 1.897664413030117e-05, + "loss": 1.4029, + "step": 3330 + }, + { + "epoch": 0.2559926244622004, + "grad_norm": 3.43816876411438, + "learning_rate": 1.89760295021512e-05, + "loss": 1.4058, + "step": 3332 + }, + { + "epoch": 0.2561462814996927, + "grad_norm": 3.6177170276641846, + "learning_rate": 1.897541487400123e-05, + "loss": 1.5816, + "step": 3334 + }, + { + "epoch": 0.256299938537185, + "grad_norm": 3.708869218826294, + "learning_rate": 1.8974800245851262e-05, + "loss": 1.5383, + "step": 3336 + }, + { + "epoch": 0.25645359557467734, + "grad_norm": 3.4780690670013428, + "learning_rate": 1.8974185617701292e-05, + "loss": 1.4352, + "step": 3338 + }, + { + "epoch": 0.25660725261216966, + "grad_norm": 3.964945077896118, + "learning_rate": 1.897357098955132e-05, + "loss": 1.4872, + "step": 3340 + }, + { + "epoch": 0.256760909649662, + "grad_norm": 3.3842320442199707, + "learning_rate": 1.8972956361401355e-05, + "loss": 1.4933, + "step": 3342 + }, + { + "epoch": 0.2569145666871543, + "grad_norm": 3.7447729110717773, + "learning_rate": 1.8972341733251384e-05, + "loss": 1.6113, + "step": 3344 + }, + { + "epoch": 0.2570682237246466, + "grad_norm": 4.0971174240112305, + "learning_rate": 1.8971727105101417e-05, + "loss": 1.4749, + "step": 3346 + }, + { + "epoch": 0.2572218807621389, + "grad_norm": 3.2165610790252686, + "learning_rate": 1.8971112476951447e-05, + "loss": 1.4361, + "step": 3348 + }, + { + "epoch": 0.25737553779963124, + "grad_norm": 3.8490447998046875, + "learning_rate": 1.8970497848801477e-05, + "loss": 1.5455, + "step": 3350 + }, + { + "epoch": 0.25752919483712355, + "grad_norm": 4.487706661224365, + "learning_rate": 1.896988322065151e-05, + "loss": 1.5265, + "step": 3352 + }, + { + "epoch": 0.25768285187461587, + "grad_norm": 3.487905502319336, + "learning_rate": 1.8969268592501536e-05, + "loss": 1.4864, + "step": 3354 + }, + { + "epoch": 0.2578365089121082, + "grad_norm": 4.049886703491211, + "learning_rate": 1.896865396435157e-05, + "loss": 1.3788, + "step": 3356 + }, + { + "epoch": 0.2579901659496005, + "grad_norm": 3.7233083248138428, + "learning_rate": 1.89680393362016e-05, + "loss": 1.4227, + "step": 3358 + }, + { + "epoch": 0.2581438229870928, + "grad_norm": 3.6753907203674316, + "learning_rate": 1.896742470805163e-05, + "loss": 1.5348, + "step": 3360 + }, + { + "epoch": 0.25829748002458514, + "grad_norm": 3.5778372287750244, + "learning_rate": 1.896681007990166e-05, + "loss": 1.4769, + "step": 3362 + }, + { + "epoch": 0.25845113706207745, + "grad_norm": 3.8472750186920166, + "learning_rate": 1.896619545175169e-05, + "loss": 1.4646, + "step": 3364 + }, + { + "epoch": 0.25860479409956977, + "grad_norm": 3.4961729049682617, + "learning_rate": 1.8965580823601724e-05, + "loss": 1.4237, + "step": 3366 + }, + { + "epoch": 0.2587584511370621, + "grad_norm": 3.384688377380371, + "learning_rate": 1.8964966195451754e-05, + "loss": 1.3757, + "step": 3368 + }, + { + "epoch": 0.2589121081745544, + "grad_norm": 3.461256265640259, + "learning_rate": 1.8964351567301784e-05, + "loss": 1.6313, + "step": 3370 + }, + { + "epoch": 0.2590657652120467, + "grad_norm": 3.5917470455169678, + "learning_rate": 1.8963736939151817e-05, + "loss": 1.6363, + "step": 3372 + }, + { + "epoch": 0.25921942224953903, + "grad_norm": 4.257856845855713, + "learning_rate": 1.8963122311001846e-05, + "loss": 1.5657, + "step": 3374 + }, + { + "epoch": 0.25937307928703135, + "grad_norm": 3.825030565261841, + "learning_rate": 1.8962507682851876e-05, + "loss": 1.4029, + "step": 3376 + }, + { + "epoch": 0.25952673632452367, + "grad_norm": 3.698003053665161, + "learning_rate": 1.896189305470191e-05, + "loss": 1.4794, + "step": 3378 + }, + { + "epoch": 0.259680393362016, + "grad_norm": 3.656878709793091, + "learning_rate": 1.8961278426551935e-05, + "loss": 1.4764, + "step": 3380 + }, + { + "epoch": 0.2598340503995083, + "grad_norm": 3.9345102310180664, + "learning_rate": 1.896066379840197e-05, + "loss": 1.4978, + "step": 3382 + }, + { + "epoch": 0.2599877074370006, + "grad_norm": 3.2846667766571045, + "learning_rate": 1.8960049170251998e-05, + "loss": 1.4882, + "step": 3384 + }, + { + "epoch": 0.26014136447449293, + "grad_norm": 3.6656949520111084, + "learning_rate": 1.8959434542102028e-05, + "loss": 1.5028, + "step": 3386 + }, + { + "epoch": 0.26029502151198525, + "grad_norm": 3.3560938835144043, + "learning_rate": 1.895881991395206e-05, + "loss": 1.462, + "step": 3388 + }, + { + "epoch": 0.26044867854947756, + "grad_norm": 3.903484582901001, + "learning_rate": 1.895820528580209e-05, + "loss": 1.5055, + "step": 3390 + }, + { + "epoch": 0.2606023355869699, + "grad_norm": 3.4434473514556885, + "learning_rate": 1.8957590657652124e-05, + "loss": 1.4052, + "step": 3392 + }, + { + "epoch": 0.2607559926244622, + "grad_norm": 3.8662304878234863, + "learning_rate": 1.8956976029502153e-05, + "loss": 1.4433, + "step": 3394 + }, + { + "epoch": 0.2609096496619545, + "grad_norm": 3.921478509902954, + "learning_rate": 1.8956361401352183e-05, + "loss": 1.4143, + "step": 3396 + }, + { + "epoch": 0.26106330669944683, + "grad_norm": 4.5299601554870605, + "learning_rate": 1.8955746773202216e-05, + "loss": 1.5001, + "step": 3398 + }, + { + "epoch": 0.26121696373693915, + "grad_norm": 4.293244361877441, + "learning_rate": 1.8955132145052242e-05, + "loss": 1.4988, + "step": 3400 + }, + { + "epoch": 0.26137062077443146, + "grad_norm": 3.8071391582489014, + "learning_rate": 1.8954517516902276e-05, + "loss": 1.5376, + "step": 3402 + }, + { + "epoch": 0.2615242778119238, + "grad_norm": 3.9262189865112305, + "learning_rate": 1.8953902888752305e-05, + "loss": 1.6795, + "step": 3404 + }, + { + "epoch": 0.2616779348494161, + "grad_norm": 3.64919376373291, + "learning_rate": 1.8953288260602335e-05, + "loss": 1.4907, + "step": 3406 + }, + { + "epoch": 0.2618315918869084, + "grad_norm": 3.7635629177093506, + "learning_rate": 1.8952673632452368e-05, + "loss": 1.6125, + "step": 3408 + }, + { + "epoch": 0.2619852489244007, + "grad_norm": 3.243130683898926, + "learning_rate": 1.8952059004302398e-05, + "loss": 1.4673, + "step": 3410 + }, + { + "epoch": 0.26213890596189304, + "grad_norm": 4.027158737182617, + "learning_rate": 1.895144437615243e-05, + "loss": 1.572, + "step": 3412 + }, + { + "epoch": 0.26229256299938536, + "grad_norm": 3.7150518894195557, + "learning_rate": 1.895082974800246e-05, + "loss": 1.5544, + "step": 3414 + }, + { + "epoch": 0.2624462200368777, + "grad_norm": 3.56063175201416, + "learning_rate": 1.895021511985249e-05, + "loss": 1.4073, + "step": 3416 + }, + { + "epoch": 0.26259987707437, + "grad_norm": 3.2205371856689453, + "learning_rate": 1.8949600491702523e-05, + "loss": 1.3674, + "step": 3418 + }, + { + "epoch": 0.2627535341118623, + "grad_norm": 3.5181539058685303, + "learning_rate": 1.8948985863552553e-05, + "loss": 1.5704, + "step": 3420 + }, + { + "epoch": 0.2629071911493546, + "grad_norm": 3.9559669494628906, + "learning_rate": 1.8948371235402583e-05, + "loss": 1.4059, + "step": 3422 + }, + { + "epoch": 0.26306084818684694, + "grad_norm": 3.9452176094055176, + "learning_rate": 1.8947756607252616e-05, + "loss": 1.5914, + "step": 3424 + }, + { + "epoch": 0.26321450522433926, + "grad_norm": 3.5629098415374756, + "learning_rate": 1.8947141979102642e-05, + "loss": 1.4329, + "step": 3426 + }, + { + "epoch": 0.2633681622618316, + "grad_norm": 3.84218430519104, + "learning_rate": 1.8946527350952675e-05, + "loss": 1.4207, + "step": 3428 + }, + { + "epoch": 0.2635218192993239, + "grad_norm": 3.7253074645996094, + "learning_rate": 1.8945912722802705e-05, + "loss": 1.623, + "step": 3430 + }, + { + "epoch": 0.2636754763368162, + "grad_norm": 3.367321729660034, + "learning_rate": 1.8945298094652738e-05, + "loss": 1.3962, + "step": 3432 + }, + { + "epoch": 0.2638291333743085, + "grad_norm": 3.7452688217163086, + "learning_rate": 1.8944683466502767e-05, + "loss": 1.5708, + "step": 3434 + }, + { + "epoch": 0.26398279041180084, + "grad_norm": 4.207433700561523, + "learning_rate": 1.8944068838352797e-05, + "loss": 1.5347, + "step": 3436 + }, + { + "epoch": 0.26413644744929315, + "grad_norm": 3.2880661487579346, + "learning_rate": 1.894345421020283e-05, + "loss": 1.4852, + "step": 3438 + }, + { + "epoch": 0.26429010448678547, + "grad_norm": 3.8437421321868896, + "learning_rate": 1.894283958205286e-05, + "loss": 1.4737, + "step": 3440 + }, + { + "epoch": 0.2644437615242778, + "grad_norm": 4.380143642425537, + "learning_rate": 1.894222495390289e-05, + "loss": 1.4646, + "step": 3442 + }, + { + "epoch": 0.2645974185617701, + "grad_norm": 3.446812391281128, + "learning_rate": 1.8941610325752923e-05, + "loss": 1.5911, + "step": 3444 + }, + { + "epoch": 0.2647510755992624, + "grad_norm": 3.8308660984039307, + "learning_rate": 1.8940995697602952e-05, + "loss": 1.4695, + "step": 3446 + }, + { + "epoch": 0.26490473263675474, + "grad_norm": 3.475552558898926, + "learning_rate": 1.8940381069452982e-05, + "loss": 1.6034, + "step": 3448 + }, + { + "epoch": 0.2650583896742471, + "grad_norm": 3.3068044185638428, + "learning_rate": 1.8939766441303015e-05, + "loss": 1.4709, + "step": 3450 + }, + { + "epoch": 0.2652120467117394, + "grad_norm": 3.57163143157959, + "learning_rate": 1.8939151813153045e-05, + "loss": 1.4525, + "step": 3452 + }, + { + "epoch": 0.26536570374923174, + "grad_norm": 3.2639644145965576, + "learning_rate": 1.8938537185003074e-05, + "loss": 1.5822, + "step": 3454 + }, + { + "epoch": 0.26551936078672406, + "grad_norm": 3.7956414222717285, + "learning_rate": 1.8937922556853104e-05, + "loss": 1.4841, + "step": 3456 + }, + { + "epoch": 0.2656730178242164, + "grad_norm": 3.958355665206909, + "learning_rate": 1.8937307928703137e-05, + "loss": 1.6366, + "step": 3458 + }, + { + "epoch": 0.2658266748617087, + "grad_norm": 4.118971347808838, + "learning_rate": 1.8936693300553167e-05, + "loss": 1.6656, + "step": 3460 + }, + { + "epoch": 0.265980331899201, + "grad_norm": 3.3143534660339355, + "learning_rate": 1.8936078672403197e-05, + "loss": 1.5126, + "step": 3462 + }, + { + "epoch": 0.2661339889366933, + "grad_norm": 3.237959384918213, + "learning_rate": 1.893546404425323e-05, + "loss": 1.4093, + "step": 3464 + }, + { + "epoch": 0.26628764597418564, + "grad_norm": 3.183522939682007, + "learning_rate": 1.893484941610326e-05, + "loss": 1.3261, + "step": 3466 + }, + { + "epoch": 0.26644130301167795, + "grad_norm": 3.7242038249969482, + "learning_rate": 1.893423478795329e-05, + "loss": 1.5207, + "step": 3468 + }, + { + "epoch": 0.26659496004917027, + "grad_norm": 3.6746206283569336, + "learning_rate": 1.8933620159803322e-05, + "loss": 1.5599, + "step": 3470 + }, + { + "epoch": 0.2667486170866626, + "grad_norm": 3.866694211959839, + "learning_rate": 1.8933005531653352e-05, + "loss": 1.4262, + "step": 3472 + }, + { + "epoch": 0.2669022741241549, + "grad_norm": 3.508993148803711, + "learning_rate": 1.893239090350338e-05, + "loss": 1.4758, + "step": 3474 + }, + { + "epoch": 0.2670559311616472, + "grad_norm": 3.6842212677001953, + "learning_rate": 1.8931776275353415e-05, + "loss": 1.5336, + "step": 3476 + }, + { + "epoch": 0.26720958819913954, + "grad_norm": 3.6138060092926025, + "learning_rate": 1.8931161647203444e-05, + "loss": 1.4907, + "step": 3478 + }, + { + "epoch": 0.26736324523663185, + "grad_norm": 3.9442241191864014, + "learning_rate": 1.8930547019053474e-05, + "loss": 1.4674, + "step": 3480 + }, + { + "epoch": 0.26751690227412417, + "grad_norm": 3.853156089782715, + "learning_rate": 1.8929932390903504e-05, + "loss": 1.5482, + "step": 3482 + }, + { + "epoch": 0.2676705593116165, + "grad_norm": 3.863734245300293, + "learning_rate": 1.8929317762753537e-05, + "loss": 1.3643, + "step": 3484 + }, + { + "epoch": 0.2678242163491088, + "grad_norm": 3.796949625015259, + "learning_rate": 1.8928703134603566e-05, + "loss": 1.5439, + "step": 3486 + }, + { + "epoch": 0.2679778733866011, + "grad_norm": 3.865708827972412, + "learning_rate": 1.8928088506453596e-05, + "loss": 1.3416, + "step": 3488 + }, + { + "epoch": 0.26813153042409343, + "grad_norm": 3.347963571548462, + "learning_rate": 1.892747387830363e-05, + "loss": 1.495, + "step": 3490 + }, + { + "epoch": 0.26828518746158575, + "grad_norm": 3.663733959197998, + "learning_rate": 1.892685925015366e-05, + "loss": 1.5101, + "step": 3492 + }, + { + "epoch": 0.26843884449907807, + "grad_norm": 3.4801948070526123, + "learning_rate": 1.892624462200369e-05, + "loss": 1.5782, + "step": 3494 + }, + { + "epoch": 0.2685925015365704, + "grad_norm": 3.6601758003234863, + "learning_rate": 1.892562999385372e-05, + "loss": 1.6072, + "step": 3496 + }, + { + "epoch": 0.2687461585740627, + "grad_norm": 3.8066141605377197, + "learning_rate": 1.892501536570375e-05, + "loss": 1.4949, + "step": 3498 + }, + { + "epoch": 0.268899815611555, + "grad_norm": 4.257297515869141, + "learning_rate": 1.892440073755378e-05, + "loss": 1.5364, + "step": 3500 } ], "logging_steps": 2, @@ -8426,7 +12276,7 @@ "attributes": {} } }, - "total_flos": 1.5320091651263693e+19, + "total_flos": 2.234180032475955e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null