2016_audio / trainer_state.json
gdfwj's picture
Upload 11 files
1f556a6 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.69545389767707,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005700441784238278,
"grad_norm": 27.39983367919922,
"learning_rate": 4.9999995544380894e-05,
"loss": 16.8508,
"num_input_tokens_seen": 97712,
"step": 5
},
{
"epoch": 0.011400883568476556,
"grad_norm": 35.96725082397461,
"learning_rate": 4.999998217752515e-05,
"loss": 11.7177,
"num_input_tokens_seen": 195504,
"step": 10
},
{
"epoch": 0.017101325352714837,
"grad_norm": 13.93742847442627,
"learning_rate": 4.999995989943754e-05,
"loss": 6.3848,
"num_input_tokens_seen": 293200,
"step": 15
},
{
"epoch": 0.022801767136953113,
"grad_norm": 9.999053955078125,
"learning_rate": 4.9999928710126e-05,
"loss": 4.4249,
"num_input_tokens_seen": 390960,
"step": 20
},
{
"epoch": 0.02850220892119139,
"grad_norm": 11.072565078735352,
"learning_rate": 4.999988860960165e-05,
"loss": 3.7022,
"num_input_tokens_seen": 488752,
"step": 25
},
{
"epoch": 0.034202650705429674,
"grad_norm": 6.960489273071289,
"learning_rate": 4.9999839597878784e-05,
"loss": 2.5291,
"num_input_tokens_seen": 586496,
"step": 30
},
{
"epoch": 0.039903092489667946,
"grad_norm": 10.353778839111328,
"learning_rate": 4.999978167497488e-05,
"loss": 1.9498,
"num_input_tokens_seen": 684240,
"step": 35
},
{
"epoch": 0.045603534273906225,
"grad_norm": 12.493133544921875,
"learning_rate": 4.999971484091057e-05,
"loss": 1.551,
"num_input_tokens_seen": 781936,
"step": 40
},
{
"epoch": 0.051303976058144504,
"grad_norm": 16.28146743774414,
"learning_rate": 4.999963909570968e-05,
"loss": 1.4018,
"num_input_tokens_seen": 879680,
"step": 45
},
{
"epoch": 0.05700441784238278,
"grad_norm": 9.161210060119629,
"learning_rate": 4.999955443939922e-05,
"loss": 1.2093,
"num_input_tokens_seen": 977440,
"step": 50
},
{
"epoch": 0.06270485962662106,
"grad_norm": 13.297211647033691,
"learning_rate": 4.9999460872009366e-05,
"loss": 1.1716,
"num_input_tokens_seen": 1075200,
"step": 55
},
{
"epoch": 0.06840530141085935,
"grad_norm": 8.317058563232422,
"learning_rate": 4.9999358393573445e-05,
"loss": 1.1838,
"num_input_tokens_seen": 1172880,
"step": 60
},
{
"epoch": 0.07410574319509762,
"grad_norm": 7.10370397567749,
"learning_rate": 4.9999247004128014e-05,
"loss": 1.0844,
"num_input_tokens_seen": 1270608,
"step": 65
},
{
"epoch": 0.07980618497933589,
"grad_norm": 10.387308120727539,
"learning_rate": 4.9999126703712775e-05,
"loss": 1.0746,
"num_input_tokens_seen": 1368368,
"step": 70
},
{
"epoch": 0.08550662676357418,
"grad_norm": 10.262731552124023,
"learning_rate": 4.999899749237059e-05,
"loss": 1.0698,
"num_input_tokens_seen": 1466016,
"step": 75
},
{
"epoch": 0.09120706854781245,
"grad_norm": 9.300691604614258,
"learning_rate": 4.9998859370147524e-05,
"loss": 1.1167,
"num_input_tokens_seen": 1563648,
"step": 80
},
{
"epoch": 0.09690751033205074,
"grad_norm": 4.99461030960083,
"learning_rate": 4.999871233709282e-05,
"loss": 1.105,
"num_input_tokens_seen": 1661456,
"step": 85
},
{
"epoch": 0.10260795211628901,
"grad_norm": 8.910074234008789,
"learning_rate": 4.9998556393258884e-05,
"loss": 1.1096,
"num_input_tokens_seen": 1759184,
"step": 90
},
{
"epoch": 0.1083083939005273,
"grad_norm": 7.826836585998535,
"learning_rate": 4.9998391538701293e-05,
"loss": 1.1084,
"num_input_tokens_seen": 1856848,
"step": 95
},
{
"epoch": 0.11400883568476557,
"grad_norm": 8.221962928771973,
"learning_rate": 4.999821777347883e-05,
"loss": 1.1534,
"num_input_tokens_seen": 1954544,
"step": 100
},
{
"epoch": 0.11970927746900385,
"grad_norm": 8.834911346435547,
"learning_rate": 4.9998035097653406e-05,
"loss": 1.075,
"num_input_tokens_seen": 2052224,
"step": 105
},
{
"epoch": 0.12540971925324212,
"grad_norm": 7.220479488372803,
"learning_rate": 4.9997843511290156e-05,
"loss": 1.085,
"num_input_tokens_seen": 2150000,
"step": 110
},
{
"epoch": 0.1311101610374804,
"grad_norm": 7.887118816375732,
"learning_rate": 4.999764301445736e-05,
"loss": 1.0384,
"num_input_tokens_seen": 2247808,
"step": 115
},
{
"epoch": 0.1368106028217187,
"grad_norm": 9.087337493896484,
"learning_rate": 4.9997433607226495e-05,
"loss": 1.1907,
"num_input_tokens_seen": 2345584,
"step": 120
},
{
"epoch": 0.14251104460595695,
"grad_norm": 7.334683418273926,
"learning_rate": 4.9997215289672194e-05,
"loss": 1.0865,
"num_input_tokens_seen": 2443360,
"step": 125
},
{
"epoch": 0.14821148639019524,
"grad_norm": 6.93471097946167,
"learning_rate": 4.9996988061872284e-05,
"loss": 1.0469,
"num_input_tokens_seen": 2541120,
"step": 130
},
{
"epoch": 0.15391192817443353,
"grad_norm": 8.000816345214844,
"learning_rate": 4.999675192390776e-05,
"loss": 1.0966,
"num_input_tokens_seen": 2638912,
"step": 135
},
{
"epoch": 0.15961236995867178,
"grad_norm": 6.232958793640137,
"learning_rate": 4.999650687586278e-05,
"loss": 1.0418,
"num_input_tokens_seen": 2736624,
"step": 140
},
{
"epoch": 0.16531281174291007,
"grad_norm": 12.828268051147461,
"learning_rate": 4.999625291782471e-05,
"loss": 1.0684,
"num_input_tokens_seen": 2834384,
"step": 145
},
{
"epoch": 0.17101325352714836,
"grad_norm": 6.147856712341309,
"learning_rate": 4.999599004988406e-05,
"loss": 0.9802,
"num_input_tokens_seen": 2932160,
"step": 150
},
{
"epoch": 0.17671369531138664,
"grad_norm": 7.085546016693115,
"learning_rate": 4.999571827213454e-05,
"loss": 1.1506,
"num_input_tokens_seen": 3029904,
"step": 155
},
{
"epoch": 0.1824141370956249,
"grad_norm": 8.019725799560547,
"learning_rate": 4.999543758467301e-05,
"loss": 1.0185,
"num_input_tokens_seen": 3127648,
"step": 160
},
{
"epoch": 0.1881145788798632,
"grad_norm": 5.743609428405762,
"learning_rate": 4.9995147987599536e-05,
"loss": 1.0001,
"num_input_tokens_seen": 3225360,
"step": 165
},
{
"epoch": 0.19381502066410147,
"grad_norm": 6.749576568603516,
"learning_rate": 4.999484948101734e-05,
"loss": 1.0848,
"num_input_tokens_seen": 3323152,
"step": 170
},
{
"epoch": 0.19951546244833976,
"grad_norm": 7.745048522949219,
"learning_rate": 4.9994542065032823e-05,
"loss": 1.0074,
"num_input_tokens_seen": 3420912,
"step": 175
},
{
"epoch": 0.20521590423257802,
"grad_norm": 6.615988254547119,
"learning_rate": 4.9994225739755565e-05,
"loss": 1.0756,
"num_input_tokens_seen": 3518752,
"step": 180
},
{
"epoch": 0.2109163460168163,
"grad_norm": 6.993303298950195,
"learning_rate": 4.999390050529831e-05,
"loss": 1.0371,
"num_input_tokens_seen": 3616560,
"step": 185
},
{
"epoch": 0.2166167878010546,
"grad_norm": 6.100363731384277,
"learning_rate": 4.9993566361777e-05,
"loss": 0.9687,
"num_input_tokens_seen": 3714320,
"step": 190
},
{
"epoch": 0.22231722958529285,
"grad_norm": 5.574942111968994,
"learning_rate": 4.999322330931074e-05,
"loss": 1.0173,
"num_input_tokens_seen": 3812144,
"step": 195
},
{
"epoch": 0.22801767136953113,
"grad_norm": 7.2291975021362305,
"learning_rate": 4.9992871348021804e-05,
"loss": 1.0322,
"num_input_tokens_seen": 3909824,
"step": 200
},
{
"epoch": 0.23371811315376942,
"grad_norm": 6.874391078948975,
"learning_rate": 4.999251047803565e-05,
"loss": 1.0096,
"num_input_tokens_seen": 4007600,
"step": 205
},
{
"epoch": 0.2394185549380077,
"grad_norm": 9.4487886428833,
"learning_rate": 4.9992140699480914e-05,
"loss": 0.9313,
"num_input_tokens_seen": 4105360,
"step": 210
},
{
"epoch": 0.24511899672224596,
"grad_norm": 8.49326229095459,
"learning_rate": 4.99917620124894e-05,
"loss": 1.008,
"num_input_tokens_seen": 4203072,
"step": 215
},
{
"epoch": 0.25081943850648425,
"grad_norm": 8.347270965576172,
"learning_rate": 4.999137441719609e-05,
"loss": 0.9588,
"num_input_tokens_seen": 4300784,
"step": 220
},
{
"epoch": 0.25651988029072254,
"grad_norm": 7.798429489135742,
"learning_rate": 4.999097791373915e-05,
"loss": 1.0412,
"num_input_tokens_seen": 4398448,
"step": 225
},
{
"epoch": 0.2622203220749608,
"grad_norm": 7.584600448608398,
"learning_rate": 4.99905725022599e-05,
"loss": 0.9457,
"num_input_tokens_seen": 4496256,
"step": 230
},
{
"epoch": 0.2679207638591991,
"grad_norm": 5.460471153259277,
"learning_rate": 4.9990158182902866e-05,
"loss": 0.8931,
"num_input_tokens_seen": 4594032,
"step": 235
},
{
"epoch": 0.2736212056434374,
"grad_norm": 6.889909267425537,
"learning_rate": 4.9989734955815715e-05,
"loss": 0.846,
"num_input_tokens_seen": 4691824,
"step": 240
},
{
"epoch": 0.2793216474276756,
"grad_norm": 8.376734733581543,
"learning_rate": 4.998930282114932e-05,
"loss": 0.9712,
"num_input_tokens_seen": 4789568,
"step": 245
},
{
"epoch": 0.2850220892119139,
"grad_norm": 6.110357284545898,
"learning_rate": 4.99888617790577e-05,
"loss": 0.9427,
"num_input_tokens_seen": 4887296,
"step": 250
},
{
"epoch": 0.2907225309961522,
"grad_norm": 7.7024102210998535,
"learning_rate": 4.998841182969808e-05,
"loss": 0.8296,
"num_input_tokens_seen": 4984976,
"step": 255
},
{
"epoch": 0.2964229727803905,
"grad_norm": 6.920788288116455,
"learning_rate": 4.998795297323083e-05,
"loss": 1.0276,
"num_input_tokens_seen": 5082688,
"step": 260
},
{
"epoch": 0.30212341456462877,
"grad_norm": 7.553328514099121,
"learning_rate": 4.9987485209819515e-05,
"loss": 1.0488,
"num_input_tokens_seen": 5180400,
"step": 265
},
{
"epoch": 0.30782385634886705,
"grad_norm": 6.883415699005127,
"learning_rate": 4.998700853963088e-05,
"loss": 0.9426,
"num_input_tokens_seen": 5278208,
"step": 270
},
{
"epoch": 0.31352429813310534,
"grad_norm": 11.664554595947266,
"learning_rate": 4.998652296283481e-05,
"loss": 0.9294,
"num_input_tokens_seen": 5375968,
"step": 275
},
{
"epoch": 0.31922473991734357,
"grad_norm": 11.442777633666992,
"learning_rate": 4.9986028479604416e-05,
"loss": 1.0263,
"num_input_tokens_seen": 5473760,
"step": 280
},
{
"epoch": 0.32492518170158186,
"grad_norm": 7.503568649291992,
"learning_rate": 4.9985525090115936e-05,
"loss": 0.8616,
"num_input_tokens_seen": 5571472,
"step": 285
},
{
"epoch": 0.33062562348582014,
"grad_norm": 4.156918048858643,
"learning_rate": 4.998501279454881e-05,
"loss": 0.867,
"num_input_tokens_seen": 5669136,
"step": 290
},
{
"epoch": 0.3363260652700584,
"grad_norm": 8.972600936889648,
"learning_rate": 4.998449159308565e-05,
"loss": 0.9869,
"num_input_tokens_seen": 5766816,
"step": 295
},
{
"epoch": 0.3420265070542967,
"grad_norm": 6.943081855773926,
"learning_rate": 4.9983961485912235e-05,
"loss": 0.8677,
"num_input_tokens_seen": 5864576,
"step": 300
},
{
"epoch": 0.347726948838535,
"grad_norm": 5.271353244781494,
"learning_rate": 4.9983422473217514e-05,
"loss": 0.929,
"num_input_tokens_seen": 5962384,
"step": 305
},
{
"epoch": 0.3534273906227733,
"grad_norm": 6.874100685119629,
"learning_rate": 4.998287455519363e-05,
"loss": 0.8697,
"num_input_tokens_seen": 6060160,
"step": 310
},
{
"epoch": 0.3591278324070115,
"grad_norm": 6.316469192504883,
"learning_rate": 4.998231773203587e-05,
"loss": 0.8826,
"num_input_tokens_seen": 6157920,
"step": 315
},
{
"epoch": 0.3648282741912498,
"grad_norm": 6.3930816650390625,
"learning_rate": 4.9981752003942734e-05,
"loss": 0.9108,
"num_input_tokens_seen": 6255600,
"step": 320
},
{
"epoch": 0.3705287159754881,
"grad_norm": 7.396681785583496,
"learning_rate": 4.998117737111587e-05,
"loss": 0.9613,
"num_input_tokens_seen": 6353424,
"step": 325
},
{
"epoch": 0.3762291577597264,
"grad_norm": 9.793058395385742,
"learning_rate": 4.998059383376009e-05,
"loss": 0.8664,
"num_input_tokens_seen": 6451184,
"step": 330
},
{
"epoch": 0.38192959954396466,
"grad_norm": 4.0863423347473145,
"learning_rate": 4.998000139208342e-05,
"loss": 0.8693,
"num_input_tokens_seen": 6549040,
"step": 335
},
{
"epoch": 0.38763004132820295,
"grad_norm": 4.3018317222595215,
"learning_rate": 4.997940004629702e-05,
"loss": 0.9368,
"num_input_tokens_seen": 6646752,
"step": 340
},
{
"epoch": 0.39333048311244123,
"grad_norm": 16.874574661254883,
"learning_rate": 4.9978789796615235e-05,
"loss": 1.0444,
"num_input_tokens_seen": 6744544,
"step": 345
},
{
"epoch": 0.3990309248966795,
"grad_norm": 6.2149658203125,
"learning_rate": 4.9978170643255604e-05,
"loss": 0.9418,
"num_input_tokens_seen": 6842256,
"step": 350
},
{
"epoch": 0.40473136668091775,
"grad_norm": 6.908440113067627,
"learning_rate": 4.997754258643882e-05,
"loss": 0.8389,
"num_input_tokens_seen": 6939984,
"step": 355
},
{
"epoch": 0.41043180846515603,
"grad_norm": 9.332175254821777,
"learning_rate": 4.997690562638874e-05,
"loss": 0.9898,
"num_input_tokens_seen": 7037776,
"step": 360
},
{
"epoch": 0.4161322502493943,
"grad_norm": 7.081879138946533,
"learning_rate": 4.9976259763332423e-05,
"loss": 0.8761,
"num_input_tokens_seen": 7135552,
"step": 365
},
{
"epoch": 0.4218326920336326,
"grad_norm": 5.079131603240967,
"learning_rate": 4.9975604997500084e-05,
"loss": 0.8808,
"num_input_tokens_seen": 7233248,
"step": 370
},
{
"epoch": 0.4275331338178709,
"grad_norm": 7.381295680999756,
"learning_rate": 4.99749413291251e-05,
"loss": 0.9706,
"num_input_tokens_seen": 7330976,
"step": 375
},
{
"epoch": 0.4332335756021092,
"grad_norm": 4.044100284576416,
"learning_rate": 4.9974268758444054e-05,
"loss": 0.8972,
"num_input_tokens_seen": 7428704,
"step": 380
},
{
"epoch": 0.43893401738634746,
"grad_norm": 6.039126396179199,
"learning_rate": 4.9973587285696674e-05,
"loss": 0.7717,
"num_input_tokens_seen": 7526480,
"step": 385
},
{
"epoch": 0.4446344591705857,
"grad_norm": 5.874084949493408,
"learning_rate": 4.997289691112588e-05,
"loss": 0.9446,
"num_input_tokens_seen": 7624320,
"step": 390
},
{
"epoch": 0.450334900954824,
"grad_norm": 7.415895462036133,
"learning_rate": 4.997219763497774e-05,
"loss": 0.7123,
"num_input_tokens_seen": 7722064,
"step": 395
},
{
"epoch": 0.45603534273906227,
"grad_norm": 7.707664966583252,
"learning_rate": 4.997148945750153e-05,
"loss": 0.7859,
"num_input_tokens_seen": 7819808,
"step": 400
},
{
"epoch": 0.46173578452330055,
"grad_norm": 5.500309467315674,
"learning_rate": 4.9970772378949655e-05,
"loss": 0.826,
"num_input_tokens_seen": 7917488,
"step": 405
},
{
"epoch": 0.46743622630753884,
"grad_norm": 7.652528285980225,
"learning_rate": 4.9970046399577734e-05,
"loss": 0.8709,
"num_input_tokens_seen": 8015264,
"step": 410
},
{
"epoch": 0.4731366680917771,
"grad_norm": 6.417993545532227,
"learning_rate": 4.996931151964455e-05,
"loss": 0.9764,
"num_input_tokens_seen": 8113024,
"step": 415
},
{
"epoch": 0.4788371098760154,
"grad_norm": 5.648680210113525,
"learning_rate": 4.996856773941202e-05,
"loss": 0.8233,
"num_input_tokens_seen": 8210784,
"step": 420
},
{
"epoch": 0.4845375516602537,
"grad_norm": 8.321767807006836,
"learning_rate": 4.9967815059145296e-05,
"loss": 0.8556,
"num_input_tokens_seen": 8308512,
"step": 425
},
{
"epoch": 0.4902379934444919,
"grad_norm": 6.381886005401611,
"learning_rate": 4.9967053479112656e-05,
"loss": 0.7687,
"num_input_tokens_seen": 8406208,
"step": 430
},
{
"epoch": 0.4959384352287302,
"grad_norm": 7.855834007263184,
"learning_rate": 4.996628299958557e-05,
"loss": 0.7965,
"num_input_tokens_seen": 8503952,
"step": 435
},
{
"epoch": 0.5016388770129685,
"grad_norm": 8.358772277832031,
"learning_rate": 4.996550362083866e-05,
"loss": 0.7877,
"num_input_tokens_seen": 8601616,
"step": 440
},
{
"epoch": 0.5073393187972067,
"grad_norm": 8.553559303283691,
"learning_rate": 4.996471534314976e-05,
"loss": 0.76,
"num_input_tokens_seen": 8699424,
"step": 445
},
{
"epoch": 0.5130397605814451,
"grad_norm": 8.631624221801758,
"learning_rate": 4.9963918166799836e-05,
"loss": 0.8425,
"num_input_tokens_seen": 8797088,
"step": 450
},
{
"epoch": 0.5187402023656833,
"grad_norm": 11.236102104187012,
"learning_rate": 4.9963112092073046e-05,
"loss": 0.8332,
"num_input_tokens_seen": 8894848,
"step": 455
},
{
"epoch": 0.5244406441499216,
"grad_norm": 6.356544494628906,
"learning_rate": 4.996229711925671e-05,
"loss": 0.8231,
"num_input_tokens_seen": 8992576,
"step": 460
},
{
"epoch": 0.5301410859341599,
"grad_norm": 4.418157577514648,
"learning_rate": 4.996147324864132e-05,
"loss": 0.7168,
"num_input_tokens_seen": 9090272,
"step": 465
},
{
"epoch": 0.5358415277183982,
"grad_norm": 8.712305068969727,
"learning_rate": 4.996064048052056e-05,
"loss": 0.7672,
"num_input_tokens_seen": 9188080,
"step": 470
},
{
"epoch": 0.5415419695026364,
"grad_norm": 8.759718894958496,
"learning_rate": 4.995979881519126e-05,
"loss": 0.7601,
"num_input_tokens_seen": 9285872,
"step": 475
},
{
"epoch": 0.5472424112868748,
"grad_norm": 7.049539089202881,
"learning_rate": 4.995894825295343e-05,
"loss": 0.802,
"num_input_tokens_seen": 9383584,
"step": 480
},
{
"epoch": 0.552942853071113,
"grad_norm": 7.416094779968262,
"learning_rate": 4.995808879411026e-05,
"loss": 0.7645,
"num_input_tokens_seen": 9481200,
"step": 485
},
{
"epoch": 0.5586432948553512,
"grad_norm": 6.9029693603515625,
"learning_rate": 4.995722043896809e-05,
"loss": 0.6875,
"num_input_tokens_seen": 9578944,
"step": 490
},
{
"epoch": 0.5643437366395896,
"grad_norm": 7.398702621459961,
"learning_rate": 4.995634318783646e-05,
"loss": 0.7829,
"num_input_tokens_seen": 9676688,
"step": 495
},
{
"epoch": 0.5700441784238278,
"grad_norm": 7.631560802459717,
"learning_rate": 4.9955457041028055e-05,
"loss": 0.7324,
"num_input_tokens_seen": 9774464,
"step": 500
},
{
"epoch": 0.5757446202080662,
"grad_norm": 9.913789749145508,
"learning_rate": 4.995456199885875e-05,
"loss": 0.7578,
"num_input_tokens_seen": 9872160,
"step": 505
},
{
"epoch": 0.5814450619923044,
"grad_norm": 9.40986442565918,
"learning_rate": 4.995365806164758e-05,
"loss": 0.951,
"num_input_tokens_seen": 9969904,
"step": 510
},
{
"epoch": 0.5871455037765427,
"grad_norm": 5.404745578765869,
"learning_rate": 4.995274522971675e-05,
"loss": 0.7427,
"num_input_tokens_seen": 10067648,
"step": 515
},
{
"epoch": 0.592845945560781,
"grad_norm": 6.450439929962158,
"learning_rate": 4.9951823503391634e-05,
"loss": 0.75,
"num_input_tokens_seen": 10165456,
"step": 520
},
{
"epoch": 0.5985463873450192,
"grad_norm": 7.56156587600708,
"learning_rate": 4.9950892883000786e-05,
"loss": 0.7311,
"num_input_tokens_seen": 10263152,
"step": 525
},
{
"epoch": 0.6042468291292575,
"grad_norm": 5.007820129394531,
"learning_rate": 4.994995336887593e-05,
"loss": 0.7088,
"num_input_tokens_seen": 10360848,
"step": 530
},
{
"epoch": 0.6099472709134958,
"grad_norm": 6.651803016662598,
"learning_rate": 4.994900496135195e-05,
"loss": 0.7473,
"num_input_tokens_seen": 10458496,
"step": 535
},
{
"epoch": 0.6156477126977341,
"grad_norm": 4.845729351043701,
"learning_rate": 4.9948047660766904e-05,
"loss": 0.6939,
"num_input_tokens_seen": 10556304,
"step": 540
},
{
"epoch": 0.6213481544819723,
"grad_norm": 7.277071475982666,
"learning_rate": 4.994708146746203e-05,
"loss": 0.7219,
"num_input_tokens_seen": 10654048,
"step": 545
},
{
"epoch": 0.6270485962662107,
"grad_norm": 7.703381061553955,
"learning_rate": 4.994610638178172e-05,
"loss": 0.7795,
"num_input_tokens_seen": 10751776,
"step": 550
},
{
"epoch": 0.6327490380504489,
"grad_norm": 8.279520988464355,
"learning_rate": 4.994512240407354e-05,
"loss": 0.7027,
"num_input_tokens_seen": 10849584,
"step": 555
},
{
"epoch": 0.6384494798346871,
"grad_norm": 10.189576148986816,
"learning_rate": 4.9944129534688234e-05,
"loss": 0.6917,
"num_input_tokens_seen": 10947264,
"step": 560
},
{
"epoch": 0.6441499216189255,
"grad_norm": 6.311273574829102,
"learning_rate": 4.994312777397972e-05,
"loss": 0.7335,
"num_input_tokens_seen": 11045120,
"step": 565
},
{
"epoch": 0.6498503634031637,
"grad_norm": 9.937539100646973,
"learning_rate": 4.994211712230504e-05,
"loss": 0.6367,
"num_input_tokens_seen": 11142864,
"step": 570
},
{
"epoch": 0.655550805187402,
"grad_norm": 9.992775917053223,
"learning_rate": 4.994109758002447e-05,
"loss": 0.7662,
"num_input_tokens_seen": 11240560,
"step": 575
},
{
"epoch": 0.6612512469716403,
"grad_norm": 6.363308429718018,
"learning_rate": 4.994006914750143e-05,
"loss": 0.7291,
"num_input_tokens_seen": 11338320,
"step": 580
},
{
"epoch": 0.6669516887558786,
"grad_norm": 6.920602321624756,
"learning_rate": 4.993903182510249e-05,
"loss": 0.6525,
"num_input_tokens_seen": 11436032,
"step": 585
},
{
"epoch": 0.6726521305401169,
"grad_norm": 6.734442234039307,
"learning_rate": 4.99379856131974e-05,
"loss": 0.6581,
"num_input_tokens_seen": 11533680,
"step": 590
},
{
"epoch": 0.6783525723243551,
"grad_norm": 6.08076810836792,
"learning_rate": 4.99369305121591e-05,
"loss": 0.6868,
"num_input_tokens_seen": 11631344,
"step": 595
},
{
"epoch": 0.6840530141085934,
"grad_norm": 5.305174827575684,
"learning_rate": 4.9935866522363665e-05,
"loss": 0.7231,
"num_input_tokens_seen": 11729104,
"step": 600
},
{
"epoch": 0.6897534558928317,
"grad_norm": 5.337072849273682,
"learning_rate": 4.9934793644190345e-05,
"loss": 0.7082,
"num_input_tokens_seen": 11826880,
"step": 605
},
{
"epoch": 0.69545389767707,
"grad_norm": 6.563253879547119,
"learning_rate": 4.993371187802159e-05,
"loss": 0.7412,
"num_input_tokens_seen": 11924592,
"step": 610
},
{
"epoch": 0.7011543394613082,
"grad_norm": 6.92053747177124,
"learning_rate": 4.993262122424298e-05,
"loss": 0.6752,
"num_input_tokens_seen": 12022256,
"step": 615
},
{
"epoch": 0.7068547812455466,
"grad_norm": 10.413783073425293,
"learning_rate": 4.9931521683243276e-05,
"loss": 0.6955,
"num_input_tokens_seen": 12120000,
"step": 620
},
{
"epoch": 0.7125552230297848,
"grad_norm": 6.970921039581299,
"learning_rate": 4.993041325541442e-05,
"loss": 0.6883,
"num_input_tokens_seen": 12217808,
"step": 625
},
{
"epoch": 0.718255664814023,
"grad_norm": 5.135336875915527,
"learning_rate": 4.992929594115151e-05,
"loss": 0.6039,
"num_input_tokens_seen": 12315616,
"step": 630
},
{
"epoch": 0.7239561065982614,
"grad_norm": 7.350869655609131,
"learning_rate": 4.99281697408528e-05,
"loss": 0.7195,
"num_input_tokens_seen": 12413376,
"step": 635
},
{
"epoch": 0.7296565483824996,
"grad_norm": 6.427408218383789,
"learning_rate": 4.992703465491974e-05,
"loss": 0.5395,
"num_input_tokens_seen": 12510960,
"step": 640
},
{
"epoch": 0.735356990166738,
"grad_norm": 7.422171592712402,
"learning_rate": 4.992589068375691e-05,
"loss": 0.5605,
"num_input_tokens_seen": 12608752,
"step": 645
},
{
"epoch": 0.7410574319509762,
"grad_norm": 10.039104461669922,
"learning_rate": 4.9924737827772104e-05,
"loss": 0.6171,
"num_input_tokens_seen": 12706448,
"step": 650
},
{
"epoch": 0.7467578737352145,
"grad_norm": 6.769627094268799,
"learning_rate": 4.992357608737623e-05,
"loss": 0.6656,
"num_input_tokens_seen": 12804144,
"step": 655
},
{
"epoch": 0.7524583155194527,
"grad_norm": 6.161548614501953,
"learning_rate": 4.992240546298341e-05,
"loss": 0.6412,
"num_input_tokens_seen": 12902000,
"step": 660
},
{
"epoch": 0.7581587573036911,
"grad_norm": 9.02010440826416,
"learning_rate": 4.9921225955010906e-05,
"loss": 0.6899,
"num_input_tokens_seen": 12999648,
"step": 665
},
{
"epoch": 0.7638591990879293,
"grad_norm": 5.683040618896484,
"learning_rate": 4.9920037563879155e-05,
"loss": 0.5788,
"num_input_tokens_seen": 13097424,
"step": 670
},
{
"epoch": 0.7695596408721675,
"grad_norm": 5.576777935028076,
"learning_rate": 4.9918840290011745e-05,
"loss": 0.6354,
"num_input_tokens_seen": 13195136,
"step": 675
},
{
"epoch": 0.7752600826564059,
"grad_norm": 6.484269142150879,
"learning_rate": 4.9917634133835466e-05,
"loss": 0.6004,
"num_input_tokens_seen": 13292912,
"step": 680
},
{
"epoch": 0.7809605244406441,
"grad_norm": 5.845834732055664,
"learning_rate": 4.991641909578023e-05,
"loss": 0.6065,
"num_input_tokens_seen": 13390560,
"step": 685
},
{
"epoch": 0.7866609662248825,
"grad_norm": 7.066195011138916,
"learning_rate": 4.9915195176279156e-05,
"loss": 0.703,
"num_input_tokens_seen": 13488304,
"step": 690
},
{
"epoch": 0.7923614080091207,
"grad_norm": 7.687030792236328,
"learning_rate": 4.9913962375768494e-05,
"loss": 0.5684,
"num_input_tokens_seen": 13586032,
"step": 695
},
{
"epoch": 0.798061849793359,
"grad_norm": 5.923397064208984,
"learning_rate": 4.9912720694687684e-05,
"loss": 0.7124,
"num_input_tokens_seen": 13683792,
"step": 700
},
{
"epoch": 0.8037622915775973,
"grad_norm": 7.307689666748047,
"learning_rate": 4.9911470133479324e-05,
"loss": 0.585,
"num_input_tokens_seen": 13781488,
"step": 705
},
{
"epoch": 0.8094627333618355,
"grad_norm": 5.22707462310791,
"learning_rate": 4.9910210692589164e-05,
"loss": 0.6301,
"num_input_tokens_seen": 13879264,
"step": 710
},
{
"epoch": 0.8151631751460738,
"grad_norm": 6.6996870040893555,
"learning_rate": 4.990894237246615e-05,
"loss": 0.6073,
"num_input_tokens_seen": 13976976,
"step": 715
},
{
"epoch": 0.8208636169303121,
"grad_norm": 9.039154052734375,
"learning_rate": 4.990766517356236e-05,
"loss": 0.6611,
"num_input_tokens_seen": 14074688,
"step": 720
},
{
"epoch": 0.8265640587145504,
"grad_norm": 3.3328487873077393,
"learning_rate": 4.9906379096333047e-05,
"loss": 0.5829,
"num_input_tokens_seen": 14172432,
"step": 725
},
{
"epoch": 0.8322645004987886,
"grad_norm": 9.260608673095703,
"learning_rate": 4.9905084141236646e-05,
"loss": 0.7311,
"num_input_tokens_seen": 14270112,
"step": 730
},
{
"epoch": 0.837964942283027,
"grad_norm": 7.124883651733398,
"learning_rate": 4.990378030873474e-05,
"loss": 0.6354,
"num_input_tokens_seen": 14367792,
"step": 735
},
{
"epoch": 0.8436653840672652,
"grad_norm": 5.522550106048584,
"learning_rate": 4.990246759929207e-05,
"loss": 0.5578,
"num_input_tokens_seen": 14465584,
"step": 740
},
{
"epoch": 0.8493658258515034,
"grad_norm": 8.491950035095215,
"learning_rate": 4.9901146013376556e-05,
"loss": 0.6489,
"num_input_tokens_seen": 14563344,
"step": 745
},
{
"epoch": 0.8550662676357418,
"grad_norm": 6.821796417236328,
"learning_rate": 4.989981555145928e-05,
"loss": 0.451,
"num_input_tokens_seen": 14661024,
"step": 750
},
{
"epoch": 0.86076670941998,
"grad_norm": 8.655888557434082,
"learning_rate": 4.9898476214014486e-05,
"loss": 0.6291,
"num_input_tokens_seen": 14758800,
"step": 755
},
{
"epoch": 0.8664671512042184,
"grad_norm": 8.26075267791748,
"learning_rate": 4.989712800151958e-05,
"loss": 0.7259,
"num_input_tokens_seen": 14856592,
"step": 760
},
{
"epoch": 0.8721675929884566,
"grad_norm": 6.850794315338135,
"learning_rate": 4.989577091445512e-05,
"loss": 0.5639,
"num_input_tokens_seen": 14954304,
"step": 765
},
{
"epoch": 0.8778680347726949,
"grad_norm": 9.18870735168457,
"learning_rate": 4.989440495330485e-05,
"loss": 0.616,
"num_input_tokens_seen": 15052016,
"step": 770
},
{
"epoch": 0.8835684765569332,
"grad_norm": 9.08046817779541,
"learning_rate": 4.989303011855567e-05,
"loss": 0.5797,
"num_input_tokens_seen": 15149664,
"step": 775
},
{
"epoch": 0.8892689183411714,
"grad_norm": 5.607428073883057,
"learning_rate": 4.989164641069763e-05,
"loss": 0.5893,
"num_input_tokens_seen": 15247360,
"step": 780
},
{
"epoch": 0.8949693601254097,
"grad_norm": 6.935970783233643,
"learning_rate": 4.9890253830223955e-05,
"loss": 0.6095,
"num_input_tokens_seen": 15345056,
"step": 785
},
{
"epoch": 0.900669801909648,
"grad_norm": 6.799474239349365,
"learning_rate": 4.988885237763102e-05,
"loss": 0.5044,
"num_input_tokens_seen": 15442752,
"step": 790
},
{
"epoch": 0.9063702436938863,
"grad_norm": 6.294219017028809,
"learning_rate": 4.98874420534184e-05,
"loss": 0.5584,
"num_input_tokens_seen": 15540464,
"step": 795
},
{
"epoch": 0.9120706854781245,
"grad_norm": 5.488597869873047,
"learning_rate": 4.988602285808877e-05,
"loss": 0.4862,
"num_input_tokens_seen": 15638128,
"step": 800
},
{
"epoch": 0.9177711272623629,
"grad_norm": 8.307422637939453,
"learning_rate": 4.988459479214802e-05,
"loss": 0.5815,
"num_input_tokens_seen": 15735872,
"step": 805
},
{
"epoch": 0.9234715690466011,
"grad_norm": 10.344627380371094,
"learning_rate": 4.988315785610519e-05,
"loss": 0.5963,
"num_input_tokens_seen": 15833680,
"step": 810
},
{
"epoch": 0.9291720108308394,
"grad_norm": 10.354679107666016,
"learning_rate": 4.9881712050472464e-05,
"loss": 0.6225,
"num_input_tokens_seen": 15931472,
"step": 815
},
{
"epoch": 0.9348724526150777,
"grad_norm": 7.605050086975098,
"learning_rate": 4.9880257375765194e-05,
"loss": 0.645,
"num_input_tokens_seen": 16029120,
"step": 820
},
{
"epoch": 0.9405728943993159,
"grad_norm": 5.717419624328613,
"learning_rate": 4.987879383250191e-05,
"loss": 0.5142,
"num_input_tokens_seen": 16126896,
"step": 825
},
{
"epoch": 0.9462733361835542,
"grad_norm": 7.159694194793701,
"learning_rate": 4.987732142120428e-05,
"loss": 0.6613,
"num_input_tokens_seen": 16224592,
"step": 830
},
{
"epoch": 0.9519737779677925,
"grad_norm": 7.166426658630371,
"learning_rate": 4.987584014239716e-05,
"loss": 0.6094,
"num_input_tokens_seen": 16322208,
"step": 835
},
{
"epoch": 0.9576742197520308,
"grad_norm": 7.844811916351318,
"learning_rate": 4.9874349996608536e-05,
"loss": 0.5613,
"num_input_tokens_seen": 16419904,
"step": 840
},
{
"epoch": 0.963374661536269,
"grad_norm": 5.295498371124268,
"learning_rate": 4.987285098436958e-05,
"loss": 0.4958,
"num_input_tokens_seen": 16517600,
"step": 845
},
{
"epoch": 0.9690751033205074,
"grad_norm": 5.007256984710693,
"learning_rate": 4.987134310621461e-05,
"loss": 0.5119,
"num_input_tokens_seen": 16615216,
"step": 850
},
{
"epoch": 0.9747755451047456,
"grad_norm": 7.532383918762207,
"learning_rate": 4.9869826362681096e-05,
"loss": 0.4567,
"num_input_tokens_seen": 16713040,
"step": 855
},
{
"epoch": 0.9804759868889839,
"grad_norm": 6.256499767303467,
"learning_rate": 4.9868300754309706e-05,
"loss": 0.5088,
"num_input_tokens_seen": 16810768,
"step": 860
},
{
"epoch": 0.9861764286732222,
"grad_norm": 6.756839752197266,
"learning_rate": 4.986676628164423e-05,
"loss": 0.4097,
"num_input_tokens_seen": 16908512,
"step": 865
},
{
"epoch": 0.9918768704574604,
"grad_norm": 6.562160491943359,
"learning_rate": 4.986522294523162e-05,
"loss": 0.3819,
"num_input_tokens_seen": 17006240,
"step": 870
},
{
"epoch": 0.9975773122416988,
"grad_norm": 7.496212959289551,
"learning_rate": 4.9863670745622015e-05,
"loss": 0.4956,
"num_input_tokens_seen": 17104000,
"step": 875
},
{
"epoch": 1.0022801767136953,
"grad_norm": 9.028531074523926,
"learning_rate": 4.986210968336868e-05,
"loss": 0.5872,
"num_input_tokens_seen": 17184592,
"step": 880
},
{
"epoch": 1.0079806184979336,
"grad_norm": 6.04398775100708,
"learning_rate": 4.986053975902807e-05,
"loss": 0.48,
"num_input_tokens_seen": 17282304,
"step": 885
},
{
"epoch": 1.013681060282172,
"grad_norm": 9.602685928344727,
"learning_rate": 4.985896097315977e-05,
"loss": 0.5309,
"num_input_tokens_seen": 17380080,
"step": 890
},
{
"epoch": 1.01938150206641,
"grad_norm": 6.881324291229248,
"learning_rate": 4.9857373326326545e-05,
"loss": 0.5103,
"num_input_tokens_seen": 17477760,
"step": 895
},
{
"epoch": 1.0250819438506484,
"grad_norm": 8.762344360351562,
"learning_rate": 4.985577681909431e-05,
"loss": 0.5336,
"num_input_tokens_seen": 17575456,
"step": 900
},
{
"epoch": 1.0307823856348868,
"grad_norm": 7.028131484985352,
"learning_rate": 4.985417145203214e-05,
"loss": 0.4887,
"num_input_tokens_seen": 17673184,
"step": 905
},
{
"epoch": 1.036482827419125,
"grad_norm": 6.512467861175537,
"learning_rate": 4.985255722571227e-05,
"loss": 0.4787,
"num_input_tokens_seen": 17770944,
"step": 910
},
{
"epoch": 1.0421832692033632,
"grad_norm": 6.597855567932129,
"learning_rate": 4.985093414071008e-05,
"loss": 0.5185,
"num_input_tokens_seen": 17868768,
"step": 915
},
{
"epoch": 1.0478837109876016,
"grad_norm": 7.660828590393066,
"learning_rate": 4.984930219760413e-05,
"loss": 0.5056,
"num_input_tokens_seen": 17966480,
"step": 920
},
{
"epoch": 1.05358415277184,
"grad_norm": 6.880988121032715,
"learning_rate": 4.984766139697611e-05,
"loss": 0.5371,
"num_input_tokens_seen": 18064336,
"step": 925
},
{
"epoch": 1.059284594556078,
"grad_norm": 5.854323863983154,
"learning_rate": 4.98460117394109e-05,
"loss": 0.5041,
"num_input_tokens_seen": 18162112,
"step": 930
},
{
"epoch": 1.0649850363403164,
"grad_norm": 5.215938568115234,
"learning_rate": 4.984435322549651e-05,
"loss": 0.4857,
"num_input_tokens_seen": 18259904,
"step": 935
},
{
"epoch": 1.0706854781245547,
"grad_norm": 5.902091026306152,
"learning_rate": 4.984268585582412e-05,
"loss": 0.5047,
"num_input_tokens_seen": 18357616,
"step": 940
},
{
"epoch": 1.0763859199087928,
"grad_norm": 7.6616411209106445,
"learning_rate": 4.9841009630988064e-05,
"loss": 0.4147,
"num_input_tokens_seen": 18455392,
"step": 945
},
{
"epoch": 1.0820863616930312,
"grad_norm": 7.779905796051025,
"learning_rate": 4.983932455158583e-05,
"loss": 0.4762,
"num_input_tokens_seen": 18553120,
"step": 950
},
{
"epoch": 1.0877868034772695,
"grad_norm": 6.425886154174805,
"learning_rate": 4.9837630618218056e-05,
"loss": 0.4129,
"num_input_tokens_seen": 18650864,
"step": 955
},
{
"epoch": 1.0934872452615079,
"grad_norm": 8.044005393981934,
"learning_rate": 4.983592783148856e-05,
"loss": 0.4027,
"num_input_tokens_seen": 18748624,
"step": 960
},
{
"epoch": 1.099187687045746,
"grad_norm": 5.91091251373291,
"learning_rate": 4.983421619200428e-05,
"loss": 0.4064,
"num_input_tokens_seen": 18846320,
"step": 965
},
{
"epoch": 1.1048881288299843,
"grad_norm": 5.88447380065918,
"learning_rate": 4.9832495700375346e-05,
"loss": 0.4599,
"num_input_tokens_seen": 18944144,
"step": 970
},
{
"epoch": 1.1105885706142227,
"grad_norm": 7.686187744140625,
"learning_rate": 4.983076635721502e-05,
"loss": 0.4764,
"num_input_tokens_seen": 19041904,
"step": 975
},
{
"epoch": 1.1162890123984608,
"grad_norm": 6.358628273010254,
"learning_rate": 4.982902816313972e-05,
"loss": 0.4844,
"num_input_tokens_seen": 19139664,
"step": 980
},
{
"epoch": 1.1219894541826991,
"grad_norm": 6.775269508361816,
"learning_rate": 4.982728111876903e-05,
"loss": 0.4292,
"num_input_tokens_seen": 19237488,
"step": 985
},
{
"epoch": 1.1276898959669375,
"grad_norm": 7.491086483001709,
"learning_rate": 4.982552522472569e-05,
"loss": 0.4423,
"num_input_tokens_seen": 19335152,
"step": 990
},
{
"epoch": 1.1333903377511758,
"grad_norm": 7.664567947387695,
"learning_rate": 4.982376048163557e-05,
"loss": 0.4983,
"num_input_tokens_seen": 19432976,
"step": 995
},
{
"epoch": 1.139090779535414,
"grad_norm": 4.956116676330566,
"learning_rate": 4.9821986890127734e-05,
"loss": 0.4027,
"num_input_tokens_seen": 19530704,
"step": 1000
},
{
"epoch": 1.1447912213196523,
"grad_norm": 7.820266246795654,
"learning_rate": 4.982020445083436e-05,
"loss": 0.4131,
"num_input_tokens_seen": 19628448,
"step": 1005
},
{
"epoch": 1.1504916631038906,
"grad_norm": 5.402007102966309,
"learning_rate": 4.981841316439081e-05,
"loss": 0.4946,
"num_input_tokens_seen": 19726176,
"step": 1010
},
{
"epoch": 1.1561921048881287,
"grad_norm": 6.406668663024902,
"learning_rate": 4.981661303143557e-05,
"loss": 0.4701,
"num_input_tokens_seen": 19823856,
"step": 1015
},
{
"epoch": 1.161892546672367,
"grad_norm": 3.809847116470337,
"learning_rate": 4.981480405261032e-05,
"loss": 0.4063,
"num_input_tokens_seen": 19921552,
"step": 1020
},
{
"epoch": 1.1675929884566054,
"grad_norm": 8.321266174316406,
"learning_rate": 4.981298622855984e-05,
"loss": 0.38,
"num_input_tokens_seen": 20019248,
"step": 1025
},
{
"epoch": 1.1732934302408438,
"grad_norm": 4.611199855804443,
"learning_rate": 4.981115955993213e-05,
"loss": 0.3435,
"num_input_tokens_seen": 20116992,
"step": 1030
},
{
"epoch": 1.1789938720250819,
"grad_norm": 6.748137950897217,
"learning_rate": 4.980932404737827e-05,
"loss": 0.4443,
"num_input_tokens_seen": 20214848,
"step": 1035
},
{
"epoch": 1.1846943138093202,
"grad_norm": 7.327335834503174,
"learning_rate": 4.980747969155255e-05,
"loss": 0.5365,
"num_input_tokens_seen": 20312608,
"step": 1040
},
{
"epoch": 1.1903947555935586,
"grad_norm": 9.424795150756836,
"learning_rate": 4.980562649311238e-05,
"loss": 0.404,
"num_input_tokens_seen": 20410288,
"step": 1045
},
{
"epoch": 1.196095197377797,
"grad_norm": 6.2012152671813965,
"learning_rate": 4.9803764452718335e-05,
"loss": 0.4176,
"num_input_tokens_seen": 20508080,
"step": 1050
},
{
"epoch": 1.201795639162035,
"grad_norm": 6.23061990737915,
"learning_rate": 4.980189357103414e-05,
"loss": 0.3945,
"num_input_tokens_seen": 20605856,
"step": 1055
},
{
"epoch": 1.2074960809462734,
"grad_norm": 8.05282974243164,
"learning_rate": 4.980001384872666e-05,
"loss": 0.5353,
"num_input_tokens_seen": 20703584,
"step": 1060
},
{
"epoch": 1.2131965227305117,
"grad_norm": 7.0456156730651855,
"learning_rate": 4.9798125286465935e-05,
"loss": 0.4638,
"num_input_tokens_seen": 20801376,
"step": 1065
},
{
"epoch": 1.2188969645147498,
"grad_norm": 5.285283088684082,
"learning_rate": 4.979622788492513e-05,
"loss": 0.5492,
"num_input_tokens_seen": 20899200,
"step": 1070
},
{
"epoch": 1.2245974062989882,
"grad_norm": 7.358059883117676,
"learning_rate": 4.9794321644780585e-05,
"loss": 0.4979,
"num_input_tokens_seen": 20996928,
"step": 1075
},
{
"epoch": 1.2302978480832265,
"grad_norm": 6.339309215545654,
"learning_rate": 4.979240656671177e-05,
"loss": 0.3867,
"num_input_tokens_seen": 21094752,
"step": 1080
},
{
"epoch": 1.2359982898674646,
"grad_norm": 6.887006759643555,
"learning_rate": 4.979048265140132e-05,
"loss": 0.338,
"num_input_tokens_seen": 21192480,
"step": 1085
},
{
"epoch": 1.241698731651703,
"grad_norm": 7.377925395965576,
"learning_rate": 4.9788549899535e-05,
"loss": 0.3946,
"num_input_tokens_seen": 21290144,
"step": 1090
},
{
"epoch": 1.2473991734359413,
"grad_norm": 7.47123384475708,
"learning_rate": 4.978660831180175e-05,
"loss": 0.4831,
"num_input_tokens_seen": 21387888,
"step": 1095
},
{
"epoch": 1.2530996152201794,
"grad_norm": 9.348438262939453,
"learning_rate": 4.978465788889365e-05,
"loss": 0.4933,
"num_input_tokens_seen": 21485536,
"step": 1100
},
{
"epoch": 1.2588000570044178,
"grad_norm": 8.355619430541992,
"learning_rate": 4.978269863150592e-05,
"loss": 0.4139,
"num_input_tokens_seen": 21583264,
"step": 1105
},
{
"epoch": 1.2645004987886561,
"grad_norm": 5.923043727874756,
"learning_rate": 4.978073054033694e-05,
"loss": 0.3656,
"num_input_tokens_seen": 21681040,
"step": 1110
},
{
"epoch": 1.2702009405728945,
"grad_norm": 7.194669246673584,
"learning_rate": 4.977875361608823e-05,
"loss": 0.3487,
"num_input_tokens_seen": 21778720,
"step": 1115
},
{
"epoch": 1.2759013823571328,
"grad_norm": 7.351987838745117,
"learning_rate": 4.9776767859464474e-05,
"loss": 0.4004,
"num_input_tokens_seen": 21876496,
"step": 1120
},
{
"epoch": 1.281601824141371,
"grad_norm": 6.509387493133545,
"learning_rate": 4.9774773271173494e-05,
"loss": 0.3702,
"num_input_tokens_seen": 21974256,
"step": 1125
},
{
"epoch": 1.2873022659256093,
"grad_norm": 10.99763298034668,
"learning_rate": 4.977276985192624e-05,
"loss": 0.3921,
"num_input_tokens_seen": 22071952,
"step": 1130
},
{
"epoch": 1.2930027077098476,
"grad_norm": 5.566549777984619,
"learning_rate": 4.977075760243686e-05,
"loss": 0.4117,
"num_input_tokens_seen": 22169696,
"step": 1135
},
{
"epoch": 1.2987031494940857,
"grad_norm": 8.485737800598145,
"learning_rate": 4.976873652342259e-05,
"loss": 0.394,
"num_input_tokens_seen": 22267456,
"step": 1140
},
{
"epoch": 1.304403591278324,
"grad_norm": 7.3099284172058105,
"learning_rate": 4.976670661560386e-05,
"loss": 0.2883,
"num_input_tokens_seen": 22365120,
"step": 1145
},
{
"epoch": 1.3101040330625624,
"grad_norm": 6.294934272766113,
"learning_rate": 4.976466787970423e-05,
"loss": 0.3503,
"num_input_tokens_seen": 22462880,
"step": 1150
},
{
"epoch": 1.3158044748468005,
"grad_norm": 5.884027004241943,
"learning_rate": 4.97626203164504e-05,
"loss": 0.3098,
"num_input_tokens_seen": 22560640,
"step": 1155
},
{
"epoch": 1.3215049166310389,
"grad_norm": 7.804978847503662,
"learning_rate": 4.9760563926572226e-05,
"loss": 0.3423,
"num_input_tokens_seen": 22658368,
"step": 1160
},
{
"epoch": 1.3272053584152772,
"grad_norm": 7.155725002288818,
"learning_rate": 4.97584987108027e-05,
"loss": 0.3006,
"num_input_tokens_seen": 22756176,
"step": 1165
},
{
"epoch": 1.3329058001995155,
"grad_norm": 6.071112632751465,
"learning_rate": 4.975642466987799e-05,
"loss": 0.3357,
"num_input_tokens_seen": 22853920,
"step": 1170
},
{
"epoch": 1.3386062419837537,
"grad_norm": 5.568732738494873,
"learning_rate": 4.9754341804537356e-05,
"loss": 0.3445,
"num_input_tokens_seen": 22951664,
"step": 1175
},
{
"epoch": 1.344306683767992,
"grad_norm": 9.902073860168457,
"learning_rate": 4.975225011552326e-05,
"loss": 0.3621,
"num_input_tokens_seen": 23049520,
"step": 1180
},
{
"epoch": 1.3500071255522303,
"grad_norm": 5.503910064697266,
"learning_rate": 4.975014960358126e-05,
"loss": 0.3229,
"num_input_tokens_seen": 23147280,
"step": 1185
},
{
"epoch": 1.3557075673364687,
"grad_norm": 7.572802543640137,
"learning_rate": 4.974804026946011e-05,
"loss": 0.5356,
"num_input_tokens_seen": 23245008,
"step": 1190
},
{
"epoch": 1.3614080091207068,
"grad_norm": 7.335933208465576,
"learning_rate": 4.9745922113911655e-05,
"loss": 0.364,
"num_input_tokens_seen": 23342768,
"step": 1195
},
{
"epoch": 1.3671084509049451,
"grad_norm": 10.062085151672363,
"learning_rate": 4.974379513769093e-05,
"loss": 0.384,
"num_input_tokens_seen": 23440480,
"step": 1200
},
{
"epoch": 1.3728088926891835,
"grad_norm": 10.50871467590332,
"learning_rate": 4.974165934155608e-05,
"loss": 0.357,
"num_input_tokens_seen": 23538192,
"step": 1205
},
{
"epoch": 1.3785093344734216,
"grad_norm": 6.676635265350342,
"learning_rate": 4.9739514726268416e-05,
"loss": 0.316,
"num_input_tokens_seen": 23635984,
"step": 1210
},
{
"epoch": 1.38420977625766,
"grad_norm": 9.456487655639648,
"learning_rate": 4.973736129259239e-05,
"loss": 0.3407,
"num_input_tokens_seen": 23733744,
"step": 1215
},
{
"epoch": 1.3899102180418983,
"grad_norm": 7.790709495544434,
"learning_rate": 4.9735199041295575e-05,
"loss": 0.422,
"num_input_tokens_seen": 23831440,
"step": 1220
},
{
"epoch": 1.3956106598261364,
"grad_norm": 9.099756240844727,
"learning_rate": 4.9733027973148727e-05,
"loss": 0.4655,
"num_input_tokens_seen": 23929184,
"step": 1225
},
{
"epoch": 1.4013111016103748,
"grad_norm": 6.994203567504883,
"learning_rate": 4.9730848088925706e-05,
"loss": 0.388,
"num_input_tokens_seen": 24026928,
"step": 1230
},
{
"epoch": 1.407011543394613,
"grad_norm": 7.203540325164795,
"learning_rate": 4.9728659389403535e-05,
"loss": 0.4004,
"num_input_tokens_seen": 24124688,
"step": 1235
},
{
"epoch": 1.4127119851788514,
"grad_norm": 7.220198631286621,
"learning_rate": 4.9726461875362377e-05,
"loss": 0.3321,
"num_input_tokens_seen": 24222416,
"step": 1240
},
{
"epoch": 1.4184124269630896,
"grad_norm": 6.651162147521973,
"learning_rate": 4.9724255547585534e-05,
"loss": 0.2864,
"num_input_tokens_seen": 24320096,
"step": 1245
},
{
"epoch": 1.424112868747328,
"grad_norm": 7.986251354217529,
"learning_rate": 4.9722040406859454e-05,
"loss": 0.3401,
"num_input_tokens_seen": 24417712,
"step": 1250
},
{
"epoch": 1.4298133105315662,
"grad_norm": 6.927532196044922,
"learning_rate": 4.971981645397371e-05,
"loss": 0.344,
"num_input_tokens_seen": 24515456,
"step": 1255
},
{
"epoch": 1.4355137523158046,
"grad_norm": 8.963294982910156,
"learning_rate": 4.9717583689721046e-05,
"loss": 0.3394,
"num_input_tokens_seen": 24613232,
"step": 1260
},
{
"epoch": 1.4412141941000427,
"grad_norm": 9.106192588806152,
"learning_rate": 4.9715342114897325e-05,
"loss": 0.4323,
"num_input_tokens_seen": 24710960,
"step": 1265
},
{
"epoch": 1.446914635884281,
"grad_norm": 8.095370292663574,
"learning_rate": 4.971309173030154e-05,
"loss": 0.3961,
"num_input_tokens_seen": 24808560,
"step": 1270
},
{
"epoch": 1.4526150776685194,
"grad_norm": 7.318641662597656,
"learning_rate": 4.9710832536735864e-05,
"loss": 0.2917,
"num_input_tokens_seen": 24906320,
"step": 1275
},
{
"epoch": 1.4583155194527575,
"grad_norm": 7.140157699584961,
"learning_rate": 4.970856453500557e-05,
"loss": 0.3622,
"num_input_tokens_seen": 25004016,
"step": 1280
},
{
"epoch": 1.4640159612369958,
"grad_norm": 8.635784149169922,
"learning_rate": 4.970628772591909e-05,
"loss": 0.4472,
"num_input_tokens_seen": 25101808,
"step": 1285
},
{
"epoch": 1.4697164030212342,
"grad_norm": 9.65007495880127,
"learning_rate": 4.970400211028798e-05,
"loss": 0.3185,
"num_input_tokens_seen": 25199568,
"step": 1290
},
{
"epoch": 1.4754168448054723,
"grad_norm": 6.725943088531494,
"learning_rate": 4.970170768892697e-05,
"loss": 0.4232,
"num_input_tokens_seen": 25297296,
"step": 1295
},
{
"epoch": 1.4811172865897106,
"grad_norm": 7.719542980194092,
"learning_rate": 4.9699404462653887e-05,
"loss": 0.3133,
"num_input_tokens_seen": 25395056,
"step": 1300
},
{
"epoch": 1.486817728373949,
"grad_norm": 7.277915954589844,
"learning_rate": 4.969709243228972e-05,
"loss": 0.3103,
"num_input_tokens_seen": 25492784,
"step": 1305
},
{
"epoch": 1.4925181701581873,
"grad_norm": 6.372420310974121,
"learning_rate": 4.96947715986586e-05,
"loss": 0.3191,
"num_input_tokens_seen": 25590528,
"step": 1310
},
{
"epoch": 1.4982186119424257,
"grad_norm": 5.824290752410889,
"learning_rate": 4.969244196258777e-05,
"loss": 0.2663,
"num_input_tokens_seen": 25688304,
"step": 1315
},
{
"epoch": 1.5039190537266638,
"grad_norm": 5.841516971588135,
"learning_rate": 4.969010352490764e-05,
"loss": 0.3178,
"num_input_tokens_seen": 25786096,
"step": 1320
},
{
"epoch": 1.5096194955109021,
"grad_norm": 7.10455846786499,
"learning_rate": 4.968775628645174e-05,
"loss": 0.4365,
"num_input_tokens_seen": 25883776,
"step": 1325
},
{
"epoch": 1.5153199372951405,
"grad_norm": 6.748999118804932,
"learning_rate": 4.9685400248056747e-05,
"loss": 0.2147,
"num_input_tokens_seen": 25981552,
"step": 1330
},
{
"epoch": 1.5210203790793786,
"grad_norm": 4.732318878173828,
"learning_rate": 4.968303541056246e-05,
"loss": 0.3367,
"num_input_tokens_seen": 26079312,
"step": 1335
},
{
"epoch": 1.526720820863617,
"grad_norm": 4.386244297027588,
"learning_rate": 4.9680661774811835e-05,
"loss": 0.3207,
"num_input_tokens_seen": 26177136,
"step": 1340
},
{
"epoch": 1.5324212626478553,
"grad_norm": 9.144301414489746,
"learning_rate": 4.967827934165095e-05,
"loss": 0.2718,
"num_input_tokens_seen": 26274944,
"step": 1345
},
{
"epoch": 1.5381217044320934,
"grad_norm": 10.099781036376953,
"learning_rate": 4.967588811192902e-05,
"loss": 0.3752,
"num_input_tokens_seen": 26372768,
"step": 1350
},
{
"epoch": 1.5438221462163317,
"grad_norm": 7.383282661437988,
"learning_rate": 4.96734880864984e-05,
"loss": 0.2743,
"num_input_tokens_seen": 26470608,
"step": 1355
},
{
"epoch": 1.54952258800057,
"grad_norm": 8.287309646606445,
"learning_rate": 4.967107926621457e-05,
"loss": 0.2853,
"num_input_tokens_seen": 26568368,
"step": 1360
},
{
"epoch": 1.5552230297848082,
"grad_norm": 9.22373104095459,
"learning_rate": 4.966866165193617e-05,
"loss": 0.2913,
"num_input_tokens_seen": 26666080,
"step": 1365
},
{
"epoch": 1.5609234715690468,
"grad_norm": 6.172590255737305,
"learning_rate": 4.966623524452494e-05,
"loss": 0.2775,
"num_input_tokens_seen": 26763792,
"step": 1370
},
{
"epoch": 1.5666239133532849,
"grad_norm": 7.464315891265869,
"learning_rate": 4.9663800044845784e-05,
"loss": 0.3685,
"num_input_tokens_seen": 26861488,
"step": 1375
},
{
"epoch": 1.572324355137523,
"grad_norm": 7.123498916625977,
"learning_rate": 4.9661356053766716e-05,
"loss": 0.3636,
"num_input_tokens_seen": 26959232,
"step": 1380
},
{
"epoch": 1.5780247969217616,
"grad_norm": 7.7689619064331055,
"learning_rate": 4.965890327215891e-05,
"loss": 0.3052,
"num_input_tokens_seen": 27057040,
"step": 1385
},
{
"epoch": 1.5837252387059997,
"grad_norm": 5.988576412200928,
"learning_rate": 4.965644170089665e-05,
"loss": 0.3355,
"num_input_tokens_seen": 27154768,
"step": 1390
},
{
"epoch": 1.589425680490238,
"grad_norm": 9.67150592803955,
"learning_rate": 4.965397134085735e-05,
"loss": 0.3239,
"num_input_tokens_seen": 27252480,
"step": 1395
},
{
"epoch": 1.5951261222744764,
"grad_norm": 9.156477928161621,
"learning_rate": 4.96514921929216e-05,
"loss": 0.3421,
"num_input_tokens_seen": 27350320,
"step": 1400
},
{
"epoch": 1.6008265640587145,
"grad_norm": 8.114056587219238,
"learning_rate": 4.964900425797306e-05,
"loss": 0.405,
"num_input_tokens_seen": 27448128,
"step": 1405
},
{
"epoch": 1.6065270058429528,
"grad_norm": 8.292765617370605,
"learning_rate": 4.9646507536898575e-05,
"loss": 0.2936,
"num_input_tokens_seen": 27545808,
"step": 1410
},
{
"epoch": 1.6122274476271912,
"grad_norm": 6.832367420196533,
"learning_rate": 4.964400203058809e-05,
"loss": 0.2365,
"num_input_tokens_seen": 27643456,
"step": 1415
},
{
"epoch": 1.6179278894114293,
"grad_norm": 8.706232070922852,
"learning_rate": 4.9641487739934684e-05,
"loss": 0.3065,
"num_input_tokens_seen": 27741168,
"step": 1420
},
{
"epoch": 1.6236283311956676,
"grad_norm": 7.025177478790283,
"learning_rate": 4.963896466583459e-05,
"loss": 0.2376,
"num_input_tokens_seen": 27838912,
"step": 1425
},
{
"epoch": 1.629328772979906,
"grad_norm": 9.140198707580566,
"learning_rate": 4.963643280918714e-05,
"loss": 0.2518,
"num_input_tokens_seen": 27936592,
"step": 1430
},
{
"epoch": 1.635029214764144,
"grad_norm": 8.371499061584473,
"learning_rate": 4.963389217089484e-05,
"loss": 0.3488,
"num_input_tokens_seen": 28034304,
"step": 1435
},
{
"epoch": 1.6407296565483827,
"grad_norm": 4.390028476715088,
"learning_rate": 4.963134275186327e-05,
"loss": 0.2444,
"num_input_tokens_seen": 28131984,
"step": 1440
},
{
"epoch": 1.6464300983326208,
"grad_norm": 10.252416610717773,
"learning_rate": 4.9628784553001185e-05,
"loss": 0.3859,
"num_input_tokens_seen": 28229680,
"step": 1445
},
{
"epoch": 1.652130540116859,
"grad_norm": 8.285685539245605,
"learning_rate": 4.962621757522044e-05,
"loss": 0.3006,
"num_input_tokens_seen": 28327440,
"step": 1450
},
{
"epoch": 1.6578309819010975,
"grad_norm": 8.214794158935547,
"learning_rate": 4.962364181943606e-05,
"loss": 0.2718,
"num_input_tokens_seen": 28425216,
"step": 1455
},
{
"epoch": 1.6635314236853356,
"grad_norm": 12.243093490600586,
"learning_rate": 4.9621057286566155e-05,
"loss": 0.3569,
"num_input_tokens_seen": 28522992,
"step": 1460
},
{
"epoch": 1.669231865469574,
"grad_norm": 5.961976528167725,
"learning_rate": 4.961846397753197e-05,
"loss": 0.2414,
"num_input_tokens_seen": 28620720,
"step": 1465
},
{
"epoch": 1.6749323072538123,
"grad_norm": 7.293554782867432,
"learning_rate": 4.961586189325791e-05,
"loss": 0.2259,
"num_input_tokens_seen": 28718464,
"step": 1470
},
{
"epoch": 1.6806327490380504,
"grad_norm": 7.203713893890381,
"learning_rate": 4.9613251034671465e-05,
"loss": 0.2356,
"num_input_tokens_seen": 28816368,
"step": 1475
},
{
"epoch": 1.6863331908222887,
"grad_norm": 6.28717565536499,
"learning_rate": 4.961063140270329e-05,
"loss": 0.3129,
"num_input_tokens_seen": 28914080,
"step": 1480
},
{
"epoch": 1.692033632606527,
"grad_norm": 5.234409809112549,
"learning_rate": 4.960800299828715e-05,
"loss": 0.2614,
"num_input_tokens_seen": 29011808,
"step": 1485
},
{
"epoch": 1.6977340743907652,
"grad_norm": 7.488391399383545,
"learning_rate": 4.960536582235993e-05,
"loss": 0.2573,
"num_input_tokens_seen": 29109488,
"step": 1490
},
{
"epoch": 1.7034345161750035,
"grad_norm": 7.9980292320251465,
"learning_rate": 4.960271987586166e-05,
"loss": 0.2409,
"num_input_tokens_seen": 29207232,
"step": 1495
},
{
"epoch": 1.7091349579592419,
"grad_norm": 4.908660888671875,
"learning_rate": 4.960006515973548e-05,
"loss": 0.2969,
"num_input_tokens_seen": 29304960,
"step": 1500
},
{
"epoch": 1.71483539974348,
"grad_norm": 7.019242763519287,
"learning_rate": 4.959740167492767e-05,
"loss": 0.2576,
"num_input_tokens_seen": 29402720,
"step": 1505
},
{
"epoch": 1.7205358415277185,
"grad_norm": 5.726184844970703,
"learning_rate": 4.959472942238762e-05,
"loss": 0.2731,
"num_input_tokens_seen": 29500480,
"step": 1510
},
{
"epoch": 1.7262362833119567,
"grad_norm": 7.595127105712891,
"learning_rate": 4.9592048403067845e-05,
"loss": 0.3502,
"num_input_tokens_seen": 29598240,
"step": 1515
},
{
"epoch": 1.731936725096195,
"grad_norm": 5.473161697387695,
"learning_rate": 4.958935861792402e-05,
"loss": 0.3446,
"num_input_tokens_seen": 29695952,
"step": 1520
},
{
"epoch": 1.7376371668804333,
"grad_norm": 5.171034812927246,
"learning_rate": 4.958666006791489e-05,
"loss": 0.328,
"num_input_tokens_seen": 29793696,
"step": 1525
},
{
"epoch": 1.7433376086646715,
"grad_norm": 7.3467559814453125,
"learning_rate": 4.958395275400237e-05,
"loss": 0.2313,
"num_input_tokens_seen": 29891456,
"step": 1530
},
{
"epoch": 1.7490380504489098,
"grad_norm": 7.6113481521606445,
"learning_rate": 4.958123667715147e-05,
"loss": 0.3182,
"num_input_tokens_seen": 29989280,
"step": 1535
},
{
"epoch": 1.7547384922331482,
"grad_norm": 4.429864883422852,
"learning_rate": 4.957851183833034e-05,
"loss": 0.2573,
"num_input_tokens_seen": 30087104,
"step": 1540
},
{
"epoch": 1.7604389340173863,
"grad_norm": 7.2398200035095215,
"learning_rate": 4.957577823851024e-05,
"loss": 0.3694,
"num_input_tokens_seen": 30184768,
"step": 1545
},
{
"epoch": 1.7661393758016246,
"grad_norm": 5.151268005371094,
"learning_rate": 4.957303587866557e-05,
"loss": 0.1916,
"num_input_tokens_seen": 30282496,
"step": 1550
},
{
"epoch": 1.771839817585863,
"grad_norm": 3.622302293777466,
"learning_rate": 4.957028475977384e-05,
"loss": 0.2405,
"num_input_tokens_seen": 30380288,
"step": 1555
},
{
"epoch": 1.777540259370101,
"grad_norm": 6.740144729614258,
"learning_rate": 4.9567524882815686e-05,
"loss": 0.2632,
"num_input_tokens_seen": 30478048,
"step": 1560
},
{
"epoch": 1.7832407011543394,
"grad_norm": 8.049473762512207,
"learning_rate": 4.956475624877486e-05,
"loss": 0.4007,
"num_input_tokens_seen": 30575728,
"step": 1565
},
{
"epoch": 1.7889411429385778,
"grad_norm": 6.096712589263916,
"learning_rate": 4.9561978858638245e-05,
"loss": 0.3395,
"num_input_tokens_seen": 30673488,
"step": 1570
},
{
"epoch": 1.7946415847228159,
"grad_norm": 5.549604415893555,
"learning_rate": 4.955919271339584e-05,
"loss": 0.2917,
"num_input_tokens_seen": 30771120,
"step": 1575
},
{
"epoch": 1.8003420265070544,
"grad_norm": 6.270097732543945,
"learning_rate": 4.9556397814040754e-05,
"loss": 0.1805,
"num_input_tokens_seen": 30868848,
"step": 1580
},
{
"epoch": 1.8060424682912926,
"grad_norm": 5.557713985443115,
"learning_rate": 4.955359416156925e-05,
"loss": 0.2391,
"num_input_tokens_seen": 30966576,
"step": 1585
},
{
"epoch": 1.811742910075531,
"grad_norm": 5.296855926513672,
"learning_rate": 4.955078175698067e-05,
"loss": 0.3259,
"num_input_tokens_seen": 31064320,
"step": 1590
},
{
"epoch": 1.8174433518597692,
"grad_norm": 5.627151966094971,
"learning_rate": 4.9547960601277496e-05,
"loss": 0.2576,
"num_input_tokens_seen": 31162048,
"step": 1595
},
{
"epoch": 1.8231437936440074,
"grad_norm": 6.599062919616699,
"learning_rate": 4.9545130695465336e-05,
"loss": 0.2859,
"num_input_tokens_seen": 31259840,
"step": 1600
},
{
"epoch": 1.8288442354282457,
"grad_norm": 8.176803588867188,
"learning_rate": 4.954229204055291e-05,
"loss": 0.1917,
"num_input_tokens_seen": 31357568,
"step": 1605
},
{
"epoch": 1.834544677212484,
"grad_norm": 9.62484073638916,
"learning_rate": 4.953944463755204e-05,
"loss": 0.3755,
"num_input_tokens_seen": 31455344,
"step": 1610
},
{
"epoch": 1.8402451189967222,
"grad_norm": 6.320862293243408,
"learning_rate": 4.9536588487477697e-05,
"loss": 0.2781,
"num_input_tokens_seen": 31553024,
"step": 1615
},
{
"epoch": 1.8459455607809605,
"grad_norm": 4.023454189300537,
"learning_rate": 4.953372359134795e-05,
"loss": 0.2669,
"num_input_tokens_seen": 31650848,
"step": 1620
},
{
"epoch": 1.8516460025651988,
"grad_norm": 5.63836145401001,
"learning_rate": 4.953084995018398e-05,
"loss": 0.2577,
"num_input_tokens_seen": 31748560,
"step": 1625
},
{
"epoch": 1.857346444349437,
"grad_norm": 8.508479118347168,
"learning_rate": 4.95279675650101e-05,
"loss": 0.277,
"num_input_tokens_seen": 31846224,
"step": 1630
},
{
"epoch": 1.8630468861336753,
"grad_norm": 7.421855926513672,
"learning_rate": 4.952507643685375e-05,
"loss": 0.2915,
"num_input_tokens_seen": 31944016,
"step": 1635
},
{
"epoch": 1.8687473279179136,
"grad_norm": 8.737668991088867,
"learning_rate": 4.952217656674546e-05,
"loss": 0.2798,
"num_input_tokens_seen": 32041680,
"step": 1640
},
{
"epoch": 1.8744477697021518,
"grad_norm": 6.379103183746338,
"learning_rate": 4.951926795571888e-05,
"loss": 0.2403,
"num_input_tokens_seen": 32139392,
"step": 1645
},
{
"epoch": 1.8801482114863903,
"grad_norm": 3.9837377071380615,
"learning_rate": 4.9516350604810793e-05,
"loss": 0.1932,
"num_input_tokens_seen": 32237184,
"step": 1650
},
{
"epoch": 1.8858486532706284,
"grad_norm": 6.174622535705566,
"learning_rate": 4.951342451506108e-05,
"loss": 0.2904,
"num_input_tokens_seen": 32334816,
"step": 1655
},
{
"epoch": 1.8915490950548668,
"grad_norm": 5.5978899002075195,
"learning_rate": 4.951048968751275e-05,
"loss": 0.2017,
"num_input_tokens_seen": 32432528,
"step": 1660
},
{
"epoch": 1.8972495368391051,
"grad_norm": 6.84478759765625,
"learning_rate": 4.9507546123211926e-05,
"loss": 0.2464,
"num_input_tokens_seen": 32530320,
"step": 1665
},
{
"epoch": 1.9029499786233433,
"grad_norm": 4.2474799156188965,
"learning_rate": 4.950459382320782e-05,
"loss": 0.1859,
"num_input_tokens_seen": 32628016,
"step": 1670
},
{
"epoch": 1.9086504204075816,
"grad_norm": 7.542076587677002,
"learning_rate": 4.9501632788552805e-05,
"loss": 0.2051,
"num_input_tokens_seen": 32725744,
"step": 1675
},
{
"epoch": 1.91435086219182,
"grad_norm": 8.932976722717285,
"learning_rate": 4.949866302030232e-05,
"loss": 0.3001,
"num_input_tokens_seen": 32823424,
"step": 1680
},
{
"epoch": 1.920051303976058,
"grad_norm": 8.958706855773926,
"learning_rate": 4.949568451951495e-05,
"loss": 0.4515,
"num_input_tokens_seen": 32921120,
"step": 1685
},
{
"epoch": 1.9257517457602964,
"grad_norm": 8.6969633102417,
"learning_rate": 4.9492697287252365e-05,
"loss": 0.2328,
"num_input_tokens_seen": 33018880,
"step": 1690
},
{
"epoch": 1.9314521875445347,
"grad_norm": 5.649287223815918,
"learning_rate": 4.948970132457938e-05,
"loss": 0.2487,
"num_input_tokens_seen": 33116656,
"step": 1695
},
{
"epoch": 1.9371526293287729,
"grad_norm": 7.401125431060791,
"learning_rate": 4.94866966325639e-05,
"loss": 0.2839,
"num_input_tokens_seen": 33214416,
"step": 1700
},
{
"epoch": 1.9428530711130114,
"grad_norm": 8.189993858337402,
"learning_rate": 4.9483683212276935e-05,
"loss": 0.1811,
"num_input_tokens_seen": 33312096,
"step": 1705
},
{
"epoch": 1.9485535128972495,
"grad_norm": 8.221793174743652,
"learning_rate": 4.948066106479262e-05,
"loss": 0.2459,
"num_input_tokens_seen": 33409792,
"step": 1710
},
{
"epoch": 1.9542539546814877,
"grad_norm": 5.853153705596924,
"learning_rate": 4.947763019118821e-05,
"loss": 0.3363,
"num_input_tokens_seen": 33507504,
"step": 1715
},
{
"epoch": 1.9599543964657262,
"grad_norm": 7.756158351898193,
"learning_rate": 4.947459059254405e-05,
"loss": 0.2134,
"num_input_tokens_seen": 33605136,
"step": 1720
},
{
"epoch": 1.9656548382499643,
"grad_norm": 6.2845025062561035,
"learning_rate": 4.9471542269943604e-05,
"loss": 0.2498,
"num_input_tokens_seen": 33702928,
"step": 1725
},
{
"epoch": 1.9713552800342027,
"grad_norm": 2.1698145866394043,
"learning_rate": 4.946848522447345e-05,
"loss": 0.1366,
"num_input_tokens_seen": 33800656,
"step": 1730
},
{
"epoch": 1.977055721818441,
"grad_norm": 8.690340995788574,
"learning_rate": 4.946541945722326e-05,
"loss": 0.3645,
"num_input_tokens_seen": 33898336,
"step": 1735
},
{
"epoch": 1.9827561636026791,
"grad_norm": 7.308428764343262,
"learning_rate": 4.946234496928583e-05,
"loss": 0.1994,
"num_input_tokens_seen": 33996096,
"step": 1740
},
{
"epoch": 1.9884566053869175,
"grad_norm": 5.161618709564209,
"learning_rate": 4.945926176175707e-05,
"loss": 0.226,
"num_input_tokens_seen": 34093792,
"step": 1745
},
{
"epoch": 1.9941570471711558,
"grad_norm": 9.512948989868164,
"learning_rate": 4.945616983573598e-05,
"loss": 0.2135,
"num_input_tokens_seen": 34191552,
"step": 1750
},
{
"epoch": 1.999857488955394,
"grad_norm": 8.386415481567383,
"learning_rate": 4.945306919232467e-05,
"loss": 0.236,
"num_input_tokens_seen": 34289248,
"step": 1755
},
{
"epoch": 2.0045603534273906,
"grad_norm": 7.716324329376221,
"learning_rate": 4.944995983262837e-05,
"loss": 0.3453,
"num_input_tokens_seen": 34369840,
"step": 1760
},
{
"epoch": 2.0102607952116287,
"grad_norm": 3.110274076461792,
"learning_rate": 4.9446841757755405e-05,
"loss": 0.1964,
"num_input_tokens_seen": 34467568,
"step": 1765
},
{
"epoch": 2.0159612369958673,
"grad_norm": 6.07765007019043,
"learning_rate": 4.944371496881721e-05,
"loss": 0.2358,
"num_input_tokens_seen": 34565248,
"step": 1770
},
{
"epoch": 2.0216616787801054,
"grad_norm": 5.835047721862793,
"learning_rate": 4.944057946692834e-05,
"loss": 0.1317,
"num_input_tokens_seen": 34662896,
"step": 1775
},
{
"epoch": 2.027362120564344,
"grad_norm": 6.831048488616943,
"learning_rate": 4.943743525320643e-05,
"loss": 0.2355,
"num_input_tokens_seen": 34760624,
"step": 1780
},
{
"epoch": 2.033062562348582,
"grad_norm": 11.599326133728027,
"learning_rate": 4.943428232877224e-05,
"loss": 0.1869,
"num_input_tokens_seen": 34858288,
"step": 1785
},
{
"epoch": 2.03876300413282,
"grad_norm": 6.280679702758789,
"learning_rate": 4.943112069474963e-05,
"loss": 0.2707,
"num_input_tokens_seen": 34955968,
"step": 1790
},
{
"epoch": 2.0444634459170588,
"grad_norm": 7.326327800750732,
"learning_rate": 4.942795035226555e-05,
"loss": 0.2077,
"num_input_tokens_seen": 35053744,
"step": 1795
},
{
"epoch": 2.050163887701297,
"grad_norm": 4.219114303588867,
"learning_rate": 4.9424771302450084e-05,
"loss": 0.1575,
"num_input_tokens_seen": 35151408,
"step": 1800
},
{
"epoch": 2.055864329485535,
"grad_norm": 4.75279426574707,
"learning_rate": 4.942158354643639e-05,
"loss": 0.1663,
"num_input_tokens_seen": 35249168,
"step": 1805
},
{
"epoch": 2.0615647712697736,
"grad_norm": 7.397295951843262,
"learning_rate": 4.9418387085360754e-05,
"loss": 0.1872,
"num_input_tokens_seen": 35346880,
"step": 1810
},
{
"epoch": 2.0672652130540117,
"grad_norm": 7.649684906005859,
"learning_rate": 4.941518192036254e-05,
"loss": 0.2212,
"num_input_tokens_seen": 35444688,
"step": 1815
},
{
"epoch": 2.07296565483825,
"grad_norm": 3.966686725616455,
"learning_rate": 4.941196805258423e-05,
"loss": 0.1185,
"num_input_tokens_seen": 35542416,
"step": 1820
},
{
"epoch": 2.0786660966224884,
"grad_norm": 10.044607162475586,
"learning_rate": 4.940874548317143e-05,
"loss": 0.2099,
"num_input_tokens_seen": 35640128,
"step": 1825
},
{
"epoch": 2.0843665384067265,
"grad_norm": 5.567621231079102,
"learning_rate": 4.9405514213272784e-05,
"loss": 0.172,
"num_input_tokens_seen": 35737872,
"step": 1830
},
{
"epoch": 2.0900669801909646,
"grad_norm": 6.253763675689697,
"learning_rate": 4.94022742440401e-05,
"loss": 0.1485,
"num_input_tokens_seen": 35835568,
"step": 1835
},
{
"epoch": 2.095767421975203,
"grad_norm": 5.565789222717285,
"learning_rate": 4.939902557662826e-05,
"loss": 0.2586,
"num_input_tokens_seen": 35933312,
"step": 1840
},
{
"epoch": 2.1014678637594413,
"grad_norm": 4.763193607330322,
"learning_rate": 4.939576821219525e-05,
"loss": 0.2357,
"num_input_tokens_seen": 36030944,
"step": 1845
},
{
"epoch": 2.10716830554368,
"grad_norm": 11.802780151367188,
"learning_rate": 4.9392502151902156e-05,
"loss": 0.2471,
"num_input_tokens_seen": 36128688,
"step": 1850
},
{
"epoch": 2.112868747327918,
"grad_norm": 6.20152473449707,
"learning_rate": 4.938922739691316e-05,
"loss": 0.1398,
"num_input_tokens_seen": 36226368,
"step": 1855
},
{
"epoch": 2.118569189112156,
"grad_norm": 6.946401119232178,
"learning_rate": 4.938594394839555e-05,
"loss": 0.1601,
"num_input_tokens_seen": 36324096,
"step": 1860
},
{
"epoch": 2.1242696308963946,
"grad_norm": 4.598988056182861,
"learning_rate": 4.938265180751971e-05,
"loss": 0.1461,
"num_input_tokens_seen": 36421840,
"step": 1865
},
{
"epoch": 2.1299700726806328,
"grad_norm": 8.171868324279785,
"learning_rate": 4.937935097545912e-05,
"loss": 0.2531,
"num_input_tokens_seen": 36519552,
"step": 1870
},
{
"epoch": 2.135670514464871,
"grad_norm": 3.664236068725586,
"learning_rate": 4.9376041453390365e-05,
"loss": 0.1934,
"num_input_tokens_seen": 36617280,
"step": 1875
},
{
"epoch": 2.1413709562491094,
"grad_norm": 8.51440143585205,
"learning_rate": 4.937272324249312e-05,
"loss": 0.2024,
"num_input_tokens_seen": 36714992,
"step": 1880
},
{
"epoch": 2.1470713980333476,
"grad_norm": 6.9583001136779785,
"learning_rate": 4.9369396343950154e-05,
"loss": 0.2121,
"num_input_tokens_seen": 36812784,
"step": 1885
},
{
"epoch": 2.1527718398175857,
"grad_norm": 10.295254707336426,
"learning_rate": 4.936606075894734e-05,
"loss": 0.172,
"num_input_tokens_seen": 36910688,
"step": 1890
},
{
"epoch": 2.1584722816018242,
"grad_norm": 3.664754629135132,
"learning_rate": 4.9362716488673654e-05,
"loss": 0.163,
"num_input_tokens_seen": 37008464,
"step": 1895
},
{
"epoch": 2.1641727233860624,
"grad_norm": 9.738370895385742,
"learning_rate": 4.9359363534321156e-05,
"loss": 0.1591,
"num_input_tokens_seen": 37106272,
"step": 1900
},
{
"epoch": 2.1698731651703005,
"grad_norm": 6.579972743988037,
"learning_rate": 4.9356001897085e-05,
"loss": 0.1816,
"num_input_tokens_seen": 37204048,
"step": 1905
},
{
"epoch": 2.175573606954539,
"grad_norm": 6.322074890136719,
"learning_rate": 4.935263157816345e-05,
"loss": 0.183,
"num_input_tokens_seen": 37301824,
"step": 1910
},
{
"epoch": 2.181274048738777,
"grad_norm": 5.997386455535889,
"learning_rate": 4.934925257875784e-05,
"loss": 0.1722,
"num_input_tokens_seen": 37399632,
"step": 1915
},
{
"epoch": 2.1869744905230157,
"grad_norm": 7.246592998504639,
"learning_rate": 4.9345864900072625e-05,
"loss": 0.1017,
"num_input_tokens_seen": 37497296,
"step": 1920
},
{
"epoch": 2.192674932307254,
"grad_norm": 7.348723888397217,
"learning_rate": 4.934246854331534e-05,
"loss": 0.1756,
"num_input_tokens_seen": 37595168,
"step": 1925
},
{
"epoch": 2.198375374091492,
"grad_norm": 8.612208366394043,
"learning_rate": 4.933906350969661e-05,
"loss": 0.1674,
"num_input_tokens_seen": 37692832,
"step": 1930
},
{
"epoch": 2.2040758158757305,
"grad_norm": 16.743350982666016,
"learning_rate": 4.933564980043015e-05,
"loss": 0.2679,
"num_input_tokens_seen": 37790512,
"step": 1935
},
{
"epoch": 2.2097762576599687,
"grad_norm": 2.924848794937134,
"learning_rate": 4.93322274167328e-05,
"loss": 0.0981,
"num_input_tokens_seen": 37888304,
"step": 1940
},
{
"epoch": 2.2154766994442068,
"grad_norm": 8.15044116973877,
"learning_rate": 4.9328796359824445e-05,
"loss": 0.1621,
"num_input_tokens_seen": 37986032,
"step": 1945
},
{
"epoch": 2.2211771412284453,
"grad_norm": 9.112587928771973,
"learning_rate": 4.932535663092809e-05,
"loss": 0.2655,
"num_input_tokens_seen": 38083776,
"step": 1950
},
{
"epoch": 2.2268775830126835,
"grad_norm": 4.596200942993164,
"learning_rate": 4.932190823126982e-05,
"loss": 0.1608,
"num_input_tokens_seen": 38181488,
"step": 1955
},
{
"epoch": 2.2325780247969216,
"grad_norm": 5.918450355529785,
"learning_rate": 4.9318451162078824e-05,
"loss": 0.1119,
"num_input_tokens_seen": 38279248,
"step": 1960
},
{
"epoch": 2.23827846658116,
"grad_norm": 8.088936805725098,
"learning_rate": 4.931498542458738e-05,
"loss": 0.2202,
"num_input_tokens_seen": 38377024,
"step": 1965
},
{
"epoch": 2.2439789083653983,
"grad_norm": 6.410324573516846,
"learning_rate": 4.931151102003082e-05,
"loss": 0.1136,
"num_input_tokens_seen": 38474768,
"step": 1970
},
{
"epoch": 2.249679350149637,
"grad_norm": 8.893948554992676,
"learning_rate": 4.930802794964763e-05,
"loss": 0.1233,
"num_input_tokens_seen": 38572432,
"step": 1975
},
{
"epoch": 2.255379791933875,
"grad_norm": 2.6239705085754395,
"learning_rate": 4.9304536214679315e-05,
"loss": 0.1409,
"num_input_tokens_seen": 38670112,
"step": 1980
},
{
"epoch": 2.261080233718113,
"grad_norm": 8.951133728027344,
"learning_rate": 4.930103581637052e-05,
"loss": 0.159,
"num_input_tokens_seen": 38767872,
"step": 1985
},
{
"epoch": 2.2667806755023516,
"grad_norm": 8.211560249328613,
"learning_rate": 4.929752675596896e-05,
"loss": 0.1761,
"num_input_tokens_seen": 38865584,
"step": 1990
},
{
"epoch": 2.2724811172865897,
"grad_norm": 4.264492034912109,
"learning_rate": 4.929400903472544e-05,
"loss": 0.1206,
"num_input_tokens_seen": 38963264,
"step": 1995
},
{
"epoch": 2.278181559070828,
"grad_norm": 10.552608489990234,
"learning_rate": 4.9290482653893846e-05,
"loss": 0.1895,
"num_input_tokens_seen": 39060944,
"step": 2000
},
{
"epoch": 2.2838820008550664,
"grad_norm": 8.71176528930664,
"learning_rate": 4.928694761473115e-05,
"loss": 0.1604,
"num_input_tokens_seen": 39158640,
"step": 2005
},
{
"epoch": 2.2895824426393045,
"grad_norm": 6.852836608886719,
"learning_rate": 4.928340391849742e-05,
"loss": 0.2317,
"num_input_tokens_seen": 39256352,
"step": 2010
},
{
"epoch": 2.2952828844235427,
"grad_norm": 6.0051493644714355,
"learning_rate": 4.9279851566455806e-05,
"loss": 0.1945,
"num_input_tokens_seen": 39354112,
"step": 2015
},
{
"epoch": 2.3009833262077812,
"grad_norm": 8.092498779296875,
"learning_rate": 4.927629055987254e-05,
"loss": 0.1393,
"num_input_tokens_seen": 39451824,
"step": 2020
},
{
"epoch": 2.3066837679920194,
"grad_norm": 11.641897201538086,
"learning_rate": 4.927272090001695e-05,
"loss": 0.1692,
"num_input_tokens_seen": 39549600,
"step": 2025
},
{
"epoch": 2.3123842097762575,
"grad_norm": 6.343209266662598,
"learning_rate": 4.9269142588161424e-05,
"loss": 0.1058,
"num_input_tokens_seen": 39647280,
"step": 2030
},
{
"epoch": 2.318084651560496,
"grad_norm": 9.047554969787598,
"learning_rate": 4.9265555625581464e-05,
"loss": 0.1835,
"num_input_tokens_seen": 39745040,
"step": 2035
},
{
"epoch": 2.323785093344734,
"grad_norm": 7.915929317474365,
"learning_rate": 4.9261960013555625e-05,
"loss": 0.2291,
"num_input_tokens_seen": 39842816,
"step": 2040
},
{
"epoch": 2.3294855351289723,
"grad_norm": 5.318630695343018,
"learning_rate": 4.925835575336557e-05,
"loss": 0.1533,
"num_input_tokens_seen": 39940576,
"step": 2045
},
{
"epoch": 2.335185976913211,
"grad_norm": 7.653360366821289,
"learning_rate": 4.9254742846296045e-05,
"loss": 0.1978,
"num_input_tokens_seen": 40038368,
"step": 2050
},
{
"epoch": 2.340886418697449,
"grad_norm": 7.852567672729492,
"learning_rate": 4.925112129363486e-05,
"loss": 0.1531,
"num_input_tokens_seen": 40136144,
"step": 2055
},
{
"epoch": 2.3465868604816875,
"grad_norm": 6.86559534072876,
"learning_rate": 4.92474910966729e-05,
"loss": 0.0854,
"num_input_tokens_seen": 40233952,
"step": 2060
},
{
"epoch": 2.3522873022659256,
"grad_norm": 10.074113845825195,
"learning_rate": 4.9243852256704183e-05,
"loss": 0.1915,
"num_input_tokens_seen": 40331696,
"step": 2065
},
{
"epoch": 2.3579877440501638,
"grad_norm": 7.896622180938721,
"learning_rate": 4.924020477502574e-05,
"loss": 0.1495,
"num_input_tokens_seen": 40429360,
"step": 2070
},
{
"epoch": 2.3636881858344023,
"grad_norm": 8.397123336791992,
"learning_rate": 4.923654865293773e-05,
"loss": 0.1392,
"num_input_tokens_seen": 40527136,
"step": 2075
},
{
"epoch": 2.3693886276186404,
"grad_norm": 6.305740833282471,
"learning_rate": 4.923288389174337e-05,
"loss": 0.0875,
"num_input_tokens_seen": 40624912,
"step": 2080
},
{
"epoch": 2.3750890694028786,
"grad_norm": 8.387357711791992,
"learning_rate": 4.9229210492748976e-05,
"loss": 0.2358,
"num_input_tokens_seen": 40722720,
"step": 2085
},
{
"epoch": 2.380789511187117,
"grad_norm": 8.446219444274902,
"learning_rate": 4.92255284572639e-05,
"loss": 0.1482,
"num_input_tokens_seen": 40820448,
"step": 2090
},
{
"epoch": 2.3864899529713552,
"grad_norm": 6.30406379699707,
"learning_rate": 4.9221837786600634e-05,
"loss": 0.1603,
"num_input_tokens_seen": 40918256,
"step": 2095
},
{
"epoch": 2.392190394755594,
"grad_norm": 7.828269004821777,
"learning_rate": 4.921813848207469e-05,
"loss": 0.1764,
"num_input_tokens_seen": 41015920,
"step": 2100
},
{
"epoch": 2.397890836539832,
"grad_norm": 7.984679698944092,
"learning_rate": 4.921443054500471e-05,
"loss": 0.1809,
"num_input_tokens_seen": 41113632,
"step": 2105
},
{
"epoch": 2.40359127832407,
"grad_norm": 10.998138427734375,
"learning_rate": 4.921071397671235e-05,
"loss": 0.185,
"num_input_tokens_seen": 41211344,
"step": 2110
},
{
"epoch": 2.4092917201083086,
"grad_norm": 7.500617980957031,
"learning_rate": 4.9206988778522414e-05,
"loss": 0.116,
"num_input_tokens_seen": 41308992,
"step": 2115
},
{
"epoch": 2.4149921618925467,
"grad_norm": 7.988670349121094,
"learning_rate": 4.9203254951762735e-05,
"loss": 0.1457,
"num_input_tokens_seen": 41406752,
"step": 2120
},
{
"epoch": 2.420692603676785,
"grad_norm": 10.234942436218262,
"learning_rate": 4.9199512497764226e-05,
"loss": 0.2256,
"num_input_tokens_seen": 41504464,
"step": 2125
},
{
"epoch": 2.4263930454610234,
"grad_norm": 11.052645683288574,
"learning_rate": 4.919576141786089e-05,
"loss": 0.1721,
"num_input_tokens_seen": 41602272,
"step": 2130
},
{
"epoch": 2.4320934872452615,
"grad_norm": 3.908461332321167,
"learning_rate": 4.91920017133898e-05,
"loss": 0.1676,
"num_input_tokens_seen": 41700000,
"step": 2135
},
{
"epoch": 2.4377939290294997,
"grad_norm": 3.8486247062683105,
"learning_rate": 4.9188233385691094e-05,
"loss": 0.1458,
"num_input_tokens_seen": 41797696,
"step": 2140
},
{
"epoch": 2.443494370813738,
"grad_norm": 7.926904678344727,
"learning_rate": 4.9184456436107994e-05,
"loss": 0.202,
"num_input_tokens_seen": 41895392,
"step": 2145
},
{
"epoch": 2.4491948125979763,
"grad_norm": 8.783378601074219,
"learning_rate": 4.91806708659868e-05,
"loss": 0.1838,
"num_input_tokens_seen": 41993120,
"step": 2150
},
{
"epoch": 2.4548952543822145,
"grad_norm": 4.2609357833862305,
"learning_rate": 4.917687667667686e-05,
"loss": 0.1037,
"num_input_tokens_seen": 42090880,
"step": 2155
},
{
"epoch": 2.460595696166453,
"grad_norm": 10.791552543640137,
"learning_rate": 4.917307386953062e-05,
"loss": 0.1791,
"num_input_tokens_seen": 42188576,
"step": 2160
},
{
"epoch": 2.466296137950691,
"grad_norm": 8.570240020751953,
"learning_rate": 4.9169262445903595e-05,
"loss": 0.1608,
"num_input_tokens_seen": 42286272,
"step": 2165
},
{
"epoch": 2.4719965797349293,
"grad_norm": 5.907083034515381,
"learning_rate": 4.9165442407154355e-05,
"loss": 0.1657,
"num_input_tokens_seen": 42384048,
"step": 2170
},
{
"epoch": 2.477697021519168,
"grad_norm": 9.303824424743652,
"learning_rate": 4.916161375464455e-05,
"loss": 0.1839,
"num_input_tokens_seen": 42481888,
"step": 2175
},
{
"epoch": 2.483397463303406,
"grad_norm": 9.526128768920898,
"learning_rate": 4.915777648973892e-05,
"loss": 0.1084,
"num_input_tokens_seen": 42579600,
"step": 2180
},
{
"epoch": 2.489097905087644,
"grad_norm": 8.502360343933105,
"learning_rate": 4.915393061380523e-05,
"loss": 0.1205,
"num_input_tokens_seen": 42677360,
"step": 2185
},
{
"epoch": 2.4947983468718826,
"grad_norm": 8.056527137756348,
"learning_rate": 4.9150076128214364e-05,
"loss": 0.1244,
"num_input_tokens_seen": 42775072,
"step": 2190
},
{
"epoch": 2.5004987886561207,
"grad_norm": 3.5900754928588867,
"learning_rate": 4.914621303434023e-05,
"loss": 0.1198,
"num_input_tokens_seen": 42872832,
"step": 2195
},
{
"epoch": 2.506199230440359,
"grad_norm": 4.139026641845703,
"learning_rate": 4.914234133355984e-05,
"loss": 0.1016,
"num_input_tokens_seen": 42970592,
"step": 2200
},
{
"epoch": 2.5118996722245974,
"grad_norm": 8.35708999633789,
"learning_rate": 4.9138461027253255e-05,
"loss": 0.1066,
"num_input_tokens_seen": 43068384,
"step": 2205
},
{
"epoch": 2.5176001140088355,
"grad_norm": 2.149785041809082,
"learning_rate": 4.913457211680361e-05,
"loss": 0.0866,
"num_input_tokens_seen": 43166240,
"step": 2210
},
{
"epoch": 2.523300555793074,
"grad_norm": 3.6671578884124756,
"learning_rate": 4.913067460359711e-05,
"loss": 0.1831,
"num_input_tokens_seen": 43264000,
"step": 2215
},
{
"epoch": 2.5290009975773122,
"grad_norm": 7.476966857910156,
"learning_rate": 4.912676848902301e-05,
"loss": 0.1276,
"num_input_tokens_seen": 43361712,
"step": 2220
},
{
"epoch": 2.534701439361551,
"grad_norm": 14.653030395507812,
"learning_rate": 4.912285377447366e-05,
"loss": 0.1622,
"num_input_tokens_seen": 43459392,
"step": 2225
},
{
"epoch": 2.540401881145789,
"grad_norm": 7.570925712585449,
"learning_rate": 4.9118930461344433e-05,
"loss": 0.1279,
"num_input_tokens_seen": 43557104,
"step": 2230
},
{
"epoch": 2.546102322930027,
"grad_norm": 7.563347816467285,
"learning_rate": 4.911499855103382e-05,
"loss": 0.1015,
"num_input_tokens_seen": 43654928,
"step": 2235
},
{
"epoch": 2.5518027647142656,
"grad_norm": 5.281002521514893,
"learning_rate": 4.9111058044943334e-05,
"loss": 0.1255,
"num_input_tokens_seen": 43752672,
"step": 2240
},
{
"epoch": 2.5575032064985037,
"grad_norm": 5.972769737243652,
"learning_rate": 4.910710894447757e-05,
"loss": 0.0551,
"num_input_tokens_seen": 43850512,
"step": 2245
},
{
"epoch": 2.563203648282742,
"grad_norm": 1.96940279006958,
"learning_rate": 4.9103151251044174e-05,
"loss": 0.0708,
"num_input_tokens_seen": 43948336,
"step": 2250
},
{
"epoch": 2.5689040900669804,
"grad_norm": 10.9530668258667,
"learning_rate": 4.909918496605387e-05,
"loss": 0.1775,
"num_input_tokens_seen": 44046080,
"step": 2255
},
{
"epoch": 2.5746045318512185,
"grad_norm": 8.717411994934082,
"learning_rate": 4.909521009092045e-05,
"loss": 0.0874,
"num_input_tokens_seen": 44143808,
"step": 2260
},
{
"epoch": 2.5803049736354566,
"grad_norm": 11.679935455322266,
"learning_rate": 4.909122662706074e-05,
"loss": 0.2068,
"num_input_tokens_seen": 44241600,
"step": 2265
},
{
"epoch": 2.586005415419695,
"grad_norm": 5.130126953125,
"learning_rate": 4.9087234575894644e-05,
"loss": 0.0785,
"num_input_tokens_seen": 44339312,
"step": 2270
},
{
"epoch": 2.5917058572039333,
"grad_norm": 5.87738037109375,
"learning_rate": 4.908323393884514e-05,
"loss": 0.0893,
"num_input_tokens_seen": 44437136,
"step": 2275
},
{
"epoch": 2.5974062989881714,
"grad_norm": 9.397594451904297,
"learning_rate": 4.9079224717338246e-05,
"loss": 0.142,
"num_input_tokens_seen": 44534912,
"step": 2280
},
{
"epoch": 2.60310674077241,
"grad_norm": 3.5476927757263184,
"learning_rate": 4.907520691280304e-05,
"loss": 0.0855,
"num_input_tokens_seen": 44632720,
"step": 2285
},
{
"epoch": 2.608807182556648,
"grad_norm": 10.59860610961914,
"learning_rate": 4.907118052667168e-05,
"loss": 0.1536,
"num_input_tokens_seen": 44730480,
"step": 2290
},
{
"epoch": 2.6145076243408862,
"grad_norm": 7.346194744110107,
"learning_rate": 4.906714556037936e-05,
"loss": 0.1219,
"num_input_tokens_seen": 44828112,
"step": 2295
},
{
"epoch": 2.620208066125125,
"grad_norm": 4.199995517730713,
"learning_rate": 4.9063102015364344e-05,
"loss": 0.0867,
"num_input_tokens_seen": 44925888,
"step": 2300
},
{
"epoch": 2.625908507909363,
"grad_norm": 12.029003143310547,
"learning_rate": 4.9059049893067954e-05,
"loss": 0.1819,
"num_input_tokens_seen": 45023728,
"step": 2305
},
{
"epoch": 2.631608949693601,
"grad_norm": 9.11473560333252,
"learning_rate": 4.9054989194934564e-05,
"loss": 0.1298,
"num_input_tokens_seen": 45121424,
"step": 2310
},
{
"epoch": 2.6373093914778396,
"grad_norm": 17.91231918334961,
"learning_rate": 4.905091992241161e-05,
"loss": 0.1854,
"num_input_tokens_seen": 45219200,
"step": 2315
},
{
"epoch": 2.6430098332620777,
"grad_norm": 10.454273223876953,
"learning_rate": 4.9046842076949576e-05,
"loss": 0.2016,
"num_input_tokens_seen": 45316944,
"step": 2320
},
{
"epoch": 2.648710275046316,
"grad_norm": 11.895720481872559,
"learning_rate": 4.904275566000202e-05,
"loss": 0.2173,
"num_input_tokens_seen": 45414688,
"step": 2325
},
{
"epoch": 2.6544107168305544,
"grad_norm": 7.68233585357666,
"learning_rate": 4.903866067302554e-05,
"loss": 0.1429,
"num_input_tokens_seen": 45512400,
"step": 2330
},
{
"epoch": 2.6601111586147925,
"grad_norm": 6.063493251800537,
"learning_rate": 4.9034557117479786e-05,
"loss": 0.1397,
"num_input_tokens_seen": 45610128,
"step": 2335
},
{
"epoch": 2.665811600399031,
"grad_norm": 1.1613508462905884,
"learning_rate": 4.903044499482747e-05,
"loss": 0.0946,
"num_input_tokens_seen": 45707920,
"step": 2340
},
{
"epoch": 2.671512042183269,
"grad_norm": 10.7675142288208,
"learning_rate": 4.902632430653435e-05,
"loss": 0.1761,
"num_input_tokens_seen": 45805744,
"step": 2345
},
{
"epoch": 2.6772124839675073,
"grad_norm": 7.67887020111084,
"learning_rate": 4.902219505406926e-05,
"loss": 0.1615,
"num_input_tokens_seen": 45903456,
"step": 2350
},
{
"epoch": 2.682912925751746,
"grad_norm": 2.045400381088257,
"learning_rate": 4.901805723890407e-05,
"loss": 0.173,
"num_input_tokens_seen": 46001264,
"step": 2355
},
{
"epoch": 2.688613367535984,
"grad_norm": 8.454596519470215,
"learning_rate": 4.9013910862513676e-05,
"loss": 0.1894,
"num_input_tokens_seen": 46098976,
"step": 2360
},
{
"epoch": 2.6943138093202226,
"grad_norm": 7.203195095062256,
"learning_rate": 4.9009755926376085e-05,
"loss": 0.1496,
"num_input_tokens_seen": 46196816,
"step": 2365
},
{
"epoch": 2.7000142511044607,
"grad_norm": 10.533610343933105,
"learning_rate": 4.9005592431972304e-05,
"loss": 0.0768,
"num_input_tokens_seen": 46294480,
"step": 2370
},
{
"epoch": 2.705714692888699,
"grad_norm": 7.490808486938477,
"learning_rate": 4.90014203807864e-05,
"loss": 0.1114,
"num_input_tokens_seen": 46392096,
"step": 2375
},
{
"epoch": 2.7114151346729374,
"grad_norm": 7.0142083168029785,
"learning_rate": 4.899723977430552e-05,
"loss": 0.0883,
"num_input_tokens_seen": 46489936,
"step": 2380
},
{
"epoch": 2.7171155764571755,
"grad_norm": 8.30893611907959,
"learning_rate": 4.899305061401983e-05,
"loss": 0.1146,
"num_input_tokens_seen": 46587648,
"step": 2385
},
{
"epoch": 2.7228160182414136,
"grad_norm": 5.555058479309082,
"learning_rate": 4.898885290142254e-05,
"loss": 0.1212,
"num_input_tokens_seen": 46685360,
"step": 2390
},
{
"epoch": 2.728516460025652,
"grad_norm": 3.8162529468536377,
"learning_rate": 4.898464663800995e-05,
"loss": 0.1327,
"num_input_tokens_seen": 46783072,
"step": 2395
},
{
"epoch": 2.7342169018098903,
"grad_norm": 7.592154026031494,
"learning_rate": 4.898043182528136e-05,
"loss": 0.0871,
"num_input_tokens_seen": 46880832,
"step": 2400
},
{
"epoch": 2.7399173435941284,
"grad_norm": 4.684682846069336,
"learning_rate": 4.897620846473915e-05,
"loss": 0.0563,
"num_input_tokens_seen": 46978576,
"step": 2405
},
{
"epoch": 2.745617785378367,
"grad_norm": 1.159037709236145,
"learning_rate": 4.897197655788872e-05,
"loss": 0.1116,
"num_input_tokens_seen": 47076304,
"step": 2410
},
{
"epoch": 2.751318227162605,
"grad_norm": 9.983619689941406,
"learning_rate": 4.8967736106238546e-05,
"loss": 0.1072,
"num_input_tokens_seen": 47174000,
"step": 2415
},
{
"epoch": 2.757018668946843,
"grad_norm": 5.6760029792785645,
"learning_rate": 4.8963487111300133e-05,
"loss": 0.0847,
"num_input_tokens_seen": 47271760,
"step": 2420
},
{
"epoch": 2.762719110731082,
"grad_norm": 9.46414566040039,
"learning_rate": 4.895922957458803e-05,
"loss": 0.0821,
"num_input_tokens_seen": 47369504,
"step": 2425
},
{
"epoch": 2.76841955251532,
"grad_norm": 11.030294418334961,
"learning_rate": 4.8954963497619836e-05,
"loss": 0.1595,
"num_input_tokens_seen": 47467312,
"step": 2430
},
{
"epoch": 2.774119994299558,
"grad_norm": 4.053030967712402,
"learning_rate": 4.895068888191618e-05,
"loss": 0.0967,
"num_input_tokens_seen": 47565024,
"step": 2435
},
{
"epoch": 2.7798204360837966,
"grad_norm": 8.997540473937988,
"learning_rate": 4.894640572900076e-05,
"loss": 0.1222,
"num_input_tokens_seen": 47662768,
"step": 2440
},
{
"epoch": 2.7855208778680347,
"grad_norm": 9.141730308532715,
"learning_rate": 4.89421140404003e-05,
"loss": 0.1496,
"num_input_tokens_seen": 47760640,
"step": 2445
},
{
"epoch": 2.791221319652273,
"grad_norm": 6.096067905426025,
"learning_rate": 4.8937813817644577e-05,
"loss": 0.0965,
"num_input_tokens_seen": 47858400,
"step": 2450
},
{
"epoch": 2.7969217614365114,
"grad_norm": 7.94855260848999,
"learning_rate": 4.89335050622664e-05,
"loss": 0.1043,
"num_input_tokens_seen": 47956112,
"step": 2455
},
{
"epoch": 2.8026222032207495,
"grad_norm": 5.408116340637207,
"learning_rate": 4.892918777580161e-05,
"loss": 0.0953,
"num_input_tokens_seen": 48053888,
"step": 2460
},
{
"epoch": 2.8083226450049876,
"grad_norm": 10.836856842041016,
"learning_rate": 4.8924861959789116e-05,
"loss": 0.0829,
"num_input_tokens_seen": 48151648,
"step": 2465
},
{
"epoch": 2.814023086789226,
"grad_norm": 4.917642593383789,
"learning_rate": 4.892052761577084e-05,
"loss": 0.1339,
"num_input_tokens_seen": 48249344,
"step": 2470
},
{
"epoch": 2.8197235285734643,
"grad_norm": 4.280270099639893,
"learning_rate": 4.891618474529178e-05,
"loss": 0.0867,
"num_input_tokens_seen": 48347088,
"step": 2475
},
{
"epoch": 2.825423970357703,
"grad_norm": 10.1702241897583,
"learning_rate": 4.8911833349899924e-05,
"loss": 0.0944,
"num_input_tokens_seen": 48444848,
"step": 2480
},
{
"epoch": 2.831124412141941,
"grad_norm": 7.9395341873168945,
"learning_rate": 4.890747343114634e-05,
"loss": 0.1103,
"num_input_tokens_seen": 48542528,
"step": 2485
},
{
"epoch": 2.836824853926179,
"grad_norm": 7.740533828735352,
"learning_rate": 4.8903104990585124e-05,
"loss": 0.0763,
"num_input_tokens_seen": 48640240,
"step": 2490
},
{
"epoch": 2.8425252957104177,
"grad_norm": 2.646793842315674,
"learning_rate": 4.8898728029773394e-05,
"loss": 0.0821,
"num_input_tokens_seen": 48737888,
"step": 2495
},
{
"epoch": 2.848225737494656,
"grad_norm": 6.403926849365234,
"learning_rate": 4.8894342550271314e-05,
"loss": 0.0962,
"num_input_tokens_seen": 48835600,
"step": 2500
},
{
"epoch": 2.8539261792788944,
"grad_norm": 7.7822957038879395,
"learning_rate": 4.888994855364209e-05,
"loss": 0.0832,
"num_input_tokens_seen": 48933312,
"step": 2505
},
{
"epoch": 2.8596266210631325,
"grad_norm": 5.186079025268555,
"learning_rate": 4.888554604145196e-05,
"loss": 0.125,
"num_input_tokens_seen": 49030960,
"step": 2510
},
{
"epoch": 2.8653270628473706,
"grad_norm": 7.859062671661377,
"learning_rate": 4.8881135015270206e-05,
"loss": 0.0941,
"num_input_tokens_seen": 49128672,
"step": 2515
},
{
"epoch": 2.871027504631609,
"grad_norm": 7.483722686767578,
"learning_rate": 4.887671547666912e-05,
"loss": 0.1318,
"num_input_tokens_seen": 49226416,
"step": 2520
},
{
"epoch": 2.8767279464158473,
"grad_norm": 8.643847465515137,
"learning_rate": 4.887228742722405e-05,
"loss": 0.1856,
"num_input_tokens_seen": 49324112,
"step": 2525
},
{
"epoch": 2.8824283882000854,
"grad_norm": 8.750890731811523,
"learning_rate": 4.8867850868513374e-05,
"loss": 0.1006,
"num_input_tokens_seen": 49421776,
"step": 2530
},
{
"epoch": 2.888128829984324,
"grad_norm": 6.101781845092773,
"learning_rate": 4.8863405802118514e-05,
"loss": 0.1324,
"num_input_tokens_seen": 49519568,
"step": 2535
},
{
"epoch": 2.893829271768562,
"grad_norm": 7.980799198150635,
"learning_rate": 4.8858952229623886e-05,
"loss": 0.0907,
"num_input_tokens_seen": 49617360,
"step": 2540
},
{
"epoch": 2.8995297135528,
"grad_norm": 3.3241348266601562,
"learning_rate": 4.8854490152616984e-05,
"loss": 0.1104,
"num_input_tokens_seen": 49715056,
"step": 2545
},
{
"epoch": 2.9052301553370388,
"grad_norm": 10.823814392089844,
"learning_rate": 4.88500195726883e-05,
"loss": 0.1766,
"num_input_tokens_seen": 49812848,
"step": 2550
},
{
"epoch": 2.910930597121277,
"grad_norm": 6.91947078704834,
"learning_rate": 4.884554049143139e-05,
"loss": 0.1128,
"num_input_tokens_seen": 49910496,
"step": 2555
},
{
"epoch": 2.916631038905515,
"grad_norm": 4.440322399139404,
"learning_rate": 4.884105291044279e-05,
"loss": 0.0796,
"num_input_tokens_seen": 50008224,
"step": 2560
},
{
"epoch": 2.9223314806897536,
"grad_norm": 5.996119976043701,
"learning_rate": 4.8836556831322125e-05,
"loss": 0.1648,
"num_input_tokens_seen": 50105952,
"step": 2565
},
{
"epoch": 2.9280319224739917,
"grad_norm": 8.666937828063965,
"learning_rate": 4.8832052255672e-05,
"loss": 0.1488,
"num_input_tokens_seen": 50203680,
"step": 2570
},
{
"epoch": 2.93373236425823,
"grad_norm": 5.516872882843018,
"learning_rate": 4.8827539185098085e-05,
"loss": 0.1598,
"num_input_tokens_seen": 50301504,
"step": 2575
},
{
"epoch": 2.9394328060424684,
"grad_norm": 6.207433223724365,
"learning_rate": 4.882301762120905e-05,
"loss": 0.1003,
"num_input_tokens_seen": 50399152,
"step": 2580
},
{
"epoch": 2.9451332478267065,
"grad_norm": 5.3155598640441895,
"learning_rate": 4.88184875656166e-05,
"loss": 0.0675,
"num_input_tokens_seen": 50496880,
"step": 2585
},
{
"epoch": 2.9508336896109446,
"grad_norm": 8.118722915649414,
"learning_rate": 4.881394901993549e-05,
"loss": 0.0834,
"num_input_tokens_seen": 50594656,
"step": 2590
},
{
"epoch": 2.956534131395183,
"grad_norm": 8.280570983886719,
"learning_rate": 4.880940198578347e-05,
"loss": 0.1212,
"num_input_tokens_seen": 50692496,
"step": 2595
},
{
"epoch": 2.9622345731794213,
"grad_norm": 6.043283939361572,
"learning_rate": 4.8804846464781334e-05,
"loss": 0.1096,
"num_input_tokens_seen": 50790272,
"step": 2600
},
{
"epoch": 2.9679350149636594,
"grad_norm": 3.722360134124756,
"learning_rate": 4.8800282458552885e-05,
"loss": 0.155,
"num_input_tokens_seen": 50888032,
"step": 2605
},
{
"epoch": 2.973635456747898,
"grad_norm": 6.298059940338135,
"learning_rate": 4.8795709968724974e-05,
"loss": 0.072,
"num_input_tokens_seen": 50985776,
"step": 2610
},
{
"epoch": 2.979335898532136,
"grad_norm": 8.660189628601074,
"learning_rate": 4.879112899692745e-05,
"loss": 0.1247,
"num_input_tokens_seen": 51083440,
"step": 2615
},
{
"epoch": 2.9850363403163747,
"grad_norm": 14.756346702575684,
"learning_rate": 4.8786539544793206e-05,
"loss": 0.1067,
"num_input_tokens_seen": 51181152,
"step": 2620
},
{
"epoch": 2.990736782100613,
"grad_norm": 2.7973508834838867,
"learning_rate": 4.878194161395816e-05,
"loss": 0.0766,
"num_input_tokens_seen": 51278912,
"step": 2625
},
{
"epoch": 2.9964372238848513,
"grad_norm": 7.684298515319824,
"learning_rate": 4.8777335206061216e-05,
"loss": 0.0668,
"num_input_tokens_seen": 51376640,
"step": 2630
},
{
"epoch": 3.0011400883568475,
"grad_norm": 3.268608570098877,
"learning_rate": 4.877272032274435e-05,
"loss": 0.0698,
"num_input_tokens_seen": 51457280,
"step": 2635
},
{
"epoch": 3.006840530141086,
"grad_norm": 2.5024783611297607,
"learning_rate": 4.876809696565252e-05,
"loss": 0.0681,
"num_input_tokens_seen": 51555088,
"step": 2640
},
{
"epoch": 3.012540971925324,
"grad_norm": 2.1016647815704346,
"learning_rate": 4.876346513643373e-05,
"loss": 0.051,
"num_input_tokens_seen": 51652864,
"step": 2645
},
{
"epoch": 3.0182414137095623,
"grad_norm": 8.176024436950684,
"learning_rate": 4.875882483673898e-05,
"loss": 0.0712,
"num_input_tokens_seen": 51750560,
"step": 2650
},
{
"epoch": 3.023941855493801,
"grad_norm": 4.242624759674072,
"learning_rate": 4.875417606822232e-05,
"loss": 0.0761,
"num_input_tokens_seen": 51848288,
"step": 2655
},
{
"epoch": 3.029642297278039,
"grad_norm": 11.779088973999023,
"learning_rate": 4.874951883254078e-05,
"loss": 0.0485,
"num_input_tokens_seen": 51946016,
"step": 2660
},
{
"epoch": 3.035342739062277,
"grad_norm": 3.6110494136810303,
"learning_rate": 4.874485313135446e-05,
"loss": 0.0747,
"num_input_tokens_seen": 52043776,
"step": 2665
},
{
"epoch": 3.0410431808465157,
"grad_norm": 8.334362030029297,
"learning_rate": 4.874017896632642e-05,
"loss": 0.0614,
"num_input_tokens_seen": 52141520,
"step": 2670
},
{
"epoch": 3.046743622630754,
"grad_norm": 5.685539722442627,
"learning_rate": 4.8735496339122776e-05,
"loss": 0.0604,
"num_input_tokens_seen": 52239200,
"step": 2675
},
{
"epoch": 3.052444064414992,
"grad_norm": 4.587195873260498,
"learning_rate": 4.8730805251412645e-05,
"loss": 0.1134,
"num_input_tokens_seen": 52336848,
"step": 2680
},
{
"epoch": 3.0581445061992305,
"grad_norm": 2.6720361709594727,
"learning_rate": 4.872610570486816e-05,
"loss": 0.0946,
"num_input_tokens_seen": 52434640,
"step": 2685
},
{
"epoch": 3.0638449479834686,
"grad_norm": 5.53890323638916,
"learning_rate": 4.872139770116447e-05,
"loss": 0.0566,
"num_input_tokens_seen": 52532400,
"step": 2690
},
{
"epoch": 3.069545389767707,
"grad_norm": 0.7934585809707642,
"learning_rate": 4.871668124197976e-05,
"loss": 0.0163,
"num_input_tokens_seen": 52630112,
"step": 2695
},
{
"epoch": 3.0752458315519453,
"grad_norm": 9.233660697937012,
"learning_rate": 4.871195632899518e-05,
"loss": 0.0552,
"num_input_tokens_seen": 52727840,
"step": 2700
},
{
"epoch": 3.0809462733361834,
"grad_norm": 5.545113563537598,
"learning_rate": 4.870722296389495e-05,
"loss": 0.0711,
"num_input_tokens_seen": 52825600,
"step": 2705
},
{
"epoch": 3.086646715120422,
"grad_norm": 8.869247436523438,
"learning_rate": 4.870248114836626e-05,
"loss": 0.1192,
"num_input_tokens_seen": 52923312,
"step": 2710
},
{
"epoch": 3.09234715690466,
"grad_norm": 2.2247767448425293,
"learning_rate": 4.8697730884099334e-05,
"loss": 0.0258,
"num_input_tokens_seen": 53020928,
"step": 2715
},
{
"epoch": 3.0980475986888982,
"grad_norm": 0.8696161508560181,
"learning_rate": 4.8692972172787396e-05,
"loss": 0.0649,
"num_input_tokens_seen": 53118720,
"step": 2720
},
{
"epoch": 3.103748040473137,
"grad_norm": 6.550947189331055,
"learning_rate": 4.86882050161267e-05,
"loss": 0.0605,
"num_input_tokens_seen": 53216512,
"step": 2725
},
{
"epoch": 3.109448482257375,
"grad_norm": 5.182213306427002,
"learning_rate": 4.8683429415816485e-05,
"loss": 0.0933,
"num_input_tokens_seen": 53314224,
"step": 2730
},
{
"epoch": 3.115148924041613,
"grad_norm": 1.667688012123108,
"learning_rate": 4.867864537355901e-05,
"loss": 0.0777,
"num_input_tokens_seen": 53411936,
"step": 2735
},
{
"epoch": 3.1208493658258516,
"grad_norm": 10.697488784790039,
"learning_rate": 4.867385289105955e-05,
"loss": 0.1207,
"num_input_tokens_seen": 53509664,
"step": 2740
},
{
"epoch": 3.1265498076100897,
"grad_norm": 5.032103061676025,
"learning_rate": 4.866905197002637e-05,
"loss": 0.064,
"num_input_tokens_seen": 53607408,
"step": 2745
},
{
"epoch": 3.1322502493943283,
"grad_norm": 6.669914722442627,
"learning_rate": 4.866424261217078e-05,
"loss": 0.0425,
"num_input_tokens_seen": 53705216,
"step": 2750
},
{
"epoch": 3.1379506911785664,
"grad_norm": 3.654059886932373,
"learning_rate": 4.865942481920706e-05,
"loss": 0.0541,
"num_input_tokens_seen": 53802960,
"step": 2755
},
{
"epoch": 3.1436511329628045,
"grad_norm": 3.9376277923583984,
"learning_rate": 4.865459859285251e-05,
"loss": 0.0352,
"num_input_tokens_seen": 53900720,
"step": 2760
},
{
"epoch": 3.149351574747043,
"grad_norm": 3.594050168991089,
"learning_rate": 4.864976393482743e-05,
"loss": 0.0372,
"num_input_tokens_seen": 53998384,
"step": 2765
},
{
"epoch": 3.155052016531281,
"grad_norm": 5.50773811340332,
"learning_rate": 4.864492084685514e-05,
"loss": 0.0612,
"num_input_tokens_seen": 54096144,
"step": 2770
},
{
"epoch": 3.1607524583155193,
"grad_norm": 11.46947193145752,
"learning_rate": 4.864006933066196e-05,
"loss": 0.0896,
"num_input_tokens_seen": 54193840,
"step": 2775
},
{
"epoch": 3.166452900099758,
"grad_norm": 9.24869155883789,
"learning_rate": 4.8635209387977197e-05,
"loss": 0.0575,
"num_input_tokens_seen": 54291568,
"step": 2780
},
{
"epoch": 3.172153341883996,
"grad_norm": 5.757988929748535,
"learning_rate": 4.8630341020533196e-05,
"loss": 0.0832,
"num_input_tokens_seen": 54389248,
"step": 2785
},
{
"epoch": 3.177853783668234,
"grad_norm": 6.639283657073975,
"learning_rate": 4.862546423006527e-05,
"loss": 0.0882,
"num_input_tokens_seen": 54486944,
"step": 2790
},
{
"epoch": 3.1835542254524727,
"grad_norm": 5.969208240509033,
"learning_rate": 4.8620579018311744e-05,
"loss": 0.0486,
"num_input_tokens_seen": 54584624,
"step": 2795
},
{
"epoch": 3.189254667236711,
"grad_norm": 11.107736587524414,
"learning_rate": 4.8615685387013956e-05,
"loss": 0.0754,
"num_input_tokens_seen": 54682384,
"step": 2800
},
{
"epoch": 3.194955109020949,
"grad_norm": 9.802680969238281,
"learning_rate": 4.861078333791624e-05,
"loss": 0.0721,
"num_input_tokens_seen": 54780160,
"step": 2805
},
{
"epoch": 3.2006555508051875,
"grad_norm": 2.6495935916900635,
"learning_rate": 4.860587287276592e-05,
"loss": 0.0538,
"num_input_tokens_seen": 54877872,
"step": 2810
},
{
"epoch": 3.2063559925894256,
"grad_norm": 5.3208818435668945,
"learning_rate": 4.8600953993313344e-05,
"loss": 0.0571,
"num_input_tokens_seen": 54975632,
"step": 2815
},
{
"epoch": 3.2120564343736637,
"grad_norm": 5.696016311645508,
"learning_rate": 4.859602670131185e-05,
"loss": 0.0616,
"num_input_tokens_seen": 55073408,
"step": 2820
},
{
"epoch": 3.2177568761579023,
"grad_norm": 9.000017166137695,
"learning_rate": 4.859109099851774e-05,
"loss": 0.1114,
"num_input_tokens_seen": 55171152,
"step": 2825
},
{
"epoch": 3.2234573179421404,
"grad_norm": 6.167779922485352,
"learning_rate": 4.8586146886690364e-05,
"loss": 0.0335,
"num_input_tokens_seen": 55268896,
"step": 2830
},
{
"epoch": 3.229157759726379,
"grad_norm": 0.7552993893623352,
"learning_rate": 4.8581194367592043e-05,
"loss": 0.0157,
"num_input_tokens_seen": 55366688,
"step": 2835
},
{
"epoch": 3.234858201510617,
"grad_norm": 6.548010349273682,
"learning_rate": 4.8576233442988095e-05,
"loss": 0.0572,
"num_input_tokens_seen": 55464368,
"step": 2840
},
{
"epoch": 3.240558643294855,
"grad_norm": 0.6461604237556458,
"learning_rate": 4.857126411464685e-05,
"loss": 0.0241,
"num_input_tokens_seen": 55562128,
"step": 2845
},
{
"epoch": 3.2462590850790938,
"grad_norm": 7.866938591003418,
"learning_rate": 4.856628638433962e-05,
"loss": 0.0597,
"num_input_tokens_seen": 55659792,
"step": 2850
},
{
"epoch": 3.251959526863332,
"grad_norm": 5.226189136505127,
"learning_rate": 4.85613002538407e-05,
"loss": 0.0267,
"num_input_tokens_seen": 55757504,
"step": 2855
},
{
"epoch": 3.25765996864757,
"grad_norm": 6.863353252410889,
"learning_rate": 4.855630572492742e-05,
"loss": 0.0537,
"num_input_tokens_seen": 55855344,
"step": 2860
},
{
"epoch": 3.2633604104318086,
"grad_norm": 1.295962929725647,
"learning_rate": 4.8551302799380055e-05,
"loss": 0.0304,
"num_input_tokens_seen": 55953072,
"step": 2865
},
{
"epoch": 3.2690608522160467,
"grad_norm": 5.298805236816406,
"learning_rate": 4.854629147898191e-05,
"loss": 0.0321,
"num_input_tokens_seen": 56050752,
"step": 2870
},
{
"epoch": 3.2747612940002853,
"grad_norm": 12.122303009033203,
"learning_rate": 4.854127176551925e-05,
"loss": 0.1434,
"num_input_tokens_seen": 56148560,
"step": 2875
},
{
"epoch": 3.2804617357845234,
"grad_norm": 1.2280305624008179,
"learning_rate": 4.8536243660781375e-05,
"loss": 0.0707,
"num_input_tokens_seen": 56246272,
"step": 2880
},
{
"epoch": 3.2861621775687615,
"grad_norm": 5.140838623046875,
"learning_rate": 4.8531207166560524e-05,
"loss": 0.0457,
"num_input_tokens_seen": 56343984,
"step": 2885
},
{
"epoch": 3.291862619353,
"grad_norm": 2.1565487384796143,
"learning_rate": 4.8526162284651974e-05,
"loss": 0.0177,
"num_input_tokens_seen": 56441792,
"step": 2890
},
{
"epoch": 3.297563061137238,
"grad_norm": 2.832627534866333,
"learning_rate": 4.852110901685396e-05,
"loss": 0.0283,
"num_input_tokens_seen": 56539600,
"step": 2895
},
{
"epoch": 3.3032635029214763,
"grad_norm": 10.345179557800293,
"learning_rate": 4.851604736496772e-05,
"loss": 0.0475,
"num_input_tokens_seen": 56637280,
"step": 2900
},
{
"epoch": 3.308963944705715,
"grad_norm": 0.8519467711448669,
"learning_rate": 4.8510977330797476e-05,
"loss": 0.0266,
"num_input_tokens_seen": 56735056,
"step": 2905
},
{
"epoch": 3.314664386489953,
"grad_norm": 6.070542335510254,
"learning_rate": 4.8505898916150436e-05,
"loss": 0.0536,
"num_input_tokens_seen": 56832864,
"step": 2910
},
{
"epoch": 3.320364828274191,
"grad_norm": 3.4217629432678223,
"learning_rate": 4.85008121228368e-05,
"loss": 0.0251,
"num_input_tokens_seen": 56930608,
"step": 2915
},
{
"epoch": 3.3260652700584297,
"grad_norm": 5.251518726348877,
"learning_rate": 4.849571695266977e-05,
"loss": 0.0676,
"num_input_tokens_seen": 57028336,
"step": 2920
},
{
"epoch": 3.331765711842668,
"grad_norm": 9.286744117736816,
"learning_rate": 4.849061340746549e-05,
"loss": 0.1008,
"num_input_tokens_seen": 57126128,
"step": 2925
},
{
"epoch": 3.337466153626906,
"grad_norm": 5.496871471405029,
"learning_rate": 4.848550148904314e-05,
"loss": 0.1098,
"num_input_tokens_seen": 57223840,
"step": 2930
},
{
"epoch": 3.3431665954111445,
"grad_norm": 6.865820407867432,
"learning_rate": 4.848038119922483e-05,
"loss": 0.0545,
"num_input_tokens_seen": 57321568,
"step": 2935
},
{
"epoch": 3.3488670371953826,
"grad_norm": 4.949888229370117,
"learning_rate": 4.847525253983572e-05,
"loss": 0.1271,
"num_input_tokens_seen": 57419328,
"step": 2940
},
{
"epoch": 3.3545674789796207,
"grad_norm": 0.7395240068435669,
"learning_rate": 4.847011551270391e-05,
"loss": 0.0262,
"num_input_tokens_seen": 57517008,
"step": 2945
},
{
"epoch": 3.3602679207638593,
"grad_norm": 10.39633560180664,
"learning_rate": 4.846497011966047e-05,
"loss": 0.0333,
"num_input_tokens_seen": 57614816,
"step": 2950
},
{
"epoch": 3.3659683625480974,
"grad_norm": 5.424074649810791,
"learning_rate": 4.845981636253949e-05,
"loss": 0.066,
"num_input_tokens_seen": 57712528,
"step": 2955
},
{
"epoch": 3.3716688043323355,
"grad_norm": 5.526705265045166,
"learning_rate": 4.845465424317802e-05,
"loss": 0.0246,
"num_input_tokens_seen": 57810208,
"step": 2960
},
{
"epoch": 3.377369246116574,
"grad_norm": 3.3858978748321533,
"learning_rate": 4.8449483763416095e-05,
"loss": 0.0585,
"num_input_tokens_seen": 57907968,
"step": 2965
},
{
"epoch": 3.383069687900812,
"grad_norm": 4.47909688949585,
"learning_rate": 4.844430492509674e-05,
"loss": 0.0799,
"num_input_tokens_seen": 58005744,
"step": 2970
},
{
"epoch": 3.3887701296850508,
"grad_norm": 8.794025421142578,
"learning_rate": 4.843911773006593e-05,
"loss": 0.0286,
"num_input_tokens_seen": 58103504,
"step": 2975
},
{
"epoch": 3.394470571469289,
"grad_norm": 5.554230690002441,
"learning_rate": 4.8433922180172653e-05,
"loss": 0.0499,
"num_input_tokens_seen": 58201232,
"step": 2980
},
{
"epoch": 3.400171013253527,
"grad_norm": 4.487776279449463,
"learning_rate": 4.842871827726886e-05,
"loss": 0.0402,
"num_input_tokens_seen": 58299024,
"step": 2985
},
{
"epoch": 3.4058714550377656,
"grad_norm": 2.8140945434570312,
"learning_rate": 4.8423506023209466e-05,
"loss": 0.0566,
"num_input_tokens_seen": 58396816,
"step": 2990
},
{
"epoch": 3.4115718968220037,
"grad_norm": 3.601980686187744,
"learning_rate": 4.8418285419852395e-05,
"loss": 0.0412,
"num_input_tokens_seen": 58494544,
"step": 2995
},
{
"epoch": 3.417272338606242,
"grad_norm": 2.136195182800293,
"learning_rate": 4.841305646905851e-05,
"loss": 0.0304,
"num_input_tokens_seen": 58592352,
"step": 3000
},
{
"epoch": 3.4229727803904804,
"grad_norm": 5.654057502746582,
"learning_rate": 4.8407819172691694e-05,
"loss": 0.0304,
"num_input_tokens_seen": 58690128,
"step": 3005
},
{
"epoch": 3.4286732221747185,
"grad_norm": 2.4083597660064697,
"learning_rate": 4.840257353261875e-05,
"loss": 0.0383,
"num_input_tokens_seen": 58787904,
"step": 3010
},
{
"epoch": 3.434373663958957,
"grad_norm": 5.336053371429443,
"learning_rate": 4.83973195507095e-05,
"loss": 0.0915,
"num_input_tokens_seen": 58885632,
"step": 3015
},
{
"epoch": 3.440074105743195,
"grad_norm": 4.11752986907959,
"learning_rate": 4.839205722883672e-05,
"loss": 0.0503,
"num_input_tokens_seen": 58983312,
"step": 3020
},
{
"epoch": 3.4457745475274333,
"grad_norm": 13.777847290039062,
"learning_rate": 4.838678656887616e-05,
"loss": 0.1445,
"num_input_tokens_seen": 59081072,
"step": 3025
},
{
"epoch": 3.451474989311672,
"grad_norm": 9.075989723205566,
"learning_rate": 4.838150757270655e-05,
"loss": 0.0777,
"num_input_tokens_seen": 59178896,
"step": 3030
},
{
"epoch": 3.45717543109591,
"grad_norm": 7.0720014572143555,
"learning_rate": 4.837622024220959e-05,
"loss": 0.0592,
"num_input_tokens_seen": 59276560,
"step": 3035
},
{
"epoch": 3.462875872880148,
"grad_norm": 4.558810710906982,
"learning_rate": 4.837092457926993e-05,
"loss": 0.0274,
"num_input_tokens_seen": 59374368,
"step": 3040
},
{
"epoch": 3.4685763146643867,
"grad_norm": 14.200141906738281,
"learning_rate": 4.8365620585775214e-05,
"loss": 0.0558,
"num_input_tokens_seen": 59472048,
"step": 3045
},
{
"epoch": 3.4742767564486248,
"grad_norm": 5.859817028045654,
"learning_rate": 4.836030826361605e-05,
"loss": 0.0277,
"num_input_tokens_seen": 59569840,
"step": 3050
},
{
"epoch": 3.479977198232863,
"grad_norm": 8.385420799255371,
"learning_rate": 4.835498761468601e-05,
"loss": 0.0667,
"num_input_tokens_seen": 59667584,
"step": 3055
},
{
"epoch": 3.4856776400171015,
"grad_norm": 1.2888391017913818,
"learning_rate": 4.834965864088164e-05,
"loss": 0.0207,
"num_input_tokens_seen": 59765392,
"step": 3060
},
{
"epoch": 3.4913780818013396,
"grad_norm": 1.1023948192596436,
"learning_rate": 4.834432134410245e-05,
"loss": 0.0207,
"num_input_tokens_seen": 59863152,
"step": 3065
},
{
"epoch": 3.4970785235855777,
"grad_norm": 3.3756585121154785,
"learning_rate": 4.8338975726250925e-05,
"loss": 0.0416,
"num_input_tokens_seen": 59960928,
"step": 3070
},
{
"epoch": 3.5027789653698163,
"grad_norm": 0.9311105608940125,
"learning_rate": 4.833362178923249e-05,
"loss": 0.0316,
"num_input_tokens_seen": 60058656,
"step": 3075
},
{
"epoch": 3.5084794071540544,
"grad_norm": 10.324899673461914,
"learning_rate": 4.8328259534955554e-05,
"loss": 0.0793,
"num_input_tokens_seen": 60156448,
"step": 3080
},
{
"epoch": 3.5141798489382925,
"grad_norm": 4.7765703201293945,
"learning_rate": 4.832288896533151e-05,
"loss": 0.0476,
"num_input_tokens_seen": 60254192,
"step": 3085
},
{
"epoch": 3.519880290722531,
"grad_norm": 1.959538459777832,
"learning_rate": 4.831751008227468e-05,
"loss": 0.0346,
"num_input_tokens_seen": 60351920,
"step": 3090
},
{
"epoch": 3.525580732506769,
"grad_norm": 9.76518440246582,
"learning_rate": 4.831212288770237e-05,
"loss": 0.046,
"num_input_tokens_seen": 60449696,
"step": 3095
},
{
"epoch": 3.5312811742910073,
"grad_norm": 1.2289072275161743,
"learning_rate": 4.8306727383534835e-05,
"loss": 0.0225,
"num_input_tokens_seen": 60547440,
"step": 3100
},
{
"epoch": 3.536981616075246,
"grad_norm": 7.658115863800049,
"learning_rate": 4.8301323571695314e-05,
"loss": 0.0281,
"num_input_tokens_seen": 60645200,
"step": 3105
},
{
"epoch": 3.542682057859484,
"grad_norm": 4.308380126953125,
"learning_rate": 4.829591145410997e-05,
"loss": 0.0265,
"num_input_tokens_seen": 60742880,
"step": 3110
},
{
"epoch": 3.5483824996437225,
"grad_norm": 4.51566743850708,
"learning_rate": 4.829049103270798e-05,
"loss": 0.0473,
"num_input_tokens_seen": 60840640,
"step": 3115
},
{
"epoch": 3.5540829414279607,
"grad_norm": 4.3482255935668945,
"learning_rate": 4.8285062309421426e-05,
"loss": 0.0468,
"num_input_tokens_seen": 60938400,
"step": 3120
},
{
"epoch": 3.559783383212199,
"grad_norm": 7.6800994873046875,
"learning_rate": 4.827962528618538e-05,
"loss": 0.0282,
"num_input_tokens_seen": 61036128,
"step": 3125
},
{
"epoch": 3.5654838249964373,
"grad_norm": 8.757813453674316,
"learning_rate": 4.8274179964937875e-05,
"loss": 0.0225,
"num_input_tokens_seen": 61133872,
"step": 3130
},
{
"epoch": 3.5711842667806755,
"grad_norm": 1.4490429162979126,
"learning_rate": 4.826872634761989e-05,
"loss": 0.0375,
"num_input_tokens_seen": 61231600,
"step": 3135
},
{
"epoch": 3.576884708564914,
"grad_norm": 5.913198471069336,
"learning_rate": 4.826326443617536e-05,
"loss": 0.0422,
"num_input_tokens_seen": 61329360,
"step": 3140
},
{
"epoch": 3.582585150349152,
"grad_norm": 8.622357368469238,
"learning_rate": 4.825779423255118e-05,
"loss": 0.0399,
"num_input_tokens_seen": 61427104,
"step": 3145
},
{
"epoch": 3.5882855921333903,
"grad_norm": 6.383512496948242,
"learning_rate": 4.825231573869721e-05,
"loss": 0.0356,
"num_input_tokens_seen": 61524848,
"step": 3150
},
{
"epoch": 3.593986033917629,
"grad_norm": 15.792478561401367,
"learning_rate": 4.824682895656624e-05,
"loss": 0.0613,
"num_input_tokens_seen": 61622512,
"step": 3155
},
{
"epoch": 3.599686475701867,
"grad_norm": 1.2860291004180908,
"learning_rate": 4.824133388811405e-05,
"loss": 0.0439,
"num_input_tokens_seen": 61720192,
"step": 3160
},
{
"epoch": 3.605386917486105,
"grad_norm": 6.301830768585205,
"learning_rate": 4.823583053529934e-05,
"loss": 0.0353,
"num_input_tokens_seen": 61817936,
"step": 3165
},
{
"epoch": 3.6110873592703436,
"grad_norm": 4.263798236846924,
"learning_rate": 4.823031890008379e-05,
"loss": 0.0338,
"num_input_tokens_seen": 61915664,
"step": 3170
},
{
"epoch": 3.6167878010545818,
"grad_norm": 7.392456531524658,
"learning_rate": 4.8224798984432005e-05,
"loss": 0.0399,
"num_input_tokens_seen": 62013456,
"step": 3175
},
{
"epoch": 3.62248824283882,
"grad_norm": 2.850409746170044,
"learning_rate": 4.8219270790311575e-05,
"loss": 0.0422,
"num_input_tokens_seen": 62111248,
"step": 3180
},
{
"epoch": 3.6281886846230584,
"grad_norm": 3.5166022777557373,
"learning_rate": 4.8213734319693004e-05,
"loss": 0.0193,
"num_input_tokens_seen": 62208960,
"step": 3185
},
{
"epoch": 3.6338891264072966,
"grad_norm": 7.699153423309326,
"learning_rate": 4.820818957454978e-05,
"loss": 0.0698,
"num_input_tokens_seen": 62306592,
"step": 3190
},
{
"epoch": 3.6395895681915347,
"grad_norm": 0.7717591524124146,
"learning_rate": 4.820263655685831e-05,
"loss": 0.0257,
"num_input_tokens_seen": 62404400,
"step": 3195
},
{
"epoch": 3.6452900099757732,
"grad_norm": 6.028016567230225,
"learning_rate": 4.819707526859797e-05,
"loss": 0.0352,
"num_input_tokens_seen": 62502160,
"step": 3200
},
{
"epoch": 3.6509904517600114,
"grad_norm": 2.3986012935638428,
"learning_rate": 4.819150571175108e-05,
"loss": 0.043,
"num_input_tokens_seen": 62599920,
"step": 3205
},
{
"epoch": 3.6566908935442495,
"grad_norm": 3.4287400245666504,
"learning_rate": 4.818592788830291e-05,
"loss": 0.0289,
"num_input_tokens_seen": 62697680,
"step": 3210
},
{
"epoch": 3.662391335328488,
"grad_norm": 5.921146869659424,
"learning_rate": 4.818034180024167e-05,
"loss": 0.0331,
"num_input_tokens_seen": 62795472,
"step": 3215
},
{
"epoch": 3.668091777112726,
"grad_norm": 4.856356620788574,
"learning_rate": 4.8174747449558515e-05,
"loss": 0.0131,
"num_input_tokens_seen": 62893136,
"step": 3220
},
{
"epoch": 3.6737922188969643,
"grad_norm": 6.656949996948242,
"learning_rate": 4.816914483824755e-05,
"loss": 0.0426,
"num_input_tokens_seen": 62990816,
"step": 3225
},
{
"epoch": 3.679492660681203,
"grad_norm": 1.0884100198745728,
"learning_rate": 4.816353396830583e-05,
"loss": 0.032,
"num_input_tokens_seen": 63088560,
"step": 3230
},
{
"epoch": 3.685193102465441,
"grad_norm": 0.37009307742118835,
"learning_rate": 4.815791484173333e-05,
"loss": 0.0322,
"num_input_tokens_seen": 63186272,
"step": 3235
},
{
"epoch": 3.690893544249679,
"grad_norm": 2.093526840209961,
"learning_rate": 4.815228746053301e-05,
"loss": 0.0225,
"num_input_tokens_seen": 63284016,
"step": 3240
},
{
"epoch": 3.6965939860339176,
"grad_norm": 9.629427909851074,
"learning_rate": 4.814665182671072e-05,
"loss": 0.0321,
"num_input_tokens_seen": 63381776,
"step": 3245
},
{
"epoch": 3.7022944278181558,
"grad_norm": 7.924525260925293,
"learning_rate": 4.8141007942275295e-05,
"loss": 0.0641,
"num_input_tokens_seen": 63479536,
"step": 3250
},
{
"epoch": 3.7079948696023943,
"grad_norm": 3.5611679553985596,
"learning_rate": 4.813535580923849e-05,
"loss": 0.0731,
"num_input_tokens_seen": 63577152,
"step": 3255
},
{
"epoch": 3.7136953113866324,
"grad_norm": 0.575011670589447,
"learning_rate": 4.812969542961502e-05,
"loss": 0.0453,
"num_input_tokens_seen": 63674928,
"step": 3260
},
{
"epoch": 3.719395753170871,
"grad_norm": 5.894010066986084,
"learning_rate": 4.8124026805422494e-05,
"loss": 0.0257,
"num_input_tokens_seen": 63772640,
"step": 3265
},
{
"epoch": 3.725096194955109,
"grad_norm": 3.0350735187530518,
"learning_rate": 4.811834993868152e-05,
"loss": 0.0338,
"num_input_tokens_seen": 63870336,
"step": 3270
},
{
"epoch": 3.7307966367393472,
"grad_norm": 8.058395385742188,
"learning_rate": 4.81126648314156e-05,
"loss": 0.0421,
"num_input_tokens_seen": 63968160,
"step": 3275
},
{
"epoch": 3.736497078523586,
"grad_norm": 9.93237590789795,
"learning_rate": 4.81069714856512e-05,
"loss": 0.0448,
"num_input_tokens_seen": 64065904,
"step": 3280
},
{
"epoch": 3.742197520307824,
"grad_norm": 11.603642463684082,
"learning_rate": 4.810126990341769e-05,
"loss": 0.0901,
"num_input_tokens_seen": 64163616,
"step": 3285
},
{
"epoch": 3.747897962092062,
"grad_norm": 3.8158483505249023,
"learning_rate": 4.809556008674741e-05,
"loss": 0.0154,
"num_input_tokens_seen": 64261376,
"step": 3290
},
{
"epoch": 3.7535984038763006,
"grad_norm": 0.4274216890335083,
"learning_rate": 4.8089842037675615e-05,
"loss": 0.0094,
"num_input_tokens_seen": 64359072,
"step": 3295
},
{
"epoch": 3.7592988456605387,
"grad_norm": 4.152562618255615,
"learning_rate": 4.808411575824051e-05,
"loss": 0.0443,
"num_input_tokens_seen": 64456816,
"step": 3300
},
{
"epoch": 3.764999287444777,
"grad_norm": 4.328752040863037,
"learning_rate": 4.807838125048322e-05,
"loss": 0.0393,
"num_input_tokens_seen": 64554464,
"step": 3305
},
{
"epoch": 3.7706997292290154,
"grad_norm": 4.978052616119385,
"learning_rate": 4.80726385164478e-05,
"loss": 0.0324,
"num_input_tokens_seen": 64652272,
"step": 3310
},
{
"epoch": 3.7764001710132535,
"grad_norm": 6.3277082443237305,
"learning_rate": 4.8066887558181265e-05,
"loss": 0.0203,
"num_input_tokens_seen": 64750016,
"step": 3315
},
{
"epoch": 3.7821006127974917,
"grad_norm": 0.5800598859786987,
"learning_rate": 4.806112837773351e-05,
"loss": 0.015,
"num_input_tokens_seen": 64847760,
"step": 3320
},
{
"epoch": 3.78780105458173,
"grad_norm": 17.387359619140625,
"learning_rate": 4.8055360977157426e-05,
"loss": 0.0503,
"num_input_tokens_seen": 64945504,
"step": 3325
},
{
"epoch": 3.7935014963659683,
"grad_norm": 6.007382392883301,
"learning_rate": 4.8049585358508776e-05,
"loss": 0.0294,
"num_input_tokens_seen": 65043232,
"step": 3330
},
{
"epoch": 3.7992019381502065,
"grad_norm": 8.47810173034668,
"learning_rate": 4.804380152384629e-05,
"loss": 0.044,
"num_input_tokens_seen": 65141024,
"step": 3335
},
{
"epoch": 3.804902379934445,
"grad_norm": 9.82911491394043,
"learning_rate": 4.8038009475231604e-05,
"loss": 0.0369,
"num_input_tokens_seen": 65238752,
"step": 3340
},
{
"epoch": 3.810602821718683,
"grad_norm": 13.116619110107422,
"learning_rate": 4.80322092147293e-05,
"loss": 0.0289,
"num_input_tokens_seen": 65336528,
"step": 3345
},
{
"epoch": 3.8163032635029213,
"grad_norm": 1.19611656665802,
"learning_rate": 4.802640074440686e-05,
"loss": 0.0214,
"num_input_tokens_seen": 65434272,
"step": 3350
},
{
"epoch": 3.82200370528716,
"grad_norm": 0.3276759386062622,
"learning_rate": 4.802058406633474e-05,
"loss": 0.0193,
"num_input_tokens_seen": 65532064,
"step": 3355
},
{
"epoch": 3.827704147071398,
"grad_norm": 6.492347240447998,
"learning_rate": 4.8014759182586274e-05,
"loss": 0.0542,
"num_input_tokens_seen": 65629792,
"step": 3360
},
{
"epoch": 3.833404588855636,
"grad_norm": 3.1319868564605713,
"learning_rate": 4.800892609523774e-05,
"loss": 0.0361,
"num_input_tokens_seen": 65727536,
"step": 3365
},
{
"epoch": 3.8391050306398746,
"grad_norm": 0.28512752056121826,
"learning_rate": 4.8003084806368336e-05,
"loss": 0.0299,
"num_input_tokens_seen": 65825200,
"step": 3370
},
{
"epoch": 3.8448054724241127,
"grad_norm": 1.0629769563674927,
"learning_rate": 4.7997235318060185e-05,
"loss": 0.0643,
"num_input_tokens_seen": 65922976,
"step": 3375
},
{
"epoch": 3.8505059142083513,
"grad_norm": 9.550495147705078,
"learning_rate": 4.799137763239835e-05,
"loss": 0.024,
"num_input_tokens_seen": 66020656,
"step": 3380
},
{
"epoch": 3.8562063559925894,
"grad_norm": 5.962581157684326,
"learning_rate": 4.798551175147079e-05,
"loss": 0.0279,
"num_input_tokens_seen": 66118384,
"step": 3385
},
{
"epoch": 3.8619067977768275,
"grad_norm": 2.609731435775757,
"learning_rate": 4.79796376773684e-05,
"loss": 0.0399,
"num_input_tokens_seen": 66216176,
"step": 3390
},
{
"epoch": 3.867607239561066,
"grad_norm": 5.378483772277832,
"learning_rate": 4.797375541218498e-05,
"loss": 0.0118,
"num_input_tokens_seen": 66313872,
"step": 3395
},
{
"epoch": 3.8733076813453042,
"grad_norm": 7.734043598175049,
"learning_rate": 4.796786495801727e-05,
"loss": 0.0262,
"num_input_tokens_seen": 66411664,
"step": 3400
},
{
"epoch": 3.879008123129543,
"grad_norm": 7.185009479522705,
"learning_rate": 4.796196631696491e-05,
"loss": 0.0313,
"num_input_tokens_seen": 66509440,
"step": 3405
},
{
"epoch": 3.884708564913781,
"grad_norm": 4.7586164474487305,
"learning_rate": 4.795605949113049e-05,
"loss": 0.0137,
"num_input_tokens_seen": 66607152,
"step": 3410
},
{
"epoch": 3.890409006698019,
"grad_norm": 0.9074054956436157,
"learning_rate": 4.795014448261947e-05,
"loss": 0.0263,
"num_input_tokens_seen": 66704880,
"step": 3415
},
{
"epoch": 3.8961094484822576,
"grad_norm": 2.6224796772003174,
"learning_rate": 4.794422129354026e-05,
"loss": 0.0146,
"num_input_tokens_seen": 66802656,
"step": 3420
},
{
"epoch": 3.9018098902664957,
"grad_norm": 0.855692982673645,
"learning_rate": 4.7938289926004185e-05,
"loss": 0.0078,
"num_input_tokens_seen": 66900480,
"step": 3425
},
{
"epoch": 3.907510332050734,
"grad_norm": 1.3807679414749146,
"learning_rate": 4.793235038212548e-05,
"loss": 0.0188,
"num_input_tokens_seen": 66998304,
"step": 3430
},
{
"epoch": 3.9132107738349724,
"grad_norm": 0.8240529298782349,
"learning_rate": 4.7926402664021275e-05,
"loss": 0.0576,
"num_input_tokens_seen": 67096000,
"step": 3435
},
{
"epoch": 3.9189112156192105,
"grad_norm": 7.201174736022949,
"learning_rate": 4.792044677381165e-05,
"loss": 0.0205,
"num_input_tokens_seen": 67193680,
"step": 3440
},
{
"epoch": 3.9246116574034486,
"grad_norm": 10.291589736938477,
"learning_rate": 4.791448271361957e-05,
"loss": 0.0524,
"num_input_tokens_seen": 67291472,
"step": 3445
},
{
"epoch": 3.930312099187687,
"grad_norm": 3.4942891597747803,
"learning_rate": 4.7908510485570925e-05,
"loss": 0.0652,
"num_input_tokens_seen": 67389216,
"step": 3450
},
{
"epoch": 3.9360125409719253,
"grad_norm": 2.04127836227417,
"learning_rate": 4.7902530091794505e-05,
"loss": 0.0356,
"num_input_tokens_seen": 67486912,
"step": 3455
},
{
"epoch": 3.9417129827561634,
"grad_norm": 4.031794548034668,
"learning_rate": 4.789654153442203e-05,
"loss": 0.0419,
"num_input_tokens_seen": 67584624,
"step": 3460
},
{
"epoch": 3.947413424540402,
"grad_norm": 8.670853614807129,
"learning_rate": 4.7890544815588115e-05,
"loss": 0.0192,
"num_input_tokens_seen": 67682320,
"step": 3465
},
{
"epoch": 3.95311386632464,
"grad_norm": 7.351383686065674,
"learning_rate": 4.788453993743028e-05,
"loss": 0.0361,
"num_input_tokens_seen": 67780064,
"step": 3470
},
{
"epoch": 3.9588143081088782,
"grad_norm": 8.677574157714844,
"learning_rate": 4.787852690208897e-05,
"loss": 0.0235,
"num_input_tokens_seen": 67877792,
"step": 3475
},
{
"epoch": 3.964514749893117,
"grad_norm": 9.356419563293457,
"learning_rate": 4.787250571170752e-05,
"loss": 0.0572,
"num_input_tokens_seen": 67975472,
"step": 3480
},
{
"epoch": 3.970215191677355,
"grad_norm": 6.9926300048828125,
"learning_rate": 4.786647636843219e-05,
"loss": 0.0837,
"num_input_tokens_seen": 68073200,
"step": 3485
},
{
"epoch": 3.975915633461593,
"grad_norm": 4.828823089599609,
"learning_rate": 4.786043887441213e-05,
"loss": 0.0422,
"num_input_tokens_seen": 68170976,
"step": 3490
},
{
"epoch": 3.9816160752458316,
"grad_norm": 11.057583808898926,
"learning_rate": 4.785439323179941e-05,
"loss": 0.0326,
"num_input_tokens_seen": 68268672,
"step": 3495
},
{
"epoch": 3.9873165170300697,
"grad_norm": 0.949934184551239,
"learning_rate": 4.784833944274899e-05,
"loss": 0.0236,
"num_input_tokens_seen": 68366432,
"step": 3500
},
{
"epoch": 3.993016958814308,
"grad_norm": 6.844513416290283,
"learning_rate": 4.784227750941873e-05,
"loss": 0.0188,
"num_input_tokens_seen": 68464128,
"step": 3505
},
{
"epoch": 3.9987174005985464,
"grad_norm": 2.295914649963379,
"learning_rate": 4.783620743396943e-05,
"loss": 0.0186,
"num_input_tokens_seen": 68561936,
"step": 3510
},
{
"epoch": 4.003420265070543,
"grad_norm": 2.7240405082702637,
"learning_rate": 4.783012921856474e-05,
"loss": 0.0217,
"num_input_tokens_seen": 68642496,
"step": 3515
},
{
"epoch": 4.009120706854781,
"grad_norm": 9.799927711486816,
"learning_rate": 4.782404286537124e-05,
"loss": 0.0442,
"num_input_tokens_seen": 68740256,
"step": 3520
},
{
"epoch": 4.01482114863902,
"grad_norm": 4.823245525360107,
"learning_rate": 4.781794837655843e-05,
"loss": 0.0601,
"num_input_tokens_seen": 68837968,
"step": 3525
},
{
"epoch": 4.020521590423257,
"grad_norm": 6.3089213371276855,
"learning_rate": 4.781184575429867e-05,
"loss": 0.0181,
"num_input_tokens_seen": 68935680,
"step": 3530
},
{
"epoch": 4.026222032207496,
"grad_norm": 0.7296550869941711,
"learning_rate": 4.780573500076723e-05,
"loss": 0.0089,
"num_input_tokens_seen": 69033408,
"step": 3535
},
{
"epoch": 4.0319224739917345,
"grad_norm": 7.376620292663574,
"learning_rate": 4.77996161181423e-05,
"loss": 0.0136,
"num_input_tokens_seen": 69131152,
"step": 3540
},
{
"epoch": 4.037622915775972,
"grad_norm": 3.193028211593628,
"learning_rate": 4.779348910860494e-05,
"loss": 0.0251,
"num_input_tokens_seen": 69228800,
"step": 3545
},
{
"epoch": 4.043323357560211,
"grad_norm": 0.5682366490364075,
"learning_rate": 4.7787353974339134e-05,
"loss": 0.0037,
"num_input_tokens_seen": 69326608,
"step": 3550
},
{
"epoch": 4.049023799344449,
"grad_norm": 2.3201677799224854,
"learning_rate": 4.778121071753174e-05,
"loss": 0.0114,
"num_input_tokens_seen": 69424368,
"step": 3555
},
{
"epoch": 4.054724241128688,
"grad_norm": 2.5661370754241943,
"learning_rate": 4.7775059340372516e-05,
"loss": 0.0177,
"num_input_tokens_seen": 69522032,
"step": 3560
},
{
"epoch": 4.060424682912926,
"grad_norm": 0.4603801667690277,
"learning_rate": 4.776889984505413e-05,
"loss": 0.0249,
"num_input_tokens_seen": 69619728,
"step": 3565
},
{
"epoch": 4.066125124697164,
"grad_norm": 3.390105962753296,
"learning_rate": 4.776273223377211e-05,
"loss": 0.0172,
"num_input_tokens_seen": 69717424,
"step": 3570
},
{
"epoch": 4.071825566481403,
"grad_norm": 0.19697071611881256,
"learning_rate": 4.7756556508724914e-05,
"loss": 0.0153,
"num_input_tokens_seen": 69815152,
"step": 3575
},
{
"epoch": 4.07752600826564,
"grad_norm": 8.34150218963623,
"learning_rate": 4.7750372672113874e-05,
"loss": 0.0209,
"num_input_tokens_seen": 69912960,
"step": 3580
},
{
"epoch": 4.083226450049879,
"grad_norm": 0.32752522826194763,
"learning_rate": 4.774418072614322e-05,
"loss": 0.0138,
"num_input_tokens_seen": 70010672,
"step": 3585
},
{
"epoch": 4.0889268918341175,
"grad_norm": 3.4794821739196777,
"learning_rate": 4.773798067302005e-05,
"loss": 0.0562,
"num_input_tokens_seen": 70108448,
"step": 3590
},
{
"epoch": 4.094627333618355,
"grad_norm": 7.853202819824219,
"learning_rate": 4.7731772514954384e-05,
"loss": 0.0245,
"num_input_tokens_seen": 70206144,
"step": 3595
},
{
"epoch": 4.100327775402594,
"grad_norm": 0.33840203285217285,
"learning_rate": 4.772555625415912e-05,
"loss": 0.0092,
"num_input_tokens_seen": 70303872,
"step": 3600
},
{
"epoch": 4.106028217186832,
"grad_norm": 6.381319999694824,
"learning_rate": 4.771933189285004e-05,
"loss": 0.0101,
"num_input_tokens_seen": 70401664,
"step": 3605
},
{
"epoch": 4.11172865897107,
"grad_norm": 1.308600902557373,
"learning_rate": 4.771309943324581e-05,
"loss": 0.021,
"num_input_tokens_seen": 70499408,
"step": 3610
},
{
"epoch": 4.1174291007553085,
"grad_norm": 1.1248642206192017,
"learning_rate": 4.7706858877567984e-05,
"loss": 0.009,
"num_input_tokens_seen": 70597200,
"step": 3615
},
{
"epoch": 4.123129542539547,
"grad_norm": 0.797878623008728,
"learning_rate": 4.770061022804102e-05,
"loss": 0.0084,
"num_input_tokens_seen": 70695008,
"step": 3620
},
{
"epoch": 4.128829984323785,
"grad_norm": 5.1671905517578125,
"learning_rate": 4.7694353486892224e-05,
"loss": 0.0086,
"num_input_tokens_seen": 70792784,
"step": 3625
},
{
"epoch": 4.134530426108023,
"grad_norm": 1.4259310960769653,
"learning_rate": 4.7688088656351827e-05,
"loss": 0.0137,
"num_input_tokens_seen": 70890576,
"step": 3630
},
{
"epoch": 4.140230867892262,
"grad_norm": 0.6012780070304871,
"learning_rate": 4.7681815738652916e-05,
"loss": 0.0331,
"num_input_tokens_seen": 70988352,
"step": 3635
},
{
"epoch": 4.1459313096765,
"grad_norm": 0.923217236995697,
"learning_rate": 4.767553473603147e-05,
"loss": 0.0235,
"num_input_tokens_seen": 71086128,
"step": 3640
},
{
"epoch": 4.151631751460738,
"grad_norm": 0.6401808261871338,
"learning_rate": 4.766924565072635e-05,
"loss": 0.0056,
"num_input_tokens_seen": 71183888,
"step": 3645
},
{
"epoch": 4.157332193244977,
"grad_norm": 4.980163097381592,
"learning_rate": 4.7662948484979304e-05,
"loss": 0.0124,
"num_input_tokens_seen": 71281648,
"step": 3650
},
{
"epoch": 4.163032635029214,
"grad_norm": 0.19848279654979706,
"learning_rate": 4.7656643241034946e-05,
"loss": 0.0377,
"num_input_tokens_seen": 71379440,
"step": 3655
},
{
"epoch": 4.168733076813453,
"grad_norm": 0.8732094168663025,
"learning_rate": 4.765032992114078e-05,
"loss": 0.0071,
"num_input_tokens_seen": 71477216,
"step": 3660
},
{
"epoch": 4.1744335185976915,
"grad_norm": 1.789801001548767,
"learning_rate": 4.7644008527547185e-05,
"loss": 0.025,
"num_input_tokens_seen": 71574992,
"step": 3665
},
{
"epoch": 4.180133960381929,
"grad_norm": 2.9071710109710693,
"learning_rate": 4.763767906250742e-05,
"loss": 0.0172,
"num_input_tokens_seen": 71672800,
"step": 3670
},
{
"epoch": 4.185834402166168,
"grad_norm": 1.144612431526184,
"learning_rate": 4.7631341528277615e-05,
"loss": 0.0092,
"num_input_tokens_seen": 71770512,
"step": 3675
},
{
"epoch": 4.191534843950406,
"grad_norm": 2.124117374420166,
"learning_rate": 4.7624995927116794e-05,
"loss": 0.0214,
"num_input_tokens_seen": 71868240,
"step": 3680
},
{
"epoch": 4.197235285734644,
"grad_norm": 1.037842035293579,
"learning_rate": 4.761864226128683e-05,
"loss": 0.0173,
"num_input_tokens_seen": 71965952,
"step": 3685
},
{
"epoch": 4.202935727518883,
"grad_norm": 16.831375122070312,
"learning_rate": 4.761228053305249e-05,
"loss": 0.0419,
"num_input_tokens_seen": 72063680,
"step": 3690
},
{
"epoch": 4.208636169303121,
"grad_norm": 0.9212270975112915,
"learning_rate": 4.76059107446814e-05,
"loss": 0.0311,
"num_input_tokens_seen": 72161472,
"step": 3695
},
{
"epoch": 4.21433661108736,
"grad_norm": 7.812607288360596,
"learning_rate": 4.759953289844409e-05,
"loss": 0.0197,
"num_input_tokens_seen": 72259120,
"step": 3700
},
{
"epoch": 4.220037052871597,
"grad_norm": 2.24538516998291,
"learning_rate": 4.759314699661392e-05,
"loss": 0.0068,
"num_input_tokens_seen": 72356848,
"step": 3705
},
{
"epoch": 4.225737494655836,
"grad_norm": 1.110796570777893,
"learning_rate": 4.758675304146715e-05,
"loss": 0.0309,
"num_input_tokens_seen": 72454608,
"step": 3710
},
{
"epoch": 4.2314379364400745,
"grad_norm": 0.3017835021018982,
"learning_rate": 4.75803510352829e-05,
"loss": 0.0124,
"num_input_tokens_seen": 72552288,
"step": 3715
},
{
"epoch": 4.237138378224312,
"grad_norm": 3.7277181148529053,
"learning_rate": 4.757394098034316e-05,
"loss": 0.0754,
"num_input_tokens_seen": 72650000,
"step": 3720
},
{
"epoch": 4.242838820008551,
"grad_norm": 8.373753547668457,
"learning_rate": 4.756752287893279e-05,
"loss": 0.01,
"num_input_tokens_seen": 72747856,
"step": 3725
},
{
"epoch": 4.248539261792789,
"grad_norm": 3.710064172744751,
"learning_rate": 4.7561096733339526e-05,
"loss": 0.0109,
"num_input_tokens_seen": 72845600,
"step": 3730
},
{
"epoch": 4.254239703577027,
"grad_norm": 3.204511880874634,
"learning_rate": 4.755466254585397e-05,
"loss": 0.0271,
"num_input_tokens_seen": 72943376,
"step": 3735
},
{
"epoch": 4.2599401453612655,
"grad_norm": 2.9513449668884277,
"learning_rate": 4.754822031876957e-05,
"loss": 0.0119,
"num_input_tokens_seen": 73041168,
"step": 3740
},
{
"epoch": 4.265640587145504,
"grad_norm": 7.3270392417907715,
"learning_rate": 4.754177005438266e-05,
"loss": 0.0168,
"num_input_tokens_seen": 73138832,
"step": 3745
},
{
"epoch": 4.271341028929742,
"grad_norm": 11.671257972717285,
"learning_rate": 4.753531175499243e-05,
"loss": 0.0544,
"num_input_tokens_seen": 73236592,
"step": 3750
},
{
"epoch": 4.27704147071398,
"grad_norm": 2.340949773788452,
"learning_rate": 4.7528845422900946e-05,
"loss": 0.0058,
"num_input_tokens_seen": 73334272,
"step": 3755
},
{
"epoch": 4.282741912498219,
"grad_norm": 6.20223331451416,
"learning_rate": 4.7522371060413126e-05,
"loss": 0.0166,
"num_input_tokens_seen": 73432016,
"step": 3760
},
{
"epoch": 4.288442354282457,
"grad_norm": 2.861288547515869,
"learning_rate": 4.751588866983676e-05,
"loss": 0.0062,
"num_input_tokens_seen": 73529760,
"step": 3765
},
{
"epoch": 4.294142796066695,
"grad_norm": 0.3826698064804077,
"learning_rate": 4.750939825348249e-05,
"loss": 0.0276,
"num_input_tokens_seen": 73627552,
"step": 3770
},
{
"epoch": 4.299843237850934,
"grad_norm": 0.9756613373756409,
"learning_rate": 4.7502899813663806e-05,
"loss": 0.0052,
"num_input_tokens_seen": 73725328,
"step": 3775
},
{
"epoch": 4.305543679635171,
"grad_norm": 0.1972053200006485,
"learning_rate": 4.749639335269709e-05,
"loss": 0.0078,
"num_input_tokens_seen": 73823024,
"step": 3780
},
{
"epoch": 4.31124412141941,
"grad_norm": 3.610668897628784,
"learning_rate": 4.748987887290156e-05,
"loss": 0.0455,
"num_input_tokens_seen": 73920736,
"step": 3785
},
{
"epoch": 4.3169445632036485,
"grad_norm": 2.3730828762054443,
"learning_rate": 4.7483356376599305e-05,
"loss": 0.0169,
"num_input_tokens_seen": 74018448,
"step": 3790
},
{
"epoch": 4.322645004987886,
"grad_norm": 6.0831618309021,
"learning_rate": 4.747682586611526e-05,
"loss": 0.0107,
"num_input_tokens_seen": 74116224,
"step": 3795
},
{
"epoch": 4.328345446772125,
"grad_norm": 8.610361099243164,
"learning_rate": 4.747028734377723e-05,
"loss": 0.0209,
"num_input_tokens_seen": 74214016,
"step": 3800
},
{
"epoch": 4.334045888556363,
"grad_norm": 8.020880699157715,
"learning_rate": 4.7463740811915856e-05,
"loss": 0.0166,
"num_input_tokens_seen": 74311712,
"step": 3805
},
{
"epoch": 4.339746330340601,
"grad_norm": 0.3767450153827667,
"learning_rate": 4.745718627286466e-05,
"loss": 0.009,
"num_input_tokens_seen": 74409504,
"step": 3810
},
{
"epoch": 4.3454467721248395,
"grad_norm": 2.0386180877685547,
"learning_rate": 4.7450623728959996e-05,
"loss": 0.0143,
"num_input_tokens_seen": 74507280,
"step": 3815
},
{
"epoch": 4.351147213909078,
"grad_norm": 8.895707130432129,
"learning_rate": 4.744405318254109e-05,
"loss": 0.0129,
"num_input_tokens_seen": 74604912,
"step": 3820
},
{
"epoch": 4.356847655693317,
"grad_norm": 1.1221814155578613,
"learning_rate": 4.743747463594999e-05,
"loss": 0.0199,
"num_input_tokens_seen": 74702720,
"step": 3825
},
{
"epoch": 4.362548097477554,
"grad_norm": 0.9217889308929443,
"learning_rate": 4.7430888091531635e-05,
"loss": 0.0065,
"num_input_tokens_seen": 74800448,
"step": 3830
},
{
"epoch": 4.368248539261793,
"grad_norm": 1.3824939727783203,
"learning_rate": 4.7424293551633785e-05,
"loss": 0.0055,
"num_input_tokens_seen": 74898160,
"step": 3835
},
{
"epoch": 4.3739489810460315,
"grad_norm": 3.7707972526550293,
"learning_rate": 4.741769101860707e-05,
"loss": 0.0253,
"num_input_tokens_seen": 74995824,
"step": 3840
},
{
"epoch": 4.379649422830269,
"grad_norm": 0.15820211172103882,
"learning_rate": 4.7411080494804944e-05,
"loss": 0.0075,
"num_input_tokens_seen": 75093584,
"step": 3845
},
{
"epoch": 4.385349864614508,
"grad_norm": 13.08034610748291,
"learning_rate": 4.7404461982583735e-05,
"loss": 0.0158,
"num_input_tokens_seen": 75191296,
"step": 3850
},
{
"epoch": 4.391050306398746,
"grad_norm": 0.30530065298080444,
"learning_rate": 4.739783548430262e-05,
"loss": 0.0131,
"num_input_tokens_seen": 75288960,
"step": 3855
},
{
"epoch": 4.396750748182984,
"grad_norm": 0.16144217550754547,
"learning_rate": 4.739120100232359e-05,
"loss": 0.0319,
"num_input_tokens_seen": 75386768,
"step": 3860
},
{
"epoch": 4.4024511899672225,
"grad_norm": 8.027372360229492,
"learning_rate": 4.7384558539011515e-05,
"loss": 0.0352,
"num_input_tokens_seen": 75484464,
"step": 3865
},
{
"epoch": 4.408151631751461,
"grad_norm": 10.079002380371094,
"learning_rate": 4.73779080967341e-05,
"loss": 0.0151,
"num_input_tokens_seen": 75582368,
"step": 3870
},
{
"epoch": 4.413852073535699,
"grad_norm": 1.2833141088485718,
"learning_rate": 4.7371249677861886e-05,
"loss": 0.0081,
"num_input_tokens_seen": 75680112,
"step": 3875
},
{
"epoch": 4.419552515319937,
"grad_norm": 0.9999586343765259,
"learning_rate": 4.736458328476826e-05,
"loss": 0.0034,
"num_input_tokens_seen": 75777840,
"step": 3880
},
{
"epoch": 4.425252957104176,
"grad_norm": 7.5928215980529785,
"learning_rate": 4.7357908919829464e-05,
"loss": 0.012,
"num_input_tokens_seen": 75875648,
"step": 3885
},
{
"epoch": 4.4309533988884136,
"grad_norm": 0.11999927461147308,
"learning_rate": 4.735122658542456e-05,
"loss": 0.0093,
"num_input_tokens_seen": 75973296,
"step": 3890
},
{
"epoch": 4.436653840672652,
"grad_norm": 1.351083755493164,
"learning_rate": 4.734453628393548e-05,
"loss": 0.0051,
"num_input_tokens_seen": 76071088,
"step": 3895
},
{
"epoch": 4.442354282456891,
"grad_norm": 0.8617928624153137,
"learning_rate": 4.733783801774696e-05,
"loss": 0.0033,
"num_input_tokens_seen": 76168848,
"step": 3900
},
{
"epoch": 4.448054724241128,
"grad_norm": 0.1406688243150711,
"learning_rate": 4.7331131789246614e-05,
"loss": 0.0052,
"num_input_tokens_seen": 76266512,
"step": 3905
},
{
"epoch": 4.453755166025367,
"grad_norm": 0.1912948042154312,
"learning_rate": 4.7324417600824854e-05,
"loss": 0.0074,
"num_input_tokens_seen": 76364288,
"step": 3910
},
{
"epoch": 4.4594556078096055,
"grad_norm": 3.995418071746826,
"learning_rate": 4.7317695454874964e-05,
"loss": 0.0096,
"num_input_tokens_seen": 76462016,
"step": 3915
},
{
"epoch": 4.465156049593843,
"grad_norm": 0.7223560214042664,
"learning_rate": 4.7310965353793044e-05,
"loss": 0.003,
"num_input_tokens_seen": 76559792,
"step": 3920
},
{
"epoch": 4.470856491378082,
"grad_norm": 1.0632505416870117,
"learning_rate": 4.730422729997804e-05,
"loss": 0.035,
"num_input_tokens_seen": 76657616,
"step": 3925
},
{
"epoch": 4.47655693316232,
"grad_norm": 1.5412085056304932,
"learning_rate": 4.729748129583171e-05,
"loss": 0.0377,
"num_input_tokens_seen": 76755312,
"step": 3930
},
{
"epoch": 4.482257374946558,
"grad_norm": 1.5897432565689087,
"learning_rate": 4.729072734375869e-05,
"loss": 0.0166,
"num_input_tokens_seen": 76853056,
"step": 3935
},
{
"epoch": 4.4879578167307965,
"grad_norm": 0.3547525405883789,
"learning_rate": 4.728396544616641e-05,
"loss": 0.0201,
"num_input_tokens_seen": 76950784,
"step": 3940
},
{
"epoch": 4.493658258515035,
"grad_norm": 0.5280705094337463,
"learning_rate": 4.727719560546514e-05,
"loss": 0.0173,
"num_input_tokens_seen": 77048592,
"step": 3945
},
{
"epoch": 4.499358700299274,
"grad_norm": 5.78103494644165,
"learning_rate": 4.7270417824068e-05,
"loss": 0.0107,
"num_input_tokens_seen": 77146336,
"step": 3950
},
{
"epoch": 4.505059142083511,
"grad_norm": 5.893618583679199,
"learning_rate": 4.726363210439092e-05,
"loss": 0.0258,
"num_input_tokens_seen": 77244000,
"step": 3955
},
{
"epoch": 4.51075958386775,
"grad_norm": 5.622200965881348,
"learning_rate": 4.725683844885266e-05,
"loss": 0.0186,
"num_input_tokens_seen": 77341856,
"step": 3960
},
{
"epoch": 4.516460025651988,
"grad_norm": 3.430377244949341,
"learning_rate": 4.725003685987482e-05,
"loss": 0.0095,
"num_input_tokens_seen": 77439648,
"step": 3965
},
{
"epoch": 4.522160467436226,
"grad_norm": 16.007429122924805,
"learning_rate": 4.724322733988183e-05,
"loss": 0.0637,
"num_input_tokens_seen": 77537440,
"step": 3970
},
{
"epoch": 4.527860909220465,
"grad_norm": 0.6529415845870972,
"learning_rate": 4.7236409891300934e-05,
"loss": 0.0133,
"num_input_tokens_seen": 77635136,
"step": 3975
},
{
"epoch": 4.533561351004703,
"grad_norm": 0.014979444444179535,
"learning_rate": 4.722958451656221e-05,
"loss": 0.0353,
"num_input_tokens_seen": 77732848,
"step": 3980
},
{
"epoch": 4.539261792788941,
"grad_norm": 0.7652057409286499,
"learning_rate": 4.722275121809856e-05,
"loss": 0.0204,
"num_input_tokens_seen": 77830576,
"step": 3985
},
{
"epoch": 4.5449622345731795,
"grad_norm": 9.353001594543457,
"learning_rate": 4.721590999834571e-05,
"loss": 0.0329,
"num_input_tokens_seen": 77928320,
"step": 3990
},
{
"epoch": 4.550662676357418,
"grad_norm": 0.0680394098162651,
"learning_rate": 4.720906085974221e-05,
"loss": 0.0065,
"num_input_tokens_seen": 78026032,
"step": 3995
},
{
"epoch": 4.556363118141656,
"grad_norm": 2.323220729827881,
"learning_rate": 4.720220380472942e-05,
"loss": 0.0066,
"num_input_tokens_seen": 78123696,
"step": 4000
},
{
"epoch": 4.562063559925894,
"grad_norm": 0.5926280617713928,
"learning_rate": 4.719533883575155e-05,
"loss": 0.0043,
"num_input_tokens_seen": 78221376,
"step": 4005
},
{
"epoch": 4.567764001710133,
"grad_norm": 0.9745510220527649,
"learning_rate": 4.7188465955255604e-05,
"loss": 0.0147,
"num_input_tokens_seen": 78319104,
"step": 4010
},
{
"epoch": 4.5734644434943705,
"grad_norm": 10.33803653717041,
"learning_rate": 4.7181585165691437e-05,
"loss": 0.0112,
"num_input_tokens_seen": 78416816,
"step": 4015
},
{
"epoch": 4.579164885278609,
"grad_norm": 9.621374130249023,
"learning_rate": 4.7174696469511674e-05,
"loss": 0.0222,
"num_input_tokens_seen": 78514656,
"step": 4020
},
{
"epoch": 4.584865327062848,
"grad_norm": 4.345292568206787,
"learning_rate": 4.716779986917182e-05,
"loss": 0.0084,
"num_input_tokens_seen": 78612400,
"step": 4025
},
{
"epoch": 4.590565768847085,
"grad_norm": 1.6130069494247437,
"learning_rate": 4.7160895367130125e-05,
"loss": 0.0068,
"num_input_tokens_seen": 78710256,
"step": 4030
},
{
"epoch": 4.596266210631324,
"grad_norm": 1.0490821599960327,
"learning_rate": 4.715398296584773e-05,
"loss": 0.0086,
"num_input_tokens_seen": 78807936,
"step": 4035
},
{
"epoch": 4.6019666524155625,
"grad_norm": 10.216976165771484,
"learning_rate": 4.714706266778854e-05,
"loss": 0.0563,
"num_input_tokens_seen": 78905744,
"step": 4040
},
{
"epoch": 4.6076670941998,
"grad_norm": 0.1510315239429474,
"learning_rate": 4.7140134475419304e-05,
"loss": 0.0195,
"num_input_tokens_seen": 79003584,
"step": 4045
},
{
"epoch": 4.613367535984039,
"grad_norm": 0.32562801241874695,
"learning_rate": 4.7133198391209566e-05,
"loss": 0.0103,
"num_input_tokens_seen": 79101408,
"step": 4050
},
{
"epoch": 4.619067977768277,
"grad_norm": 1.025244116783142,
"learning_rate": 4.7126254417631686e-05,
"loss": 0.0022,
"num_input_tokens_seen": 79199136,
"step": 4055
},
{
"epoch": 4.624768419552515,
"grad_norm": 0.8182079195976257,
"learning_rate": 4.7119302557160844e-05,
"loss": 0.0032,
"num_input_tokens_seen": 79296832,
"step": 4060
},
{
"epoch": 4.6304688613367535,
"grad_norm": 0.6801844835281372,
"learning_rate": 4.7112342812275026e-05,
"loss": 0.012,
"num_input_tokens_seen": 79394528,
"step": 4065
},
{
"epoch": 4.636169303120992,
"grad_norm": 0.7363019585609436,
"learning_rate": 4.7105375185455034e-05,
"loss": 0.0055,
"num_input_tokens_seen": 79492352,
"step": 4070
},
{
"epoch": 4.641869744905231,
"grad_norm": 4.005695819854736,
"learning_rate": 4.709839967918447e-05,
"loss": 0.0195,
"num_input_tokens_seen": 79590064,
"step": 4075
},
{
"epoch": 4.647570186689468,
"grad_norm": 0.4576650857925415,
"learning_rate": 4.709141629594975e-05,
"loss": 0.0074,
"num_input_tokens_seen": 79687856,
"step": 4080
},
{
"epoch": 4.653270628473707,
"grad_norm": 0.05043329671025276,
"learning_rate": 4.708442503824011e-05,
"loss": 0.0175,
"num_input_tokens_seen": 79785600,
"step": 4085
},
{
"epoch": 4.6589710702579445,
"grad_norm": 0.19624769687652588,
"learning_rate": 4.707742590854756e-05,
"loss": 0.0029,
"num_input_tokens_seen": 79883424,
"step": 4090
},
{
"epoch": 4.664671512042183,
"grad_norm": 0.46751806139945984,
"learning_rate": 4.7070418909366954e-05,
"loss": 0.0192,
"num_input_tokens_seen": 79981152,
"step": 4095
},
{
"epoch": 4.670371953826422,
"grad_norm": 1.4042103290557861,
"learning_rate": 4.706340404319593e-05,
"loss": 0.002,
"num_input_tokens_seen": 80078864,
"step": 4100
},
{
"epoch": 4.67607239561066,
"grad_norm": 0.30372464656829834,
"learning_rate": 4.705638131253492e-05,
"loss": 0.0029,
"num_input_tokens_seen": 80176672,
"step": 4105
},
{
"epoch": 4.681772837394898,
"grad_norm": 12.20240306854248,
"learning_rate": 4.704935071988718e-05,
"loss": 0.0156,
"num_input_tokens_seen": 80274272,
"step": 4110
},
{
"epoch": 4.6874732791791365,
"grad_norm": 0.24599520862102509,
"learning_rate": 4.704231226775877e-05,
"loss": 0.0106,
"num_input_tokens_seen": 80372080,
"step": 4115
},
{
"epoch": 4.693173720963375,
"grad_norm": 5.936103820800781,
"learning_rate": 4.7035265958658545e-05,
"loss": 0.0063,
"num_input_tokens_seen": 80469824,
"step": 4120
},
{
"epoch": 4.698874162747613,
"grad_norm": 1.1616228818893433,
"learning_rate": 4.702821179509814e-05,
"loss": 0.0153,
"num_input_tokens_seen": 80567536,
"step": 4125
},
{
"epoch": 4.704574604531851,
"grad_norm": 11.066569328308105,
"learning_rate": 4.702114977959203e-05,
"loss": 0.0302,
"num_input_tokens_seen": 80665344,
"step": 4130
},
{
"epoch": 4.71027504631609,
"grad_norm": 0.9506548047065735,
"learning_rate": 4.701407991465745e-05,
"loss": 0.0058,
"num_input_tokens_seen": 80763072,
"step": 4135
},
{
"epoch": 4.7159754881003275,
"grad_norm": 0.13321082293987274,
"learning_rate": 4.700700220281446e-05,
"loss": 0.0023,
"num_input_tokens_seen": 80860816,
"step": 4140
},
{
"epoch": 4.721675929884566,
"grad_norm": 3.3196537494659424,
"learning_rate": 4.699991664658591e-05,
"loss": 0.0058,
"num_input_tokens_seen": 80958480,
"step": 4145
},
{
"epoch": 4.727376371668805,
"grad_norm": 0.7469080686569214,
"learning_rate": 4.699282324849742e-05,
"loss": 0.0398,
"num_input_tokens_seen": 81056144,
"step": 4150
},
{
"epoch": 4.733076813453042,
"grad_norm": 6.437920093536377,
"learning_rate": 4.698572201107746e-05,
"loss": 0.0205,
"num_input_tokens_seen": 81153888,
"step": 4155
},
{
"epoch": 4.738777255237281,
"grad_norm": 6.570982456207275,
"learning_rate": 4.697861293685724e-05,
"loss": 0.0083,
"num_input_tokens_seen": 81251680,
"step": 4160
},
{
"epoch": 4.7444776970215194,
"grad_norm": 2.7099902629852295,
"learning_rate": 4.69714960283708e-05,
"loss": 0.0038,
"num_input_tokens_seen": 81349360,
"step": 4165
},
{
"epoch": 4.750178138805757,
"grad_norm": 0.25943759083747864,
"learning_rate": 4.696437128815494e-05,
"loss": 0.0249,
"num_input_tokens_seen": 81447104,
"step": 4170
},
{
"epoch": 4.755878580589996,
"grad_norm": 1.0403450727462769,
"learning_rate": 4.6957238718749295e-05,
"loss": 0.0079,
"num_input_tokens_seen": 81544896,
"step": 4175
},
{
"epoch": 4.761579022374234,
"grad_norm": 6.538539886474609,
"learning_rate": 4.6950098322696254e-05,
"loss": 0.0292,
"num_input_tokens_seen": 81642576,
"step": 4180
},
{
"epoch": 4.767279464158472,
"grad_norm": 0.7306740283966064,
"learning_rate": 4.6942950102541007e-05,
"loss": 0.0153,
"num_input_tokens_seen": 81740384,
"step": 4185
},
{
"epoch": 4.7729799059427105,
"grad_norm": 1.1523919105529785,
"learning_rate": 4.693579406083153e-05,
"loss": 0.0137,
"num_input_tokens_seen": 81838112,
"step": 4190
},
{
"epoch": 4.778680347726949,
"grad_norm": 1.3222236633300781,
"learning_rate": 4.69286302001186e-05,
"loss": 0.0174,
"num_input_tokens_seen": 81935856,
"step": 4195
},
{
"epoch": 4.784380789511188,
"grad_norm": 4.96268367767334,
"learning_rate": 4.692145852295576e-05,
"loss": 0.0059,
"num_input_tokens_seen": 82033616,
"step": 4200
},
{
"epoch": 4.790081231295425,
"grad_norm": 1.5982474088668823,
"learning_rate": 4.6914279031899364e-05,
"loss": 0.017,
"num_input_tokens_seen": 82131360,
"step": 4205
},
{
"epoch": 4.795781673079664,
"grad_norm": 3.684070110321045,
"learning_rate": 4.690709172950854e-05,
"loss": 0.0113,
"num_input_tokens_seen": 82229136,
"step": 4210
},
{
"epoch": 4.8014821148639015,
"grad_norm": 12.136567115783691,
"learning_rate": 4.689989661834518e-05,
"loss": 0.0284,
"num_input_tokens_seen": 82326864,
"step": 4215
},
{
"epoch": 4.80718255664814,
"grad_norm": 0.33026665449142456,
"learning_rate": 4.6892693700973994e-05,
"loss": 0.0104,
"num_input_tokens_seen": 82424672,
"step": 4220
},
{
"epoch": 4.812882998432379,
"grad_norm": 8.315051078796387,
"learning_rate": 4.688548297996245e-05,
"loss": 0.017,
"num_input_tokens_seen": 82522400,
"step": 4225
},
{
"epoch": 4.818583440216617,
"grad_norm": 0.24934843182563782,
"learning_rate": 4.687826445788081e-05,
"loss": 0.0035,
"num_input_tokens_seen": 82620208,
"step": 4230
},
{
"epoch": 4.824283882000855,
"grad_norm": 0.1164386197924614,
"learning_rate": 4.687103813730211e-05,
"loss": 0.0092,
"num_input_tokens_seen": 82717856,
"step": 4235
},
{
"epoch": 4.8299843237850935,
"grad_norm": 11.615945816040039,
"learning_rate": 4.686380402080218e-05,
"loss": 0.0131,
"num_input_tokens_seen": 82815632,
"step": 4240
},
{
"epoch": 4.835684765569331,
"grad_norm": 0.6481454968452454,
"learning_rate": 4.68565621109596e-05,
"loss": 0.0011,
"num_input_tokens_seen": 82913296,
"step": 4245
},
{
"epoch": 4.84138520735357,
"grad_norm": 0.0655444860458374,
"learning_rate": 4.6849312410355755e-05,
"loss": 0.0198,
"num_input_tokens_seen": 83011072,
"step": 4250
},
{
"epoch": 4.847085649137808,
"grad_norm": 6.334054470062256,
"learning_rate": 4.68420549215748e-05,
"loss": 0.0048,
"num_input_tokens_seen": 83108864,
"step": 4255
},
{
"epoch": 4.852786090922047,
"grad_norm": 0.20101669430732727,
"learning_rate": 4.6834789647203656e-05,
"loss": 0.0048,
"num_input_tokens_seen": 83206608,
"step": 4260
},
{
"epoch": 4.8584865327062845,
"grad_norm": 14.852635383605957,
"learning_rate": 4.6827516589832025e-05,
"loss": 0.0461,
"num_input_tokens_seen": 83304336,
"step": 4265
},
{
"epoch": 4.864186974490523,
"grad_norm": 0.06325986981391907,
"learning_rate": 4.68202357520524e-05,
"loss": 0.0093,
"num_input_tokens_seen": 83402064,
"step": 4270
},
{
"epoch": 4.869887416274762,
"grad_norm": 1.859485149383545,
"learning_rate": 4.681294713646002e-05,
"loss": 0.0104,
"num_input_tokens_seen": 83499824,
"step": 4275
},
{
"epoch": 4.875587858058999,
"grad_norm": 0.47888869047164917,
"learning_rate": 4.68056507456529e-05,
"loss": 0.0106,
"num_input_tokens_seen": 83597536,
"step": 4280
},
{
"epoch": 4.881288299843238,
"grad_norm": 1.207236886024475,
"learning_rate": 4.6798346582231855e-05,
"loss": 0.0049,
"num_input_tokens_seen": 83695296,
"step": 4285
},
{
"epoch": 4.886988741627476,
"grad_norm": 0.27957767248153687,
"learning_rate": 4.679103464880044e-05,
"loss": 0.0017,
"num_input_tokens_seen": 83793024,
"step": 4290
},
{
"epoch": 4.892689183411714,
"grad_norm": 0.03679969534277916,
"learning_rate": 4.678371494796499e-05,
"loss": 0.0023,
"num_input_tokens_seen": 83890752,
"step": 4295
},
{
"epoch": 4.898389625195953,
"grad_norm": 1.9648526906967163,
"learning_rate": 4.677638748233461e-05,
"loss": 0.0168,
"num_input_tokens_seen": 83988512,
"step": 4300
},
{
"epoch": 4.904090066980191,
"grad_norm": 0.7433627247810364,
"learning_rate": 4.676905225452117e-05,
"loss": 0.0128,
"num_input_tokens_seen": 84086352,
"step": 4305
},
{
"epoch": 4.909790508764429,
"grad_norm": 1.4374768733978271,
"learning_rate": 4.676170926713932e-05,
"loss": 0.0019,
"num_input_tokens_seen": 84184032,
"step": 4310
},
{
"epoch": 4.9154909505486675,
"grad_norm": 0.46811923384666443,
"learning_rate": 4.6754358522806454e-05,
"loss": 0.0019,
"num_input_tokens_seen": 84281776,
"step": 4315
},
{
"epoch": 4.921191392332906,
"grad_norm": 2.098421573638916,
"learning_rate": 4.6747000024142734e-05,
"loss": 0.0169,
"num_input_tokens_seen": 84379472,
"step": 4320
},
{
"epoch": 4.926891834117144,
"grad_norm": 3.727424383163452,
"learning_rate": 4.673963377377111e-05,
"loss": 0.009,
"num_input_tokens_seen": 84477232,
"step": 4325
},
{
"epoch": 4.932592275901382,
"grad_norm": 9.418045043945312,
"learning_rate": 4.6732259774317264e-05,
"loss": 0.0283,
"num_input_tokens_seen": 84574992,
"step": 4330
},
{
"epoch": 4.938292717685621,
"grad_norm": 8.13887882232666,
"learning_rate": 4.672487802840966e-05,
"loss": 0.0163,
"num_input_tokens_seen": 84672800,
"step": 4335
},
{
"epoch": 4.9439931594698585,
"grad_norm": 0.15979628264904022,
"learning_rate": 4.671748853867952e-05,
"loss": 0.0126,
"num_input_tokens_seen": 84770416,
"step": 4340
},
{
"epoch": 4.949693601254097,
"grad_norm": 10.529417991638184,
"learning_rate": 4.671009130776083e-05,
"loss": 0.0189,
"num_input_tokens_seen": 84868256,
"step": 4345
},
{
"epoch": 4.955394043038336,
"grad_norm": 1.08811354637146,
"learning_rate": 4.670268633829031e-05,
"loss": 0.0016,
"num_input_tokens_seen": 84965872,
"step": 4350
},
{
"epoch": 4.961094484822574,
"grad_norm": 0.6671218872070312,
"learning_rate": 4.6695273632907476e-05,
"loss": 0.0025,
"num_input_tokens_seen": 85063648,
"step": 4355
},
{
"epoch": 4.966794926606812,
"grad_norm": 3.7817630767822266,
"learning_rate": 4.668785319425458e-05,
"loss": 0.0207,
"num_input_tokens_seen": 85161424,
"step": 4360
},
{
"epoch": 4.97249536839105,
"grad_norm": 3.2574493885040283,
"learning_rate": 4.668042502497663e-05,
"loss": 0.0183,
"num_input_tokens_seen": 85259088,
"step": 4365
},
{
"epoch": 4.978195810175288,
"grad_norm": 3.2037136554718018,
"learning_rate": 4.66729891277214e-05,
"loss": 0.0128,
"num_input_tokens_seen": 85356816,
"step": 4370
},
{
"epoch": 4.983896251959527,
"grad_norm": 3.986717462539673,
"learning_rate": 4.66655455051394e-05,
"loss": 0.0043,
"num_input_tokens_seen": 85454656,
"step": 4375
},
{
"epoch": 4.989596693743765,
"grad_norm": 1.5552836656570435,
"learning_rate": 4.6658094159883916e-05,
"loss": 0.0275,
"num_input_tokens_seen": 85552432,
"step": 4380
},
{
"epoch": 4.995297135528004,
"grad_norm": 0.39177027344703674,
"learning_rate": 4.665063509461097e-05,
"loss": 0.0053,
"num_input_tokens_seen": 85650144,
"step": 4385
},
{
"epoch": 5.0,
"grad_norm": 0.13293787837028503,
"learning_rate": 4.6643168311979345e-05,
"loss": 0.0034,
"num_input_tokens_seen": 85730720,
"step": 4390
},
{
"epoch": 5.005700441784239,
"grad_norm": 1.8142844438552856,
"learning_rate": 4.663569381465058e-05,
"loss": 0.0094,
"num_input_tokens_seen": 85828432,
"step": 4395
},
{
"epoch": 5.011400883568476,
"grad_norm": 0.5450536012649536,
"learning_rate": 4.662821160528894e-05,
"loss": 0.0019,
"num_input_tokens_seen": 85926048,
"step": 4400
},
{
"epoch": 5.017101325352715,
"grad_norm": 0.5705410242080688,
"learning_rate": 4.662072168656146e-05,
"loss": 0.0311,
"num_input_tokens_seen": 86023760,
"step": 4405
},
{
"epoch": 5.022801767136953,
"grad_norm": 0.47627347707748413,
"learning_rate": 4.661322406113794e-05,
"loss": 0.005,
"num_input_tokens_seen": 86121552,
"step": 4410
},
{
"epoch": 5.028502208921191,
"grad_norm": 5.517219066619873,
"learning_rate": 4.6605718731690874e-05,
"loss": 0.0048,
"num_input_tokens_seen": 86219200,
"step": 4415
},
{
"epoch": 5.03420265070543,
"grad_norm": 0.165016770362854,
"learning_rate": 4.659820570089555e-05,
"loss": 0.0025,
"num_input_tokens_seen": 86316976,
"step": 4420
},
{
"epoch": 5.039903092489668,
"grad_norm": 2.5400843620300293,
"learning_rate": 4.659068497142998e-05,
"loss": 0.0026,
"num_input_tokens_seen": 86414736,
"step": 4425
},
{
"epoch": 5.045603534273906,
"grad_norm": 1.7173391580581665,
"learning_rate": 4.658315654597492e-05,
"loss": 0.0037,
"num_input_tokens_seen": 86512528,
"step": 4430
},
{
"epoch": 5.051303976058144,
"grad_norm": 0.1867997944355011,
"learning_rate": 4.657562042721388e-05,
"loss": 0.001,
"num_input_tokens_seen": 86610224,
"step": 4435
},
{
"epoch": 5.057004417842383,
"grad_norm": 1.1393805742263794,
"learning_rate": 4.65680766178331e-05,
"loss": 0.0047,
"num_input_tokens_seen": 86708000,
"step": 4440
},
{
"epoch": 5.062704859626621,
"grad_norm": 4.353109836578369,
"learning_rate": 4.656052512052158e-05,
"loss": 0.0031,
"num_input_tokens_seen": 86805696,
"step": 4445
},
{
"epoch": 5.068405301410859,
"grad_norm": 0.10244199633598328,
"learning_rate": 4.655296593797104e-05,
"loss": 0.0167,
"num_input_tokens_seen": 86903504,
"step": 4450
},
{
"epoch": 5.074105743195098,
"grad_norm": 3.0064287185668945,
"learning_rate": 4.654539907287594e-05,
"loss": 0.0035,
"num_input_tokens_seen": 87001264,
"step": 4455
},
{
"epoch": 5.079806184979336,
"grad_norm": 2.2633399963378906,
"learning_rate": 4.653782452793349e-05,
"loss": 0.0022,
"num_input_tokens_seen": 87099008,
"step": 4460
},
{
"epoch": 5.085506626763574,
"grad_norm": 0.3934509754180908,
"learning_rate": 4.653024230584364e-05,
"loss": 0.0061,
"num_input_tokens_seen": 87196672,
"step": 4465
},
{
"epoch": 5.091207068547813,
"grad_norm": 0.034104038029909134,
"learning_rate": 4.6522652409309064e-05,
"loss": 0.0017,
"num_input_tokens_seen": 87294416,
"step": 4470
},
{
"epoch": 5.096907510332051,
"grad_norm": 2.047616720199585,
"learning_rate": 4.651505484103518e-05,
"loss": 0.0136,
"num_input_tokens_seen": 87392128,
"step": 4475
},
{
"epoch": 5.102607952116289,
"grad_norm": 4.767343044281006,
"learning_rate": 4.6507449603730135e-05,
"loss": 0.0118,
"num_input_tokens_seen": 87489840,
"step": 4480
},
{
"epoch": 5.108308393900527,
"grad_norm": 0.24816875159740448,
"learning_rate": 4.6499836700104806e-05,
"loss": 0.0083,
"num_input_tokens_seen": 87587568,
"step": 4485
},
{
"epoch": 5.114008835684766,
"grad_norm": 0.16580072045326233,
"learning_rate": 4.6492216132872824e-05,
"loss": 0.0053,
"num_input_tokens_seen": 87685264,
"step": 4490
},
{
"epoch": 5.119709277469004,
"grad_norm": 0.23322570323944092,
"learning_rate": 4.648458790475052e-05,
"loss": 0.0026,
"num_input_tokens_seen": 87783088,
"step": 4495
},
{
"epoch": 5.125409719253242,
"grad_norm": 0.2388758510351181,
"learning_rate": 4.6476952018456974e-05,
"loss": 0.0009,
"num_input_tokens_seen": 87880832,
"step": 4500
},
{
"epoch": 5.131110161037481,
"grad_norm": 2.167498826980591,
"learning_rate": 4.646930847671401e-05,
"loss": 0.009,
"num_input_tokens_seen": 87978544,
"step": 4505
},
{
"epoch": 5.136810602821718,
"grad_norm": 0.15172941982746124,
"learning_rate": 4.646165728224616e-05,
"loss": 0.0029,
"num_input_tokens_seen": 88076304,
"step": 4510
},
{
"epoch": 5.142511044605957,
"grad_norm": 1.221136450767517,
"learning_rate": 4.645399843778068e-05,
"loss": 0.0045,
"num_input_tokens_seen": 88174016,
"step": 4515
},
{
"epoch": 5.1482114863901955,
"grad_norm": 0.21661746501922607,
"learning_rate": 4.644633194604756e-05,
"loss": 0.013,
"num_input_tokens_seen": 88271632,
"step": 4520
},
{
"epoch": 5.153911928174433,
"grad_norm": 3.4261422157287598,
"learning_rate": 4.6438657809779526e-05,
"loss": 0.0069,
"num_input_tokens_seen": 88369312,
"step": 4525
},
{
"epoch": 5.159612369958672,
"grad_norm": 0.3439682126045227,
"learning_rate": 4.6430976031712017e-05,
"loss": 0.0014,
"num_input_tokens_seen": 88467120,
"step": 4530
},
{
"epoch": 5.16531281174291,
"grad_norm": 13.96764087677002,
"learning_rate": 4.6423286614583195e-05,
"loss": 0.0218,
"num_input_tokens_seen": 88564848,
"step": 4535
},
{
"epoch": 5.171013253527148,
"grad_norm": 0.09054847806692123,
"learning_rate": 4.641558956113396e-05,
"loss": 0.0054,
"num_input_tokens_seen": 88662560,
"step": 4540
},
{
"epoch": 5.176713695311387,
"grad_norm": 1.0485111474990845,
"learning_rate": 4.640788487410791e-05,
"loss": 0.0044,
"num_input_tokens_seen": 88760400,
"step": 4545
},
{
"epoch": 5.182414137095625,
"grad_norm": 0.10858794301748276,
"learning_rate": 4.640017255625139e-05,
"loss": 0.0009,
"num_input_tokens_seen": 88858096,
"step": 4550
},
{
"epoch": 5.188114578879863,
"grad_norm": 0.07652360200881958,
"learning_rate": 4.639245261031344e-05,
"loss": 0.0239,
"num_input_tokens_seen": 88955856,
"step": 4555
},
{
"epoch": 5.193815020664101,
"grad_norm": 0.6881747841835022,
"learning_rate": 4.638472503904583e-05,
"loss": 0.0009,
"num_input_tokens_seen": 89053600,
"step": 4560
},
{
"epoch": 5.19951546244834,
"grad_norm": 0.08055282384157181,
"learning_rate": 4.637698984520307e-05,
"loss": 0.0034,
"num_input_tokens_seen": 89151296,
"step": 4565
},
{
"epoch": 5.205215904232578,
"grad_norm": 0.08773194998502731,
"learning_rate": 4.636924703154234e-05,
"loss": 0.0121,
"num_input_tokens_seen": 89249120,
"step": 4570
},
{
"epoch": 5.210916346016816,
"grad_norm": 0.2949371039867401,
"learning_rate": 4.636149660082358e-05,
"loss": 0.0049,
"num_input_tokens_seen": 89346832,
"step": 4575
},
{
"epoch": 5.216616787801055,
"grad_norm": 7.335551738739014,
"learning_rate": 4.635373855580942e-05,
"loss": 0.0274,
"num_input_tokens_seen": 89444576,
"step": 4580
},
{
"epoch": 5.222317229585292,
"grad_norm": 2.2080814838409424,
"learning_rate": 4.634597289926521e-05,
"loss": 0.0128,
"num_input_tokens_seen": 89542288,
"step": 4585
},
{
"epoch": 5.228017671369531,
"grad_norm": 1.00960111618042,
"learning_rate": 4.6338199633959025e-05,
"loss": 0.0036,
"num_input_tokens_seen": 89640096,
"step": 4590
},
{
"epoch": 5.23371811315377,
"grad_norm": 0.1926228553056717,
"learning_rate": 4.6330418762661624e-05,
"loss": 0.0061,
"num_input_tokens_seen": 89737872,
"step": 4595
},
{
"epoch": 5.239418554938008,
"grad_norm": 0.0730406790971756,
"learning_rate": 4.632263028814652e-05,
"loss": 0.0383,
"num_input_tokens_seen": 89835552,
"step": 4600
},
{
"epoch": 5.245118996722246,
"grad_norm": 0.9148241281509399,
"learning_rate": 4.6314834213189884e-05,
"loss": 0.0167,
"num_input_tokens_seen": 89933232,
"step": 4605
},
{
"epoch": 5.250819438506484,
"grad_norm": 1.8269907236099243,
"learning_rate": 4.630703054057063e-05,
"loss": 0.006,
"num_input_tokens_seen": 90030960,
"step": 4610
},
{
"epoch": 5.256519880290723,
"grad_norm": 0.4568536877632141,
"learning_rate": 4.6299219273070396e-05,
"loss": 0.0105,
"num_input_tokens_seen": 90128784,
"step": 4615
},
{
"epoch": 5.262220322074961,
"grad_norm": 0.6757635474205017,
"learning_rate": 4.629140041347347e-05,
"loss": 0.0083,
"num_input_tokens_seen": 90226576,
"step": 4620
},
{
"epoch": 5.267920763859199,
"grad_norm": 8.738912582397461,
"learning_rate": 4.628357396456692e-05,
"loss": 0.0166,
"num_input_tokens_seen": 90324304,
"step": 4625
},
{
"epoch": 5.273621205643438,
"grad_norm": 1.5971795320510864,
"learning_rate": 4.627573992914044e-05,
"loss": 0.0029,
"num_input_tokens_seen": 90421920,
"step": 4630
},
{
"epoch": 5.279321647427675,
"grad_norm": 8.538966178894043,
"learning_rate": 4.626789830998649e-05,
"loss": 0.0098,
"num_input_tokens_seen": 90519728,
"step": 4635
},
{
"epoch": 5.285022089211914,
"grad_norm": 0.06448430567979813,
"learning_rate": 4.626004910990021e-05,
"loss": 0.0135,
"num_input_tokens_seen": 90617440,
"step": 4640
},
{
"epoch": 5.2907225309961525,
"grad_norm": 7.718270301818848,
"learning_rate": 4.625219233167944e-05,
"loss": 0.015,
"num_input_tokens_seen": 90715248,
"step": 4645
},
{
"epoch": 5.29642297278039,
"grad_norm": 0.2514442801475525,
"learning_rate": 4.6244327978124734e-05,
"loss": 0.0031,
"num_input_tokens_seen": 90812960,
"step": 4650
},
{
"epoch": 5.302123414564629,
"grad_norm": 0.28784915804862976,
"learning_rate": 4.623645605203932e-05,
"loss": 0.0063,
"num_input_tokens_seen": 90910624,
"step": 4655
},
{
"epoch": 5.307823856348867,
"grad_norm": 0.1487150639295578,
"learning_rate": 4.6228576556229156e-05,
"loss": 0.0035,
"num_input_tokens_seen": 91008320,
"step": 4660
},
{
"epoch": 5.313524298133105,
"grad_norm": 0.19565777480602264,
"learning_rate": 4.622068949350289e-05,
"loss": 0.0022,
"num_input_tokens_seen": 91106128,
"step": 4665
},
{
"epoch": 5.319224739917344,
"grad_norm": 0.2649058401584625,
"learning_rate": 4.6212794866671836e-05,
"loss": 0.0156,
"num_input_tokens_seen": 91203968,
"step": 4670
},
{
"epoch": 5.324925181701582,
"grad_norm": 1.254876732826233,
"learning_rate": 4.620489267855006e-05,
"loss": 0.0014,
"num_input_tokens_seen": 91301696,
"step": 4675
},
{
"epoch": 5.33062562348582,
"grad_norm": 0.03180227801203728,
"learning_rate": 4.619698293195427e-05,
"loss": 0.0046,
"num_input_tokens_seen": 91399360,
"step": 4680
},
{
"epoch": 5.336326065270058,
"grad_norm": 4.16030216217041,
"learning_rate": 4.618906562970391e-05,
"loss": 0.0031,
"num_input_tokens_seen": 91497088,
"step": 4685
},
{
"epoch": 5.342026507054297,
"grad_norm": 0.0919002890586853,
"learning_rate": 4.6181140774621077e-05,
"loss": 0.0021,
"num_input_tokens_seen": 91594688,
"step": 4690
},
{
"epoch": 5.347726948838535,
"grad_norm": 4.587754249572754,
"learning_rate": 4.617320836953061e-05,
"loss": 0.0129,
"num_input_tokens_seen": 91692448,
"step": 4695
},
{
"epoch": 5.353427390622773,
"grad_norm": 0.5592033863067627,
"learning_rate": 4.6165268417259986e-05,
"loss": 0.002,
"num_input_tokens_seen": 91790160,
"step": 4700
},
{
"epoch": 5.359127832407012,
"grad_norm": 9.988340377807617,
"learning_rate": 4.6157320920639406e-05,
"loss": 0.0083,
"num_input_tokens_seen": 91887888,
"step": 4705
},
{
"epoch": 5.364828274191249,
"grad_norm": 1.1014900207519531,
"learning_rate": 4.6149365882501754e-05,
"loss": 0.0049,
"num_input_tokens_seen": 91985648,
"step": 4710
},
{
"epoch": 5.370528715975488,
"grad_norm": 0.13604551553726196,
"learning_rate": 4.614140330568261e-05,
"loss": 0.0091,
"num_input_tokens_seen": 92083408,
"step": 4715
},
{
"epoch": 5.3762291577597265,
"grad_norm": 13.926383972167969,
"learning_rate": 4.6133433193020206e-05,
"loss": 0.0367,
"num_input_tokens_seen": 92181072,
"step": 4720
},
{
"epoch": 5.381929599543964,
"grad_norm": 0.2026086002588272,
"learning_rate": 4.61254555473555e-05,
"loss": 0.0112,
"num_input_tokens_seen": 92278880,
"step": 4725
},
{
"epoch": 5.387630041328203,
"grad_norm": 0.10835447162389755,
"learning_rate": 4.6117470371532115e-05,
"loss": 0.0094,
"num_input_tokens_seen": 92376672,
"step": 4730
},
{
"epoch": 5.393330483112441,
"grad_norm": 5.342576026916504,
"learning_rate": 4.610947766839637e-05,
"loss": 0.0153,
"num_input_tokens_seen": 92474448,
"step": 4735
},
{
"epoch": 5.39903092489668,
"grad_norm": 1.6363821029663086,
"learning_rate": 4.610147744079725e-05,
"loss": 0.0046,
"num_input_tokens_seen": 92572160,
"step": 4740
},
{
"epoch": 5.404731366680918,
"grad_norm": 0.9857316613197327,
"learning_rate": 4.609346969158645e-05,
"loss": 0.0092,
"num_input_tokens_seen": 92669792,
"step": 4745
},
{
"epoch": 5.410431808465156,
"grad_norm": 0.055682141333818436,
"learning_rate": 4.60854544236183e-05,
"loss": 0.003,
"num_input_tokens_seen": 92767520,
"step": 4750
},
{
"epoch": 5.416132250249395,
"grad_norm": 0.026181025430560112,
"learning_rate": 4.607743163974987e-05,
"loss": 0.0009,
"num_input_tokens_seen": 92865344,
"step": 4755
},
{
"epoch": 5.421832692033632,
"grad_norm": 0.0219236072152853,
"learning_rate": 4.6069401342840854e-05,
"loss": 0.003,
"num_input_tokens_seen": 92963104,
"step": 4760
},
{
"epoch": 5.427533133817871,
"grad_norm": 0.19581133127212524,
"learning_rate": 4.606136353575366e-05,
"loss": 0.0008,
"num_input_tokens_seen": 93060912,
"step": 4765
},
{
"epoch": 5.4332335756021095,
"grad_norm": 0.10174310952425003,
"learning_rate": 4.6053318221353356e-05,
"loss": 0.0006,
"num_input_tokens_seen": 93158768,
"step": 4770
},
{
"epoch": 5.438934017386347,
"grad_norm": 14.586435317993164,
"learning_rate": 4.60452654025077e-05,
"loss": 0.0157,
"num_input_tokens_seen": 93256496,
"step": 4775
},
{
"epoch": 5.444634459170586,
"grad_norm": 5.632846355438232,
"learning_rate": 4.6037205082087095e-05,
"loss": 0.0196,
"num_input_tokens_seen": 93354208,
"step": 4780
},
{
"epoch": 5.450334900954824,
"grad_norm": 0.09631127119064331,
"learning_rate": 4.602913726296466e-05,
"loss": 0.0012,
"num_input_tokens_seen": 93451952,
"step": 4785
},
{
"epoch": 5.456035342739062,
"grad_norm": 5.937401294708252,
"learning_rate": 4.602106194801615e-05,
"loss": 0.0037,
"num_input_tokens_seen": 93549744,
"step": 4790
},
{
"epoch": 5.4617357845233006,
"grad_norm": 0.3016761839389801,
"learning_rate": 4.6012979140120016e-05,
"loss": 0.0026,
"num_input_tokens_seen": 93647520,
"step": 4795
},
{
"epoch": 5.467436226307539,
"grad_norm": 0.0018981621833518147,
"learning_rate": 4.600488884215737e-05,
"loss": 0.0114,
"num_input_tokens_seen": 93745280,
"step": 4800
},
{
"epoch": 5.473136668091777,
"grad_norm": 0.2602160573005676,
"learning_rate": 4.599679105701199e-05,
"loss": 0.0043,
"num_input_tokens_seen": 93842992,
"step": 4805
},
{
"epoch": 5.478837109876015,
"grad_norm": 2.016597032546997,
"learning_rate": 4.598868578757033e-05,
"loss": 0.0043,
"num_input_tokens_seen": 93940768,
"step": 4810
},
{
"epoch": 5.484537551660254,
"grad_norm": 0.06573915481567383,
"learning_rate": 4.5980573036721505e-05,
"loss": 0.0025,
"num_input_tokens_seen": 94038528,
"step": 4815
},
{
"epoch": 5.490237993444492,
"grad_norm": 1.3520628213882446,
"learning_rate": 4.597245280735731e-05,
"loss": 0.0018,
"num_input_tokens_seen": 94136224,
"step": 4820
},
{
"epoch": 5.49593843522873,
"grad_norm": 14.249088287353516,
"learning_rate": 4.59643251023722e-05,
"loss": 0.0273,
"num_input_tokens_seen": 94233888,
"step": 4825
},
{
"epoch": 5.501638877012969,
"grad_norm": 2.345010280609131,
"learning_rate": 4.595618992466328e-05,
"loss": 0.0017,
"num_input_tokens_seen": 94331568,
"step": 4830
},
{
"epoch": 5.507339318797206,
"grad_norm": 0.5609773397445679,
"learning_rate": 4.594804727713033e-05,
"loss": 0.0045,
"num_input_tokens_seen": 94429248,
"step": 4835
},
{
"epoch": 5.513039760581445,
"grad_norm": 11.708130836486816,
"learning_rate": 4.5939897162675804e-05,
"loss": 0.0603,
"num_input_tokens_seen": 94526912,
"step": 4840
},
{
"epoch": 5.5187402023656835,
"grad_norm": 0.511098325252533,
"learning_rate": 4.59317395842048e-05,
"loss": 0.0015,
"num_input_tokens_seen": 94624688,
"step": 4845
},
{
"epoch": 5.524440644149921,
"grad_norm": 0.07705602049827576,
"learning_rate": 4.592357454462508e-05,
"loss": 0.0008,
"num_input_tokens_seen": 94722496,
"step": 4850
},
{
"epoch": 5.53014108593416,
"grad_norm": 0.49463194608688354,
"learning_rate": 4.591540204684708e-05,
"loss": 0.0226,
"num_input_tokens_seen": 94820176,
"step": 4855
},
{
"epoch": 5.535841527718398,
"grad_norm": 0.13507622480392456,
"learning_rate": 4.590722209378387e-05,
"loss": 0.0033,
"num_input_tokens_seen": 94917984,
"step": 4860
},
{
"epoch": 5.541541969502637,
"grad_norm": 0.11557400226593018,
"learning_rate": 4.589903468835119e-05,
"loss": 0.0048,
"num_input_tokens_seen": 95015744,
"step": 4865
},
{
"epoch": 5.547242411286875,
"grad_norm": 3.4503333568573,
"learning_rate": 4.5890839833467455e-05,
"loss": 0.0044,
"num_input_tokens_seen": 95113504,
"step": 4870
},
{
"epoch": 5.552942853071113,
"grad_norm": 0.1271464228630066,
"learning_rate": 4.58826375320537e-05,
"loss": 0.0021,
"num_input_tokens_seen": 95211264,
"step": 4875
},
{
"epoch": 5.558643294855351,
"grad_norm": 0.09808290749788284,
"learning_rate": 4.587442778703362e-05,
"loss": 0.0011,
"num_input_tokens_seen": 95309040,
"step": 4880
},
{
"epoch": 5.564343736639589,
"grad_norm": 2.261197328567505,
"learning_rate": 4.586621060133362e-05,
"loss": 0.0024,
"num_input_tokens_seen": 95406768,
"step": 4885
},
{
"epoch": 5.570044178423828,
"grad_norm": 0.1426500380039215,
"learning_rate": 4.585798597788266e-05,
"loss": 0.003,
"num_input_tokens_seen": 95504512,
"step": 4890
},
{
"epoch": 5.5757446202080665,
"grad_norm": 0.056966375559568405,
"learning_rate": 4.584975391961242e-05,
"loss": 0.0185,
"num_input_tokens_seen": 95602240,
"step": 4895
},
{
"epoch": 5.581445061992304,
"grad_norm": 13.30504322052002,
"learning_rate": 4.584151442945725e-05,
"loss": 0.0217,
"num_input_tokens_seen": 95699968,
"step": 4900
},
{
"epoch": 5.587145503776543,
"grad_norm": 1.4918162822723389,
"learning_rate": 4.583326751035405e-05,
"loss": 0.0303,
"num_input_tokens_seen": 95797696,
"step": 4905
},
{
"epoch": 5.592845945560781,
"grad_norm": 1.1220730543136597,
"learning_rate": 4.582501316524247e-05,
"loss": 0.0019,
"num_input_tokens_seen": 95895424,
"step": 4910
},
{
"epoch": 5.598546387345019,
"grad_norm": 1.1162631511688232,
"learning_rate": 4.5816751397064764e-05,
"loss": 0.0094,
"num_input_tokens_seen": 95993056,
"step": 4915
},
{
"epoch": 5.6042468291292575,
"grad_norm": 0.10550173372030258,
"learning_rate": 4.5808482208765836e-05,
"loss": 0.0277,
"num_input_tokens_seen": 96090832,
"step": 4920
},
{
"epoch": 5.609947270913496,
"grad_norm": 1.0097618103027344,
"learning_rate": 4.580020560329322e-05,
"loss": 0.0025,
"num_input_tokens_seen": 96188544,
"step": 4925
},
{
"epoch": 5.615647712697734,
"grad_norm": 0.717867374420166,
"learning_rate": 4.579192158359712e-05,
"loss": 0.0037,
"num_input_tokens_seen": 96286368,
"step": 4930
},
{
"epoch": 5.621348154481972,
"grad_norm": 0.8814383149147034,
"learning_rate": 4.5783630152630365e-05,
"loss": 0.024,
"num_input_tokens_seen": 96384128,
"step": 4935
},
{
"epoch": 5.627048596266211,
"grad_norm": 0.2208772897720337,
"learning_rate": 4.577533131334844e-05,
"loss": 0.0187,
"num_input_tokens_seen": 96481888,
"step": 4940
},
{
"epoch": 5.632749038050449,
"grad_norm": 9.015159606933594,
"learning_rate": 4.5767025068709455e-05,
"loss": 0.0203,
"num_input_tokens_seen": 96579680,
"step": 4945
},
{
"epoch": 5.638449479834687,
"grad_norm": 0.6479278206825256,
"learning_rate": 4.5758711421674166e-05,
"loss": 0.0253,
"num_input_tokens_seen": 96677488,
"step": 4950
},
{
"epoch": 5.644149921618926,
"grad_norm": 0.05944683775305748,
"learning_rate": 4.575039037520598e-05,
"loss": 0.001,
"num_input_tokens_seen": 96775280,
"step": 4955
},
{
"epoch": 5.649850363403163,
"grad_norm": 0.376200407743454,
"learning_rate": 4.5742061932270906e-05,
"loss": 0.0041,
"num_input_tokens_seen": 96873072,
"step": 4960
},
{
"epoch": 5.655550805187402,
"grad_norm": 0.14665372669696808,
"learning_rate": 4.5733726095837634e-05,
"loss": 0.0012,
"num_input_tokens_seen": 96970912,
"step": 4965
},
{
"epoch": 5.6612512469716405,
"grad_norm": 0.11682464182376862,
"learning_rate": 4.572538286887748e-05,
"loss": 0.029,
"num_input_tokens_seen": 97068624,
"step": 4970
},
{
"epoch": 5.666951688755878,
"grad_norm": 0.12137410789728165,
"learning_rate": 4.571703225436435e-05,
"loss": 0.0007,
"num_input_tokens_seen": 97166384,
"step": 4975
},
{
"epoch": 5.672652130540117,
"grad_norm": 0.09676264226436615,
"learning_rate": 4.570867425527484e-05,
"loss": 0.0009,
"num_input_tokens_seen": 97264112,
"step": 4980
},
{
"epoch": 5.678352572324355,
"grad_norm": 0.3440670371055603,
"learning_rate": 4.570030887458815e-05,
"loss": 0.0014,
"num_input_tokens_seen": 97361872,
"step": 4985
},
{
"epoch": 5.684053014108594,
"grad_norm": 0.712581992149353,
"learning_rate": 4.569193611528612e-05,
"loss": 0.0043,
"num_input_tokens_seen": 97459616,
"step": 4990
},
{
"epoch": 5.6897534558928315,
"grad_norm": 7.326711177825928,
"learning_rate": 4.5683555980353197e-05,
"loss": 0.009,
"num_input_tokens_seen": 97557376,
"step": 4995
},
{
"epoch": 5.69545389767707,
"grad_norm": 0.251907616853714,
"learning_rate": 4.56751684727765e-05,
"loss": 0.0112,
"num_input_tokens_seen": 97655040,
"step": 5000
}
],
"logging_steps": 5,
"max_steps": 26310,
"num_input_tokens_seen": 97655040,
"num_train_epochs": 30,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.099184256070451e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}