{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7212405337179949, "eval_steps": 200, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006010337780983291, "grad_norm": 0.9436860084533691, "learning_rate": 3.0048076923076927e-06, "loss": 4.4875, "step": 5 }, { "epoch": 0.0012020675561966582, "grad_norm": 0.49743083119392395, "learning_rate": 6.0096153846153855e-06, "loss": 4.2438, "step": 10 }, { "epoch": 0.0018031013342949874, "grad_norm": 0.7305557131767273, "learning_rate": 9.014423076923076e-06, "loss": 4.7438, "step": 15 }, { "epoch": 0.0024041351123933164, "grad_norm": 1.2842742204666138, "learning_rate": 1.2019230769230771e-05, "loss": 4.3125, "step": 20 }, { "epoch": 0.0030051688904916456, "grad_norm": 1.0128940343856812, "learning_rate": 1.5024038461538462e-05, "loss": 4.2969, "step": 25 }, { "epoch": 0.003606202668589975, "grad_norm": 1.6097222566604614, "learning_rate": 1.8028846153846152e-05, "loss": 4.3625, "step": 30 }, { "epoch": 0.004207236446688304, "grad_norm": 0.7380394339561462, "learning_rate": 2.103365384615385e-05, "loss": 3.6562, "step": 35 }, { "epoch": 0.004808270224786633, "grad_norm": 2.499553918838501, "learning_rate": 2.4038461538461542e-05, "loss": 3.9656, "step": 40 }, { "epoch": 0.005409304002884962, "grad_norm": 0.9382426142692566, "learning_rate": 2.704326923076923e-05, "loss": 3.7906, "step": 45 }, { "epoch": 0.006010337780983291, "grad_norm": 0.4448552429676056, "learning_rate": 3.0048076923076925e-05, "loss": 3.4531, "step": 50 }, { "epoch": 0.00661137155908162, "grad_norm": 0.6187996864318848, "learning_rate": 3.3052884615384615e-05, "loss": 3.0406, "step": 55 }, { "epoch": 0.00721240533717995, "grad_norm": 0.4894959032535553, "learning_rate": 3.6057692307692304e-05, "loss": 2.9844, "step": 60 }, { "epoch": 0.007813439115278278, "grad_norm": 0.523160994052887, "learning_rate": 3.90625e-05, "loss": 2.825, "step": 65 }, { "epoch": 0.008414472893376608, "grad_norm": 0.41818058490753174, "learning_rate": 4.20673076923077e-05, "loss": 2.5125, "step": 70 }, { "epoch": 0.009015506671474938, "grad_norm": 0.37683457136154175, "learning_rate": 4.507211538461539e-05, "loss": 2.6844, "step": 75 }, { "epoch": 0.009616540449573266, "grad_norm": 0.428375780582428, "learning_rate": 4.8076923076923084e-05, "loss": 2.6719, "step": 80 }, { "epoch": 0.010217574227671595, "grad_norm": 0.3897765576839447, "learning_rate": 5.108173076923077e-05, "loss": 2.2531, "step": 85 }, { "epoch": 0.010818608005769925, "grad_norm": 0.2265370637178421, "learning_rate": 5.408653846153846e-05, "loss": 2.2844, "step": 90 }, { "epoch": 0.011419641783868253, "grad_norm": 0.2113611400127411, "learning_rate": 5.709134615384615e-05, "loss": 2.1922, "step": 95 }, { "epoch": 0.012020675561966582, "grad_norm": 0.1886824667453766, "learning_rate": 6.009615384615385e-05, "loss": 2.3516, "step": 100 }, { "epoch": 0.012621709340064912, "grad_norm": 0.25855502486228943, "learning_rate": 6.310096153846154e-05, "loss": 2.4, "step": 105 }, { "epoch": 0.01322274311816324, "grad_norm": 0.22833962738513947, "learning_rate": 6.610576923076923e-05, "loss": 2.2844, "step": 110 }, { "epoch": 0.01382377689626157, "grad_norm": 0.30784738063812256, "learning_rate": 6.911057692307693e-05, "loss": 2.2016, "step": 115 }, { "epoch": 0.0144248106743599, "grad_norm": 0.3998744487762451, "learning_rate": 7.211538461538461e-05, "loss": 2.4125, "step": 120 }, { "epoch": 0.015025844452458229, "grad_norm": 0.24773858487606049, "learning_rate": 7.512019230769231e-05, "loss": 2.4156, "step": 125 }, { "epoch": 0.015626878230556557, "grad_norm": 0.26020580530166626, "learning_rate": 7.8125e-05, "loss": 2.0094, "step": 130 }, { "epoch": 0.016227912008654886, "grad_norm": 0.25112366676330566, "learning_rate": 8.112980769230769e-05, "loss": 2.5969, "step": 135 }, { "epoch": 0.016828945786753216, "grad_norm": 0.3155271112918854, "learning_rate": 8.41346153846154e-05, "loss": 1.9844, "step": 140 }, { "epoch": 0.017429979564851546, "grad_norm": 0.2684473693370819, "learning_rate": 8.713942307692307e-05, "loss": 2.3594, "step": 145 }, { "epoch": 0.018031013342949875, "grad_norm": 0.19519321620464325, "learning_rate": 9.014423076923077e-05, "loss": 2.1906, "step": 150 }, { "epoch": 0.0186320471210482, "grad_norm": 0.29595857858657837, "learning_rate": 9.314903846153846e-05, "loss": 2.4844, "step": 155 }, { "epoch": 0.01923308089914653, "grad_norm": 0.21725840866565704, "learning_rate": 9.615384615384617e-05, "loss": 1.9969, "step": 160 }, { "epoch": 0.01983411467724486, "grad_norm": 0.250431627035141, "learning_rate": 9.915865384615384e-05, "loss": 2.1469, "step": 165 }, { "epoch": 0.02043514845534319, "grad_norm": 0.22979402542114258, "learning_rate": 0.00010216346153846153, "loss": 2.0891, "step": 170 }, { "epoch": 0.02103618223344152, "grad_norm": 0.29841649532318115, "learning_rate": 0.00010516826923076924, "loss": 2.0891, "step": 175 }, { "epoch": 0.02163721601153985, "grad_norm": 0.3121524155139923, "learning_rate": 0.00010817307692307693, "loss": 2.2938, "step": 180 }, { "epoch": 0.02223824978963818, "grad_norm": 0.25094497203826904, "learning_rate": 0.00011117788461538462, "loss": 2.0672, "step": 185 }, { "epoch": 0.022839283567736506, "grad_norm": 0.32229083776474, "learning_rate": 0.0001141826923076923, "loss": 1.9781, "step": 190 }, { "epoch": 0.023440317345834835, "grad_norm": 0.30247944593429565, "learning_rate": 0.0001171875, "loss": 2.4469, "step": 195 }, { "epoch": 0.024041351123933165, "grad_norm": 0.3992522358894348, "learning_rate": 0.0001201923076923077, "loss": 2.1609, "step": 200 }, { "epoch": 0.024041351123933165, "eval_loss": 2.7308592796325684, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.262, "eval_samples_per_second": 4.537, "eval_steps_per_second": 1.134, "step": 200 }, { "epoch": 0.024642384902031494, "grad_norm": 0.28425589203834534, "learning_rate": 0.0001231971153846154, "loss": 2.625, "step": 205 }, { "epoch": 0.025243418680129824, "grad_norm": 0.31964734196662903, "learning_rate": 0.00012620192307692308, "loss": 1.9328, "step": 210 }, { "epoch": 0.025844452458228154, "grad_norm": 0.37272173166275024, "learning_rate": 0.00012920673076923078, "loss": 2.1641, "step": 215 }, { "epoch": 0.02644548623632648, "grad_norm": 0.32725071907043457, "learning_rate": 0.00013221153846153846, "loss": 2.225, "step": 220 }, { "epoch": 0.02704652001442481, "grad_norm": 0.25303465127944946, "learning_rate": 0.00013521634615384616, "loss": 2.2375, "step": 225 }, { "epoch": 0.02764755379252314, "grad_norm": 0.4098326861858368, "learning_rate": 0.00013822115384615386, "loss": 2.0203, "step": 230 }, { "epoch": 0.02824858757062147, "grad_norm": 0.3435593545436859, "learning_rate": 0.00014122596153846154, "loss": 2.3016, "step": 235 }, { "epoch": 0.0288496213487198, "grad_norm": 0.4556426703929901, "learning_rate": 0.00014423076923076922, "loss": 2.2969, "step": 240 }, { "epoch": 0.029450655126818128, "grad_norm": 0.39692002534866333, "learning_rate": 0.00014723557692307692, "loss": 2.3031, "step": 245 }, { "epoch": 0.030051688904916458, "grad_norm": 0.31686538457870483, "learning_rate": 0.00015024038461538462, "loss": 2.2984, "step": 250 }, { "epoch": 0.030652722683014784, "grad_norm": 0.30815422534942627, "learning_rate": 0.00015324519230769233, "loss": 2.1734, "step": 255 }, { "epoch": 0.031253756461113114, "grad_norm": 0.3927950859069824, "learning_rate": 0.00015625, "loss": 2.0031, "step": 260 }, { "epoch": 0.03185479023921144, "grad_norm": 0.3010413944721222, "learning_rate": 0.00015925480769230768, "loss": 2.1875, "step": 265 }, { "epoch": 0.03245582401730977, "grad_norm": 0.39929866790771484, "learning_rate": 0.00016225961538461538, "loss": 2.2266, "step": 270 }, { "epoch": 0.0330568577954081, "grad_norm": 0.3709786832332611, "learning_rate": 0.00016526442307692309, "loss": 2.2344, "step": 275 }, { "epoch": 0.03365789157350643, "grad_norm": 0.38551804423332214, "learning_rate": 0.0001682692307692308, "loss": 2.0391, "step": 280 }, { "epoch": 0.03425892535160476, "grad_norm": 0.3497028350830078, "learning_rate": 0.00017127403846153847, "loss": 2.1328, "step": 285 }, { "epoch": 0.03485995912970309, "grad_norm": 0.22066070139408112, "learning_rate": 0.00017427884615384614, "loss": 1.9891, "step": 290 }, { "epoch": 0.03546099290780142, "grad_norm": 0.3861188590526581, "learning_rate": 0.00017728365384615385, "loss": 2.0266, "step": 295 }, { "epoch": 0.03606202668589975, "grad_norm": 0.43038997054100037, "learning_rate": 0.00018028846153846155, "loss": 2.2062, "step": 300 }, { "epoch": 0.03666306046399807, "grad_norm": 0.4089072644710541, "learning_rate": 0.00018329326923076922, "loss": 2.2016, "step": 305 }, { "epoch": 0.0372640942420964, "grad_norm": 0.40281516313552856, "learning_rate": 0.00018629807692307693, "loss": 2.2578, "step": 310 }, { "epoch": 0.03786512802019473, "grad_norm": 0.33316513895988464, "learning_rate": 0.0001893028846153846, "loss": 2.1844, "step": 315 }, { "epoch": 0.03846616179829306, "grad_norm": 0.4020228087902069, "learning_rate": 0.00019230769230769233, "loss": 2.2109, "step": 320 }, { "epoch": 0.03906719557639139, "grad_norm": 0.36403888463974, "learning_rate": 0.0001953125, "loss": 2.0063, "step": 325 }, { "epoch": 0.03966822935448972, "grad_norm": 0.4289080500602722, "learning_rate": 0.0001983173076923077, "loss": 2.1641, "step": 330 }, { "epoch": 0.04026926313258805, "grad_norm": 0.3827407658100128, "learning_rate": 0.0002013221153846154, "loss": 2.4125, "step": 335 }, { "epoch": 0.04087029691068638, "grad_norm": 0.28297996520996094, "learning_rate": 0.00020432692307692307, "loss": 2.2047, "step": 340 }, { "epoch": 0.04147133068878471, "grad_norm": 0.3654349744319916, "learning_rate": 0.0002073317307692308, "loss": 2.0344, "step": 345 }, { "epoch": 0.04207236446688304, "grad_norm": 0.44768983125686646, "learning_rate": 0.00021033653846153847, "loss": 2.0469, "step": 350 }, { "epoch": 0.04267339824498137, "grad_norm": 0.36050865054130554, "learning_rate": 0.00021334134615384615, "loss": 1.8203, "step": 355 }, { "epoch": 0.0432744320230797, "grad_norm": 0.41343504190444946, "learning_rate": 0.00021634615384615385, "loss": 1.9031, "step": 360 }, { "epoch": 0.04387546580117803, "grad_norm": 0.33549779653549194, "learning_rate": 0.00021935096153846153, "loss": 1.9859, "step": 365 }, { "epoch": 0.04447649957927636, "grad_norm": 0.39200559258461, "learning_rate": 0.00022235576923076923, "loss": 2.0672, "step": 370 }, { "epoch": 0.04507753335737468, "grad_norm": 0.5816010236740112, "learning_rate": 0.00022536057692307694, "loss": 2.1625, "step": 375 }, { "epoch": 0.04567856713547301, "grad_norm": 0.4004225432872772, "learning_rate": 0.0002283653846153846, "loss": 2.1297, "step": 380 }, { "epoch": 0.04627960091357134, "grad_norm": 0.3329584300518036, "learning_rate": 0.00023137019230769232, "loss": 1.8969, "step": 385 }, { "epoch": 0.04688063469166967, "grad_norm": 0.3800398111343384, "learning_rate": 0.000234375, "loss": 1.875, "step": 390 }, { "epoch": 0.047481668469768, "grad_norm": 0.5345351696014404, "learning_rate": 0.0002373798076923077, "loss": 2.0641, "step": 395 }, { "epoch": 0.04808270224786633, "grad_norm": 0.31537583470344543, "learning_rate": 0.0002403846153846154, "loss": 2.0828, "step": 400 }, { "epoch": 0.04808270224786633, "eval_loss": 2.657031297683716, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2197, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 400 }, { "epoch": 0.04868373602596466, "grad_norm": 0.3651765286922455, "learning_rate": 0.00024338942307692307, "loss": 2.2188, "step": 405 }, { "epoch": 0.04928476980406299, "grad_norm": 0.42044126987457275, "learning_rate": 0.0002463942307692308, "loss": 1.9625, "step": 410 }, { "epoch": 0.04988580358216132, "grad_norm": 0.3405047357082367, "learning_rate": 0.00024939903846153845, "loss": 2.1203, "step": 415 }, { "epoch": 0.05048683736025965, "grad_norm": 0.5022028088569641, "learning_rate": 0.00025240384615384616, "loss": 1.7672, "step": 420 }, { "epoch": 0.05108787113835798, "grad_norm": 0.31208300590515137, "learning_rate": 0.00025540865384615386, "loss": 1.9266, "step": 425 }, { "epoch": 0.05168890491645631, "grad_norm": 0.39399516582489014, "learning_rate": 0.00025841346153846156, "loss": 1.8828, "step": 430 }, { "epoch": 0.05228993869455464, "grad_norm": 0.42515093088150024, "learning_rate": 0.0002614182692307692, "loss": 1.7656, "step": 435 }, { "epoch": 0.05289097247265296, "grad_norm": 0.3947089910507202, "learning_rate": 0.0002644230769230769, "loss": 2.0484, "step": 440 }, { "epoch": 0.05349200625075129, "grad_norm": 0.6280628442764282, "learning_rate": 0.0002674278846153846, "loss": 2.1422, "step": 445 }, { "epoch": 0.05409304002884962, "grad_norm": 0.3639807105064392, "learning_rate": 0.0002704326923076923, "loss": 1.9781, "step": 450 }, { "epoch": 0.05469407380694795, "grad_norm": 0.3984295427799225, "learning_rate": 0.0002734375, "loss": 2.2359, "step": 455 }, { "epoch": 0.05529510758504628, "grad_norm": 0.33954715728759766, "learning_rate": 0.00027644230769230773, "loss": 2.3547, "step": 460 }, { "epoch": 0.05589614136314461, "grad_norm": 0.4361511468887329, "learning_rate": 0.0002794471153846154, "loss": 2.0859, "step": 465 }, { "epoch": 0.05649717514124294, "grad_norm": 0.471563458442688, "learning_rate": 0.0002824519230769231, "loss": 2.1703, "step": 470 }, { "epoch": 0.05709820891934127, "grad_norm": 0.2517772614955902, "learning_rate": 0.0002854567307692308, "loss": 2.0281, "step": 475 }, { "epoch": 0.0576992426974396, "grad_norm": 0.3190082907676697, "learning_rate": 0.00028846153846153843, "loss": 2.0234, "step": 480 }, { "epoch": 0.05830027647553793, "grad_norm": 0.37972012162208557, "learning_rate": 0.00029146634615384614, "loss": 2.15, "step": 485 }, { "epoch": 0.058901310253636256, "grad_norm": 0.37980136275291443, "learning_rate": 0.00029447115384615384, "loss": 2.1219, "step": 490 }, { "epoch": 0.059502344031734586, "grad_norm": 0.32648953795433044, "learning_rate": 0.00029747596153846154, "loss": 1.9703, "step": 495 }, { "epoch": 0.060103377809832916, "grad_norm": 0.28836116194725037, "learning_rate": 0.00030048076923076925, "loss": 2.0406, "step": 500 }, { "epoch": 0.060704411587931245, "grad_norm": 0.2953934967517853, "learning_rate": 0.00030348557692307695, "loss": 2.2156, "step": 505 }, { "epoch": 0.06130544536602957, "grad_norm": 0.4778139889240265, "learning_rate": 0.00030649038461538465, "loss": 2.0672, "step": 510 }, { "epoch": 0.0619064791441279, "grad_norm": 0.27339640259742737, "learning_rate": 0.0003094951923076923, "loss": 1.8953, "step": 515 }, { "epoch": 0.06250751292222623, "grad_norm": 0.3127667009830475, "learning_rate": 0.0003125, "loss": 2.0859, "step": 520 }, { "epoch": 0.06310854670032456, "grad_norm": 0.2676738500595093, "learning_rate": 0.0003155048076923077, "loss": 1.9656, "step": 525 }, { "epoch": 0.06370958047842289, "grad_norm": 0.3519584834575653, "learning_rate": 0.00031850961538461536, "loss": 2.0828, "step": 530 }, { "epoch": 0.06431061425652122, "grad_norm": 0.38000714778900146, "learning_rate": 0.00032151442307692306, "loss": 1.8656, "step": 535 }, { "epoch": 0.06491164803461955, "grad_norm": 0.5076779127120972, "learning_rate": 0.00032451923076923077, "loss": 1.8938, "step": 540 }, { "epoch": 0.06551268181271787, "grad_norm": 0.3919801414012909, "learning_rate": 0.00032752403846153847, "loss": 2.1203, "step": 545 }, { "epoch": 0.0661137155908162, "grad_norm": 0.3263305425643921, "learning_rate": 0.00033052884615384617, "loss": 2.0344, "step": 550 }, { "epoch": 0.06671474936891453, "grad_norm": 0.4196506440639496, "learning_rate": 0.0003335336538461539, "loss": 2.1078, "step": 555 }, { "epoch": 0.06731578314701286, "grad_norm": 0.3997637927532196, "learning_rate": 0.0003365384615384616, "loss": 1.8844, "step": 560 }, { "epoch": 0.06791681692511119, "grad_norm": 0.39547184109687805, "learning_rate": 0.00033954326923076923, "loss": 1.9406, "step": 565 }, { "epoch": 0.06851785070320952, "grad_norm": 0.36170271039009094, "learning_rate": 0.00034254807692307693, "loss": 2.2469, "step": 570 }, { "epoch": 0.06911888448130785, "grad_norm": 0.3041069507598877, "learning_rate": 0.00034555288461538463, "loss": 1.7734, "step": 575 }, { "epoch": 0.06971991825940618, "grad_norm": 0.31936579942703247, "learning_rate": 0.0003485576923076923, "loss": 2.1141, "step": 580 }, { "epoch": 0.0703209520375045, "grad_norm": 0.5643404722213745, "learning_rate": 0.0003515625, "loss": 1.9, "step": 585 }, { "epoch": 0.07092198581560284, "grad_norm": 0.43453335762023926, "learning_rate": 0.0003545673076923077, "loss": 1.5203, "step": 590 }, { "epoch": 0.07152301959370116, "grad_norm": 0.28918376564979553, "learning_rate": 0.0003575721153846154, "loss": 1.9859, "step": 595 }, { "epoch": 0.0721240533717995, "grad_norm": 0.4441574215888977, "learning_rate": 0.0003605769230769231, "loss": 1.7422, "step": 600 }, { "epoch": 0.0721240533717995, "eval_loss": 2.598828077316284, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2182, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 600 }, { "epoch": 0.07272508714989782, "grad_norm": 0.47054970264434814, "learning_rate": 0.0003635817307692308, "loss": 1.975, "step": 605 }, { "epoch": 0.07332612092799615, "grad_norm": 0.4808221459388733, "learning_rate": 0.00036658653846153845, "loss": 2.0969, "step": 610 }, { "epoch": 0.07392715470609448, "grad_norm": 0.40260159969329834, "learning_rate": 0.00036959134615384615, "loss": 2.0781, "step": 615 }, { "epoch": 0.0745281884841928, "grad_norm": 0.3565881848335266, "learning_rate": 0.00037259615384615386, "loss": 1.8562, "step": 620 }, { "epoch": 0.07512922226229114, "grad_norm": 0.41623455286026, "learning_rate": 0.00037560096153846156, "loss": 1.9688, "step": 625 }, { "epoch": 0.07573025604038947, "grad_norm": 0.442056804895401, "learning_rate": 0.0003786057692307692, "loss": 2.0531, "step": 630 }, { "epoch": 0.0763312898184878, "grad_norm": 0.5474425554275513, "learning_rate": 0.0003816105769230769, "loss": 2.1391, "step": 635 }, { "epoch": 0.07693232359658612, "grad_norm": 0.29002273082733154, "learning_rate": 0.00038461538461538467, "loss": 1.6906, "step": 640 }, { "epoch": 0.07753335737468446, "grad_norm": 0.30469194054603577, "learning_rate": 0.0003876201923076923, "loss": 1.6859, "step": 645 }, { "epoch": 0.07813439115278278, "grad_norm": 0.3932645618915558, "learning_rate": 0.000390625, "loss": 1.8328, "step": 650 }, { "epoch": 0.07873542493088112, "grad_norm": 0.4049251079559326, "learning_rate": 0.0003936298076923077, "loss": 1.8672, "step": 655 }, { "epoch": 0.07933645870897944, "grad_norm": 0.4889291524887085, "learning_rate": 0.0003966346153846154, "loss": 2.0531, "step": 660 }, { "epoch": 0.07993749248707778, "grad_norm": 0.38475117087364197, "learning_rate": 0.0003996394230769231, "loss": 1.8422, "step": 665 }, { "epoch": 0.0805385262651761, "grad_norm": 0.34599217772483826, "learning_rate": 0.0004026442307692308, "loss": 1.8391, "step": 670 }, { "epoch": 0.08113956004327443, "grad_norm": 0.39600178599357605, "learning_rate": 0.00040564903846153843, "loss": 1.8484, "step": 675 }, { "epoch": 0.08174059382137276, "grad_norm": 0.3293285071849823, "learning_rate": 0.00040865384615384613, "loss": 1.6656, "step": 680 }, { "epoch": 0.08234162759947108, "grad_norm": 0.37310031056404114, "learning_rate": 0.00041165865384615384, "loss": 1.9609, "step": 685 }, { "epoch": 0.08294266137756942, "grad_norm": 0.41512343287467957, "learning_rate": 0.0004146634615384616, "loss": 1.9937, "step": 690 }, { "epoch": 0.08354369515566774, "grad_norm": 0.47950249910354614, "learning_rate": 0.00041766826923076924, "loss": 1.9109, "step": 695 }, { "epoch": 0.08414472893376608, "grad_norm": 0.4324653744697571, "learning_rate": 0.00042067307692307695, "loss": 1.9953, "step": 700 }, { "epoch": 0.0847457627118644, "grad_norm": 0.3693973422050476, "learning_rate": 0.00042367788461538465, "loss": 1.9016, "step": 705 }, { "epoch": 0.08534679648996274, "grad_norm": 0.33113107085227966, "learning_rate": 0.0004266826923076923, "loss": 2.2266, "step": 710 }, { "epoch": 0.08594783026806106, "grad_norm": 0.5808571577072144, "learning_rate": 0.0004296875, "loss": 1.5063, "step": 715 }, { "epoch": 0.0865488640461594, "grad_norm": 0.3792312443256378, "learning_rate": 0.0004326923076923077, "loss": 1.8016, "step": 720 }, { "epoch": 0.08714989782425772, "grad_norm": 0.43698450922966003, "learning_rate": 0.00043569711538461535, "loss": 1.7219, "step": 725 }, { "epoch": 0.08775093160235606, "grad_norm": 0.43264222145080566, "learning_rate": 0.00043870192307692306, "loss": 1.7234, "step": 730 }, { "epoch": 0.08835196538045438, "grad_norm": 0.5246540307998657, "learning_rate": 0.0004417067307692308, "loss": 1.7531, "step": 735 }, { "epoch": 0.08895299915855272, "grad_norm": 0.2953200936317444, "learning_rate": 0.00044471153846153846, "loss": 1.9438, "step": 740 }, { "epoch": 0.08955403293665104, "grad_norm": 0.39238616824150085, "learning_rate": 0.00044771634615384617, "loss": 1.7172, "step": 745 }, { "epoch": 0.09015506671474936, "grad_norm": 0.4887576401233673, "learning_rate": 0.00045072115384615387, "loss": 1.9594, "step": 750 }, { "epoch": 0.0907561004928477, "grad_norm": 0.391634076833725, "learning_rate": 0.0004537259615384616, "loss": 1.8406, "step": 755 }, { "epoch": 0.09135713427094602, "grad_norm": 0.4006985127925873, "learning_rate": 0.0004567307692307692, "loss": 1.7984, "step": 760 }, { "epoch": 0.09195816804904436, "grad_norm": 0.3601657748222351, "learning_rate": 0.0004597355769230769, "loss": 1.9844, "step": 765 }, { "epoch": 0.09255920182714268, "grad_norm": 0.5057326555252075, "learning_rate": 0.00046274038461538463, "loss": 1.6703, "step": 770 }, { "epoch": 0.09316023560524102, "grad_norm": 0.5787122845649719, "learning_rate": 0.0004657451923076923, "loss": 1.8984, "step": 775 }, { "epoch": 0.09376126938333934, "grad_norm": 0.4849441945552826, "learning_rate": 0.00046875, "loss": 1.8672, "step": 780 }, { "epoch": 0.09436230316143768, "grad_norm": 0.44167378544807434, "learning_rate": 0.00047175480769230774, "loss": 1.6422, "step": 785 }, { "epoch": 0.094963336939536, "grad_norm": 0.6295076608657837, "learning_rate": 0.0004747596153846154, "loss": 1.6875, "step": 790 }, { "epoch": 0.09556437071763434, "grad_norm": 0.4804101586341858, "learning_rate": 0.0004777644230769231, "loss": 1.8203, "step": 795 }, { "epoch": 0.09616540449573266, "grad_norm": 0.4898495674133301, "learning_rate": 0.0004807692307692308, "loss": 1.9891, "step": 800 }, { "epoch": 0.09616540449573266, "eval_loss": 2.535351514816284, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1754, "eval_samples_per_second": 4.549, "eval_steps_per_second": 1.137, "step": 800 }, { "epoch": 0.096766438273831, "grad_norm": 0.43688085675239563, "learning_rate": 0.00048377403846153845, "loss": 1.7234, "step": 805 }, { "epoch": 0.09736747205192932, "grad_norm": 0.5891087651252747, "learning_rate": 0.00048677884615384615, "loss": 1.7969, "step": 810 }, { "epoch": 0.09796850583002764, "grad_norm": 0.5140319466590881, "learning_rate": 0.0004897836538461539, "loss": 2.0719, "step": 815 }, { "epoch": 0.09856953960812598, "grad_norm": 0.40886375308036804, "learning_rate": 0.0004927884615384616, "loss": 2.0891, "step": 820 }, { "epoch": 0.0991705733862243, "grad_norm": 0.3513309955596924, "learning_rate": 0.0004957932692307692, "loss": 1.8453, "step": 825 }, { "epoch": 0.09977160716432264, "grad_norm": 0.5530559420585632, "learning_rate": 0.0004987980769230769, "loss": 1.675, "step": 830 }, { "epoch": 0.10037264094242096, "grad_norm": 0.4348265528678894, "learning_rate": 0.0004999999983630302, "loss": 1.7891, "step": 835 }, { "epoch": 0.1009736747205193, "grad_norm": 0.5396342277526855, "learning_rate": 0.0004999999883593255, "loss": 1.9047, "step": 840 }, { "epoch": 0.10157470849861762, "grad_norm": 0.5154384970664978, "learning_rate": 0.0004999999692613442, "loss": 1.8844, "step": 845 }, { "epoch": 0.10217574227671596, "grad_norm": 0.29072120785713196, "learning_rate": 0.0004999999410690872, "loss": 1.6531, "step": 850 }, { "epoch": 0.10277677605481428, "grad_norm": 0.4125816822052002, "learning_rate": 0.0004999999037825552, "loss": 1.9031, "step": 855 }, { "epoch": 0.10337780983291261, "grad_norm": 0.34915369749069214, "learning_rate": 0.0004999998574017497, "loss": 1.8609, "step": 860 }, { "epoch": 0.10397884361101094, "grad_norm": 0.3622804284095764, "learning_rate": 0.0004999998019266724, "loss": 1.7484, "step": 865 }, { "epoch": 0.10457987738910927, "grad_norm": 0.36787149310112, "learning_rate": 0.0004999997373573254, "loss": 1.7812, "step": 870 }, { "epoch": 0.1051809111672076, "grad_norm": 0.4469545781612396, "learning_rate": 0.0004999996636937108, "loss": 1.5484, "step": 875 }, { "epoch": 0.10578194494530592, "grad_norm": 0.30026400089263916, "learning_rate": 0.0004999995809358316, "loss": 1.6703, "step": 880 }, { "epoch": 0.10638297872340426, "grad_norm": 0.4870736002922058, "learning_rate": 0.0004999994890836904, "loss": 1.7547, "step": 885 }, { "epoch": 0.10698401250150258, "grad_norm": 0.6516287326812744, "learning_rate": 0.000499999388137291, "loss": 1.7891, "step": 890 }, { "epoch": 0.10758504627960092, "grad_norm": 0.2974604368209839, "learning_rate": 0.0004999992780966368, "loss": 1.8359, "step": 895 }, { "epoch": 0.10818608005769924, "grad_norm": 0.3521243929862976, "learning_rate": 0.0004999991589617318, "loss": 1.9141, "step": 900 }, { "epoch": 0.10878711383579757, "grad_norm": 0.38353726267814636, "learning_rate": 0.0004999990307325803, "loss": 1.775, "step": 905 }, { "epoch": 0.1093881476138959, "grad_norm": 0.46048542857170105, "learning_rate": 0.0004999988934091872, "loss": 1.7297, "step": 910 }, { "epoch": 0.10998918139199423, "grad_norm": 0.4313719570636749, "learning_rate": 0.0004999987469915573, "loss": 1.2891, "step": 915 }, { "epoch": 0.11059021517009256, "grad_norm": 0.5933486223220825, "learning_rate": 0.0004999985914796961, "loss": 1.6938, "step": 920 }, { "epoch": 0.1111912489481909, "grad_norm": 0.5271236300468445, "learning_rate": 0.000499998426873609, "loss": 1.8, "step": 925 }, { "epoch": 0.11179228272628922, "grad_norm": 0.3807511031627655, "learning_rate": 0.0004999982531733022, "loss": 1.3086, "step": 930 }, { "epoch": 0.11239331650438755, "grad_norm": 0.4684934914112091, "learning_rate": 0.0004999980703787819, "loss": 1.4875, "step": 935 }, { "epoch": 0.11299435028248588, "grad_norm": 0.5648980140686035, "learning_rate": 0.0004999978784900549, "loss": 1.6578, "step": 940 }, { "epoch": 0.1135953840605842, "grad_norm": 0.4021349549293518, "learning_rate": 0.0004999976775071278, "loss": 1.8266, "step": 945 }, { "epoch": 0.11419641783868253, "grad_norm": 0.3722395598888397, "learning_rate": 0.0004999974674300084, "loss": 1.8969, "step": 950 }, { "epoch": 0.11479745161678086, "grad_norm": 0.407781720161438, "learning_rate": 0.000499997248258704, "loss": 1.6562, "step": 955 }, { "epoch": 0.1153984853948792, "grad_norm": 0.44156748056411743, "learning_rate": 0.0004999970199932229, "loss": 2.0688, "step": 960 }, { "epoch": 0.11599951917297752, "grad_norm": 0.40020808577537537, "learning_rate": 0.000499996782633573, "loss": 1.5047, "step": 965 }, { "epoch": 0.11660055295107585, "grad_norm": 0.38710176944732666, "learning_rate": 0.0004999965361797633, "loss": 1.7367, "step": 970 }, { "epoch": 0.11720158672917418, "grad_norm": 0.344836562871933, "learning_rate": 0.0004999962806318025, "loss": 1.7828, "step": 975 }, { "epoch": 0.11780262050727251, "grad_norm": 0.3811284899711609, "learning_rate": 0.0004999960159897, "loss": 1.7766, "step": 980 }, { "epoch": 0.11840365428537084, "grad_norm": 0.5141933560371399, "learning_rate": 0.0004999957422534654, "loss": 1.75, "step": 985 }, { "epoch": 0.11900468806346917, "grad_norm": 0.37530529499053955, "learning_rate": 0.0004999954594231088, "loss": 2.0922, "step": 990 }, { "epoch": 0.1196057218415675, "grad_norm": 0.41129302978515625, "learning_rate": 0.0004999951674986401, "loss": 1.5781, "step": 995 }, { "epoch": 0.12020675561966583, "grad_norm": 0.3869934380054474, "learning_rate": 0.0004999948664800704, "loss": 1.7422, "step": 1000 }, { "epoch": 0.12020675561966583, "eval_loss": 2.4908204078674316, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1997, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 1000 }, { "epoch": 0.12080778939776415, "grad_norm": 0.36643335223197937, "learning_rate": 0.0004999945563674105, "loss": 1.6797, "step": 1005 }, { "epoch": 0.12140882317586249, "grad_norm": 0.45910894870758057, "learning_rate": 0.0004999942371606714, "loss": 1.7063, "step": 1010 }, { "epoch": 0.12200985695396081, "grad_norm": 0.350729763507843, "learning_rate": 0.0004999939088598652, "loss": 1.6344, "step": 1015 }, { "epoch": 0.12261089073205914, "grad_norm": 0.46493440866470337, "learning_rate": 0.0004999935714650034, "loss": 1.9641, "step": 1020 }, { "epoch": 0.12321192451015747, "grad_norm": 0.42726650834083557, "learning_rate": 0.0004999932249760984, "loss": 1.7094, "step": 1025 }, { "epoch": 0.1238129582882558, "grad_norm": 0.28014904260635376, "learning_rate": 0.000499992869393163, "loss": 1.8516, "step": 1030 }, { "epoch": 0.12441399206635413, "grad_norm": 0.4522114098072052, "learning_rate": 0.0004999925047162099, "loss": 1.3961, "step": 1035 }, { "epoch": 0.12501502584445245, "grad_norm": 0.46475955843925476, "learning_rate": 0.0004999921309452526, "loss": 1.4062, "step": 1040 }, { "epoch": 0.1256160596225508, "grad_norm": 0.44490954279899597, "learning_rate": 0.0004999917480803044, "loss": 1.6719, "step": 1045 }, { "epoch": 0.12621709340064913, "grad_norm": 0.40904587507247925, "learning_rate": 0.0004999913561213793, "loss": 1.7734, "step": 1050 }, { "epoch": 0.12681812717874744, "grad_norm": 0.36412525177001953, "learning_rate": 0.0004999909550684918, "loss": 1.2594, "step": 1055 }, { "epoch": 0.12741916095684577, "grad_norm": 0.7560976147651672, "learning_rate": 0.0004999905449216563, "loss": 1.6047, "step": 1060 }, { "epoch": 0.1280201947349441, "grad_norm": 0.5383388996124268, "learning_rate": 0.0004999901256808878, "loss": 1.6016, "step": 1065 }, { "epoch": 0.12862122851304245, "grad_norm": 0.5255587100982666, "learning_rate": 0.0004999896973462012, "loss": 1.7828, "step": 1070 }, { "epoch": 0.12922226229114075, "grad_norm": 0.4830612242221832, "learning_rate": 0.0004999892599176127, "loss": 1.8781, "step": 1075 }, { "epoch": 0.1298232960692391, "grad_norm": 0.3687385618686676, "learning_rate": 0.0004999888133951377, "loss": 1.4797, "step": 1080 }, { "epoch": 0.13042432984733743, "grad_norm": 0.3518010675907135, "learning_rate": 0.0004999883577787927, "loss": 1.7234, "step": 1085 }, { "epoch": 0.13102536362543574, "grad_norm": 0.4522668719291687, "learning_rate": 0.0004999878930685943, "loss": 1.675, "step": 1090 }, { "epoch": 0.13162639740353407, "grad_norm": 0.3153088390827179, "learning_rate": 0.0004999874192645592, "loss": 1.7328, "step": 1095 }, { "epoch": 0.1322274311816324, "grad_norm": 0.4520825147628784, "learning_rate": 0.0004999869363667048, "loss": 1.925, "step": 1100 }, { "epoch": 0.13282846495973075, "grad_norm": 0.3040079176425934, "learning_rate": 0.0004999864443750486, "loss": 1.6922, "step": 1105 }, { "epoch": 0.13342949873782906, "grad_norm": 0.5198135375976562, "learning_rate": 0.0004999859432896084, "loss": 1.6562, "step": 1110 }, { "epoch": 0.1340305325159274, "grad_norm": 0.30772989988327026, "learning_rate": 0.0004999854331104028, "loss": 1.8078, "step": 1115 }, { "epoch": 0.13463156629402573, "grad_norm": 0.39027324318885803, "learning_rate": 0.0004999849138374498, "loss": 1.625, "step": 1120 }, { "epoch": 0.13523260007212407, "grad_norm": 0.4438004195690155, "learning_rate": 0.0004999843854707688, "loss": 1.5414, "step": 1125 }, { "epoch": 0.13583363385022237, "grad_norm": 0.4966782033443451, "learning_rate": 0.0004999838480103787, "loss": 1.4836, "step": 1130 }, { "epoch": 0.1364346676283207, "grad_norm": 0.5602577328681946, "learning_rate": 0.0004999833014562992, "loss": 1.3961, "step": 1135 }, { "epoch": 0.13703570140641905, "grad_norm": 0.5276179909706116, "learning_rate": 0.0004999827458085502, "loss": 1.8422, "step": 1140 }, { "epoch": 0.13763673518451738, "grad_norm": 0.4706065058708191, "learning_rate": 0.0004999821810671518, "loss": 1.7109, "step": 1145 }, { "epoch": 0.1382377689626157, "grad_norm": 0.38341307640075684, "learning_rate": 0.0004999816072321245, "loss": 1.8859, "step": 1150 }, { "epoch": 0.13883880274071403, "grad_norm": 0.5754373073577881, "learning_rate": 0.0004999810243034894, "loss": 1.8, "step": 1155 }, { "epoch": 0.13943983651881237, "grad_norm": 0.5003094673156738, "learning_rate": 0.0004999804322812676, "loss": 1.6766, "step": 1160 }, { "epoch": 0.14004087029691067, "grad_norm": 0.31239280104637146, "learning_rate": 0.0004999798311654805, "loss": 1.775, "step": 1165 }, { "epoch": 0.140641904075009, "grad_norm": 0.3998953700065613, "learning_rate": 0.0004999792209561501, "loss": 1.7516, "step": 1170 }, { "epoch": 0.14124293785310735, "grad_norm": 0.3099336624145508, "learning_rate": 0.0004999786016532986, "loss": 1.8422, "step": 1175 }, { "epoch": 0.14184397163120568, "grad_norm": 0.48160257935523987, "learning_rate": 0.0004999779732569485, "loss": 1.6062, "step": 1180 }, { "epoch": 0.142445005409304, "grad_norm": 0.5494711399078369, "learning_rate": 0.0004999773357671227, "loss": 1.5906, "step": 1185 }, { "epoch": 0.14304603918740233, "grad_norm": 0.7721512913703918, "learning_rate": 0.0004999766891838444, "loss": 1.7734, "step": 1190 }, { "epoch": 0.14364707296550067, "grad_norm": 0.5135265588760376, "learning_rate": 0.000499976033507137, "loss": 1.4812, "step": 1195 }, { "epoch": 0.144248106743599, "grad_norm": 0.7913392186164856, "learning_rate": 0.0004999753687370245, "loss": 1.5484, "step": 1200 }, { "epoch": 0.144248106743599, "eval_loss": 2.5082030296325684, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2256, "eval_samples_per_second": 4.542, "eval_steps_per_second": 1.136, "step": 1200 }, { "epoch": 0.1448491405216973, "grad_norm": 0.6069223880767822, "learning_rate": 0.0004999746948735308, "loss": 1.4484, "step": 1205 }, { "epoch": 0.14545017429979565, "grad_norm": 0.4137849807739258, "learning_rate": 0.0004999740119166809, "loss": 1.6719, "step": 1210 }, { "epoch": 0.14605120807789398, "grad_norm": 0.7047042846679688, "learning_rate": 0.0004999733198664992, "loss": 1.5312, "step": 1215 }, { "epoch": 0.1466522418559923, "grad_norm": 0.5389900207519531, "learning_rate": 0.0004999726187230111, "loss": 1.4297, "step": 1220 }, { "epoch": 0.14725327563409063, "grad_norm": 0.5395992994308472, "learning_rate": 0.0004999719084862421, "loss": 1.6328, "step": 1225 }, { "epoch": 0.14785430941218897, "grad_norm": 0.43566471338272095, "learning_rate": 0.0004999711891562179, "loss": 1.7094, "step": 1230 }, { "epoch": 0.1484553431902873, "grad_norm": 0.3409474194049835, "learning_rate": 0.0004999704607329648, "loss": 1.6656, "step": 1235 }, { "epoch": 0.1490563769683856, "grad_norm": 0.5498088002204895, "learning_rate": 0.0004999697232165092, "loss": 1.6016, "step": 1240 }, { "epoch": 0.14965741074648395, "grad_norm": 0.567551851272583, "learning_rate": 0.000499968976606878, "loss": 1.6828, "step": 1245 }, { "epoch": 0.15025844452458229, "grad_norm": 0.4866923987865448, "learning_rate": 0.0004999682209040983, "loss": 1.6547, "step": 1250 }, { "epoch": 0.15085947830268062, "grad_norm": 0.3780736029148102, "learning_rate": 0.0004999674561081977, "loss": 1.6719, "step": 1255 }, { "epoch": 0.15146051208077893, "grad_norm": 0.3219822347164154, "learning_rate": 0.0004999666822192039, "loss": 1.4195, "step": 1260 }, { "epoch": 0.15206154585887727, "grad_norm": 0.3056913912296295, "learning_rate": 0.0004999658992371451, "loss": 1.7484, "step": 1265 }, { "epoch": 0.1526625796369756, "grad_norm": 0.4860096573829651, "learning_rate": 0.0004999651071620499, "loss": 1.6516, "step": 1270 }, { "epoch": 0.15326361341507394, "grad_norm": 0.4047755002975464, "learning_rate": 0.0004999643059939469, "loss": 1.6984, "step": 1275 }, { "epoch": 0.15386464719317225, "grad_norm": 0.27880361676216125, "learning_rate": 0.0004999634957328652, "loss": 1.8078, "step": 1280 }, { "epoch": 0.15446568097127059, "grad_norm": 0.4087715148925781, "learning_rate": 0.0004999626763788346, "loss": 1.6422, "step": 1285 }, { "epoch": 0.15506671474936892, "grad_norm": 0.556612491607666, "learning_rate": 0.0004999618479318847, "loss": 1.5359, "step": 1290 }, { "epoch": 0.15566774852746723, "grad_norm": 0.5415599346160889, "learning_rate": 0.0004999610103920457, "loss": 1.5641, "step": 1295 }, { "epoch": 0.15626878230556557, "grad_norm": 0.48660141229629517, "learning_rate": 0.0004999601637593479, "loss": 1.5, "step": 1300 }, { "epoch": 0.1568698160836639, "grad_norm": 0.5874481797218323, "learning_rate": 0.0004999593080338224, "loss": 1.3844, "step": 1305 }, { "epoch": 0.15747084986176224, "grad_norm": 0.3727753460407257, "learning_rate": 0.0004999584432155, "loss": 1.8125, "step": 1310 }, { "epoch": 0.15807188363986055, "grad_norm": 0.35395169258117676, "learning_rate": 0.0004999575693044124, "loss": 1.3305, "step": 1315 }, { "epoch": 0.1586729174179589, "grad_norm": 0.7356476783752441, "learning_rate": 0.0004999566863005912, "loss": 1.7078, "step": 1320 }, { "epoch": 0.15927395119605722, "grad_norm": 0.4838991165161133, "learning_rate": 0.0004999557942040687, "loss": 1.5969, "step": 1325 }, { "epoch": 0.15987498497415556, "grad_norm": 0.4504292905330658, "learning_rate": 0.0004999548930148773, "loss": 1.4555, "step": 1330 }, { "epoch": 0.16047601875225387, "grad_norm": 0.5174041390419006, "learning_rate": 0.0004999539827330497, "loss": 1.5266, "step": 1335 }, { "epoch": 0.1610770525303522, "grad_norm": 0.42709511518478394, "learning_rate": 0.000499953063358619, "loss": 1.4344, "step": 1340 }, { "epoch": 0.16167808630845054, "grad_norm": 0.34575751423835754, "learning_rate": 0.0004999521348916189, "loss": 1.5219, "step": 1345 }, { "epoch": 0.16227912008654885, "grad_norm": 0.4409041404724121, "learning_rate": 0.0004999511973320829, "loss": 1.6172, "step": 1350 }, { "epoch": 0.1628801538646472, "grad_norm": 0.37874963879585266, "learning_rate": 0.0004999502506800452, "loss": 1.3156, "step": 1355 }, { "epoch": 0.16348118764274552, "grad_norm": 0.39675143361091614, "learning_rate": 0.0004999492949355401, "loss": 1.7672, "step": 1360 }, { "epoch": 0.16408222142084386, "grad_norm": 0.4887191951274872, "learning_rate": 0.0004999483300986027, "loss": 1.6578, "step": 1365 }, { "epoch": 0.16468325519894217, "grad_norm": 0.5052289366722107, "learning_rate": 0.000499947356169268, "loss": 1.5766, "step": 1370 }, { "epoch": 0.1652842889770405, "grad_norm": 0.3420865833759308, "learning_rate": 0.000499946373147571, "loss": 1.4281, "step": 1375 }, { "epoch": 0.16588532275513884, "grad_norm": 0.6112978458404541, "learning_rate": 0.0004999453810335479, "loss": 1.4234, "step": 1380 }, { "epoch": 0.16648635653323718, "grad_norm": 0.46144208312034607, "learning_rate": 0.0004999443798272348, "loss": 1.4609, "step": 1385 }, { "epoch": 0.1670873903113355, "grad_norm": 0.5132108926773071, "learning_rate": 0.000499943369528668, "loss": 1.5656, "step": 1390 }, { "epoch": 0.16768842408943382, "grad_norm": 0.5717546939849854, "learning_rate": 0.000499942350137884, "loss": 1.3617, "step": 1395 }, { "epoch": 0.16828945786753216, "grad_norm": 0.4766351580619812, "learning_rate": 0.0004999413216549203, "loss": 1.5016, "step": 1400 }, { "epoch": 0.16828945786753216, "eval_loss": 2.3990235328674316, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.19, "eval_samples_per_second": 4.547, "eval_steps_per_second": 1.137, "step": 1400 }, { "epoch": 0.1688904916456305, "grad_norm": 0.43601974844932556, "learning_rate": 0.0004999402840798142, "loss": 1.4156, "step": 1405 }, { "epoch": 0.1694915254237288, "grad_norm": 0.6105599403381348, "learning_rate": 0.0004999392374126034, "loss": 1.7, "step": 1410 }, { "epoch": 0.17009255920182714, "grad_norm": 0.4957026243209839, "learning_rate": 0.0004999381816533259, "loss": 1.7969, "step": 1415 }, { "epoch": 0.17069359297992548, "grad_norm": 0.44195666909217834, "learning_rate": 0.0004999371168020201, "loss": 1.4375, "step": 1420 }, { "epoch": 0.1712946267580238, "grad_norm": 0.45855048298835754, "learning_rate": 0.0004999360428587249, "loss": 1.6141, "step": 1425 }, { "epoch": 0.17189566053612212, "grad_norm": 0.6269901990890503, "learning_rate": 0.0004999349598234792, "loss": 1.3953, "step": 1430 }, { "epoch": 0.17249669431422046, "grad_norm": 0.3805680274963379, "learning_rate": 0.0004999338676963225, "loss": 1.6484, "step": 1435 }, { "epoch": 0.1730977280923188, "grad_norm": 0.6604627966880798, "learning_rate": 0.0004999327664772945, "loss": 1.5969, "step": 1440 }, { "epoch": 0.1736987618704171, "grad_norm": 0.4411623179912567, "learning_rate": 0.0004999316561664353, "loss": 1.2609, "step": 1445 }, { "epoch": 0.17429979564851544, "grad_norm": 0.5301747918128967, "learning_rate": 0.0004999305367637852, "loss": 1.6141, "step": 1450 }, { "epoch": 0.17490082942661378, "grad_norm": 0.5128594040870667, "learning_rate": 0.000499929408269385, "loss": 1.6187, "step": 1455 }, { "epoch": 0.17550186320471212, "grad_norm": 0.596217155456543, "learning_rate": 0.0004999282706832758, "loss": 1.4531, "step": 1460 }, { "epoch": 0.17610289698281043, "grad_norm": 0.45486292243003845, "learning_rate": 0.0004999271240054987, "loss": 1.4012, "step": 1465 }, { "epoch": 0.17670393076090876, "grad_norm": 0.6031058430671692, "learning_rate": 0.0004999259682360957, "loss": 1.6203, "step": 1470 }, { "epoch": 0.1773049645390071, "grad_norm": 0.4107096493244171, "learning_rate": 0.0004999248033751088, "loss": 1.7312, "step": 1475 }, { "epoch": 0.17790599831710543, "grad_norm": 0.46700888872146606, "learning_rate": 0.0004999236294225803, "loss": 1.5234, "step": 1480 }, { "epoch": 0.17850703209520374, "grad_norm": 0.7690737247467041, "learning_rate": 0.000499922446378553, "loss": 1.307, "step": 1485 }, { "epoch": 0.17910806587330208, "grad_norm": 0.5420579314231873, "learning_rate": 0.0004999212542430698, "loss": 1.6562, "step": 1490 }, { "epoch": 0.17970909965140042, "grad_norm": 0.4624311625957489, "learning_rate": 0.0004999200530161742, "loss": 1.3234, "step": 1495 }, { "epoch": 0.18031013342949873, "grad_norm": 0.4610016345977783, "learning_rate": 0.0004999188426979097, "loss": 1.5516, "step": 1500 }, { "epoch": 0.18091116720759706, "grad_norm": 0.5131213068962097, "learning_rate": 0.0004999176232883206, "loss": 1.5867, "step": 1505 }, { "epoch": 0.1815122009856954, "grad_norm": 0.5673689842224121, "learning_rate": 0.0004999163947874511, "loss": 1.5078, "step": 1510 }, { "epoch": 0.18211323476379374, "grad_norm": 0.7008316516876221, "learning_rate": 0.000499915157195346, "loss": 1.5562, "step": 1515 }, { "epoch": 0.18271426854189204, "grad_norm": 0.5652767419815063, "learning_rate": 0.00049991391051205, "loss": 1.4469, "step": 1520 }, { "epoch": 0.18331530231999038, "grad_norm": 0.5506184101104736, "learning_rate": 0.0004999126547376089, "loss": 1.4531, "step": 1525 }, { "epoch": 0.18391633609808872, "grad_norm": 0.5806117057800293, "learning_rate": 0.000499911389872068, "loss": 1.7594, "step": 1530 }, { "epoch": 0.18451736987618705, "grad_norm": 0.5860136151313782, "learning_rate": 0.0004999101159154736, "loss": 1.5562, "step": 1535 }, { "epoch": 0.18511840365428536, "grad_norm": 0.5575783252716064, "learning_rate": 0.000499908832867872, "loss": 1.6391, "step": 1540 }, { "epoch": 0.1857194374323837, "grad_norm": 0.3992920219898224, "learning_rate": 0.0004999075407293096, "loss": 1.3859, "step": 1545 }, { "epoch": 0.18632047121048204, "grad_norm": 0.8294938206672668, "learning_rate": 0.0004999062394998336, "loss": 1.25, "step": 1550 }, { "epoch": 0.18692150498858034, "grad_norm": 0.6560512185096741, "learning_rate": 0.0004999049291794915, "loss": 1.4453, "step": 1555 }, { "epoch": 0.18752253876667868, "grad_norm": 0.5583436489105225, "learning_rate": 0.0004999036097683307, "loss": 1.3969, "step": 1560 }, { "epoch": 0.18812357254477702, "grad_norm": 0.6256234645843506, "learning_rate": 0.0004999022812663993, "loss": 1.518, "step": 1565 }, { "epoch": 0.18872460632287535, "grad_norm": 0.5769176483154297, "learning_rate": 0.0004999009436737457, "loss": 1.6609, "step": 1570 }, { "epoch": 0.18932564010097366, "grad_norm": 0.6486324071884155, "learning_rate": 0.0004998995969904183, "loss": 1.3172, "step": 1575 }, { "epoch": 0.189926673879072, "grad_norm": 0.34935474395751953, "learning_rate": 0.0004998982412164663, "loss": 1.5562, "step": 1580 }, { "epoch": 0.19052770765717034, "grad_norm": 0.5806995630264282, "learning_rate": 0.000499896876351939, "loss": 1.6219, "step": 1585 }, { "epoch": 0.19112874143526867, "grad_norm": 0.6906558275222778, "learning_rate": 0.0004998955023968862, "loss": 1.5172, "step": 1590 }, { "epoch": 0.19172977521336698, "grad_norm": 0.49730750918388367, "learning_rate": 0.0004998941193513575, "loss": 1.6797, "step": 1595 }, { "epoch": 0.19233080899146532, "grad_norm": 0.5871158242225647, "learning_rate": 0.0004998927272154036, "loss": 1.6125, "step": 1600 }, { "epoch": 0.19233080899146532, "eval_loss": 2.360156297683716, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1647, "eval_samples_per_second": 4.55, "eval_steps_per_second": 1.138, "step": 1600 }, { "epoch": 0.19293184276956366, "grad_norm": 0.3994157910346985, "learning_rate": 0.000499891325989075, "loss": 1.3305, "step": 1605 }, { "epoch": 0.193532876547662, "grad_norm": 0.3497280180454254, "learning_rate": 0.0004998899156724224, "loss": 1.3531, "step": 1610 }, { "epoch": 0.1941339103257603, "grad_norm": 0.4835513234138489, "learning_rate": 0.0004998884962654976, "loss": 1.293, "step": 1615 }, { "epoch": 0.19473494410385864, "grad_norm": 0.4717245101928711, "learning_rate": 0.0004998870677683519, "loss": 1.3742, "step": 1620 }, { "epoch": 0.19533597788195697, "grad_norm": 0.3917827308177948, "learning_rate": 0.0004998856301810373, "loss": 1.5719, "step": 1625 }, { "epoch": 0.19593701166005528, "grad_norm": 0.4725429117679596, "learning_rate": 0.0004998841835036061, "loss": 1.3859, "step": 1630 }, { "epoch": 0.19653804543815362, "grad_norm": 0.4728795289993286, "learning_rate": 0.0004998827277361111, "loss": 1.4203, "step": 1635 }, { "epoch": 0.19713907921625196, "grad_norm": 0.6246328949928284, "learning_rate": 0.000499881262878605, "loss": 1.7719, "step": 1640 }, { "epoch": 0.1977401129943503, "grad_norm": 0.7019891738891602, "learning_rate": 0.0004998797889311413, "loss": 1.3781, "step": 1645 }, { "epoch": 0.1983411467724486, "grad_norm": 0.2940036654472351, "learning_rate": 0.0004998783058937735, "loss": 1.4148, "step": 1650 }, { "epoch": 0.19894218055054694, "grad_norm": 0.434410959482193, "learning_rate": 0.0004998768137665556, "loss": 1.6094, "step": 1655 }, { "epoch": 0.19954321432864527, "grad_norm": 0.5853382349014282, "learning_rate": 0.0004998753125495418, "loss": 1.4125, "step": 1660 }, { "epoch": 0.2001442481067436, "grad_norm": 0.5105974078178406, "learning_rate": 0.0004998738022427867, "loss": 1.3313, "step": 1665 }, { "epoch": 0.20074528188484192, "grad_norm": 0.4266336262226105, "learning_rate": 0.0004998722828463455, "loss": 1.5953, "step": 1670 }, { "epoch": 0.20134631566294026, "grad_norm": 0.4918626844882965, "learning_rate": 0.0004998707543602731, "loss": 1.8383, "step": 1675 }, { "epoch": 0.2019473494410386, "grad_norm": 0.4804850220680237, "learning_rate": 0.0004998692167846253, "loss": 1.1484, "step": 1680 }, { "epoch": 0.20254838321913693, "grad_norm": 0.5131824612617493, "learning_rate": 0.0004998676701194581, "loss": 1.7109, "step": 1685 }, { "epoch": 0.20314941699723524, "grad_norm": 0.4895535111427307, "learning_rate": 0.0004998661143648277, "loss": 1.7453, "step": 1690 }, { "epoch": 0.20375045077533357, "grad_norm": 0.4180288314819336, "learning_rate": 0.0004998645495207906, "loss": 1.0766, "step": 1695 }, { "epoch": 0.2043514845534319, "grad_norm": 0.4888496696949005, "learning_rate": 0.0004998629755874037, "loss": 1.5359, "step": 1700 }, { "epoch": 0.20495251833153022, "grad_norm": 0.666147768497467, "learning_rate": 0.0004998613925647245, "loss": 1.5609, "step": 1705 }, { "epoch": 0.20555355210962856, "grad_norm": 0.563382625579834, "learning_rate": 0.0004998598004528103, "loss": 1.4187, "step": 1710 }, { "epoch": 0.2061545858877269, "grad_norm": 0.619296669960022, "learning_rate": 0.0004998581992517192, "loss": 1.3367, "step": 1715 }, { "epoch": 0.20675561966582523, "grad_norm": 0.928014874458313, "learning_rate": 0.0004998565889615096, "loss": 1.4094, "step": 1720 }, { "epoch": 0.20735665344392354, "grad_norm": 0.4932372272014618, "learning_rate": 0.0004998549695822397, "loss": 1.3719, "step": 1725 }, { "epoch": 0.20795768722202188, "grad_norm": 0.6022034287452698, "learning_rate": 0.0004998533411139685, "loss": 1.5781, "step": 1730 }, { "epoch": 0.2085587210001202, "grad_norm": 0.41716283559799194, "learning_rate": 0.0004998517035567554, "loss": 1.1914, "step": 1735 }, { "epoch": 0.20915975477821855, "grad_norm": 0.4988159239292145, "learning_rate": 0.0004998500569106599, "loss": 1.475, "step": 1740 }, { "epoch": 0.20976078855631686, "grad_norm": 0.4242478907108307, "learning_rate": 0.0004998484011757419, "loss": 1.2859, "step": 1745 }, { "epoch": 0.2103618223344152, "grad_norm": 0.5382992625236511, "learning_rate": 0.0004998467363520617, "loss": 1.3687, "step": 1750 }, { "epoch": 0.21096285611251353, "grad_norm": 0.31303003430366516, "learning_rate": 0.0004998450624396797, "loss": 1.9281, "step": 1755 }, { "epoch": 0.21156388989061184, "grad_norm": 0.5793948173522949, "learning_rate": 0.0004998433794386569, "loss": 1.457, "step": 1760 }, { "epoch": 0.21216492366871018, "grad_norm": 0.48824676871299744, "learning_rate": 0.0004998416873490544, "loss": 1.5359, "step": 1765 }, { "epoch": 0.2127659574468085, "grad_norm": 0.5384695529937744, "learning_rate": 0.000499839986170934, "loss": 1.4461, "step": 1770 }, { "epoch": 0.21336699122490685, "grad_norm": 0.5212387442588806, "learning_rate": 0.0004998382759043574, "loss": 1.2844, "step": 1775 }, { "epoch": 0.21396802500300516, "grad_norm": 0.5552918910980225, "learning_rate": 0.0004998365565493868, "loss": 1.5516, "step": 1780 }, { "epoch": 0.2145690587811035, "grad_norm": 0.5672168135643005, "learning_rate": 0.0004998348281060848, "loss": 1.6297, "step": 1785 }, { "epoch": 0.21517009255920183, "grad_norm": 0.620464026927948, "learning_rate": 0.0004998330905745143, "loss": 1.5047, "step": 1790 }, { "epoch": 0.21577112633730017, "grad_norm": 0.5900077819824219, "learning_rate": 0.0004998313439547384, "loss": 1.2367, "step": 1795 }, { "epoch": 0.21637216011539848, "grad_norm": 0.5305217504501343, "learning_rate": 0.0004998295882468209, "loss": 1.5906, "step": 1800 }, { "epoch": 0.21637216011539848, "eval_loss": 2.288867235183716, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2003, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 1800 }, { "epoch": 0.2169731938934968, "grad_norm": 0.5836020112037659, "learning_rate": 0.0004998278234508253, "loss": 1.4891, "step": 1805 }, { "epoch": 0.21757422767159515, "grad_norm": 0.3793884813785553, "learning_rate": 0.0004998260495668161, "loss": 1.3328, "step": 1810 }, { "epoch": 0.21817526144969349, "grad_norm": 0.5394117832183838, "learning_rate": 0.0004998242665948577, "loss": 1.368, "step": 1815 }, { "epoch": 0.2187762952277918, "grad_norm": 0.39613473415374756, "learning_rate": 0.0004998224745350148, "loss": 1.2285, "step": 1820 }, { "epoch": 0.21937732900589013, "grad_norm": 0.543116569519043, "learning_rate": 0.0004998206733873529, "loss": 1.5078, "step": 1825 }, { "epoch": 0.21997836278398847, "grad_norm": 0.4901551306247711, "learning_rate": 0.0004998188631519375, "loss": 1.4516, "step": 1830 }, { "epoch": 0.22057939656208678, "grad_norm": 0.5067916512489319, "learning_rate": 0.0004998170438288342, "loss": 1.5719, "step": 1835 }, { "epoch": 0.2211804303401851, "grad_norm": 0.4343029856681824, "learning_rate": 0.0004998152154181093, "loss": 1.3766, "step": 1840 }, { "epoch": 0.22178146411828345, "grad_norm": 0.5296164155006409, "learning_rate": 0.0004998133779198293, "loss": 1.3625, "step": 1845 }, { "epoch": 0.2223824978963818, "grad_norm": 0.4429774284362793, "learning_rate": 0.0004998115313340611, "loss": 1.3891, "step": 1850 }, { "epoch": 0.2229835316744801, "grad_norm": 0.5772582292556763, "learning_rate": 0.0004998096756608719, "loss": 1.5437, "step": 1855 }, { "epoch": 0.22358456545257843, "grad_norm": 0.5951064825057983, "learning_rate": 0.0004998078109003291, "loss": 1.4672, "step": 1860 }, { "epoch": 0.22418559923067677, "grad_norm": 0.3261686861515045, "learning_rate": 0.0004998059370525006, "loss": 1.5063, "step": 1865 }, { "epoch": 0.2247866330087751, "grad_norm": 0.3098331689834595, "learning_rate": 0.0004998040541174545, "loss": 1.5094, "step": 1870 }, { "epoch": 0.22538766678687341, "grad_norm": 0.8590214252471924, "learning_rate": 0.0004998021620952593, "loss": 1.3977, "step": 1875 }, { "epoch": 0.22598870056497175, "grad_norm": 0.5078855752944946, "learning_rate": 0.0004998002609859839, "loss": 1.2789, "step": 1880 }, { "epoch": 0.2265897343430701, "grad_norm": 0.4515461027622223, "learning_rate": 0.0004997983507896976, "loss": 1.368, "step": 1885 }, { "epoch": 0.2271907681211684, "grad_norm": 0.4937264025211334, "learning_rate": 0.0004997964315064695, "loss": 1.1953, "step": 1890 }, { "epoch": 0.22779180189926673, "grad_norm": 0.6028769612312317, "learning_rate": 0.0004997945031363697, "loss": 1.4859, "step": 1895 }, { "epoch": 0.22839283567736507, "grad_norm": 0.4746128022670746, "learning_rate": 0.0004997925656794683, "loss": 1.6016, "step": 1900 }, { "epoch": 0.2289938694554634, "grad_norm": 0.519091010093689, "learning_rate": 0.0004997906191358358, "loss": 1.3906, "step": 1905 }, { "epoch": 0.22959490323356171, "grad_norm": 0.4584903419017792, "learning_rate": 0.0004997886635055429, "loss": 1.3258, "step": 1910 }, { "epoch": 0.23019593701166005, "grad_norm": 0.7446622252464294, "learning_rate": 0.0004997866987886608, "loss": 1.2141, "step": 1915 }, { "epoch": 0.2307969707897584, "grad_norm": 0.5405495166778564, "learning_rate": 0.0004997847249852609, "loss": 1.4359, "step": 1920 }, { "epoch": 0.23139800456785672, "grad_norm": 0.38187775015830994, "learning_rate": 0.0004997827420954152, "loss": 1.7219, "step": 1925 }, { "epoch": 0.23199903834595503, "grad_norm": 0.503364622592926, "learning_rate": 0.0004997807501191957, "loss": 1.3586, "step": 1930 }, { "epoch": 0.23260007212405337, "grad_norm": 0.43855008482933044, "learning_rate": 0.0004997787490566749, "loss": 1.7625, "step": 1935 }, { "epoch": 0.2332011059021517, "grad_norm": 0.4955185651779175, "learning_rate": 0.0004997767389079255, "loss": 1.2281, "step": 1940 }, { "epoch": 0.23380213968025004, "grad_norm": 0.7726651430130005, "learning_rate": 0.0004997747196730206, "loss": 1.5445, "step": 1945 }, { "epoch": 0.23440317345834835, "grad_norm": 0.38199684023857117, "learning_rate": 0.000499772691352034, "loss": 1.4203, "step": 1950 }, { "epoch": 0.2350042072364467, "grad_norm": 0.4838792383670807, "learning_rate": 0.000499770653945039, "loss": 1.2484, "step": 1955 }, { "epoch": 0.23560524101454502, "grad_norm": 0.43874993920326233, "learning_rate": 0.00049976860745211, "loss": 1.3594, "step": 1960 }, { "epoch": 0.23620627479264333, "grad_norm": 0.4992177188396454, "learning_rate": 0.0004997665518733215, "loss": 1.1977, "step": 1965 }, { "epoch": 0.23680730857074167, "grad_norm": 0.526907742023468, "learning_rate": 0.000499764487208748, "loss": 1.1609, "step": 1970 }, { "epoch": 0.23740834234884, "grad_norm": 0.599902868270874, "learning_rate": 0.000499762413458465, "loss": 1.4203, "step": 1975 }, { "epoch": 0.23800937612693834, "grad_norm": 0.42601317167282104, "learning_rate": 0.0004997603306225475, "loss": 1.1516, "step": 1980 }, { "epoch": 0.23861040990503665, "grad_norm": 0.3787403404712677, "learning_rate": 0.0004997582387010715, "loss": 1.3391, "step": 1985 }, { "epoch": 0.239211443683135, "grad_norm": 0.5586139559745789, "learning_rate": 0.0004997561376941131, "loss": 1.6656, "step": 1990 }, { "epoch": 0.23981247746123333, "grad_norm": 0.44761109352111816, "learning_rate": 0.0004997540276017487, "loss": 1.5828, "step": 1995 }, { "epoch": 0.24041351123933166, "grad_norm": 0.4291538894176483, "learning_rate": 0.000499751908424055, "loss": 1.4539, "step": 2000 }, { "epoch": 0.24041351123933166, "eval_loss": 2.2626953125, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2012, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 2000 }, { "epoch": 0.24101454501742997, "grad_norm": 0.46680477261543274, "learning_rate": 0.0004997497801611093, "loss": 1.2609, "step": 2005 }, { "epoch": 0.2416155787955283, "grad_norm": 0.42086416482925415, "learning_rate": 0.0004997476428129887, "loss": 1.1609, "step": 2010 }, { "epoch": 0.24221661257362664, "grad_norm": 0.7524279952049255, "learning_rate": 0.0004997454963797713, "loss": 1.0633, "step": 2015 }, { "epoch": 0.24281764635172498, "grad_norm": 0.43722498416900635, "learning_rate": 0.0004997433408615349, "loss": 1.2969, "step": 2020 }, { "epoch": 0.2434186801298233, "grad_norm": 0.2848932147026062, "learning_rate": 0.0004997411762583581, "loss": 1.2063, "step": 2025 }, { "epoch": 0.24401971390792163, "grad_norm": 0.4349381923675537, "learning_rate": 0.0004997390025703194, "loss": 1.3625, "step": 2030 }, { "epoch": 0.24462074768601996, "grad_norm": 0.4666562080383301, "learning_rate": 0.0004997368197974982, "loss": 1.4164, "step": 2035 }, { "epoch": 0.24522178146411827, "grad_norm": 0.5730391144752502, "learning_rate": 0.0004997346279399736, "loss": 1.1633, "step": 2040 }, { "epoch": 0.2458228152422166, "grad_norm": 0.5395126938819885, "learning_rate": 0.0004997324269978255, "loss": 1.2398, "step": 2045 }, { "epoch": 0.24642384902031494, "grad_norm": 0.3828608989715576, "learning_rate": 0.000499730216971134, "loss": 1.0594, "step": 2050 }, { "epoch": 0.24702488279841328, "grad_norm": 0.796903133392334, "learning_rate": 0.0004997279978599794, "loss": 1.3055, "step": 2055 }, { "epoch": 0.2476259165765116, "grad_norm": 0.35091638565063477, "learning_rate": 0.0004997257696644424, "loss": 1.2023, "step": 2060 }, { "epoch": 0.24822695035460993, "grad_norm": 0.46753543615341187, "learning_rate": 0.000499723532384604, "loss": 1.3203, "step": 2065 }, { "epoch": 0.24882798413270826, "grad_norm": 0.5231248736381531, "learning_rate": 0.0004997212860205459, "loss": 1.3438, "step": 2070 }, { "epoch": 0.2494290179108066, "grad_norm": 0.470639169216156, "learning_rate": 0.0004997190305723495, "loss": 1.3031, "step": 2075 }, { "epoch": 0.2500300516889049, "grad_norm": 0.4669177234172821, "learning_rate": 0.000499716766040097, "loss": 1.5188, "step": 2080 }, { "epoch": 0.25063108546700325, "grad_norm": 0.5113319754600525, "learning_rate": 0.0004997144924238706, "loss": 1.0992, "step": 2085 }, { "epoch": 0.2512321192451016, "grad_norm": 0.5395264625549316, "learning_rate": 0.0004997122097237533, "loss": 1.3281, "step": 2090 }, { "epoch": 0.2518331530231999, "grad_norm": 0.47676244378089905, "learning_rate": 0.0004997099179398279, "loss": 1.2898, "step": 2095 }, { "epoch": 0.25243418680129825, "grad_norm": 0.3385642468929291, "learning_rate": 0.0004997076170721778, "loss": 1.2078, "step": 2100 }, { "epoch": 0.25303522057939654, "grad_norm": 0.3868078887462616, "learning_rate": 0.0004997053071208868, "loss": 1.4563, "step": 2105 }, { "epoch": 0.2536362543574949, "grad_norm": 0.437321275472641, "learning_rate": 0.0004997029880860389, "loss": 1.3977, "step": 2110 }, { "epoch": 0.2542372881355932, "grad_norm": 0.6515981554985046, "learning_rate": 0.0004997006599677183, "loss": 1.2461, "step": 2115 }, { "epoch": 0.25483832191369155, "grad_norm": 0.3654949367046356, "learning_rate": 0.0004996983227660099, "loss": 1.4187, "step": 2120 }, { "epoch": 0.2554393556917899, "grad_norm": 0.5203860998153687, "learning_rate": 0.0004996959764809987, "loss": 1.4328, "step": 2125 }, { "epoch": 0.2560403894698882, "grad_norm": 0.5454062223434448, "learning_rate": 0.00049969362111277, "loss": 1.5125, "step": 2130 }, { "epoch": 0.25664142324798656, "grad_norm": 0.5460174679756165, "learning_rate": 0.0004996912566614094, "loss": 1.4344, "step": 2135 }, { "epoch": 0.2572424570260849, "grad_norm": 0.4798714816570282, "learning_rate": 0.000499688883127003, "loss": 1.1953, "step": 2140 }, { "epoch": 0.2578434908041832, "grad_norm": 0.679547905921936, "learning_rate": 0.0004996865005096372, "loss": 1.2688, "step": 2145 }, { "epoch": 0.2584445245822815, "grad_norm": 0.42334336042404175, "learning_rate": 0.0004996841088093985, "loss": 1.1516, "step": 2150 }, { "epoch": 0.25904555836037985, "grad_norm": 0.4171724021434784, "learning_rate": 0.000499681708026374, "loss": 1.0961, "step": 2155 }, { "epoch": 0.2596465921384782, "grad_norm": 0.6091195940971375, "learning_rate": 0.0004996792981606511, "loss": 1.0164, "step": 2160 }, { "epoch": 0.2602476259165765, "grad_norm": 0.7312507033348083, "learning_rate": 0.0004996768792123173, "loss": 1.3031, "step": 2165 }, { "epoch": 0.26084865969467486, "grad_norm": 0.8120207786560059, "learning_rate": 0.0004996744511814609, "loss": 1.1641, "step": 2170 }, { "epoch": 0.2614496934727732, "grad_norm": 0.4702399969100952, "learning_rate": 0.0004996720140681699, "loss": 1.2805, "step": 2175 }, { "epoch": 0.2620507272508715, "grad_norm": 0.45239925384521484, "learning_rate": 0.0004996695678725331, "loss": 1.5539, "step": 2180 }, { "epoch": 0.2626517610289698, "grad_norm": 0.6370692253112793, "learning_rate": 0.0004996671125946394, "loss": 1.2156, "step": 2185 }, { "epoch": 0.26325279480706815, "grad_norm": 0.6115698218345642, "learning_rate": 0.0004996646482345781, "loss": 1.2891, "step": 2190 }, { "epoch": 0.2638538285851665, "grad_norm": 0.611488401889801, "learning_rate": 0.0004996621747924391, "loss": 1.3023, "step": 2195 }, { "epoch": 0.2644548623632648, "grad_norm": 0.6977550983428955, "learning_rate": 0.0004996596922683122, "loss": 1.3555, "step": 2200 }, { "epoch": 0.2644548623632648, "eval_loss": 2.2613282203674316, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2156, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 2200 }, { "epoch": 0.26505589614136316, "grad_norm": 0.6270340085029602, "learning_rate": 0.0004996572006622876, "loss": 1.5938, "step": 2205 }, { "epoch": 0.2656569299194615, "grad_norm": 0.5670061707496643, "learning_rate": 0.0004996546999744561, "loss": 1.5016, "step": 2210 }, { "epoch": 0.26625796369755983, "grad_norm": 0.38163918256759644, "learning_rate": 0.0004996521902049086, "loss": 1.2812, "step": 2215 }, { "epoch": 0.2668589974756581, "grad_norm": 0.45828545093536377, "learning_rate": 0.0004996496713537365, "loss": 1.3023, "step": 2220 }, { "epoch": 0.26746003125375645, "grad_norm": 0.4318217933177948, "learning_rate": 0.0004996471434210312, "loss": 1.6039, "step": 2225 }, { "epoch": 0.2680610650318548, "grad_norm": 0.5099067091941833, "learning_rate": 0.0004996446064068848, "loss": 1.5562, "step": 2230 }, { "epoch": 0.2686620988099531, "grad_norm": 0.7253368496894836, "learning_rate": 0.0004996420603113897, "loss": 1.2523, "step": 2235 }, { "epoch": 0.26926313258805146, "grad_norm": 0.6101372838020325, "learning_rate": 0.0004996395051346384, "loss": 1.4125, "step": 2240 }, { "epoch": 0.2698641663661498, "grad_norm": 0.5073166489601135, "learning_rate": 0.0004996369408767238, "loss": 1.1109, "step": 2245 }, { "epoch": 0.27046520014424813, "grad_norm": 0.4978417456150055, "learning_rate": 0.0004996343675377393, "loss": 1.3438, "step": 2250 }, { "epoch": 0.2710662339223464, "grad_norm": 0.695686936378479, "learning_rate": 0.0004996317851177784, "loss": 1.0445, "step": 2255 }, { "epoch": 0.27166726770044475, "grad_norm": 0.5276048183441162, "learning_rate": 0.000499629193616935, "loss": 1.2703, "step": 2260 }, { "epoch": 0.2722683014785431, "grad_norm": 0.7686821222305298, "learning_rate": 0.0004996265930353036, "loss": 1.2656, "step": 2265 }, { "epoch": 0.2728693352566414, "grad_norm": 0.673497200012207, "learning_rate": 0.0004996239833729786, "loss": 1.4055, "step": 2270 }, { "epoch": 0.27347036903473976, "grad_norm": 0.4770069122314453, "learning_rate": 0.000499621364630055, "loss": 1.1227, "step": 2275 }, { "epoch": 0.2740714028128381, "grad_norm": 0.630565881729126, "learning_rate": 0.000499618736806628, "loss": 1.293, "step": 2280 }, { "epoch": 0.27467243659093643, "grad_norm": 0.5288265943527222, "learning_rate": 0.0004996160999027933, "loss": 1.5109, "step": 2285 }, { "epoch": 0.27527347036903477, "grad_norm": 0.35486194491386414, "learning_rate": 0.0004996134539186469, "loss": 1.5078, "step": 2290 }, { "epoch": 0.27587450414713305, "grad_norm": 0.5654587745666504, "learning_rate": 0.0004996107988542847, "loss": 1.625, "step": 2295 }, { "epoch": 0.2764755379252314, "grad_norm": 0.40694040060043335, "learning_rate": 0.0004996081347098037, "loss": 1.4531, "step": 2300 }, { "epoch": 0.2770765717033297, "grad_norm": 0.5765879154205322, "learning_rate": 0.0004996054614853005, "loss": 1.343, "step": 2305 }, { "epoch": 0.27767760548142806, "grad_norm": 0.49710384011268616, "learning_rate": 0.0004996027791808725, "loss": 1.3266, "step": 2310 }, { "epoch": 0.2782786392595264, "grad_norm": 0.5011634826660156, "learning_rate": 0.0004996000877966172, "loss": 1.3438, "step": 2315 }, { "epoch": 0.27887967303762473, "grad_norm": 0.6307665705680847, "learning_rate": 0.0004995973873326326, "loss": 1.5703, "step": 2320 }, { "epoch": 0.27948070681572307, "grad_norm": 0.46662095189094543, "learning_rate": 0.0004995946777890169, "loss": 1.4414, "step": 2325 }, { "epoch": 0.28008174059382135, "grad_norm": 0.49989181756973267, "learning_rate": 0.0004995919591658687, "loss": 1.3789, "step": 2330 }, { "epoch": 0.2806827743719197, "grad_norm": 0.4880094528198242, "learning_rate": 0.0004995892314632867, "loss": 1.2633, "step": 2335 }, { "epoch": 0.281283808150018, "grad_norm": 0.6314132213592529, "learning_rate": 0.0004995864946813703, "loss": 1.5539, "step": 2340 }, { "epoch": 0.28188484192811636, "grad_norm": 0.7073726654052734, "learning_rate": 0.0004995837488202191, "loss": 1.3766, "step": 2345 }, { "epoch": 0.2824858757062147, "grad_norm": 0.5559587478637695, "learning_rate": 0.0004995809938799329, "loss": 1.4875, "step": 2350 }, { "epoch": 0.28308690948431303, "grad_norm": 0.4955267906188965, "learning_rate": 0.0004995782298606119, "loss": 1.3156, "step": 2355 }, { "epoch": 0.28368794326241137, "grad_norm": 0.4989592432975769, "learning_rate": 0.0004995754567623567, "loss": 1.2484, "step": 2360 }, { "epoch": 0.28428897704050965, "grad_norm": 0.5886387228965759, "learning_rate": 0.0004995726745852681, "loss": 1.2344, "step": 2365 }, { "epoch": 0.284890010818608, "grad_norm": 0.5085893273353577, "learning_rate": 0.0004995698833294474, "loss": 1.407, "step": 2370 }, { "epoch": 0.2854910445967063, "grad_norm": 0.4706375002861023, "learning_rate": 0.000499567082994996, "loss": 1.3117, "step": 2375 }, { "epoch": 0.28609207837480466, "grad_norm": 0.5287367701530457, "learning_rate": 0.000499564273582016, "loss": 1.2594, "step": 2380 }, { "epoch": 0.286693112152903, "grad_norm": 0.5483081936836243, "learning_rate": 0.0004995614550906093, "loss": 1.1008, "step": 2385 }, { "epoch": 0.28729414593100133, "grad_norm": 0.8154200911521912, "learning_rate": 0.0004995586275208788, "loss": 1.5164, "step": 2390 }, { "epoch": 0.28789517970909967, "grad_norm": 0.837818443775177, "learning_rate": 0.000499555790872927, "loss": 1.2531, "step": 2395 }, { "epoch": 0.288496213487198, "grad_norm": 0.4989728033542633, "learning_rate": 0.0004995529451468574, "loss": 1.3719, "step": 2400 }, { "epoch": 0.288496213487198, "eval_loss": 2.189453125, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.202, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 2400 }, { "epoch": 0.2890972472652963, "grad_norm": 0.5615506768226624, "learning_rate": 0.0004995500903427732, "loss": 1.1023, "step": 2405 }, { "epoch": 0.2896982810433946, "grad_norm": 0.7758134007453918, "learning_rate": 0.0004995472264607784, "loss": 1.2625, "step": 2410 }, { "epoch": 0.29029931482149296, "grad_norm": 0.6751444935798645, "learning_rate": 0.0004995443535009773, "loss": 1.5734, "step": 2415 }, { "epoch": 0.2909003485995913, "grad_norm": 0.5839786529541016, "learning_rate": 0.0004995414714634743, "loss": 1.3625, "step": 2420 }, { "epoch": 0.29150138237768963, "grad_norm": 0.5906524062156677, "learning_rate": 0.0004995385803483742, "loss": 1.0875, "step": 2425 }, { "epoch": 0.29210241615578797, "grad_norm": 0.7597156763076782, "learning_rate": 0.0004995356801557821, "loss": 1.4781, "step": 2430 }, { "epoch": 0.2927034499338863, "grad_norm": 0.5112520456314087, "learning_rate": 0.0004995327708858038, "loss": 1.2758, "step": 2435 }, { "epoch": 0.2933044837119846, "grad_norm": 0.44212523102760315, "learning_rate": 0.0004995298525385447, "loss": 1.5094, "step": 2440 }, { "epoch": 0.2939055174900829, "grad_norm": 0.43641284108161926, "learning_rate": 0.0004995269251141114, "loss": 1.1656, "step": 2445 }, { "epoch": 0.29450655126818126, "grad_norm": 0.4382478892803192, "learning_rate": 0.0004995239886126102, "loss": 1.5023, "step": 2450 }, { "epoch": 0.2951075850462796, "grad_norm": 0.6196442246437073, "learning_rate": 0.0004995210430341478, "loss": 1.2875, "step": 2455 }, { "epoch": 0.29570861882437793, "grad_norm": 0.6048389673233032, "learning_rate": 0.0004995180883788316, "loss": 0.9516, "step": 2460 }, { "epoch": 0.29630965260247627, "grad_norm": 0.5682608485221863, "learning_rate": 0.0004995151246467689, "loss": 1.3422, "step": 2465 }, { "epoch": 0.2969106863805746, "grad_norm": 0.5677405595779419, "learning_rate": 0.0004995121518380674, "loss": 1.5016, "step": 2470 }, { "epoch": 0.29751172015867294, "grad_norm": 0.48005715012550354, "learning_rate": 0.0004995091699528355, "loss": 1.3219, "step": 2475 }, { "epoch": 0.2981127539367712, "grad_norm": 0.48294246196746826, "learning_rate": 0.0004995061789911817, "loss": 1.2516, "step": 2480 }, { "epoch": 0.29871378771486956, "grad_norm": 0.7167287468910217, "learning_rate": 0.0004995031789532147, "loss": 1.3531, "step": 2485 }, { "epoch": 0.2993148214929679, "grad_norm": 0.5675193667411804, "learning_rate": 0.0004995001698390434, "loss": 1.3648, "step": 2490 }, { "epoch": 0.29991585527106623, "grad_norm": 0.5264390707015991, "learning_rate": 0.0004994971516487775, "loss": 1.1133, "step": 2495 }, { "epoch": 0.30051688904916457, "grad_norm": 0.5506901144981384, "learning_rate": 0.0004994941243825269, "loss": 1.1594, "step": 2500 }, { "epoch": 0.3011179228272629, "grad_norm": 0.9272066950798035, "learning_rate": 0.0004994910880404015, "loss": 1.4906, "step": 2505 }, { "epoch": 0.30171895660536124, "grad_norm": 0.5853176712989807, "learning_rate": 0.0004994880426225119, "loss": 1.3508, "step": 2510 }, { "epoch": 0.3023199903834595, "grad_norm": 0.4796172082424164, "learning_rate": 0.0004994849881289687, "loss": 1.3484, "step": 2515 }, { "epoch": 0.30292102416155786, "grad_norm": 0.6331420540809631, "learning_rate": 0.0004994819245598833, "loss": 1.2188, "step": 2520 }, { "epoch": 0.3035220579396562, "grad_norm": 0.6519079208374023, "learning_rate": 0.000499478851915367, "loss": 1.3531, "step": 2525 }, { "epoch": 0.30412309171775453, "grad_norm": 0.6366649866104126, "learning_rate": 0.0004994757701955314, "loss": 1.1703, "step": 2530 }, { "epoch": 0.30472412549585287, "grad_norm": 0.5621868371963501, "learning_rate": 0.0004994726794004888, "loss": 1.0441, "step": 2535 }, { "epoch": 0.3053251592739512, "grad_norm": 0.6726334095001221, "learning_rate": 0.0004994695795303517, "loss": 1.2984, "step": 2540 }, { "epoch": 0.30592619305204954, "grad_norm": 0.5448851585388184, "learning_rate": 0.0004994664705852326, "loss": 0.8781, "step": 2545 }, { "epoch": 0.3065272268301479, "grad_norm": 0.6853761076927185, "learning_rate": 0.0004994633525652448, "loss": 1.6891, "step": 2550 }, { "epoch": 0.30712826060824616, "grad_norm": 0.5627267956733704, "learning_rate": 0.0004994602254705017, "loss": 1.368, "step": 2555 }, { "epoch": 0.3077292943863445, "grad_norm": 0.38999640941619873, "learning_rate": 0.0004994570893011171, "loss": 1.3789, "step": 2560 }, { "epoch": 0.30833032816444284, "grad_norm": 0.6671114563941956, "learning_rate": 0.000499453944057205, "loss": 1.4078, "step": 2565 }, { "epoch": 0.30893136194254117, "grad_norm": 0.5521063208580017, "learning_rate": 0.0004994507897388798, "loss": 1.5859, "step": 2570 }, { "epoch": 0.3095323957206395, "grad_norm": 0.6885313391685486, "learning_rate": 0.0004994476263462563, "loss": 1.2578, "step": 2575 }, { "epoch": 0.31013342949873784, "grad_norm": 0.45498156547546387, "learning_rate": 0.0004994444538794495, "loss": 1.3914, "step": 2580 }, { "epoch": 0.3107344632768362, "grad_norm": 0.5482655167579651, "learning_rate": 0.0004994412723385749, "loss": 1.3391, "step": 2585 }, { "epoch": 0.31133549705493446, "grad_norm": 0.5240392684936523, "learning_rate": 0.0004994380817237482, "loss": 1.25, "step": 2590 }, { "epoch": 0.3119365308330328, "grad_norm": 0.5129856467247009, "learning_rate": 0.0004994348820350854, "loss": 1.4406, "step": 2595 }, { "epoch": 0.31253756461113114, "grad_norm": 0.5252668261528015, "learning_rate": 0.000499431673272703, "loss": 1.0773, "step": 2600 }, { "epoch": 0.31253756461113114, "eval_loss": 2.1500000953674316, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1975, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 2600 }, { "epoch": 0.31313859838922947, "grad_norm": 0.6648097634315491, "learning_rate": 0.0004994284554367176, "loss": 1.1133, "step": 2605 }, { "epoch": 0.3137396321673278, "grad_norm": 0.6218547224998474, "learning_rate": 0.0004994252285272465, "loss": 1.2937, "step": 2610 }, { "epoch": 0.31434066594542615, "grad_norm": 0.6880519390106201, "learning_rate": 0.0004994219925444068, "loss": 1.8039, "step": 2615 }, { "epoch": 0.3149416997235245, "grad_norm": 0.6464706063270569, "learning_rate": 0.0004994187474883164, "loss": 1.5594, "step": 2620 }, { "epoch": 0.3155427335016228, "grad_norm": 0.7200093865394592, "learning_rate": 0.0004994154933590932, "loss": 1.0945, "step": 2625 }, { "epoch": 0.3161437672797211, "grad_norm": 0.6853864789009094, "learning_rate": 0.0004994122301568557, "loss": 1.268, "step": 2630 }, { "epoch": 0.31674480105781944, "grad_norm": 0.5081961750984192, "learning_rate": 0.0004994089578817226, "loss": 1.4062, "step": 2635 }, { "epoch": 0.3173458348359178, "grad_norm": 0.4750553071498871, "learning_rate": 0.0004994056765338129, "loss": 1.2828, "step": 2640 }, { "epoch": 0.3179468686140161, "grad_norm": 0.5867997407913208, "learning_rate": 0.0004994023861132459, "loss": 1.2484, "step": 2645 }, { "epoch": 0.31854790239211445, "grad_norm": 0.7348740696907043, "learning_rate": 0.0004993990866201414, "loss": 1.2258, "step": 2650 }, { "epoch": 0.3191489361702128, "grad_norm": 0.5523998141288757, "learning_rate": 0.0004993957780546193, "loss": 1.2805, "step": 2655 }, { "epoch": 0.3197499699483111, "grad_norm": 0.5308116674423218, "learning_rate": 0.0004993924604168001, "loss": 1.3188, "step": 2660 }, { "epoch": 0.3203510037264094, "grad_norm": 0.40592867136001587, "learning_rate": 0.0004993891337068046, "loss": 1.2148, "step": 2665 }, { "epoch": 0.32095203750450774, "grad_norm": 0.6522583365440369, "learning_rate": 0.0004993857979247535, "loss": 1.175, "step": 2670 }, { "epoch": 0.3215530712826061, "grad_norm": 0.5981694459915161, "learning_rate": 0.0004993824530707682, "loss": 1.143, "step": 2675 }, { "epoch": 0.3221541050607044, "grad_norm": 0.6832042932510376, "learning_rate": 0.0004993790991449707, "loss": 1.2242, "step": 2680 }, { "epoch": 0.32275513883880275, "grad_norm": 0.6935708522796631, "learning_rate": 0.0004993757361474825, "loss": 0.9617, "step": 2685 }, { "epoch": 0.3233561726169011, "grad_norm": 0.5491186380386353, "learning_rate": 0.0004993723640784265, "loss": 1.3672, "step": 2690 }, { "epoch": 0.3239572063949994, "grad_norm": 0.4743538498878479, "learning_rate": 0.0004993689829379249, "loss": 1.1547, "step": 2695 }, { "epoch": 0.3245582401730977, "grad_norm": 0.641859769821167, "learning_rate": 0.0004993655927261008, "loss": 1.4078, "step": 2700 }, { "epoch": 0.32515927395119604, "grad_norm": 0.5002933144569397, "learning_rate": 0.0004993621934430778, "loss": 0.9492, "step": 2705 }, { "epoch": 0.3257603077292944, "grad_norm": 0.7241799831390381, "learning_rate": 0.0004993587850889793, "loss": 1.575, "step": 2710 }, { "epoch": 0.3263613415073927, "grad_norm": 0.5693483948707581, "learning_rate": 0.0004993553676639292, "loss": 0.9961, "step": 2715 }, { "epoch": 0.32696237528549105, "grad_norm": 0.43130815029144287, "learning_rate": 0.000499351941168052, "loss": 1.2984, "step": 2720 }, { "epoch": 0.3275634090635894, "grad_norm": 0.5054978728294373, "learning_rate": 0.0004993485056014724, "loss": 1.1375, "step": 2725 }, { "epoch": 0.3281644428416877, "grad_norm": 0.5581235289573669, "learning_rate": 0.0004993450609643152, "loss": 1.0164, "step": 2730 }, { "epoch": 0.32876547661978606, "grad_norm": 0.6733124256134033, "learning_rate": 0.0004993416072567059, "loss": 1.4078, "step": 2735 }, { "epoch": 0.32936651039788434, "grad_norm": 0.5003538727760315, "learning_rate": 0.0004993381444787699, "loss": 0.8742, "step": 2740 }, { "epoch": 0.3299675441759827, "grad_norm": 0.6292559504508972, "learning_rate": 0.0004993346726306333, "loss": 1.007, "step": 2745 }, { "epoch": 0.330568577954081, "grad_norm": 0.6760239005088806, "learning_rate": 0.0004993311917124224, "loss": 1.25, "step": 2750 }, { "epoch": 0.33116961173217935, "grad_norm": 0.6075654625892639, "learning_rate": 0.0004993277017242638, "loss": 1.5766, "step": 2755 }, { "epoch": 0.3317706455102777, "grad_norm": 0.5432557463645935, "learning_rate": 0.0004993242026662846, "loss": 1.0883, "step": 2760 }, { "epoch": 0.332371679288376, "grad_norm": 0.6972253918647766, "learning_rate": 0.0004993206945386118, "loss": 0.9992, "step": 2765 }, { "epoch": 0.33297271306647436, "grad_norm": 0.45837146043777466, "learning_rate": 0.0004993171773413731, "loss": 1.6766, "step": 2770 }, { "epoch": 0.33357374684457264, "grad_norm": 0.5207621455192566, "learning_rate": 0.0004993136510746966, "loss": 1.2578, "step": 2775 }, { "epoch": 0.334174780622671, "grad_norm": 0.7034028768539429, "learning_rate": 0.0004993101157387106, "loss": 1.3578, "step": 2780 }, { "epoch": 0.3347758144007693, "grad_norm": 0.544851541519165, "learning_rate": 0.0004993065713335434, "loss": 1.2836, "step": 2785 }, { "epoch": 0.33537684817886765, "grad_norm": 0.705143928527832, "learning_rate": 0.0004993030178593241, "loss": 1.4453, "step": 2790 }, { "epoch": 0.335977881956966, "grad_norm": 0.6619438529014587, "learning_rate": 0.0004992994553161823, "loss": 1.0547, "step": 2795 }, { "epoch": 0.3365789157350643, "grad_norm": 0.5982903242111206, "learning_rate": 0.000499295883704247, "loss": 1.4281, "step": 2800 }, { "epoch": 0.3365789157350643, "eval_loss": 2.189453125, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1965, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 2800 }, { "epoch": 0.33717994951316266, "grad_norm": 0.589056670665741, "learning_rate": 0.0004992923030236485, "loss": 1.3727, "step": 2805 }, { "epoch": 0.337780983291261, "grad_norm": 0.39378607273101807, "learning_rate": 0.000499288713274517, "loss": 1.1789, "step": 2810 }, { "epoch": 0.3383820170693593, "grad_norm": 0.5460519790649414, "learning_rate": 0.000499285114456983, "loss": 1.1719, "step": 2815 }, { "epoch": 0.3389830508474576, "grad_norm": 0.4953864812850952, "learning_rate": 0.0004992815065711774, "loss": 1.1672, "step": 2820 }, { "epoch": 0.33958408462555595, "grad_norm": 0.5705846548080444, "learning_rate": 0.0004992778896172317, "loss": 1.5328, "step": 2825 }, { "epoch": 0.3401851184036543, "grad_norm": 0.5687447190284729, "learning_rate": 0.0004992742635952771, "loss": 1.0063, "step": 2830 }, { "epoch": 0.3407861521817526, "grad_norm": 0.516343891620636, "learning_rate": 0.0004992706285054458, "loss": 1.1492, "step": 2835 }, { "epoch": 0.34138718595985096, "grad_norm": 0.6128392815589905, "learning_rate": 0.0004992669843478699, "loss": 1.325, "step": 2840 }, { "epoch": 0.3419882197379493, "grad_norm": 0.5104270577430725, "learning_rate": 0.000499263331122682, "loss": 1.0195, "step": 2845 }, { "epoch": 0.3425892535160476, "grad_norm": 0.38332250714302063, "learning_rate": 0.0004992596688300149, "loss": 1.302, "step": 2850 }, { "epoch": 0.3431902872941459, "grad_norm": 0.5674039125442505, "learning_rate": 0.000499255997470002, "loss": 1.2094, "step": 2855 }, { "epoch": 0.34379132107224425, "grad_norm": 0.7987366914749146, "learning_rate": 0.0004992523170427766, "loss": 1.2047, "step": 2860 }, { "epoch": 0.3443923548503426, "grad_norm": 0.45501282811164856, "learning_rate": 0.0004992486275484729, "loss": 1.1539, "step": 2865 }, { "epoch": 0.3449933886284409, "grad_norm": 0.5390669703483582, "learning_rate": 0.0004992449289872249, "loss": 1.2102, "step": 2870 }, { "epoch": 0.34559442240653926, "grad_norm": 0.6710581183433533, "learning_rate": 0.0004992412213591672, "loss": 1.4297, "step": 2875 }, { "epoch": 0.3461954561846376, "grad_norm": 0.6371570825576782, "learning_rate": 0.0004992375046644347, "loss": 1.0164, "step": 2880 }, { "epoch": 0.34679648996273593, "grad_norm": 0.49934741854667664, "learning_rate": 0.0004992337789031625, "loss": 1.1313, "step": 2885 }, { "epoch": 0.3473975237408342, "grad_norm": 0.41756120324134827, "learning_rate": 0.0004992300440754862, "loss": 1.1969, "step": 2890 }, { "epoch": 0.34799855751893255, "grad_norm": 0.8102174997329712, "learning_rate": 0.0004992263001815418, "loss": 1.2719, "step": 2895 }, { "epoch": 0.3485995912970309, "grad_norm": 0.45573851466178894, "learning_rate": 0.0004992225472214653, "loss": 1.1375, "step": 2900 }, { "epoch": 0.3492006250751292, "grad_norm": 0.5512142777442932, "learning_rate": 0.0004992187851953932, "loss": 1.4781, "step": 2905 }, { "epoch": 0.34980165885322756, "grad_norm": 0.6429489850997925, "learning_rate": 0.0004992150141034624, "loss": 1.3453, "step": 2910 }, { "epoch": 0.3504026926313259, "grad_norm": 0.6230481266975403, "learning_rate": 0.0004992112339458103, "loss": 1.2766, "step": 2915 }, { "epoch": 0.35100372640942423, "grad_norm": 0.6134311556816101, "learning_rate": 0.0004992074447225741, "loss": 1.0664, "step": 2920 }, { "epoch": 0.3516047601875225, "grad_norm": 0.5894711017608643, "learning_rate": 0.0004992036464338918, "loss": 0.9, "step": 2925 }, { "epoch": 0.35220579396562085, "grad_norm": 0.525947630405426, "learning_rate": 0.0004991998390799016, "loss": 1.4844, "step": 2930 }, { "epoch": 0.3528068277437192, "grad_norm": 0.5282221436500549, "learning_rate": 0.0004991960226607418, "loss": 1.0398, "step": 2935 }, { "epoch": 0.3534078615218175, "grad_norm": 0.5808281302452087, "learning_rate": 0.0004991921971765514, "loss": 0.943, "step": 2940 }, { "epoch": 0.35400889529991586, "grad_norm": 0.6163946390151978, "learning_rate": 0.0004991883626274696, "loss": 1.1086, "step": 2945 }, { "epoch": 0.3546099290780142, "grad_norm": 0.4154224991798401, "learning_rate": 0.0004991845190136357, "loss": 1.2703, "step": 2950 }, { "epoch": 0.35521096285611253, "grad_norm": 0.4030189514160156, "learning_rate": 0.0004991806663351897, "loss": 1.1086, "step": 2955 }, { "epoch": 0.35581199663421087, "grad_norm": 0.5927292704582214, "learning_rate": 0.0004991768045922718, "loss": 0.9758, "step": 2960 }, { "epoch": 0.35641303041230915, "grad_norm": 0.476971834897995, "learning_rate": 0.0004991729337850223, "loss": 1.525, "step": 2965 }, { "epoch": 0.3570140641904075, "grad_norm": 0.5584660768508911, "learning_rate": 0.000499169053913582, "loss": 1.2125, "step": 2970 }, { "epoch": 0.3576150979685058, "grad_norm": 0.5617804527282715, "learning_rate": 0.0004991651649780922, "loss": 1.3102, "step": 2975 }, { "epoch": 0.35821613174660416, "grad_norm": 0.3463181257247925, "learning_rate": 0.0004991612669786942, "loss": 1.4227, "step": 2980 }, { "epoch": 0.3588171655247025, "grad_norm": 0.5156741142272949, "learning_rate": 0.0004991573599155299, "loss": 1.3828, "step": 2985 }, { "epoch": 0.35941819930280083, "grad_norm": 0.6080055832862854, "learning_rate": 0.0004991534437887414, "loss": 1.0102, "step": 2990 }, { "epoch": 0.36001923308089917, "grad_norm": 0.5142369866371155, "learning_rate": 0.0004991495185984711, "loss": 1.3469, "step": 2995 }, { "epoch": 0.36062026685899745, "grad_norm": 0.4148232638835907, "learning_rate": 0.000499145584344862, "loss": 1.3102, "step": 3000 }, { "epoch": 0.36062026685899745, "eval_loss": 2.1435546875, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2124, "eval_samples_per_second": 4.544, "eval_steps_per_second": 1.136, "step": 3000 }, { "epoch": 0.3612213006370958, "grad_norm": 0.43302541971206665, "learning_rate": 0.000499141641028057, "loss": 1.1625, "step": 3005 }, { "epoch": 0.3618223344151941, "grad_norm": 0.5702094435691833, "learning_rate": 0.0004991376886481996, "loss": 1.4937, "step": 3010 }, { "epoch": 0.36242336819329246, "grad_norm": 0.602148175239563, "learning_rate": 0.0004991337272054336, "loss": 1.4453, "step": 3015 }, { "epoch": 0.3630244019713908, "grad_norm": 0.7094736099243164, "learning_rate": 0.0004991297566999031, "loss": 1.1242, "step": 3020 }, { "epoch": 0.36362543574948913, "grad_norm": 1.0491294860839844, "learning_rate": 0.0004991257771317525, "loss": 1.2945, "step": 3025 }, { "epoch": 0.36422646952758747, "grad_norm": 0.5383880734443665, "learning_rate": 0.0004991217885011266, "loss": 1.4398, "step": 3030 }, { "epoch": 0.36482750330568575, "grad_norm": 0.5303685665130615, "learning_rate": 0.0004991177908081706, "loss": 1.357, "step": 3035 }, { "epoch": 0.3654285370837841, "grad_norm": 0.5658953785896301, "learning_rate": 0.0004991137840530297, "loss": 1.625, "step": 3040 }, { "epoch": 0.3660295708618824, "grad_norm": 0.6413328051567078, "learning_rate": 0.0004991097682358498, "loss": 1.0664, "step": 3045 }, { "epoch": 0.36663060463998076, "grad_norm": 0.40573734045028687, "learning_rate": 0.000499105743356777, "loss": 1.4219, "step": 3050 }, { "epoch": 0.3672316384180791, "grad_norm": 0.4775443375110626, "learning_rate": 0.0004991017094159576, "loss": 1.1383, "step": 3055 }, { "epoch": 0.36783267219617743, "grad_norm": 0.4478199779987335, "learning_rate": 0.0004990976664135384, "loss": 1.1906, "step": 3060 }, { "epoch": 0.36843370597427577, "grad_norm": 0.5353224277496338, "learning_rate": 0.0004990936143496664, "loss": 1.0695, "step": 3065 }, { "epoch": 0.3690347397523741, "grad_norm": 0.6140496730804443, "learning_rate": 0.0004990895532244893, "loss": 1.45, "step": 3070 }, { "epoch": 0.3696357735304724, "grad_norm": 0.674697995185852, "learning_rate": 0.0004990854830381545, "loss": 1.2617, "step": 3075 }, { "epoch": 0.3702368073085707, "grad_norm": 0.793539822101593, "learning_rate": 0.0004990814037908102, "loss": 1.0797, "step": 3080 }, { "epoch": 0.37083784108666906, "grad_norm": 0.4356972873210907, "learning_rate": 0.0004990773154826048, "loss": 1.257, "step": 3085 }, { "epoch": 0.3714388748647674, "grad_norm": 0.4007517099380493, "learning_rate": 0.000499073218113687, "loss": 1.6695, "step": 3090 }, { "epoch": 0.37203990864286574, "grad_norm": 0.7213647961616516, "learning_rate": 0.0004990691116842058, "loss": 1.1039, "step": 3095 }, { "epoch": 0.37264094242096407, "grad_norm": 0.5188817977905273, "learning_rate": 0.0004990649961943105, "loss": 1.3844, "step": 3100 }, { "epoch": 0.3732419761990624, "grad_norm": 0.5494900345802307, "learning_rate": 0.0004990608716441511, "loss": 1.3094, "step": 3105 }, { "epoch": 0.3738430099771607, "grad_norm": 0.6364961266517639, "learning_rate": 0.0004990567380338774, "loss": 1.432, "step": 3110 }, { "epoch": 0.374444043755259, "grad_norm": 0.7705929279327393, "learning_rate": 0.0004990525953636399, "loss": 1.3594, "step": 3115 }, { "epoch": 0.37504507753335736, "grad_norm": 0.6256239414215088, "learning_rate": 0.0004990484436335892, "loss": 1.4969, "step": 3120 }, { "epoch": 0.3756461113114557, "grad_norm": 0.8040322661399841, "learning_rate": 0.0004990442828438764, "loss": 1.3664, "step": 3125 }, { "epoch": 0.37624714508955404, "grad_norm": 0.7349644303321838, "learning_rate": 0.0004990401129946528, "loss": 1.3016, "step": 3130 }, { "epoch": 0.3768481788676524, "grad_norm": 0.7406263947486877, "learning_rate": 0.0004990359340860701, "loss": 1.2672, "step": 3135 }, { "epoch": 0.3774492126457507, "grad_norm": 0.6488270163536072, "learning_rate": 0.0004990317461182803, "loss": 1.5125, "step": 3140 }, { "epoch": 0.37805024642384905, "grad_norm": 0.450980544090271, "learning_rate": 0.0004990275490914358, "loss": 0.9531, "step": 3145 }, { "epoch": 0.3786512802019473, "grad_norm": 0.5992457866668701, "learning_rate": 0.0004990233430056892, "loss": 1.2563, "step": 3150 }, { "epoch": 0.37925231398004566, "grad_norm": 0.4323638677597046, "learning_rate": 0.0004990191278611936, "loss": 1.1438, "step": 3155 }, { "epoch": 0.379853347758144, "grad_norm": 0.9859302639961243, "learning_rate": 0.0004990149036581023, "loss": 1.2555, "step": 3160 }, { "epoch": 0.38045438153624234, "grad_norm": 0.8136280179023743, "learning_rate": 0.0004990106703965689, "loss": 1.3172, "step": 3165 }, { "epoch": 0.3810554153143407, "grad_norm": 0.5532881021499634, "learning_rate": 0.0004990064280767475, "loss": 0.9656, "step": 3170 }, { "epoch": 0.381656449092439, "grad_norm": 0.5996264219284058, "learning_rate": 0.0004990021766987923, "loss": 1.2688, "step": 3175 }, { "epoch": 0.38225748287053735, "grad_norm": 0.6474243402481079, "learning_rate": 0.0004989979162628582, "loss": 0.9461, "step": 3180 }, { "epoch": 0.3828585166486356, "grad_norm": 0.5693522691726685, "learning_rate": 0.0004989936467690998, "loss": 1.0906, "step": 3185 }, { "epoch": 0.38345955042673396, "grad_norm": 0.5393794775009155, "learning_rate": 0.0004989893682176727, "loss": 1.2445, "step": 3190 }, { "epoch": 0.3840605842048323, "grad_norm": 0.6597670316696167, "learning_rate": 0.0004989850806087325, "loss": 1.4289, "step": 3195 }, { "epoch": 0.38466161798293064, "grad_norm": 0.6367236971855164, "learning_rate": 0.0004989807839424352, "loss": 1.2266, "step": 3200 }, { "epoch": 0.38466161798293064, "eval_loss": 2.1474609375, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1942, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.137, "step": 3200 }, { "epoch": 0.385262651761029, "grad_norm": 0.5790736079216003, "learning_rate": 0.0004989764782189369, "loss": 1.2164, "step": 3205 }, { "epoch": 0.3858636855391273, "grad_norm": 0.502346932888031, "learning_rate": 0.0004989721634383943, "loss": 1.3391, "step": 3210 }, { "epoch": 0.38646471931722565, "grad_norm": 0.6391408443450928, "learning_rate": 0.0004989678396009645, "loss": 1.2711, "step": 3215 }, { "epoch": 0.387065753095324, "grad_norm": 0.40186697244644165, "learning_rate": 0.0004989635067068047, "loss": 1.1691, "step": 3220 }, { "epoch": 0.38766678687342226, "grad_norm": 0.5028790235519409, "learning_rate": 0.0004989591647560726, "loss": 1.1609, "step": 3225 }, { "epoch": 0.3882678206515206, "grad_norm": 0.4072652757167816, "learning_rate": 0.0004989548137489259, "loss": 1.5672, "step": 3230 }, { "epoch": 0.38886885442961894, "grad_norm": 0.4738612174987793, "learning_rate": 0.0004989504536855232, "loss": 1.3344, "step": 3235 }, { "epoch": 0.3894698882077173, "grad_norm": 0.9375470876693726, "learning_rate": 0.0004989460845660229, "loss": 1.0484, "step": 3240 }, { "epoch": 0.3900709219858156, "grad_norm": 0.4789920449256897, "learning_rate": 0.000498941706390584, "loss": 1.1109, "step": 3245 }, { "epoch": 0.39067195576391395, "grad_norm": 0.6214213967323303, "learning_rate": 0.0004989373191593658, "loss": 1.3516, "step": 3250 }, { "epoch": 0.3912729895420123, "grad_norm": 0.49041748046875, "learning_rate": 0.0004989329228725277, "loss": 1.2977, "step": 3255 }, { "epoch": 0.39187402332011056, "grad_norm": 0.5892441272735596, "learning_rate": 0.00049892851753023, "loss": 1.1109, "step": 3260 }, { "epoch": 0.3924750570982089, "grad_norm": 0.4488331973552704, "learning_rate": 0.0004989241031326326, "loss": 1.2734, "step": 3265 }, { "epoch": 0.39307609087630724, "grad_norm": 0.43196800351142883, "learning_rate": 0.0004989196796798963, "loss": 1.1059, "step": 3270 }, { "epoch": 0.3936771246544056, "grad_norm": 0.3739017844200134, "learning_rate": 0.0004989152471721819, "loss": 0.8492, "step": 3275 }, { "epoch": 0.3942781584325039, "grad_norm": 0.5097094178199768, "learning_rate": 0.0004989108056096505, "loss": 1.1836, "step": 3280 }, { "epoch": 0.39487919221060225, "grad_norm": 0.7751893401145935, "learning_rate": 0.000498906354992464, "loss": 1.443, "step": 3285 }, { "epoch": 0.3954802259887006, "grad_norm": 0.6941812634468079, "learning_rate": 0.0004989018953207841, "loss": 1.2094, "step": 3290 }, { "epoch": 0.3960812597667989, "grad_norm": 0.5261918902397156, "learning_rate": 0.0004988974265947731, "loss": 1.1297, "step": 3295 }, { "epoch": 0.3966822935448972, "grad_norm": 0.6817163228988647, "learning_rate": 0.0004988929488145934, "loss": 1.5297, "step": 3300 }, { "epoch": 0.39728332732299554, "grad_norm": 0.6343579888343811, "learning_rate": 0.0004988884619804082, "loss": 1.1766, "step": 3305 }, { "epoch": 0.3978843611010939, "grad_norm": 0.4660755693912506, "learning_rate": 0.0004988839660923805, "loss": 0.9953, "step": 3310 }, { "epoch": 0.3984853948791922, "grad_norm": 0.6850960850715637, "learning_rate": 0.0004988794611506738, "loss": 1.4023, "step": 3315 }, { "epoch": 0.39908642865729055, "grad_norm": 0.7170994877815247, "learning_rate": 0.0004988749471554521, "loss": 1.1391, "step": 3320 }, { "epoch": 0.3996874624353889, "grad_norm": 0.9181567430496216, "learning_rate": 0.0004988704241068795, "loss": 1.3047, "step": 3325 }, { "epoch": 0.4002884962134872, "grad_norm": 0.5317991971969604, "learning_rate": 0.0004988658920051207, "loss": 1.3273, "step": 3330 }, { "epoch": 0.4008895299915855, "grad_norm": 0.5464211702346802, "learning_rate": 0.0004988613508503405, "loss": 1.0758, "step": 3335 }, { "epoch": 0.40149056376968384, "grad_norm": 0.664720892906189, "learning_rate": 0.0004988568006427039, "loss": 1.1383, "step": 3340 }, { "epoch": 0.4020915975477822, "grad_norm": 0.5137344002723694, "learning_rate": 0.0004988522413823767, "loss": 0.8992, "step": 3345 }, { "epoch": 0.4026926313258805, "grad_norm": 0.6021727919578552, "learning_rate": 0.0004988476730695246, "loss": 1.0391, "step": 3350 }, { "epoch": 0.40329366510397885, "grad_norm": 0.5493384003639221, "learning_rate": 0.0004988430957043138, "loss": 1.2063, "step": 3355 }, { "epoch": 0.4038946988820772, "grad_norm": 0.44344305992126465, "learning_rate": 0.0004988385092869109, "loss": 1.1539, "step": 3360 }, { "epoch": 0.4044957326601755, "grad_norm": 0.5496264100074768, "learning_rate": 0.0004988339138174827, "loss": 1.0008, "step": 3365 }, { "epoch": 0.40509676643827386, "grad_norm": 0.5868191719055176, "learning_rate": 0.0004988293092961962, "loss": 1.0273, "step": 3370 }, { "epoch": 0.40569780021637214, "grad_norm": 0.47252243757247925, "learning_rate": 0.0004988246957232191, "loss": 1.1547, "step": 3375 }, { "epoch": 0.4062988339944705, "grad_norm": 0.5384260416030884, "learning_rate": 0.0004988200730987192, "loss": 0.9969, "step": 3380 }, { "epoch": 0.4068998677725688, "grad_norm": 0.5157658457756042, "learning_rate": 0.0004988154414228645, "loss": 1.218, "step": 3385 }, { "epoch": 0.40750090155066715, "grad_norm": 0.609667181968689, "learning_rate": 0.0004988108006958237, "loss": 1.2977, "step": 3390 }, { "epoch": 0.4081019353287655, "grad_norm": 0.6255223751068115, "learning_rate": 0.0004988061509177656, "loss": 1.0859, "step": 3395 }, { "epoch": 0.4087029691068638, "grad_norm": 0.48681220412254333, "learning_rate": 0.0004988014920888592, "loss": 1.1094, "step": 3400 }, { "epoch": 0.4087029691068638, "eval_loss": 2.1009764671325684, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1797, "eval_samples_per_second": 4.548, "eval_steps_per_second": 1.137, "step": 3400 }, { "epoch": 0.40930400288496216, "grad_norm": 0.7313075065612793, "learning_rate": 0.0004987968242092741, "loss": 1.4648, "step": 3405 }, { "epoch": 0.40990503666306044, "grad_norm": 0.3973584771156311, "learning_rate": 0.00049879214727918, "loss": 1.4195, "step": 3410 }, { "epoch": 0.4105060704411588, "grad_norm": 0.5780074596405029, "learning_rate": 0.0004987874612987471, "loss": 1.1383, "step": 3415 }, { "epoch": 0.4111071042192571, "grad_norm": 0.5280987620353699, "learning_rate": 0.0004987827662681459, "loss": 1.2125, "step": 3420 }, { "epoch": 0.41170813799735545, "grad_norm": 0.672178328037262, "learning_rate": 0.0004987780621875471, "loss": 1.2563, "step": 3425 }, { "epoch": 0.4123091717754538, "grad_norm": 0.5133812427520752, "learning_rate": 0.0004987733490571218, "loss": 1.2266, "step": 3430 }, { "epoch": 0.4129102055535521, "grad_norm": 0.7771773338317871, "learning_rate": 0.0004987686268770415, "loss": 1.2648, "step": 3435 }, { "epoch": 0.41351123933165046, "grad_norm": 0.555830180644989, "learning_rate": 0.0004987638956474781, "loss": 1.2445, "step": 3440 }, { "epoch": 0.41411227310974874, "grad_norm": 0.377913236618042, "learning_rate": 0.0004987591553686035, "loss": 1.0813, "step": 3445 }, { "epoch": 0.4147133068878471, "grad_norm": 0.6742091774940491, "learning_rate": 0.0004987544060405903, "loss": 1.1516, "step": 3450 }, { "epoch": 0.4153143406659454, "grad_norm": 0.6832530498504639, "learning_rate": 0.0004987496476636112, "loss": 0.9953, "step": 3455 }, { "epoch": 0.41591537444404375, "grad_norm": 0.7047104835510254, "learning_rate": 0.0004987448802378393, "loss": 1.2781, "step": 3460 }, { "epoch": 0.4165164082221421, "grad_norm": 0.5788468718528748, "learning_rate": 0.000498740103763448, "loss": 1.1266, "step": 3465 }, { "epoch": 0.4171174420002404, "grad_norm": 0.6551057696342468, "learning_rate": 0.0004987353182406111, "loss": 0.9711, "step": 3470 }, { "epoch": 0.41771847577833876, "grad_norm": 0.7851700782775879, "learning_rate": 0.0004987305236695027, "loss": 1.2258, "step": 3475 }, { "epoch": 0.4183195095564371, "grad_norm": 0.45742517709732056, "learning_rate": 0.000498725720050297, "loss": 1.1375, "step": 3480 }, { "epoch": 0.4189205433345354, "grad_norm": 0.5413776636123657, "learning_rate": 0.0004987209073831691, "loss": 1.3938, "step": 3485 }, { "epoch": 0.4195215771126337, "grad_norm": 0.5107463598251343, "learning_rate": 0.0004987160856682938, "loss": 1.0172, "step": 3490 }, { "epoch": 0.42012261089073205, "grad_norm": 0.6791651844978333, "learning_rate": 0.0004987112549058466, "loss": 1.2555, "step": 3495 }, { "epoch": 0.4207236446688304, "grad_norm": 0.71052086353302, "learning_rate": 0.0004987064150960033, "loss": 1.057, "step": 3500 }, { "epoch": 0.4213246784469287, "grad_norm": 0.5631623268127441, "learning_rate": 0.0004987015662389398, "loss": 1.143, "step": 3505 }, { "epoch": 0.42192571222502706, "grad_norm": 0.7245007753372192, "learning_rate": 0.0004986967083348325, "loss": 1.0203, "step": 3510 }, { "epoch": 0.4225267460031254, "grad_norm": 0.5436373353004456, "learning_rate": 0.0004986918413838583, "loss": 1.0805, "step": 3515 }, { "epoch": 0.4231277797812237, "grad_norm": 0.4831235110759735, "learning_rate": 0.0004986869653861941, "loss": 1.2867, "step": 3520 }, { "epoch": 0.423728813559322, "grad_norm": 0.4958447813987732, "learning_rate": 0.0004986820803420172, "loss": 1.143, "step": 3525 }, { "epoch": 0.42432984733742035, "grad_norm": 0.7917611002922058, "learning_rate": 0.0004986771862515055, "loss": 1.2012, "step": 3530 }, { "epoch": 0.4249308811155187, "grad_norm": 0.6926260590553284, "learning_rate": 0.0004986722831148369, "loss": 1.3297, "step": 3535 }, { "epoch": 0.425531914893617, "grad_norm": 0.5297942757606506, "learning_rate": 0.0004986673709321898, "loss": 1.0195, "step": 3540 }, { "epoch": 0.42613294867171536, "grad_norm": 0.6709704399108887, "learning_rate": 0.0004986624497037429, "loss": 1.307, "step": 3545 }, { "epoch": 0.4267339824498137, "grad_norm": 0.586955189704895, "learning_rate": 0.0004986575194296752, "loss": 0.9141, "step": 3550 }, { "epoch": 0.42733501622791203, "grad_norm": 0.573215663433075, "learning_rate": 0.000498652580110166, "loss": 1.0344, "step": 3555 }, { "epoch": 0.4279360500060103, "grad_norm": 0.4756597578525543, "learning_rate": 0.0004986476317453951, "loss": 1.1664, "step": 3560 }, { "epoch": 0.42853708378410865, "grad_norm": 0.5231835246086121, "learning_rate": 0.0004986426743355425, "loss": 1.1281, "step": 3565 }, { "epoch": 0.429138117562207, "grad_norm": 0.4669915735721588, "learning_rate": 0.0004986377078807884, "loss": 1.2641, "step": 3570 }, { "epoch": 0.4297391513403053, "grad_norm": 0.49261239171028137, "learning_rate": 0.0004986327323813135, "loss": 1.1812, "step": 3575 }, { "epoch": 0.43034018511840366, "grad_norm": 0.5701055526733398, "learning_rate": 0.0004986277478372989, "loss": 0.943, "step": 3580 }, { "epoch": 0.430941218896502, "grad_norm": 0.8740291595458984, "learning_rate": 0.0004986227542489259, "loss": 1.4688, "step": 3585 }, { "epoch": 0.43154225267460034, "grad_norm": 0.5957376956939697, "learning_rate": 0.000498617751616376, "loss": 1.0031, "step": 3590 }, { "epoch": 0.4321432864526986, "grad_norm": 0.5455657243728638, "learning_rate": 0.0004986127399398315, "loss": 1.1375, "step": 3595 }, { "epoch": 0.43274432023079695, "grad_norm": 0.7837244868278503, "learning_rate": 0.0004986077192194743, "loss": 1.1492, "step": 3600 }, { "epoch": 0.43274432023079695, "eval_loss": 2.097851514816284, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2375, "eval_samples_per_second": 4.541, "eval_steps_per_second": 1.135, "step": 3600 }, { "epoch": 0.4333453540088953, "grad_norm": 0.6420156359672546, "learning_rate": 0.0004986026894554874, "loss": 1.6125, "step": 3605 }, { "epoch": 0.4339463877869936, "grad_norm": 0.37207821011543274, "learning_rate": 0.0004985976506480535, "loss": 1.0586, "step": 3610 }, { "epoch": 0.43454742156509196, "grad_norm": 0.4620397090911865, "learning_rate": 0.000498592602797356, "loss": 1.1328, "step": 3615 }, { "epoch": 0.4351484553431903, "grad_norm": 0.8707792162895203, "learning_rate": 0.0004985875459035786, "loss": 1.2578, "step": 3620 }, { "epoch": 0.43574948912128864, "grad_norm": 0.5785757303237915, "learning_rate": 0.0004985824799669052, "loss": 1.1313, "step": 3625 }, { "epoch": 0.43635052289938697, "grad_norm": 0.581447958946228, "learning_rate": 0.00049857740498752, "loss": 1.2219, "step": 3630 }, { "epoch": 0.43695155667748525, "grad_norm": 0.5232359170913696, "learning_rate": 0.0004985723209656078, "loss": 1.0891, "step": 3635 }, { "epoch": 0.4375525904555836, "grad_norm": 0.4474778175354004, "learning_rate": 0.0004985672279013534, "loss": 1.625, "step": 3640 }, { "epoch": 0.4381536242336819, "grad_norm": 0.7095141410827637, "learning_rate": 0.000498562125794942, "loss": 0.9273, "step": 3645 }, { "epoch": 0.43875465801178026, "grad_norm": 0.6456997394561768, "learning_rate": 0.0004985570146465593, "loss": 1.1453, "step": 3650 }, { "epoch": 0.4393556917898786, "grad_norm": 0.7230183482170105, "learning_rate": 0.0004985518944563914, "loss": 1.15, "step": 3655 }, { "epoch": 0.43995672556797694, "grad_norm": 0.6240507364273071, "learning_rate": 0.0004985467652246243, "loss": 1.0586, "step": 3660 }, { "epoch": 0.4405577593460753, "grad_norm": 1.0956453084945679, "learning_rate": 0.0004985416269514447, "loss": 1.3141, "step": 3665 }, { "epoch": 0.44115879312417355, "grad_norm": 0.4983053505420685, "learning_rate": 0.0004985364796370394, "loss": 1.0805, "step": 3670 }, { "epoch": 0.4417598269022719, "grad_norm": 0.5128179788589478, "learning_rate": 0.0004985313232815958, "loss": 0.9055, "step": 3675 }, { "epoch": 0.4423608606803702, "grad_norm": 0.535322368144989, "learning_rate": 0.0004985261578853014, "loss": 0.9563, "step": 3680 }, { "epoch": 0.44296189445846856, "grad_norm": 0.6636273264884949, "learning_rate": 0.000498520983448344, "loss": 1.2719, "step": 3685 }, { "epoch": 0.4435629282365669, "grad_norm": 0.7313540577888489, "learning_rate": 0.0004985157999709122, "loss": 1.5672, "step": 3690 }, { "epoch": 0.44416396201466524, "grad_norm": 0.45398956537246704, "learning_rate": 0.000498510607453194, "loss": 1.0688, "step": 3695 }, { "epoch": 0.4447649957927636, "grad_norm": 0.4940120577812195, "learning_rate": 0.0004985054058953788, "loss": 1.2211, "step": 3700 }, { "epoch": 0.4453660295708619, "grad_norm": 0.48061200976371765, "learning_rate": 0.0004985001952976556, "loss": 1.1328, "step": 3705 }, { "epoch": 0.4459670633489602, "grad_norm": 0.6256884336471558, "learning_rate": 0.0004984949756602139, "loss": 1.2969, "step": 3710 }, { "epoch": 0.4465680971270585, "grad_norm": 0.5207997560501099, "learning_rate": 0.0004984897469832437, "loss": 1.3641, "step": 3715 }, { "epoch": 0.44716913090515686, "grad_norm": 0.4868186414241791, "learning_rate": 0.000498484509266935, "loss": 0.8984, "step": 3720 }, { "epoch": 0.4477701646832552, "grad_norm": 0.5324142575263977, "learning_rate": 0.0004984792625114786, "loss": 1.1617, "step": 3725 }, { "epoch": 0.44837119846135354, "grad_norm": 0.5691832304000854, "learning_rate": 0.0004984740067170651, "loss": 1.2117, "step": 3730 }, { "epoch": 0.4489722322394519, "grad_norm": 0.419612318277359, "learning_rate": 0.0004984687418838859, "loss": 1.0063, "step": 3735 }, { "epoch": 0.4495732660175502, "grad_norm": 0.5638041496276855, "learning_rate": 0.0004984634680121325, "loss": 1.0805, "step": 3740 }, { "epoch": 0.4501742997956485, "grad_norm": 0.5611728429794312, "learning_rate": 0.0004984581851019966, "loss": 1.282, "step": 3745 }, { "epoch": 0.45077533357374683, "grad_norm": 0.6137074828147888, "learning_rate": 0.0004984528931536705, "loss": 1.2445, "step": 3750 }, { "epoch": 0.45137636735184516, "grad_norm": 0.4265820384025574, "learning_rate": 0.0004984475921673466, "loss": 0.9336, "step": 3755 }, { "epoch": 0.4519774011299435, "grad_norm": 0.6829708218574524, "learning_rate": 0.0004984422821432178, "loss": 1.4375, "step": 3760 }, { "epoch": 0.45257843490804184, "grad_norm": 0.41247573494911194, "learning_rate": 0.0004984369630814773, "loss": 1.3852, "step": 3765 }, { "epoch": 0.4531794686861402, "grad_norm": 0.5414575338363647, "learning_rate": 0.0004984316349823186, "loss": 1.0758, "step": 3770 }, { "epoch": 0.4537805024642385, "grad_norm": 0.5673078298568726, "learning_rate": 0.0004984262978459355, "loss": 0.9523, "step": 3775 }, { "epoch": 0.4543815362423368, "grad_norm": 0.5617050528526306, "learning_rate": 0.0004984209516725221, "loss": 1.0805, "step": 3780 }, { "epoch": 0.45498257002043513, "grad_norm": 0.5333116054534912, "learning_rate": 0.0004984155964622729, "loss": 1.0984, "step": 3785 }, { "epoch": 0.45558360379853347, "grad_norm": 0.6070541739463806, "learning_rate": 0.0004984102322153827, "loss": 1.2125, "step": 3790 }, { "epoch": 0.4561846375766318, "grad_norm": 0.6610611081123352, "learning_rate": 0.0004984048589320467, "loss": 1.1281, "step": 3795 }, { "epoch": 0.45678567135473014, "grad_norm": 0.6735252737998962, "learning_rate": 0.0004983994766124602, "loss": 1.0371, "step": 3800 }, { "epoch": 0.45678567135473014, "eval_loss": 2.081835985183716, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1969, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 3800 }, { "epoch": 0.4573867051328285, "grad_norm": 0.48256927728652954, "learning_rate": 0.0004983940852568193, "loss": 1.0188, "step": 3805 }, { "epoch": 0.4579877389109268, "grad_norm": 0.4252387583255768, "learning_rate": 0.0004983886848653197, "loss": 1.1875, "step": 3810 }, { "epoch": 0.45858877268902515, "grad_norm": 0.5931745767593384, "learning_rate": 0.0004983832754381582, "loss": 1.0766, "step": 3815 }, { "epoch": 0.45918980646712343, "grad_norm": 0.5438306927680969, "learning_rate": 0.0004983778569755315, "loss": 1.4563, "step": 3820 }, { "epoch": 0.45979084024522177, "grad_norm": 0.4355262219905853, "learning_rate": 0.0004983724294776366, "loss": 1.0938, "step": 3825 }, { "epoch": 0.4603918740233201, "grad_norm": 0.44057753682136536, "learning_rate": 0.0004983669929446711, "loss": 1.0375, "step": 3830 }, { "epoch": 0.46099290780141844, "grad_norm": 0.5248241424560547, "learning_rate": 0.0004983615473768326, "loss": 1.1828, "step": 3835 }, { "epoch": 0.4615939415795168, "grad_norm": 0.6102302074432373, "learning_rate": 0.0004983560927743193, "loss": 1.2516, "step": 3840 }, { "epoch": 0.4621949753576151, "grad_norm": 0.6916151642799377, "learning_rate": 0.0004983506291373295, "loss": 1.6047, "step": 3845 }, { "epoch": 0.46279600913571345, "grad_norm": 0.5055292844772339, "learning_rate": 0.0004983451564660622, "loss": 1.0172, "step": 3850 }, { "epoch": 0.46339704291381173, "grad_norm": 0.6418437957763672, "learning_rate": 0.0004983396747607161, "loss": 1.1805, "step": 3855 }, { "epoch": 0.46399807669191007, "grad_norm": 0.5011945962905884, "learning_rate": 0.000498334184021491, "loss": 1.0055, "step": 3860 }, { "epoch": 0.4645991104700084, "grad_norm": 0.5504122376441956, "learning_rate": 0.0004983286842485864, "loss": 0.9742, "step": 3865 }, { "epoch": 0.46520014424810674, "grad_norm": 0.3638380467891693, "learning_rate": 0.0004983231754422024, "loss": 0.9906, "step": 3870 }, { "epoch": 0.4658011780262051, "grad_norm": 0.5893705487251282, "learning_rate": 0.0004983176576025394, "loss": 1.0367, "step": 3875 }, { "epoch": 0.4664022118043034, "grad_norm": 0.5191811323165894, "learning_rate": 0.0004983121307297983, "loss": 1.4234, "step": 3880 }, { "epoch": 0.46700324558240175, "grad_norm": 0.5002449750900269, "learning_rate": 0.0004983065948241799, "loss": 1.1242, "step": 3885 }, { "epoch": 0.4676042793605001, "grad_norm": 0.5473368167877197, "learning_rate": 0.0004983010498858857, "loss": 1.1219, "step": 3890 }, { "epoch": 0.46820531313859837, "grad_norm": 0.5295068621635437, "learning_rate": 0.0004982954959151174, "loss": 1.4508, "step": 3895 }, { "epoch": 0.4688063469166967, "grad_norm": 0.8546839356422424, "learning_rate": 0.000498289932912077, "loss": 0.9664, "step": 3900 }, { "epoch": 0.46940738069479504, "grad_norm": 0.6534102559089661, "learning_rate": 0.000498284360876967, "loss": 1.4344, "step": 3905 }, { "epoch": 0.4700084144728934, "grad_norm": 0.4570360779762268, "learning_rate": 0.0004982787798099898, "loss": 0.9531, "step": 3910 }, { "epoch": 0.4706094482509917, "grad_norm": 0.5392407178878784, "learning_rate": 0.0004982731897113488, "loss": 1.243, "step": 3915 }, { "epoch": 0.47121048202909005, "grad_norm": 0.7176635265350342, "learning_rate": 0.0004982675905812469, "loss": 0.968, "step": 3920 }, { "epoch": 0.4718115158071884, "grad_norm": 0.5123677253723145, "learning_rate": 0.0004982619824198882, "loss": 1.4969, "step": 3925 }, { "epoch": 0.47241254958528667, "grad_norm": 0.5915489196777344, "learning_rate": 0.0004982563652274766, "loss": 1.3805, "step": 3930 }, { "epoch": 0.473013583363385, "grad_norm": 0.5139390230178833, "learning_rate": 0.0004982507390042163, "loss": 1.057, "step": 3935 }, { "epoch": 0.47361461714148334, "grad_norm": 0.6480752229690552, "learning_rate": 0.0004982451037503121, "loss": 1.0063, "step": 3940 }, { "epoch": 0.4742156509195817, "grad_norm": 0.6889259219169617, "learning_rate": 0.0004982394594659689, "loss": 1.4828, "step": 3945 }, { "epoch": 0.47481668469768, "grad_norm": 0.5907976627349854, "learning_rate": 0.0004982338061513921, "loss": 1.2148, "step": 3950 }, { "epoch": 0.47541771847577835, "grad_norm": 0.6065306067466736, "learning_rate": 0.0004982281438067874, "loss": 1.3336, "step": 3955 }, { "epoch": 0.4760187522538767, "grad_norm": 0.5058939456939697, "learning_rate": 0.0004982224724323606, "loss": 1.2148, "step": 3960 }, { "epoch": 0.476619786031975, "grad_norm": 0.633465588092804, "learning_rate": 0.0004982167920283181, "loss": 1.1977, "step": 3965 }, { "epoch": 0.4772208198100733, "grad_norm": 0.48402997851371765, "learning_rate": 0.0004982111025948666, "loss": 1.0758, "step": 3970 }, { "epoch": 0.47782185358817164, "grad_norm": 0.4740035831928253, "learning_rate": 0.000498205404132213, "loss": 0.9914, "step": 3975 }, { "epoch": 0.47842288736627, "grad_norm": 0.6403579711914062, "learning_rate": 0.0004981996966405646, "loss": 0.9035, "step": 3980 }, { "epoch": 0.4790239211443683, "grad_norm": 0.6459212303161621, "learning_rate": 0.000498193980120129, "loss": 1.1273, "step": 3985 }, { "epoch": 0.47962495492246665, "grad_norm": 0.5674965977668762, "learning_rate": 0.0004981882545711142, "loss": 1.3828, "step": 3990 }, { "epoch": 0.480225988700565, "grad_norm": 0.6159428954124451, "learning_rate": 0.0004981825199937285, "loss": 1.2344, "step": 3995 }, { "epoch": 0.4808270224786633, "grad_norm": 0.6476476788520813, "learning_rate": 0.0004981767763881803, "loss": 1.4625, "step": 4000 }, { "epoch": 0.4808270224786633, "eval_loss": 2.0904297828674316, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1956, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.137, "step": 4000 }, { "epoch": 0.4814280562567616, "grad_norm": 0.7665592432022095, "learning_rate": 0.0004981710237546789, "loss": 1.2703, "step": 4005 }, { "epoch": 0.48202909003485994, "grad_norm": 0.5057299733161926, "learning_rate": 0.0004981652620934333, "loss": 1.3039, "step": 4010 }, { "epoch": 0.4826301238129583, "grad_norm": 0.46735164523124695, "learning_rate": 0.0004981594914046532, "loss": 1.0281, "step": 4015 }, { "epoch": 0.4832311575910566, "grad_norm": 0.6298277378082275, "learning_rate": 0.0004981537116885484, "loss": 1.0656, "step": 4020 }, { "epoch": 0.48383219136915495, "grad_norm": 0.6472790837287903, "learning_rate": 0.0004981479229453292, "loss": 1.0418, "step": 4025 }, { "epoch": 0.4844332251472533, "grad_norm": 0.5558010339736938, "learning_rate": 0.0004981421251752063, "loss": 1.0992, "step": 4030 }, { "epoch": 0.4850342589253516, "grad_norm": 0.46242016553878784, "learning_rate": 0.0004981363183783903, "loss": 1.002, "step": 4035 }, { "epoch": 0.48563529270344996, "grad_norm": 0.48881420493125916, "learning_rate": 0.0004981305025550929, "loss": 1.1867, "step": 4040 }, { "epoch": 0.48623632648154824, "grad_norm": 0.4354248642921448, "learning_rate": 0.0004981246777055252, "loss": 1.4141, "step": 4045 }, { "epoch": 0.4868373602596466, "grad_norm": 0.5004878044128418, "learning_rate": 0.0004981188438298995, "loss": 0.9684, "step": 4050 }, { "epoch": 0.4874383940377449, "grad_norm": 0.5613626837730408, "learning_rate": 0.0004981130009284277, "loss": 1.1555, "step": 4055 }, { "epoch": 0.48803942781584325, "grad_norm": 0.4278530776500702, "learning_rate": 0.0004981071490013225, "loss": 1.1289, "step": 4060 }, { "epoch": 0.4886404615939416, "grad_norm": 0.6259580850601196, "learning_rate": 0.0004981012880487968, "loss": 0.9828, "step": 4065 }, { "epoch": 0.4892414953720399, "grad_norm": 0.5686140656471252, "learning_rate": 0.0004980954180710636, "loss": 1.1805, "step": 4070 }, { "epoch": 0.48984252915013826, "grad_norm": 1.0444506406784058, "learning_rate": 0.0004980895390683367, "loss": 0.9242, "step": 4075 }, { "epoch": 0.49044356292823654, "grad_norm": 0.49180054664611816, "learning_rate": 0.0004980836510408297, "loss": 0.9906, "step": 4080 }, { "epoch": 0.4910445967063349, "grad_norm": 0.39391833543777466, "learning_rate": 0.000498077753988757, "loss": 0.9578, "step": 4085 }, { "epoch": 0.4916456304844332, "grad_norm": 0.6872186660766602, "learning_rate": 0.0004980718479123332, "loss": 1.2711, "step": 4090 }, { "epoch": 0.49224666426253155, "grad_norm": 0.4728960692882538, "learning_rate": 0.0004980659328117728, "loss": 1.0828, "step": 4095 }, { "epoch": 0.4928476980406299, "grad_norm": 0.4187394380569458, "learning_rate": 0.0004980600086872913, "loss": 1.0727, "step": 4100 }, { "epoch": 0.4934487318187282, "grad_norm": 0.5191169381141663, "learning_rate": 0.000498054075539104, "loss": 0.9246, "step": 4105 }, { "epoch": 0.49404976559682656, "grad_norm": 0.5990204811096191, "learning_rate": 0.0004980481333674269, "loss": 1.2625, "step": 4110 }, { "epoch": 0.49465079937492484, "grad_norm": 0.5898841619491577, "learning_rate": 0.0004980421821724759, "loss": 0.9133, "step": 4115 }, { "epoch": 0.4952518331530232, "grad_norm": 0.576693594455719, "learning_rate": 0.0004980362219544677, "loss": 1.2969, "step": 4120 }, { "epoch": 0.4958528669311215, "grad_norm": 0.5620527267456055, "learning_rate": 0.000498030252713619, "loss": 0.9883, "step": 4125 }, { "epoch": 0.49645390070921985, "grad_norm": 0.46309694647789, "learning_rate": 0.0004980242744501472, "loss": 1.3211, "step": 4130 }, { "epoch": 0.4970549344873182, "grad_norm": 0.6207218170166016, "learning_rate": 0.0004980182871642694, "loss": 1.2563, "step": 4135 }, { "epoch": 0.4976559682654165, "grad_norm": 0.5662885904312134, "learning_rate": 0.0004980122908562036, "loss": 1.3328, "step": 4140 }, { "epoch": 0.49825700204351486, "grad_norm": 0.6134732961654663, "learning_rate": 0.000498006285526168, "loss": 1.132, "step": 4145 }, { "epoch": 0.4988580358216132, "grad_norm": 0.7582330107688904, "learning_rate": 0.0004980002711743809, "loss": 0.8836, "step": 4150 }, { "epoch": 0.4994590695997115, "grad_norm": 0.6320111155509949, "learning_rate": 0.0004979942478010612, "loss": 1.1727, "step": 4155 }, { "epoch": 0.5000601033778098, "grad_norm": 0.5162654519081116, "learning_rate": 0.0004979882154064279, "loss": 1.2016, "step": 4160 }, { "epoch": 0.5006611371559082, "grad_norm": 0.25934234261512756, "learning_rate": 0.0004979821739907005, "loss": 1.0988, "step": 4165 }, { "epoch": 0.5012621709340065, "grad_norm": 0.6046193838119507, "learning_rate": 0.0004979761235540988, "loss": 1.3953, "step": 4170 }, { "epoch": 0.5018632047121048, "grad_norm": 0.5944646596908569, "learning_rate": 0.0004979700640968429, "loss": 1.1305, "step": 4175 }, { "epoch": 0.5024642384902032, "grad_norm": 0.5635605454444885, "learning_rate": 0.0004979639956191531, "loss": 1.2656, "step": 4180 }, { "epoch": 0.5030652722683014, "grad_norm": 0.5029686689376831, "learning_rate": 0.0004979579181212504, "loss": 1.2305, "step": 4185 }, { "epoch": 0.5036663060463998, "grad_norm": 0.8044158816337585, "learning_rate": 0.0004979518316033556, "loss": 1.225, "step": 4190 }, { "epoch": 0.5042673398244981, "grad_norm": 0.6193830966949463, "learning_rate": 0.0004979457360656902, "loss": 0.9453, "step": 4195 }, { "epoch": 0.5048683736025965, "grad_norm": 0.5999032258987427, "learning_rate": 0.0004979396315084761, "loss": 1.2148, "step": 4200 }, { "epoch": 0.5048683736025965, "eval_loss": 2.069140672683716, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1981, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 4200 }, { "epoch": 0.5054694073806948, "grad_norm": 0.6504601836204529, "learning_rate": 0.0004979335179319352, "loss": 1.082, "step": 4205 }, { "epoch": 0.5060704411587931, "grad_norm": 0.5235728025436401, "learning_rate": 0.00049792739533629, "loss": 0.9156, "step": 4210 }, { "epoch": 0.5066714749368915, "grad_norm": 0.46760815382003784, "learning_rate": 0.0004979212637217631, "loss": 1.007, "step": 4215 }, { "epoch": 0.5072725087149897, "grad_norm": 0.5325601696968079, "learning_rate": 0.0004979151230885776, "loss": 1.1094, "step": 4220 }, { "epoch": 0.5078735424930881, "grad_norm": 0.4098140299320221, "learning_rate": 0.0004979089734369568, "loss": 0.8449, "step": 4225 }, { "epoch": 0.5084745762711864, "grad_norm": 0.6042998433113098, "learning_rate": 0.0004979028147671246, "loss": 1.1672, "step": 4230 }, { "epoch": 0.5090756100492848, "grad_norm": 0.41027069091796875, "learning_rate": 0.0004978966470793049, "loss": 1.1992, "step": 4235 }, { "epoch": 0.5096766438273831, "grad_norm": 0.5344287753105164, "learning_rate": 0.0004978904703737221, "loss": 0.9934, "step": 4240 }, { "epoch": 0.5102776776054815, "grad_norm": 0.5234737992286682, "learning_rate": 0.000497884284650601, "loss": 1.0617, "step": 4245 }, { "epoch": 0.5108787113835798, "grad_norm": 0.4804365336894989, "learning_rate": 0.0004978780899101663, "loss": 1.0098, "step": 4250 }, { "epoch": 0.511479745161678, "grad_norm": 0.6976252198219299, "learning_rate": 0.0004978718861526438, "loss": 1.2852, "step": 4255 }, { "epoch": 0.5120807789397764, "grad_norm": 0.4136990010738373, "learning_rate": 0.0004978656733782588, "loss": 1.1062, "step": 4260 }, { "epoch": 0.5126818127178747, "grad_norm": 0.5344454646110535, "learning_rate": 0.0004978594515872373, "loss": 1.2984, "step": 4265 }, { "epoch": 0.5132828464959731, "grad_norm": 0.42136499285697937, "learning_rate": 0.0004978532207798059, "loss": 1.0766, "step": 4270 }, { "epoch": 0.5138838802740714, "grad_norm": 0.4809805750846863, "learning_rate": 0.0004978469809561911, "loss": 1.1469, "step": 4275 }, { "epoch": 0.5144849140521698, "grad_norm": 0.5629724264144897, "learning_rate": 0.0004978407321166199, "loss": 1.218, "step": 4280 }, { "epoch": 0.5150859478302681, "grad_norm": 0.34920796751976013, "learning_rate": 0.0004978344742613195, "loss": 0.8383, "step": 4285 }, { "epoch": 0.5156869816083663, "grad_norm": 0.5682929158210754, "learning_rate": 0.0004978282073905178, "loss": 1.198, "step": 4290 }, { "epoch": 0.5162880153864647, "grad_norm": 0.49201318621635437, "learning_rate": 0.0004978219315044426, "loss": 1.2352, "step": 4295 }, { "epoch": 0.516889049164563, "grad_norm": 0.721458911895752, "learning_rate": 0.0004978156466033222, "loss": 0.9203, "step": 4300 }, { "epoch": 0.5174900829426614, "grad_norm": 0.5688953995704651, "learning_rate": 0.0004978093526873853, "loss": 1.1977, "step": 4305 }, { "epoch": 0.5180911167207597, "grad_norm": 0.6028774976730347, "learning_rate": 0.0004978030497568607, "loss": 1.475, "step": 4310 }, { "epoch": 0.5186921504988581, "grad_norm": 0.6064693331718445, "learning_rate": 0.000497796737811978, "loss": 1.1285, "step": 4315 }, { "epoch": 0.5192931842769564, "grad_norm": 0.47966837882995605, "learning_rate": 0.0004977904168529664, "loss": 1.2102, "step": 4320 }, { "epoch": 0.5198942180550546, "grad_norm": 0.7327610850334167, "learning_rate": 0.0004977840868800561, "loss": 1.1367, "step": 4325 }, { "epoch": 0.520495251833153, "grad_norm": 0.5901091694831848, "learning_rate": 0.0004977777478934774, "loss": 1.3352, "step": 4330 }, { "epoch": 0.5210962856112513, "grad_norm": 0.5034139752388, "learning_rate": 0.0004977713998934607, "loss": 0.9094, "step": 4335 }, { "epoch": 0.5216973193893497, "grad_norm": 0.43822070956230164, "learning_rate": 0.0004977650428802371, "loss": 1.3539, "step": 4340 }, { "epoch": 0.522298353167448, "grad_norm": 0.5016989707946777, "learning_rate": 0.0004977586768540377, "loss": 1.2398, "step": 4345 }, { "epoch": 0.5228993869455464, "grad_norm": 0.4488658607006073, "learning_rate": 0.0004977523018150941, "loss": 1.0188, "step": 4350 }, { "epoch": 0.5235004207236447, "grad_norm": 0.5360990762710571, "learning_rate": 0.0004977459177636384, "loss": 1.1133, "step": 4355 }, { "epoch": 0.524101454501743, "grad_norm": 0.41790762543678284, "learning_rate": 0.0004977395246999026, "loss": 1.0016, "step": 4360 }, { "epoch": 0.5247024882798413, "grad_norm": 0.8560382723808289, "learning_rate": 0.0004977331226241194, "loss": 1.1953, "step": 4365 }, { "epoch": 0.5253035220579396, "grad_norm": 0.5320670008659363, "learning_rate": 0.0004977267115365216, "loss": 1.0711, "step": 4370 }, { "epoch": 0.525904555836038, "grad_norm": 0.5246152877807617, "learning_rate": 0.0004977202914373426, "loss": 1.2891, "step": 4375 }, { "epoch": 0.5265055896141363, "grad_norm": 0.6537752747535706, "learning_rate": 0.0004977138623268156, "loss": 1.2719, "step": 4380 }, { "epoch": 0.5271066233922347, "grad_norm": 0.5286105275154114, "learning_rate": 0.0004977074242051748, "loss": 1.1391, "step": 4385 }, { "epoch": 0.527707657170333, "grad_norm": 0.5731997489929199, "learning_rate": 0.0004977009770726541, "loss": 1.0484, "step": 4390 }, { "epoch": 0.5283086909484312, "grad_norm": 0.6958596110343933, "learning_rate": 0.0004976945209294884, "loss": 0.9648, "step": 4395 }, { "epoch": 0.5289097247265296, "grad_norm": 0.8171592950820923, "learning_rate": 0.0004976880557759124, "loss": 1.5773, "step": 4400 }, { "epoch": 0.5289097247265296, "eval_loss": 2.049023389816284, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1838, "eval_samples_per_second": 4.548, "eval_steps_per_second": 1.137, "step": 4400 }, { "epoch": 0.5295107585046279, "grad_norm": 0.4283256530761719, "learning_rate": 0.000497681581612161, "loss": 0.9379, "step": 4405 }, { "epoch": 0.5301117922827263, "grad_norm": 0.6683317422866821, "learning_rate": 0.0004976750984384701, "loss": 1.3484, "step": 4410 }, { "epoch": 0.5307128260608246, "grad_norm": 0.7179862260818481, "learning_rate": 0.0004976686062550754, "loss": 0.7375, "step": 4415 }, { "epoch": 0.531313859838923, "grad_norm": 0.5547930002212524, "learning_rate": 0.000497662105062213, "loss": 0.932, "step": 4420 }, { "epoch": 0.5319148936170213, "grad_norm": 0.6749304533004761, "learning_rate": 0.0004976555948601194, "loss": 1.0969, "step": 4425 }, { "epoch": 0.5325159273951197, "grad_norm": 0.4598415493965149, "learning_rate": 0.0004976490756490316, "loss": 1.2563, "step": 4430 }, { "epoch": 0.5331169611732179, "grad_norm": 1.0315395593643188, "learning_rate": 0.0004976425474291866, "loss": 0.9234, "step": 4435 }, { "epoch": 0.5337179949513162, "grad_norm": 0.5202517509460449, "learning_rate": 0.0004976360102008219, "loss": 0.9219, "step": 4440 }, { "epoch": 0.5343190287294146, "grad_norm": 0.6117017269134521, "learning_rate": 0.0004976294639641753, "loss": 1.1805, "step": 4445 }, { "epoch": 0.5349200625075129, "grad_norm": 0.6244452595710754, "learning_rate": 0.000497622908719485, "loss": 1.1406, "step": 4450 }, { "epoch": 0.5355210962856113, "grad_norm": 0.6520891785621643, "learning_rate": 0.0004976163444669893, "loss": 1.2078, "step": 4455 }, { "epoch": 0.5361221300637096, "grad_norm": 0.6139251589775085, "learning_rate": 0.0004976097712069272, "loss": 1.0719, "step": 4460 }, { "epoch": 0.536723163841808, "grad_norm": 0.46594560146331787, "learning_rate": 0.0004976031889395376, "loss": 0.9531, "step": 4465 }, { "epoch": 0.5373241976199062, "grad_norm": 0.6443547606468201, "learning_rate": 0.0004975965976650601, "loss": 1.0945, "step": 4470 }, { "epoch": 0.5379252313980045, "grad_norm": 0.6175803542137146, "learning_rate": 0.0004975899973837344, "loss": 1.5594, "step": 4475 }, { "epoch": 0.5385262651761029, "grad_norm": 0.7328972220420837, "learning_rate": 0.0004975833880958006, "loss": 1.1953, "step": 4480 }, { "epoch": 0.5391272989542012, "grad_norm": 0.485140860080719, "learning_rate": 0.0004975767698014992, "loss": 1.1117, "step": 4485 }, { "epoch": 0.5397283327322996, "grad_norm": 0.5293512344360352, "learning_rate": 0.0004975701425010709, "loss": 1.025, "step": 4490 }, { "epoch": 0.5403293665103979, "grad_norm": 0.7554160952568054, "learning_rate": 0.0004975635061947568, "loss": 1.0391, "step": 4495 }, { "epoch": 0.5409304002884963, "grad_norm": 0.41734662652015686, "learning_rate": 0.0004975568608827982, "loss": 0.8695, "step": 4500 }, { "epoch": 0.5415314340665945, "grad_norm": 0.39808404445648193, "learning_rate": 0.0004975502065654371, "loss": 1.1461, "step": 4505 }, { "epoch": 0.5421324678446928, "grad_norm": 0.6031619906425476, "learning_rate": 0.0004975435432429153, "loss": 1.2563, "step": 4510 }, { "epoch": 0.5427335016227912, "grad_norm": 0.7665862441062927, "learning_rate": 0.0004975368709154753, "loss": 1.2812, "step": 4515 }, { "epoch": 0.5433345354008895, "grad_norm": 0.7608307003974915, "learning_rate": 0.0004975301895833598, "loss": 1.1766, "step": 4520 }, { "epoch": 0.5439355691789879, "grad_norm": 0.6584916114807129, "learning_rate": 0.0004975234992468118, "loss": 1.0078, "step": 4525 }, { "epoch": 0.5445366029570862, "grad_norm": 0.5199896097183228, "learning_rate": 0.0004975167999060748, "loss": 1.2406, "step": 4530 }, { "epoch": 0.5451376367351846, "grad_norm": 0.5209721922874451, "learning_rate": 0.0004975100915613925, "loss": 1.0367, "step": 4535 }, { "epoch": 0.5457386705132828, "grad_norm": 0.6140447854995728, "learning_rate": 0.0004975033742130087, "loss": 0.9734, "step": 4540 }, { "epoch": 0.5463397042913811, "grad_norm": 0.5982779860496521, "learning_rate": 0.0004974966478611681, "loss": 1.182, "step": 4545 }, { "epoch": 0.5469407380694795, "grad_norm": 0.48514458537101746, "learning_rate": 0.0004974899125061151, "loss": 1.2273, "step": 4550 }, { "epoch": 0.5475417718475778, "grad_norm": 0.5001315474510193, "learning_rate": 0.0004974831681480949, "loss": 1.2789, "step": 4555 }, { "epoch": 0.5481428056256762, "grad_norm": 0.4541556239128113, "learning_rate": 0.0004974764147873526, "loss": 0.9297, "step": 4560 }, { "epoch": 0.5487438394037745, "grad_norm": 0.5090661644935608, "learning_rate": 0.0004974696524241342, "loss": 0.984, "step": 4565 }, { "epoch": 0.5493448731818729, "grad_norm": 0.49955224990844727, "learning_rate": 0.0004974628810586854, "loss": 1.0625, "step": 4570 }, { "epoch": 0.5499459069599711, "grad_norm": 0.6976388692855835, "learning_rate": 0.0004974561006912527, "loss": 1.1914, "step": 4575 }, { "epoch": 0.5505469407380695, "grad_norm": 0.4535270929336548, "learning_rate": 0.0004974493113220827, "loss": 0.7617, "step": 4580 }, { "epoch": 0.5511479745161678, "grad_norm": 0.8714334964752197, "learning_rate": 0.0004974425129514224, "loss": 0.9938, "step": 4585 }, { "epoch": 0.5517490082942661, "grad_norm": 0.49312853813171387, "learning_rate": 0.000497435705579519, "loss": 1.2945, "step": 4590 }, { "epoch": 0.5523500420723645, "grad_norm": 0.5688885450363159, "learning_rate": 0.0004974288892066203, "loss": 1.0527, "step": 4595 }, { "epoch": 0.5529510758504628, "grad_norm": 0.5999502539634705, "learning_rate": 0.0004974220638329741, "loss": 0.9891, "step": 4600 }, { "epoch": 0.5529510758504628, "eval_loss": 2.064257860183716, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2145, "eval_samples_per_second": 4.544, "eval_steps_per_second": 1.136, "step": 4600 }, { "epoch": 0.5535521096285612, "grad_norm": 0.5289875268936157, "learning_rate": 0.0004974152294588289, "loss": 1.1328, "step": 4605 }, { "epoch": 0.5541531434066594, "grad_norm": 0.7003397941589355, "learning_rate": 0.000497408386084433, "loss": 0.8477, "step": 4610 }, { "epoch": 0.5547541771847578, "grad_norm": 0.5111108422279358, "learning_rate": 0.0004974015337100357, "loss": 1.168, "step": 4615 }, { "epoch": 0.5553552109628561, "grad_norm": 0.4160408675670624, "learning_rate": 0.0004973946723358858, "loss": 1.3211, "step": 4620 }, { "epoch": 0.5559562447409544, "grad_norm": 0.5501172542572021, "learning_rate": 0.0004973878019622335, "loss": 1.3945, "step": 4625 }, { "epoch": 0.5565572785190528, "grad_norm": 0.4727821350097656, "learning_rate": 0.0004973809225893282, "loss": 1.3078, "step": 4630 }, { "epoch": 0.5571583122971511, "grad_norm": 0.5417948365211487, "learning_rate": 0.0004973740342174204, "loss": 1.1062, "step": 4635 }, { "epoch": 0.5577593460752495, "grad_norm": 0.39009690284729004, "learning_rate": 0.0004973671368467607, "loss": 0.7863, "step": 4640 }, { "epoch": 0.5583603798533477, "grad_norm": 0.6143473386764526, "learning_rate": 0.0004973602304776, "loss": 1.0836, "step": 4645 }, { "epoch": 0.5589614136314461, "grad_norm": 0.46591776609420776, "learning_rate": 0.0004973533151101893, "loss": 1.0797, "step": 4650 }, { "epoch": 0.5595624474095444, "grad_norm": 0.636924684047699, "learning_rate": 0.0004973463907447804, "loss": 0.9961, "step": 4655 }, { "epoch": 0.5601634811876427, "grad_norm": 0.7269019484519958, "learning_rate": 0.0004973394573816252, "loss": 1.0727, "step": 4660 }, { "epoch": 0.5607645149657411, "grad_norm": 0.5581966638565063, "learning_rate": 0.0004973325150209758, "loss": 0.832, "step": 4665 }, { "epoch": 0.5613655487438394, "grad_norm": 0.586113691329956, "learning_rate": 0.0004973255636630847, "loss": 1.132, "step": 4670 }, { "epoch": 0.5619665825219378, "grad_norm": 0.7046157121658325, "learning_rate": 0.0004973186033082049, "loss": 1.1008, "step": 4675 }, { "epoch": 0.562567616300036, "grad_norm": 0.6651023626327515, "learning_rate": 0.0004973116339565897, "loss": 1.1445, "step": 4680 }, { "epoch": 0.5631686500781344, "grad_norm": 0.6627675890922546, "learning_rate": 0.0004973046556084923, "loss": 1.2563, "step": 4685 }, { "epoch": 0.5637696838562327, "grad_norm": 0.4929293692111969, "learning_rate": 0.0004972976682641668, "loss": 1.1836, "step": 4690 }, { "epoch": 0.564370717634331, "grad_norm": 0.500619649887085, "learning_rate": 0.0004972906719238673, "loss": 1.1789, "step": 4695 }, { "epoch": 0.5649717514124294, "grad_norm": 0.456321120262146, "learning_rate": 0.0004972836665878483, "loss": 1.1375, "step": 4700 }, { "epoch": 0.5655727851905277, "grad_norm": 0.43033653497695923, "learning_rate": 0.0004972766522563648, "loss": 0.7367, "step": 4705 }, { "epoch": 0.5661738189686261, "grad_norm": 0.5689796805381775, "learning_rate": 0.0004972696289296715, "loss": 0.7828, "step": 4710 }, { "epoch": 0.5667748527467243, "grad_norm": 0.6046550273895264, "learning_rate": 0.0004972625966080244, "loss": 1.082, "step": 4715 }, { "epoch": 0.5673758865248227, "grad_norm": 0.6092924475669861, "learning_rate": 0.0004972555552916791, "loss": 1.0945, "step": 4720 }, { "epoch": 0.567976920302921, "grad_norm": 0.6022126078605652, "learning_rate": 0.0004972485049808918, "loss": 1.0648, "step": 4725 }, { "epoch": 0.5685779540810193, "grad_norm": 0.6475672721862793, "learning_rate": 0.0004972414456759189, "loss": 1.4375, "step": 4730 }, { "epoch": 0.5691789878591177, "grad_norm": 0.5474737882614136, "learning_rate": 0.0004972343773770172, "loss": 1.2367, "step": 4735 }, { "epoch": 0.569780021637216, "grad_norm": 0.5624226331710815, "learning_rate": 0.0004972273000844439, "loss": 0.9305, "step": 4740 }, { "epoch": 0.5703810554153144, "grad_norm": 0.5779100656509399, "learning_rate": 0.0004972202137984564, "loss": 0.9156, "step": 4745 }, { "epoch": 0.5709820891934126, "grad_norm": 0.8340283036231995, "learning_rate": 0.0004972131185193123, "loss": 1.3273, "step": 4750 }, { "epoch": 0.571583122971511, "grad_norm": 0.5648449063301086, "learning_rate": 0.0004972060142472702, "loss": 1.1727, "step": 4755 }, { "epoch": 0.5721841567496093, "grad_norm": 0.6421749591827393, "learning_rate": 0.0004971989009825879, "loss": 1.0184, "step": 4760 }, { "epoch": 0.5727851905277077, "grad_norm": 0.5583392381668091, "learning_rate": 0.0004971917787255247, "loss": 1.3945, "step": 4765 }, { "epoch": 0.573386224305806, "grad_norm": 0.6780814528465271, "learning_rate": 0.0004971846474763394, "loss": 1.2617, "step": 4770 }, { "epoch": 0.5739872580839043, "grad_norm": 0.6801932454109192, "learning_rate": 0.0004971775072352914, "loss": 1.127, "step": 4775 }, { "epoch": 0.5745882918620027, "grad_norm": 0.7682610154151917, "learning_rate": 0.0004971703580026407, "loss": 1.093, "step": 4780 }, { "epoch": 0.575189325640101, "grad_norm": 0.5586374998092651, "learning_rate": 0.000497163199778647, "loss": 1.1852, "step": 4785 }, { "epoch": 0.5757903594181993, "grad_norm": 0.574213445186615, "learning_rate": 0.000497156032563571, "loss": 1.257, "step": 4790 }, { "epoch": 0.5763913931962976, "grad_norm": 0.4758005738258362, "learning_rate": 0.0004971488563576732, "loss": 0.7699, "step": 4795 }, { "epoch": 0.576992426974396, "grad_norm": 0.7012805938720703, "learning_rate": 0.0004971416711612149, "loss": 0.9437, "step": 4800 }, { "epoch": 0.576992426974396, "eval_loss": 2.10546875, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1867, "eval_samples_per_second": 4.547, "eval_steps_per_second": 1.137, "step": 4800 }, { "epoch": 0.5775934607524943, "grad_norm": 0.5702618360519409, "learning_rate": 0.0004971344769744572, "loss": 1.4156, "step": 4805 }, { "epoch": 0.5781944945305926, "grad_norm": 0.5781665444374084, "learning_rate": 0.000497127273797662, "loss": 1.1836, "step": 4810 }, { "epoch": 0.578795528308691, "grad_norm": 0.6150456070899963, "learning_rate": 0.0004971200616310914, "loss": 1.0434, "step": 4815 }, { "epoch": 0.5793965620867892, "grad_norm": 0.49010351300239563, "learning_rate": 0.0004971128404750075, "loss": 1.0312, "step": 4820 }, { "epoch": 0.5799975958648876, "grad_norm": 0.5255641341209412, "learning_rate": 0.000497105610329673, "loss": 1.1898, "step": 4825 }, { "epoch": 0.5805986296429859, "grad_norm": 0.5797913670539856, "learning_rate": 0.0004970983711953512, "loss": 0.9797, "step": 4830 }, { "epoch": 0.5811996634210843, "grad_norm": 0.3937300741672516, "learning_rate": 0.0004970911230723052, "loss": 1.1031, "step": 4835 }, { "epoch": 0.5818006971991826, "grad_norm": 0.5963497161865234, "learning_rate": 0.0004970838659607987, "loss": 1.0164, "step": 4840 }, { "epoch": 0.5824017309772809, "grad_norm": 0.7100831866264343, "learning_rate": 0.0004970765998610957, "loss": 0.943, "step": 4845 }, { "epoch": 0.5830027647553793, "grad_norm": 0.5534042119979858, "learning_rate": 0.0004970693247734606, "loss": 1.143, "step": 4850 }, { "epoch": 0.5836037985334775, "grad_norm": 0.8651660084724426, "learning_rate": 0.000497062040698158, "loss": 0.8562, "step": 4855 }, { "epoch": 0.5842048323115759, "grad_norm": 0.5462768077850342, "learning_rate": 0.0004970547476354528, "loss": 1.118, "step": 4860 }, { "epoch": 0.5848058660896742, "grad_norm": 0.5735900402069092, "learning_rate": 0.0004970474455856103, "loss": 1.0475, "step": 4865 }, { "epoch": 0.5854068998677726, "grad_norm": 0.7304772138595581, "learning_rate": 0.0004970401345488962, "loss": 0.9406, "step": 4870 }, { "epoch": 0.5860079336458709, "grad_norm": 0.8543664813041687, "learning_rate": 0.0004970328145255767, "loss": 1.1461, "step": 4875 }, { "epoch": 0.5866089674239692, "grad_norm": 0.47991877794265747, "learning_rate": 0.0004970254855159176, "loss": 1.2852, "step": 4880 }, { "epoch": 0.5872100012020676, "grad_norm": 0.5912867188453674, "learning_rate": 0.0004970181475201857, "loss": 1.0023, "step": 4885 }, { "epoch": 0.5878110349801658, "grad_norm": 0.598426878452301, "learning_rate": 0.0004970108005386482, "loss": 1.3953, "step": 4890 }, { "epoch": 0.5884120687582642, "grad_norm": 0.40192949771881104, "learning_rate": 0.0004970034445715719, "loss": 1.3156, "step": 4895 }, { "epoch": 0.5890131025363625, "grad_norm": 0.663162112236023, "learning_rate": 0.0004969960796192246, "loss": 1.0555, "step": 4900 }, { "epoch": 0.5896141363144609, "grad_norm": 0.5530306696891785, "learning_rate": 0.0004969887056818743, "loss": 1.3445, "step": 4905 }, { "epoch": 0.5902151700925592, "grad_norm": 0.5497463941574097, "learning_rate": 0.0004969813227597892, "loss": 1.3188, "step": 4910 }, { "epoch": 0.5908162038706576, "grad_norm": 0.40855851769447327, "learning_rate": 0.0004969739308532379, "loss": 1.1141, "step": 4915 }, { "epoch": 0.5914172376487559, "grad_norm": 0.6550401449203491, "learning_rate": 0.0004969665299624891, "loss": 1.0875, "step": 4920 }, { "epoch": 0.5920182714268541, "grad_norm": 0.6358505487442017, "learning_rate": 0.0004969591200878122, "loss": 1.4852, "step": 4925 }, { "epoch": 0.5926193052049525, "grad_norm": 0.45583033561706543, "learning_rate": 0.0004969517012294768, "loss": 1.1062, "step": 4930 }, { "epoch": 0.5932203389830508, "grad_norm": 0.6163007020950317, "learning_rate": 0.0004969442733877526, "loss": 1.1102, "step": 4935 }, { "epoch": 0.5938213727611492, "grad_norm": 0.7980563640594482, "learning_rate": 0.00049693683656291, "loss": 1.1547, "step": 4940 }, { "epoch": 0.5944224065392475, "grad_norm": 0.371162086725235, "learning_rate": 0.0004969293907552193, "loss": 1.2648, "step": 4945 }, { "epoch": 0.5950234403173459, "grad_norm": 0.4477928578853607, "learning_rate": 0.0004969219359649516, "loss": 1.193, "step": 4950 }, { "epoch": 0.5956244740954442, "grad_norm": 0.5350117683410645, "learning_rate": 0.0004969144721923779, "loss": 0.7547, "step": 4955 }, { "epoch": 0.5962255078735424, "grad_norm": 0.7265620231628418, "learning_rate": 0.0004969069994377697, "loss": 1.1359, "step": 4960 }, { "epoch": 0.5968265416516408, "grad_norm": 0.4857397973537445, "learning_rate": 0.0004968995177013991, "loss": 1.2906, "step": 4965 }, { "epoch": 0.5974275754297391, "grad_norm": 0.43424108624458313, "learning_rate": 0.000496892026983538, "loss": 1.1633, "step": 4970 }, { "epoch": 0.5980286092078375, "grad_norm": 0.5249903202056885, "learning_rate": 0.0004968845272844589, "loss": 0.8766, "step": 4975 }, { "epoch": 0.5986296429859358, "grad_norm": 0.591312825679779, "learning_rate": 0.0004968770186044347, "loss": 1.107, "step": 4980 }, { "epoch": 0.5992306767640342, "grad_norm": 0.5988960266113281, "learning_rate": 0.0004968695009437385, "loss": 1.0281, "step": 4985 }, { "epoch": 0.5998317105421325, "grad_norm": 0.5821179151535034, "learning_rate": 0.0004968619743026439, "loss": 1.0852, "step": 4990 }, { "epoch": 0.6004327443202307, "grad_norm": 0.5867944359779358, "learning_rate": 0.0004968544386814245, "loss": 1.1008, "step": 4995 }, { "epoch": 0.6010337780983291, "grad_norm": 0.461322158575058, "learning_rate": 0.0004968468940803546, "loss": 0.9305, "step": 5000 }, { "epoch": 0.6010337780983291, "eval_loss": 2.072265625, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2054, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 5000 }, { "epoch": 0.6016348118764274, "grad_norm": 0.47805655002593994, "learning_rate": 0.0004968393404997085, "loss": 0.8844, "step": 5005 }, { "epoch": 0.6022358456545258, "grad_norm": 0.6164557933807373, "learning_rate": 0.0004968317779397611, "loss": 1.0875, "step": 5010 }, { "epoch": 0.6028368794326241, "grad_norm": 0.6458466053009033, "learning_rate": 0.0004968242064007875, "loss": 0.8664, "step": 5015 }, { "epoch": 0.6034379132107225, "grad_norm": 0.5833812355995178, "learning_rate": 0.000496816625883063, "loss": 0.7797, "step": 5020 }, { "epoch": 0.6040389469888208, "grad_norm": 0.6610152721405029, "learning_rate": 0.0004968090363868634, "loss": 0.8711, "step": 5025 }, { "epoch": 0.604639980766919, "grad_norm": 0.5444918274879456, "learning_rate": 0.0004968014379124649, "loss": 1.1195, "step": 5030 }, { "epoch": 0.6052410145450174, "grad_norm": 0.7095007300376892, "learning_rate": 0.0004967938304601438, "loss": 1.2203, "step": 5035 }, { "epoch": 0.6058420483231157, "grad_norm": 0.5106276869773865, "learning_rate": 0.000496786214030177, "loss": 0.9242, "step": 5040 }, { "epoch": 0.6064430821012141, "grad_norm": 0.7541503310203552, "learning_rate": 0.0004967785886228414, "loss": 1.1953, "step": 5045 }, { "epoch": 0.6070441158793124, "grad_norm": 0.6698989868164062, "learning_rate": 0.0004967709542384142, "loss": 1.0977, "step": 5050 }, { "epoch": 0.6076451496574108, "grad_norm": 0.65958172082901, "learning_rate": 0.0004967633108771735, "loss": 1.1953, "step": 5055 }, { "epoch": 0.6082461834355091, "grad_norm": 0.4623226225376129, "learning_rate": 0.0004967556585393972, "loss": 1.0875, "step": 5060 }, { "epoch": 0.6088472172136074, "grad_norm": 0.5534173846244812, "learning_rate": 0.0004967479972253637, "loss": 0.8086, "step": 5065 }, { "epoch": 0.6094482509917057, "grad_norm": 0.5159333944320679, "learning_rate": 0.0004967403269353516, "loss": 1.0781, "step": 5070 }, { "epoch": 0.610049284769804, "grad_norm": 0.9555249810218811, "learning_rate": 0.00049673264766964, "loss": 1.3039, "step": 5075 }, { "epoch": 0.6106503185479024, "grad_norm": 0.5126526951789856, "learning_rate": 0.0004967249594285081, "loss": 1.3594, "step": 5080 }, { "epoch": 0.6112513523260007, "grad_norm": 0.4960355758666992, "learning_rate": 0.0004967172622122358, "loss": 1.0219, "step": 5085 }, { "epoch": 0.6118523861040991, "grad_norm": 0.6622259020805359, "learning_rate": 0.000496709556021103, "loss": 0.9633, "step": 5090 }, { "epoch": 0.6124534198821974, "grad_norm": 0.5355902910232544, "learning_rate": 0.0004967018408553901, "loss": 1.2641, "step": 5095 }, { "epoch": 0.6130544536602958, "grad_norm": 0.5535646080970764, "learning_rate": 0.0004966941167153776, "loss": 1.125, "step": 5100 }, { "epoch": 0.613655487438394, "grad_norm": 0.40969082713127136, "learning_rate": 0.0004966863836013465, "loss": 1.5328, "step": 5105 }, { "epoch": 0.6142565212164923, "grad_norm": 0.7018454074859619, "learning_rate": 0.0004966786415135783, "loss": 1.009, "step": 5110 }, { "epoch": 0.6148575549945907, "grad_norm": 0.7394018769264221, "learning_rate": 0.0004966708904523546, "loss": 1.4625, "step": 5115 }, { "epoch": 0.615458588772689, "grad_norm": 0.43013033270835876, "learning_rate": 0.0004966631304179571, "loss": 0.7812, "step": 5120 }, { "epoch": 0.6160596225507874, "grad_norm": 0.6355203986167908, "learning_rate": 0.0004966553614106684, "loss": 0.8352, "step": 5125 }, { "epoch": 0.6166606563288857, "grad_norm": 0.4331319332122803, "learning_rate": 0.0004966475834307708, "loss": 1.0063, "step": 5130 }, { "epoch": 0.6172616901069841, "grad_norm": 0.47516930103302, "learning_rate": 0.0004966397964785475, "loss": 1.1465, "step": 5135 }, { "epoch": 0.6178627238850823, "grad_norm": 0.46409985423088074, "learning_rate": 0.0004966320005542817, "loss": 1.3172, "step": 5140 }, { "epoch": 0.6184637576631806, "grad_norm": 0.5062604546546936, "learning_rate": 0.000496624195658257, "loss": 1.0352, "step": 5145 }, { "epoch": 0.619064791441279, "grad_norm": 0.5797644257545471, "learning_rate": 0.0004966163817907573, "loss": 1.043, "step": 5150 }, { "epoch": 0.6196658252193773, "grad_norm": 0.6194161176681519, "learning_rate": 0.0004966085589520668, "loss": 0.9633, "step": 5155 }, { "epoch": 0.6202668589974757, "grad_norm": 0.570642352104187, "learning_rate": 0.0004966007271424701, "loss": 1.6047, "step": 5160 }, { "epoch": 0.620867892775574, "grad_norm": 0.6968045830726624, "learning_rate": 0.0004965928863622522, "loss": 1.2914, "step": 5165 }, { "epoch": 0.6214689265536724, "grad_norm": 0.4688586890697479, "learning_rate": 0.0004965850366116982, "loss": 1.4031, "step": 5170 }, { "epoch": 0.6220699603317706, "grad_norm": 0.6257545351982117, "learning_rate": 0.0004965771778910936, "loss": 1.1641, "step": 5175 }, { "epoch": 0.6226709941098689, "grad_norm": 0.5326806902885437, "learning_rate": 0.0004965693102007245, "loss": 0.973, "step": 5180 }, { "epoch": 0.6232720278879673, "grad_norm": 0.5571873188018799, "learning_rate": 0.0004965614335408769, "loss": 0.9023, "step": 5185 }, { "epoch": 0.6238730616660656, "grad_norm": 0.67780601978302, "learning_rate": 0.0004965535479118374, "loss": 1.1531, "step": 5190 }, { "epoch": 0.624474095444164, "grad_norm": 0.5032584071159363, "learning_rate": 0.0004965456533138928, "loss": 1.3109, "step": 5195 }, { "epoch": 0.6250751292222623, "grad_norm": 0.5984988808631897, "learning_rate": 0.0004965377497473304, "loss": 1.3664, "step": 5200 }, { "epoch": 0.6250751292222623, "eval_loss": 1.9845702648162842, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2079, "eval_samples_per_second": 4.544, "eval_steps_per_second": 1.136, "step": 5200 }, { "epoch": 0.6256761630003607, "grad_norm": 0.6601026654243469, "learning_rate": 0.0004965298372124376, "loss": 1.0969, "step": 5205 }, { "epoch": 0.6262771967784589, "grad_norm": 0.6069896817207336, "learning_rate": 0.0004965219157095024, "loss": 0.7707, "step": 5210 }, { "epoch": 0.6268782305565572, "grad_norm": 0.5757645964622498, "learning_rate": 0.0004965139852388127, "loss": 0.9836, "step": 5215 }, { "epoch": 0.6274792643346556, "grad_norm": 0.658743143081665, "learning_rate": 0.0004965060458006573, "loss": 0.7297, "step": 5220 }, { "epoch": 0.6280802981127539, "grad_norm": 0.6893699765205383, "learning_rate": 0.0004964980973953247, "loss": 1.0586, "step": 5225 }, { "epoch": 0.6286813318908523, "grad_norm": 0.6678071022033691, "learning_rate": 0.0004964901400231043, "loss": 0.9922, "step": 5230 }, { "epoch": 0.6292823656689506, "grad_norm": 0.8199008703231812, "learning_rate": 0.0004964821736842854, "loss": 1.0664, "step": 5235 }, { "epoch": 0.629883399447049, "grad_norm": 0.516659140586853, "learning_rate": 0.0004964741983791578, "loss": 1.0359, "step": 5240 }, { "epoch": 0.6304844332251472, "grad_norm": 0.5256388187408447, "learning_rate": 0.0004964662141080117, "loss": 0.8906, "step": 5245 }, { "epoch": 0.6310854670032456, "grad_norm": 0.6450999975204468, "learning_rate": 0.0004964582208711375, "loss": 0.8363, "step": 5250 }, { "epoch": 0.6316865007813439, "grad_norm": 0.7471349239349365, "learning_rate": 0.000496450218668826, "loss": 1.198, "step": 5255 }, { "epoch": 0.6322875345594422, "grad_norm": 0.5109617114067078, "learning_rate": 0.0004964422075013682, "loss": 0.9227, "step": 5260 }, { "epoch": 0.6328885683375406, "grad_norm": 0.48837974667549133, "learning_rate": 0.0004964341873690557, "loss": 0.9469, "step": 5265 }, { "epoch": 0.6334896021156389, "grad_norm": 0.7013422250747681, "learning_rate": 0.0004964261582721801, "loss": 1.1594, "step": 5270 }, { "epoch": 0.6340906358937373, "grad_norm": 0.3042369484901428, "learning_rate": 0.0004964181202110335, "loss": 1.3625, "step": 5275 }, { "epoch": 0.6346916696718355, "grad_norm": 0.8467724919319153, "learning_rate": 0.0004964100731859084, "loss": 0.9328, "step": 5280 }, { "epoch": 0.6352927034499339, "grad_norm": 0.7622195482254028, "learning_rate": 0.0004964020171970974, "loss": 1.0672, "step": 5285 }, { "epoch": 0.6358937372280322, "grad_norm": 0.5378862023353577, "learning_rate": 0.0004963939522448936, "loss": 1.0277, "step": 5290 }, { "epoch": 0.6364947710061305, "grad_norm": 0.5819463133811951, "learning_rate": 0.0004963858783295905, "loss": 1.1039, "step": 5295 }, { "epoch": 0.6370958047842289, "grad_norm": 0.5175995826721191, "learning_rate": 0.0004963777954514816, "loss": 1.3875, "step": 5300 }, { "epoch": 0.6376968385623272, "grad_norm": 0.6846615076065063, "learning_rate": 0.000496369703610861, "loss": 0.8723, "step": 5305 }, { "epoch": 0.6382978723404256, "grad_norm": 0.6325473189353943, "learning_rate": 0.000496361602808023, "loss": 1.2469, "step": 5310 }, { "epoch": 0.6388989061185238, "grad_norm": 0.49080929160118103, "learning_rate": 0.0004963534930432625, "loss": 1.2977, "step": 5315 }, { "epoch": 0.6394999398966222, "grad_norm": 0.4832301437854767, "learning_rate": 0.0004963453743168743, "loss": 1.1719, "step": 5320 }, { "epoch": 0.6401009736747205, "grad_norm": 0.7145053744316101, "learning_rate": 0.000496337246629154, "loss": 1.2859, "step": 5325 }, { "epoch": 0.6407020074528188, "grad_norm": 0.5795300602912903, "learning_rate": 0.0004963291099803969, "loss": 0.9617, "step": 5330 }, { "epoch": 0.6413030412309172, "grad_norm": 0.9390627145767212, "learning_rate": 0.0004963209643708991, "loss": 1.1703, "step": 5335 }, { "epoch": 0.6419040750090155, "grad_norm": 0.7437942624092102, "learning_rate": 0.000496312809800957, "loss": 1.0711, "step": 5340 }, { "epoch": 0.6425051087871139, "grad_norm": 0.5893829464912415, "learning_rate": 0.0004963046462708673, "loss": 1.2203, "step": 5345 }, { "epoch": 0.6431061425652121, "grad_norm": 0.430411696434021, "learning_rate": 0.0004962964737809268, "loss": 1.068, "step": 5350 }, { "epoch": 0.6437071763433105, "grad_norm": 0.4798862040042877, "learning_rate": 0.0004962882923314329, "loss": 1.0195, "step": 5355 }, { "epoch": 0.6443082101214088, "grad_norm": 0.7981201410293579, "learning_rate": 0.0004962801019226833, "loss": 0.9211, "step": 5360 }, { "epoch": 0.6449092438995071, "grad_norm": 0.4801620841026306, "learning_rate": 0.0004962719025549757, "loss": 1.2961, "step": 5365 }, { "epoch": 0.6455102776776055, "grad_norm": 0.4748445451259613, "learning_rate": 0.0004962636942286086, "loss": 1.207, "step": 5370 }, { "epoch": 0.6461113114557038, "grad_norm": 0.5984495282173157, "learning_rate": 0.0004962554769438802, "loss": 0.9734, "step": 5375 }, { "epoch": 0.6467123452338022, "grad_norm": 0.6702393889427185, "learning_rate": 0.0004962472507010901, "loss": 1.0977, "step": 5380 }, { "epoch": 0.6473133790119004, "grad_norm": 0.5429012775421143, "learning_rate": 0.0004962390155005369, "loss": 1.0641, "step": 5385 }, { "epoch": 0.6479144127899988, "grad_norm": 0.39091694355010986, "learning_rate": 0.0004962307713425206, "loss": 0.9906, "step": 5390 }, { "epoch": 0.6485154465680971, "grad_norm": 0.4490283131599426, "learning_rate": 0.0004962225182273409, "loss": 1.1297, "step": 5395 }, { "epoch": 0.6491164803461954, "grad_norm": 0.6250773072242737, "learning_rate": 0.0004962142561552981, "loss": 0.8102, "step": 5400 }, { "epoch": 0.6491164803461954, "eval_loss": 2.007031202316284, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2151, "eval_samples_per_second": 4.544, "eval_steps_per_second": 1.136, "step": 5400 }, { "epoch": 0.6497175141242938, "grad_norm": 0.4846701920032501, "learning_rate": 0.0004962059851266926, "loss": 1.4289, "step": 5405 }, { "epoch": 0.6503185479023921, "grad_norm": 0.5710358023643494, "learning_rate": 0.0004961977051418253, "loss": 1.0961, "step": 5410 }, { "epoch": 0.6509195816804905, "grad_norm": 0.46397438645362854, "learning_rate": 0.0004961894162009977, "loss": 0.9297, "step": 5415 }, { "epoch": 0.6515206154585887, "grad_norm": 0.5259897708892822, "learning_rate": 0.0004961811183045111, "loss": 1.0891, "step": 5420 }, { "epoch": 0.6521216492366871, "grad_norm": 0.44706469774246216, "learning_rate": 0.0004961728114526672, "loss": 1.175, "step": 5425 }, { "epoch": 0.6527226830147854, "grad_norm": 0.6101322174072266, "learning_rate": 0.0004961644956457685, "loss": 1.143, "step": 5430 }, { "epoch": 0.6533237167928838, "grad_norm": 0.544548511505127, "learning_rate": 0.0004961561708841173, "loss": 0.9246, "step": 5435 }, { "epoch": 0.6539247505709821, "grad_norm": 0.7637187242507935, "learning_rate": 0.0004961478371680165, "loss": 1.3031, "step": 5440 }, { "epoch": 0.6545257843490804, "grad_norm": 0.5239999294281006, "learning_rate": 0.0004961394944977692, "loss": 1.0242, "step": 5445 }, { "epoch": 0.6551268181271788, "grad_norm": 0.6028966307640076, "learning_rate": 0.000496131142873679, "loss": 1.2961, "step": 5450 }, { "epoch": 0.655727851905277, "grad_norm": 0.9021722674369812, "learning_rate": 0.0004961227822960495, "loss": 1.0543, "step": 5455 }, { "epoch": 0.6563288856833754, "grad_norm": 0.8960506916046143, "learning_rate": 0.0004961144127651851, "loss": 1.4555, "step": 5460 }, { "epoch": 0.6569299194614737, "grad_norm": 0.6960546374320984, "learning_rate": 0.0004961060342813901, "loss": 0.7801, "step": 5465 }, { "epoch": 0.6575309532395721, "grad_norm": 0.7192292213439941, "learning_rate": 0.0004960976468449692, "loss": 0.9641, "step": 5470 }, { "epoch": 0.6581319870176704, "grad_norm": 0.605060875415802, "learning_rate": 0.0004960892504562277, "loss": 1.6898, "step": 5475 }, { "epoch": 0.6587330207957687, "grad_norm": 0.5543851256370544, "learning_rate": 0.000496080845115471, "loss": 1.4141, "step": 5480 }, { "epoch": 0.6593340545738671, "grad_norm": 0.6614383459091187, "learning_rate": 0.0004960724308230047, "loss": 1.0016, "step": 5485 }, { "epoch": 0.6599350883519653, "grad_norm": 0.5792709589004517, "learning_rate": 0.0004960640075791351, "loss": 1.2906, "step": 5490 }, { "epoch": 0.6605361221300637, "grad_norm": 0.509619951248169, "learning_rate": 0.0004960555753841685, "loss": 0.9914, "step": 5495 }, { "epoch": 0.661137155908162, "grad_norm": 0.6596083045005798, "learning_rate": 0.0004960471342384116, "loss": 0.8969, "step": 5500 }, { "epoch": 0.6617381896862604, "grad_norm": 0.43772363662719727, "learning_rate": 0.0004960386841421716, "loss": 1.3281, "step": 5505 }, { "epoch": 0.6623392234643587, "grad_norm": 0.551668643951416, "learning_rate": 0.0004960302250957558, "loss": 0.9844, "step": 5510 }, { "epoch": 0.662940257242457, "grad_norm": 0.5039249658584595, "learning_rate": 0.0004960217570994719, "loss": 1.1273, "step": 5515 }, { "epoch": 0.6635412910205554, "grad_norm": 0.48881009221076965, "learning_rate": 0.0004960132801536281, "loss": 1.3953, "step": 5520 }, { "epoch": 0.6641423247986536, "grad_norm": 0.7355161905288696, "learning_rate": 0.0004960047942585324, "loss": 0.868, "step": 5525 }, { "epoch": 0.664743358576752, "grad_norm": 0.38211506605148315, "learning_rate": 0.0004959962994144939, "loss": 1.1547, "step": 5530 }, { "epoch": 0.6653443923548503, "grad_norm": 0.5978872776031494, "learning_rate": 0.0004959877956218213, "loss": 1.1145, "step": 5535 }, { "epoch": 0.6659454261329487, "grad_norm": 0.6254252791404724, "learning_rate": 0.0004959792828808241, "loss": 1.1023, "step": 5540 }, { "epoch": 0.666546459911047, "grad_norm": 0.5093739032745361, "learning_rate": 0.0004959707611918121, "loss": 1.1742, "step": 5545 }, { "epoch": 0.6671474936891453, "grad_norm": 0.5325815677642822, "learning_rate": 0.0004959622305550951, "loss": 1.1641, "step": 5550 }, { "epoch": 0.6677485274672437, "grad_norm": 0.5326328873634338, "learning_rate": 0.0004959536909709834, "loss": 0.8969, "step": 5555 }, { "epoch": 0.668349561245342, "grad_norm": 0.45378610491752625, "learning_rate": 0.0004959451424397879, "loss": 1.0051, "step": 5560 }, { "epoch": 0.6689505950234403, "grad_norm": 0.6395484209060669, "learning_rate": 0.0004959365849618192, "loss": 1.0594, "step": 5565 }, { "epoch": 0.6695516288015386, "grad_norm": 0.6629273891448975, "learning_rate": 0.0004959280185373888, "loss": 1.182, "step": 5570 }, { "epoch": 0.670152662579637, "grad_norm": 0.6001206040382385, "learning_rate": 0.0004959194431668084, "loss": 0.825, "step": 5575 }, { "epoch": 0.6707536963577353, "grad_norm": 0.6217541098594666, "learning_rate": 0.0004959108588503898, "loss": 1.0355, "step": 5580 }, { "epoch": 0.6713547301358337, "grad_norm": 0.5813589096069336, "learning_rate": 0.0004959022655884453, "loss": 0.9965, "step": 5585 }, { "epoch": 0.671955763913932, "grad_norm": 0.4870590269565582, "learning_rate": 0.0004958936633812876, "loss": 1.1906, "step": 5590 }, { "epoch": 0.6725567976920303, "grad_norm": 0.46450790762901306, "learning_rate": 0.0004958850522292295, "loss": 0.9266, "step": 5595 }, { "epoch": 0.6731578314701286, "grad_norm": 0.7803342938423157, "learning_rate": 0.0004958764321325843, "loss": 1.1867, "step": 5600 }, { "epoch": 0.6731578314701286, "eval_loss": 1.960351586341858, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1953, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.137, "step": 5600 }, { "epoch": 0.6737588652482269, "grad_norm": 0.39002323150634766, "learning_rate": 0.0004958678030916655, "loss": 0.9242, "step": 5605 }, { "epoch": 0.6743598990263253, "grad_norm": 0.6863328814506531, "learning_rate": 0.0004958591651067872, "loss": 0.9203, "step": 5610 }, { "epoch": 0.6749609328044236, "grad_norm": 0.5728829503059387, "learning_rate": 0.0004958505181782635, "loss": 1.1797, "step": 5615 }, { "epoch": 0.675561966582522, "grad_norm": 0.5186361074447632, "learning_rate": 0.0004958418623064088, "loss": 0.7125, "step": 5620 }, { "epoch": 0.6761630003606203, "grad_norm": 0.5343161225318909, "learning_rate": 0.0004958331974915382, "loss": 1.0008, "step": 5625 }, { "epoch": 0.6767640341387186, "grad_norm": 0.6222845315933228, "learning_rate": 0.0004958245237339669, "loss": 1.0906, "step": 5630 }, { "epoch": 0.6773650679168169, "grad_norm": 0.5673485398292542, "learning_rate": 0.0004958158410340103, "loss": 1.0516, "step": 5635 }, { "epoch": 0.6779661016949152, "grad_norm": 0.6129093170166016, "learning_rate": 0.0004958071493919842, "loss": 0.932, "step": 5640 }, { "epoch": 0.6785671354730136, "grad_norm": 0.43740183115005493, "learning_rate": 0.0004957984488082049, "loss": 1.4109, "step": 5645 }, { "epoch": 0.6791681692511119, "grad_norm": 0.632262647151947, "learning_rate": 0.0004957897392829889, "loss": 1.3547, "step": 5650 }, { "epoch": 0.6797692030292103, "grad_norm": 0.5351587533950806, "learning_rate": 0.0004957810208166531, "loss": 1.1938, "step": 5655 }, { "epoch": 0.6803702368073086, "grad_norm": 0.6918035745620728, "learning_rate": 0.0004957722934095145, "loss": 0.9172, "step": 5660 }, { "epoch": 0.6809712705854069, "grad_norm": 0.5031920671463013, "learning_rate": 0.0004957635570618906, "loss": 0.9242, "step": 5665 }, { "epoch": 0.6815723043635052, "grad_norm": 0.8015581965446472, "learning_rate": 0.0004957548117740993, "loss": 0.884, "step": 5670 }, { "epoch": 0.6821733381416035, "grad_norm": 0.5075511336326599, "learning_rate": 0.0004957460575464586, "loss": 1.0656, "step": 5675 }, { "epoch": 0.6827743719197019, "grad_norm": 0.43553194403648376, "learning_rate": 0.000495737294379287, "loss": 1.1762, "step": 5680 }, { "epoch": 0.6833754056978002, "grad_norm": 0.48835834860801697, "learning_rate": 0.0004957285222729034, "loss": 1.0547, "step": 5685 }, { "epoch": 0.6839764394758986, "grad_norm": 0.5027529001235962, "learning_rate": 0.0004957197412276267, "loss": 1.1742, "step": 5690 }, { "epoch": 0.6845774732539969, "grad_norm": 0.581287682056427, "learning_rate": 0.0004957109512437766, "loss": 0.9242, "step": 5695 }, { "epoch": 0.6851785070320952, "grad_norm": 0.5454129576683044, "learning_rate": 0.0004957021523216725, "loss": 1.1492, "step": 5700 }, { "epoch": 0.6857795408101935, "grad_norm": 0.5323740839958191, "learning_rate": 0.0004956933444616347, "loss": 0.8586, "step": 5705 }, { "epoch": 0.6863805745882918, "grad_norm": 0.8149203658103943, "learning_rate": 0.0004956845276639836, "loss": 1.15, "step": 5710 }, { "epoch": 0.6869816083663902, "grad_norm": 0.6589764952659607, "learning_rate": 0.00049567570192904, "loss": 1.132, "step": 5715 }, { "epoch": 0.6875826421444885, "grad_norm": 0.5598886609077454, "learning_rate": 0.0004956668672571247, "loss": 0.9125, "step": 5720 }, { "epoch": 0.6881836759225869, "grad_norm": 0.4387410581111908, "learning_rate": 0.0004956580236485593, "loss": 0.8129, "step": 5725 }, { "epoch": 0.6887847097006852, "grad_norm": 1.346904993057251, "learning_rate": 0.0004956491711036654, "loss": 1.3328, "step": 5730 }, { "epoch": 0.6893857434787835, "grad_norm": 0.597332775592804, "learning_rate": 0.0004956403096227651, "loss": 1.2812, "step": 5735 }, { "epoch": 0.6899867772568818, "grad_norm": 0.4443131685256958, "learning_rate": 0.0004956314392061808, "loss": 0.9059, "step": 5740 }, { "epoch": 0.6905878110349801, "grad_norm": 0.547560453414917, "learning_rate": 0.000495622559854235, "loss": 1.1344, "step": 5745 }, { "epoch": 0.6911888448130785, "grad_norm": 0.5721062421798706, "learning_rate": 0.0004956136715672509, "loss": 1.1805, "step": 5750 }, { "epoch": 0.6917898785911768, "grad_norm": 0.5476487278938293, "learning_rate": 0.0004956047743455517, "loss": 1.2406, "step": 5755 }, { "epoch": 0.6923909123692752, "grad_norm": 0.5772802829742432, "learning_rate": 0.0004955958681894611, "loss": 1.1797, "step": 5760 }, { "epoch": 0.6929919461473735, "grad_norm": 0.5622345805168152, "learning_rate": 0.000495586953099303, "loss": 1.0258, "step": 5765 }, { "epoch": 0.6935929799254719, "grad_norm": 0.3974035680294037, "learning_rate": 0.0004955780290754018, "loss": 0.9348, "step": 5770 }, { "epoch": 0.6941940137035701, "grad_norm": 0.7566075921058655, "learning_rate": 0.0004955690961180822, "loss": 1.2188, "step": 5775 }, { "epoch": 0.6947950474816684, "grad_norm": 0.5623620748519897, "learning_rate": 0.0004955601542276691, "loss": 1.0195, "step": 5780 }, { "epoch": 0.6953960812597668, "grad_norm": 0.5877131819725037, "learning_rate": 0.0004955512034044876, "loss": 0.7508, "step": 5785 }, { "epoch": 0.6959971150378651, "grad_norm": 0.35780635476112366, "learning_rate": 0.0004955422436488635, "loss": 1.0418, "step": 5790 }, { "epoch": 0.6965981488159635, "grad_norm": 0.6338667869567871, "learning_rate": 0.0004955332749611227, "loss": 0.866, "step": 5795 }, { "epoch": 0.6971991825940618, "grad_norm": 0.6346995830535889, "learning_rate": 0.0004955242973415915, "loss": 0.9914, "step": 5800 }, { "epoch": 0.6971991825940618, "eval_loss": 1.979101538658142, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1923, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.137, "step": 5800 }, { "epoch": 0.6978002163721602, "grad_norm": 0.6727893948554993, "learning_rate": 0.0004955153107905964, "loss": 1.1859, "step": 5805 }, { "epoch": 0.6984012501502584, "grad_norm": 0.4857117235660553, "learning_rate": 0.0004955063153084642, "loss": 1.5766, "step": 5810 }, { "epoch": 0.6990022839283567, "grad_norm": 0.6355534791946411, "learning_rate": 0.0004954973108955223, "loss": 1.1781, "step": 5815 }, { "epoch": 0.6996033177064551, "grad_norm": 0.5181488990783691, "learning_rate": 0.0004954882975520983, "loss": 1.2094, "step": 5820 }, { "epoch": 0.7002043514845534, "grad_norm": 0.3698882460594177, "learning_rate": 0.0004954792752785198, "loss": 1.0766, "step": 5825 }, { "epoch": 0.7008053852626518, "grad_norm": 0.6260622143745422, "learning_rate": 0.0004954702440751153, "loss": 0.9172, "step": 5830 }, { "epoch": 0.7014064190407501, "grad_norm": 0.5361654162406921, "learning_rate": 0.0004954612039422132, "loss": 1.0484, "step": 5835 }, { "epoch": 0.7020074528188485, "grad_norm": 0.4724903404712677, "learning_rate": 0.0004954521548801424, "loss": 1.225, "step": 5840 }, { "epoch": 0.7026084865969467, "grad_norm": 0.5695177912712097, "learning_rate": 0.000495443096889232, "loss": 0.9734, "step": 5845 }, { "epoch": 0.703209520375045, "grad_norm": 0.8220880627632141, "learning_rate": 0.0004954340299698116, "loss": 0.9305, "step": 5850 }, { "epoch": 0.7038105541531434, "grad_norm": 0.46409645676612854, "learning_rate": 0.000495424954122211, "loss": 1.5328, "step": 5855 }, { "epoch": 0.7044115879312417, "grad_norm": 0.5940005779266357, "learning_rate": 0.0004954158693467603, "loss": 0.9586, "step": 5860 }, { "epoch": 0.7050126217093401, "grad_norm": 0.6850906014442444, "learning_rate": 0.00049540677564379, "loss": 1.2406, "step": 5865 }, { "epoch": 0.7056136554874384, "grad_norm": 0.6025176048278809, "learning_rate": 0.0004953976730136309, "loss": 1.0621, "step": 5870 }, { "epoch": 0.7062146892655368, "grad_norm": 0.5903462767601013, "learning_rate": 0.0004953885614566142, "loss": 0.8078, "step": 5875 }, { "epoch": 0.706815723043635, "grad_norm": 0.8402755856513977, "learning_rate": 0.0004953794409730713, "loss": 0.8309, "step": 5880 }, { "epoch": 0.7074167568217333, "grad_norm": 0.525725245475769, "learning_rate": 0.0004953703115633339, "loss": 1.1781, "step": 5885 }, { "epoch": 0.7080177905998317, "grad_norm": 0.5770952701568604, "learning_rate": 0.0004953611732277342, "loss": 1.3102, "step": 5890 }, { "epoch": 0.70861882437793, "grad_norm": 0.5640763640403748, "learning_rate": 0.0004953520259666046, "loss": 1.1016, "step": 5895 }, { "epoch": 0.7092198581560284, "grad_norm": 0.6585139036178589, "learning_rate": 0.0004953428697802778, "loss": 0.9719, "step": 5900 }, { "epoch": 0.7098208919341267, "grad_norm": 0.8663554191589355, "learning_rate": 0.0004953337046690871, "loss": 0.9508, "step": 5905 }, { "epoch": 0.7104219257122251, "grad_norm": 0.7086400985717773, "learning_rate": 0.0004953245306333656, "loss": 0.8703, "step": 5910 }, { "epoch": 0.7110229594903233, "grad_norm": 0.36062702536582947, "learning_rate": 0.0004953153476734472, "loss": 0.8773, "step": 5915 }, { "epoch": 0.7116239932684217, "grad_norm": 0.5696423053741455, "learning_rate": 0.0004953061557896658, "loss": 1.15, "step": 5920 }, { "epoch": 0.71222502704652, "grad_norm": 0.3699018955230713, "learning_rate": 0.000495296954982356, "loss": 1.0984, "step": 5925 }, { "epoch": 0.7128260608246183, "grad_norm": 0.5031812787055969, "learning_rate": 0.0004952877452518523, "loss": 0.802, "step": 5930 }, { "epoch": 0.7134270946027167, "grad_norm": 0.5342919230461121, "learning_rate": 0.0004952785265984898, "loss": 1.0727, "step": 5935 }, { "epoch": 0.714028128380815, "grad_norm": 0.7044137120246887, "learning_rate": 0.0004952692990226039, "loss": 0.7984, "step": 5940 }, { "epoch": 0.7146291621589134, "grad_norm": 0.5580726861953735, "learning_rate": 0.0004952600625245301, "loss": 1.1875, "step": 5945 }, { "epoch": 0.7152301959370116, "grad_norm": 0.5382014513015747, "learning_rate": 0.0004952508171046046, "loss": 0.9633, "step": 5950 }, { "epoch": 0.71583122971511, "grad_norm": 0.6365616321563721, "learning_rate": 0.0004952415627631636, "loss": 1.0867, "step": 5955 }, { "epoch": 0.7164322634932083, "grad_norm": 0.572903573513031, "learning_rate": 0.0004952322995005438, "loss": 0.9461, "step": 5960 }, { "epoch": 0.7170332972713066, "grad_norm": 0.5480600595474243, "learning_rate": 0.0004952230273170822, "loss": 1.0547, "step": 5965 }, { "epoch": 0.717634331049405, "grad_norm": 0.6455062031745911, "learning_rate": 0.000495213746213116, "loss": 0.8516, "step": 5970 }, { "epoch": 0.7182353648275033, "grad_norm": 0.4712236225605011, "learning_rate": 0.0004952044561889829, "loss": 0.8742, "step": 5975 }, { "epoch": 0.7188363986056017, "grad_norm": 0.4829672873020172, "learning_rate": 0.0004951951572450207, "loss": 0.9465, "step": 5980 }, { "epoch": 0.7194374323837, "grad_norm": 0.5550187826156616, "learning_rate": 0.000495185849381568, "loss": 0.8234, "step": 5985 }, { "epoch": 0.7200384661617983, "grad_norm": 0.4478211998939514, "learning_rate": 0.000495176532598963, "loss": 1.3477, "step": 5990 }, { "epoch": 0.7206394999398966, "grad_norm": 0.5679304599761963, "learning_rate": 0.0004951672068975448, "loss": 1.0531, "step": 5995 }, { "epoch": 0.7212405337179949, "grad_norm": 0.4058151841163635, "learning_rate": 0.0004951578722776526, "loss": 0.9563, "step": 6000 }, { "epoch": 0.7212405337179949, "eval_loss": 1.9480469226837158, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1671, "eval_samples_per_second": 4.55, "eval_steps_per_second": 1.137, "step": 6000 } ], "logging_steps": 5, "max_steps": 83190, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.307687484653568e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }