{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.36062026685899745, "eval_steps": 200, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006010337780983291, "grad_norm": 0.9436860084533691, "learning_rate": 3.0048076923076927e-06, "loss": 4.4875, "step": 5 }, { "epoch": 0.0012020675561966582, "grad_norm": 0.49743083119392395, "learning_rate": 6.0096153846153855e-06, "loss": 4.2438, "step": 10 }, { "epoch": 0.0018031013342949874, "grad_norm": 0.7305557131767273, "learning_rate": 9.014423076923076e-06, "loss": 4.7438, "step": 15 }, { "epoch": 0.0024041351123933164, "grad_norm": 1.2842742204666138, "learning_rate": 1.2019230769230771e-05, "loss": 4.3125, "step": 20 }, { "epoch": 0.0030051688904916456, "grad_norm": 1.0128940343856812, "learning_rate": 1.5024038461538462e-05, "loss": 4.2969, "step": 25 }, { "epoch": 0.003606202668589975, "grad_norm": 1.6097222566604614, "learning_rate": 1.8028846153846152e-05, "loss": 4.3625, "step": 30 }, { "epoch": 0.004207236446688304, "grad_norm": 0.7380394339561462, "learning_rate": 2.103365384615385e-05, "loss": 3.6562, "step": 35 }, { "epoch": 0.004808270224786633, "grad_norm": 2.499553918838501, "learning_rate": 2.4038461538461542e-05, "loss": 3.9656, "step": 40 }, { "epoch": 0.005409304002884962, "grad_norm": 0.9382426142692566, "learning_rate": 2.704326923076923e-05, "loss": 3.7906, "step": 45 }, { "epoch": 0.006010337780983291, "grad_norm": 0.4448552429676056, "learning_rate": 3.0048076923076925e-05, "loss": 3.4531, "step": 50 }, { "epoch": 0.00661137155908162, "grad_norm": 0.6187996864318848, "learning_rate": 3.3052884615384615e-05, "loss": 3.0406, "step": 55 }, { "epoch": 0.00721240533717995, "grad_norm": 0.4894959032535553, "learning_rate": 3.6057692307692304e-05, "loss": 2.9844, "step": 60 }, { "epoch": 0.007813439115278278, "grad_norm": 0.523160994052887, "learning_rate": 3.90625e-05, "loss": 2.825, "step": 65 }, { "epoch": 0.008414472893376608, "grad_norm": 0.41818058490753174, "learning_rate": 4.20673076923077e-05, "loss": 2.5125, "step": 70 }, { "epoch": 0.009015506671474938, "grad_norm": 0.37683457136154175, "learning_rate": 4.507211538461539e-05, "loss": 2.6844, "step": 75 }, { "epoch": 0.009616540449573266, "grad_norm": 0.428375780582428, "learning_rate": 4.8076923076923084e-05, "loss": 2.6719, "step": 80 }, { "epoch": 0.010217574227671595, "grad_norm": 0.3897765576839447, "learning_rate": 5.108173076923077e-05, "loss": 2.2531, "step": 85 }, { "epoch": 0.010818608005769925, "grad_norm": 0.2265370637178421, "learning_rate": 5.408653846153846e-05, "loss": 2.2844, "step": 90 }, { "epoch": 0.011419641783868253, "grad_norm": 0.2113611400127411, "learning_rate": 5.709134615384615e-05, "loss": 2.1922, "step": 95 }, { "epoch": 0.012020675561966582, "grad_norm": 0.1886824667453766, "learning_rate": 6.009615384615385e-05, "loss": 2.3516, "step": 100 }, { "epoch": 0.012621709340064912, "grad_norm": 0.25855502486228943, "learning_rate": 6.310096153846154e-05, "loss": 2.4, "step": 105 }, { "epoch": 0.01322274311816324, "grad_norm": 0.22833962738513947, "learning_rate": 6.610576923076923e-05, "loss": 2.2844, "step": 110 }, { "epoch": 0.01382377689626157, "grad_norm": 0.30784738063812256, "learning_rate": 6.911057692307693e-05, "loss": 2.2016, "step": 115 }, { "epoch": 0.0144248106743599, "grad_norm": 0.3998744487762451, "learning_rate": 7.211538461538461e-05, "loss": 2.4125, "step": 120 }, { "epoch": 0.015025844452458229, "grad_norm": 0.24773858487606049, "learning_rate": 7.512019230769231e-05, "loss": 2.4156, "step": 125 }, { "epoch": 0.015626878230556557, "grad_norm": 0.26020580530166626, "learning_rate": 7.8125e-05, "loss": 2.0094, "step": 130 }, { "epoch": 0.016227912008654886, "grad_norm": 0.25112366676330566, "learning_rate": 8.112980769230769e-05, "loss": 2.5969, "step": 135 }, { "epoch": 0.016828945786753216, "grad_norm": 0.3155271112918854, "learning_rate": 8.41346153846154e-05, "loss": 1.9844, "step": 140 }, { "epoch": 0.017429979564851546, "grad_norm": 0.2684473693370819, "learning_rate": 8.713942307692307e-05, "loss": 2.3594, "step": 145 }, { "epoch": 0.018031013342949875, "grad_norm": 0.19519321620464325, "learning_rate": 9.014423076923077e-05, "loss": 2.1906, "step": 150 }, { "epoch": 0.0186320471210482, "grad_norm": 0.29595857858657837, "learning_rate": 9.314903846153846e-05, "loss": 2.4844, "step": 155 }, { "epoch": 0.01923308089914653, "grad_norm": 0.21725840866565704, "learning_rate": 9.615384615384617e-05, "loss": 1.9969, "step": 160 }, { "epoch": 0.01983411467724486, "grad_norm": 0.250431627035141, "learning_rate": 9.915865384615384e-05, "loss": 2.1469, "step": 165 }, { "epoch": 0.02043514845534319, "grad_norm": 0.22979402542114258, "learning_rate": 0.00010216346153846153, "loss": 2.0891, "step": 170 }, { "epoch": 0.02103618223344152, "grad_norm": 0.29841649532318115, "learning_rate": 0.00010516826923076924, "loss": 2.0891, "step": 175 }, { "epoch": 0.02163721601153985, "grad_norm": 0.3121524155139923, "learning_rate": 0.00010817307692307693, "loss": 2.2938, "step": 180 }, { "epoch": 0.02223824978963818, "grad_norm": 0.25094497203826904, "learning_rate": 0.00011117788461538462, "loss": 2.0672, "step": 185 }, { "epoch": 0.022839283567736506, "grad_norm": 0.32229083776474, "learning_rate": 0.0001141826923076923, "loss": 1.9781, "step": 190 }, { "epoch": 0.023440317345834835, "grad_norm": 0.30247944593429565, "learning_rate": 0.0001171875, "loss": 2.4469, "step": 195 }, { "epoch": 0.024041351123933165, "grad_norm": 0.3992522358894348, "learning_rate": 0.0001201923076923077, "loss": 2.1609, "step": 200 }, { "epoch": 0.024041351123933165, "eval_loss": 2.7308592796325684, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.262, "eval_samples_per_second": 4.537, "eval_steps_per_second": 1.134, "step": 200 }, { "epoch": 0.024642384902031494, "grad_norm": 0.28425589203834534, "learning_rate": 0.0001231971153846154, "loss": 2.625, "step": 205 }, { "epoch": 0.025243418680129824, "grad_norm": 0.31964734196662903, "learning_rate": 0.00012620192307692308, "loss": 1.9328, "step": 210 }, { "epoch": 0.025844452458228154, "grad_norm": 0.37272173166275024, "learning_rate": 0.00012920673076923078, "loss": 2.1641, "step": 215 }, { "epoch": 0.02644548623632648, "grad_norm": 0.32725071907043457, "learning_rate": 0.00013221153846153846, "loss": 2.225, "step": 220 }, { "epoch": 0.02704652001442481, "grad_norm": 0.25303465127944946, "learning_rate": 0.00013521634615384616, "loss": 2.2375, "step": 225 }, { "epoch": 0.02764755379252314, "grad_norm": 0.4098326861858368, "learning_rate": 0.00013822115384615386, "loss": 2.0203, "step": 230 }, { "epoch": 0.02824858757062147, "grad_norm": 0.3435593545436859, "learning_rate": 0.00014122596153846154, "loss": 2.3016, "step": 235 }, { "epoch": 0.0288496213487198, "grad_norm": 0.4556426703929901, "learning_rate": 0.00014423076923076922, "loss": 2.2969, "step": 240 }, { "epoch": 0.029450655126818128, "grad_norm": 0.39692002534866333, "learning_rate": 0.00014723557692307692, "loss": 2.3031, "step": 245 }, { "epoch": 0.030051688904916458, "grad_norm": 0.31686538457870483, "learning_rate": 0.00015024038461538462, "loss": 2.2984, "step": 250 }, { "epoch": 0.030652722683014784, "grad_norm": 0.30815422534942627, "learning_rate": 0.00015324519230769233, "loss": 2.1734, "step": 255 }, { "epoch": 0.031253756461113114, "grad_norm": 0.3927950859069824, "learning_rate": 0.00015625, "loss": 2.0031, "step": 260 }, { "epoch": 0.03185479023921144, "grad_norm": 0.3010413944721222, "learning_rate": 0.00015925480769230768, "loss": 2.1875, "step": 265 }, { "epoch": 0.03245582401730977, "grad_norm": 0.39929866790771484, "learning_rate": 0.00016225961538461538, "loss": 2.2266, "step": 270 }, { "epoch": 0.0330568577954081, "grad_norm": 0.3709786832332611, "learning_rate": 0.00016526442307692309, "loss": 2.2344, "step": 275 }, { "epoch": 0.03365789157350643, "grad_norm": 0.38551804423332214, "learning_rate": 0.0001682692307692308, "loss": 2.0391, "step": 280 }, { "epoch": 0.03425892535160476, "grad_norm": 0.3497028350830078, "learning_rate": 0.00017127403846153847, "loss": 2.1328, "step": 285 }, { "epoch": 0.03485995912970309, "grad_norm": 0.22066070139408112, "learning_rate": 0.00017427884615384614, "loss": 1.9891, "step": 290 }, { "epoch": 0.03546099290780142, "grad_norm": 0.3861188590526581, "learning_rate": 0.00017728365384615385, "loss": 2.0266, "step": 295 }, { "epoch": 0.03606202668589975, "grad_norm": 0.43038997054100037, "learning_rate": 0.00018028846153846155, "loss": 2.2062, "step": 300 }, { "epoch": 0.03666306046399807, "grad_norm": 0.4089072644710541, "learning_rate": 0.00018329326923076922, "loss": 2.2016, "step": 305 }, { "epoch": 0.0372640942420964, "grad_norm": 0.40281516313552856, "learning_rate": 0.00018629807692307693, "loss": 2.2578, "step": 310 }, { "epoch": 0.03786512802019473, "grad_norm": 0.33316513895988464, "learning_rate": 0.0001893028846153846, "loss": 2.1844, "step": 315 }, { "epoch": 0.03846616179829306, "grad_norm": 0.4020228087902069, "learning_rate": 0.00019230769230769233, "loss": 2.2109, "step": 320 }, { "epoch": 0.03906719557639139, "grad_norm": 0.36403888463974, "learning_rate": 0.0001953125, "loss": 2.0063, "step": 325 }, { "epoch": 0.03966822935448972, "grad_norm": 0.4289080500602722, "learning_rate": 0.0001983173076923077, "loss": 2.1641, "step": 330 }, { "epoch": 0.04026926313258805, "grad_norm": 0.3827407658100128, "learning_rate": 0.0002013221153846154, "loss": 2.4125, "step": 335 }, { "epoch": 0.04087029691068638, "grad_norm": 0.28297996520996094, "learning_rate": 0.00020432692307692307, "loss": 2.2047, "step": 340 }, { "epoch": 0.04147133068878471, "grad_norm": 0.3654349744319916, "learning_rate": 0.0002073317307692308, "loss": 2.0344, "step": 345 }, { "epoch": 0.04207236446688304, "grad_norm": 0.44768983125686646, "learning_rate": 0.00021033653846153847, "loss": 2.0469, "step": 350 }, { "epoch": 0.04267339824498137, "grad_norm": 0.36050865054130554, "learning_rate": 0.00021334134615384615, "loss": 1.8203, "step": 355 }, { "epoch": 0.0432744320230797, "grad_norm": 0.41343504190444946, "learning_rate": 0.00021634615384615385, "loss": 1.9031, "step": 360 }, { "epoch": 0.04387546580117803, "grad_norm": 0.33549779653549194, "learning_rate": 0.00021935096153846153, "loss": 1.9859, "step": 365 }, { "epoch": 0.04447649957927636, "grad_norm": 0.39200559258461, "learning_rate": 0.00022235576923076923, "loss": 2.0672, "step": 370 }, { "epoch": 0.04507753335737468, "grad_norm": 0.5816010236740112, "learning_rate": 0.00022536057692307694, "loss": 2.1625, "step": 375 }, { "epoch": 0.04567856713547301, "grad_norm": 0.4004225432872772, "learning_rate": 0.0002283653846153846, "loss": 2.1297, "step": 380 }, { "epoch": 0.04627960091357134, "grad_norm": 0.3329584300518036, "learning_rate": 0.00023137019230769232, "loss": 1.8969, "step": 385 }, { "epoch": 0.04688063469166967, "grad_norm": 0.3800398111343384, "learning_rate": 0.000234375, "loss": 1.875, "step": 390 }, { "epoch": 0.047481668469768, "grad_norm": 0.5345351696014404, "learning_rate": 0.0002373798076923077, "loss": 2.0641, "step": 395 }, { "epoch": 0.04808270224786633, "grad_norm": 0.31537583470344543, "learning_rate": 0.0002403846153846154, "loss": 2.0828, "step": 400 }, { "epoch": 0.04808270224786633, "eval_loss": 2.657031297683716, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2197, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 400 }, { "epoch": 0.04868373602596466, "grad_norm": 0.3651765286922455, "learning_rate": 0.00024338942307692307, "loss": 2.2188, "step": 405 }, { "epoch": 0.04928476980406299, "grad_norm": 0.42044126987457275, "learning_rate": 0.0002463942307692308, "loss": 1.9625, "step": 410 }, { "epoch": 0.04988580358216132, "grad_norm": 0.3405047357082367, "learning_rate": 0.00024939903846153845, "loss": 2.1203, "step": 415 }, { "epoch": 0.05048683736025965, "grad_norm": 0.5022028088569641, "learning_rate": 0.00025240384615384616, "loss": 1.7672, "step": 420 }, { "epoch": 0.05108787113835798, "grad_norm": 0.31208300590515137, "learning_rate": 0.00025540865384615386, "loss": 1.9266, "step": 425 }, { "epoch": 0.05168890491645631, "grad_norm": 0.39399516582489014, "learning_rate": 0.00025841346153846156, "loss": 1.8828, "step": 430 }, { "epoch": 0.05228993869455464, "grad_norm": 0.42515093088150024, "learning_rate": 0.0002614182692307692, "loss": 1.7656, "step": 435 }, { "epoch": 0.05289097247265296, "grad_norm": 0.3947089910507202, "learning_rate": 0.0002644230769230769, "loss": 2.0484, "step": 440 }, { "epoch": 0.05349200625075129, "grad_norm": 0.6280628442764282, "learning_rate": 0.0002674278846153846, "loss": 2.1422, "step": 445 }, { "epoch": 0.05409304002884962, "grad_norm": 0.3639807105064392, "learning_rate": 0.0002704326923076923, "loss": 1.9781, "step": 450 }, { "epoch": 0.05469407380694795, "grad_norm": 0.3984295427799225, "learning_rate": 0.0002734375, "loss": 2.2359, "step": 455 }, { "epoch": 0.05529510758504628, "grad_norm": 0.33954715728759766, "learning_rate": 0.00027644230769230773, "loss": 2.3547, "step": 460 }, { "epoch": 0.05589614136314461, "grad_norm": 0.4361511468887329, "learning_rate": 0.0002794471153846154, "loss": 2.0859, "step": 465 }, { "epoch": 0.05649717514124294, "grad_norm": 0.471563458442688, "learning_rate": 0.0002824519230769231, "loss": 2.1703, "step": 470 }, { "epoch": 0.05709820891934127, "grad_norm": 0.2517772614955902, "learning_rate": 0.0002854567307692308, "loss": 2.0281, "step": 475 }, { "epoch": 0.0576992426974396, "grad_norm": 0.3190082907676697, "learning_rate": 0.00028846153846153843, "loss": 2.0234, "step": 480 }, { "epoch": 0.05830027647553793, "grad_norm": 0.37972012162208557, "learning_rate": 0.00029146634615384614, "loss": 2.15, "step": 485 }, { "epoch": 0.058901310253636256, "grad_norm": 0.37980136275291443, "learning_rate": 0.00029447115384615384, "loss": 2.1219, "step": 490 }, { "epoch": 0.059502344031734586, "grad_norm": 0.32648953795433044, "learning_rate": 0.00029747596153846154, "loss": 1.9703, "step": 495 }, { "epoch": 0.060103377809832916, "grad_norm": 0.28836116194725037, "learning_rate": 0.00030048076923076925, "loss": 2.0406, "step": 500 }, { "epoch": 0.060704411587931245, "grad_norm": 0.2953934967517853, "learning_rate": 0.00030348557692307695, "loss": 2.2156, "step": 505 }, { "epoch": 0.06130544536602957, "grad_norm": 0.4778139889240265, "learning_rate": 0.00030649038461538465, "loss": 2.0672, "step": 510 }, { "epoch": 0.0619064791441279, "grad_norm": 0.27339640259742737, "learning_rate": 0.0003094951923076923, "loss": 1.8953, "step": 515 }, { "epoch": 0.06250751292222623, "grad_norm": 0.3127667009830475, "learning_rate": 0.0003125, "loss": 2.0859, "step": 520 }, { "epoch": 0.06310854670032456, "grad_norm": 0.2676738500595093, "learning_rate": 0.0003155048076923077, "loss": 1.9656, "step": 525 }, { "epoch": 0.06370958047842289, "grad_norm": 0.3519584834575653, "learning_rate": 0.00031850961538461536, "loss": 2.0828, "step": 530 }, { "epoch": 0.06431061425652122, "grad_norm": 0.38000714778900146, "learning_rate": 0.00032151442307692306, "loss": 1.8656, "step": 535 }, { "epoch": 0.06491164803461955, "grad_norm": 0.5076779127120972, "learning_rate": 0.00032451923076923077, "loss": 1.8938, "step": 540 }, { "epoch": 0.06551268181271787, "grad_norm": 0.3919801414012909, "learning_rate": 0.00032752403846153847, "loss": 2.1203, "step": 545 }, { "epoch": 0.0661137155908162, "grad_norm": 0.3263305425643921, "learning_rate": 0.00033052884615384617, "loss": 2.0344, "step": 550 }, { "epoch": 0.06671474936891453, "grad_norm": 0.4196506440639496, "learning_rate": 0.0003335336538461539, "loss": 2.1078, "step": 555 }, { "epoch": 0.06731578314701286, "grad_norm": 0.3997637927532196, "learning_rate": 0.0003365384615384616, "loss": 1.8844, "step": 560 }, { "epoch": 0.06791681692511119, "grad_norm": 0.39547184109687805, "learning_rate": 0.00033954326923076923, "loss": 1.9406, "step": 565 }, { "epoch": 0.06851785070320952, "grad_norm": 0.36170271039009094, "learning_rate": 0.00034254807692307693, "loss": 2.2469, "step": 570 }, { "epoch": 0.06911888448130785, "grad_norm": 0.3041069507598877, "learning_rate": 0.00034555288461538463, "loss": 1.7734, "step": 575 }, { "epoch": 0.06971991825940618, "grad_norm": 0.31936579942703247, "learning_rate": 0.0003485576923076923, "loss": 2.1141, "step": 580 }, { "epoch": 0.0703209520375045, "grad_norm": 0.5643404722213745, "learning_rate": 0.0003515625, "loss": 1.9, "step": 585 }, { "epoch": 0.07092198581560284, "grad_norm": 0.43453335762023926, "learning_rate": 0.0003545673076923077, "loss": 1.5203, "step": 590 }, { "epoch": 0.07152301959370116, "grad_norm": 0.28918376564979553, "learning_rate": 0.0003575721153846154, "loss": 1.9859, "step": 595 }, { "epoch": 0.0721240533717995, "grad_norm": 0.4441574215888977, "learning_rate": 0.0003605769230769231, "loss": 1.7422, "step": 600 }, { "epoch": 0.0721240533717995, "eval_loss": 2.598828077316284, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2182, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 600 }, { "epoch": 0.07272508714989782, "grad_norm": 0.47054970264434814, "learning_rate": 0.0003635817307692308, "loss": 1.975, "step": 605 }, { "epoch": 0.07332612092799615, "grad_norm": 0.4808221459388733, "learning_rate": 0.00036658653846153845, "loss": 2.0969, "step": 610 }, { "epoch": 0.07392715470609448, "grad_norm": 0.40260159969329834, "learning_rate": 0.00036959134615384615, "loss": 2.0781, "step": 615 }, { "epoch": 0.0745281884841928, "grad_norm": 0.3565881848335266, "learning_rate": 0.00037259615384615386, "loss": 1.8562, "step": 620 }, { "epoch": 0.07512922226229114, "grad_norm": 0.41623455286026, "learning_rate": 0.00037560096153846156, "loss": 1.9688, "step": 625 }, { "epoch": 0.07573025604038947, "grad_norm": 0.442056804895401, "learning_rate": 0.0003786057692307692, "loss": 2.0531, "step": 630 }, { "epoch": 0.0763312898184878, "grad_norm": 0.5474425554275513, "learning_rate": 0.0003816105769230769, "loss": 2.1391, "step": 635 }, { "epoch": 0.07693232359658612, "grad_norm": 0.29002273082733154, "learning_rate": 0.00038461538461538467, "loss": 1.6906, "step": 640 }, { "epoch": 0.07753335737468446, "grad_norm": 0.30469194054603577, "learning_rate": 0.0003876201923076923, "loss": 1.6859, "step": 645 }, { "epoch": 0.07813439115278278, "grad_norm": 0.3932645618915558, "learning_rate": 0.000390625, "loss": 1.8328, "step": 650 }, { "epoch": 0.07873542493088112, "grad_norm": 0.4049251079559326, "learning_rate": 0.0003936298076923077, "loss": 1.8672, "step": 655 }, { "epoch": 0.07933645870897944, "grad_norm": 0.4889291524887085, "learning_rate": 0.0003966346153846154, "loss": 2.0531, "step": 660 }, { "epoch": 0.07993749248707778, "grad_norm": 0.38475117087364197, "learning_rate": 0.0003996394230769231, "loss": 1.8422, "step": 665 }, { "epoch": 0.0805385262651761, "grad_norm": 0.34599217772483826, "learning_rate": 0.0004026442307692308, "loss": 1.8391, "step": 670 }, { "epoch": 0.08113956004327443, "grad_norm": 0.39600178599357605, "learning_rate": 0.00040564903846153843, "loss": 1.8484, "step": 675 }, { "epoch": 0.08174059382137276, "grad_norm": 0.3293285071849823, "learning_rate": 0.00040865384615384613, "loss": 1.6656, "step": 680 }, { "epoch": 0.08234162759947108, "grad_norm": 0.37310031056404114, "learning_rate": 0.00041165865384615384, "loss": 1.9609, "step": 685 }, { "epoch": 0.08294266137756942, "grad_norm": 0.41512343287467957, "learning_rate": 0.0004146634615384616, "loss": 1.9937, "step": 690 }, { "epoch": 0.08354369515566774, "grad_norm": 0.47950249910354614, "learning_rate": 0.00041766826923076924, "loss": 1.9109, "step": 695 }, { "epoch": 0.08414472893376608, "grad_norm": 0.4324653744697571, "learning_rate": 0.00042067307692307695, "loss": 1.9953, "step": 700 }, { "epoch": 0.0847457627118644, "grad_norm": 0.3693973422050476, "learning_rate": 0.00042367788461538465, "loss": 1.9016, "step": 705 }, { "epoch": 0.08534679648996274, "grad_norm": 0.33113107085227966, "learning_rate": 0.0004266826923076923, "loss": 2.2266, "step": 710 }, { "epoch": 0.08594783026806106, "grad_norm": 0.5808571577072144, "learning_rate": 0.0004296875, "loss": 1.5063, "step": 715 }, { "epoch": 0.0865488640461594, "grad_norm": 0.3792312443256378, "learning_rate": 0.0004326923076923077, "loss": 1.8016, "step": 720 }, { "epoch": 0.08714989782425772, "grad_norm": 0.43698450922966003, "learning_rate": 0.00043569711538461535, "loss": 1.7219, "step": 725 }, { "epoch": 0.08775093160235606, "grad_norm": 0.43264222145080566, "learning_rate": 0.00043870192307692306, "loss": 1.7234, "step": 730 }, { "epoch": 0.08835196538045438, "grad_norm": 0.5246540307998657, "learning_rate": 0.0004417067307692308, "loss": 1.7531, "step": 735 }, { "epoch": 0.08895299915855272, "grad_norm": 0.2953200936317444, "learning_rate": 0.00044471153846153846, "loss": 1.9438, "step": 740 }, { "epoch": 0.08955403293665104, "grad_norm": 0.39238616824150085, "learning_rate": 0.00044771634615384617, "loss": 1.7172, "step": 745 }, { "epoch": 0.09015506671474936, "grad_norm": 0.4887576401233673, "learning_rate": 0.00045072115384615387, "loss": 1.9594, "step": 750 }, { "epoch": 0.0907561004928477, "grad_norm": 0.391634076833725, "learning_rate": 0.0004537259615384616, "loss": 1.8406, "step": 755 }, { "epoch": 0.09135713427094602, "grad_norm": 0.4006985127925873, "learning_rate": 0.0004567307692307692, "loss": 1.7984, "step": 760 }, { "epoch": 0.09195816804904436, "grad_norm": 0.3601657748222351, "learning_rate": 0.0004597355769230769, "loss": 1.9844, "step": 765 }, { "epoch": 0.09255920182714268, "grad_norm": 0.5057326555252075, "learning_rate": 0.00046274038461538463, "loss": 1.6703, "step": 770 }, { "epoch": 0.09316023560524102, "grad_norm": 0.5787122845649719, "learning_rate": 0.0004657451923076923, "loss": 1.8984, "step": 775 }, { "epoch": 0.09376126938333934, "grad_norm": 0.4849441945552826, "learning_rate": 0.00046875, "loss": 1.8672, "step": 780 }, { "epoch": 0.09436230316143768, "grad_norm": 0.44167378544807434, "learning_rate": 0.00047175480769230774, "loss": 1.6422, "step": 785 }, { "epoch": 0.094963336939536, "grad_norm": 0.6295076608657837, "learning_rate": 0.0004747596153846154, "loss": 1.6875, "step": 790 }, { "epoch": 0.09556437071763434, "grad_norm": 0.4804101586341858, "learning_rate": 0.0004777644230769231, "loss": 1.8203, "step": 795 }, { "epoch": 0.09616540449573266, "grad_norm": 0.4898495674133301, "learning_rate": 0.0004807692307692308, "loss": 1.9891, "step": 800 }, { "epoch": 0.09616540449573266, "eval_loss": 2.535351514816284, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1754, "eval_samples_per_second": 4.549, "eval_steps_per_second": 1.137, "step": 800 }, { "epoch": 0.096766438273831, "grad_norm": 0.43688085675239563, "learning_rate": 0.00048377403846153845, "loss": 1.7234, "step": 805 }, { "epoch": 0.09736747205192932, "grad_norm": 0.5891087651252747, "learning_rate": 0.00048677884615384615, "loss": 1.7969, "step": 810 }, { "epoch": 0.09796850583002764, "grad_norm": 0.5140319466590881, "learning_rate": 0.0004897836538461539, "loss": 2.0719, "step": 815 }, { "epoch": 0.09856953960812598, "grad_norm": 0.40886375308036804, "learning_rate": 0.0004927884615384616, "loss": 2.0891, "step": 820 }, { "epoch": 0.0991705733862243, "grad_norm": 0.3513309955596924, "learning_rate": 0.0004957932692307692, "loss": 1.8453, "step": 825 }, { "epoch": 0.09977160716432264, "grad_norm": 0.5530559420585632, "learning_rate": 0.0004987980769230769, "loss": 1.675, "step": 830 }, { "epoch": 0.10037264094242096, "grad_norm": 0.4348265528678894, "learning_rate": 0.0004999999983630302, "loss": 1.7891, "step": 835 }, { "epoch": 0.1009736747205193, "grad_norm": 0.5396342277526855, "learning_rate": 0.0004999999883593255, "loss": 1.9047, "step": 840 }, { "epoch": 0.10157470849861762, "grad_norm": 0.5154384970664978, "learning_rate": 0.0004999999692613442, "loss": 1.8844, "step": 845 }, { "epoch": 0.10217574227671596, "grad_norm": 0.29072120785713196, "learning_rate": 0.0004999999410690872, "loss": 1.6531, "step": 850 }, { "epoch": 0.10277677605481428, "grad_norm": 0.4125816822052002, "learning_rate": 0.0004999999037825552, "loss": 1.9031, "step": 855 }, { "epoch": 0.10337780983291261, "grad_norm": 0.34915369749069214, "learning_rate": 0.0004999998574017497, "loss": 1.8609, "step": 860 }, { "epoch": 0.10397884361101094, "grad_norm": 0.3622804284095764, "learning_rate": 0.0004999998019266724, "loss": 1.7484, "step": 865 }, { "epoch": 0.10457987738910927, "grad_norm": 0.36787149310112, "learning_rate": 0.0004999997373573254, "loss": 1.7812, "step": 870 }, { "epoch": 0.1051809111672076, "grad_norm": 0.4469545781612396, "learning_rate": 0.0004999996636937108, "loss": 1.5484, "step": 875 }, { "epoch": 0.10578194494530592, "grad_norm": 0.30026400089263916, "learning_rate": 0.0004999995809358316, "loss": 1.6703, "step": 880 }, { "epoch": 0.10638297872340426, "grad_norm": 0.4870736002922058, "learning_rate": 0.0004999994890836904, "loss": 1.7547, "step": 885 }, { "epoch": 0.10698401250150258, "grad_norm": 0.6516287326812744, "learning_rate": 0.000499999388137291, "loss": 1.7891, "step": 890 }, { "epoch": 0.10758504627960092, "grad_norm": 0.2974604368209839, "learning_rate": 0.0004999992780966368, "loss": 1.8359, "step": 895 }, { "epoch": 0.10818608005769924, "grad_norm": 0.3521243929862976, "learning_rate": 0.0004999991589617318, "loss": 1.9141, "step": 900 }, { "epoch": 0.10878711383579757, "grad_norm": 0.38353726267814636, "learning_rate": 0.0004999990307325803, "loss": 1.775, "step": 905 }, { "epoch": 0.1093881476138959, "grad_norm": 0.46048542857170105, "learning_rate": 0.0004999988934091872, "loss": 1.7297, "step": 910 }, { "epoch": 0.10998918139199423, "grad_norm": 0.4313719570636749, "learning_rate": 0.0004999987469915573, "loss": 1.2891, "step": 915 }, { "epoch": 0.11059021517009256, "grad_norm": 0.5933486223220825, "learning_rate": 0.0004999985914796961, "loss": 1.6938, "step": 920 }, { "epoch": 0.1111912489481909, "grad_norm": 0.5271236300468445, "learning_rate": 0.000499998426873609, "loss": 1.8, "step": 925 }, { "epoch": 0.11179228272628922, "grad_norm": 0.3807511031627655, "learning_rate": 0.0004999982531733022, "loss": 1.3086, "step": 930 }, { "epoch": 0.11239331650438755, "grad_norm": 0.4684934914112091, "learning_rate": 0.0004999980703787819, "loss": 1.4875, "step": 935 }, { "epoch": 0.11299435028248588, "grad_norm": 0.5648980140686035, "learning_rate": 0.0004999978784900549, "loss": 1.6578, "step": 940 }, { "epoch": 0.1135953840605842, "grad_norm": 0.4021349549293518, "learning_rate": 0.0004999976775071278, "loss": 1.8266, "step": 945 }, { "epoch": 0.11419641783868253, "grad_norm": 0.3722395598888397, "learning_rate": 0.0004999974674300084, "loss": 1.8969, "step": 950 }, { "epoch": 0.11479745161678086, "grad_norm": 0.407781720161438, "learning_rate": 0.000499997248258704, "loss": 1.6562, "step": 955 }, { "epoch": 0.1153984853948792, "grad_norm": 0.44156748056411743, "learning_rate": 0.0004999970199932229, "loss": 2.0688, "step": 960 }, { "epoch": 0.11599951917297752, "grad_norm": 0.40020808577537537, "learning_rate": 0.000499996782633573, "loss": 1.5047, "step": 965 }, { "epoch": 0.11660055295107585, "grad_norm": 0.38710176944732666, "learning_rate": 0.0004999965361797633, "loss": 1.7367, "step": 970 }, { "epoch": 0.11720158672917418, "grad_norm": 0.344836562871933, "learning_rate": 0.0004999962806318025, "loss": 1.7828, "step": 975 }, { "epoch": 0.11780262050727251, "grad_norm": 0.3811284899711609, "learning_rate": 0.0004999960159897, "loss": 1.7766, "step": 980 }, { "epoch": 0.11840365428537084, "grad_norm": 0.5141933560371399, "learning_rate": 0.0004999957422534654, "loss": 1.75, "step": 985 }, { "epoch": 0.11900468806346917, "grad_norm": 0.37530529499053955, "learning_rate": 0.0004999954594231088, "loss": 2.0922, "step": 990 }, { "epoch": 0.1196057218415675, "grad_norm": 0.41129302978515625, "learning_rate": 0.0004999951674986401, "loss": 1.5781, "step": 995 }, { "epoch": 0.12020675561966583, "grad_norm": 0.3869934380054474, "learning_rate": 0.0004999948664800704, "loss": 1.7422, "step": 1000 }, { "epoch": 0.12020675561966583, "eval_loss": 2.4908204078674316, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1997, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 1000 }, { "epoch": 0.12080778939776415, "grad_norm": 0.36643335223197937, "learning_rate": 0.0004999945563674105, "loss": 1.6797, "step": 1005 }, { "epoch": 0.12140882317586249, "grad_norm": 0.45910894870758057, "learning_rate": 0.0004999942371606714, "loss": 1.7063, "step": 1010 }, { "epoch": 0.12200985695396081, "grad_norm": 0.350729763507843, "learning_rate": 0.0004999939088598652, "loss": 1.6344, "step": 1015 }, { "epoch": 0.12261089073205914, "grad_norm": 0.46493440866470337, "learning_rate": 0.0004999935714650034, "loss": 1.9641, "step": 1020 }, { "epoch": 0.12321192451015747, "grad_norm": 0.42726650834083557, "learning_rate": 0.0004999932249760984, "loss": 1.7094, "step": 1025 }, { "epoch": 0.1238129582882558, "grad_norm": 0.28014904260635376, "learning_rate": 0.000499992869393163, "loss": 1.8516, "step": 1030 }, { "epoch": 0.12441399206635413, "grad_norm": 0.4522114098072052, "learning_rate": 0.0004999925047162099, "loss": 1.3961, "step": 1035 }, { "epoch": 0.12501502584445245, "grad_norm": 0.46475955843925476, "learning_rate": 0.0004999921309452526, "loss": 1.4062, "step": 1040 }, { "epoch": 0.1256160596225508, "grad_norm": 0.44490954279899597, "learning_rate": 0.0004999917480803044, "loss": 1.6719, "step": 1045 }, { "epoch": 0.12621709340064913, "grad_norm": 0.40904587507247925, "learning_rate": 0.0004999913561213793, "loss": 1.7734, "step": 1050 }, { "epoch": 0.12681812717874744, "grad_norm": 0.36412525177001953, "learning_rate": 0.0004999909550684918, "loss": 1.2594, "step": 1055 }, { "epoch": 0.12741916095684577, "grad_norm": 0.7560976147651672, "learning_rate": 0.0004999905449216563, "loss": 1.6047, "step": 1060 }, { "epoch": 0.1280201947349441, "grad_norm": 0.5383388996124268, "learning_rate": 0.0004999901256808878, "loss": 1.6016, "step": 1065 }, { "epoch": 0.12862122851304245, "grad_norm": 0.5255587100982666, "learning_rate": 0.0004999896973462012, "loss": 1.7828, "step": 1070 }, { "epoch": 0.12922226229114075, "grad_norm": 0.4830612242221832, "learning_rate": 0.0004999892599176127, "loss": 1.8781, "step": 1075 }, { "epoch": 0.1298232960692391, "grad_norm": 0.3687385618686676, "learning_rate": 0.0004999888133951377, "loss": 1.4797, "step": 1080 }, { "epoch": 0.13042432984733743, "grad_norm": 0.3518010675907135, "learning_rate": 0.0004999883577787927, "loss": 1.7234, "step": 1085 }, { "epoch": 0.13102536362543574, "grad_norm": 0.4522668719291687, "learning_rate": 0.0004999878930685943, "loss": 1.675, "step": 1090 }, { "epoch": 0.13162639740353407, "grad_norm": 0.3153088390827179, "learning_rate": 0.0004999874192645592, "loss": 1.7328, "step": 1095 }, { "epoch": 0.1322274311816324, "grad_norm": 0.4520825147628784, "learning_rate": 0.0004999869363667048, "loss": 1.925, "step": 1100 }, { "epoch": 0.13282846495973075, "grad_norm": 0.3040079176425934, "learning_rate": 0.0004999864443750486, "loss": 1.6922, "step": 1105 }, { "epoch": 0.13342949873782906, "grad_norm": 0.5198135375976562, "learning_rate": 0.0004999859432896084, "loss": 1.6562, "step": 1110 }, { "epoch": 0.1340305325159274, "grad_norm": 0.30772989988327026, "learning_rate": 0.0004999854331104028, "loss": 1.8078, "step": 1115 }, { "epoch": 0.13463156629402573, "grad_norm": 0.39027324318885803, "learning_rate": 0.0004999849138374498, "loss": 1.625, "step": 1120 }, { "epoch": 0.13523260007212407, "grad_norm": 0.4438004195690155, "learning_rate": 0.0004999843854707688, "loss": 1.5414, "step": 1125 }, { "epoch": 0.13583363385022237, "grad_norm": 0.4966782033443451, "learning_rate": 0.0004999838480103787, "loss": 1.4836, "step": 1130 }, { "epoch": 0.1364346676283207, "grad_norm": 0.5602577328681946, "learning_rate": 0.0004999833014562992, "loss": 1.3961, "step": 1135 }, { "epoch": 0.13703570140641905, "grad_norm": 0.5276179909706116, "learning_rate": 0.0004999827458085502, "loss": 1.8422, "step": 1140 }, { "epoch": 0.13763673518451738, "grad_norm": 0.4706065058708191, "learning_rate": 0.0004999821810671518, "loss": 1.7109, "step": 1145 }, { "epoch": 0.1382377689626157, "grad_norm": 0.38341307640075684, "learning_rate": 0.0004999816072321245, "loss": 1.8859, "step": 1150 }, { "epoch": 0.13883880274071403, "grad_norm": 0.5754373073577881, "learning_rate": 0.0004999810243034894, "loss": 1.8, "step": 1155 }, { "epoch": 0.13943983651881237, "grad_norm": 0.5003094673156738, "learning_rate": 0.0004999804322812676, "loss": 1.6766, "step": 1160 }, { "epoch": 0.14004087029691067, "grad_norm": 0.31239280104637146, "learning_rate": 0.0004999798311654805, "loss": 1.775, "step": 1165 }, { "epoch": 0.140641904075009, "grad_norm": 0.3998953700065613, "learning_rate": 0.0004999792209561501, "loss": 1.7516, "step": 1170 }, { "epoch": 0.14124293785310735, "grad_norm": 0.3099336624145508, "learning_rate": 0.0004999786016532986, "loss": 1.8422, "step": 1175 }, { "epoch": 0.14184397163120568, "grad_norm": 0.48160257935523987, "learning_rate": 0.0004999779732569485, "loss": 1.6062, "step": 1180 }, { "epoch": 0.142445005409304, "grad_norm": 0.5494711399078369, "learning_rate": 0.0004999773357671227, "loss": 1.5906, "step": 1185 }, { "epoch": 0.14304603918740233, "grad_norm": 0.7721512913703918, "learning_rate": 0.0004999766891838444, "loss": 1.7734, "step": 1190 }, { "epoch": 0.14364707296550067, "grad_norm": 0.5135265588760376, "learning_rate": 0.000499976033507137, "loss": 1.4812, "step": 1195 }, { "epoch": 0.144248106743599, "grad_norm": 0.7913392186164856, "learning_rate": 0.0004999753687370245, "loss": 1.5484, "step": 1200 }, { "epoch": 0.144248106743599, "eval_loss": 2.5082030296325684, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2256, "eval_samples_per_second": 4.542, "eval_steps_per_second": 1.136, "step": 1200 }, { "epoch": 0.1448491405216973, "grad_norm": 0.6069223880767822, "learning_rate": 0.0004999746948735308, "loss": 1.4484, "step": 1205 }, { "epoch": 0.14545017429979565, "grad_norm": 0.4137849807739258, "learning_rate": 0.0004999740119166809, "loss": 1.6719, "step": 1210 }, { "epoch": 0.14605120807789398, "grad_norm": 0.7047042846679688, "learning_rate": 0.0004999733198664992, "loss": 1.5312, "step": 1215 }, { "epoch": 0.1466522418559923, "grad_norm": 0.5389900207519531, "learning_rate": 0.0004999726187230111, "loss": 1.4297, "step": 1220 }, { "epoch": 0.14725327563409063, "grad_norm": 0.5395992994308472, "learning_rate": 0.0004999719084862421, "loss": 1.6328, "step": 1225 }, { "epoch": 0.14785430941218897, "grad_norm": 0.43566471338272095, "learning_rate": 0.0004999711891562179, "loss": 1.7094, "step": 1230 }, { "epoch": 0.1484553431902873, "grad_norm": 0.3409474194049835, "learning_rate": 0.0004999704607329648, "loss": 1.6656, "step": 1235 }, { "epoch": 0.1490563769683856, "grad_norm": 0.5498088002204895, "learning_rate": 0.0004999697232165092, "loss": 1.6016, "step": 1240 }, { "epoch": 0.14965741074648395, "grad_norm": 0.567551851272583, "learning_rate": 0.000499968976606878, "loss": 1.6828, "step": 1245 }, { "epoch": 0.15025844452458229, "grad_norm": 0.4866923987865448, "learning_rate": 0.0004999682209040983, "loss": 1.6547, "step": 1250 }, { "epoch": 0.15085947830268062, "grad_norm": 0.3780736029148102, "learning_rate": 0.0004999674561081977, "loss": 1.6719, "step": 1255 }, { "epoch": 0.15146051208077893, "grad_norm": 0.3219822347164154, "learning_rate": 0.0004999666822192039, "loss": 1.4195, "step": 1260 }, { "epoch": 0.15206154585887727, "grad_norm": 0.3056913912296295, "learning_rate": 0.0004999658992371451, "loss": 1.7484, "step": 1265 }, { "epoch": 0.1526625796369756, "grad_norm": 0.4860096573829651, "learning_rate": 0.0004999651071620499, "loss": 1.6516, "step": 1270 }, { "epoch": 0.15326361341507394, "grad_norm": 0.4047755002975464, "learning_rate": 0.0004999643059939469, "loss": 1.6984, "step": 1275 }, { "epoch": 0.15386464719317225, "grad_norm": 0.27880361676216125, "learning_rate": 0.0004999634957328652, "loss": 1.8078, "step": 1280 }, { "epoch": 0.15446568097127059, "grad_norm": 0.4087715148925781, "learning_rate": 0.0004999626763788346, "loss": 1.6422, "step": 1285 }, { "epoch": 0.15506671474936892, "grad_norm": 0.556612491607666, "learning_rate": 0.0004999618479318847, "loss": 1.5359, "step": 1290 }, { "epoch": 0.15566774852746723, "grad_norm": 0.5415599346160889, "learning_rate": 0.0004999610103920457, "loss": 1.5641, "step": 1295 }, { "epoch": 0.15626878230556557, "grad_norm": 0.48660141229629517, "learning_rate": 0.0004999601637593479, "loss": 1.5, "step": 1300 }, { "epoch": 0.1568698160836639, "grad_norm": 0.5874481797218323, "learning_rate": 0.0004999593080338224, "loss": 1.3844, "step": 1305 }, { "epoch": 0.15747084986176224, "grad_norm": 0.3727753460407257, "learning_rate": 0.0004999584432155, "loss": 1.8125, "step": 1310 }, { "epoch": 0.15807188363986055, "grad_norm": 0.35395169258117676, "learning_rate": 0.0004999575693044124, "loss": 1.3305, "step": 1315 }, { "epoch": 0.1586729174179589, "grad_norm": 0.7356476783752441, "learning_rate": 0.0004999566863005912, "loss": 1.7078, "step": 1320 }, { "epoch": 0.15927395119605722, "grad_norm": 0.4838991165161133, "learning_rate": 0.0004999557942040687, "loss": 1.5969, "step": 1325 }, { "epoch": 0.15987498497415556, "grad_norm": 0.4504292905330658, "learning_rate": 0.0004999548930148773, "loss": 1.4555, "step": 1330 }, { "epoch": 0.16047601875225387, "grad_norm": 0.5174041390419006, "learning_rate": 0.0004999539827330497, "loss": 1.5266, "step": 1335 }, { "epoch": 0.1610770525303522, "grad_norm": 0.42709511518478394, "learning_rate": 0.000499953063358619, "loss": 1.4344, "step": 1340 }, { "epoch": 0.16167808630845054, "grad_norm": 0.34575751423835754, "learning_rate": 0.0004999521348916189, "loss": 1.5219, "step": 1345 }, { "epoch": 0.16227912008654885, "grad_norm": 0.4409041404724121, "learning_rate": 0.0004999511973320829, "loss": 1.6172, "step": 1350 }, { "epoch": 0.1628801538646472, "grad_norm": 0.37874963879585266, "learning_rate": 0.0004999502506800452, "loss": 1.3156, "step": 1355 }, { "epoch": 0.16348118764274552, "grad_norm": 0.39675143361091614, "learning_rate": 0.0004999492949355401, "loss": 1.7672, "step": 1360 }, { "epoch": 0.16408222142084386, "grad_norm": 0.4887191951274872, "learning_rate": 0.0004999483300986027, "loss": 1.6578, "step": 1365 }, { "epoch": 0.16468325519894217, "grad_norm": 0.5052289366722107, "learning_rate": 0.000499947356169268, "loss": 1.5766, "step": 1370 }, { "epoch": 0.1652842889770405, "grad_norm": 0.3420865833759308, "learning_rate": 0.000499946373147571, "loss": 1.4281, "step": 1375 }, { "epoch": 0.16588532275513884, "grad_norm": 0.6112978458404541, "learning_rate": 0.0004999453810335479, "loss": 1.4234, "step": 1380 }, { "epoch": 0.16648635653323718, "grad_norm": 0.46144208312034607, "learning_rate": 0.0004999443798272348, "loss": 1.4609, "step": 1385 }, { "epoch": 0.1670873903113355, "grad_norm": 0.5132108926773071, "learning_rate": 0.000499943369528668, "loss": 1.5656, "step": 1390 }, { "epoch": 0.16768842408943382, "grad_norm": 0.5717546939849854, "learning_rate": 0.000499942350137884, "loss": 1.3617, "step": 1395 }, { "epoch": 0.16828945786753216, "grad_norm": 0.4766351580619812, "learning_rate": 0.0004999413216549203, "loss": 1.5016, "step": 1400 }, { "epoch": 0.16828945786753216, "eval_loss": 2.3990235328674316, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.19, "eval_samples_per_second": 4.547, "eval_steps_per_second": 1.137, "step": 1400 }, { "epoch": 0.1688904916456305, "grad_norm": 0.43601974844932556, "learning_rate": 0.0004999402840798142, "loss": 1.4156, "step": 1405 }, { "epoch": 0.1694915254237288, "grad_norm": 0.6105599403381348, "learning_rate": 0.0004999392374126034, "loss": 1.7, "step": 1410 }, { "epoch": 0.17009255920182714, "grad_norm": 0.4957026243209839, "learning_rate": 0.0004999381816533259, "loss": 1.7969, "step": 1415 }, { "epoch": 0.17069359297992548, "grad_norm": 0.44195666909217834, "learning_rate": 0.0004999371168020201, "loss": 1.4375, "step": 1420 }, { "epoch": 0.1712946267580238, "grad_norm": 0.45855048298835754, "learning_rate": 0.0004999360428587249, "loss": 1.6141, "step": 1425 }, { "epoch": 0.17189566053612212, "grad_norm": 0.6269901990890503, "learning_rate": 0.0004999349598234792, "loss": 1.3953, "step": 1430 }, { "epoch": 0.17249669431422046, "grad_norm": 0.3805680274963379, "learning_rate": 0.0004999338676963225, "loss": 1.6484, "step": 1435 }, { "epoch": 0.1730977280923188, "grad_norm": 0.6604627966880798, "learning_rate": 0.0004999327664772945, "loss": 1.5969, "step": 1440 }, { "epoch": 0.1736987618704171, "grad_norm": 0.4411623179912567, "learning_rate": 0.0004999316561664353, "loss": 1.2609, "step": 1445 }, { "epoch": 0.17429979564851544, "grad_norm": 0.5301747918128967, "learning_rate": 0.0004999305367637852, "loss": 1.6141, "step": 1450 }, { "epoch": 0.17490082942661378, "grad_norm": 0.5128594040870667, "learning_rate": 0.000499929408269385, "loss": 1.6187, "step": 1455 }, { "epoch": 0.17550186320471212, "grad_norm": 0.596217155456543, "learning_rate": 0.0004999282706832758, "loss": 1.4531, "step": 1460 }, { "epoch": 0.17610289698281043, "grad_norm": 0.45486292243003845, "learning_rate": 0.0004999271240054987, "loss": 1.4012, "step": 1465 }, { "epoch": 0.17670393076090876, "grad_norm": 0.6031058430671692, "learning_rate": 0.0004999259682360957, "loss": 1.6203, "step": 1470 }, { "epoch": 0.1773049645390071, "grad_norm": 0.4107096493244171, "learning_rate": 0.0004999248033751088, "loss": 1.7312, "step": 1475 }, { "epoch": 0.17790599831710543, "grad_norm": 0.46700888872146606, "learning_rate": 0.0004999236294225803, "loss": 1.5234, "step": 1480 }, { "epoch": 0.17850703209520374, "grad_norm": 0.7690737247467041, "learning_rate": 0.000499922446378553, "loss": 1.307, "step": 1485 }, { "epoch": 0.17910806587330208, "grad_norm": 0.5420579314231873, "learning_rate": 0.0004999212542430698, "loss": 1.6562, "step": 1490 }, { "epoch": 0.17970909965140042, "grad_norm": 0.4624311625957489, "learning_rate": 0.0004999200530161742, "loss": 1.3234, "step": 1495 }, { "epoch": 0.18031013342949873, "grad_norm": 0.4610016345977783, "learning_rate": 0.0004999188426979097, "loss": 1.5516, "step": 1500 }, { "epoch": 0.18091116720759706, "grad_norm": 0.5131213068962097, "learning_rate": 0.0004999176232883206, "loss": 1.5867, "step": 1505 }, { "epoch": 0.1815122009856954, "grad_norm": 0.5673689842224121, "learning_rate": 0.0004999163947874511, "loss": 1.5078, "step": 1510 }, { "epoch": 0.18211323476379374, "grad_norm": 0.7008316516876221, "learning_rate": 0.000499915157195346, "loss": 1.5562, "step": 1515 }, { "epoch": 0.18271426854189204, "grad_norm": 0.5652767419815063, "learning_rate": 0.00049991391051205, "loss": 1.4469, "step": 1520 }, { "epoch": 0.18331530231999038, "grad_norm": 0.5506184101104736, "learning_rate": 0.0004999126547376089, "loss": 1.4531, "step": 1525 }, { "epoch": 0.18391633609808872, "grad_norm": 0.5806117057800293, "learning_rate": 0.000499911389872068, "loss": 1.7594, "step": 1530 }, { "epoch": 0.18451736987618705, "grad_norm": 0.5860136151313782, "learning_rate": 0.0004999101159154736, "loss": 1.5562, "step": 1535 }, { "epoch": 0.18511840365428536, "grad_norm": 0.5575783252716064, "learning_rate": 0.000499908832867872, "loss": 1.6391, "step": 1540 }, { "epoch": 0.1857194374323837, "grad_norm": 0.3992920219898224, "learning_rate": 0.0004999075407293096, "loss": 1.3859, "step": 1545 }, { "epoch": 0.18632047121048204, "grad_norm": 0.8294938206672668, "learning_rate": 0.0004999062394998336, "loss": 1.25, "step": 1550 }, { "epoch": 0.18692150498858034, "grad_norm": 0.6560512185096741, "learning_rate": 0.0004999049291794915, "loss": 1.4453, "step": 1555 }, { "epoch": 0.18752253876667868, "grad_norm": 0.5583436489105225, "learning_rate": 0.0004999036097683307, "loss": 1.3969, "step": 1560 }, { "epoch": 0.18812357254477702, "grad_norm": 0.6256234645843506, "learning_rate": 0.0004999022812663993, "loss": 1.518, "step": 1565 }, { "epoch": 0.18872460632287535, "grad_norm": 0.5769176483154297, "learning_rate": 0.0004999009436737457, "loss": 1.6609, "step": 1570 }, { "epoch": 0.18932564010097366, "grad_norm": 0.6486324071884155, "learning_rate": 0.0004998995969904183, "loss": 1.3172, "step": 1575 }, { "epoch": 0.189926673879072, "grad_norm": 0.34935474395751953, "learning_rate": 0.0004998982412164663, "loss": 1.5562, "step": 1580 }, { "epoch": 0.19052770765717034, "grad_norm": 0.5806995630264282, "learning_rate": 0.000499896876351939, "loss": 1.6219, "step": 1585 }, { "epoch": 0.19112874143526867, "grad_norm": 0.6906558275222778, "learning_rate": 0.0004998955023968862, "loss": 1.5172, "step": 1590 }, { "epoch": 0.19172977521336698, "grad_norm": 0.49730750918388367, "learning_rate": 0.0004998941193513575, "loss": 1.6797, "step": 1595 }, { "epoch": 0.19233080899146532, "grad_norm": 0.5871158242225647, "learning_rate": 0.0004998927272154036, "loss": 1.6125, "step": 1600 }, { "epoch": 0.19233080899146532, "eval_loss": 2.360156297683716, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1647, "eval_samples_per_second": 4.55, "eval_steps_per_second": 1.138, "step": 1600 }, { "epoch": 0.19293184276956366, "grad_norm": 0.3994157910346985, "learning_rate": 0.000499891325989075, "loss": 1.3305, "step": 1605 }, { "epoch": 0.193532876547662, "grad_norm": 0.3497280180454254, "learning_rate": 0.0004998899156724224, "loss": 1.3531, "step": 1610 }, { "epoch": 0.1941339103257603, "grad_norm": 0.4835513234138489, "learning_rate": 0.0004998884962654976, "loss": 1.293, "step": 1615 }, { "epoch": 0.19473494410385864, "grad_norm": 0.4717245101928711, "learning_rate": 0.0004998870677683519, "loss": 1.3742, "step": 1620 }, { "epoch": 0.19533597788195697, "grad_norm": 0.3917827308177948, "learning_rate": 0.0004998856301810373, "loss": 1.5719, "step": 1625 }, { "epoch": 0.19593701166005528, "grad_norm": 0.4725429117679596, "learning_rate": 0.0004998841835036061, "loss": 1.3859, "step": 1630 }, { "epoch": 0.19653804543815362, "grad_norm": 0.4728795289993286, "learning_rate": 0.0004998827277361111, "loss": 1.4203, "step": 1635 }, { "epoch": 0.19713907921625196, "grad_norm": 0.6246328949928284, "learning_rate": 0.000499881262878605, "loss": 1.7719, "step": 1640 }, { "epoch": 0.1977401129943503, "grad_norm": 0.7019891738891602, "learning_rate": 0.0004998797889311413, "loss": 1.3781, "step": 1645 }, { "epoch": 0.1983411467724486, "grad_norm": 0.2940036654472351, "learning_rate": 0.0004998783058937735, "loss": 1.4148, "step": 1650 }, { "epoch": 0.19894218055054694, "grad_norm": 0.434410959482193, "learning_rate": 0.0004998768137665556, "loss": 1.6094, "step": 1655 }, { "epoch": 0.19954321432864527, "grad_norm": 0.5853382349014282, "learning_rate": 0.0004998753125495418, "loss": 1.4125, "step": 1660 }, { "epoch": 0.2001442481067436, "grad_norm": 0.5105974078178406, "learning_rate": 0.0004998738022427867, "loss": 1.3313, "step": 1665 }, { "epoch": 0.20074528188484192, "grad_norm": 0.4266336262226105, "learning_rate": 0.0004998722828463455, "loss": 1.5953, "step": 1670 }, { "epoch": 0.20134631566294026, "grad_norm": 0.4918626844882965, "learning_rate": 0.0004998707543602731, "loss": 1.8383, "step": 1675 }, { "epoch": 0.2019473494410386, "grad_norm": 0.4804850220680237, "learning_rate": 0.0004998692167846253, "loss": 1.1484, "step": 1680 }, { "epoch": 0.20254838321913693, "grad_norm": 0.5131824612617493, "learning_rate": 0.0004998676701194581, "loss": 1.7109, "step": 1685 }, { "epoch": 0.20314941699723524, "grad_norm": 0.4895535111427307, "learning_rate": 0.0004998661143648277, "loss": 1.7453, "step": 1690 }, { "epoch": 0.20375045077533357, "grad_norm": 0.4180288314819336, "learning_rate": 0.0004998645495207906, "loss": 1.0766, "step": 1695 }, { "epoch": 0.2043514845534319, "grad_norm": 0.4888496696949005, "learning_rate": 0.0004998629755874037, "loss": 1.5359, "step": 1700 }, { "epoch": 0.20495251833153022, "grad_norm": 0.666147768497467, "learning_rate": 0.0004998613925647245, "loss": 1.5609, "step": 1705 }, { "epoch": 0.20555355210962856, "grad_norm": 0.563382625579834, "learning_rate": 0.0004998598004528103, "loss": 1.4187, "step": 1710 }, { "epoch": 0.2061545858877269, "grad_norm": 0.619296669960022, "learning_rate": 0.0004998581992517192, "loss": 1.3367, "step": 1715 }, { "epoch": 0.20675561966582523, "grad_norm": 0.928014874458313, "learning_rate": 0.0004998565889615096, "loss": 1.4094, "step": 1720 }, { "epoch": 0.20735665344392354, "grad_norm": 0.4932372272014618, "learning_rate": 0.0004998549695822397, "loss": 1.3719, "step": 1725 }, { "epoch": 0.20795768722202188, "grad_norm": 0.6022034287452698, "learning_rate": 0.0004998533411139685, "loss": 1.5781, "step": 1730 }, { "epoch": 0.2085587210001202, "grad_norm": 0.41716283559799194, "learning_rate": 0.0004998517035567554, "loss": 1.1914, "step": 1735 }, { "epoch": 0.20915975477821855, "grad_norm": 0.4988159239292145, "learning_rate": 0.0004998500569106599, "loss": 1.475, "step": 1740 }, { "epoch": 0.20976078855631686, "grad_norm": 0.4242478907108307, "learning_rate": 0.0004998484011757419, "loss": 1.2859, "step": 1745 }, { "epoch": 0.2103618223344152, "grad_norm": 0.5382992625236511, "learning_rate": 0.0004998467363520617, "loss": 1.3687, "step": 1750 }, { "epoch": 0.21096285611251353, "grad_norm": 0.31303003430366516, "learning_rate": 0.0004998450624396797, "loss": 1.9281, "step": 1755 }, { "epoch": 0.21156388989061184, "grad_norm": 0.5793948173522949, "learning_rate": 0.0004998433794386569, "loss": 1.457, "step": 1760 }, { "epoch": 0.21216492366871018, "grad_norm": 0.48824676871299744, "learning_rate": 0.0004998416873490544, "loss": 1.5359, "step": 1765 }, { "epoch": 0.2127659574468085, "grad_norm": 0.5384695529937744, "learning_rate": 0.000499839986170934, "loss": 1.4461, "step": 1770 }, { "epoch": 0.21336699122490685, "grad_norm": 0.5212387442588806, "learning_rate": 0.0004998382759043574, "loss": 1.2844, "step": 1775 }, { "epoch": 0.21396802500300516, "grad_norm": 0.5552918910980225, "learning_rate": 0.0004998365565493868, "loss": 1.5516, "step": 1780 }, { "epoch": 0.2145690587811035, "grad_norm": 0.5672168135643005, "learning_rate": 0.0004998348281060848, "loss": 1.6297, "step": 1785 }, { "epoch": 0.21517009255920183, "grad_norm": 0.620464026927948, "learning_rate": 0.0004998330905745143, "loss": 1.5047, "step": 1790 }, { "epoch": 0.21577112633730017, "grad_norm": 0.5900077819824219, "learning_rate": 0.0004998313439547384, "loss": 1.2367, "step": 1795 }, { "epoch": 0.21637216011539848, "grad_norm": 0.5305217504501343, "learning_rate": 0.0004998295882468209, "loss": 1.5906, "step": 1800 }, { "epoch": 0.21637216011539848, "eval_loss": 2.288867235183716, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2003, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 1800 }, { "epoch": 0.2169731938934968, "grad_norm": 0.5836020112037659, "learning_rate": 0.0004998278234508253, "loss": 1.4891, "step": 1805 }, { "epoch": 0.21757422767159515, "grad_norm": 0.3793884813785553, "learning_rate": 0.0004998260495668161, "loss": 1.3328, "step": 1810 }, { "epoch": 0.21817526144969349, "grad_norm": 0.5394117832183838, "learning_rate": 0.0004998242665948577, "loss": 1.368, "step": 1815 }, { "epoch": 0.2187762952277918, "grad_norm": 0.39613473415374756, "learning_rate": 0.0004998224745350148, "loss": 1.2285, "step": 1820 }, { "epoch": 0.21937732900589013, "grad_norm": 0.543116569519043, "learning_rate": 0.0004998206733873529, "loss": 1.5078, "step": 1825 }, { "epoch": 0.21997836278398847, "grad_norm": 0.4901551306247711, "learning_rate": 0.0004998188631519375, "loss": 1.4516, "step": 1830 }, { "epoch": 0.22057939656208678, "grad_norm": 0.5067916512489319, "learning_rate": 0.0004998170438288342, "loss": 1.5719, "step": 1835 }, { "epoch": 0.2211804303401851, "grad_norm": 0.4343029856681824, "learning_rate": 0.0004998152154181093, "loss": 1.3766, "step": 1840 }, { "epoch": 0.22178146411828345, "grad_norm": 0.5296164155006409, "learning_rate": 0.0004998133779198293, "loss": 1.3625, "step": 1845 }, { "epoch": 0.2223824978963818, "grad_norm": 0.4429774284362793, "learning_rate": 0.0004998115313340611, "loss": 1.3891, "step": 1850 }, { "epoch": 0.2229835316744801, "grad_norm": 0.5772582292556763, "learning_rate": 0.0004998096756608719, "loss": 1.5437, "step": 1855 }, { "epoch": 0.22358456545257843, "grad_norm": 0.5951064825057983, "learning_rate": 0.0004998078109003291, "loss": 1.4672, "step": 1860 }, { "epoch": 0.22418559923067677, "grad_norm": 0.3261686861515045, "learning_rate": 0.0004998059370525006, "loss": 1.5063, "step": 1865 }, { "epoch": 0.2247866330087751, "grad_norm": 0.3098331689834595, "learning_rate": 0.0004998040541174545, "loss": 1.5094, "step": 1870 }, { "epoch": 0.22538766678687341, "grad_norm": 0.8590214252471924, "learning_rate": 0.0004998021620952593, "loss": 1.3977, "step": 1875 }, { "epoch": 0.22598870056497175, "grad_norm": 0.5078855752944946, "learning_rate": 0.0004998002609859839, "loss": 1.2789, "step": 1880 }, { "epoch": 0.2265897343430701, "grad_norm": 0.4515461027622223, "learning_rate": 0.0004997983507896976, "loss": 1.368, "step": 1885 }, { "epoch": 0.2271907681211684, "grad_norm": 0.4937264025211334, "learning_rate": 0.0004997964315064695, "loss": 1.1953, "step": 1890 }, { "epoch": 0.22779180189926673, "grad_norm": 0.6028769612312317, "learning_rate": 0.0004997945031363697, "loss": 1.4859, "step": 1895 }, { "epoch": 0.22839283567736507, "grad_norm": 0.4746128022670746, "learning_rate": 0.0004997925656794683, "loss": 1.6016, "step": 1900 }, { "epoch": 0.2289938694554634, "grad_norm": 0.519091010093689, "learning_rate": 0.0004997906191358358, "loss": 1.3906, "step": 1905 }, { "epoch": 0.22959490323356171, "grad_norm": 0.4584903419017792, "learning_rate": 0.0004997886635055429, "loss": 1.3258, "step": 1910 }, { "epoch": 0.23019593701166005, "grad_norm": 0.7446622252464294, "learning_rate": 0.0004997866987886608, "loss": 1.2141, "step": 1915 }, { "epoch": 0.2307969707897584, "grad_norm": 0.5405495166778564, "learning_rate": 0.0004997847249852609, "loss": 1.4359, "step": 1920 }, { "epoch": 0.23139800456785672, "grad_norm": 0.38187775015830994, "learning_rate": 0.0004997827420954152, "loss": 1.7219, "step": 1925 }, { "epoch": 0.23199903834595503, "grad_norm": 0.503364622592926, "learning_rate": 0.0004997807501191957, "loss": 1.3586, "step": 1930 }, { "epoch": 0.23260007212405337, "grad_norm": 0.43855008482933044, "learning_rate": 0.0004997787490566749, "loss": 1.7625, "step": 1935 }, { "epoch": 0.2332011059021517, "grad_norm": 0.4955185651779175, "learning_rate": 0.0004997767389079255, "loss": 1.2281, "step": 1940 }, { "epoch": 0.23380213968025004, "grad_norm": 0.7726651430130005, "learning_rate": 0.0004997747196730206, "loss": 1.5445, "step": 1945 }, { "epoch": 0.23440317345834835, "grad_norm": 0.38199684023857117, "learning_rate": 0.000499772691352034, "loss": 1.4203, "step": 1950 }, { "epoch": 0.2350042072364467, "grad_norm": 0.4838792383670807, "learning_rate": 0.000499770653945039, "loss": 1.2484, "step": 1955 }, { "epoch": 0.23560524101454502, "grad_norm": 0.43874993920326233, "learning_rate": 0.00049976860745211, "loss": 1.3594, "step": 1960 }, { "epoch": 0.23620627479264333, "grad_norm": 0.4992177188396454, "learning_rate": 0.0004997665518733215, "loss": 1.1977, "step": 1965 }, { "epoch": 0.23680730857074167, "grad_norm": 0.526907742023468, "learning_rate": 0.000499764487208748, "loss": 1.1609, "step": 1970 }, { "epoch": 0.23740834234884, "grad_norm": 0.599902868270874, "learning_rate": 0.000499762413458465, "loss": 1.4203, "step": 1975 }, { "epoch": 0.23800937612693834, "grad_norm": 0.42601317167282104, "learning_rate": 0.0004997603306225475, "loss": 1.1516, "step": 1980 }, { "epoch": 0.23861040990503665, "grad_norm": 0.3787403404712677, "learning_rate": 0.0004997582387010715, "loss": 1.3391, "step": 1985 }, { "epoch": 0.239211443683135, "grad_norm": 0.5586139559745789, "learning_rate": 0.0004997561376941131, "loss": 1.6656, "step": 1990 }, { "epoch": 0.23981247746123333, "grad_norm": 0.44761109352111816, "learning_rate": 0.0004997540276017487, "loss": 1.5828, "step": 1995 }, { "epoch": 0.24041351123933166, "grad_norm": 0.4291538894176483, "learning_rate": 0.000499751908424055, "loss": 1.4539, "step": 2000 }, { "epoch": 0.24041351123933166, "eval_loss": 2.2626953125, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2012, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 2000 }, { "epoch": 0.24101454501742997, "grad_norm": 0.46680477261543274, "learning_rate": 0.0004997497801611093, "loss": 1.2609, "step": 2005 }, { "epoch": 0.2416155787955283, "grad_norm": 0.42086416482925415, "learning_rate": 0.0004997476428129887, "loss": 1.1609, "step": 2010 }, { "epoch": 0.24221661257362664, "grad_norm": 0.7524279952049255, "learning_rate": 0.0004997454963797713, "loss": 1.0633, "step": 2015 }, { "epoch": 0.24281764635172498, "grad_norm": 0.43722498416900635, "learning_rate": 0.0004997433408615349, "loss": 1.2969, "step": 2020 }, { "epoch": 0.2434186801298233, "grad_norm": 0.2848932147026062, "learning_rate": 0.0004997411762583581, "loss": 1.2063, "step": 2025 }, { "epoch": 0.24401971390792163, "grad_norm": 0.4349381923675537, "learning_rate": 0.0004997390025703194, "loss": 1.3625, "step": 2030 }, { "epoch": 0.24462074768601996, "grad_norm": 0.4666562080383301, "learning_rate": 0.0004997368197974982, "loss": 1.4164, "step": 2035 }, { "epoch": 0.24522178146411827, "grad_norm": 0.5730391144752502, "learning_rate": 0.0004997346279399736, "loss": 1.1633, "step": 2040 }, { "epoch": 0.2458228152422166, "grad_norm": 0.5395126938819885, "learning_rate": 0.0004997324269978255, "loss": 1.2398, "step": 2045 }, { "epoch": 0.24642384902031494, "grad_norm": 0.3828608989715576, "learning_rate": 0.000499730216971134, "loss": 1.0594, "step": 2050 }, { "epoch": 0.24702488279841328, "grad_norm": 0.796903133392334, "learning_rate": 0.0004997279978599794, "loss": 1.3055, "step": 2055 }, { "epoch": 0.2476259165765116, "grad_norm": 0.35091638565063477, "learning_rate": 0.0004997257696644424, "loss": 1.2023, "step": 2060 }, { "epoch": 0.24822695035460993, "grad_norm": 0.46753543615341187, "learning_rate": 0.000499723532384604, "loss": 1.3203, "step": 2065 }, { "epoch": 0.24882798413270826, "grad_norm": 0.5231248736381531, "learning_rate": 0.0004997212860205459, "loss": 1.3438, "step": 2070 }, { "epoch": 0.2494290179108066, "grad_norm": 0.470639169216156, "learning_rate": 0.0004997190305723495, "loss": 1.3031, "step": 2075 }, { "epoch": 0.2500300516889049, "grad_norm": 0.4669177234172821, "learning_rate": 0.000499716766040097, "loss": 1.5188, "step": 2080 }, { "epoch": 0.25063108546700325, "grad_norm": 0.5113319754600525, "learning_rate": 0.0004997144924238706, "loss": 1.0992, "step": 2085 }, { "epoch": 0.2512321192451016, "grad_norm": 0.5395264625549316, "learning_rate": 0.0004997122097237533, "loss": 1.3281, "step": 2090 }, { "epoch": 0.2518331530231999, "grad_norm": 0.47676244378089905, "learning_rate": 0.0004997099179398279, "loss": 1.2898, "step": 2095 }, { "epoch": 0.25243418680129825, "grad_norm": 0.3385642468929291, "learning_rate": 0.0004997076170721778, "loss": 1.2078, "step": 2100 }, { "epoch": 0.25303522057939654, "grad_norm": 0.3868078887462616, "learning_rate": 0.0004997053071208868, "loss": 1.4563, "step": 2105 }, { "epoch": 0.2536362543574949, "grad_norm": 0.437321275472641, "learning_rate": 0.0004997029880860389, "loss": 1.3977, "step": 2110 }, { "epoch": 0.2542372881355932, "grad_norm": 0.6515981554985046, "learning_rate": 0.0004997006599677183, "loss": 1.2461, "step": 2115 }, { "epoch": 0.25483832191369155, "grad_norm": 0.3654949367046356, "learning_rate": 0.0004996983227660099, "loss": 1.4187, "step": 2120 }, { "epoch": 0.2554393556917899, "grad_norm": 0.5203860998153687, "learning_rate": 0.0004996959764809987, "loss": 1.4328, "step": 2125 }, { "epoch": 0.2560403894698882, "grad_norm": 0.5454062223434448, "learning_rate": 0.00049969362111277, "loss": 1.5125, "step": 2130 }, { "epoch": 0.25664142324798656, "grad_norm": 0.5460174679756165, "learning_rate": 0.0004996912566614094, "loss": 1.4344, "step": 2135 }, { "epoch": 0.2572424570260849, "grad_norm": 0.4798714816570282, "learning_rate": 0.000499688883127003, "loss": 1.1953, "step": 2140 }, { "epoch": 0.2578434908041832, "grad_norm": 0.679547905921936, "learning_rate": 0.0004996865005096372, "loss": 1.2688, "step": 2145 }, { "epoch": 0.2584445245822815, "grad_norm": 0.42334336042404175, "learning_rate": 0.0004996841088093985, "loss": 1.1516, "step": 2150 }, { "epoch": 0.25904555836037985, "grad_norm": 0.4171724021434784, "learning_rate": 0.000499681708026374, "loss": 1.0961, "step": 2155 }, { "epoch": 0.2596465921384782, "grad_norm": 0.6091195940971375, "learning_rate": 0.0004996792981606511, "loss": 1.0164, "step": 2160 }, { "epoch": 0.2602476259165765, "grad_norm": 0.7312507033348083, "learning_rate": 0.0004996768792123173, "loss": 1.3031, "step": 2165 }, { "epoch": 0.26084865969467486, "grad_norm": 0.8120207786560059, "learning_rate": 0.0004996744511814609, "loss": 1.1641, "step": 2170 }, { "epoch": 0.2614496934727732, "grad_norm": 0.4702399969100952, "learning_rate": 0.0004996720140681699, "loss": 1.2805, "step": 2175 }, { "epoch": 0.2620507272508715, "grad_norm": 0.45239925384521484, "learning_rate": 0.0004996695678725331, "loss": 1.5539, "step": 2180 }, { "epoch": 0.2626517610289698, "grad_norm": 0.6370692253112793, "learning_rate": 0.0004996671125946394, "loss": 1.2156, "step": 2185 }, { "epoch": 0.26325279480706815, "grad_norm": 0.6115698218345642, "learning_rate": 0.0004996646482345781, "loss": 1.2891, "step": 2190 }, { "epoch": 0.2638538285851665, "grad_norm": 0.611488401889801, "learning_rate": 0.0004996621747924391, "loss": 1.3023, "step": 2195 }, { "epoch": 0.2644548623632648, "grad_norm": 0.6977550983428955, "learning_rate": 0.0004996596922683122, "loss": 1.3555, "step": 2200 }, { "epoch": 0.2644548623632648, "eval_loss": 2.2613282203674316, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2156, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 2200 }, { "epoch": 0.26505589614136316, "grad_norm": 0.6270340085029602, "learning_rate": 0.0004996572006622876, "loss": 1.5938, "step": 2205 }, { "epoch": 0.2656569299194615, "grad_norm": 0.5670061707496643, "learning_rate": 0.0004996546999744561, "loss": 1.5016, "step": 2210 }, { "epoch": 0.26625796369755983, "grad_norm": 0.38163918256759644, "learning_rate": 0.0004996521902049086, "loss": 1.2812, "step": 2215 }, { "epoch": 0.2668589974756581, "grad_norm": 0.45828545093536377, "learning_rate": 0.0004996496713537365, "loss": 1.3023, "step": 2220 }, { "epoch": 0.26746003125375645, "grad_norm": 0.4318217933177948, "learning_rate": 0.0004996471434210312, "loss": 1.6039, "step": 2225 }, { "epoch": 0.2680610650318548, "grad_norm": 0.5099067091941833, "learning_rate": 0.0004996446064068848, "loss": 1.5562, "step": 2230 }, { "epoch": 0.2686620988099531, "grad_norm": 0.7253368496894836, "learning_rate": 0.0004996420603113897, "loss": 1.2523, "step": 2235 }, { "epoch": 0.26926313258805146, "grad_norm": 0.6101372838020325, "learning_rate": 0.0004996395051346384, "loss": 1.4125, "step": 2240 }, { "epoch": 0.2698641663661498, "grad_norm": 0.5073166489601135, "learning_rate": 0.0004996369408767238, "loss": 1.1109, "step": 2245 }, { "epoch": 0.27046520014424813, "grad_norm": 0.4978417456150055, "learning_rate": 0.0004996343675377393, "loss": 1.3438, "step": 2250 }, { "epoch": 0.2710662339223464, "grad_norm": 0.695686936378479, "learning_rate": 0.0004996317851177784, "loss": 1.0445, "step": 2255 }, { "epoch": 0.27166726770044475, "grad_norm": 0.5276048183441162, "learning_rate": 0.000499629193616935, "loss": 1.2703, "step": 2260 }, { "epoch": 0.2722683014785431, "grad_norm": 0.7686821222305298, "learning_rate": 0.0004996265930353036, "loss": 1.2656, "step": 2265 }, { "epoch": 0.2728693352566414, "grad_norm": 0.673497200012207, "learning_rate": 0.0004996239833729786, "loss": 1.4055, "step": 2270 }, { "epoch": 0.27347036903473976, "grad_norm": 0.4770069122314453, "learning_rate": 0.000499621364630055, "loss": 1.1227, "step": 2275 }, { "epoch": 0.2740714028128381, "grad_norm": 0.630565881729126, "learning_rate": 0.000499618736806628, "loss": 1.293, "step": 2280 }, { "epoch": 0.27467243659093643, "grad_norm": 0.5288265943527222, "learning_rate": 0.0004996160999027933, "loss": 1.5109, "step": 2285 }, { "epoch": 0.27527347036903477, "grad_norm": 0.35486194491386414, "learning_rate": 0.0004996134539186469, "loss": 1.5078, "step": 2290 }, { "epoch": 0.27587450414713305, "grad_norm": 0.5654587745666504, "learning_rate": 0.0004996107988542847, "loss": 1.625, "step": 2295 }, { "epoch": 0.2764755379252314, "grad_norm": 0.40694040060043335, "learning_rate": 0.0004996081347098037, "loss": 1.4531, "step": 2300 }, { "epoch": 0.2770765717033297, "grad_norm": 0.5765879154205322, "learning_rate": 0.0004996054614853005, "loss": 1.343, "step": 2305 }, { "epoch": 0.27767760548142806, "grad_norm": 0.49710384011268616, "learning_rate": 0.0004996027791808725, "loss": 1.3266, "step": 2310 }, { "epoch": 0.2782786392595264, "grad_norm": 0.5011634826660156, "learning_rate": 0.0004996000877966172, "loss": 1.3438, "step": 2315 }, { "epoch": 0.27887967303762473, "grad_norm": 0.6307665705680847, "learning_rate": 0.0004995973873326326, "loss": 1.5703, "step": 2320 }, { "epoch": 0.27948070681572307, "grad_norm": 0.46662095189094543, "learning_rate": 0.0004995946777890169, "loss": 1.4414, "step": 2325 }, { "epoch": 0.28008174059382135, "grad_norm": 0.49989181756973267, "learning_rate": 0.0004995919591658687, "loss": 1.3789, "step": 2330 }, { "epoch": 0.2806827743719197, "grad_norm": 0.4880094528198242, "learning_rate": 0.0004995892314632867, "loss": 1.2633, "step": 2335 }, { "epoch": 0.281283808150018, "grad_norm": 0.6314132213592529, "learning_rate": 0.0004995864946813703, "loss": 1.5539, "step": 2340 }, { "epoch": 0.28188484192811636, "grad_norm": 0.7073726654052734, "learning_rate": 0.0004995837488202191, "loss": 1.3766, "step": 2345 }, { "epoch": 0.2824858757062147, "grad_norm": 0.5559587478637695, "learning_rate": 0.0004995809938799329, "loss": 1.4875, "step": 2350 }, { "epoch": 0.28308690948431303, "grad_norm": 0.4955267906188965, "learning_rate": 0.0004995782298606119, "loss": 1.3156, "step": 2355 }, { "epoch": 0.28368794326241137, "grad_norm": 0.4989592432975769, "learning_rate": 0.0004995754567623567, "loss": 1.2484, "step": 2360 }, { "epoch": 0.28428897704050965, "grad_norm": 0.5886387228965759, "learning_rate": 0.0004995726745852681, "loss": 1.2344, "step": 2365 }, { "epoch": 0.284890010818608, "grad_norm": 0.5085893273353577, "learning_rate": 0.0004995698833294474, "loss": 1.407, "step": 2370 }, { "epoch": 0.2854910445967063, "grad_norm": 0.4706375002861023, "learning_rate": 0.000499567082994996, "loss": 1.3117, "step": 2375 }, { "epoch": 0.28609207837480466, "grad_norm": 0.5287367701530457, "learning_rate": 0.000499564273582016, "loss": 1.2594, "step": 2380 }, { "epoch": 0.286693112152903, "grad_norm": 0.5483081936836243, "learning_rate": 0.0004995614550906093, "loss": 1.1008, "step": 2385 }, { "epoch": 0.28729414593100133, "grad_norm": 0.8154200911521912, "learning_rate": 0.0004995586275208788, "loss": 1.5164, "step": 2390 }, { "epoch": 0.28789517970909967, "grad_norm": 0.837818443775177, "learning_rate": 0.000499555790872927, "loss": 1.2531, "step": 2395 }, { "epoch": 0.288496213487198, "grad_norm": 0.4989728033542633, "learning_rate": 0.0004995529451468574, "loss": 1.3719, "step": 2400 }, { "epoch": 0.288496213487198, "eval_loss": 2.189453125, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.202, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 2400 }, { "epoch": 0.2890972472652963, "grad_norm": 0.5615506768226624, "learning_rate": 0.0004995500903427732, "loss": 1.1023, "step": 2405 }, { "epoch": 0.2896982810433946, "grad_norm": 0.7758134007453918, "learning_rate": 0.0004995472264607784, "loss": 1.2625, "step": 2410 }, { "epoch": 0.29029931482149296, "grad_norm": 0.6751444935798645, "learning_rate": 0.0004995443535009773, "loss": 1.5734, "step": 2415 }, { "epoch": 0.2909003485995913, "grad_norm": 0.5839786529541016, "learning_rate": 0.0004995414714634743, "loss": 1.3625, "step": 2420 }, { "epoch": 0.29150138237768963, "grad_norm": 0.5906524062156677, "learning_rate": 0.0004995385803483742, "loss": 1.0875, "step": 2425 }, { "epoch": 0.29210241615578797, "grad_norm": 0.7597156763076782, "learning_rate": 0.0004995356801557821, "loss": 1.4781, "step": 2430 }, { "epoch": 0.2927034499338863, "grad_norm": 0.5112520456314087, "learning_rate": 0.0004995327708858038, "loss": 1.2758, "step": 2435 }, { "epoch": 0.2933044837119846, "grad_norm": 0.44212523102760315, "learning_rate": 0.0004995298525385447, "loss": 1.5094, "step": 2440 }, { "epoch": 0.2939055174900829, "grad_norm": 0.43641284108161926, "learning_rate": 0.0004995269251141114, "loss": 1.1656, "step": 2445 }, { "epoch": 0.29450655126818126, "grad_norm": 0.4382478892803192, "learning_rate": 0.0004995239886126102, "loss": 1.5023, "step": 2450 }, { "epoch": 0.2951075850462796, "grad_norm": 0.6196442246437073, "learning_rate": 0.0004995210430341478, "loss": 1.2875, "step": 2455 }, { "epoch": 0.29570861882437793, "grad_norm": 0.6048389673233032, "learning_rate": 0.0004995180883788316, "loss": 0.9516, "step": 2460 }, { "epoch": 0.29630965260247627, "grad_norm": 0.5682608485221863, "learning_rate": 0.0004995151246467689, "loss": 1.3422, "step": 2465 }, { "epoch": 0.2969106863805746, "grad_norm": 0.5677405595779419, "learning_rate": 0.0004995121518380674, "loss": 1.5016, "step": 2470 }, { "epoch": 0.29751172015867294, "grad_norm": 0.48005715012550354, "learning_rate": 0.0004995091699528355, "loss": 1.3219, "step": 2475 }, { "epoch": 0.2981127539367712, "grad_norm": 0.48294246196746826, "learning_rate": 0.0004995061789911817, "loss": 1.2516, "step": 2480 }, { "epoch": 0.29871378771486956, "grad_norm": 0.7167287468910217, "learning_rate": 0.0004995031789532147, "loss": 1.3531, "step": 2485 }, { "epoch": 0.2993148214929679, "grad_norm": 0.5675193667411804, "learning_rate": 0.0004995001698390434, "loss": 1.3648, "step": 2490 }, { "epoch": 0.29991585527106623, "grad_norm": 0.5264390707015991, "learning_rate": 0.0004994971516487775, "loss": 1.1133, "step": 2495 }, { "epoch": 0.30051688904916457, "grad_norm": 0.5506901144981384, "learning_rate": 0.0004994941243825269, "loss": 1.1594, "step": 2500 }, { "epoch": 0.3011179228272629, "grad_norm": 0.9272066950798035, "learning_rate": 0.0004994910880404015, "loss": 1.4906, "step": 2505 }, { "epoch": 0.30171895660536124, "grad_norm": 0.5853176712989807, "learning_rate": 0.0004994880426225119, "loss": 1.3508, "step": 2510 }, { "epoch": 0.3023199903834595, "grad_norm": 0.4796172082424164, "learning_rate": 0.0004994849881289687, "loss": 1.3484, "step": 2515 }, { "epoch": 0.30292102416155786, "grad_norm": 0.6331420540809631, "learning_rate": 0.0004994819245598833, "loss": 1.2188, "step": 2520 }, { "epoch": 0.3035220579396562, "grad_norm": 0.6519079208374023, "learning_rate": 0.000499478851915367, "loss": 1.3531, "step": 2525 }, { "epoch": 0.30412309171775453, "grad_norm": 0.6366649866104126, "learning_rate": 0.0004994757701955314, "loss": 1.1703, "step": 2530 }, { "epoch": 0.30472412549585287, "grad_norm": 0.5621868371963501, "learning_rate": 0.0004994726794004888, "loss": 1.0441, "step": 2535 }, { "epoch": 0.3053251592739512, "grad_norm": 0.6726334095001221, "learning_rate": 0.0004994695795303517, "loss": 1.2984, "step": 2540 }, { "epoch": 0.30592619305204954, "grad_norm": 0.5448851585388184, "learning_rate": 0.0004994664705852326, "loss": 0.8781, "step": 2545 }, { "epoch": 0.3065272268301479, "grad_norm": 0.6853761076927185, "learning_rate": 0.0004994633525652448, "loss": 1.6891, "step": 2550 }, { "epoch": 0.30712826060824616, "grad_norm": 0.5627267956733704, "learning_rate": 0.0004994602254705017, "loss": 1.368, "step": 2555 }, { "epoch": 0.3077292943863445, "grad_norm": 0.38999640941619873, "learning_rate": 0.0004994570893011171, "loss": 1.3789, "step": 2560 }, { "epoch": 0.30833032816444284, "grad_norm": 0.6671114563941956, "learning_rate": 0.000499453944057205, "loss": 1.4078, "step": 2565 }, { "epoch": 0.30893136194254117, "grad_norm": 0.5521063208580017, "learning_rate": 0.0004994507897388798, "loss": 1.5859, "step": 2570 }, { "epoch": 0.3095323957206395, "grad_norm": 0.6885313391685486, "learning_rate": 0.0004994476263462563, "loss": 1.2578, "step": 2575 }, { "epoch": 0.31013342949873784, "grad_norm": 0.45498156547546387, "learning_rate": 0.0004994444538794495, "loss": 1.3914, "step": 2580 }, { "epoch": 0.3107344632768362, "grad_norm": 0.5482655167579651, "learning_rate": 0.0004994412723385749, "loss": 1.3391, "step": 2585 }, { "epoch": 0.31133549705493446, "grad_norm": 0.5240392684936523, "learning_rate": 0.0004994380817237482, "loss": 1.25, "step": 2590 }, { "epoch": 0.3119365308330328, "grad_norm": 0.5129856467247009, "learning_rate": 0.0004994348820350854, "loss": 1.4406, "step": 2595 }, { "epoch": 0.31253756461113114, "grad_norm": 0.5252668261528015, "learning_rate": 0.000499431673272703, "loss": 1.0773, "step": 2600 }, { "epoch": 0.31253756461113114, "eval_loss": 2.1500000953674316, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1975, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 2600 }, { "epoch": 0.31313859838922947, "grad_norm": 0.6648097634315491, "learning_rate": 0.0004994284554367176, "loss": 1.1133, "step": 2605 }, { "epoch": 0.3137396321673278, "grad_norm": 0.6218547224998474, "learning_rate": 0.0004994252285272465, "loss": 1.2937, "step": 2610 }, { "epoch": 0.31434066594542615, "grad_norm": 0.6880519390106201, "learning_rate": 0.0004994219925444068, "loss": 1.8039, "step": 2615 }, { "epoch": 0.3149416997235245, "grad_norm": 0.6464706063270569, "learning_rate": 0.0004994187474883164, "loss": 1.5594, "step": 2620 }, { "epoch": 0.3155427335016228, "grad_norm": 0.7200093865394592, "learning_rate": 0.0004994154933590932, "loss": 1.0945, "step": 2625 }, { "epoch": 0.3161437672797211, "grad_norm": 0.6853864789009094, "learning_rate": 0.0004994122301568557, "loss": 1.268, "step": 2630 }, { "epoch": 0.31674480105781944, "grad_norm": 0.5081961750984192, "learning_rate": 0.0004994089578817226, "loss": 1.4062, "step": 2635 }, { "epoch": 0.3173458348359178, "grad_norm": 0.4750553071498871, "learning_rate": 0.0004994056765338129, "loss": 1.2828, "step": 2640 }, { "epoch": 0.3179468686140161, "grad_norm": 0.5867997407913208, "learning_rate": 0.0004994023861132459, "loss": 1.2484, "step": 2645 }, { "epoch": 0.31854790239211445, "grad_norm": 0.7348740696907043, "learning_rate": 0.0004993990866201414, "loss": 1.2258, "step": 2650 }, { "epoch": 0.3191489361702128, "grad_norm": 0.5523998141288757, "learning_rate": 0.0004993957780546193, "loss": 1.2805, "step": 2655 }, { "epoch": 0.3197499699483111, "grad_norm": 0.5308116674423218, "learning_rate": 0.0004993924604168001, "loss": 1.3188, "step": 2660 }, { "epoch": 0.3203510037264094, "grad_norm": 0.40592867136001587, "learning_rate": 0.0004993891337068046, "loss": 1.2148, "step": 2665 }, { "epoch": 0.32095203750450774, "grad_norm": 0.6522583365440369, "learning_rate": 0.0004993857979247535, "loss": 1.175, "step": 2670 }, { "epoch": 0.3215530712826061, "grad_norm": 0.5981694459915161, "learning_rate": 0.0004993824530707682, "loss": 1.143, "step": 2675 }, { "epoch": 0.3221541050607044, "grad_norm": 0.6832042932510376, "learning_rate": 0.0004993790991449707, "loss": 1.2242, "step": 2680 }, { "epoch": 0.32275513883880275, "grad_norm": 0.6935708522796631, "learning_rate": 0.0004993757361474825, "loss": 0.9617, "step": 2685 }, { "epoch": 0.3233561726169011, "grad_norm": 0.5491186380386353, "learning_rate": 0.0004993723640784265, "loss": 1.3672, "step": 2690 }, { "epoch": 0.3239572063949994, "grad_norm": 0.4743538498878479, "learning_rate": 0.0004993689829379249, "loss": 1.1547, "step": 2695 }, { "epoch": 0.3245582401730977, "grad_norm": 0.641859769821167, "learning_rate": 0.0004993655927261008, "loss": 1.4078, "step": 2700 }, { "epoch": 0.32515927395119604, "grad_norm": 0.5002933144569397, "learning_rate": 0.0004993621934430778, "loss": 0.9492, "step": 2705 }, { "epoch": 0.3257603077292944, "grad_norm": 0.7241799831390381, "learning_rate": 0.0004993587850889793, "loss": 1.575, "step": 2710 }, { "epoch": 0.3263613415073927, "grad_norm": 0.5693483948707581, "learning_rate": 0.0004993553676639292, "loss": 0.9961, "step": 2715 }, { "epoch": 0.32696237528549105, "grad_norm": 0.43130815029144287, "learning_rate": 0.000499351941168052, "loss": 1.2984, "step": 2720 }, { "epoch": 0.3275634090635894, "grad_norm": 0.5054978728294373, "learning_rate": 0.0004993485056014724, "loss": 1.1375, "step": 2725 }, { "epoch": 0.3281644428416877, "grad_norm": 0.5581235289573669, "learning_rate": 0.0004993450609643152, "loss": 1.0164, "step": 2730 }, { "epoch": 0.32876547661978606, "grad_norm": 0.6733124256134033, "learning_rate": 0.0004993416072567059, "loss": 1.4078, "step": 2735 }, { "epoch": 0.32936651039788434, "grad_norm": 0.5003538727760315, "learning_rate": 0.0004993381444787699, "loss": 0.8742, "step": 2740 }, { "epoch": 0.3299675441759827, "grad_norm": 0.6292559504508972, "learning_rate": 0.0004993346726306333, "loss": 1.007, "step": 2745 }, { "epoch": 0.330568577954081, "grad_norm": 0.6760239005088806, "learning_rate": 0.0004993311917124224, "loss": 1.25, "step": 2750 }, { "epoch": 0.33116961173217935, "grad_norm": 0.6075654625892639, "learning_rate": 0.0004993277017242638, "loss": 1.5766, "step": 2755 }, { "epoch": 0.3317706455102777, "grad_norm": 0.5432557463645935, "learning_rate": 0.0004993242026662846, "loss": 1.0883, "step": 2760 }, { "epoch": 0.332371679288376, "grad_norm": 0.6972253918647766, "learning_rate": 0.0004993206945386118, "loss": 0.9992, "step": 2765 }, { "epoch": 0.33297271306647436, "grad_norm": 0.45837146043777466, "learning_rate": 0.0004993171773413731, "loss": 1.6766, "step": 2770 }, { "epoch": 0.33357374684457264, "grad_norm": 0.5207621455192566, "learning_rate": 0.0004993136510746966, "loss": 1.2578, "step": 2775 }, { "epoch": 0.334174780622671, "grad_norm": 0.7034028768539429, "learning_rate": 0.0004993101157387106, "loss": 1.3578, "step": 2780 }, { "epoch": 0.3347758144007693, "grad_norm": 0.544851541519165, "learning_rate": 0.0004993065713335434, "loss": 1.2836, "step": 2785 }, { "epoch": 0.33537684817886765, "grad_norm": 0.705143928527832, "learning_rate": 0.0004993030178593241, "loss": 1.4453, "step": 2790 }, { "epoch": 0.335977881956966, "grad_norm": 0.6619438529014587, "learning_rate": 0.0004992994553161823, "loss": 1.0547, "step": 2795 }, { "epoch": 0.3365789157350643, "grad_norm": 0.5982903242111206, "learning_rate": 0.000499295883704247, "loss": 1.4281, "step": 2800 }, { "epoch": 0.3365789157350643, "eval_loss": 2.189453125, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1965, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 2800 }, { "epoch": 0.33717994951316266, "grad_norm": 0.589056670665741, "learning_rate": 0.0004992923030236485, "loss": 1.3727, "step": 2805 }, { "epoch": 0.337780983291261, "grad_norm": 0.39378607273101807, "learning_rate": 0.000499288713274517, "loss": 1.1789, "step": 2810 }, { "epoch": 0.3383820170693593, "grad_norm": 0.5460519790649414, "learning_rate": 0.000499285114456983, "loss": 1.1719, "step": 2815 }, { "epoch": 0.3389830508474576, "grad_norm": 0.4953864812850952, "learning_rate": 0.0004992815065711774, "loss": 1.1672, "step": 2820 }, { "epoch": 0.33958408462555595, "grad_norm": 0.5705846548080444, "learning_rate": 0.0004992778896172317, "loss": 1.5328, "step": 2825 }, { "epoch": 0.3401851184036543, "grad_norm": 0.5687447190284729, "learning_rate": 0.0004992742635952771, "loss": 1.0063, "step": 2830 }, { "epoch": 0.3407861521817526, "grad_norm": 0.516343891620636, "learning_rate": 0.0004992706285054458, "loss": 1.1492, "step": 2835 }, { "epoch": 0.34138718595985096, "grad_norm": 0.6128392815589905, "learning_rate": 0.0004992669843478699, "loss": 1.325, "step": 2840 }, { "epoch": 0.3419882197379493, "grad_norm": 0.5104270577430725, "learning_rate": 0.000499263331122682, "loss": 1.0195, "step": 2845 }, { "epoch": 0.3425892535160476, "grad_norm": 0.38332250714302063, "learning_rate": 0.0004992596688300149, "loss": 1.302, "step": 2850 }, { "epoch": 0.3431902872941459, "grad_norm": 0.5674039125442505, "learning_rate": 0.000499255997470002, "loss": 1.2094, "step": 2855 }, { "epoch": 0.34379132107224425, "grad_norm": 0.7987366914749146, "learning_rate": 0.0004992523170427766, "loss": 1.2047, "step": 2860 }, { "epoch": 0.3443923548503426, "grad_norm": 0.45501282811164856, "learning_rate": 0.0004992486275484729, "loss": 1.1539, "step": 2865 }, { "epoch": 0.3449933886284409, "grad_norm": 0.5390669703483582, "learning_rate": 0.0004992449289872249, "loss": 1.2102, "step": 2870 }, { "epoch": 0.34559442240653926, "grad_norm": 0.6710581183433533, "learning_rate": 0.0004992412213591672, "loss": 1.4297, "step": 2875 }, { "epoch": 0.3461954561846376, "grad_norm": 0.6371570825576782, "learning_rate": 0.0004992375046644347, "loss": 1.0164, "step": 2880 }, { "epoch": 0.34679648996273593, "grad_norm": 0.49934741854667664, "learning_rate": 0.0004992337789031625, "loss": 1.1313, "step": 2885 }, { "epoch": 0.3473975237408342, "grad_norm": 0.41756120324134827, "learning_rate": 0.0004992300440754862, "loss": 1.1969, "step": 2890 }, { "epoch": 0.34799855751893255, "grad_norm": 0.8102174997329712, "learning_rate": 0.0004992263001815418, "loss": 1.2719, "step": 2895 }, { "epoch": 0.3485995912970309, "grad_norm": 0.45573851466178894, "learning_rate": 0.0004992225472214653, "loss": 1.1375, "step": 2900 }, { "epoch": 0.3492006250751292, "grad_norm": 0.5512142777442932, "learning_rate": 0.0004992187851953932, "loss": 1.4781, "step": 2905 }, { "epoch": 0.34980165885322756, "grad_norm": 0.6429489850997925, "learning_rate": 0.0004992150141034624, "loss": 1.3453, "step": 2910 }, { "epoch": 0.3504026926313259, "grad_norm": 0.6230481266975403, "learning_rate": 0.0004992112339458103, "loss": 1.2766, "step": 2915 }, { "epoch": 0.35100372640942423, "grad_norm": 0.6134311556816101, "learning_rate": 0.0004992074447225741, "loss": 1.0664, "step": 2920 }, { "epoch": 0.3516047601875225, "grad_norm": 0.5894711017608643, "learning_rate": 0.0004992036464338918, "loss": 0.9, "step": 2925 }, { "epoch": 0.35220579396562085, "grad_norm": 0.525947630405426, "learning_rate": 0.0004991998390799016, "loss": 1.4844, "step": 2930 }, { "epoch": 0.3528068277437192, "grad_norm": 0.5282221436500549, "learning_rate": 0.0004991960226607418, "loss": 1.0398, "step": 2935 }, { "epoch": 0.3534078615218175, "grad_norm": 0.5808281302452087, "learning_rate": 0.0004991921971765514, "loss": 0.943, "step": 2940 }, { "epoch": 0.35400889529991586, "grad_norm": 0.6163946390151978, "learning_rate": 0.0004991883626274696, "loss": 1.1086, "step": 2945 }, { "epoch": 0.3546099290780142, "grad_norm": 0.4154224991798401, "learning_rate": 0.0004991845190136357, "loss": 1.2703, "step": 2950 }, { "epoch": 0.35521096285611253, "grad_norm": 0.4030189514160156, "learning_rate": 0.0004991806663351897, "loss": 1.1086, "step": 2955 }, { "epoch": 0.35581199663421087, "grad_norm": 0.5927292704582214, "learning_rate": 0.0004991768045922718, "loss": 0.9758, "step": 2960 }, { "epoch": 0.35641303041230915, "grad_norm": 0.476971834897995, "learning_rate": 0.0004991729337850223, "loss": 1.525, "step": 2965 }, { "epoch": 0.3570140641904075, "grad_norm": 0.5584660768508911, "learning_rate": 0.000499169053913582, "loss": 1.2125, "step": 2970 }, { "epoch": 0.3576150979685058, "grad_norm": 0.5617804527282715, "learning_rate": 0.0004991651649780922, "loss": 1.3102, "step": 2975 }, { "epoch": 0.35821613174660416, "grad_norm": 0.3463181257247925, "learning_rate": 0.0004991612669786942, "loss": 1.4227, "step": 2980 }, { "epoch": 0.3588171655247025, "grad_norm": 0.5156741142272949, "learning_rate": 0.0004991573599155299, "loss": 1.3828, "step": 2985 }, { "epoch": 0.35941819930280083, "grad_norm": 0.6080055832862854, "learning_rate": 0.0004991534437887414, "loss": 1.0102, "step": 2990 }, { "epoch": 0.36001923308089917, "grad_norm": 0.5142369866371155, "learning_rate": 0.0004991495185984711, "loss": 1.3469, "step": 2995 }, { "epoch": 0.36062026685899745, "grad_norm": 0.4148232638835907, "learning_rate": 0.000499145584344862, "loss": 1.3102, "step": 3000 }, { "epoch": 0.36062026685899745, "eval_loss": 2.1435546875, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2124, "eval_samples_per_second": 4.544, "eval_steps_per_second": 1.136, "step": 3000 } ], "logging_steps": 5, "max_steps": 83190, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.53843742326784e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }