{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.783166904422254,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001783166904422254,
"grad_norm": 2.0930111408233643,
"learning_rate": 4.999995641358869e-05,
"loss": 0.7967,
"num_input_tokens_seen": 63024,
"step": 5
},
{
"epoch": 0.003566333808844508,
"grad_norm": 1.2970882654190063,
"learning_rate": 4.999982565450674e-05,
"loss": 0.7382,
"num_input_tokens_seen": 126336,
"step": 10
},
{
"epoch": 0.005349500713266762,
"grad_norm": 0.8319762349128723,
"learning_rate": 4.999960772321009e-05,
"loss": 0.6823,
"num_input_tokens_seen": 184688,
"step": 15
},
{
"epoch": 0.007132667617689016,
"grad_norm": 0.9985227584838867,
"learning_rate": 4.999930262045865e-05,
"loss": 0.6836,
"num_input_tokens_seen": 245808,
"step": 20
},
{
"epoch": 0.00891583452211127,
"grad_norm": 1.065556287765503,
"learning_rate": 4.9998910347316286e-05,
"loss": 0.7561,
"num_input_tokens_seen": 306944,
"step": 25
},
{
"epoch": 0.010699001426533523,
"grad_norm": 1.066805362701416,
"learning_rate": 4.9998430905150826e-05,
"loss": 0.7299,
"num_input_tokens_seen": 371616,
"step": 30
},
{
"epoch": 0.012482168330955777,
"grad_norm": 1.2590147256851196,
"learning_rate": 4.999786429563404e-05,
"loss": 0.6834,
"num_input_tokens_seen": 435536,
"step": 35
},
{
"epoch": 0.014265335235378032,
"grad_norm": 1.0066215991973877,
"learning_rate": 4.999721052074164e-05,
"loss": 0.6511,
"num_input_tokens_seen": 499328,
"step": 40
},
{
"epoch": 0.016048502139800285,
"grad_norm": 1.0162546634674072,
"learning_rate": 4.99964695827533e-05,
"loss": 0.5992,
"num_input_tokens_seen": 557504,
"step": 45
},
{
"epoch": 0.01783166904422254,
"grad_norm": 0.9829245209693909,
"learning_rate": 4.999564148425258e-05,
"loss": 0.6245,
"num_input_tokens_seen": 621440,
"step": 50
},
{
"epoch": 0.019614835948644792,
"grad_norm": 0.9447645545005798,
"learning_rate": 4.999472622812701e-05,
"loss": 0.6444,
"num_input_tokens_seen": 685856,
"step": 55
},
{
"epoch": 0.021398002853067047,
"grad_norm": 1.0958608388900757,
"learning_rate": 4.9993723817567996e-05,
"loss": 0.5194,
"num_input_tokens_seen": 748112,
"step": 60
},
{
"epoch": 0.023181169757489302,
"grad_norm": 0.9865729808807373,
"learning_rate": 4.999263425607086e-05,
"loss": 0.5021,
"num_input_tokens_seen": 811008,
"step": 65
},
{
"epoch": 0.024964336661911554,
"grad_norm": 1.2535978555679321,
"learning_rate": 4.9991457547434805e-05,
"loss": 0.6641,
"num_input_tokens_seen": 878272,
"step": 70
},
{
"epoch": 0.02674750356633381,
"grad_norm": 1.6020156145095825,
"learning_rate": 4.9990193695762914e-05,
"loss": 0.5479,
"num_input_tokens_seen": 942608,
"step": 75
},
{
"epoch": 0.028530670470756064,
"grad_norm": 1.1668367385864258,
"learning_rate": 4.998884270546214e-05,
"loss": 0.6181,
"num_input_tokens_seen": 1005776,
"step": 80
},
{
"epoch": 0.030313837375178315,
"grad_norm": 1.1580744981765747,
"learning_rate": 4.998740458124324e-05,
"loss": 0.6266,
"num_input_tokens_seen": 1068192,
"step": 85
},
{
"epoch": 0.03209700427960057,
"grad_norm": 0.9773775339126587,
"learning_rate": 4.9985879328120846e-05,
"loss": 0.5088,
"num_input_tokens_seen": 1128592,
"step": 90
},
{
"epoch": 0.033880171184022825,
"grad_norm": 1.4142199754714966,
"learning_rate": 4.9984266951413396e-05,
"loss": 0.5199,
"num_input_tokens_seen": 1194592,
"step": 95
},
{
"epoch": 0.03566333808844508,
"grad_norm": 1.459350347518921,
"learning_rate": 4.998256745674308e-05,
"loss": 0.5855,
"num_input_tokens_seen": 1257744,
"step": 100
},
{
"epoch": 0.037446504992867335,
"grad_norm": 1.118642807006836,
"learning_rate": 4.99807808500359e-05,
"loss": 0.6148,
"num_input_tokens_seen": 1320944,
"step": 105
},
{
"epoch": 0.039229671897289584,
"grad_norm": 1.1180983781814575,
"learning_rate": 4.99789071375216e-05,
"loss": 0.5517,
"num_input_tokens_seen": 1382928,
"step": 110
},
{
"epoch": 0.04101283880171184,
"grad_norm": 1.2651177644729614,
"learning_rate": 4.9976946325733654e-05,
"loss": 0.5959,
"num_input_tokens_seen": 1449408,
"step": 115
},
{
"epoch": 0.042796005706134094,
"grad_norm": 0.9860583543777466,
"learning_rate": 4.997489842150924e-05,
"loss": 0.4779,
"num_input_tokens_seen": 1510752,
"step": 120
},
{
"epoch": 0.04457917261055635,
"grad_norm": 1.0358836650848389,
"learning_rate": 4.997276343198922e-05,
"loss": 0.5474,
"num_input_tokens_seen": 1568928,
"step": 125
},
{
"epoch": 0.046362339514978604,
"grad_norm": 1.3108216524124146,
"learning_rate": 4.997054136461811e-05,
"loss": 0.4624,
"num_input_tokens_seen": 1631872,
"step": 130
},
{
"epoch": 0.04814550641940086,
"grad_norm": 1.0577709674835205,
"learning_rate": 4.996823222714408e-05,
"loss": 0.558,
"num_input_tokens_seen": 1694000,
"step": 135
},
{
"epoch": 0.04992867332382311,
"grad_norm": 0.9583589434623718,
"learning_rate": 4.996583602761887e-05,
"loss": 0.535,
"num_input_tokens_seen": 1752208,
"step": 140
},
{
"epoch": 0.05171184022824536,
"grad_norm": 1.1273239850997925,
"learning_rate": 4.9963352774397845e-05,
"loss": 0.581,
"num_input_tokens_seen": 1809968,
"step": 145
},
{
"epoch": 0.05349500713266762,
"grad_norm": 0.9180589914321899,
"learning_rate": 4.9960782476139875e-05,
"loss": 0.5853,
"num_input_tokens_seen": 1875584,
"step": 150
},
{
"epoch": 0.05527817403708987,
"grad_norm": 0.9368972778320312,
"learning_rate": 4.9958125141807376e-05,
"loss": 0.5655,
"num_input_tokens_seen": 1936544,
"step": 155
},
{
"epoch": 0.05706134094151213,
"grad_norm": 1.093083143234253,
"learning_rate": 4.9955380780666233e-05,
"loss": 0.5248,
"num_input_tokens_seen": 1997312,
"step": 160
},
{
"epoch": 0.05884450784593438,
"grad_norm": 1.0452104806900024,
"learning_rate": 4.99525494022858e-05,
"loss": 0.5912,
"num_input_tokens_seen": 2058400,
"step": 165
},
{
"epoch": 0.06062767475035663,
"grad_norm": 1.655479073524475,
"learning_rate": 4.9949631016538845e-05,
"loss": 0.5465,
"num_input_tokens_seen": 2123584,
"step": 170
},
{
"epoch": 0.062410841654778886,
"grad_norm": 1.295340895652771,
"learning_rate": 4.994662563360152e-05,
"loss": 0.6319,
"num_input_tokens_seen": 2187776,
"step": 175
},
{
"epoch": 0.06419400855920114,
"grad_norm": 1.1385325193405151,
"learning_rate": 4.994353326395334e-05,
"loss": 0.6121,
"num_input_tokens_seen": 2248592,
"step": 180
},
{
"epoch": 0.06597717546362339,
"grad_norm": 1.2202588319778442,
"learning_rate": 4.994035391837713e-05,
"loss": 0.5926,
"num_input_tokens_seen": 2311472,
"step": 185
},
{
"epoch": 0.06776034236804565,
"grad_norm": 1.1300709247589111,
"learning_rate": 4.9937087607958987e-05,
"loss": 0.5075,
"num_input_tokens_seen": 2374240,
"step": 190
},
{
"epoch": 0.0695435092724679,
"grad_norm": 1.0753881931304932,
"learning_rate": 4.993373434408825e-05,
"loss": 0.5187,
"num_input_tokens_seen": 2434864,
"step": 195
},
{
"epoch": 0.07132667617689016,
"grad_norm": 1.0271146297454834,
"learning_rate": 4.993029413845746e-05,
"loss": 0.5777,
"num_input_tokens_seen": 2495712,
"step": 200
},
{
"epoch": 0.07310984308131241,
"grad_norm": 1.7475312948226929,
"learning_rate": 4.9926767003062316e-05,
"loss": 0.5091,
"num_input_tokens_seen": 2555184,
"step": 205
},
{
"epoch": 0.07489300998573467,
"grad_norm": 1.1732685565948486,
"learning_rate": 4.992315295020163e-05,
"loss": 0.5594,
"num_input_tokens_seen": 2616736,
"step": 210
},
{
"epoch": 0.07667617689015692,
"grad_norm": 1.1418745517730713,
"learning_rate": 4.991945199247728e-05,
"loss": 0.633,
"num_input_tokens_seen": 2679568,
"step": 215
},
{
"epoch": 0.07845934379457917,
"grad_norm": 1.5812561511993408,
"learning_rate": 4.991566414279421e-05,
"loss": 0.5361,
"num_input_tokens_seen": 2741888,
"step": 220
},
{
"epoch": 0.08024251069900143,
"grad_norm": 1.2565455436706543,
"learning_rate": 4.99117894143603e-05,
"loss": 0.5128,
"num_input_tokens_seen": 2804736,
"step": 225
},
{
"epoch": 0.08202567760342368,
"grad_norm": 1.081152081489563,
"learning_rate": 4.990782782068639e-05,
"loss": 0.4925,
"num_input_tokens_seen": 2864768,
"step": 230
},
{
"epoch": 0.08380884450784594,
"grad_norm": 1.157086730003357,
"learning_rate": 4.9903779375586224e-05,
"loss": 0.5091,
"num_input_tokens_seen": 2925776,
"step": 235
},
{
"epoch": 0.08559201141226819,
"grad_norm": 1.496232032775879,
"learning_rate": 4.989964409317637e-05,
"loss": 0.5611,
"num_input_tokens_seen": 2984032,
"step": 240
},
{
"epoch": 0.08737517831669044,
"grad_norm": 1.5581008195877075,
"learning_rate": 4.989542198787619e-05,
"loss": 0.4574,
"num_input_tokens_seen": 3047024,
"step": 245
},
{
"epoch": 0.0891583452211127,
"grad_norm": 1.1673293113708496,
"learning_rate": 4.9891113074407816e-05,
"loss": 0.4982,
"num_input_tokens_seen": 3105552,
"step": 250
},
{
"epoch": 0.09094151212553495,
"grad_norm": 1.1178501844406128,
"learning_rate": 4.988671736779604e-05,
"loss": 0.5412,
"num_input_tokens_seen": 3165632,
"step": 255
},
{
"epoch": 0.09272467902995721,
"grad_norm": 1.1773957014083862,
"learning_rate": 4.988223488336832e-05,
"loss": 0.5028,
"num_input_tokens_seen": 3229392,
"step": 260
},
{
"epoch": 0.09450784593437946,
"grad_norm": 1.1285181045532227,
"learning_rate": 4.987766563675467e-05,
"loss": 0.5414,
"num_input_tokens_seen": 3287616,
"step": 265
},
{
"epoch": 0.09629101283880172,
"grad_norm": 1.630057454109192,
"learning_rate": 4.9873009643887666e-05,
"loss": 0.5512,
"num_input_tokens_seen": 3346496,
"step": 270
},
{
"epoch": 0.09807417974322397,
"grad_norm": 2.1637048721313477,
"learning_rate": 4.986826692100236e-05,
"loss": 0.4881,
"num_input_tokens_seen": 3409312,
"step": 275
},
{
"epoch": 0.09985734664764621,
"grad_norm": 1.9481849670410156,
"learning_rate": 4.98634374846362e-05,
"loss": 0.4716,
"num_input_tokens_seen": 3472752,
"step": 280
},
{
"epoch": 0.10164051355206848,
"grad_norm": 1.3725030422210693,
"learning_rate": 4.9858521351629005e-05,
"loss": 0.5286,
"num_input_tokens_seen": 3534032,
"step": 285
},
{
"epoch": 0.10342368045649072,
"grad_norm": 1.5664440393447876,
"learning_rate": 4.985351853912292e-05,
"loss": 0.4985,
"num_input_tokens_seen": 3598336,
"step": 290
},
{
"epoch": 0.10520684736091299,
"grad_norm": 1.3553557395935059,
"learning_rate": 4.984842906456231e-05,
"loss": 0.5768,
"num_input_tokens_seen": 3662144,
"step": 295
},
{
"epoch": 0.10699001426533523,
"grad_norm": 1.2202125787734985,
"learning_rate": 4.984325294569372e-05,
"loss": 0.4933,
"num_input_tokens_seen": 3724048,
"step": 300
},
{
"epoch": 0.10877318116975748,
"grad_norm": 1.0455083847045898,
"learning_rate": 4.9837990200565834e-05,
"loss": 0.5675,
"num_input_tokens_seen": 3784320,
"step": 305
},
{
"epoch": 0.11055634807417974,
"grad_norm": 1.5656300783157349,
"learning_rate": 4.983264084752939e-05,
"loss": 0.5315,
"num_input_tokens_seen": 3849040,
"step": 310
},
{
"epoch": 0.11233951497860199,
"grad_norm": 1.4153857231140137,
"learning_rate": 4.98272049052371e-05,
"loss": 0.5444,
"num_input_tokens_seen": 3909552,
"step": 315
},
{
"epoch": 0.11412268188302425,
"grad_norm": 1.9048830270767212,
"learning_rate": 4.982168239264364e-05,
"loss": 0.4808,
"num_input_tokens_seen": 3969120,
"step": 320
},
{
"epoch": 0.1159058487874465,
"grad_norm": 1.0821411609649658,
"learning_rate": 4.981607332900552e-05,
"loss": 0.4829,
"num_input_tokens_seen": 4029360,
"step": 325
},
{
"epoch": 0.11768901569186876,
"grad_norm": 1.2863287925720215,
"learning_rate": 4.9810377733881065e-05,
"loss": 0.5273,
"num_input_tokens_seen": 4091296,
"step": 330
},
{
"epoch": 0.11947218259629101,
"grad_norm": 1.3957486152648926,
"learning_rate": 4.98045956271303e-05,
"loss": 0.5443,
"num_input_tokens_seen": 4154304,
"step": 335
},
{
"epoch": 0.12125534950071326,
"grad_norm": 1.1562933921813965,
"learning_rate": 4.979872702891495e-05,
"loss": 0.5046,
"num_input_tokens_seen": 4220400,
"step": 340
},
{
"epoch": 0.12303851640513552,
"grad_norm": 1.1498775482177734,
"learning_rate": 4.979277195969829e-05,
"loss": 0.5393,
"num_input_tokens_seen": 4279408,
"step": 345
},
{
"epoch": 0.12482168330955777,
"grad_norm": 1.2570199966430664,
"learning_rate": 4.978673044024514e-05,
"loss": 0.451,
"num_input_tokens_seen": 4339392,
"step": 350
},
{
"epoch": 0.12660485021398002,
"grad_norm": 1.3947458267211914,
"learning_rate": 4.978060249162175e-05,
"loss": 0.5715,
"num_input_tokens_seen": 4399424,
"step": 355
},
{
"epoch": 0.12838801711840228,
"grad_norm": 1.1799883842468262,
"learning_rate": 4.977438813519574e-05,
"loss": 0.5409,
"num_input_tokens_seen": 4460992,
"step": 360
},
{
"epoch": 0.13017118402282454,
"grad_norm": 0.9736462831497192,
"learning_rate": 4.976808739263602e-05,
"loss": 0.5298,
"num_input_tokens_seen": 4525664,
"step": 365
},
{
"epoch": 0.13195435092724678,
"grad_norm": 1.1682716608047485,
"learning_rate": 4.976170028591274e-05,
"loss": 0.481,
"num_input_tokens_seen": 4582160,
"step": 370
},
{
"epoch": 0.13373751783166904,
"grad_norm": 1.3871419429779053,
"learning_rate": 4.975522683729719e-05,
"loss": 0.5021,
"num_input_tokens_seen": 4649328,
"step": 375
},
{
"epoch": 0.1355206847360913,
"grad_norm": 1.1554944515228271,
"learning_rate": 4.9748667069361715e-05,
"loss": 0.5064,
"num_input_tokens_seen": 4711088,
"step": 380
},
{
"epoch": 0.13730385164051356,
"grad_norm": 1.3844372034072876,
"learning_rate": 4.9742021004979656e-05,
"loss": 0.5516,
"num_input_tokens_seen": 4774864,
"step": 385
},
{
"epoch": 0.1390870185449358,
"grad_norm": 1.4874283075332642,
"learning_rate": 4.9735288667325257e-05,
"loss": 0.4712,
"num_input_tokens_seen": 4834944,
"step": 390
},
{
"epoch": 0.14087018544935806,
"grad_norm": 1.195500373840332,
"learning_rate": 4.97284700798736e-05,
"loss": 0.5326,
"num_input_tokens_seen": 4897264,
"step": 395
},
{
"epoch": 0.14265335235378032,
"grad_norm": 1.1240135431289673,
"learning_rate": 4.97215652664005e-05,
"loss": 0.5958,
"num_input_tokens_seen": 4962208,
"step": 400
},
{
"epoch": 0.14443651925820256,
"grad_norm": 0.8974002599716187,
"learning_rate": 4.971457425098244e-05,
"loss": 0.5536,
"num_input_tokens_seen": 5027264,
"step": 405
},
{
"epoch": 0.14621968616262482,
"grad_norm": 1.0974167585372925,
"learning_rate": 4.970749705799649e-05,
"loss": 0.4721,
"num_input_tokens_seen": 5093216,
"step": 410
},
{
"epoch": 0.14800285306704708,
"grad_norm": 1.3087302446365356,
"learning_rate": 4.9700333712120195e-05,
"loss": 0.4383,
"num_input_tokens_seen": 5155296,
"step": 415
},
{
"epoch": 0.14978601997146934,
"grad_norm": 5.880493640899658,
"learning_rate": 4.969308423833152e-05,
"loss": 0.5098,
"num_input_tokens_seen": 5216416,
"step": 420
},
{
"epoch": 0.15156918687589158,
"grad_norm": 1.2446019649505615,
"learning_rate": 4.9685748661908756e-05,
"loss": 0.494,
"num_input_tokens_seen": 5278816,
"step": 425
},
{
"epoch": 0.15335235378031384,
"grad_norm": 1.1921520233154297,
"learning_rate": 4.967832700843041e-05,
"loss": 0.5728,
"num_input_tokens_seen": 5344896,
"step": 430
},
{
"epoch": 0.1551355206847361,
"grad_norm": 1.161622166633606,
"learning_rate": 4.967081930377515e-05,
"loss": 0.5036,
"num_input_tokens_seen": 5400960,
"step": 435
},
{
"epoch": 0.15691868758915833,
"grad_norm": 1.0513135194778442,
"learning_rate": 4.966322557412168e-05,
"loss": 0.4347,
"num_input_tokens_seen": 5462928,
"step": 440
},
{
"epoch": 0.1587018544935806,
"grad_norm": 1.2251578569412231,
"learning_rate": 4.965554584594868e-05,
"loss": 0.4997,
"num_input_tokens_seen": 5525296,
"step": 445
},
{
"epoch": 0.16048502139800286,
"grad_norm": 1.2554380893707275,
"learning_rate": 4.9647780146034695e-05,
"loss": 0.511,
"num_input_tokens_seen": 5590640,
"step": 450
},
{
"epoch": 0.16226818830242512,
"grad_norm": 2.3998403549194336,
"learning_rate": 4.9639928501458035e-05,
"loss": 0.5376,
"num_input_tokens_seen": 5652912,
"step": 455
},
{
"epoch": 0.16405135520684735,
"grad_norm": 1.3643852472305298,
"learning_rate": 4.963199093959671e-05,
"loss": 0.5668,
"num_input_tokens_seen": 5711952,
"step": 460
},
{
"epoch": 0.16583452211126962,
"grad_norm": 1.4717122316360474,
"learning_rate": 4.96239674881283e-05,
"loss": 0.4877,
"num_input_tokens_seen": 5773968,
"step": 465
},
{
"epoch": 0.16761768901569188,
"grad_norm": 1.8179185390472412,
"learning_rate": 4.9615858175029884e-05,
"loss": 0.4669,
"num_input_tokens_seen": 5836064,
"step": 470
},
{
"epoch": 0.1694008559201141,
"grad_norm": 2.963438034057617,
"learning_rate": 4.960766302857793e-05,
"loss": 0.4766,
"num_input_tokens_seen": 5897600,
"step": 475
},
{
"epoch": 0.17118402282453637,
"grad_norm": 2.9000422954559326,
"learning_rate": 4.9599382077348205e-05,
"loss": 0.542,
"num_input_tokens_seen": 5959856,
"step": 480
},
{
"epoch": 0.17296718972895864,
"grad_norm": 1.1453759670257568,
"learning_rate": 4.959101535021566e-05,
"loss": 0.5482,
"num_input_tokens_seen": 6016128,
"step": 485
},
{
"epoch": 0.17475035663338087,
"grad_norm": 1.1614904403686523,
"learning_rate": 4.9582562876354346e-05,
"loss": 0.5361,
"num_input_tokens_seen": 6079664,
"step": 490
},
{
"epoch": 0.17653352353780313,
"grad_norm": 1.3136591911315918,
"learning_rate": 4.95740246852373e-05,
"loss": 0.5131,
"num_input_tokens_seen": 6137568,
"step": 495
},
{
"epoch": 0.1783166904422254,
"grad_norm": 1.0961729288101196,
"learning_rate": 4.9565400806636447e-05,
"loss": 0.431,
"num_input_tokens_seen": 6199280,
"step": 500
},
{
"epoch": 0.18009985734664766,
"grad_norm": 1.3530110120773315,
"learning_rate": 4.9556691270622515e-05,
"loss": 0.526,
"num_input_tokens_seen": 6262272,
"step": 505
},
{
"epoch": 0.1818830242510699,
"grad_norm": 1.2133769989013672,
"learning_rate": 4.9547896107564886e-05,
"loss": 0.5082,
"num_input_tokens_seen": 6324144,
"step": 510
},
{
"epoch": 0.18366619115549215,
"grad_norm": 1.2528913021087646,
"learning_rate": 4.9539015348131526e-05,
"loss": 0.5343,
"num_input_tokens_seen": 6386096,
"step": 515
},
{
"epoch": 0.18544935805991442,
"grad_norm": 1.4908058643341064,
"learning_rate": 4.953004902328887e-05,
"loss": 0.5408,
"num_input_tokens_seen": 6450704,
"step": 520
},
{
"epoch": 0.18723252496433665,
"grad_norm": 1.0931016206741333,
"learning_rate": 4.9520997164301726e-05,
"loss": 0.53,
"num_input_tokens_seen": 6512512,
"step": 525
},
{
"epoch": 0.1890156918687589,
"grad_norm": 1.317772626876831,
"learning_rate": 4.951185980273312e-05,
"loss": 0.4741,
"num_input_tokens_seen": 6572848,
"step": 530
},
{
"epoch": 0.19079885877318117,
"grad_norm": 1.114240288734436,
"learning_rate": 4.9502636970444246e-05,
"loss": 0.5021,
"num_input_tokens_seen": 6634064,
"step": 535
},
{
"epoch": 0.19258202567760344,
"grad_norm": 1.1686744689941406,
"learning_rate": 4.949332869959432e-05,
"loss": 0.5557,
"num_input_tokens_seen": 6698560,
"step": 540
},
{
"epoch": 0.19436519258202567,
"grad_norm": 1.2107973098754883,
"learning_rate": 4.948393502264046e-05,
"loss": 0.5101,
"num_input_tokens_seen": 6758000,
"step": 545
},
{
"epoch": 0.19614835948644793,
"grad_norm": 1.067867398262024,
"learning_rate": 4.9474455972337607e-05,
"loss": 0.4712,
"num_input_tokens_seen": 6823616,
"step": 550
},
{
"epoch": 0.1979315263908702,
"grad_norm": 1.0068106651306152,
"learning_rate": 4.946489158173838e-05,
"loss": 0.4854,
"num_input_tokens_seen": 6883376,
"step": 555
},
{
"epoch": 0.19971469329529243,
"grad_norm": 1.490473747253418,
"learning_rate": 4.945524188419298e-05,
"loss": 0.5664,
"num_input_tokens_seen": 6943808,
"step": 560
},
{
"epoch": 0.2014978601997147,
"grad_norm": 1.0813665390014648,
"learning_rate": 4.9445506913349063e-05,
"loss": 0.6241,
"num_input_tokens_seen": 7005728,
"step": 565
},
{
"epoch": 0.20328102710413695,
"grad_norm": 1.3641761541366577,
"learning_rate": 4.943568670315162e-05,
"loss": 0.4916,
"num_input_tokens_seen": 7068608,
"step": 570
},
{
"epoch": 0.20506419400855921,
"grad_norm": 1.0902137756347656,
"learning_rate": 4.942578128784287e-05,
"loss": 0.4833,
"num_input_tokens_seen": 7127008,
"step": 575
},
{
"epoch": 0.20684736091298145,
"grad_norm": 1.430445909500122,
"learning_rate": 4.941579070196214e-05,
"loss": 0.422,
"num_input_tokens_seen": 7191776,
"step": 580
},
{
"epoch": 0.2086305278174037,
"grad_norm": 1.6088680028915405,
"learning_rate": 4.940571498034572e-05,
"loss": 0.4913,
"num_input_tokens_seen": 7251536,
"step": 585
},
{
"epoch": 0.21041369472182597,
"grad_norm": 1.3081697225570679,
"learning_rate": 4.939555415812678e-05,
"loss": 0.451,
"num_input_tokens_seen": 7315696,
"step": 590
},
{
"epoch": 0.2121968616262482,
"grad_norm": 1.3625929355621338,
"learning_rate": 4.938530827073522e-05,
"loss": 0.5694,
"num_input_tokens_seen": 7373792,
"step": 595
},
{
"epoch": 0.21398002853067047,
"grad_norm": 1.1833407878875732,
"learning_rate": 4.9374977353897566e-05,
"loss": 0.5647,
"num_input_tokens_seen": 7434464,
"step": 600
},
{
"epoch": 0.21576319543509273,
"grad_norm": 1.3193016052246094,
"learning_rate": 4.936456144363681e-05,
"loss": 0.5739,
"num_input_tokens_seen": 7497328,
"step": 605
},
{
"epoch": 0.21754636233951496,
"grad_norm": 1.4671732187271118,
"learning_rate": 4.935406057627234e-05,
"loss": 0.5399,
"num_input_tokens_seen": 7560816,
"step": 610
},
{
"epoch": 0.21932952924393723,
"grad_norm": 1.0455771684646606,
"learning_rate": 4.9343474788419767e-05,
"loss": 0.4423,
"num_input_tokens_seen": 7623280,
"step": 615
},
{
"epoch": 0.2211126961483595,
"grad_norm": 1.2360905408859253,
"learning_rate": 4.9332804116990795e-05,
"loss": 0.4595,
"num_input_tokens_seen": 7685264,
"step": 620
},
{
"epoch": 0.22289586305278175,
"grad_norm": 1.3082692623138428,
"learning_rate": 4.9322048599193124e-05,
"loss": 0.5022,
"num_input_tokens_seen": 7748000,
"step": 625
},
{
"epoch": 0.22467902995720399,
"grad_norm": 1.306279182434082,
"learning_rate": 4.931120827253033e-05,
"loss": 0.4287,
"num_input_tokens_seen": 7812992,
"step": 630
},
{
"epoch": 0.22646219686162625,
"grad_norm": 1.3158313035964966,
"learning_rate": 4.930028317480167e-05,
"loss": 0.4895,
"num_input_tokens_seen": 7876416,
"step": 635
},
{
"epoch": 0.2282453637660485,
"grad_norm": 1.1636604070663452,
"learning_rate": 4.9289273344102014e-05,
"loss": 0.4975,
"num_input_tokens_seen": 7940544,
"step": 640
},
{
"epoch": 0.23002853067047074,
"grad_norm": 1.23000168800354,
"learning_rate": 4.927817881882169e-05,
"loss": 0.4295,
"num_input_tokens_seen": 7999472,
"step": 645
},
{
"epoch": 0.231811697574893,
"grad_norm": 1.54082453250885,
"learning_rate": 4.9266999637646326e-05,
"loss": 0.5753,
"num_input_tokens_seen": 8061168,
"step": 650
},
{
"epoch": 0.23359486447931527,
"grad_norm": 2.485759973526001,
"learning_rate": 4.925573583955676e-05,
"loss": 0.443,
"num_input_tokens_seen": 8118944,
"step": 655
},
{
"epoch": 0.23537803138373753,
"grad_norm": 1.284912347793579,
"learning_rate": 4.9244387463828876e-05,
"loss": 0.5421,
"num_input_tokens_seen": 8185072,
"step": 660
},
{
"epoch": 0.23716119828815976,
"grad_norm": 3.4624996185302734,
"learning_rate": 4.9232954550033484e-05,
"loss": 0.4099,
"num_input_tokens_seen": 8247616,
"step": 665
},
{
"epoch": 0.23894436519258203,
"grad_norm": 1.1022762060165405,
"learning_rate": 4.922143713803613e-05,
"loss": 0.4784,
"num_input_tokens_seen": 8312240,
"step": 670
},
{
"epoch": 0.2407275320970043,
"grad_norm": 1.1634345054626465,
"learning_rate": 4.920983526799705e-05,
"loss": 0.3882,
"num_input_tokens_seen": 8371088,
"step": 675
},
{
"epoch": 0.24251069900142652,
"grad_norm": 1.4921728372573853,
"learning_rate": 4.919814898037095e-05,
"loss": 0.5662,
"num_input_tokens_seen": 8435264,
"step": 680
},
{
"epoch": 0.24429386590584878,
"grad_norm": 1.2474942207336426,
"learning_rate": 4.918637831590689e-05,
"loss": 0.4169,
"num_input_tokens_seen": 8498960,
"step": 685
},
{
"epoch": 0.24607703281027105,
"grad_norm": 0.9692139625549316,
"learning_rate": 4.917452331564816e-05,
"loss": 0.4681,
"num_input_tokens_seen": 8561168,
"step": 690
},
{
"epoch": 0.2478601997146933,
"grad_norm": 1.57968008518219,
"learning_rate": 4.9162584020932114e-05,
"loss": 0.4668,
"num_input_tokens_seen": 8624528,
"step": 695
},
{
"epoch": 0.24964336661911554,
"grad_norm": 1.7983195781707764,
"learning_rate": 4.915056047339002e-05,
"loss": 0.5366,
"num_input_tokens_seen": 8684608,
"step": 700
},
{
"epoch": 0.2514265335235378,
"grad_norm": 1.3157538175582886,
"learning_rate": 4.913845271494695e-05,
"loss": 0.4451,
"num_input_tokens_seen": 8747216,
"step": 705
},
{
"epoch": 0.25320970042796004,
"grad_norm": 1.193604588508606,
"learning_rate": 4.91262607878216e-05,
"loss": 0.5626,
"num_input_tokens_seen": 8807392,
"step": 710
},
{
"epoch": 0.2549928673323823,
"grad_norm": 1.0445785522460938,
"learning_rate": 4.911398473452616e-05,
"loss": 0.4848,
"num_input_tokens_seen": 8868496,
"step": 715
},
{
"epoch": 0.25677603423680456,
"grad_norm": 1.8069995641708374,
"learning_rate": 4.910162459786617e-05,
"loss": 0.4672,
"num_input_tokens_seen": 8929056,
"step": 720
},
{
"epoch": 0.2585592011412268,
"grad_norm": 1.1339744329452515,
"learning_rate": 4.908918042094033e-05,
"loss": 0.399,
"num_input_tokens_seen": 8991968,
"step": 725
},
{
"epoch": 0.2603423680456491,
"grad_norm": 1.2230961322784424,
"learning_rate": 4.907665224714042e-05,
"loss": 0.5477,
"num_input_tokens_seen": 9053408,
"step": 730
},
{
"epoch": 0.26212553495007135,
"grad_norm": 1.2331055402755737,
"learning_rate": 4.906404012015108e-05,
"loss": 0.4485,
"num_input_tokens_seen": 9115920,
"step": 735
},
{
"epoch": 0.26390870185449355,
"grad_norm": 1.8696657419204712,
"learning_rate": 4.905134408394969e-05,
"loss": 0.4714,
"num_input_tokens_seen": 9184576,
"step": 740
},
{
"epoch": 0.2656918687589158,
"grad_norm": 1.9693909883499146,
"learning_rate": 4.9038564182806234e-05,
"loss": 0.516,
"num_input_tokens_seen": 9247872,
"step": 745
},
{
"epoch": 0.2674750356633381,
"grad_norm": 1.0184056758880615,
"learning_rate": 4.902570046128312e-05,
"loss": 0.4914,
"num_input_tokens_seen": 9310976,
"step": 750
},
{
"epoch": 0.26925820256776034,
"grad_norm": 1.165300726890564,
"learning_rate": 4.9012752964235014e-05,
"loss": 0.4695,
"num_input_tokens_seen": 9372016,
"step": 755
},
{
"epoch": 0.2710413694721826,
"grad_norm": 1.0303696393966675,
"learning_rate": 4.8999721736808714e-05,
"loss": 0.4741,
"num_input_tokens_seen": 9432624,
"step": 760
},
{
"epoch": 0.27282453637660486,
"grad_norm": 1.2935962677001953,
"learning_rate": 4.898660682444297e-05,
"loss": 0.5044,
"num_input_tokens_seen": 9493360,
"step": 765
},
{
"epoch": 0.2746077032810271,
"grad_norm": 1.3259665966033936,
"learning_rate": 4.8973408272868347e-05,
"loss": 0.4618,
"num_input_tokens_seen": 9555136,
"step": 770
},
{
"epoch": 0.27639087018544933,
"grad_norm": 4.303719520568848,
"learning_rate": 4.896012612810704e-05,
"loss": 0.3954,
"num_input_tokens_seen": 9616896,
"step": 775
},
{
"epoch": 0.2781740370898716,
"grad_norm": 1.2892228364944458,
"learning_rate": 4.894676043647274e-05,
"loss": 0.3872,
"num_input_tokens_seen": 9674752,
"step": 780
},
{
"epoch": 0.27995720399429386,
"grad_norm": 1.360479474067688,
"learning_rate": 4.8933311244570434e-05,
"loss": 0.4713,
"num_input_tokens_seen": 9736976,
"step": 785
},
{
"epoch": 0.2817403708987161,
"grad_norm": 1.3140631914138794,
"learning_rate": 4.8919778599296293e-05,
"loss": 0.3917,
"num_input_tokens_seen": 9797136,
"step": 790
},
{
"epoch": 0.2835235378031384,
"grad_norm": 1.0723479986190796,
"learning_rate": 4.890616254783748e-05,
"loss": 0.4911,
"num_input_tokens_seen": 9858928,
"step": 795
},
{
"epoch": 0.28530670470756064,
"grad_norm": 1.4321138858795166,
"learning_rate": 4.8892463137671963e-05,
"loss": 0.4682,
"num_input_tokens_seen": 9917776,
"step": 800
},
{
"epoch": 0.2870898716119829,
"grad_norm": 1.2900583744049072,
"learning_rate": 4.887868041656839e-05,
"loss": 0.4978,
"num_input_tokens_seen": 9982464,
"step": 805
},
{
"epoch": 0.2888730385164051,
"grad_norm": 1.1396691799163818,
"learning_rate": 4.886481443258594e-05,
"loss": 0.4178,
"num_input_tokens_seen": 10044208,
"step": 810
},
{
"epoch": 0.2906562054208274,
"grad_norm": 1.40047025680542,
"learning_rate": 4.885086523407405e-05,
"loss": 0.455,
"num_input_tokens_seen": 10105968,
"step": 815
},
{
"epoch": 0.29243937232524964,
"grad_norm": 1.263271689414978,
"learning_rate": 4.88368328696724e-05,
"loss": 0.4933,
"num_input_tokens_seen": 10166992,
"step": 820
},
{
"epoch": 0.2942225392296719,
"grad_norm": 1.3891979455947876,
"learning_rate": 4.882271738831059e-05,
"loss": 0.5043,
"num_input_tokens_seen": 10232144,
"step": 825
},
{
"epoch": 0.29600570613409416,
"grad_norm": 1.4529082775115967,
"learning_rate": 4.880851883920809e-05,
"loss": 0.5188,
"num_input_tokens_seen": 10292944,
"step": 830
},
{
"epoch": 0.2977888730385164,
"grad_norm": 1.1358407735824585,
"learning_rate": 4.879423727187401e-05,
"loss": 0.5159,
"num_input_tokens_seen": 10354256,
"step": 835
},
{
"epoch": 0.2995720399429387,
"grad_norm": 1.7224394083023071,
"learning_rate": 4.8779872736106916e-05,
"loss": 0.5063,
"num_input_tokens_seen": 10416688,
"step": 840
},
{
"epoch": 0.3013552068473609,
"grad_norm": 1.557830810546875,
"learning_rate": 4.8765425281994704e-05,
"loss": 0.44,
"num_input_tokens_seen": 10477712,
"step": 845
},
{
"epoch": 0.30313837375178315,
"grad_norm": 1.207537293434143,
"learning_rate": 4.8750894959914377e-05,
"loss": 0.457,
"num_input_tokens_seen": 10539120,
"step": 850
},
{
"epoch": 0.3049215406562054,
"grad_norm": 1.2876931428909302,
"learning_rate": 4.873628182053191e-05,
"loss": 0.4583,
"num_input_tokens_seen": 10602400,
"step": 855
},
{
"epoch": 0.3067047075606277,
"grad_norm": 1.5556424856185913,
"learning_rate": 4.872158591480206e-05,
"loss": 0.4462,
"num_input_tokens_seen": 10665920,
"step": 860
},
{
"epoch": 0.30848787446504994,
"grad_norm": 1.2405084371566772,
"learning_rate": 4.870680729396815e-05,
"loss": 0.4229,
"num_input_tokens_seen": 10732768,
"step": 865
},
{
"epoch": 0.3102710413694722,
"grad_norm": 1.3671534061431885,
"learning_rate": 4.869194600956195e-05,
"loss": 0.5017,
"num_input_tokens_seen": 10794368,
"step": 870
},
{
"epoch": 0.31205420827389446,
"grad_norm": 1.0638670921325684,
"learning_rate": 4.867700211340347e-05,
"loss": 0.4751,
"num_input_tokens_seen": 10853408,
"step": 875
},
{
"epoch": 0.31383737517831667,
"grad_norm": 1.2563133239746094,
"learning_rate": 4.8661975657600765e-05,
"loss": 0.4873,
"num_input_tokens_seen": 10918576,
"step": 880
},
{
"epoch": 0.31562054208273893,
"grad_norm": 1.0638364553451538,
"learning_rate": 4.8646866694549795e-05,
"loss": 0.4572,
"num_input_tokens_seen": 10980976,
"step": 885
},
{
"epoch": 0.3174037089871612,
"grad_norm": 1.3460172414779663,
"learning_rate": 4.863167527693417e-05,
"loss": 0.4758,
"num_input_tokens_seen": 11040448,
"step": 890
},
{
"epoch": 0.31918687589158345,
"grad_norm": 1.210242509841919,
"learning_rate": 4.861640145772507e-05,
"loss": 0.5092,
"num_input_tokens_seen": 11104160,
"step": 895
},
{
"epoch": 0.3209700427960057,
"grad_norm": 1.0002316236495972,
"learning_rate": 4.8601045290180946e-05,
"loss": 0.4447,
"num_input_tokens_seen": 11164224,
"step": 900
},
{
"epoch": 0.322753209700428,
"grad_norm": 1.332479476928711,
"learning_rate": 4.858560682784744e-05,
"loss": 0.4335,
"num_input_tokens_seen": 11227376,
"step": 905
},
{
"epoch": 0.32453637660485024,
"grad_norm": 1.223310112953186,
"learning_rate": 4.8570086124557116e-05,
"loss": 0.4156,
"num_input_tokens_seen": 11284704,
"step": 910
},
{
"epoch": 0.32631954350927245,
"grad_norm": 1.439526915550232,
"learning_rate": 4.85544832344293e-05,
"loss": 0.431,
"num_input_tokens_seen": 11348448,
"step": 915
},
{
"epoch": 0.3281027104136947,
"grad_norm": 1.27132248878479,
"learning_rate": 4.853879821186993e-05,
"loss": 0.4941,
"num_input_tokens_seen": 11406160,
"step": 920
},
{
"epoch": 0.32988587731811697,
"grad_norm": 1.6706770658493042,
"learning_rate": 4.8523031111571316e-05,
"loss": 0.4718,
"num_input_tokens_seen": 11467088,
"step": 925
},
{
"epoch": 0.33166904422253923,
"grad_norm": 1.131922960281372,
"learning_rate": 4.850718198851195e-05,
"loss": 0.4172,
"num_input_tokens_seen": 11532768,
"step": 930
},
{
"epoch": 0.3334522111269615,
"grad_norm": 1.1946320533752441,
"learning_rate": 4.849125089795634e-05,
"loss": 0.3736,
"num_input_tokens_seen": 11591984,
"step": 935
},
{
"epoch": 0.33523537803138376,
"grad_norm": 1.2938627004623413,
"learning_rate": 4.8475237895454833e-05,
"loss": 0.462,
"num_input_tokens_seen": 11656624,
"step": 940
},
{
"epoch": 0.33701854493580596,
"grad_norm": 1.3345882892608643,
"learning_rate": 4.845914303684336e-05,
"loss": 0.4584,
"num_input_tokens_seen": 11718256,
"step": 945
},
{
"epoch": 0.3388017118402282,
"grad_norm": 0.8923389315605164,
"learning_rate": 4.844296637824329e-05,
"loss": 0.5339,
"num_input_tokens_seen": 11776080,
"step": 950
},
{
"epoch": 0.3405848787446505,
"grad_norm": 1.2611216306686401,
"learning_rate": 4.8426707976061226e-05,
"loss": 0.5625,
"num_input_tokens_seen": 11840768,
"step": 955
},
{
"epoch": 0.34236804564907275,
"grad_norm": 1.1760461330413818,
"learning_rate": 4.84103678869888e-05,
"loss": 0.5043,
"num_input_tokens_seen": 11904208,
"step": 960
},
{
"epoch": 0.344151212553495,
"grad_norm": 1.19206964969635,
"learning_rate": 4.8393946168002477e-05,
"loss": 0.4183,
"num_input_tokens_seen": 11967952,
"step": 965
},
{
"epoch": 0.3459343794579173,
"grad_norm": 1.1648989915847778,
"learning_rate": 4.8377442876363364e-05,
"loss": 0.4095,
"num_input_tokens_seen": 12033136,
"step": 970
},
{
"epoch": 0.34771754636233954,
"grad_norm": 1.2076241970062256,
"learning_rate": 4.8360858069617006e-05,
"loss": 0.4537,
"num_input_tokens_seen": 12097584,
"step": 975
},
{
"epoch": 0.34950071326676174,
"grad_norm": 1.0648747682571411,
"learning_rate": 4.834419180559317e-05,
"loss": 0.3932,
"num_input_tokens_seen": 12156320,
"step": 980
},
{
"epoch": 0.351283880171184,
"grad_norm": 1.228440523147583,
"learning_rate": 4.832744414240567e-05,
"loss": 0.4313,
"num_input_tokens_seen": 12218384,
"step": 985
},
{
"epoch": 0.35306704707560627,
"grad_norm": 1.4440958499908447,
"learning_rate": 4.8310615138452156e-05,
"loss": 0.4685,
"num_input_tokens_seen": 12281856,
"step": 990
},
{
"epoch": 0.35485021398002853,
"grad_norm": 1.0754982233047485,
"learning_rate": 4.829370485241388e-05,
"loss": 0.4623,
"num_input_tokens_seen": 12343904,
"step": 995
},
{
"epoch": 0.3566333808844508,
"grad_norm": 1.3745994567871094,
"learning_rate": 4.827671334325556e-05,
"loss": 0.4334,
"num_input_tokens_seen": 12402256,
"step": 1000
},
{
"epoch": 0.35841654778887305,
"grad_norm": 1.0649508237838745,
"learning_rate": 4.82596406702251e-05,
"loss": 0.4728,
"num_input_tokens_seen": 12465536,
"step": 1005
},
{
"epoch": 0.3601997146932953,
"grad_norm": 1.061046838760376,
"learning_rate": 4.8242486892853424e-05,
"loss": 0.421,
"num_input_tokens_seen": 12530464,
"step": 1010
},
{
"epoch": 0.3619828815977175,
"grad_norm": 1.7068672180175781,
"learning_rate": 4.822525207095425e-05,
"loss": 0.4843,
"num_input_tokens_seen": 12593216,
"step": 1015
},
{
"epoch": 0.3637660485021398,
"grad_norm": 1.2018966674804688,
"learning_rate": 4.820793626462391e-05,
"loss": 0.4604,
"num_input_tokens_seen": 12655248,
"step": 1020
},
{
"epoch": 0.36554921540656204,
"grad_norm": 1.2888667583465576,
"learning_rate": 4.819053953424112e-05,
"loss": 0.427,
"num_input_tokens_seen": 12718048,
"step": 1025
},
{
"epoch": 0.3673323823109843,
"grad_norm": 1.3183050155639648,
"learning_rate": 4.817306194046675e-05,
"loss": 0.4415,
"num_input_tokens_seen": 12781536,
"step": 1030
},
{
"epoch": 0.36911554921540657,
"grad_norm": 1.7154110670089722,
"learning_rate": 4.815550354424365e-05,
"loss": 0.5193,
"num_input_tokens_seen": 12844336,
"step": 1035
},
{
"epoch": 0.37089871611982883,
"grad_norm": 1.3131228685379028,
"learning_rate": 4.813786440679642e-05,
"loss": 0.4078,
"num_input_tokens_seen": 12906288,
"step": 1040
},
{
"epoch": 0.3726818830242511,
"grad_norm": 1.1588881015777588,
"learning_rate": 4.81201445896312e-05,
"loss": 0.3672,
"num_input_tokens_seen": 12965200,
"step": 1045
},
{
"epoch": 0.3744650499286733,
"grad_norm": 1.5113669633865356,
"learning_rate": 4.810234415453545e-05,
"loss": 0.4896,
"num_input_tokens_seen": 13033248,
"step": 1050
},
{
"epoch": 0.37624821683309556,
"grad_norm": 1.4646553993225098,
"learning_rate": 4.808446316357773e-05,
"loss": 0.4772,
"num_input_tokens_seen": 13096752,
"step": 1055
},
{
"epoch": 0.3780313837375178,
"grad_norm": 1.9652795791625977,
"learning_rate": 4.80665016791075e-05,
"loss": 0.4468,
"num_input_tokens_seen": 13158992,
"step": 1060
},
{
"epoch": 0.3798145506419401,
"grad_norm": 3.033592700958252,
"learning_rate": 4.804845976375489e-05,
"loss": 0.3997,
"num_input_tokens_seen": 13222064,
"step": 1065
},
{
"epoch": 0.38159771754636235,
"grad_norm": 1.2086786031723022,
"learning_rate": 4.8030337480430496e-05,
"loss": 0.4966,
"num_input_tokens_seen": 13286112,
"step": 1070
},
{
"epoch": 0.3833808844507846,
"grad_norm": 1.7219619750976562,
"learning_rate": 4.801213489232514e-05,
"loss": 0.4918,
"num_input_tokens_seen": 13346832,
"step": 1075
},
{
"epoch": 0.38516405135520687,
"grad_norm": 1.256044864654541,
"learning_rate": 4.799385206290965e-05,
"loss": 0.4734,
"num_input_tokens_seen": 13408992,
"step": 1080
},
{
"epoch": 0.3869472182596291,
"grad_norm": 1.150932788848877,
"learning_rate": 4.7975489055934666e-05,
"loss": 0.3703,
"num_input_tokens_seen": 13469280,
"step": 1085
},
{
"epoch": 0.38873038516405134,
"grad_norm": 1.4256497621536255,
"learning_rate": 4.79570459354304e-05,
"loss": 0.5076,
"num_input_tokens_seen": 13533536,
"step": 1090
},
{
"epoch": 0.3905135520684736,
"grad_norm": 1.1593137979507446,
"learning_rate": 4.79385227657064e-05,
"loss": 0.4351,
"num_input_tokens_seen": 13594304,
"step": 1095
},
{
"epoch": 0.39229671897289586,
"grad_norm": 0.9239110350608826,
"learning_rate": 4.791991961135135e-05,
"loss": 0.4984,
"num_input_tokens_seen": 13657328,
"step": 1100
},
{
"epoch": 0.3940798858773181,
"grad_norm": 0.999727189540863,
"learning_rate": 4.790123653723282e-05,
"loss": 0.4598,
"num_input_tokens_seen": 13720224,
"step": 1105
},
{
"epoch": 0.3958630527817404,
"grad_norm": 1.0658410787582397,
"learning_rate": 4.788247360849708e-05,
"loss": 0.4409,
"num_input_tokens_seen": 13782656,
"step": 1110
},
{
"epoch": 0.39764621968616265,
"grad_norm": 1.2038542032241821,
"learning_rate": 4.786363089056881e-05,
"loss": 0.4719,
"num_input_tokens_seen": 13849120,
"step": 1115
},
{
"epoch": 0.39942938659058486,
"grad_norm": 1.1782008409500122,
"learning_rate": 4.784470844915093e-05,
"loss": 0.4147,
"num_input_tokens_seen": 13910944,
"step": 1120
},
{
"epoch": 0.4012125534950071,
"grad_norm": 0.9827120304107666,
"learning_rate": 4.782570635022436e-05,
"loss": 0.3883,
"num_input_tokens_seen": 13969248,
"step": 1125
},
{
"epoch": 0.4029957203994294,
"grad_norm": 1.0276070833206177,
"learning_rate": 4.7806624660047744e-05,
"loss": 0.4337,
"num_input_tokens_seen": 14028112,
"step": 1130
},
{
"epoch": 0.40477888730385164,
"grad_norm": 1.923315167427063,
"learning_rate": 4.7787463445157286e-05,
"loss": 0.5135,
"num_input_tokens_seen": 14090320,
"step": 1135
},
{
"epoch": 0.4065620542082739,
"grad_norm": 1.3430618047714233,
"learning_rate": 4.7768222772366466e-05,
"loss": 0.5111,
"num_input_tokens_seen": 14151840,
"step": 1140
},
{
"epoch": 0.40834522111269617,
"grad_norm": 1.5225883722305298,
"learning_rate": 4.774890270876584e-05,
"loss": 0.5005,
"num_input_tokens_seen": 14213824,
"step": 1145
},
{
"epoch": 0.41012838801711843,
"grad_norm": 1.0013866424560547,
"learning_rate": 4.772950332172279e-05,
"loss": 0.6018,
"num_input_tokens_seen": 14278736,
"step": 1150
},
{
"epoch": 0.41191155492154063,
"grad_norm": 1.0078413486480713,
"learning_rate": 4.771002467888128e-05,
"loss": 0.3879,
"num_input_tokens_seen": 14339408,
"step": 1155
},
{
"epoch": 0.4136947218259629,
"grad_norm": 1.1650017499923706,
"learning_rate": 4.769046684816165e-05,
"loss": 0.4924,
"num_input_tokens_seen": 14399008,
"step": 1160
},
{
"epoch": 0.41547788873038516,
"grad_norm": 1.351217269897461,
"learning_rate": 4.767082989776034e-05,
"loss": 0.4104,
"num_input_tokens_seen": 14462656,
"step": 1165
},
{
"epoch": 0.4172610556348074,
"grad_norm": 1.3392795324325562,
"learning_rate": 4.76511138961497e-05,
"loss": 0.4629,
"num_input_tokens_seen": 14527568,
"step": 1170
},
{
"epoch": 0.4190442225392297,
"grad_norm": 1.3544095754623413,
"learning_rate": 4.763131891207771e-05,
"loss": 0.486,
"num_input_tokens_seen": 14590944,
"step": 1175
},
{
"epoch": 0.42082738944365194,
"grad_norm": 1.1842771768569946,
"learning_rate": 4.761144501456773e-05,
"loss": 0.4529,
"num_input_tokens_seen": 14651104,
"step": 1180
},
{
"epoch": 0.4226105563480742,
"grad_norm": 0.9588406085968018,
"learning_rate": 4.7591492272918344e-05,
"loss": 0.3739,
"num_input_tokens_seen": 14711344,
"step": 1185
},
{
"epoch": 0.4243937232524964,
"grad_norm": 1.1637108325958252,
"learning_rate": 4.7571460756703e-05,
"loss": 0.4772,
"num_input_tokens_seen": 14772656,
"step": 1190
},
{
"epoch": 0.4261768901569187,
"grad_norm": 1.225539207458496,
"learning_rate": 4.755135053576987e-05,
"loss": 0.4606,
"num_input_tokens_seen": 14833840,
"step": 1195
},
{
"epoch": 0.42796005706134094,
"grad_norm": 1.4021525382995605,
"learning_rate": 4.753116168024153e-05,
"loss": 0.4168,
"num_input_tokens_seen": 14896544,
"step": 1200
},
{
"epoch": 0.4297432239657632,
"grad_norm": 1.615051507949829,
"learning_rate": 4.751089426051476e-05,
"loss": 0.4156,
"num_input_tokens_seen": 14956432,
"step": 1205
},
{
"epoch": 0.43152639087018546,
"grad_norm": 3.979645252227783,
"learning_rate": 4.749054834726029e-05,
"loss": 0.5188,
"num_input_tokens_seen": 15021296,
"step": 1210
},
{
"epoch": 0.4333095577746077,
"grad_norm": 1.3276335000991821,
"learning_rate": 4.7470124011422555e-05,
"loss": 0.4941,
"num_input_tokens_seen": 15080688,
"step": 1215
},
{
"epoch": 0.43509272467902993,
"grad_norm": 1.2513278722763062,
"learning_rate": 4.744962132421943e-05,
"loss": 0.4719,
"num_input_tokens_seen": 15141456,
"step": 1220
},
{
"epoch": 0.4368758915834522,
"grad_norm": 1.1449891328811646,
"learning_rate": 4.742904035714199e-05,
"loss": 0.4811,
"num_input_tokens_seen": 15202768,
"step": 1225
},
{
"epoch": 0.43865905848787445,
"grad_norm": 1.0668220520019531,
"learning_rate": 4.7408381181954284e-05,
"loss": 0.4801,
"num_input_tokens_seen": 15266416,
"step": 1230
},
{
"epoch": 0.4404422253922967,
"grad_norm": 1.576777458190918,
"learning_rate": 4.7387643870693055e-05,
"loss": 0.4551,
"num_input_tokens_seen": 15328416,
"step": 1235
},
{
"epoch": 0.442225392296719,
"grad_norm": 1.0677021741867065,
"learning_rate": 4.736682849566751e-05,
"loss": 0.3682,
"num_input_tokens_seen": 15387392,
"step": 1240
},
{
"epoch": 0.44400855920114124,
"grad_norm": 1.105083703994751,
"learning_rate": 4.734593512945904e-05,
"loss": 0.4721,
"num_input_tokens_seen": 15444928,
"step": 1245
},
{
"epoch": 0.4457917261055635,
"grad_norm": 1.1016100645065308,
"learning_rate": 4.7324963844920986e-05,
"loss": 0.4568,
"num_input_tokens_seen": 15505488,
"step": 1250
},
{
"epoch": 0.4475748930099857,
"grad_norm": 1.4010059833526611,
"learning_rate": 4.7303914715178396e-05,
"loss": 0.5337,
"num_input_tokens_seen": 15566336,
"step": 1255
},
{
"epoch": 0.44935805991440797,
"grad_norm": 1.149238109588623,
"learning_rate": 4.728278781362777e-05,
"loss": 0.3965,
"num_input_tokens_seen": 15632768,
"step": 1260
},
{
"epoch": 0.45114122681883023,
"grad_norm": 1.4296883344650269,
"learning_rate": 4.7261583213936746e-05,
"loss": 0.5366,
"num_input_tokens_seen": 15694944,
"step": 1265
},
{
"epoch": 0.4529243937232525,
"grad_norm": 1.2786849737167358,
"learning_rate": 4.7240300990043926e-05,
"loss": 0.4339,
"num_input_tokens_seen": 15756496,
"step": 1270
},
{
"epoch": 0.45470756062767476,
"grad_norm": 1.1299382448196411,
"learning_rate": 4.721894121615859e-05,
"loss": 0.4866,
"num_input_tokens_seen": 15821200,
"step": 1275
},
{
"epoch": 0.456490727532097,
"grad_norm": 1.1465532779693604,
"learning_rate": 4.7197503966760375e-05,
"loss": 0.4288,
"num_input_tokens_seen": 15882736,
"step": 1280
},
{
"epoch": 0.4582738944365193,
"grad_norm": 1.4677292108535767,
"learning_rate": 4.717598931659913e-05,
"loss": 0.443,
"num_input_tokens_seen": 15944560,
"step": 1285
},
{
"epoch": 0.4600570613409415,
"grad_norm": 1.8437912464141846,
"learning_rate": 4.7154397340694556e-05,
"loss": 0.4923,
"num_input_tokens_seen": 16006784,
"step": 1290
},
{
"epoch": 0.46184022824536375,
"grad_norm": 1.5408210754394531,
"learning_rate": 4.713272811433599e-05,
"loss": 0.4868,
"num_input_tokens_seen": 16068896,
"step": 1295
},
{
"epoch": 0.463623395149786,
"grad_norm": 1.1977325677871704,
"learning_rate": 4.711098171308214e-05,
"loss": 0.4781,
"num_input_tokens_seen": 16128640,
"step": 1300
},
{
"epoch": 0.4654065620542083,
"grad_norm": 1.470975399017334,
"learning_rate": 4.708915821276082e-05,
"loss": 0.4748,
"num_input_tokens_seen": 16192800,
"step": 1305
},
{
"epoch": 0.46718972895863053,
"grad_norm": 1.460138201713562,
"learning_rate": 4.706725768946866e-05,
"loss": 0.5107,
"num_input_tokens_seen": 16251248,
"step": 1310
},
{
"epoch": 0.4689728958630528,
"grad_norm": 1.2103915214538574,
"learning_rate": 4.7045280219570896e-05,
"loss": 0.4768,
"num_input_tokens_seen": 16314704,
"step": 1315
},
{
"epoch": 0.47075606276747506,
"grad_norm": 1.1669901609420776,
"learning_rate": 4.702322587970104e-05,
"loss": 0.4624,
"num_input_tokens_seen": 16375792,
"step": 1320
},
{
"epoch": 0.47253922967189727,
"grad_norm": 1.1790727376937866,
"learning_rate": 4.700109474676064e-05,
"loss": 0.4735,
"num_input_tokens_seen": 16438672,
"step": 1325
},
{
"epoch": 0.4743223965763195,
"grad_norm": 1.019875407218933,
"learning_rate": 4.697888689791906e-05,
"loss": 0.3809,
"num_input_tokens_seen": 16498896,
"step": 1330
},
{
"epoch": 0.4761055634807418,
"grad_norm": 1.2999383211135864,
"learning_rate": 4.6956602410613115e-05,
"loss": 0.4421,
"num_input_tokens_seen": 16566736,
"step": 1335
},
{
"epoch": 0.47788873038516405,
"grad_norm": 1.4289456605911255,
"learning_rate": 4.6934241362546874e-05,
"loss": 0.5083,
"num_input_tokens_seen": 16630480,
"step": 1340
},
{
"epoch": 0.4796718972895863,
"grad_norm": 1.2647002935409546,
"learning_rate": 4.691180383169137e-05,
"loss": 0.5118,
"num_input_tokens_seen": 16688832,
"step": 1345
},
{
"epoch": 0.4814550641940086,
"grad_norm": 1.1783503293991089,
"learning_rate": 4.688928989628431e-05,
"loss": 0.4128,
"num_input_tokens_seen": 16752432,
"step": 1350
},
{
"epoch": 0.48323823109843084,
"grad_norm": 1.2250187397003174,
"learning_rate": 4.686669963482983e-05,
"loss": 0.3974,
"num_input_tokens_seen": 16814912,
"step": 1355
},
{
"epoch": 0.48502139800285304,
"grad_norm": 1.5874429941177368,
"learning_rate": 4.6844033126098206e-05,
"loss": 0.5244,
"num_input_tokens_seen": 16875696,
"step": 1360
},
{
"epoch": 0.4868045649072753,
"grad_norm": 1.5424435138702393,
"learning_rate": 4.682129044912558e-05,
"loss": 0.3909,
"num_input_tokens_seen": 16934768,
"step": 1365
},
{
"epoch": 0.48858773181169757,
"grad_norm": 1.395538568496704,
"learning_rate": 4.679847168321368e-05,
"loss": 0.4208,
"num_input_tokens_seen": 16994192,
"step": 1370
},
{
"epoch": 0.49037089871611983,
"grad_norm": 1.3311400413513184,
"learning_rate": 4.677557690792956e-05,
"loss": 0.5148,
"num_input_tokens_seen": 17055952,
"step": 1375
},
{
"epoch": 0.4921540656205421,
"grad_norm": 1.0483784675598145,
"learning_rate": 4.6752606203105314e-05,
"loss": 0.4838,
"num_input_tokens_seen": 17118352,
"step": 1380
},
{
"epoch": 0.49393723252496435,
"grad_norm": 1.4240469932556152,
"learning_rate": 4.6729559648837777e-05,
"loss": 0.4676,
"num_input_tokens_seen": 17181856,
"step": 1385
},
{
"epoch": 0.4957203994293866,
"grad_norm": 1.1497527360916138,
"learning_rate": 4.6706437325488285e-05,
"loss": 0.4607,
"num_input_tokens_seen": 17239040,
"step": 1390
},
{
"epoch": 0.4975035663338088,
"grad_norm": 1.324589490890503,
"learning_rate": 4.6683239313682356e-05,
"loss": 0.3867,
"num_input_tokens_seen": 17300096,
"step": 1395
},
{
"epoch": 0.4992867332382311,
"grad_norm": 1.401481032371521,
"learning_rate": 4.6659965694309446e-05,
"loss": 0.477,
"num_input_tokens_seen": 17367088,
"step": 1400
},
{
"epoch": 0.5010699001426534,
"grad_norm": 1.0556763410568237,
"learning_rate": 4.6636616548522637e-05,
"loss": 0.4092,
"num_input_tokens_seen": 17427648,
"step": 1405
},
{
"epoch": 0.5028530670470756,
"grad_norm": 1.5187320709228516,
"learning_rate": 4.661319195773837e-05,
"loss": 0.4266,
"num_input_tokens_seen": 17491664,
"step": 1410
},
{
"epoch": 0.5046362339514978,
"grad_norm": 1.2626229524612427,
"learning_rate": 4.658969200363614e-05,
"loss": 0.5192,
"num_input_tokens_seen": 17553312,
"step": 1415
},
{
"epoch": 0.5064194008559201,
"grad_norm": 1.3596255779266357,
"learning_rate": 4.6566116768158254e-05,
"loss": 0.4983,
"num_input_tokens_seen": 17614656,
"step": 1420
},
{
"epoch": 0.5082025677603423,
"grad_norm": 1.131866455078125,
"learning_rate": 4.6542466333509496e-05,
"loss": 0.4593,
"num_input_tokens_seen": 17673104,
"step": 1425
},
{
"epoch": 0.5099857346647646,
"grad_norm": 1.1720597743988037,
"learning_rate": 4.651874078215688e-05,
"loss": 0.3885,
"num_input_tokens_seen": 17733920,
"step": 1430
},
{
"epoch": 0.5117689015691869,
"grad_norm": 1.1201550960540771,
"learning_rate": 4.6494940196829326e-05,
"loss": 0.4661,
"num_input_tokens_seen": 17795024,
"step": 1435
},
{
"epoch": 0.5135520684736091,
"grad_norm": 1.4359281063079834,
"learning_rate": 4.647106466051741e-05,
"loss": 0.4327,
"num_input_tokens_seen": 17856080,
"step": 1440
},
{
"epoch": 0.5153352353780314,
"grad_norm": 1.2126119136810303,
"learning_rate": 4.644711425647305e-05,
"loss": 0.4281,
"num_input_tokens_seen": 17918592,
"step": 1445
},
{
"epoch": 0.5171184022824536,
"grad_norm": 1.1998052597045898,
"learning_rate": 4.642308906820921e-05,
"loss": 0.4234,
"num_input_tokens_seen": 17985056,
"step": 1450
},
{
"epoch": 0.5189015691868759,
"grad_norm": 1.2513782978057861,
"learning_rate": 4.6398989179499635e-05,
"loss": 0.4952,
"num_input_tokens_seen": 18047856,
"step": 1455
},
{
"epoch": 0.5206847360912982,
"grad_norm": 1.5451606512069702,
"learning_rate": 4.637481467437854e-05,
"loss": 0.4061,
"num_input_tokens_seen": 18110608,
"step": 1460
},
{
"epoch": 0.5224679029957204,
"grad_norm": 1.280383586883545,
"learning_rate": 4.635056563714031e-05,
"loss": 0.4709,
"num_input_tokens_seen": 18170192,
"step": 1465
},
{
"epoch": 0.5242510699001427,
"grad_norm": 1.536872386932373,
"learning_rate": 4.632624215233924e-05,
"loss": 0.5166,
"num_input_tokens_seen": 18234512,
"step": 1470
},
{
"epoch": 0.526034236804565,
"grad_norm": 1.1344192028045654,
"learning_rate": 4.6301844304789185e-05,
"loss": 0.4313,
"num_input_tokens_seen": 18297872,
"step": 1475
},
{
"epoch": 0.5278174037089871,
"grad_norm": 1.2558397054672241,
"learning_rate": 4.6277372179563336e-05,
"loss": 0.4426,
"num_input_tokens_seen": 18360688,
"step": 1480
},
{
"epoch": 0.5296005706134094,
"grad_norm": 1.3379613161087036,
"learning_rate": 4.625282586199384e-05,
"loss": 0.4684,
"num_input_tokens_seen": 18421600,
"step": 1485
},
{
"epoch": 0.5313837375178316,
"grad_norm": 1.471182942390442,
"learning_rate": 4.622820543767159e-05,
"loss": 0.3746,
"num_input_tokens_seen": 18482608,
"step": 1490
},
{
"epoch": 0.5331669044222539,
"grad_norm": 1.147135853767395,
"learning_rate": 4.6203510992445844e-05,
"loss": 0.3896,
"num_input_tokens_seen": 18542720,
"step": 1495
},
{
"epoch": 0.5349500713266762,
"grad_norm": 1.6015293598175049,
"learning_rate": 4.617874261242399e-05,
"loss": 0.4613,
"num_input_tokens_seen": 18604304,
"step": 1500
},
{
"epoch": 0.5367332382310984,
"grad_norm": 1.1261463165283203,
"learning_rate": 4.615390038397121e-05,
"loss": 0.4636,
"num_input_tokens_seen": 18666336,
"step": 1505
},
{
"epoch": 0.5385164051355207,
"grad_norm": 1.1836202144622803,
"learning_rate": 4.612898439371019e-05,
"loss": 0.4072,
"num_input_tokens_seen": 18724912,
"step": 1510
},
{
"epoch": 0.5402995720399429,
"grad_norm": 1.108585238456726,
"learning_rate": 4.6103994728520815e-05,
"loss": 0.3483,
"num_input_tokens_seen": 18786352,
"step": 1515
},
{
"epoch": 0.5420827389443652,
"grad_norm": 1.3794957399368286,
"learning_rate": 4.607893147553989e-05,
"loss": 0.4259,
"num_input_tokens_seen": 18851488,
"step": 1520
},
{
"epoch": 0.5438659058487875,
"grad_norm": 1.4083433151245117,
"learning_rate": 4.605379472216076e-05,
"loss": 0.4364,
"num_input_tokens_seen": 18915008,
"step": 1525
},
{
"epoch": 0.5456490727532097,
"grad_norm": 1.3088963031768799,
"learning_rate": 4.602858455603313e-05,
"loss": 0.4098,
"num_input_tokens_seen": 18976256,
"step": 1530
},
{
"epoch": 0.547432239657632,
"grad_norm": 1.3022725582122803,
"learning_rate": 4.600330106506263e-05,
"loss": 0.4449,
"num_input_tokens_seen": 19036560,
"step": 1535
},
{
"epoch": 0.5492154065620543,
"grad_norm": 1.7286397218704224,
"learning_rate": 4.597794433741061e-05,
"loss": 0.5088,
"num_input_tokens_seen": 19097568,
"step": 1540
},
{
"epoch": 0.5509985734664765,
"grad_norm": 1.4286762475967407,
"learning_rate": 4.5952514461493754e-05,
"loss": 0.445,
"num_input_tokens_seen": 19158592,
"step": 1545
},
{
"epoch": 0.5527817403708987,
"grad_norm": 1.2713367938995361,
"learning_rate": 4.5927011525983824e-05,
"loss": 0.3791,
"num_input_tokens_seen": 19215600,
"step": 1550
},
{
"epoch": 0.5545649072753209,
"grad_norm": 1.3422623872756958,
"learning_rate": 4.590143561980736e-05,
"loss": 0.4897,
"num_input_tokens_seen": 19277184,
"step": 1555
},
{
"epoch": 0.5563480741797432,
"grad_norm": 1.278333306312561,
"learning_rate": 4.5875786832145287e-05,
"loss": 0.4426,
"num_input_tokens_seen": 19338032,
"step": 1560
},
{
"epoch": 0.5581312410841655,
"grad_norm": 1.4938713312149048,
"learning_rate": 4.5850065252432706e-05,
"loss": 0.4246,
"num_input_tokens_seen": 19397040,
"step": 1565
},
{
"epoch": 0.5599144079885877,
"grad_norm": 2.4364399909973145,
"learning_rate": 4.582427097035854e-05,
"loss": 0.4777,
"num_input_tokens_seen": 19456144,
"step": 1570
},
{
"epoch": 0.56169757489301,
"grad_norm": 3.5539422035217285,
"learning_rate": 4.579840407586517e-05,
"loss": 0.4894,
"num_input_tokens_seen": 19518176,
"step": 1575
},
{
"epoch": 0.5634807417974322,
"grad_norm": 1.4036399126052856,
"learning_rate": 4.577246465914825e-05,
"loss": 0.4704,
"num_input_tokens_seen": 19581024,
"step": 1580
},
{
"epoch": 0.5652639087018545,
"grad_norm": 0.9552262425422668,
"learning_rate": 4.5746452810656225e-05,
"loss": 0.4527,
"num_input_tokens_seen": 19643104,
"step": 1585
},
{
"epoch": 0.5670470756062768,
"grad_norm": 1.2145711183547974,
"learning_rate": 4.572036862109017e-05,
"loss": 0.4612,
"num_input_tokens_seen": 19702528,
"step": 1590
},
{
"epoch": 0.568830242510699,
"grad_norm": 1.0046789646148682,
"learning_rate": 4.5694212181403374e-05,
"loss": 0.4235,
"num_input_tokens_seen": 19763424,
"step": 1595
},
{
"epoch": 0.5706134094151213,
"grad_norm": 1.3540983200073242,
"learning_rate": 4.5667983582801064e-05,
"loss": 0.3833,
"num_input_tokens_seen": 19823200,
"step": 1600
},
{
"epoch": 0.5723965763195435,
"grad_norm": 1.2544758319854736,
"learning_rate": 4.5641682916740084e-05,
"loss": 0.4586,
"num_input_tokens_seen": 19883888,
"step": 1605
},
{
"epoch": 0.5741797432239658,
"grad_norm": 1.1667801141738892,
"learning_rate": 4.5615310274928556e-05,
"loss": 0.5969,
"num_input_tokens_seen": 19949840,
"step": 1610
},
{
"epoch": 0.5759629101283881,
"grad_norm": 0.9844037294387817,
"learning_rate": 4.5588865749325594e-05,
"loss": 0.3798,
"num_input_tokens_seen": 20014640,
"step": 1615
},
{
"epoch": 0.5777460770328102,
"grad_norm": 1.3161027431488037,
"learning_rate": 4.556234943214095e-05,
"loss": 0.4234,
"num_input_tokens_seen": 20077008,
"step": 1620
},
{
"epoch": 0.5795292439372325,
"grad_norm": 1.1113629341125488,
"learning_rate": 4.5535761415834724e-05,
"loss": 0.4714,
"num_input_tokens_seen": 20141488,
"step": 1625
},
{
"epoch": 0.5813124108416547,
"grad_norm": 1.3117053508758545,
"learning_rate": 4.550910179311699e-05,
"loss": 0.5514,
"num_input_tokens_seen": 20206016,
"step": 1630
},
{
"epoch": 0.583095577746077,
"grad_norm": 1.151132345199585,
"learning_rate": 4.5482370656947554e-05,
"loss": 0.4626,
"num_input_tokens_seen": 20270880,
"step": 1635
},
{
"epoch": 0.5848787446504993,
"grad_norm": 2.0122318267822266,
"learning_rate": 4.5455568100535545e-05,
"loss": 0.4758,
"num_input_tokens_seen": 20334448,
"step": 1640
},
{
"epoch": 0.5866619115549215,
"grad_norm": 1.6800963878631592,
"learning_rate": 4.542869421733915e-05,
"loss": 0.4178,
"num_input_tokens_seen": 20398480,
"step": 1645
},
{
"epoch": 0.5884450784593438,
"grad_norm": 1.4573643207550049,
"learning_rate": 4.540174910106526e-05,
"loss": 0.4314,
"num_input_tokens_seen": 20458128,
"step": 1650
},
{
"epoch": 0.5902282453637661,
"grad_norm": 1.1499691009521484,
"learning_rate": 4.537473284566914e-05,
"loss": 0.4182,
"num_input_tokens_seen": 20521840,
"step": 1655
},
{
"epoch": 0.5920114122681883,
"grad_norm": 1.1684014797210693,
"learning_rate": 4.5347645545354136e-05,
"loss": 0.3945,
"num_input_tokens_seen": 20582304,
"step": 1660
},
{
"epoch": 0.5937945791726106,
"grad_norm": 1.358035683631897,
"learning_rate": 4.532048729457128e-05,
"loss": 0.4674,
"num_input_tokens_seen": 20642656,
"step": 1665
},
{
"epoch": 0.5955777460770328,
"grad_norm": 1.285057783126831,
"learning_rate": 4.5293258188019055e-05,
"loss": 0.4027,
"num_input_tokens_seen": 20709664,
"step": 1670
},
{
"epoch": 0.5973609129814551,
"grad_norm": 1.0107051134109497,
"learning_rate": 4.526595832064296e-05,
"loss": 0.4402,
"num_input_tokens_seen": 20769888,
"step": 1675
},
{
"epoch": 0.5991440798858774,
"grad_norm": 1.144665241241455,
"learning_rate": 4.523858778763528e-05,
"loss": 0.4725,
"num_input_tokens_seen": 20834912,
"step": 1680
},
{
"epoch": 0.6009272467902995,
"grad_norm": 1.5452603101730347,
"learning_rate": 4.521114668443464e-05,
"loss": 0.4413,
"num_input_tokens_seen": 20896784,
"step": 1685
},
{
"epoch": 0.6027104136947218,
"grad_norm": 1.0601692199707031,
"learning_rate": 4.518363510672583e-05,
"loss": 0.4758,
"num_input_tokens_seen": 20954224,
"step": 1690
},
{
"epoch": 0.604493580599144,
"grad_norm": 1.6150104999542236,
"learning_rate": 4.515605315043928e-05,
"loss": 0.4027,
"num_input_tokens_seen": 21019760,
"step": 1695
},
{
"epoch": 0.6062767475035663,
"grad_norm": 1.3952018022537231,
"learning_rate": 4.512840091175089e-05,
"loss": 0.4497,
"num_input_tokens_seen": 21081952,
"step": 1700
},
{
"epoch": 0.6080599144079886,
"grad_norm": 1.6579699516296387,
"learning_rate": 4.5100678487081614e-05,
"loss": 0.4343,
"num_input_tokens_seen": 21145680,
"step": 1705
},
{
"epoch": 0.6098430813124108,
"grad_norm": 1.5067193508148193,
"learning_rate": 4.507288597309711e-05,
"loss": 0.4142,
"num_input_tokens_seen": 21206048,
"step": 1710
},
{
"epoch": 0.6116262482168331,
"grad_norm": 1.2458901405334473,
"learning_rate": 4.504502346670748e-05,
"loss": 0.5092,
"num_input_tokens_seen": 21269520,
"step": 1715
},
{
"epoch": 0.6134094151212554,
"grad_norm": 1.33489990234375,
"learning_rate": 4.5017091065066837e-05,
"loss": 0.4563,
"num_input_tokens_seen": 21331136,
"step": 1720
},
{
"epoch": 0.6151925820256776,
"grad_norm": 1.4016698598861694,
"learning_rate": 4.4989088865573035e-05,
"loss": 0.3743,
"num_input_tokens_seen": 21392496,
"step": 1725
},
{
"epoch": 0.6169757489300999,
"grad_norm": 1.5638152360916138,
"learning_rate": 4.496101696586732e-05,
"loss": 0.4823,
"num_input_tokens_seen": 21455504,
"step": 1730
},
{
"epoch": 0.6187589158345221,
"grad_norm": 1.2184085845947266,
"learning_rate": 4.4932875463833944e-05,
"loss": 0.4219,
"num_input_tokens_seen": 21518800,
"step": 1735
},
{
"epoch": 0.6205420827389444,
"grad_norm": 1.5745280981063843,
"learning_rate": 4.490466445759988e-05,
"loss": 0.506,
"num_input_tokens_seen": 21579120,
"step": 1740
},
{
"epoch": 0.6223252496433667,
"grad_norm": 1.4783879518508911,
"learning_rate": 4.487638404553445e-05,
"loss": 0.4638,
"num_input_tokens_seen": 21638528,
"step": 1745
},
{
"epoch": 0.6241084165477889,
"grad_norm": 1.4319891929626465,
"learning_rate": 4.484803432624899e-05,
"loss": 0.434,
"num_input_tokens_seen": 21703664,
"step": 1750
},
{
"epoch": 0.6258915834522111,
"grad_norm": 1.3542821407318115,
"learning_rate": 4.48196153985965e-05,
"loss": 0.4472,
"num_input_tokens_seen": 21764336,
"step": 1755
},
{
"epoch": 0.6276747503566333,
"grad_norm": 1.1602082252502441,
"learning_rate": 4.4791127361671304e-05,
"loss": 0.3541,
"num_input_tokens_seen": 21825392,
"step": 1760
},
{
"epoch": 0.6294579172610556,
"grad_norm": 1.6145776510238647,
"learning_rate": 4.476257031480871e-05,
"loss": 0.4401,
"num_input_tokens_seen": 21886848,
"step": 1765
},
{
"epoch": 0.6312410841654779,
"grad_norm": 1.1257821321487427,
"learning_rate": 4.4733944357584644e-05,
"loss": 0.5242,
"num_input_tokens_seen": 21951680,
"step": 1770
},
{
"epoch": 0.6330242510699001,
"grad_norm": 1.4322980642318726,
"learning_rate": 4.470524958981534e-05,
"loss": 0.4926,
"num_input_tokens_seen": 22016624,
"step": 1775
},
{
"epoch": 0.6348074179743224,
"grad_norm": 1.255799651145935,
"learning_rate": 4.4676486111556936e-05,
"loss": 0.4128,
"num_input_tokens_seen": 22079040,
"step": 1780
},
{
"epoch": 0.6365905848787446,
"grad_norm": 1.157120943069458,
"learning_rate": 4.46476540231052e-05,
"loss": 0.3521,
"num_input_tokens_seen": 22142400,
"step": 1785
},
{
"epoch": 0.6383737517831669,
"grad_norm": 1.5262624025344849,
"learning_rate": 4.461875342499509e-05,
"loss": 0.4028,
"num_input_tokens_seen": 22199136,
"step": 1790
},
{
"epoch": 0.6401569186875892,
"grad_norm": 1.7937567234039307,
"learning_rate": 4.458978441800048e-05,
"loss": 0.4126,
"num_input_tokens_seen": 22260608,
"step": 1795
},
{
"epoch": 0.6419400855920114,
"grad_norm": 1.3475735187530518,
"learning_rate": 4.456074710313378e-05,
"loss": 0.4692,
"num_input_tokens_seen": 22322272,
"step": 1800
},
{
"epoch": 0.6437232524964337,
"grad_norm": 1.2804908752441406,
"learning_rate": 4.4531641581645576e-05,
"loss": 0.4931,
"num_input_tokens_seen": 22384368,
"step": 1805
},
{
"epoch": 0.645506419400856,
"grad_norm": 1.2529658079147339,
"learning_rate": 4.4502467955024294e-05,
"loss": 0.386,
"num_input_tokens_seen": 22447888,
"step": 1810
},
{
"epoch": 0.6472895863052782,
"grad_norm": 1.3398923873901367,
"learning_rate": 4.447322632499581e-05,
"loss": 0.4522,
"num_input_tokens_seen": 22514704,
"step": 1815
},
{
"epoch": 0.6490727532097005,
"grad_norm": 1.320273518562317,
"learning_rate": 4.444391679352315e-05,
"loss": 0.4082,
"num_input_tokens_seen": 22573024,
"step": 1820
},
{
"epoch": 0.6508559201141226,
"grad_norm": 1.2203108072280884,
"learning_rate": 4.441453946280612e-05,
"loss": 0.4551,
"num_input_tokens_seen": 22632080,
"step": 1825
},
{
"epoch": 0.6526390870185449,
"grad_norm": 1.1191906929016113,
"learning_rate": 4.4385094435280873e-05,
"loss": 0.3873,
"num_input_tokens_seen": 22692192,
"step": 1830
},
{
"epoch": 0.6544222539229672,
"grad_norm": 1.249611496925354,
"learning_rate": 4.435558181361969e-05,
"loss": 0.398,
"num_input_tokens_seen": 22754624,
"step": 1835
},
{
"epoch": 0.6562054208273894,
"grad_norm": 1.4326295852661133,
"learning_rate": 4.432600170073048e-05,
"loss": 0.4159,
"num_input_tokens_seen": 22819616,
"step": 1840
},
{
"epoch": 0.6579885877318117,
"grad_norm": 1.2453666925430298,
"learning_rate": 4.429635419975655e-05,
"loss": 0.4343,
"num_input_tokens_seen": 22879136,
"step": 1845
},
{
"epoch": 0.6597717546362339,
"grad_norm": 1.1724647283554077,
"learning_rate": 4.426663941407614e-05,
"loss": 0.4287,
"num_input_tokens_seen": 22940528,
"step": 1850
},
{
"epoch": 0.6615549215406562,
"grad_norm": 1.185964822769165,
"learning_rate": 4.423685744730213e-05,
"loss": 0.3901,
"num_input_tokens_seen": 23004128,
"step": 1855
},
{
"epoch": 0.6633380884450785,
"grad_norm": 1.167861819267273,
"learning_rate": 4.420700840328162e-05,
"loss": 0.512,
"num_input_tokens_seen": 23066240,
"step": 1860
},
{
"epoch": 0.6651212553495007,
"grad_norm": 1.6327167749404907,
"learning_rate": 4.417709238609566e-05,
"loss": 0.4102,
"num_input_tokens_seen": 23126128,
"step": 1865
},
{
"epoch": 0.666904422253923,
"grad_norm": 1.0951687097549438,
"learning_rate": 4.4147109500058776e-05,
"loss": 0.4767,
"num_input_tokens_seen": 23182704,
"step": 1870
},
{
"epoch": 0.6686875891583453,
"grad_norm": 1.1051822900772095,
"learning_rate": 4.411705984971868e-05,
"loss": 0.4009,
"num_input_tokens_seen": 23244816,
"step": 1875
},
{
"epoch": 0.6704707560627675,
"grad_norm": 1.4562581777572632,
"learning_rate": 4.408694353985589e-05,
"loss": 0.5083,
"num_input_tokens_seen": 23307776,
"step": 1880
},
{
"epoch": 0.6722539229671898,
"grad_norm": 1.4651310443878174,
"learning_rate": 4.4056760675483356e-05,
"loss": 0.5302,
"num_input_tokens_seen": 23370368,
"step": 1885
},
{
"epoch": 0.6740370898716119,
"grad_norm": 1.1008446216583252,
"learning_rate": 4.402651136184609e-05,
"loss": 0.3035,
"num_input_tokens_seen": 23436192,
"step": 1890
},
{
"epoch": 0.6758202567760342,
"grad_norm": 1.7820332050323486,
"learning_rate": 4.3996195704420826e-05,
"loss": 0.3972,
"num_input_tokens_seen": 23501408,
"step": 1895
},
{
"epoch": 0.6776034236804565,
"grad_norm": 1.2907474040985107,
"learning_rate": 4.396581380891562e-05,
"loss": 0.4644,
"num_input_tokens_seen": 23561072,
"step": 1900
},
{
"epoch": 0.6793865905848787,
"grad_norm": 1.3212149143218994,
"learning_rate": 4.3935365781269476e-05,
"loss": 0.5038,
"num_input_tokens_seen": 23622784,
"step": 1905
},
{
"epoch": 0.681169757489301,
"grad_norm": 1.5679349899291992,
"learning_rate": 4.390485172765204e-05,
"loss": 0.4286,
"num_input_tokens_seen": 23682096,
"step": 1910
},
{
"epoch": 0.6829529243937232,
"grad_norm": 1.3519368171691895,
"learning_rate": 4.387427175446315e-05,
"loss": 0.395,
"num_input_tokens_seen": 23742608,
"step": 1915
},
{
"epoch": 0.6847360912981455,
"grad_norm": 1.419407844543457,
"learning_rate": 4.38436259683325e-05,
"loss": 0.4882,
"num_input_tokens_seen": 23805008,
"step": 1920
},
{
"epoch": 0.6865192582025678,
"grad_norm": 1.1762245893478394,
"learning_rate": 4.3812914476119293e-05,
"loss": 0.3958,
"num_input_tokens_seen": 23868960,
"step": 1925
},
{
"epoch": 0.68830242510699,
"grad_norm": 1.192785382270813,
"learning_rate": 4.378213738491182e-05,
"loss": 0.4351,
"num_input_tokens_seen": 23933808,
"step": 1930
},
{
"epoch": 0.6900855920114123,
"grad_norm": 1.0715630054473877,
"learning_rate": 4.375129480202711e-05,
"loss": 0.365,
"num_input_tokens_seen": 23990160,
"step": 1935
},
{
"epoch": 0.6918687589158345,
"grad_norm": 1.3897589445114136,
"learning_rate": 4.372038683501057e-05,
"loss": 0.5394,
"num_input_tokens_seen": 24056944,
"step": 1940
},
{
"epoch": 0.6936519258202568,
"grad_norm": 1.3039186000823975,
"learning_rate": 4.36894135916356e-05,
"loss": 0.4806,
"num_input_tokens_seen": 24120016,
"step": 1945
},
{
"epoch": 0.6954350927246791,
"grad_norm": 1.101616621017456,
"learning_rate": 4.3658375179903185e-05,
"loss": 0.5264,
"num_input_tokens_seen": 24181376,
"step": 1950
},
{
"epoch": 0.6972182596291013,
"grad_norm": 1.0111804008483887,
"learning_rate": 4.3627271708041565e-05,
"loss": 0.4425,
"num_input_tokens_seen": 24243440,
"step": 1955
},
{
"epoch": 0.6990014265335235,
"grad_norm": 1.257667899131775,
"learning_rate": 4.3596103284505854e-05,
"loss": 0.4541,
"num_input_tokens_seen": 24301264,
"step": 1960
},
{
"epoch": 0.7007845934379457,
"grad_norm": 1.5398532152175903,
"learning_rate": 4.35648700179776e-05,
"loss": 0.4665,
"num_input_tokens_seen": 24362208,
"step": 1965
},
{
"epoch": 0.702567760342368,
"grad_norm": 1.2097506523132324,
"learning_rate": 4.353357201736452e-05,
"loss": 0.409,
"num_input_tokens_seen": 24426144,
"step": 1970
},
{
"epoch": 0.7043509272467903,
"grad_norm": 1.243800163269043,
"learning_rate": 4.3502209391799985e-05,
"loss": 0.3783,
"num_input_tokens_seen": 24489120,
"step": 1975
},
{
"epoch": 0.7061340941512125,
"grad_norm": 1.3025860786437988,
"learning_rate": 4.347078225064276e-05,
"loss": 0.3957,
"num_input_tokens_seen": 24550720,
"step": 1980
},
{
"epoch": 0.7079172610556348,
"grad_norm": 1.6858657598495483,
"learning_rate": 4.343929070347653e-05,
"loss": 0.4329,
"num_input_tokens_seen": 24614608,
"step": 1985
},
{
"epoch": 0.7097004279600571,
"grad_norm": 1.7270207405090332,
"learning_rate": 4.34077348601096e-05,
"loss": 0.4347,
"num_input_tokens_seen": 24679136,
"step": 1990
},
{
"epoch": 0.7114835948644793,
"grad_norm": 1.3973181247711182,
"learning_rate": 4.337611483057443e-05,
"loss": 0.4126,
"num_input_tokens_seen": 24738160,
"step": 1995
},
{
"epoch": 0.7132667617689016,
"grad_norm": 1.5383044481277466,
"learning_rate": 4.3344430725127315e-05,
"loss": 0.4086,
"num_input_tokens_seen": 24803728,
"step": 2000
},
{
"epoch": 0.7150499286733238,
"grad_norm": 1.096655249595642,
"learning_rate": 4.331268265424797e-05,
"loss": 0.4139,
"num_input_tokens_seen": 24868144,
"step": 2005
},
{
"epoch": 0.7168330955777461,
"grad_norm": 1.1997692584991455,
"learning_rate": 4.328087072863915e-05,
"loss": 0.4188,
"num_input_tokens_seen": 24930448,
"step": 2010
},
{
"epoch": 0.7186162624821684,
"grad_norm": 1.1563403606414795,
"learning_rate": 4.3248995059226284e-05,
"loss": 0.5091,
"num_input_tokens_seen": 24991664,
"step": 2015
},
{
"epoch": 0.7203994293865906,
"grad_norm": 1.050632119178772,
"learning_rate": 4.321705575715703e-05,
"loss": 0.4012,
"num_input_tokens_seen": 25055840,
"step": 2020
},
{
"epoch": 0.7221825962910129,
"grad_norm": 1.3263276815414429,
"learning_rate": 4.318505293380097e-05,
"loss": 0.4003,
"num_input_tokens_seen": 25117968,
"step": 2025
},
{
"epoch": 0.723965763195435,
"grad_norm": 1.3196632862091064,
"learning_rate": 4.3152986700749165e-05,
"loss": 0.5408,
"num_input_tokens_seen": 25183168,
"step": 2030
},
{
"epoch": 0.7257489300998573,
"grad_norm": 1.1344470977783203,
"learning_rate": 4.3120857169813766e-05,
"loss": 0.4827,
"num_input_tokens_seen": 25245424,
"step": 2035
},
{
"epoch": 0.7275320970042796,
"grad_norm": 0.9428476691246033,
"learning_rate": 4.308866445302766e-05,
"loss": 0.4728,
"num_input_tokens_seen": 25307296,
"step": 2040
},
{
"epoch": 0.7293152639087018,
"grad_norm": 1.3459362983703613,
"learning_rate": 4.3056408662644024e-05,
"loss": 0.4732,
"num_input_tokens_seen": 25368656,
"step": 2045
},
{
"epoch": 0.7310984308131241,
"grad_norm": 1.1690179109573364,
"learning_rate": 4.302408991113601e-05,
"loss": 0.409,
"num_input_tokens_seen": 25429792,
"step": 2050
},
{
"epoch": 0.7328815977175464,
"grad_norm": 1.6628319025039673,
"learning_rate": 4.2991708311196285e-05,
"loss": 0.4224,
"num_input_tokens_seen": 25494064,
"step": 2055
},
{
"epoch": 0.7346647646219686,
"grad_norm": 1.2542611360549927,
"learning_rate": 4.2959263975736676e-05,
"loss": 0.4836,
"num_input_tokens_seen": 25559856,
"step": 2060
},
{
"epoch": 0.7364479315263909,
"grad_norm": 1.2287793159484863,
"learning_rate": 4.292675701788774e-05,
"loss": 0.3773,
"num_input_tokens_seen": 25622160,
"step": 2065
},
{
"epoch": 0.7382310984308131,
"grad_norm": 1.0096776485443115,
"learning_rate": 4.289418755099841e-05,
"loss": 0.4791,
"num_input_tokens_seen": 25685120,
"step": 2070
},
{
"epoch": 0.7400142653352354,
"grad_norm": 1.1361216306686401,
"learning_rate": 4.28615556886356e-05,
"loss": 0.4616,
"num_input_tokens_seen": 25748256,
"step": 2075
},
{
"epoch": 0.7417974322396577,
"grad_norm": 1.6110931634902954,
"learning_rate": 4.2828861544583746e-05,
"loss": 0.47,
"num_input_tokens_seen": 25810016,
"step": 2080
},
{
"epoch": 0.7435805991440799,
"grad_norm": 1.3852949142456055,
"learning_rate": 4.279610523284449e-05,
"loss": 0.4363,
"num_input_tokens_seen": 25875264,
"step": 2085
},
{
"epoch": 0.7453637660485022,
"grad_norm": 1.6177430152893066,
"learning_rate": 4.2763286867636244e-05,
"loss": 0.3818,
"num_input_tokens_seen": 25936032,
"step": 2090
},
{
"epoch": 0.7471469329529244,
"grad_norm": 1.313751220703125,
"learning_rate": 4.2730406563393777e-05,
"loss": 0.3704,
"num_input_tokens_seen": 25997696,
"step": 2095
},
{
"epoch": 0.7489300998573466,
"grad_norm": 1.566335678100586,
"learning_rate": 4.269746443476787e-05,
"loss": 0.4764,
"num_input_tokens_seen": 26060400,
"step": 2100
},
{
"epoch": 0.7507132667617689,
"grad_norm": 1.0574077367782593,
"learning_rate": 4.266446059662482e-05,
"loss": 0.3958,
"num_input_tokens_seen": 26117744,
"step": 2105
},
{
"epoch": 0.7524964336661911,
"grad_norm": 1.2322158813476562,
"learning_rate": 4.2631395164046165e-05,
"loss": 0.3588,
"num_input_tokens_seen": 26179472,
"step": 2110
},
{
"epoch": 0.7542796005706134,
"grad_norm": 1.3826179504394531,
"learning_rate": 4.259826825232819e-05,
"loss": 0.4088,
"num_input_tokens_seen": 26241296,
"step": 2115
},
{
"epoch": 0.7560627674750356,
"grad_norm": 1.3250257968902588,
"learning_rate": 4.256507997698152e-05,
"loss": 0.4987,
"num_input_tokens_seen": 26304048,
"step": 2120
},
{
"epoch": 0.7578459343794579,
"grad_norm": 1.3705068826675415,
"learning_rate": 4.2531830453730824e-05,
"loss": 0.4284,
"num_input_tokens_seen": 26366240,
"step": 2125
},
{
"epoch": 0.7596291012838802,
"grad_norm": 1.335214376449585,
"learning_rate": 4.249851979851426e-05,
"loss": 0.427,
"num_input_tokens_seen": 26427888,
"step": 2130
},
{
"epoch": 0.7614122681883024,
"grad_norm": 1.2708473205566406,
"learning_rate": 4.24651481274832e-05,
"loss": 0.3807,
"num_input_tokens_seen": 26489536,
"step": 2135
},
{
"epoch": 0.7631954350927247,
"grad_norm": 1.567751169204712,
"learning_rate": 4.243171555700174e-05,
"loss": 0.4036,
"num_input_tokens_seen": 26552608,
"step": 2140
},
{
"epoch": 0.764978601997147,
"grad_norm": 1.4113706350326538,
"learning_rate": 4.2398222203646355e-05,
"loss": 0.4032,
"num_input_tokens_seen": 26617392,
"step": 2145
},
{
"epoch": 0.7667617689015692,
"grad_norm": 1.4144871234893799,
"learning_rate": 4.2364668184205445e-05,
"loss": 0.408,
"num_input_tokens_seen": 26677216,
"step": 2150
},
{
"epoch": 0.7685449358059915,
"grad_norm": 1.2482500076293945,
"learning_rate": 4.233105361567894e-05,
"loss": 0.3975,
"num_input_tokens_seen": 26739344,
"step": 2155
},
{
"epoch": 0.7703281027104137,
"grad_norm": 1.270215630531311,
"learning_rate": 4.2297378615277935e-05,
"loss": 0.416,
"num_input_tokens_seen": 26801200,
"step": 2160
},
{
"epoch": 0.7721112696148359,
"grad_norm": 1.1123536825180054,
"learning_rate": 4.226364330042422e-05,
"loss": 0.3925,
"num_input_tokens_seen": 26863440,
"step": 2165
},
{
"epoch": 0.7738944365192582,
"grad_norm": 1.3251960277557373,
"learning_rate": 4.2229847788749886e-05,
"loss": 0.3339,
"num_input_tokens_seen": 26925632,
"step": 2170
},
{
"epoch": 0.7756776034236804,
"grad_norm": 1.4706435203552246,
"learning_rate": 4.2195992198096956e-05,
"loss": 0.4934,
"num_input_tokens_seen": 26989776,
"step": 2175
},
{
"epoch": 0.7774607703281027,
"grad_norm": 1.3785024881362915,
"learning_rate": 4.216207664651691e-05,
"loss": 0.4337,
"num_input_tokens_seen": 27051392,
"step": 2180
},
{
"epoch": 0.7792439372325249,
"grad_norm": 1.1321138143539429,
"learning_rate": 4.2128101252270335e-05,
"loss": 0.3584,
"num_input_tokens_seen": 27117712,
"step": 2185
},
{
"epoch": 0.7810271041369472,
"grad_norm": 1.3333991765975952,
"learning_rate": 4.2094066133826457e-05,
"loss": 0.4337,
"num_input_tokens_seen": 27181344,
"step": 2190
},
{
"epoch": 0.7828102710413695,
"grad_norm": 1.4429007768630981,
"learning_rate": 4.2059971409862785e-05,
"loss": 0.4527,
"num_input_tokens_seen": 27245440,
"step": 2195
},
{
"epoch": 0.7845934379457917,
"grad_norm": 0.9634119868278503,
"learning_rate": 4.202581719926465e-05,
"loss": 0.3812,
"num_input_tokens_seen": 27310976,
"step": 2200
},
{
"epoch": 0.786376604850214,
"grad_norm": 1.2908122539520264,
"learning_rate": 4.1991603621124796e-05,
"loss": 0.3835,
"num_input_tokens_seen": 27377088,
"step": 2205
},
{
"epoch": 0.7881597717546363,
"grad_norm": 1.4514962434768677,
"learning_rate": 4.195733079474301e-05,
"loss": 0.4294,
"num_input_tokens_seen": 27440160,
"step": 2210
},
{
"epoch": 0.7899429386590585,
"grad_norm": 1.1848522424697876,
"learning_rate": 4.192299883962564e-05,
"loss": 0.4469,
"num_input_tokens_seen": 27497936,
"step": 2215
},
{
"epoch": 0.7917261055634808,
"grad_norm": 1.6799744367599487,
"learning_rate": 4.188860787548522e-05,
"loss": 0.517,
"num_input_tokens_seen": 27558608,
"step": 2220
},
{
"epoch": 0.793509272467903,
"grad_norm": 1.2593986988067627,
"learning_rate": 4.185415802224005e-05,
"loss": 0.4464,
"num_input_tokens_seen": 27618704,
"step": 2225
},
{
"epoch": 0.7952924393723253,
"grad_norm": 0.8842148184776306,
"learning_rate": 4.181964940001378e-05,
"loss": 0.3391,
"num_input_tokens_seen": 27678272,
"step": 2230
},
{
"epoch": 0.7970756062767475,
"grad_norm": 1.397862434387207,
"learning_rate": 4.1785082129134955e-05,
"loss": 0.4106,
"num_input_tokens_seen": 27736064,
"step": 2235
},
{
"epoch": 0.7988587731811697,
"grad_norm": 1.234437346458435,
"learning_rate": 4.175045633013665e-05,
"loss": 0.4695,
"num_input_tokens_seen": 27797456,
"step": 2240
},
{
"epoch": 0.800641940085592,
"grad_norm": 1.6073672771453857,
"learning_rate": 4.1715772123755994e-05,
"loss": 0.4356,
"num_input_tokens_seen": 27860640,
"step": 2245
},
{
"epoch": 0.8024251069900142,
"grad_norm": 1.7200384140014648,
"learning_rate": 4.1681029630933804e-05,
"loss": 0.4549,
"num_input_tokens_seen": 27920992,
"step": 2250
},
{
"epoch": 0.8042082738944365,
"grad_norm": 1.1777783632278442,
"learning_rate": 4.1646228972814126e-05,
"loss": 0.4374,
"num_input_tokens_seen": 27984960,
"step": 2255
},
{
"epoch": 0.8059914407988588,
"grad_norm": 0.9882134795188904,
"learning_rate": 4.1611370270743826e-05,
"loss": 0.4478,
"num_input_tokens_seen": 28046064,
"step": 2260
},
{
"epoch": 0.807774607703281,
"grad_norm": 1.7729289531707764,
"learning_rate": 4.157645364627216e-05,
"loss": 0.4987,
"num_input_tokens_seen": 28106368,
"step": 2265
},
{
"epoch": 0.8095577746077033,
"grad_norm": 1.2028993368148804,
"learning_rate": 4.154147922115036e-05,
"loss": 0.4149,
"num_input_tokens_seen": 28170784,
"step": 2270
},
{
"epoch": 0.8113409415121255,
"grad_norm": 1.251348853111267,
"learning_rate": 4.1506447117331195e-05,
"loss": 0.4042,
"num_input_tokens_seen": 28231216,
"step": 2275
},
{
"epoch": 0.8131241084165478,
"grad_norm": 1.2988697290420532,
"learning_rate": 4.1471357456968556e-05,
"loss": 0.4622,
"num_input_tokens_seen": 28291392,
"step": 2280
},
{
"epoch": 0.8149072753209701,
"grad_norm": 1.257813811302185,
"learning_rate": 4.143621036241705e-05,
"loss": 0.427,
"num_input_tokens_seen": 28354848,
"step": 2285
},
{
"epoch": 0.8166904422253923,
"grad_norm": 1.110034465789795,
"learning_rate": 4.1401005956231506e-05,
"loss": 0.4259,
"num_input_tokens_seen": 28415808,
"step": 2290
},
{
"epoch": 0.8184736091298146,
"grad_norm": 2.1352555751800537,
"learning_rate": 4.136574436116665e-05,
"loss": 0.4289,
"num_input_tokens_seen": 28479648,
"step": 2295
},
{
"epoch": 0.8202567760342369,
"grad_norm": 1.8342629671096802,
"learning_rate": 4.1330425700176586e-05,
"loss": 0.4511,
"num_input_tokens_seen": 28539184,
"step": 2300
},
{
"epoch": 0.822039942938659,
"grad_norm": 1.5662261247634888,
"learning_rate": 4.12950500964144e-05,
"loss": 0.4681,
"num_input_tokens_seen": 28599488,
"step": 2305
},
{
"epoch": 0.8238231098430813,
"grad_norm": 1.5573149919509888,
"learning_rate": 4.125961767323174e-05,
"loss": 0.4644,
"num_input_tokens_seen": 28661568,
"step": 2310
},
{
"epoch": 0.8256062767475035,
"grad_norm": 1.1155126094818115,
"learning_rate": 4.1224128554178394e-05,
"loss": 0.4353,
"num_input_tokens_seen": 28720656,
"step": 2315
},
{
"epoch": 0.8273894436519258,
"grad_norm": 1.0635948181152344,
"learning_rate": 4.118858286300182e-05,
"loss": 0.3924,
"num_input_tokens_seen": 28783488,
"step": 2320
},
{
"epoch": 0.829172610556348,
"grad_norm": 1.1664304733276367,
"learning_rate": 4.1152980723646745e-05,
"loss": 0.4228,
"num_input_tokens_seen": 28843568,
"step": 2325
},
{
"epoch": 0.8309557774607703,
"grad_norm": 1.4405851364135742,
"learning_rate": 4.111732226025473e-05,
"loss": 0.5034,
"num_input_tokens_seen": 28907856,
"step": 2330
},
{
"epoch": 0.8327389443651926,
"grad_norm": 1.6694755554199219,
"learning_rate": 4.108160759716373e-05,
"loss": 0.4676,
"num_input_tokens_seen": 28973616,
"step": 2335
},
{
"epoch": 0.8345221112696148,
"grad_norm": 1.5665149688720703,
"learning_rate": 4.1045836858907676e-05,
"loss": 0.4959,
"num_input_tokens_seen": 29037232,
"step": 2340
},
{
"epoch": 0.8363052781740371,
"grad_norm": 1.4531359672546387,
"learning_rate": 4.1010010170216e-05,
"loss": 0.407,
"num_input_tokens_seen": 29096032,
"step": 2345
},
{
"epoch": 0.8380884450784594,
"grad_norm": 1.453543782234192,
"learning_rate": 4.097412765601326e-05,
"loss": 0.3479,
"num_input_tokens_seen": 29158688,
"step": 2350
},
{
"epoch": 0.8398716119828816,
"grad_norm": 1.0374047756195068,
"learning_rate": 4.093818944141865e-05,
"loss": 0.4006,
"num_input_tokens_seen": 29221328,
"step": 2355
},
{
"epoch": 0.8416547788873039,
"grad_norm": 0.9795634746551514,
"learning_rate": 4.090219565174559e-05,
"loss": 0.3167,
"num_input_tokens_seen": 29281968,
"step": 2360
},
{
"epoch": 0.8434379457917262,
"grad_norm": 1.4658923149108887,
"learning_rate": 4.08661464125013e-05,
"loss": 0.3598,
"num_input_tokens_seen": 29343616,
"step": 2365
},
{
"epoch": 0.8452211126961484,
"grad_norm": 1.236444354057312,
"learning_rate": 4.083004184938633e-05,
"loss": 0.4541,
"num_input_tokens_seen": 29407664,
"step": 2370
},
{
"epoch": 0.8470042796005706,
"grad_norm": 1.3159562349319458,
"learning_rate": 4.079388208829415e-05,
"loss": 0.4256,
"num_input_tokens_seen": 29476880,
"step": 2375
},
{
"epoch": 0.8487874465049928,
"grad_norm": 1.3893544673919678,
"learning_rate": 4.075766725531069e-05,
"loss": 0.3899,
"num_input_tokens_seen": 29536800,
"step": 2380
},
{
"epoch": 0.8505706134094151,
"grad_norm": 1.5271923542022705,
"learning_rate": 4.072139747671394e-05,
"loss": 0.4808,
"num_input_tokens_seen": 29595504,
"step": 2385
},
{
"epoch": 0.8523537803138374,
"grad_norm": 0.9985124468803406,
"learning_rate": 4.068507287897343e-05,
"loss": 0.4354,
"num_input_tokens_seen": 29658432,
"step": 2390
},
{
"epoch": 0.8541369472182596,
"grad_norm": 1.6747127771377563,
"learning_rate": 4.06486935887499e-05,
"loss": 0.4542,
"num_input_tokens_seen": 29717312,
"step": 2395
},
{
"epoch": 0.8559201141226819,
"grad_norm": 1.523199439048767,
"learning_rate": 4.061225973289473e-05,
"loss": 0.4326,
"num_input_tokens_seen": 29780544,
"step": 2400
},
{
"epoch": 0.8577032810271041,
"grad_norm": 1.2455990314483643,
"learning_rate": 4.057577143844964e-05,
"loss": 0.4117,
"num_input_tokens_seen": 29845408,
"step": 2405
},
{
"epoch": 0.8594864479315264,
"grad_norm": 1.330797553062439,
"learning_rate": 4.05392288326461e-05,
"loss": 0.4742,
"num_input_tokens_seen": 29909472,
"step": 2410
},
{
"epoch": 0.8612696148359487,
"grad_norm": 1.3107638359069824,
"learning_rate": 4.050263204290502e-05,
"loss": 0.4182,
"num_input_tokens_seen": 29972656,
"step": 2415
},
{
"epoch": 0.8630527817403709,
"grad_norm": 1.050984501838684,
"learning_rate": 4.046598119683621e-05,
"loss": 0.3584,
"num_input_tokens_seen": 30033440,
"step": 2420
},
{
"epoch": 0.8648359486447932,
"grad_norm": 1.2663660049438477,
"learning_rate": 4.042927642223798e-05,
"loss": 0.3989,
"num_input_tokens_seen": 30096256,
"step": 2425
},
{
"epoch": 0.8666191155492154,
"grad_norm": 1.2052220106124878,
"learning_rate": 4.039251784709667e-05,
"loss": 0.4054,
"num_input_tokens_seen": 30159776,
"step": 2430
},
{
"epoch": 0.8684022824536377,
"grad_norm": 1.296916127204895,
"learning_rate": 4.035570559958624e-05,
"loss": 0.4812,
"num_input_tokens_seen": 30220400,
"step": 2435
},
{
"epoch": 0.8701854493580599,
"grad_norm": 1.4973492622375488,
"learning_rate": 4.0318839808067796e-05,
"loss": 0.4122,
"num_input_tokens_seen": 30280144,
"step": 2440
},
{
"epoch": 0.8719686162624821,
"grad_norm": 1.5270328521728516,
"learning_rate": 4.028192060108914e-05,
"loss": 0.4983,
"num_input_tokens_seen": 30338336,
"step": 2445
},
{
"epoch": 0.8737517831669044,
"grad_norm": 0.987087070941925,
"learning_rate": 4.024494810738432e-05,
"loss": 0.4126,
"num_input_tokens_seen": 30401520,
"step": 2450
},
{
"epoch": 0.8755349500713266,
"grad_norm": 1.3958648443222046,
"learning_rate": 4.0207922455873205e-05,
"loss": 0.4404,
"num_input_tokens_seen": 30463280,
"step": 2455
},
{
"epoch": 0.8773181169757489,
"grad_norm": 1.5073976516723633,
"learning_rate": 4.0170843775661025e-05,
"loss": 0.3789,
"num_input_tokens_seen": 30525376,
"step": 2460
},
{
"epoch": 0.8791012838801712,
"grad_norm": 1.4449955224990845,
"learning_rate": 4.0133712196037906e-05,
"loss": 0.4454,
"num_input_tokens_seen": 30589792,
"step": 2465
},
{
"epoch": 0.8808844507845934,
"grad_norm": 1.3354792594909668,
"learning_rate": 4.009652784647843e-05,
"loss": 0.4149,
"num_input_tokens_seen": 30652512,
"step": 2470
},
{
"epoch": 0.8826676176890157,
"grad_norm": 1.1937047243118286,
"learning_rate": 4.005929085664119e-05,
"loss": 0.4385,
"num_input_tokens_seen": 30715456,
"step": 2475
},
{
"epoch": 0.884450784593438,
"grad_norm": 1.794368863105774,
"learning_rate": 4.002200135636832e-05,
"loss": 0.4807,
"num_input_tokens_seen": 30779760,
"step": 2480
},
{
"epoch": 0.8862339514978602,
"grad_norm": 1.3825370073318481,
"learning_rate": 3.998465947568506e-05,
"loss": 0.4214,
"num_input_tokens_seen": 30841680,
"step": 2485
},
{
"epoch": 0.8880171184022825,
"grad_norm": 1.2612850666046143,
"learning_rate": 3.9947265344799304e-05,
"loss": 0.3469,
"num_input_tokens_seen": 30904864,
"step": 2490
},
{
"epoch": 0.8898002853067047,
"grad_norm": 1.769665002822876,
"learning_rate": 3.990981909410112e-05,
"loss": 0.4782,
"num_input_tokens_seen": 30966048,
"step": 2495
},
{
"epoch": 0.891583452211127,
"grad_norm": 1.4009034633636475,
"learning_rate": 3.9872320854162324e-05,
"loss": 0.4822,
"num_input_tokens_seen": 31031488,
"step": 2500
},
{
"epoch": 0.8933666191155493,
"grad_norm": 1.411489725112915,
"learning_rate": 3.9834770755736014e-05,
"loss": 0.4813,
"num_input_tokens_seen": 31093488,
"step": 2505
},
{
"epoch": 0.8951497860199714,
"grad_norm": 1.448851227760315,
"learning_rate": 3.979716892975612e-05,
"loss": 0.4946,
"num_input_tokens_seen": 31156816,
"step": 2510
},
{
"epoch": 0.8969329529243937,
"grad_norm": 1.247158169746399,
"learning_rate": 3.975951550733693e-05,
"loss": 0.4141,
"num_input_tokens_seen": 31220592,
"step": 2515
},
{
"epoch": 0.8987161198288159,
"grad_norm": 1.0128424167633057,
"learning_rate": 3.9721810619772636e-05,
"loss": 0.4432,
"num_input_tokens_seen": 31283920,
"step": 2520
},
{
"epoch": 0.9004992867332382,
"grad_norm": 1.1553157567977905,
"learning_rate": 3.968405439853691e-05,
"loss": 0.4436,
"num_input_tokens_seen": 31348656,
"step": 2525
},
{
"epoch": 0.9022824536376605,
"grad_norm": 2.350649356842041,
"learning_rate": 3.964624697528241e-05,
"loss": 0.4768,
"num_input_tokens_seen": 31407968,
"step": 2530
},
{
"epoch": 0.9040656205420827,
"grad_norm": 1.277190089225769,
"learning_rate": 3.960838848184032e-05,
"loss": 0.4741,
"num_input_tokens_seen": 31473280,
"step": 2535
},
{
"epoch": 0.905848787446505,
"grad_norm": 1.0319541692733765,
"learning_rate": 3.957047905021991e-05,
"loss": 0.4863,
"num_input_tokens_seen": 31536784,
"step": 2540
},
{
"epoch": 0.9076319543509273,
"grad_norm": 1.1045851707458496,
"learning_rate": 3.9532518812608075e-05,
"loss": 0.442,
"num_input_tokens_seen": 31599904,
"step": 2545
},
{
"epoch": 0.9094151212553495,
"grad_norm": 1.2741860151290894,
"learning_rate": 3.949450790136885e-05,
"loss": 0.4147,
"num_input_tokens_seen": 31660832,
"step": 2550
},
{
"epoch": 0.9111982881597718,
"grad_norm": 1.344022274017334,
"learning_rate": 3.9456446449042967e-05,
"loss": 0.4988,
"num_input_tokens_seen": 31725776,
"step": 2555
},
{
"epoch": 0.912981455064194,
"grad_norm": 1.1035910844802856,
"learning_rate": 3.9418334588347406e-05,
"loss": 0.3968,
"num_input_tokens_seen": 31785440,
"step": 2560
},
{
"epoch": 0.9147646219686163,
"grad_norm": 1.2287249565124512,
"learning_rate": 3.9380172452174894e-05,
"loss": 0.4313,
"num_input_tokens_seen": 31847136,
"step": 2565
},
{
"epoch": 0.9165477888730386,
"grad_norm": 1.5301395654678345,
"learning_rate": 3.9341960173593495e-05,
"loss": 0.4506,
"num_input_tokens_seen": 31909488,
"step": 2570
},
{
"epoch": 0.9183309557774608,
"grad_norm": 1.2145062685012817,
"learning_rate": 3.930369788584607e-05,
"loss": 0.3616,
"num_input_tokens_seen": 31971488,
"step": 2575
},
{
"epoch": 0.920114122681883,
"grad_norm": 1.5201282501220703,
"learning_rate": 3.926538572234991e-05,
"loss": 0.4509,
"num_input_tokens_seen": 32033920,
"step": 2580
},
{
"epoch": 0.9218972895863052,
"grad_norm": 1.2658700942993164,
"learning_rate": 3.9227023816696176e-05,
"loss": 0.4035,
"num_input_tokens_seen": 32095680,
"step": 2585
},
{
"epoch": 0.9236804564907275,
"grad_norm": 2.191631555557251,
"learning_rate": 3.91886123026495e-05,
"loss": 0.5551,
"num_input_tokens_seen": 32153184,
"step": 2590
},
{
"epoch": 0.9254636233951498,
"grad_norm": 1.0200012922286987,
"learning_rate": 3.9150151314147474e-05,
"loss": 0.4772,
"num_input_tokens_seen": 32213536,
"step": 2595
},
{
"epoch": 0.927246790299572,
"grad_norm": 1.1653907299041748,
"learning_rate": 3.911164098530023e-05,
"loss": 0.374,
"num_input_tokens_seen": 32269760,
"step": 2600
},
{
"epoch": 0.9290299572039943,
"grad_norm": 1.4243419170379639,
"learning_rate": 3.907308145038993e-05,
"loss": 0.4263,
"num_input_tokens_seen": 32332112,
"step": 2605
},
{
"epoch": 0.9308131241084165,
"grad_norm": 1.1571168899536133,
"learning_rate": 3.903447284387029e-05,
"loss": 0.3365,
"num_input_tokens_seen": 32392352,
"step": 2610
},
{
"epoch": 0.9325962910128388,
"grad_norm": 1.3121576309204102,
"learning_rate": 3.899581530036619e-05,
"loss": 0.3868,
"num_input_tokens_seen": 32455136,
"step": 2615
},
{
"epoch": 0.9343794579172611,
"grad_norm": 1.7881336212158203,
"learning_rate": 3.89571089546731e-05,
"loss": 0.4519,
"num_input_tokens_seen": 32516496,
"step": 2620
},
{
"epoch": 0.9361626248216833,
"grad_norm": 1.2598882913589478,
"learning_rate": 3.8918353941756684e-05,
"loss": 0.4071,
"num_input_tokens_seen": 32576432,
"step": 2625
},
{
"epoch": 0.9379457917261056,
"grad_norm": 1.1581662893295288,
"learning_rate": 3.8879550396752295e-05,
"loss": 0.4122,
"num_input_tokens_seen": 32639328,
"step": 2630
},
{
"epoch": 0.9397289586305279,
"grad_norm": 1.4587301015853882,
"learning_rate": 3.8840698454964507e-05,
"loss": 0.39,
"num_input_tokens_seen": 32700896,
"step": 2635
},
{
"epoch": 0.9415121255349501,
"grad_norm": 1.2084256410598755,
"learning_rate": 3.880179825186667e-05,
"loss": 0.4211,
"num_input_tokens_seen": 32767616,
"step": 2640
},
{
"epoch": 0.9432952924393724,
"grad_norm": 1.432020664215088,
"learning_rate": 3.8762849923100384e-05,
"loss": 0.3842,
"num_input_tokens_seen": 32829216,
"step": 2645
},
{
"epoch": 0.9450784593437945,
"grad_norm": 0.9488567113876343,
"learning_rate": 3.8723853604475104e-05,
"loss": 0.4145,
"num_input_tokens_seen": 32894624,
"step": 2650
},
{
"epoch": 0.9468616262482168,
"grad_norm": 1.3725416660308838,
"learning_rate": 3.8684809431967576e-05,
"loss": 0.4174,
"num_input_tokens_seen": 32957296,
"step": 2655
},
{
"epoch": 0.948644793152639,
"grad_norm": 1.5407421588897705,
"learning_rate": 3.864571754172144e-05,
"loss": 0.4154,
"num_input_tokens_seen": 33020944,
"step": 2660
},
{
"epoch": 0.9504279600570613,
"grad_norm": 0.9749571681022644,
"learning_rate": 3.8606578070046715e-05,
"loss": 0.3845,
"num_input_tokens_seen": 33083056,
"step": 2665
},
{
"epoch": 0.9522111269614836,
"grad_norm": 0.9800275564193726,
"learning_rate": 3.856739115341933e-05,
"loss": 0.3894,
"num_input_tokens_seen": 33144688,
"step": 2670
},
{
"epoch": 0.9539942938659058,
"grad_norm": 1.2863636016845703,
"learning_rate": 3.852815692848064e-05,
"loss": 0.4649,
"num_input_tokens_seen": 33208288,
"step": 2675
},
{
"epoch": 0.9557774607703281,
"grad_norm": 1.2792158126831055,
"learning_rate": 3.8488875532036975e-05,
"loss": 0.452,
"num_input_tokens_seen": 33264672,
"step": 2680
},
{
"epoch": 0.9575606276747504,
"grad_norm": 1.3720332384109497,
"learning_rate": 3.8449547101059135e-05,
"loss": 0.4364,
"num_input_tokens_seen": 33323904,
"step": 2685
},
{
"epoch": 0.9593437945791726,
"grad_norm": 1.337188482284546,
"learning_rate": 3.8410171772681955e-05,
"loss": 0.3704,
"num_input_tokens_seen": 33385568,
"step": 2690
},
{
"epoch": 0.9611269614835949,
"grad_norm": 1.5518020391464233,
"learning_rate": 3.837074968420376e-05,
"loss": 0.4792,
"num_input_tokens_seen": 33448144,
"step": 2695
},
{
"epoch": 0.9629101283880172,
"grad_norm": 1.4631222486495972,
"learning_rate": 3.833128097308594e-05,
"loss": 0.3936,
"num_input_tokens_seen": 33508960,
"step": 2700
},
{
"epoch": 0.9646932952924394,
"grad_norm": 1.750189185142517,
"learning_rate": 3.829176577695246e-05,
"loss": 0.4272,
"num_input_tokens_seen": 33572384,
"step": 2705
},
{
"epoch": 0.9664764621968617,
"grad_norm": 1.2178962230682373,
"learning_rate": 3.825220423358936e-05,
"loss": 0.4352,
"num_input_tokens_seen": 33631776,
"step": 2710
},
{
"epoch": 0.9682596291012838,
"grad_norm": 1.262508749961853,
"learning_rate": 3.8212596480944294e-05,
"loss": 0.4359,
"num_input_tokens_seen": 33695792,
"step": 2715
},
{
"epoch": 0.9700427960057061,
"grad_norm": 1.2347853183746338,
"learning_rate": 3.817294265712606e-05,
"loss": 0.3832,
"num_input_tokens_seen": 33759456,
"step": 2720
},
{
"epoch": 0.9718259629101283,
"grad_norm": 1.334446907043457,
"learning_rate": 3.813324290040408e-05,
"loss": 0.4191,
"num_input_tokens_seen": 33820352,
"step": 2725
},
{
"epoch": 0.9736091298145506,
"grad_norm": 1.4268221855163574,
"learning_rate": 3.809349734920793e-05,
"loss": 0.4254,
"num_input_tokens_seen": 33882016,
"step": 2730
},
{
"epoch": 0.9753922967189729,
"grad_norm": 1.6015069484710693,
"learning_rate": 3.805370614212692e-05,
"loss": 0.4804,
"num_input_tokens_seen": 33943184,
"step": 2735
},
{
"epoch": 0.9771754636233951,
"grad_norm": 1.2011687755584717,
"learning_rate": 3.8013869417909496e-05,
"loss": 0.5089,
"num_input_tokens_seen": 34009888,
"step": 2740
},
{
"epoch": 0.9789586305278174,
"grad_norm": 1.290658712387085,
"learning_rate": 3.797398731546286e-05,
"loss": 0.4389,
"num_input_tokens_seen": 34070576,
"step": 2745
},
{
"epoch": 0.9807417974322397,
"grad_norm": 1.0900776386260986,
"learning_rate": 3.793405997385242e-05,
"loss": 0.3871,
"num_input_tokens_seen": 34129760,
"step": 2750
},
{
"epoch": 0.9825249643366619,
"grad_norm": 1.4768859148025513,
"learning_rate": 3.789408753230135e-05,
"loss": 0.3302,
"num_input_tokens_seen": 34193408,
"step": 2755
},
{
"epoch": 0.9843081312410842,
"grad_norm": 1.168250322341919,
"learning_rate": 3.785407013019006e-05,
"loss": 0.4042,
"num_input_tokens_seen": 34255024,
"step": 2760
},
{
"epoch": 0.9860912981455064,
"grad_norm": 1.1951143741607666,
"learning_rate": 3.781400790705576e-05,
"loss": 0.4567,
"num_input_tokens_seen": 34319424,
"step": 2765
},
{
"epoch": 0.9878744650499287,
"grad_norm": 1.4566255807876587,
"learning_rate": 3.777390100259192e-05,
"loss": 0.4034,
"num_input_tokens_seen": 34384912,
"step": 2770
},
{
"epoch": 0.989657631954351,
"grad_norm": 1.7778925895690918,
"learning_rate": 3.773374955664782e-05,
"loss": 0.4801,
"num_input_tokens_seen": 34445488,
"step": 2775
},
{
"epoch": 0.9914407988587732,
"grad_norm": 1.0619663000106812,
"learning_rate": 3.769355370922807e-05,
"loss": 0.3374,
"num_input_tokens_seen": 34504400,
"step": 2780
},
{
"epoch": 0.9932239657631954,
"grad_norm": 1.3074612617492676,
"learning_rate": 3.765331360049208e-05,
"loss": 0.5528,
"num_input_tokens_seen": 34568560,
"step": 2785
},
{
"epoch": 0.9950071326676176,
"grad_norm": 1.1420252323150635,
"learning_rate": 3.761302937075361e-05,
"loss": 0.4356,
"num_input_tokens_seen": 34631808,
"step": 2790
},
{
"epoch": 0.9967902995720399,
"grad_norm": 1.2096112966537476,
"learning_rate": 3.7572701160480254e-05,
"loss": 0.4348,
"num_input_tokens_seen": 34692560,
"step": 2795
},
{
"epoch": 0.9985734664764622,
"grad_norm": 1.4141584634780884,
"learning_rate": 3.7532329110292966e-05,
"loss": 0.383,
"num_input_tokens_seen": 34754128,
"step": 2800
},
{
"epoch": 1.0003566333808844,
"grad_norm": 1.2436716556549072,
"learning_rate": 3.749191336096558e-05,
"loss": 0.4626,
"num_input_tokens_seen": 34817616,
"step": 2805
},
{
"epoch": 1.0021398002853068,
"grad_norm": 1.2323740720748901,
"learning_rate": 3.745145405342429e-05,
"loss": 0.4048,
"num_input_tokens_seen": 34878544,
"step": 2810
},
{
"epoch": 1.003922967189729,
"grad_norm": 1.0738112926483154,
"learning_rate": 3.741095132874718e-05,
"loss": 0.2812,
"num_input_tokens_seen": 34940416,
"step": 2815
},
{
"epoch": 1.005706134094151,
"grad_norm": 1.168542742729187,
"learning_rate": 3.73704053281637e-05,
"loss": 0.4261,
"num_input_tokens_seen": 35004160,
"step": 2820
},
{
"epoch": 1.0074893009985735,
"grad_norm": 1.366598129272461,
"learning_rate": 3.7329816193054265e-05,
"loss": 0.4108,
"num_input_tokens_seen": 35066304,
"step": 2825
},
{
"epoch": 1.0092724679029956,
"grad_norm": 1.1343327760696411,
"learning_rate": 3.728918406494962e-05,
"loss": 0.3122,
"num_input_tokens_seen": 35128512,
"step": 2830
},
{
"epoch": 1.011055634807418,
"grad_norm": 1.558720588684082,
"learning_rate": 3.7248509085530465e-05,
"loss": 0.358,
"num_input_tokens_seen": 35188704,
"step": 2835
},
{
"epoch": 1.0128388017118402,
"grad_norm": 1.211642861366272,
"learning_rate": 3.720779139662691e-05,
"loss": 0.4065,
"num_input_tokens_seen": 35254032,
"step": 2840
},
{
"epoch": 1.0146219686162625,
"grad_norm": 1.1981390714645386,
"learning_rate": 3.7167031140218e-05,
"loss": 0.4427,
"num_input_tokens_seen": 35319520,
"step": 2845
},
{
"epoch": 1.0164051355206847,
"grad_norm": 1.1277800798416138,
"learning_rate": 3.712622845843119e-05,
"loss": 0.3631,
"num_input_tokens_seen": 35377936,
"step": 2850
},
{
"epoch": 1.018188302425107,
"grad_norm": 1.461593508720398,
"learning_rate": 3.708538349354189e-05,
"loss": 0.4219,
"num_input_tokens_seen": 35443072,
"step": 2855
},
{
"epoch": 1.0199714693295292,
"grad_norm": 1.250154733657837,
"learning_rate": 3.7044496387972914e-05,
"loss": 0.3502,
"num_input_tokens_seen": 35504144,
"step": 2860
},
{
"epoch": 1.0217546362339516,
"grad_norm": 1.126944661140442,
"learning_rate": 3.700356728429405e-05,
"loss": 0.3805,
"num_input_tokens_seen": 35566224,
"step": 2865
},
{
"epoch": 1.0235378031383737,
"grad_norm": 1.158125638961792,
"learning_rate": 3.696259632522152e-05,
"loss": 0.4136,
"num_input_tokens_seen": 35626384,
"step": 2870
},
{
"epoch": 1.025320970042796,
"grad_norm": 1.3660660982131958,
"learning_rate": 3.6921583653617476e-05,
"loss": 0.3744,
"num_input_tokens_seen": 35691424,
"step": 2875
},
{
"epoch": 1.0271041369472182,
"grad_norm": 1.327812671661377,
"learning_rate": 3.688052941248956e-05,
"loss": 0.3597,
"num_input_tokens_seen": 35752480,
"step": 2880
},
{
"epoch": 1.0288873038516406,
"grad_norm": 1.474350929260254,
"learning_rate": 3.683943374499031e-05,
"loss": 0.3859,
"num_input_tokens_seen": 35814640,
"step": 2885
},
{
"epoch": 1.0306704707560628,
"grad_norm": 1.1447596549987793,
"learning_rate": 3.679829679441674e-05,
"loss": 0.314,
"num_input_tokens_seen": 35874480,
"step": 2890
},
{
"epoch": 1.032453637660485,
"grad_norm": 1.4844218492507935,
"learning_rate": 3.675711870420983e-05,
"loss": 0.4063,
"num_input_tokens_seen": 35936528,
"step": 2895
},
{
"epoch": 1.0342368045649073,
"grad_norm": 1.397241234779358,
"learning_rate": 3.671589961795399e-05,
"loss": 0.3333,
"num_input_tokens_seen": 35999232,
"step": 2900
},
{
"epoch": 1.0360199714693294,
"grad_norm": 1.4940799474716187,
"learning_rate": 3.667463967937657e-05,
"loss": 0.4014,
"num_input_tokens_seen": 36060480,
"step": 2905
},
{
"epoch": 1.0378031383737518,
"grad_norm": 1.0337133407592773,
"learning_rate": 3.663333903234739e-05,
"loss": 0.3575,
"num_input_tokens_seen": 36125664,
"step": 2910
},
{
"epoch": 1.039586305278174,
"grad_norm": 1.4721628427505493,
"learning_rate": 3.659199782087821e-05,
"loss": 0.4124,
"num_input_tokens_seen": 36192256,
"step": 2915
},
{
"epoch": 1.0413694721825963,
"grad_norm": 1.5297164916992188,
"learning_rate": 3.655061618912224e-05,
"loss": 0.3779,
"num_input_tokens_seen": 36254208,
"step": 2920
},
{
"epoch": 1.0431526390870185,
"grad_norm": 1.5214314460754395,
"learning_rate": 3.650919428137362e-05,
"loss": 0.4273,
"num_input_tokens_seen": 36314944,
"step": 2925
},
{
"epoch": 1.0449358059914409,
"grad_norm": 1.232298731803894,
"learning_rate": 3.6467732242066936e-05,
"loss": 0.4427,
"num_input_tokens_seen": 36376832,
"step": 2930
},
{
"epoch": 1.046718972895863,
"grad_norm": 1.0849212408065796,
"learning_rate": 3.64262302157767e-05,
"loss": 0.3674,
"num_input_tokens_seen": 36440064,
"step": 2935
},
{
"epoch": 1.0485021398002854,
"grad_norm": 1.261112928390503,
"learning_rate": 3.6384688347216875e-05,
"loss": 0.3893,
"num_input_tokens_seen": 36501520,
"step": 2940
},
{
"epoch": 1.0502853067047075,
"grad_norm": 1.5491517782211304,
"learning_rate": 3.634310678124033e-05,
"loss": 0.4026,
"num_input_tokens_seen": 36563360,
"step": 2945
},
{
"epoch": 1.05206847360913,
"grad_norm": 1.3838887214660645,
"learning_rate": 3.630148566283837e-05,
"loss": 0.376,
"num_input_tokens_seen": 36624576,
"step": 2950
},
{
"epoch": 1.053851640513552,
"grad_norm": 1.4949077367782593,
"learning_rate": 3.6259825137140214e-05,
"loss": 0.46,
"num_input_tokens_seen": 36688448,
"step": 2955
},
{
"epoch": 1.0556348074179742,
"grad_norm": 1.5406221151351929,
"learning_rate": 3.621812534941246e-05,
"loss": 0.4085,
"num_input_tokens_seen": 36749520,
"step": 2960
},
{
"epoch": 1.0574179743223966,
"grad_norm": 1.39425528049469,
"learning_rate": 3.6176386445058666e-05,
"loss": 0.3876,
"num_input_tokens_seen": 36816208,
"step": 2965
},
{
"epoch": 1.0592011412268187,
"grad_norm": 1.120383858680725,
"learning_rate": 3.6134608569618754e-05,
"loss": 0.3333,
"num_input_tokens_seen": 36877008,
"step": 2970
},
{
"epoch": 1.0609843081312411,
"grad_norm": 2.9805078506469727,
"learning_rate": 3.609279186876853e-05,
"loss": 0.3594,
"num_input_tokens_seen": 36939824,
"step": 2975
},
{
"epoch": 1.0627674750356633,
"grad_norm": 1.165252685546875,
"learning_rate": 3.605093648831917e-05,
"loss": 0.3132,
"num_input_tokens_seen": 37002496,
"step": 2980
},
{
"epoch": 1.0645506419400856,
"grad_norm": 1.2931095361709595,
"learning_rate": 3.600904257421677e-05,
"loss": 0.3781,
"num_input_tokens_seen": 37064128,
"step": 2985
},
{
"epoch": 1.0663338088445078,
"grad_norm": 1.4180676937103271,
"learning_rate": 3.5967110272541745e-05,
"loss": 0.383,
"num_input_tokens_seen": 37124272,
"step": 2990
},
{
"epoch": 1.0681169757489302,
"grad_norm": 1.0311819314956665,
"learning_rate": 3.592513972950837e-05,
"loss": 0.3219,
"num_input_tokens_seen": 37184720,
"step": 2995
},
{
"epoch": 1.0699001426533523,
"grad_norm": 1.1849727630615234,
"learning_rate": 3.58831310914643e-05,
"loss": 0.3799,
"num_input_tokens_seen": 37248832,
"step": 3000
},
{
"epoch": 1.0716833095577747,
"grad_norm": 1.2104076147079468,
"learning_rate": 3.5841084504889974e-05,
"loss": 0.3072,
"num_input_tokens_seen": 37310960,
"step": 3005
},
{
"epoch": 1.0734664764621968,
"grad_norm": 1.3508752584457397,
"learning_rate": 3.5799000116398184e-05,
"loss": 0.3319,
"num_input_tokens_seen": 37368320,
"step": 3010
},
{
"epoch": 1.0752496433666192,
"grad_norm": 1.4133416414260864,
"learning_rate": 3.575687807273352e-05,
"loss": 0.3714,
"num_input_tokens_seen": 37428336,
"step": 3015
},
{
"epoch": 1.0770328102710414,
"grad_norm": 1.8722141981124878,
"learning_rate": 3.5714718520771904e-05,
"loss": 0.4047,
"num_input_tokens_seen": 37493216,
"step": 3020
},
{
"epoch": 1.0788159771754637,
"grad_norm": 1.4663044214248657,
"learning_rate": 3.5672521607519994e-05,
"loss": 0.4123,
"num_input_tokens_seen": 37555504,
"step": 3025
},
{
"epoch": 1.0805991440798859,
"grad_norm": 1.102156162261963,
"learning_rate": 3.563028748011476e-05,
"loss": 0.3994,
"num_input_tokens_seen": 37617728,
"step": 3030
},
{
"epoch": 1.082382310984308,
"grad_norm": 1.8379590511322021,
"learning_rate": 3.5588016285822936e-05,
"loss": 0.3766,
"num_input_tokens_seen": 37680176,
"step": 3035
},
{
"epoch": 1.0841654778887304,
"grad_norm": 1.6983126401901245,
"learning_rate": 3.554570817204048e-05,
"loss": 0.4254,
"num_input_tokens_seen": 37744656,
"step": 3040
},
{
"epoch": 1.0859486447931526,
"grad_norm": 1.4322010278701782,
"learning_rate": 3.550336328629211e-05,
"loss": 0.3099,
"num_input_tokens_seen": 37802912,
"step": 3045
},
{
"epoch": 1.087731811697575,
"grad_norm": 1.4063708782196045,
"learning_rate": 3.546098177623075e-05,
"loss": 0.3445,
"num_input_tokens_seen": 37863504,
"step": 3050
},
{
"epoch": 1.089514978601997,
"grad_norm": 1.2764664888381958,
"learning_rate": 3.541856378963704e-05,
"loss": 0.375,
"num_input_tokens_seen": 37922832,
"step": 3055
},
{
"epoch": 1.0912981455064195,
"grad_norm": 1.6607260704040527,
"learning_rate": 3.53761094744188e-05,
"loss": 0.4187,
"num_input_tokens_seen": 37985536,
"step": 3060
},
{
"epoch": 1.0930813124108416,
"grad_norm": 1.5022599697113037,
"learning_rate": 3.533361897861053e-05,
"loss": 0.454,
"num_input_tokens_seen": 38045824,
"step": 3065
},
{
"epoch": 1.094864479315264,
"grad_norm": 1.4386935234069824,
"learning_rate": 3.529109245037289e-05,
"loss": 0.4023,
"num_input_tokens_seen": 38104704,
"step": 3070
},
{
"epoch": 1.0966476462196861,
"grad_norm": 1.9962483644485474,
"learning_rate": 3.524853003799218e-05,
"loss": 0.3831,
"num_input_tokens_seen": 38165632,
"step": 3075
},
{
"epoch": 1.0984308131241085,
"grad_norm": 1.5470753908157349,
"learning_rate": 3.520593188987982e-05,
"loss": 0.4213,
"num_input_tokens_seen": 38228720,
"step": 3080
},
{
"epoch": 1.1002139800285307,
"grad_norm": 0.9549476504325867,
"learning_rate": 3.516329815457184e-05,
"loss": 0.4021,
"num_input_tokens_seen": 38291936,
"step": 3085
},
{
"epoch": 1.1019971469329528,
"grad_norm": 1.1868444681167603,
"learning_rate": 3.512062898072838e-05,
"loss": 0.3664,
"num_input_tokens_seen": 38353984,
"step": 3090
},
{
"epoch": 1.1037803138373752,
"grad_norm": 1.822995901107788,
"learning_rate": 3.5077924517133114e-05,
"loss": 0.3468,
"num_input_tokens_seen": 38416352,
"step": 3095
},
{
"epoch": 1.1055634807417973,
"grad_norm": 1.369612455368042,
"learning_rate": 3.503518491269279e-05,
"loss": 0.3248,
"num_input_tokens_seen": 38479152,
"step": 3100
},
{
"epoch": 1.1073466476462197,
"grad_norm": 1.3822919130325317,
"learning_rate": 3.49924103164367e-05,
"loss": 0.4058,
"num_input_tokens_seen": 38541776,
"step": 3105
},
{
"epoch": 1.1091298145506419,
"grad_norm": 1.54763925075531,
"learning_rate": 3.4949600877516126e-05,
"loss": 0.4997,
"num_input_tokens_seen": 38604704,
"step": 3110
},
{
"epoch": 1.1109129814550642,
"grad_norm": 1.6321046352386475,
"learning_rate": 3.490675674520385e-05,
"loss": 0.3429,
"num_input_tokens_seen": 38669408,
"step": 3115
},
{
"epoch": 1.1126961483594864,
"grad_norm": 1.2626839876174927,
"learning_rate": 3.4863878068893625e-05,
"loss": 0.3938,
"num_input_tokens_seen": 38732240,
"step": 3120
},
{
"epoch": 1.1144793152639088,
"grad_norm": 1.724424123764038,
"learning_rate": 3.482096499809967e-05,
"loss": 0.3934,
"num_input_tokens_seen": 38796240,
"step": 3125
},
{
"epoch": 1.116262482168331,
"grad_norm": 1.5761538743972778,
"learning_rate": 3.477801768245614e-05,
"loss": 0.3706,
"num_input_tokens_seen": 38859568,
"step": 3130
},
{
"epoch": 1.1180456490727533,
"grad_norm": 1.656040906906128,
"learning_rate": 3.473503627171655e-05,
"loss": 0.4048,
"num_input_tokens_seen": 38919472,
"step": 3135
},
{
"epoch": 1.1198288159771754,
"grad_norm": 1.6139919757843018,
"learning_rate": 3.469202091575337e-05,
"loss": 0.3609,
"num_input_tokens_seen": 38981744,
"step": 3140
},
{
"epoch": 1.1216119828815978,
"grad_norm": 1.5200504064559937,
"learning_rate": 3.464897176455737e-05,
"loss": 0.3309,
"num_input_tokens_seen": 39040848,
"step": 3145
},
{
"epoch": 1.12339514978602,
"grad_norm": 1.6206810474395752,
"learning_rate": 3.460588896823721e-05,
"loss": 0.4394,
"num_input_tokens_seen": 39100560,
"step": 3150
},
{
"epoch": 1.1251783166904423,
"grad_norm": 1.4544352293014526,
"learning_rate": 3.456277267701884e-05,
"loss": 0.3477,
"num_input_tokens_seen": 39163216,
"step": 3155
},
{
"epoch": 1.1269614835948645,
"grad_norm": 1.2673510313034058,
"learning_rate": 3.4519623041245026e-05,
"loss": 0.4006,
"num_input_tokens_seen": 39228880,
"step": 3160
},
{
"epoch": 1.1287446504992866,
"grad_norm": 1.2291406393051147,
"learning_rate": 3.447644021137477e-05,
"loss": 0.4547,
"num_input_tokens_seen": 39290176,
"step": 3165
},
{
"epoch": 1.130527817403709,
"grad_norm": 1.1247378587722778,
"learning_rate": 3.443322433798285e-05,
"loss": 0.361,
"num_input_tokens_seen": 39353136,
"step": 3170
},
{
"epoch": 1.1323109843081312,
"grad_norm": 1.3517816066741943,
"learning_rate": 3.438997557175925e-05,
"loss": 0.3889,
"num_input_tokens_seen": 39414672,
"step": 3175
},
{
"epoch": 1.1340941512125535,
"grad_norm": 1.177815318107605,
"learning_rate": 3.434669406350866e-05,
"loss": 0.4059,
"num_input_tokens_seen": 39474240,
"step": 3180
},
{
"epoch": 1.1358773181169757,
"grad_norm": 1.392850399017334,
"learning_rate": 3.430337996414991e-05,
"loss": 0.339,
"num_input_tokens_seen": 39537376,
"step": 3185
},
{
"epoch": 1.137660485021398,
"grad_norm": 1.5260528326034546,
"learning_rate": 3.4260033424715504e-05,
"loss": 0.4236,
"num_input_tokens_seen": 39597264,
"step": 3190
},
{
"epoch": 1.1394436519258202,
"grad_norm": 1.3453905582427979,
"learning_rate": 3.421665459635105e-05,
"loss": 0.3432,
"num_input_tokens_seen": 39659008,
"step": 3195
},
{
"epoch": 1.1412268188302426,
"grad_norm": 1.3098467588424683,
"learning_rate": 3.4173243630314754e-05,
"loss": 0.3199,
"num_input_tokens_seen": 39719792,
"step": 3200
},
{
"epoch": 1.1430099857346647,
"grad_norm": 1.5861715078353882,
"learning_rate": 3.4129800677976846e-05,
"loss": 0.4334,
"num_input_tokens_seen": 39781200,
"step": 3205
},
{
"epoch": 1.144793152639087,
"grad_norm": 1.5333304405212402,
"learning_rate": 3.408632589081915e-05,
"loss": 0.4005,
"num_input_tokens_seen": 39843616,
"step": 3210
},
{
"epoch": 1.1465763195435092,
"grad_norm": 1.2974801063537598,
"learning_rate": 3.4042819420434437e-05,
"loss": 0.4372,
"num_input_tokens_seen": 39909440,
"step": 3215
},
{
"epoch": 1.1483594864479316,
"grad_norm": 1.7935795783996582,
"learning_rate": 3.399928141852599e-05,
"loss": 0.4579,
"num_input_tokens_seen": 39972400,
"step": 3220
},
{
"epoch": 1.1501426533523538,
"grad_norm": 1.1913748979568481,
"learning_rate": 3.395571203690703e-05,
"loss": 0.4283,
"num_input_tokens_seen": 40035248,
"step": 3225
},
{
"epoch": 1.1519258202567761,
"grad_norm": 1.4842363595962524,
"learning_rate": 3.3912111427500205e-05,
"loss": 0.4128,
"num_input_tokens_seen": 40094368,
"step": 3230
},
{
"epoch": 1.1537089871611983,
"grad_norm": 1.7173250913619995,
"learning_rate": 3.3868479742337024e-05,
"loss": 0.3723,
"num_input_tokens_seen": 40156592,
"step": 3235
},
{
"epoch": 1.1554921540656204,
"grad_norm": 1.351295828819275,
"learning_rate": 3.382481713355738e-05,
"loss": 0.3311,
"num_input_tokens_seen": 40221712,
"step": 3240
},
{
"epoch": 1.1572753209700428,
"grad_norm": 1.3807501792907715,
"learning_rate": 3.3781123753409e-05,
"loss": 0.3431,
"num_input_tokens_seen": 40284224,
"step": 3245
},
{
"epoch": 1.159058487874465,
"grad_norm": 1.1675392389297485,
"learning_rate": 3.3737399754246875e-05,
"loss": 0.4291,
"num_input_tokens_seen": 40347328,
"step": 3250
},
{
"epoch": 1.1608416547788873,
"grad_norm": 1.3195664882659912,
"learning_rate": 3.36936452885328e-05,
"loss": 0.3906,
"num_input_tokens_seen": 40414544,
"step": 3255
},
{
"epoch": 1.1626248216833095,
"grad_norm": 1.4810302257537842,
"learning_rate": 3.364986050883476e-05,
"loss": 0.2888,
"num_input_tokens_seen": 40476528,
"step": 3260
},
{
"epoch": 1.1644079885877319,
"grad_norm": 1.142104983329773,
"learning_rate": 3.360604556782649e-05,
"loss": 0.3662,
"num_input_tokens_seen": 40539248,
"step": 3265
},
{
"epoch": 1.166191155492154,
"grad_norm": 1.2931478023529053,
"learning_rate": 3.356220061828689e-05,
"loss": 0.3909,
"num_input_tokens_seen": 40600832,
"step": 3270
},
{
"epoch": 1.1679743223965764,
"grad_norm": 1.4515559673309326,
"learning_rate": 3.351832581309944e-05,
"loss": 0.3883,
"num_input_tokens_seen": 40663600,
"step": 3275
},
{
"epoch": 1.1697574893009985,
"grad_norm": 1.3096765279769897,
"learning_rate": 3.3474421305251785e-05,
"loss": 0.4206,
"num_input_tokens_seen": 40727616,
"step": 3280
},
{
"epoch": 1.171540656205421,
"grad_norm": 1.3618152141571045,
"learning_rate": 3.343048724783512e-05,
"loss": 0.4285,
"num_input_tokens_seen": 40793168,
"step": 3285
},
{
"epoch": 1.173323823109843,
"grad_norm": 1.4577500820159912,
"learning_rate": 3.3386523794043677e-05,
"loss": 0.3513,
"num_input_tokens_seen": 40856128,
"step": 3290
},
{
"epoch": 1.1751069900142652,
"grad_norm": 1.5820814371109009,
"learning_rate": 3.334253109717419e-05,
"loss": 0.3188,
"num_input_tokens_seen": 40921200,
"step": 3295
},
{
"epoch": 1.1768901569186876,
"grad_norm": 1.4901388883590698,
"learning_rate": 3.3298509310625363e-05,
"loss": 0.3945,
"num_input_tokens_seen": 40981808,
"step": 3300
},
{
"epoch": 1.17867332382311,
"grad_norm": 1.420837640762329,
"learning_rate": 3.325445858789732e-05,
"loss": 0.3768,
"num_input_tokens_seen": 41042096,
"step": 3305
},
{
"epoch": 1.1804564907275321,
"grad_norm": 1.5397422313690186,
"learning_rate": 3.321037908259111e-05,
"loss": 0.3753,
"num_input_tokens_seen": 41105520,
"step": 3310
},
{
"epoch": 1.1822396576319543,
"grad_norm": 1.6058024168014526,
"learning_rate": 3.3166270948408126e-05,
"loss": 0.3827,
"num_input_tokens_seen": 41168112,
"step": 3315
},
{
"epoch": 1.1840228245363766,
"grad_norm": 1.4254966974258423,
"learning_rate": 3.3122134339149585e-05,
"loss": 0.4157,
"num_input_tokens_seen": 41231664,
"step": 3320
},
{
"epoch": 1.1858059914407988,
"grad_norm": 1.437872290611267,
"learning_rate": 3.3077969408715995e-05,
"loss": 0.3757,
"num_input_tokens_seen": 41293296,
"step": 3325
},
{
"epoch": 1.1875891583452212,
"grad_norm": 1.2461625337600708,
"learning_rate": 3.3033776311106626e-05,
"loss": 0.409,
"num_input_tokens_seen": 41355840,
"step": 3330
},
{
"epoch": 1.1893723252496433,
"grad_norm": 1.0907572507858276,
"learning_rate": 3.2989555200418977e-05,
"loss": 0.3948,
"num_input_tokens_seen": 41415680,
"step": 3335
},
{
"epoch": 1.1911554921540657,
"grad_norm": 1.3984030485153198,
"learning_rate": 3.2945306230848185e-05,
"loss": 0.3062,
"num_input_tokens_seen": 41476864,
"step": 3340
},
{
"epoch": 1.1929386590584878,
"grad_norm": 1.6210757493972778,
"learning_rate": 3.2901029556686555e-05,
"loss": 0.436,
"num_input_tokens_seen": 41541712,
"step": 3345
},
{
"epoch": 1.1947218259629102,
"grad_norm": 1.393109917640686,
"learning_rate": 3.285672533232301e-05,
"loss": 0.4281,
"num_input_tokens_seen": 41606016,
"step": 3350
},
{
"epoch": 1.1965049928673324,
"grad_norm": 1.9764735698699951,
"learning_rate": 3.281239371224252e-05,
"loss": 0.3698,
"num_input_tokens_seen": 41666896,
"step": 3355
},
{
"epoch": 1.1982881597717547,
"grad_norm": 1.2083278894424438,
"learning_rate": 3.276803485102557e-05,
"loss": 0.4166,
"num_input_tokens_seen": 41730864,
"step": 3360
},
{
"epoch": 1.2000713266761769,
"grad_norm": 1.567348599433899,
"learning_rate": 3.2723648903347646e-05,
"loss": 0.3745,
"num_input_tokens_seen": 41792336,
"step": 3365
},
{
"epoch": 1.201854493580599,
"grad_norm": 1.2875126600265503,
"learning_rate": 3.267923602397869e-05,
"loss": 0.3444,
"num_input_tokens_seen": 41850800,
"step": 3370
},
{
"epoch": 1.2036376604850214,
"grad_norm": 1.5980690717697144,
"learning_rate": 3.263479636778255e-05,
"loss": 0.3312,
"num_input_tokens_seen": 41914192,
"step": 3375
},
{
"epoch": 1.2054208273894436,
"grad_norm": 1.565764307975769,
"learning_rate": 3.259033008971642e-05,
"loss": 0.3275,
"num_input_tokens_seen": 41978608,
"step": 3380
},
{
"epoch": 1.207203994293866,
"grad_norm": 1.3292726278305054,
"learning_rate": 3.2545837344830356e-05,
"loss": 0.4084,
"num_input_tokens_seen": 42039376,
"step": 3385
},
{
"epoch": 1.208987161198288,
"grad_norm": 1.6631255149841309,
"learning_rate": 3.2501318288266667e-05,
"loss": 0.3749,
"num_input_tokens_seen": 42103296,
"step": 3390
},
{
"epoch": 1.2107703281027105,
"grad_norm": 2.20603609085083,
"learning_rate": 3.2456773075259437e-05,
"loss": 0.4643,
"num_input_tokens_seen": 42166896,
"step": 3395
},
{
"epoch": 1.2125534950071326,
"grad_norm": 1.4111806154251099,
"learning_rate": 3.241220186113394e-05,
"loss": 0.3692,
"num_input_tokens_seen": 42226960,
"step": 3400
},
{
"epoch": 1.214336661911555,
"grad_norm": 1.2245135307312012,
"learning_rate": 3.236760480130612e-05,
"loss": 0.3737,
"num_input_tokens_seen": 42286720,
"step": 3405
},
{
"epoch": 1.2161198288159771,
"grad_norm": 1.8092842102050781,
"learning_rate": 3.2322982051282044e-05,
"loss": 0.423,
"num_input_tokens_seen": 42349920,
"step": 3410
},
{
"epoch": 1.2179029957203995,
"grad_norm": 1.93547785282135,
"learning_rate": 3.227833376665734e-05,
"loss": 0.3346,
"num_input_tokens_seen": 42413520,
"step": 3415
},
{
"epoch": 1.2196861626248217,
"grad_norm": 1.9991748332977295,
"learning_rate": 3.223366010311671e-05,
"loss": 0.4164,
"num_input_tokens_seen": 42477488,
"step": 3420
},
{
"epoch": 1.221469329529244,
"grad_norm": 1.5855672359466553,
"learning_rate": 3.218896121643331e-05,
"loss": 0.3737,
"num_input_tokens_seen": 42538032,
"step": 3425
},
{
"epoch": 1.2232524964336662,
"grad_norm": 1.3733339309692383,
"learning_rate": 3.214423726246828e-05,
"loss": 0.3339,
"num_input_tokens_seen": 42602288,
"step": 3430
},
{
"epoch": 1.2250356633380886,
"grad_norm": 1.2213727235794067,
"learning_rate": 3.209948839717014e-05,
"loss": 0.3601,
"num_input_tokens_seen": 42662464,
"step": 3435
},
{
"epoch": 1.2268188302425107,
"grad_norm": 1.5007941722869873,
"learning_rate": 3.205471477657428e-05,
"loss": 0.2879,
"num_input_tokens_seen": 42724832,
"step": 3440
},
{
"epoch": 1.2286019971469329,
"grad_norm": 1.0848278999328613,
"learning_rate": 3.200991655680243e-05,
"loss": 0.3755,
"num_input_tokens_seen": 42784336,
"step": 3445
},
{
"epoch": 1.2303851640513552,
"grad_norm": 1.4045600891113281,
"learning_rate": 3.1965093894062084e-05,
"loss": 0.4618,
"num_input_tokens_seen": 42845744,
"step": 3450
},
{
"epoch": 1.2321683309557774,
"grad_norm": 1.537743330001831,
"learning_rate": 3.1920246944645945e-05,
"loss": 0.3932,
"num_input_tokens_seen": 42909840,
"step": 3455
},
{
"epoch": 1.2339514978601998,
"grad_norm": 1.8205797672271729,
"learning_rate": 3.1875375864931426e-05,
"loss": 0.4308,
"num_input_tokens_seen": 42971504,
"step": 3460
},
{
"epoch": 1.235734664764622,
"grad_norm": 1.714099407196045,
"learning_rate": 3.183048081138009e-05,
"loss": 0.4523,
"num_input_tokens_seen": 43033200,
"step": 3465
},
{
"epoch": 1.2375178316690443,
"grad_norm": 1.4274886846542358,
"learning_rate": 3.178556194053706e-05,
"loss": 0.4437,
"num_input_tokens_seen": 43097680,
"step": 3470
},
{
"epoch": 1.2393009985734664,
"grad_norm": 1.469146728515625,
"learning_rate": 3.174061940903053e-05,
"loss": 0.4199,
"num_input_tokens_seen": 43158240,
"step": 3475
},
{
"epoch": 1.2410841654778888,
"grad_norm": 1.4748458862304688,
"learning_rate": 3.1695653373571196e-05,
"loss": 0.417,
"num_input_tokens_seen": 43220000,
"step": 3480
},
{
"epoch": 1.242867332382311,
"grad_norm": 1.4848557710647583,
"learning_rate": 3.16506639909517e-05,
"loss": 0.3459,
"num_input_tokens_seen": 43284592,
"step": 3485
},
{
"epoch": 1.2446504992867333,
"grad_norm": 1.5085371732711792,
"learning_rate": 3.160565141804611e-05,
"loss": 0.4546,
"num_input_tokens_seen": 43343088,
"step": 3490
},
{
"epoch": 1.2464336661911555,
"grad_norm": 1.1072133779525757,
"learning_rate": 3.156061581180936e-05,
"loss": 0.3337,
"num_input_tokens_seen": 43405888,
"step": 3495
},
{
"epoch": 1.2482168330955776,
"grad_norm": 1.4966447353363037,
"learning_rate": 3.1515557329276654e-05,
"loss": 0.3346,
"num_input_tokens_seen": 43468000,
"step": 3500
},
{
"epoch": 1.25,
"grad_norm": 1.5697176456451416,
"learning_rate": 3.147047612756302e-05,
"loss": 0.3192,
"num_input_tokens_seen": 43531968,
"step": 3505
},
{
"epoch": 1.2517831669044224,
"grad_norm": 1.3662910461425781,
"learning_rate": 3.1425372363862676e-05,
"loss": 0.3736,
"num_input_tokens_seen": 43592880,
"step": 3510
},
{
"epoch": 1.2535663338088445,
"grad_norm": 1.7950197458267212,
"learning_rate": 3.1380246195448516e-05,
"loss": 0.4962,
"num_input_tokens_seen": 43659280,
"step": 3515
},
{
"epoch": 1.2553495007132667,
"grad_norm": 1.6455252170562744,
"learning_rate": 3.1335097779671564e-05,
"loss": 0.3387,
"num_input_tokens_seen": 43714592,
"step": 3520
},
{
"epoch": 1.257132667617689,
"grad_norm": 1.551438808441162,
"learning_rate": 3.128992727396041e-05,
"loss": 0.3563,
"num_input_tokens_seen": 43773584,
"step": 3525
},
{
"epoch": 1.2589158345221112,
"grad_norm": 2.015979051589966,
"learning_rate": 3.1244734835820666e-05,
"loss": 0.425,
"num_input_tokens_seen": 43834848,
"step": 3530
},
{
"epoch": 1.2606990014265336,
"grad_norm": 1.8502330780029297,
"learning_rate": 3.119952062283444e-05,
"loss": 0.3942,
"num_input_tokens_seen": 43896768,
"step": 3535
},
{
"epoch": 1.2624821683309557,
"grad_norm": 1.9823004007339478,
"learning_rate": 3.115428479265975e-05,
"loss": 0.3489,
"num_input_tokens_seen": 43956416,
"step": 3540
},
{
"epoch": 1.264265335235378,
"grad_norm": 1.3106993436813354,
"learning_rate": 3.1109027503029994e-05,
"loss": 0.3622,
"num_input_tokens_seen": 44018848,
"step": 3545
},
{
"epoch": 1.2660485021398002,
"grad_norm": 1.3623470067977905,
"learning_rate": 3.10637489117534e-05,
"loss": 0.383,
"num_input_tokens_seen": 44081952,
"step": 3550
},
{
"epoch": 1.2678316690442226,
"grad_norm": 2.47135853767395,
"learning_rate": 3.1018449176712474e-05,
"loss": 0.3496,
"num_input_tokens_seen": 44146544,
"step": 3555
},
{
"epoch": 1.2696148359486448,
"grad_norm": 1.5469810962677002,
"learning_rate": 3.097312845586345e-05,
"loss": 0.3219,
"num_input_tokens_seen": 44207664,
"step": 3560
},
{
"epoch": 1.2713980028530671,
"grad_norm": 1.3339722156524658,
"learning_rate": 3.0927786907235727e-05,
"loss": 0.3731,
"num_input_tokens_seen": 44268848,
"step": 3565
},
{
"epoch": 1.2731811697574893,
"grad_norm": 1.4878593683242798,
"learning_rate": 3.088242468893135e-05,
"loss": 0.4539,
"num_input_tokens_seen": 44329824,
"step": 3570
},
{
"epoch": 1.2749643366619114,
"grad_norm": 1.579049825668335,
"learning_rate": 3.083704195912439e-05,
"loss": 0.3579,
"num_input_tokens_seen": 44389872,
"step": 3575
},
{
"epoch": 1.2767475035663338,
"grad_norm": 1.2362189292907715,
"learning_rate": 3.079163887606051e-05,
"loss": 0.3178,
"num_input_tokens_seen": 44451040,
"step": 3580
},
{
"epoch": 1.2785306704707562,
"grad_norm": 1.586029052734375,
"learning_rate": 3.074621559805629e-05,
"loss": 0.37,
"num_input_tokens_seen": 44514000,
"step": 3585
},
{
"epoch": 1.2803138373751783,
"grad_norm": 2.0480854511260986,
"learning_rate": 3.070077228349875e-05,
"loss": 0.3557,
"num_input_tokens_seen": 44577760,
"step": 3590
},
{
"epoch": 1.2820970042796005,
"grad_norm": 1.724202036857605,
"learning_rate": 3.065530909084477e-05,
"loss": 0.4098,
"num_input_tokens_seen": 44638880,
"step": 3595
},
{
"epoch": 1.2838801711840229,
"grad_norm": 1.251970887184143,
"learning_rate": 3.060982617862053e-05,
"loss": 0.389,
"num_input_tokens_seen": 44705040,
"step": 3600
},
{
"epoch": 1.285663338088445,
"grad_norm": 1.652723789215088,
"learning_rate": 3.0564323705420996e-05,
"loss": 0.3512,
"num_input_tokens_seen": 44769344,
"step": 3605
},
{
"epoch": 1.2874465049928674,
"grad_norm": 1.1588093042373657,
"learning_rate": 3.051880182990932e-05,
"loss": 0.4157,
"num_input_tokens_seen": 44830528,
"step": 3610
},
{
"epoch": 1.2892296718972895,
"grad_norm": 1.424497365951538,
"learning_rate": 3.0473260710816333e-05,
"loss": 0.3897,
"num_input_tokens_seen": 44890592,
"step": 3615
},
{
"epoch": 1.291012838801712,
"grad_norm": 1.8615883588790894,
"learning_rate": 3.042770050693994e-05,
"loss": 0.3806,
"num_input_tokens_seen": 44955440,
"step": 3620
},
{
"epoch": 1.292796005706134,
"grad_norm": 1.5784142017364502,
"learning_rate": 3.0382121377144597e-05,
"loss": 0.3707,
"num_input_tokens_seen": 45015792,
"step": 3625
},
{
"epoch": 1.2945791726105562,
"grad_norm": 1.1894679069519043,
"learning_rate": 3.033652348036078e-05,
"loss": 0.3357,
"num_input_tokens_seen": 45075840,
"step": 3630
},
{
"epoch": 1.2963623395149786,
"grad_norm": 1.3909114599227905,
"learning_rate": 3.0290906975584364e-05,
"loss": 0.3775,
"num_input_tokens_seen": 45138448,
"step": 3635
},
{
"epoch": 1.298145506419401,
"grad_norm": 1.9987061023712158,
"learning_rate": 3.0245272021876144e-05,
"loss": 0.4345,
"num_input_tokens_seen": 45200016,
"step": 3640
},
{
"epoch": 1.2999286733238231,
"grad_norm": 1.595581293106079,
"learning_rate": 3.0199618778361205e-05,
"loss": 0.3906,
"num_input_tokens_seen": 45258832,
"step": 3645
},
{
"epoch": 1.3017118402282453,
"grad_norm": 1.3167765140533447,
"learning_rate": 3.015394740422846e-05,
"loss": 0.3751,
"num_input_tokens_seen": 45317744,
"step": 3650
},
{
"epoch": 1.3034950071326676,
"grad_norm": 1.4946184158325195,
"learning_rate": 3.0108258058730005e-05,
"loss": 0.3753,
"num_input_tokens_seen": 45382832,
"step": 3655
},
{
"epoch": 1.3052781740370898,
"grad_norm": 1.1609739065170288,
"learning_rate": 3.006255090118059e-05,
"loss": 0.3884,
"num_input_tokens_seen": 45445168,
"step": 3660
},
{
"epoch": 1.3070613409415122,
"grad_norm": 1.3701294660568237,
"learning_rate": 3.0016826090957106e-05,
"loss": 0.3847,
"num_input_tokens_seen": 45509216,
"step": 3665
},
{
"epoch": 1.3088445078459343,
"grad_norm": 1.580486536026001,
"learning_rate": 2.9971083787497988e-05,
"loss": 0.3808,
"num_input_tokens_seen": 45572064,
"step": 3670
},
{
"epoch": 1.3106276747503567,
"grad_norm": 1.3107916116714478,
"learning_rate": 2.9925324150302665e-05,
"loss": 0.3296,
"num_input_tokens_seen": 45634000,
"step": 3675
},
{
"epoch": 1.3124108416547788,
"grad_norm": 1.739546298980713,
"learning_rate": 2.9879547338930997e-05,
"loss": 0.4001,
"num_input_tokens_seen": 45695632,
"step": 3680
},
{
"epoch": 1.3141940085592012,
"grad_norm": 1.9113649129867554,
"learning_rate": 2.9833753513002743e-05,
"loss": 0.3726,
"num_input_tokens_seen": 45755824,
"step": 3685
},
{
"epoch": 1.3159771754636234,
"grad_norm": 1.4150328636169434,
"learning_rate": 2.978794283219698e-05,
"loss": 0.3018,
"num_input_tokens_seen": 45817024,
"step": 3690
},
{
"epoch": 1.3177603423680457,
"grad_norm": 1.5835505723953247,
"learning_rate": 2.9742115456251575e-05,
"loss": 0.4116,
"num_input_tokens_seen": 45883600,
"step": 3695
},
{
"epoch": 1.3195435092724679,
"grad_norm": 1.714812159538269,
"learning_rate": 2.9696271544962583e-05,
"loss": 0.4026,
"num_input_tokens_seen": 45946768,
"step": 3700
},
{
"epoch": 1.32132667617689,
"grad_norm": 1.2335606813430786,
"learning_rate": 2.965041125818374e-05,
"loss": 0.3238,
"num_input_tokens_seen": 46003360,
"step": 3705
},
{
"epoch": 1.3231098430813124,
"grad_norm": 1.5695058107376099,
"learning_rate": 2.9604534755825863e-05,
"loss": 0.4024,
"num_input_tokens_seen": 46064304,
"step": 3710
},
{
"epoch": 1.3248930099857348,
"grad_norm": 1.2431243658065796,
"learning_rate": 2.9558642197856322e-05,
"loss": 0.3557,
"num_input_tokens_seen": 46125168,
"step": 3715
},
{
"epoch": 1.326676176890157,
"grad_norm": 1.5153807401657104,
"learning_rate": 2.9512733744298482e-05,
"loss": 0.4022,
"num_input_tokens_seen": 46185408,
"step": 3720
},
{
"epoch": 1.328459343794579,
"grad_norm": 1.2606337070465088,
"learning_rate": 2.9466809555231112e-05,
"loss": 0.3489,
"num_input_tokens_seen": 46249776,
"step": 3725
},
{
"epoch": 1.3302425106990015,
"grad_norm": 1.4757767915725708,
"learning_rate": 2.9420869790787852e-05,
"loss": 0.4108,
"num_input_tokens_seen": 46312512,
"step": 3730
},
{
"epoch": 1.3320256776034236,
"grad_norm": 1.3754016160964966,
"learning_rate": 2.9374914611156668e-05,
"loss": 0.4006,
"num_input_tokens_seen": 46375680,
"step": 3735
},
{
"epoch": 1.333808844507846,
"grad_norm": 1.6682347059249878,
"learning_rate": 2.932894417657927e-05,
"loss": 0.4147,
"num_input_tokens_seen": 46440688,
"step": 3740
},
{
"epoch": 1.3355920114122681,
"grad_norm": 1.5695980787277222,
"learning_rate": 2.928295864735056e-05,
"loss": 0.3782,
"num_input_tokens_seen": 46502272,
"step": 3745
},
{
"epoch": 1.3373751783166905,
"grad_norm": 1.4191399812698364,
"learning_rate": 2.9236958183818076e-05,
"loss": 0.3895,
"num_input_tokens_seen": 46565120,
"step": 3750
},
{
"epoch": 1.3391583452211127,
"grad_norm": 1.4917359352111816,
"learning_rate": 2.9190942946381418e-05,
"loss": 0.4084,
"num_input_tokens_seen": 46628752,
"step": 3755
},
{
"epoch": 1.340941512125535,
"grad_norm": 1.8590489625930786,
"learning_rate": 2.914491309549171e-05,
"loss": 0.3762,
"num_input_tokens_seen": 46690560,
"step": 3760
},
{
"epoch": 1.3427246790299572,
"grad_norm": 1.7186845541000366,
"learning_rate": 2.9098868791651046e-05,
"loss": 0.4157,
"num_input_tokens_seen": 46755712,
"step": 3765
},
{
"epoch": 1.3445078459343796,
"grad_norm": 1.2674869298934937,
"learning_rate": 2.90528101954119e-05,
"loss": 0.3688,
"num_input_tokens_seen": 46821440,
"step": 3770
},
{
"epoch": 1.3462910128388017,
"grad_norm": 1.3083473443984985,
"learning_rate": 2.9006737467376577e-05,
"loss": 0.4088,
"num_input_tokens_seen": 46882768,
"step": 3775
},
{
"epoch": 1.3480741797432239,
"grad_norm": 1.3852232694625854,
"learning_rate": 2.8960650768196672e-05,
"loss": 0.4017,
"num_input_tokens_seen": 46945088,
"step": 3780
},
{
"epoch": 1.3498573466476462,
"grad_norm": 1.9241416454315186,
"learning_rate": 2.8914550258572487e-05,
"loss": 0.362,
"num_input_tokens_seen": 47013312,
"step": 3785
},
{
"epoch": 1.3516405135520686,
"grad_norm": 1.380786657333374,
"learning_rate": 2.8868436099252503e-05,
"loss": 0.3919,
"num_input_tokens_seen": 47078800,
"step": 3790
},
{
"epoch": 1.3534236804564908,
"grad_norm": 1.7454177141189575,
"learning_rate": 2.8822308451032754e-05,
"loss": 0.3647,
"num_input_tokens_seen": 47139856,
"step": 3795
},
{
"epoch": 1.355206847360913,
"grad_norm": 1.9598338603973389,
"learning_rate": 2.877616747475634e-05,
"loss": 0.3123,
"num_input_tokens_seen": 47202944,
"step": 3800
},
{
"epoch": 1.3569900142653353,
"grad_norm": 1.3957338333129883,
"learning_rate": 2.873001333131282e-05,
"loss": 0.3588,
"num_input_tokens_seen": 47262080,
"step": 3805
},
{
"epoch": 1.3587731811697574,
"grad_norm": 1.6384849548339844,
"learning_rate": 2.8683846181637685e-05,
"loss": 0.4088,
"num_input_tokens_seen": 47320400,
"step": 3810
},
{
"epoch": 1.3605563480741798,
"grad_norm": 1.4136253595352173,
"learning_rate": 2.863766618671177e-05,
"loss": 0.3077,
"num_input_tokens_seen": 47383184,
"step": 3815
},
{
"epoch": 1.362339514978602,
"grad_norm": 2.322763681411743,
"learning_rate": 2.8591473507560667e-05,
"loss": 0.4329,
"num_input_tokens_seen": 47444144,
"step": 3820
},
{
"epoch": 1.3641226818830243,
"grad_norm": 1.4467617273330688,
"learning_rate": 2.8545268305254254e-05,
"loss": 0.424,
"num_input_tokens_seen": 47509584,
"step": 3825
},
{
"epoch": 1.3659058487874465,
"grad_norm": 1.751305341720581,
"learning_rate": 2.8499050740906037e-05,
"loss": 0.3705,
"num_input_tokens_seen": 47573696,
"step": 3830
},
{
"epoch": 1.3676890156918686,
"grad_norm": 1.503512978553772,
"learning_rate": 2.8452820975672628e-05,
"loss": 0.3882,
"num_input_tokens_seen": 47635168,
"step": 3835
},
{
"epoch": 1.369472182596291,
"grad_norm": 1.5444144010543823,
"learning_rate": 2.8406579170753205e-05,
"loss": 0.3555,
"num_input_tokens_seen": 47693984,
"step": 3840
},
{
"epoch": 1.3712553495007134,
"grad_norm": 1.5627723932266235,
"learning_rate": 2.8360325487388913e-05,
"loss": 0.3481,
"num_input_tokens_seen": 47758704,
"step": 3845
},
{
"epoch": 1.3730385164051355,
"grad_norm": 1.991705298423767,
"learning_rate": 2.8314060086862308e-05,
"loss": 0.3879,
"num_input_tokens_seen": 47822832,
"step": 3850
},
{
"epoch": 1.3748216833095577,
"grad_norm": 1.3864610195159912,
"learning_rate": 2.8267783130496817e-05,
"loss": 0.4234,
"num_input_tokens_seen": 47887488,
"step": 3855
},
{
"epoch": 1.37660485021398,
"grad_norm": 1.6995244026184082,
"learning_rate": 2.822149477965617e-05,
"loss": 0.4011,
"num_input_tokens_seen": 47951888,
"step": 3860
},
{
"epoch": 1.3783880171184024,
"grad_norm": 1.4929567575454712,
"learning_rate": 2.8175195195743792e-05,
"loss": 0.351,
"num_input_tokens_seen": 48014128,
"step": 3865
},
{
"epoch": 1.3801711840228246,
"grad_norm": 1.6061722040176392,
"learning_rate": 2.8128884540202317e-05,
"loss": 0.4725,
"num_input_tokens_seen": 48072048,
"step": 3870
},
{
"epoch": 1.3819543509272467,
"grad_norm": 1.555999517440796,
"learning_rate": 2.8082562974512948e-05,
"loss": 0.3323,
"num_input_tokens_seen": 48134720,
"step": 3875
},
{
"epoch": 1.383737517831669,
"grad_norm": 1.188208818435669,
"learning_rate": 2.8036230660194972e-05,
"loss": 0.3187,
"num_input_tokens_seen": 48197488,
"step": 3880
},
{
"epoch": 1.3855206847360912,
"grad_norm": 1.965627670288086,
"learning_rate": 2.7989887758805134e-05,
"loss": 0.458,
"num_input_tokens_seen": 48262272,
"step": 3885
},
{
"epoch": 1.3873038516405136,
"grad_norm": 1.364537000656128,
"learning_rate": 2.794353443193707e-05,
"loss": 0.3833,
"num_input_tokens_seen": 48322592,
"step": 3890
},
{
"epoch": 1.3890870185449358,
"grad_norm": 1.5128754377365112,
"learning_rate": 2.789717084122081e-05,
"loss": 0.3946,
"num_input_tokens_seen": 48378000,
"step": 3895
},
{
"epoch": 1.3908701854493581,
"grad_norm": 1.3390058279037476,
"learning_rate": 2.785079714832216e-05,
"loss": 0.4381,
"num_input_tokens_seen": 48434736,
"step": 3900
},
{
"epoch": 1.3926533523537803,
"grad_norm": 1.3086851835250854,
"learning_rate": 2.7804413514942147e-05,
"loss": 0.3576,
"num_input_tokens_seen": 48494288,
"step": 3905
},
{
"epoch": 1.3944365192582024,
"grad_norm": 1.5228943824768066,
"learning_rate": 2.7758020102816456e-05,
"loss": 0.3978,
"num_input_tokens_seen": 48554784,
"step": 3910
},
{
"epoch": 1.3962196861626248,
"grad_norm": 1.810608983039856,
"learning_rate": 2.7711617073714872e-05,
"loss": 0.336,
"num_input_tokens_seen": 48620192,
"step": 3915
},
{
"epoch": 1.3980028530670472,
"grad_norm": 2.060600519180298,
"learning_rate": 2.766520458944073e-05,
"loss": 0.4053,
"num_input_tokens_seen": 48683392,
"step": 3920
},
{
"epoch": 1.3997860199714693,
"grad_norm": 1.435867190361023,
"learning_rate": 2.76187828118303e-05,
"loss": 0.3841,
"num_input_tokens_seen": 48752192,
"step": 3925
},
{
"epoch": 1.4015691868758915,
"grad_norm": 2.4842238426208496,
"learning_rate": 2.7572351902752296e-05,
"loss": 0.4873,
"num_input_tokens_seen": 48815952,
"step": 3930
},
{
"epoch": 1.4033523537803139,
"grad_norm": 1.696571707725525,
"learning_rate": 2.7525912024107242e-05,
"loss": 0.431,
"num_input_tokens_seen": 48880176,
"step": 3935
},
{
"epoch": 1.405135520684736,
"grad_norm": 1.5973232984542847,
"learning_rate": 2.747946333782696e-05,
"loss": 0.4383,
"num_input_tokens_seen": 48943680,
"step": 3940
},
{
"epoch": 1.4069186875891584,
"grad_norm": 1.621713638305664,
"learning_rate": 2.7433006005873956e-05,
"loss": 0.3977,
"num_input_tokens_seen": 49004224,
"step": 3945
},
{
"epoch": 1.4087018544935805,
"grad_norm": 1.6473205089569092,
"learning_rate": 2.738654019024093e-05,
"loss": 0.3486,
"num_input_tokens_seen": 49064512,
"step": 3950
},
{
"epoch": 1.410485021398003,
"grad_norm": 1.304693341255188,
"learning_rate": 2.7340066052950103e-05,
"loss": 0.4181,
"num_input_tokens_seen": 49126544,
"step": 3955
},
{
"epoch": 1.412268188302425,
"grad_norm": 1.6275643110275269,
"learning_rate": 2.7293583756052755e-05,
"loss": 0.3857,
"num_input_tokens_seen": 49191056,
"step": 3960
},
{
"epoch": 1.4140513552068474,
"grad_norm": 1.4851253032684326,
"learning_rate": 2.7247093461628616e-05,
"loss": 0.4096,
"num_input_tokens_seen": 49249936,
"step": 3965
},
{
"epoch": 1.4158345221112696,
"grad_norm": 1.1423369646072388,
"learning_rate": 2.720059533178529e-05,
"loss": 0.3315,
"num_input_tokens_seen": 49309968,
"step": 3970
},
{
"epoch": 1.417617689015692,
"grad_norm": 1.5465352535247803,
"learning_rate": 2.71540895286577e-05,
"loss": 0.3942,
"num_input_tokens_seen": 49368576,
"step": 3975
},
{
"epoch": 1.4194008559201141,
"grad_norm": 1.611109733581543,
"learning_rate": 2.710757621440753e-05,
"loss": 0.3589,
"num_input_tokens_seen": 49427888,
"step": 3980
},
{
"epoch": 1.4211840228245363,
"grad_norm": 1.5170013904571533,
"learning_rate": 2.7061055551222663e-05,
"loss": 0.3942,
"num_input_tokens_seen": 49491648,
"step": 3985
},
{
"epoch": 1.4229671897289586,
"grad_norm": 1.4070842266082764,
"learning_rate": 2.70145277013166e-05,
"loss": 0.3997,
"num_input_tokens_seen": 49557296,
"step": 3990
},
{
"epoch": 1.424750356633381,
"grad_norm": 1.52862548828125,
"learning_rate": 2.6967992826927897e-05,
"loss": 0.4189,
"num_input_tokens_seen": 49620208,
"step": 3995
},
{
"epoch": 1.4265335235378032,
"grad_norm": 1.0965608358383179,
"learning_rate": 2.6921451090319603e-05,
"loss": 0.3903,
"num_input_tokens_seen": 49683888,
"step": 4000
},
{
"epoch": 1.4283166904422253,
"grad_norm": 1.8186748027801514,
"learning_rate": 2.6874902653778712e-05,
"loss": 0.3987,
"num_input_tokens_seen": 49741728,
"step": 4005
},
{
"epoch": 1.4300998573466477,
"grad_norm": 1.3632290363311768,
"learning_rate": 2.6828347679615558e-05,
"loss": 0.377,
"num_input_tokens_seen": 49804832,
"step": 4010
},
{
"epoch": 1.4318830242510698,
"grad_norm": 1.560187816619873,
"learning_rate": 2.6781786330163282e-05,
"loss": 0.3524,
"num_input_tokens_seen": 49867744,
"step": 4015
},
{
"epoch": 1.4336661911554922,
"grad_norm": 1.8583840131759644,
"learning_rate": 2.673521876777727e-05,
"loss": 0.344,
"num_input_tokens_seen": 49932304,
"step": 4020
},
{
"epoch": 1.4354493580599144,
"grad_norm": 1.5638411045074463,
"learning_rate": 2.6688645154834537e-05,
"loss": 0.4498,
"num_input_tokens_seen": 49994560,
"step": 4025
},
{
"epoch": 1.4372325249643367,
"grad_norm": 1.6732484102249146,
"learning_rate": 2.6642065653733213e-05,
"loss": 0.3513,
"num_input_tokens_seen": 50053584,
"step": 4030
},
{
"epoch": 1.4390156918687589,
"grad_norm": 1.5469917058944702,
"learning_rate": 2.6595480426891976e-05,
"loss": 0.354,
"num_input_tokens_seen": 50111824,
"step": 4035
},
{
"epoch": 1.440798858773181,
"grad_norm": 1.4826576709747314,
"learning_rate": 2.654888963674945e-05,
"loss": 0.4709,
"num_input_tokens_seen": 50175024,
"step": 4040
},
{
"epoch": 1.4425820256776034,
"grad_norm": 1.5195841789245605,
"learning_rate": 2.650229344576367e-05,
"loss": 0.3669,
"num_input_tokens_seen": 50236896,
"step": 4045
},
{
"epoch": 1.4443651925820258,
"grad_norm": 1.735239863395691,
"learning_rate": 2.6455692016411476e-05,
"loss": 0.3854,
"num_input_tokens_seen": 50297360,
"step": 4050
},
{
"epoch": 1.446148359486448,
"grad_norm": 1.5575016736984253,
"learning_rate": 2.640908551118801e-05,
"loss": 0.4022,
"num_input_tokens_seen": 50357200,
"step": 4055
},
{
"epoch": 1.44793152639087,
"grad_norm": 1.480210781097412,
"learning_rate": 2.6362474092606088e-05,
"loss": 0.3373,
"num_input_tokens_seen": 50422896,
"step": 4060
},
{
"epoch": 1.4497146932952925,
"grad_norm": 1.5694204568862915,
"learning_rate": 2.631585792319567e-05,
"loss": 0.3583,
"num_input_tokens_seen": 50483168,
"step": 4065
},
{
"epoch": 1.4514978601997148,
"grad_norm": 1.4493743181228638,
"learning_rate": 2.626923716550328e-05,
"loss": 0.3734,
"num_input_tokens_seen": 50546688,
"step": 4070
},
{
"epoch": 1.453281027104137,
"grad_norm": 1.8895164728164673,
"learning_rate": 2.622261198209143e-05,
"loss": 0.4142,
"num_input_tokens_seen": 50611280,
"step": 4075
},
{
"epoch": 1.4550641940085591,
"grad_norm": 1.3316305875778198,
"learning_rate": 2.6175982535538098e-05,
"loss": 0.3354,
"num_input_tokens_seen": 50675728,
"step": 4080
},
{
"epoch": 1.4568473609129815,
"grad_norm": 1.2798820734024048,
"learning_rate": 2.6129348988436074e-05,
"loss": 0.4042,
"num_input_tokens_seen": 50735520,
"step": 4085
},
{
"epoch": 1.4586305278174037,
"grad_norm": 1.5085526704788208,
"learning_rate": 2.6082711503392494e-05,
"loss": 0.371,
"num_input_tokens_seen": 50797520,
"step": 4090
},
{
"epoch": 1.460413694721826,
"grad_norm": 1.8046773672103882,
"learning_rate": 2.60360702430282e-05,
"loss": 0.4519,
"num_input_tokens_seen": 50857456,
"step": 4095
},
{
"epoch": 1.4621968616262482,
"grad_norm": 4.003551006317139,
"learning_rate": 2.5989425369977195e-05,
"loss": 0.4206,
"num_input_tokens_seen": 50917760,
"step": 4100
},
{
"epoch": 1.4639800285306706,
"grad_norm": 1.346306324005127,
"learning_rate": 2.5942777046886108e-05,
"loss": 0.2947,
"num_input_tokens_seen": 50980272,
"step": 4105
},
{
"epoch": 1.4657631954350927,
"grad_norm": 1.428459882736206,
"learning_rate": 2.589612543641357e-05,
"loss": 0.3589,
"num_input_tokens_seen": 51044768,
"step": 4110
},
{
"epoch": 1.4675463623395149,
"grad_norm": 1.2181662321090698,
"learning_rate": 2.5849470701229685e-05,
"loss": 0.2415,
"num_input_tokens_seen": 51102032,
"step": 4115
},
{
"epoch": 1.4693295292439372,
"grad_norm": 1.5910693407058716,
"learning_rate": 2.5802813004015443e-05,
"loss": 0.3508,
"num_input_tokens_seen": 51164672,
"step": 4120
},
{
"epoch": 1.4711126961483596,
"grad_norm": 1.6249642372131348,
"learning_rate": 2.5756152507462177e-05,
"loss": 0.3944,
"num_input_tokens_seen": 51228176,
"step": 4125
},
{
"epoch": 1.4728958630527818,
"grad_norm": 2.0845768451690674,
"learning_rate": 2.5709489374270983e-05,
"loss": 0.3756,
"num_input_tokens_seen": 51289072,
"step": 4130
},
{
"epoch": 1.474679029957204,
"grad_norm": 1.638650894165039,
"learning_rate": 2.5662823767152127e-05,
"loss": 0.3464,
"num_input_tokens_seen": 51348944,
"step": 4135
},
{
"epoch": 1.4764621968616263,
"grad_norm": 1.555907130241394,
"learning_rate": 2.561615584882453e-05,
"loss": 0.3685,
"num_input_tokens_seen": 51408432,
"step": 4140
},
{
"epoch": 1.4782453637660484,
"grad_norm": 1.6691559553146362,
"learning_rate": 2.5569485782015144e-05,
"loss": 0.3658,
"num_input_tokens_seen": 51472704,
"step": 4145
},
{
"epoch": 1.4800285306704708,
"grad_norm": 1.5996575355529785,
"learning_rate": 2.5522813729458443e-05,
"loss": 0.3806,
"num_input_tokens_seen": 51533440,
"step": 4150
},
{
"epoch": 1.481811697574893,
"grad_norm": 2.0772979259490967,
"learning_rate": 2.5476139853895796e-05,
"loss": 0.4304,
"num_input_tokens_seen": 51597872,
"step": 4155
},
{
"epoch": 1.4835948644793153,
"grad_norm": 1.6055898666381836,
"learning_rate": 2.5429464318074952e-05,
"loss": 0.3482,
"num_input_tokens_seen": 51658736,
"step": 4160
},
{
"epoch": 1.4853780313837375,
"grad_norm": 1.9511430263519287,
"learning_rate": 2.538278728474944e-05,
"loss": 0.4127,
"num_input_tokens_seen": 51720672,
"step": 4165
},
{
"epoch": 1.4871611982881598,
"grad_norm": 1.5314128398895264,
"learning_rate": 2.5336108916677986e-05,
"loss": 0.362,
"num_input_tokens_seen": 51781760,
"step": 4170
},
{
"epoch": 1.488944365192582,
"grad_norm": 2.31260085105896,
"learning_rate": 2.528942937662403e-05,
"loss": 0.3639,
"num_input_tokens_seen": 51845840,
"step": 4175
},
{
"epoch": 1.4907275320970044,
"grad_norm": 1.428371548652649,
"learning_rate": 2.5242748827355046e-05,
"loss": 0.3494,
"num_input_tokens_seen": 51909856,
"step": 4180
},
{
"epoch": 1.4925106990014265,
"grad_norm": 1.799333930015564,
"learning_rate": 2.519606743164204e-05,
"loss": 0.3494,
"num_input_tokens_seen": 51970080,
"step": 4185
},
{
"epoch": 1.4942938659058487,
"grad_norm": 1.3890087604522705,
"learning_rate": 2.514938535225897e-05,
"loss": 0.3798,
"num_input_tokens_seen": 52034608,
"step": 4190
},
{
"epoch": 1.496077032810271,
"grad_norm": 1.6815882921218872,
"learning_rate": 2.5102702751982188e-05,
"loss": 0.3498,
"num_input_tokens_seen": 52100032,
"step": 4195
},
{
"epoch": 1.4978601997146934,
"grad_norm": 1.7529703378677368,
"learning_rate": 2.5056019793589858e-05,
"loss": 0.3873,
"num_input_tokens_seen": 52162800,
"step": 4200
},
{
"epoch": 1.4996433666191156,
"grad_norm": 1.700079321861267,
"learning_rate": 2.500933663986139e-05,
"loss": 0.3284,
"num_input_tokens_seen": 52222560,
"step": 4205
},
{
"epoch": 1.5014265335235377,
"grad_norm": 1.6496635675430298,
"learning_rate": 2.496265345357687e-05,
"loss": 0.3805,
"num_input_tokens_seen": 52285328,
"step": 4210
},
{
"epoch": 1.50320970042796,
"grad_norm": 1.5684199333190918,
"learning_rate": 2.49159703975165e-05,
"loss": 0.4581,
"num_input_tokens_seen": 52347456,
"step": 4215
},
{
"epoch": 1.5049928673323825,
"grad_norm": 1.2051026821136475,
"learning_rate": 2.4869287634460045e-05,
"loss": 0.3323,
"num_input_tokens_seen": 52409680,
"step": 4220
},
{
"epoch": 1.5067760342368046,
"grad_norm": 1.540032982826233,
"learning_rate": 2.4822605327186217e-05,
"loss": 0.3204,
"num_input_tokens_seen": 52471376,
"step": 4225
},
{
"epoch": 1.5085592011412268,
"grad_norm": 1.0840933322906494,
"learning_rate": 2.4775923638472172e-05,
"loss": 0.3218,
"num_input_tokens_seen": 52532256,
"step": 4230
},
{
"epoch": 1.5103423680456491,
"grad_norm": 1.5128995180130005,
"learning_rate": 2.472924273109287e-05,
"loss": 0.3686,
"num_input_tokens_seen": 52592928,
"step": 4235
},
{
"epoch": 1.5121255349500713,
"grad_norm": 1.375869870185852,
"learning_rate": 2.4682562767820587e-05,
"loss": 0.3571,
"num_input_tokens_seen": 52655968,
"step": 4240
},
{
"epoch": 1.5139087018544934,
"grad_norm": 1.3699325323104858,
"learning_rate": 2.4635883911424293e-05,
"loss": 0.4165,
"num_input_tokens_seen": 52716096,
"step": 4245
},
{
"epoch": 1.5156918687589158,
"grad_norm": 1.4494869709014893,
"learning_rate": 2.4589206324669082e-05,
"loss": 0.4172,
"num_input_tokens_seen": 52780176,
"step": 4250
},
{
"epoch": 1.5174750356633382,
"grad_norm": 1.4189242124557495,
"learning_rate": 2.4542530170315635e-05,
"loss": 0.3887,
"num_input_tokens_seen": 52841968,
"step": 4255
},
{
"epoch": 1.5192582025677603,
"grad_norm": 1.198378086090088,
"learning_rate": 2.449585561111965e-05,
"loss": 0.3482,
"num_input_tokens_seen": 52904544,
"step": 4260
},
{
"epoch": 1.5210413694721825,
"grad_norm": 1.5662075281143188,
"learning_rate": 2.4449182809831227e-05,
"loss": 0.3348,
"num_input_tokens_seen": 52964960,
"step": 4265
},
{
"epoch": 1.5228245363766049,
"grad_norm": 2.213198184967041,
"learning_rate": 2.4402511929194383e-05,
"loss": 0.4413,
"num_input_tokens_seen": 53026528,
"step": 4270
},
{
"epoch": 1.5246077032810272,
"grad_norm": 1.652597188949585,
"learning_rate": 2.4355843131946407e-05,
"loss": 0.3929,
"num_input_tokens_seen": 53091424,
"step": 4275
},
{
"epoch": 1.5263908701854494,
"grad_norm": 1.3295167684555054,
"learning_rate": 2.4309176580817318e-05,
"loss": 0.4019,
"num_input_tokens_seen": 53153552,
"step": 4280
},
{
"epoch": 1.5281740370898715,
"grad_norm": 1.6449896097183228,
"learning_rate": 2.426251243852932e-05,
"loss": 0.3761,
"num_input_tokens_seen": 53210416,
"step": 4285
},
{
"epoch": 1.529957203994294,
"grad_norm": 1.336366891860962,
"learning_rate": 2.421585086779623e-05,
"loss": 0.3722,
"num_input_tokens_seen": 53272864,
"step": 4290
},
{
"epoch": 1.5317403708987163,
"grad_norm": 2.004012107849121,
"learning_rate": 2.4169192031322865e-05,
"loss": 0.4091,
"num_input_tokens_seen": 53331424,
"step": 4295
},
{
"epoch": 1.5335235378031382,
"grad_norm": 1.515665054321289,
"learning_rate": 2.412253609180453e-05,
"loss": 0.4092,
"num_input_tokens_seen": 53392848,
"step": 4300
},
{
"epoch": 1.5353067047075606,
"grad_norm": 1.5377110242843628,
"learning_rate": 2.4075883211926415e-05,
"loss": 0.3362,
"num_input_tokens_seen": 53453392,
"step": 4305
},
{
"epoch": 1.537089871611983,
"grad_norm": 1.5057114362716675,
"learning_rate": 2.4029233554363047e-05,
"loss": 0.3732,
"num_input_tokens_seen": 53516752,
"step": 4310
},
{
"epoch": 1.5388730385164051,
"grad_norm": 1.4880399703979492,
"learning_rate": 2.3982587281777742e-05,
"loss": 0.3358,
"num_input_tokens_seen": 53580576,
"step": 4315
},
{
"epoch": 1.5406562054208273,
"grad_norm": 2.0042483806610107,
"learning_rate": 2.3935944556821966e-05,
"loss": 0.409,
"num_input_tokens_seen": 53644272,
"step": 4320
},
{
"epoch": 1.5424393723252496,
"grad_norm": 1.3616559505462646,
"learning_rate": 2.388930554213484e-05,
"loss": 0.3688,
"num_input_tokens_seen": 53705232,
"step": 4325
},
{
"epoch": 1.544222539229672,
"grad_norm": 1.4289284944534302,
"learning_rate": 2.3842670400342566e-05,
"loss": 0.3526,
"num_input_tokens_seen": 53760288,
"step": 4330
},
{
"epoch": 1.5460057061340942,
"grad_norm": 1.3744745254516602,
"learning_rate": 2.3796039294057795e-05,
"loss": 0.3635,
"num_input_tokens_seen": 53816640,
"step": 4335
},
{
"epoch": 1.5477888730385163,
"grad_norm": 1.7196354866027832,
"learning_rate": 2.3749412385879154e-05,
"loss": 0.3233,
"num_input_tokens_seen": 53876544,
"step": 4340
},
{
"epoch": 1.5495720399429387,
"grad_norm": 1.4938087463378906,
"learning_rate": 2.370278983839061e-05,
"loss": 0.389,
"num_input_tokens_seen": 53938928,
"step": 4345
},
{
"epoch": 1.551355206847361,
"grad_norm": 1.674802541732788,
"learning_rate": 2.3656171814160906e-05,
"loss": 0.3819,
"num_input_tokens_seen": 54000640,
"step": 4350
},
{
"epoch": 1.5531383737517832,
"grad_norm": 1.393610954284668,
"learning_rate": 2.3609558475743048e-05,
"loss": 0.4104,
"num_input_tokens_seen": 54062448,
"step": 4355
},
{
"epoch": 1.5549215406562054,
"grad_norm": 1.6902203559875488,
"learning_rate": 2.356294998567369e-05,
"loss": 0.3618,
"num_input_tokens_seen": 54123424,
"step": 4360
},
{
"epoch": 1.5567047075606277,
"grad_norm": 1.4336259365081787,
"learning_rate": 2.351634650647257e-05,
"loss": 0.37,
"num_input_tokens_seen": 54188112,
"step": 4365
},
{
"epoch": 1.5584878744650499,
"grad_norm": 1.1472690105438232,
"learning_rate": 2.3469748200641967e-05,
"loss": 0.3926,
"num_input_tokens_seen": 54252208,
"step": 4370
},
{
"epoch": 1.560271041369472,
"grad_norm": 1.2778717279434204,
"learning_rate": 2.34231552306661e-05,
"loss": 0.449,
"num_input_tokens_seen": 54316784,
"step": 4375
},
{
"epoch": 1.5620542082738944,
"grad_norm": 2.114806652069092,
"learning_rate": 2.3376567759010614e-05,
"loss": 0.4046,
"num_input_tokens_seen": 54376256,
"step": 4380
},
{
"epoch": 1.5638373751783168,
"grad_norm": 1.5421040058135986,
"learning_rate": 2.3329985948121963e-05,
"loss": 0.3858,
"num_input_tokens_seen": 54436576,
"step": 4385
},
{
"epoch": 1.565620542082739,
"grad_norm": 1.365997076034546,
"learning_rate": 2.3283409960426857e-05,
"loss": 0.3647,
"num_input_tokens_seen": 54497760,
"step": 4390
},
{
"epoch": 1.567403708987161,
"grad_norm": 1.4641869068145752,
"learning_rate": 2.323683995833171e-05,
"loss": 0.3754,
"num_input_tokens_seen": 54559504,
"step": 4395
},
{
"epoch": 1.5691868758915835,
"grad_norm": 1.8695045709609985,
"learning_rate": 2.3190276104222073e-05,
"loss": 0.4224,
"num_input_tokens_seen": 54623280,
"step": 4400
},
{
"epoch": 1.5709700427960058,
"grad_norm": 1.825547456741333,
"learning_rate": 2.3143718560462042e-05,
"loss": 0.3847,
"num_input_tokens_seen": 54688800,
"step": 4405
},
{
"epoch": 1.572753209700428,
"grad_norm": 1.671181321144104,
"learning_rate": 2.3097167489393705e-05,
"loss": 0.4252,
"num_input_tokens_seen": 54751120,
"step": 4410
},
{
"epoch": 1.5745363766048501,
"grad_norm": 1.5111570358276367,
"learning_rate": 2.3050623053336623e-05,
"loss": 0.3571,
"num_input_tokens_seen": 54814640,
"step": 4415
},
{
"epoch": 1.5763195435092725,
"grad_norm": 1.5124739408493042,
"learning_rate": 2.300408541458716e-05,
"loss": 0.3436,
"num_input_tokens_seen": 54873728,
"step": 4420
},
{
"epoch": 1.5781027104136949,
"grad_norm": 1.5977612733840942,
"learning_rate": 2.2957554735418023e-05,
"loss": 0.3866,
"num_input_tokens_seen": 54935536,
"step": 4425
},
{
"epoch": 1.579885877318117,
"grad_norm": 1.5673508644104004,
"learning_rate": 2.2911031178077648e-05,
"loss": 0.4096,
"num_input_tokens_seen": 54995776,
"step": 4430
},
{
"epoch": 1.5816690442225392,
"grad_norm": 1.5844247341156006,
"learning_rate": 2.2864514904789606e-05,
"loss": 0.4905,
"num_input_tokens_seen": 55053984,
"step": 4435
},
{
"epoch": 1.5834522111269616,
"grad_norm": 1.5645432472229004,
"learning_rate": 2.281800607775211e-05,
"loss": 0.3688,
"num_input_tokens_seen": 55115472,
"step": 4440
},
{
"epoch": 1.5852353780313837,
"grad_norm": 1.758135437965393,
"learning_rate": 2.2771504859137365e-05,
"loss": 0.4062,
"num_input_tokens_seen": 55177392,
"step": 4445
},
{
"epoch": 1.5870185449358059,
"grad_norm": 1.7348984479904175,
"learning_rate": 2.2725011411091097e-05,
"loss": 0.3988,
"num_input_tokens_seen": 55239744,
"step": 4450
},
{
"epoch": 1.5888017118402282,
"grad_norm": 1.2819241285324097,
"learning_rate": 2.26785258957319e-05,
"loss": 0.409,
"num_input_tokens_seen": 55300416,
"step": 4455
},
{
"epoch": 1.5905848787446506,
"grad_norm": 1.5527747869491577,
"learning_rate": 2.2632048475150705e-05,
"loss": 0.3621,
"num_input_tokens_seen": 55360480,
"step": 4460
},
{
"epoch": 1.5923680456490727,
"grad_norm": 1.163960576057434,
"learning_rate": 2.2585579311410242e-05,
"loss": 0.3758,
"num_input_tokens_seen": 55419840,
"step": 4465
},
{
"epoch": 1.594151212553495,
"grad_norm": 1.756473183631897,
"learning_rate": 2.2539118566544443e-05,
"loss": 0.3922,
"num_input_tokens_seen": 55479024,
"step": 4470
},
{
"epoch": 1.5959343794579173,
"grad_norm": 1.3992432355880737,
"learning_rate": 2.2492666402557873e-05,
"loss": 0.4174,
"num_input_tokens_seen": 55539360,
"step": 4475
},
{
"epoch": 1.5977175463623396,
"grad_norm": 1.5347425937652588,
"learning_rate": 2.244622298142517e-05,
"loss": 0.3443,
"num_input_tokens_seen": 55602384,
"step": 4480
},
{
"epoch": 1.5995007132667618,
"grad_norm": 1.5150680541992188,
"learning_rate": 2.239978846509052e-05,
"loss": 0.3496,
"num_input_tokens_seen": 55661456,
"step": 4485
},
{
"epoch": 1.601283880171184,
"grad_norm": 1.6969490051269531,
"learning_rate": 2.235336301546702e-05,
"loss": 0.3593,
"num_input_tokens_seen": 55727984,
"step": 4490
},
{
"epoch": 1.6030670470756063,
"grad_norm": 1.6206620931625366,
"learning_rate": 2.230694679443618e-05,
"loss": 0.3449,
"num_input_tokens_seen": 55790240,
"step": 4495
},
{
"epoch": 1.6048502139800287,
"grad_norm": 1.4925209283828735,
"learning_rate": 2.2260539963847317e-05,
"loss": 0.3625,
"num_input_tokens_seen": 55850592,
"step": 4500
},
{
"epoch": 1.6066333808844506,
"grad_norm": 1.5853619575500488,
"learning_rate": 2.2214142685517005e-05,
"loss": 0.3858,
"num_input_tokens_seen": 55911552,
"step": 4505
},
{
"epoch": 1.608416547788873,
"grad_norm": 1.5945818424224854,
"learning_rate": 2.2167755121228516e-05,
"loss": 0.3834,
"num_input_tokens_seen": 55974224,
"step": 4510
},
{
"epoch": 1.6101997146932954,
"grad_norm": 1.4650936126708984,
"learning_rate": 2.212137743273124e-05,
"loss": 0.3578,
"num_input_tokens_seen": 56035440,
"step": 4515
},
{
"epoch": 1.6119828815977175,
"grad_norm": 1.6041439771652222,
"learning_rate": 2.2075009781740144e-05,
"loss": 0.3204,
"num_input_tokens_seen": 56096256,
"step": 4520
},
{
"epoch": 1.6137660485021397,
"grad_norm": 1.5402308702468872,
"learning_rate": 2.2028652329935196e-05,
"loss": 0.3659,
"num_input_tokens_seen": 56158160,
"step": 4525
},
{
"epoch": 1.615549215406562,
"grad_norm": 1.6089669466018677,
"learning_rate": 2.198230523896077e-05,
"loss": 0.3354,
"num_input_tokens_seen": 56219072,
"step": 4530
},
{
"epoch": 1.6173323823109844,
"grad_norm": 1.728310465812683,
"learning_rate": 2.193596867042515e-05,
"loss": 0.3414,
"num_input_tokens_seen": 56280416,
"step": 4535
},
{
"epoch": 1.6191155492154066,
"grad_norm": 1.711105465888977,
"learning_rate": 2.1889642785899926e-05,
"loss": 0.4517,
"num_input_tokens_seen": 56341792,
"step": 4540
},
{
"epoch": 1.6208987161198287,
"grad_norm": 1.99299955368042,
"learning_rate": 2.1843327746919405e-05,
"loss": 0.3987,
"num_input_tokens_seen": 56400304,
"step": 4545
},
{
"epoch": 1.622681883024251,
"grad_norm": 1.4943881034851074,
"learning_rate": 2.1797023714980092e-05,
"loss": 0.4308,
"num_input_tokens_seen": 56463152,
"step": 4550
},
{
"epoch": 1.6244650499286735,
"grad_norm": 1.8707401752471924,
"learning_rate": 2.1750730851540135e-05,
"loss": 0.4295,
"num_input_tokens_seen": 56527024,
"step": 4555
},
{
"epoch": 1.6262482168330956,
"grad_norm": 1.4292758703231812,
"learning_rate": 2.1704449318018692e-05,
"loss": 0.3562,
"num_input_tokens_seen": 56588288,
"step": 4560
},
{
"epoch": 1.6280313837375178,
"grad_norm": 1.6914817094802856,
"learning_rate": 2.1658179275795457e-05,
"loss": 0.3525,
"num_input_tokens_seen": 56649728,
"step": 4565
},
{
"epoch": 1.6298145506419401,
"grad_norm": 1.3131465911865234,
"learning_rate": 2.1611920886210034e-05,
"loss": 0.335,
"num_input_tokens_seen": 56712144,
"step": 4570
},
{
"epoch": 1.6315977175463623,
"grad_norm": 2.274732828140259,
"learning_rate": 2.156567431056139e-05,
"loss": 0.3433,
"num_input_tokens_seen": 56774944,
"step": 4575
},
{
"epoch": 1.6333808844507844,
"grad_norm": 1.2653104066848755,
"learning_rate": 2.151943971010732e-05,
"loss": 0.3342,
"num_input_tokens_seen": 56839024,
"step": 4580
},
{
"epoch": 1.6351640513552068,
"grad_norm": 1.7100783586502075,
"learning_rate": 2.1473217246063833e-05,
"loss": 0.2926,
"num_input_tokens_seen": 56900368,
"step": 4585
},
{
"epoch": 1.6369472182596292,
"grad_norm": 1.8248704671859741,
"learning_rate": 2.1427007079604643e-05,
"loss": 0.3742,
"num_input_tokens_seen": 56962480,
"step": 4590
},
{
"epoch": 1.6387303851640513,
"grad_norm": 1.7061128616333008,
"learning_rate": 2.1380809371860588e-05,
"loss": 0.4454,
"num_input_tokens_seen": 57025088,
"step": 4595
},
{
"epoch": 1.6405135520684735,
"grad_norm": 2.0661263465881348,
"learning_rate": 2.1334624283919026e-05,
"loss": 0.3228,
"num_input_tokens_seen": 57087760,
"step": 4600
},
{
"epoch": 1.6422967189728959,
"grad_norm": 2.6758852005004883,
"learning_rate": 2.1288451976823352e-05,
"loss": 0.3899,
"num_input_tokens_seen": 57150864,
"step": 4605
},
{
"epoch": 1.6440798858773182,
"grad_norm": 1.6372278928756714,
"learning_rate": 2.1242292611572387e-05,
"loss": 0.4195,
"num_input_tokens_seen": 57213760,
"step": 4610
},
{
"epoch": 1.6458630527817404,
"grad_norm": 1.4883116483688354,
"learning_rate": 2.1196146349119802e-05,
"loss": 0.3935,
"num_input_tokens_seen": 57275264,
"step": 4615
},
{
"epoch": 1.6476462196861625,
"grad_norm": 1.9058313369750977,
"learning_rate": 2.1150013350373594e-05,
"loss": 0.4095,
"num_input_tokens_seen": 57335248,
"step": 4620
},
{
"epoch": 1.649429386590585,
"grad_norm": 1.4819669723510742,
"learning_rate": 2.110389377619553e-05,
"loss": 0.3567,
"num_input_tokens_seen": 57397200,
"step": 4625
},
{
"epoch": 1.6512125534950073,
"grad_norm": 1.2163094282150269,
"learning_rate": 2.1057787787400528e-05,
"loss": 0.3625,
"num_input_tokens_seen": 57456560,
"step": 4630
},
{
"epoch": 1.6529957203994294,
"grad_norm": 1.4048594236373901,
"learning_rate": 2.1011695544756172e-05,
"loss": 0.278,
"num_input_tokens_seen": 57515072,
"step": 4635
},
{
"epoch": 1.6547788873038516,
"grad_norm": 1.4936057329177856,
"learning_rate": 2.096561720898209e-05,
"loss": 0.3555,
"num_input_tokens_seen": 57576816,
"step": 4640
},
{
"epoch": 1.656562054208274,
"grad_norm": 1.3369996547698975,
"learning_rate": 2.0919552940749415e-05,
"loss": 0.373,
"num_input_tokens_seen": 57638784,
"step": 4645
},
{
"epoch": 1.658345221112696,
"grad_norm": 1.3848330974578857,
"learning_rate": 2.087350290068026e-05,
"loss": 0.32,
"num_input_tokens_seen": 57700128,
"step": 4650
},
{
"epoch": 1.6601283880171183,
"grad_norm": 2.041731595993042,
"learning_rate": 2.0827467249347085e-05,
"loss": 0.4344,
"num_input_tokens_seen": 57763072,
"step": 4655
},
{
"epoch": 1.6619115549215406,
"grad_norm": 1.5269125699996948,
"learning_rate": 2.078144614727221e-05,
"loss": 0.4432,
"num_input_tokens_seen": 57825312,
"step": 4660
},
{
"epoch": 1.663694721825963,
"grad_norm": 1.7696996927261353,
"learning_rate": 2.0735439754927206e-05,
"loss": 0.359,
"num_input_tokens_seen": 57887616,
"step": 4665
},
{
"epoch": 1.6654778887303852,
"grad_norm": 1.252975583076477,
"learning_rate": 2.0689448232732345e-05,
"loss": 0.3115,
"num_input_tokens_seen": 57945744,
"step": 4670
},
{
"epoch": 1.6672610556348073,
"grad_norm": 1.894482970237732,
"learning_rate": 2.0643471741056075e-05,
"loss": 0.4102,
"num_input_tokens_seen": 58006464,
"step": 4675
},
{
"epoch": 1.6690442225392297,
"grad_norm": 1.2879408597946167,
"learning_rate": 2.059751044021441e-05,
"loss": 0.4138,
"num_input_tokens_seen": 58068400,
"step": 4680
},
{
"epoch": 1.670827389443652,
"grad_norm": 1.8357077836990356,
"learning_rate": 2.055156449047041e-05,
"loss": 0.3768,
"num_input_tokens_seen": 58127664,
"step": 4685
},
{
"epoch": 1.6726105563480742,
"grad_norm": 1.647601842880249,
"learning_rate": 2.0505634052033585e-05,
"loss": 0.3792,
"num_input_tokens_seen": 58192000,
"step": 4690
},
{
"epoch": 1.6743937232524964,
"grad_norm": 1.852842092514038,
"learning_rate": 2.0459719285059396e-05,
"loss": 0.3663,
"num_input_tokens_seen": 58252192,
"step": 4695
},
{
"epoch": 1.6761768901569187,
"grad_norm": 1.6066093444824219,
"learning_rate": 2.041382034964862e-05,
"loss": 0.3448,
"num_input_tokens_seen": 58312416,
"step": 4700
},
{
"epoch": 1.677960057061341,
"grad_norm": 2.437854766845703,
"learning_rate": 2.0367937405846844e-05,
"loss": 0.3217,
"num_input_tokens_seen": 58373728,
"step": 4705
},
{
"epoch": 1.679743223965763,
"grad_norm": 1.6272107362747192,
"learning_rate": 2.0322070613643913e-05,
"loss": 0.4379,
"num_input_tokens_seen": 58436144,
"step": 4710
},
{
"epoch": 1.6815263908701854,
"grad_norm": 1.2958059310913086,
"learning_rate": 2.0276220132973316e-05,
"loss": 0.3759,
"num_input_tokens_seen": 58500384,
"step": 4715
},
{
"epoch": 1.6833095577746078,
"grad_norm": 1.5485390424728394,
"learning_rate": 2.0230386123711714e-05,
"loss": 0.3597,
"num_input_tokens_seen": 58560080,
"step": 4720
},
{
"epoch": 1.68509272467903,
"grad_norm": 1.8313623666763306,
"learning_rate": 2.0184568745678278e-05,
"loss": 0.3011,
"num_input_tokens_seen": 58624320,
"step": 4725
},
{
"epoch": 1.686875891583452,
"grad_norm": 1.8504736423492432,
"learning_rate": 2.0138768158634224e-05,
"loss": 0.4347,
"num_input_tokens_seen": 58688080,
"step": 4730
},
{
"epoch": 1.6886590584878745,
"grad_norm": 1.6756471395492554,
"learning_rate": 2.009298452228222e-05,
"loss": 0.4311,
"num_input_tokens_seen": 58753808,
"step": 4735
},
{
"epoch": 1.6904422253922968,
"grad_norm": 4.543945789337158,
"learning_rate": 2.00472179962658e-05,
"loss": 0.3863,
"num_input_tokens_seen": 58815120,
"step": 4740
},
{
"epoch": 1.692225392296719,
"grad_norm": 1.8706589937210083,
"learning_rate": 2.0001468740168872e-05,
"loss": 0.3552,
"num_input_tokens_seen": 58876960,
"step": 4745
},
{
"epoch": 1.6940085592011411,
"grad_norm": 1.59261953830719,
"learning_rate": 1.99557369135151e-05,
"loss": 0.3894,
"num_input_tokens_seen": 58940672,
"step": 4750
},
{
"epoch": 1.6957917261055635,
"grad_norm": 1.4362701177597046,
"learning_rate": 1.9910022675767376e-05,
"loss": 0.2959,
"num_input_tokens_seen": 58998544,
"step": 4755
},
{
"epoch": 1.6975748930099859,
"grad_norm": 1.9743127822875977,
"learning_rate": 1.9864326186327265e-05,
"loss": 0.3498,
"num_input_tokens_seen": 59059648,
"step": 4760
},
{
"epoch": 1.699358059914408,
"grad_norm": 1.3991342782974243,
"learning_rate": 1.9818647604534464e-05,
"loss": 0.3219,
"num_input_tokens_seen": 59122176,
"step": 4765
},
{
"epoch": 1.7011412268188302,
"grad_norm": 1.8187235593795776,
"learning_rate": 1.977298708966619e-05,
"loss": 0.3669,
"num_input_tokens_seen": 59186784,
"step": 4770
},
{
"epoch": 1.7029243937232525,
"grad_norm": 1.8813676834106445,
"learning_rate": 1.9727344800936683e-05,
"loss": 0.3834,
"num_input_tokens_seen": 59247216,
"step": 4775
},
{
"epoch": 1.7047075606276747,
"grad_norm": 2.3406333923339844,
"learning_rate": 1.968172089749664e-05,
"loss": 0.3516,
"num_input_tokens_seen": 59307904,
"step": 4780
},
{
"epoch": 1.7064907275320969,
"grad_norm": 1.5572633743286133,
"learning_rate": 1.963611553843262e-05,
"loss": 0.3393,
"num_input_tokens_seen": 59371232,
"step": 4785
},
{
"epoch": 1.7082738944365192,
"grad_norm": 1.5522667169570923,
"learning_rate": 1.9590528882766565e-05,
"loss": 0.3647,
"num_input_tokens_seen": 59431552,
"step": 4790
},
{
"epoch": 1.7100570613409416,
"grad_norm": 1.8206260204315186,
"learning_rate": 1.954496108945515e-05,
"loss": 0.3945,
"num_input_tokens_seen": 59493632,
"step": 4795
},
{
"epoch": 1.7118402282453637,
"grad_norm": 1.764434814453125,
"learning_rate": 1.9499412317389305e-05,
"loss": 0.3307,
"num_input_tokens_seen": 59555664,
"step": 4800
},
{
"epoch": 1.713623395149786,
"grad_norm": 1.5242831707000732,
"learning_rate": 1.9453882725393647e-05,
"loss": 0.3776,
"num_input_tokens_seen": 59618688,
"step": 4805
},
{
"epoch": 1.7154065620542083,
"grad_norm": 1.5517514944076538,
"learning_rate": 1.940837247222587e-05,
"loss": 0.3579,
"num_input_tokens_seen": 59678528,
"step": 4810
},
{
"epoch": 1.7171897289586306,
"grad_norm": 1.5298833847045898,
"learning_rate": 1.936288171657629e-05,
"loss": 0.3753,
"num_input_tokens_seen": 59741216,
"step": 4815
},
{
"epoch": 1.7189728958630528,
"grad_norm": 2.073880672454834,
"learning_rate": 1.93174106170672e-05,
"loss": 0.4658,
"num_input_tokens_seen": 59800048,
"step": 4820
},
{
"epoch": 1.720756062767475,
"grad_norm": 2.046309232711792,
"learning_rate": 1.927195933225236e-05,
"loss": 0.3769,
"num_input_tokens_seen": 59863248,
"step": 4825
},
{
"epoch": 1.7225392296718973,
"grad_norm": 2.1719560623168945,
"learning_rate": 1.922652802061644e-05,
"loss": 0.3404,
"num_input_tokens_seen": 59923552,
"step": 4830
},
{
"epoch": 1.7243223965763197,
"grad_norm": 1.5875083208084106,
"learning_rate": 1.9181116840574482e-05,
"loss": 0.4051,
"num_input_tokens_seen": 59985472,
"step": 4835
},
{
"epoch": 1.7261055634807418,
"grad_norm": 1.585699200630188,
"learning_rate": 1.91357259504713e-05,
"loss": 0.4092,
"num_input_tokens_seen": 60049280,
"step": 4840
},
{
"epoch": 1.727888730385164,
"grad_norm": 1.9710477590560913,
"learning_rate": 1.909035550858097e-05,
"loss": 0.4739,
"num_input_tokens_seen": 60117088,
"step": 4845
},
{
"epoch": 1.7296718972895864,
"grad_norm": 1.366571307182312,
"learning_rate": 1.9045005673106294e-05,
"loss": 0.3841,
"num_input_tokens_seen": 60182864,
"step": 4850
},
{
"epoch": 1.7314550641940085,
"grad_norm": 1.3404649496078491,
"learning_rate": 1.8999676602178177e-05,
"loss": 0.3849,
"num_input_tokens_seen": 60243184,
"step": 4855
},
{
"epoch": 1.7332382310984307,
"grad_norm": 1.3966970443725586,
"learning_rate": 1.895436845385516e-05,
"loss": 0.4027,
"num_input_tokens_seen": 60303088,
"step": 4860
},
{
"epoch": 1.735021398002853,
"grad_norm": 1.9772543907165527,
"learning_rate": 1.8909081386122794e-05,
"loss": 0.4104,
"num_input_tokens_seen": 60364368,
"step": 4865
},
{
"epoch": 1.7368045649072754,
"grad_norm": 1.3881804943084717,
"learning_rate": 1.886381555689314e-05,
"loss": 0.3492,
"num_input_tokens_seen": 60426320,
"step": 4870
},
{
"epoch": 1.7385877318116976,
"grad_norm": 1.6630189418792725,
"learning_rate": 1.8818571124004218e-05,
"loss": 0.4027,
"num_input_tokens_seen": 60488864,
"step": 4875
},
{
"epoch": 1.7403708987161197,
"grad_norm": 1.3820875883102417,
"learning_rate": 1.8773348245219403e-05,
"loss": 0.413,
"num_input_tokens_seen": 60551408,
"step": 4880
},
{
"epoch": 1.742154065620542,
"grad_norm": 1.9947419166564941,
"learning_rate": 1.8728147078226955e-05,
"loss": 0.3397,
"num_input_tokens_seen": 60616480,
"step": 4885
},
{
"epoch": 1.7439372325249645,
"grad_norm": 1.653064489364624,
"learning_rate": 1.8682967780639398e-05,
"loss": 0.3454,
"num_input_tokens_seen": 60679840,
"step": 4890
},
{
"epoch": 1.7457203994293866,
"grad_norm": 1.5282491445541382,
"learning_rate": 1.8637810509993002e-05,
"loss": 0.3116,
"num_input_tokens_seen": 60744928,
"step": 4895
},
{
"epoch": 1.7475035663338088,
"grad_norm": 1.984711766242981,
"learning_rate": 1.859267542374724e-05,
"loss": 0.3609,
"num_input_tokens_seen": 60807424,
"step": 4900
},
{
"epoch": 1.7492867332382311,
"grad_norm": 1.3897349834442139,
"learning_rate": 1.8547562679284243e-05,
"loss": 0.3705,
"num_input_tokens_seen": 60870304,
"step": 4905
},
{
"epoch": 1.7510699001426535,
"grad_norm": 1.379233479499817,
"learning_rate": 1.8502472433908197e-05,
"loss": 0.3989,
"num_input_tokens_seen": 60930880,
"step": 4910
},
{
"epoch": 1.7528530670470754,
"grad_norm": 1.5175212621688843,
"learning_rate": 1.8457404844844883e-05,
"loss": 0.3492,
"num_input_tokens_seen": 60995040,
"step": 4915
},
{
"epoch": 1.7546362339514978,
"grad_norm": 1.4529441595077515,
"learning_rate": 1.8412360069241034e-05,
"loss": 0.3915,
"num_input_tokens_seen": 61059296,
"step": 4920
},
{
"epoch": 1.7564194008559202,
"grad_norm": 1.897772192955017,
"learning_rate": 1.836733826416387e-05,
"loss": 0.3891,
"num_input_tokens_seen": 61116064,
"step": 4925
},
{
"epoch": 1.7582025677603423,
"grad_norm": 1.2248293161392212,
"learning_rate": 1.8322339586600524e-05,
"loss": 0.3858,
"num_input_tokens_seen": 61179536,
"step": 4930
},
{
"epoch": 1.7599857346647645,
"grad_norm": 1.7476718425750732,
"learning_rate": 1.8277364193457436e-05,
"loss": 0.4066,
"num_input_tokens_seen": 61239040,
"step": 4935
},
{
"epoch": 1.7617689015691869,
"grad_norm": 1.3817017078399658,
"learning_rate": 1.8232412241559896e-05,
"loss": 0.3426,
"num_input_tokens_seen": 61302576,
"step": 4940
},
{
"epoch": 1.7635520684736092,
"grad_norm": 1.6009461879730225,
"learning_rate": 1.818748388765146e-05,
"loss": 0.4506,
"num_input_tokens_seen": 61367472,
"step": 4945
},
{
"epoch": 1.7653352353780314,
"grad_norm": 1.4665969610214233,
"learning_rate": 1.8142579288393354e-05,
"loss": 0.4153,
"num_input_tokens_seen": 61428000,
"step": 4950
},
{
"epoch": 1.7671184022824535,
"grad_norm": 1.4972290992736816,
"learning_rate": 1.8097698600364026e-05,
"loss": 0.4038,
"num_input_tokens_seen": 61492752,
"step": 4955
},
{
"epoch": 1.768901569186876,
"grad_norm": 1.6141380071640015,
"learning_rate": 1.8052841980058533e-05,
"loss": 0.3921,
"num_input_tokens_seen": 61554016,
"step": 4960
},
{
"epoch": 1.7706847360912983,
"grad_norm": 2.656374931335449,
"learning_rate": 1.8008009583887982e-05,
"loss": 0.3052,
"num_input_tokens_seen": 61613216,
"step": 4965
},
{
"epoch": 1.7724679029957204,
"grad_norm": 1.6948812007904053,
"learning_rate": 1.7963201568179046e-05,
"loss": 0.3735,
"num_input_tokens_seen": 61675680,
"step": 4970
},
{
"epoch": 1.7742510699001426,
"grad_norm": 1.326504111289978,
"learning_rate": 1.791841808917338e-05,
"loss": 0.3857,
"num_input_tokens_seen": 61739056,
"step": 4975
},
{
"epoch": 1.776034236804565,
"grad_norm": 1.384664535522461,
"learning_rate": 1.7873659303027052e-05,
"loss": 0.3468,
"num_input_tokens_seen": 61801136,
"step": 4980
},
{
"epoch": 1.777817403708987,
"grad_norm": 1.8413240909576416,
"learning_rate": 1.7828925365810077e-05,
"loss": 0.4141,
"num_input_tokens_seen": 61862192,
"step": 4985
},
{
"epoch": 1.7796005706134093,
"grad_norm": 1.7805628776550293,
"learning_rate": 1.778421643350578e-05,
"loss": 0.3474,
"num_input_tokens_seen": 61923792,
"step": 4990
},
{
"epoch": 1.7813837375178316,
"grad_norm": 1.673789620399475,
"learning_rate": 1.7739532662010322e-05,
"loss": 0.4318,
"num_input_tokens_seen": 61984768,
"step": 4995
},
{
"epoch": 1.783166904422254,
"grad_norm": 1.5332224369049072,
"learning_rate": 1.7694874207132127e-05,
"loss": 0.464,
"num_input_tokens_seen": 62048432,
"step": 5000
}
],
"logging_steps": 5,
"max_steps": 8412,
"num_input_tokens_seen": 62048432,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.9126461037889126e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}