LegrandFrederic's picture
Upload trainer_state.json with huggingface_hub
1fce7fd verified
raw
history blame
192 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 18.900343642611684,
"eval_steps": 500,
"global_step": 11000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01718213058419244,
"grad_norm": 6.171707630157471,
"learning_rate": 1.7182130584192443e-06,
"loss": 0.9772,
"step": 10
},
{
"epoch": 0.03436426116838488,
"grad_norm": 8.092323303222656,
"learning_rate": 3.4364261168384886e-06,
"loss": 0.8776,
"step": 20
},
{
"epoch": 0.05154639175257732,
"grad_norm": 4.56458044052124,
"learning_rate": 5.154639175257732e-06,
"loss": 0.9078,
"step": 30
},
{
"epoch": 0.06872852233676977,
"grad_norm": 4.5016679763793945,
"learning_rate": 6.872852233676977e-06,
"loss": 0.6541,
"step": 40
},
{
"epoch": 0.0859106529209622,
"grad_norm": 3.0442380905151367,
"learning_rate": 8.591065292096221e-06,
"loss": 0.4879,
"step": 50
},
{
"epoch": 0.10309278350515463,
"grad_norm": 3.105207681655884,
"learning_rate": 1.0309278350515464e-05,
"loss": 0.4411,
"step": 60
},
{
"epoch": 0.12027491408934708,
"grad_norm": 2.3539834022521973,
"learning_rate": 1.2027491408934708e-05,
"loss": 0.326,
"step": 70
},
{
"epoch": 0.13745704467353953,
"grad_norm": 2.5519046783447266,
"learning_rate": 1.3745704467353954e-05,
"loss": 0.285,
"step": 80
},
{
"epoch": 0.15463917525773196,
"grad_norm": 4.304392337799072,
"learning_rate": 1.5463917525773197e-05,
"loss": 0.2539,
"step": 90
},
{
"epoch": 0.1718213058419244,
"grad_norm": 2.098220109939575,
"learning_rate": 1.7182130584192442e-05,
"loss": 0.2023,
"step": 100
},
{
"epoch": 0.18900343642611683,
"grad_norm": 1.672839879989624,
"learning_rate": 1.8900343642611683e-05,
"loss": 0.1724,
"step": 110
},
{
"epoch": 0.20618556701030927,
"grad_norm": 1.0948542356491089,
"learning_rate": 2.0618556701030927e-05,
"loss": 0.1859,
"step": 120
},
{
"epoch": 0.22336769759450173,
"grad_norm": 1.5368601083755493,
"learning_rate": 2.2336769759450175e-05,
"loss": 0.1309,
"step": 130
},
{
"epoch": 0.24054982817869416,
"grad_norm": 1.4471988677978516,
"learning_rate": 2.4054982817869417e-05,
"loss": 0.134,
"step": 140
},
{
"epoch": 0.25773195876288657,
"grad_norm": 1.2296886444091797,
"learning_rate": 2.5773195876288658e-05,
"loss": 0.1439,
"step": 150
},
{
"epoch": 0.27491408934707906,
"grad_norm": 4.241024017333984,
"learning_rate": 2.749140893470791e-05,
"loss": 0.1214,
"step": 160
},
{
"epoch": 0.2920962199312715,
"grad_norm": 0.9207940697669983,
"learning_rate": 2.920962199312715e-05,
"loss": 0.1141,
"step": 170
},
{
"epoch": 0.30927835051546393,
"grad_norm": 0.7457907199859619,
"learning_rate": 3.0927835051546395e-05,
"loss": 0.0938,
"step": 180
},
{
"epoch": 0.32646048109965636,
"grad_norm": 2.0314087867736816,
"learning_rate": 3.2646048109965636e-05,
"loss": 0.1287,
"step": 190
},
{
"epoch": 0.3436426116838488,
"grad_norm": 1.9431196451187134,
"learning_rate": 3.4364261168384884e-05,
"loss": 0.112,
"step": 200
},
{
"epoch": 0.36082474226804123,
"grad_norm": 1.023748755455017,
"learning_rate": 3.6082474226804125e-05,
"loss": 0.1039,
"step": 210
},
{
"epoch": 0.37800687285223367,
"grad_norm": 1.6307401657104492,
"learning_rate": 3.7800687285223366e-05,
"loss": 0.1076,
"step": 220
},
{
"epoch": 0.3951890034364261,
"grad_norm": 1.2871341705322266,
"learning_rate": 3.9518900343642614e-05,
"loss": 0.1007,
"step": 230
},
{
"epoch": 0.41237113402061853,
"grad_norm": 1.1707206964492798,
"learning_rate": 4.1237113402061855e-05,
"loss": 0.0978,
"step": 240
},
{
"epoch": 0.42955326460481097,
"grad_norm": 1.1074360609054565,
"learning_rate": 4.2955326460481096e-05,
"loss": 0.0853,
"step": 250
},
{
"epoch": 0.44673539518900346,
"grad_norm": 1.1564663648605347,
"learning_rate": 4.467353951890035e-05,
"loss": 0.0928,
"step": 260
},
{
"epoch": 0.4639175257731959,
"grad_norm": 0.8830773830413818,
"learning_rate": 4.639175257731959e-05,
"loss": 0.0852,
"step": 270
},
{
"epoch": 0.48109965635738833,
"grad_norm": 0.8775057792663574,
"learning_rate": 4.810996563573883e-05,
"loss": 0.0897,
"step": 280
},
{
"epoch": 0.49828178694158076,
"grad_norm": 0.5895084142684937,
"learning_rate": 4.982817869415808e-05,
"loss": 0.0741,
"step": 290
},
{
"epoch": 0.5154639175257731,
"grad_norm": 1.0802148580551147,
"learning_rate": 5.1546391752577315e-05,
"loss": 0.0829,
"step": 300
},
{
"epoch": 0.5326460481099656,
"grad_norm": 1.7113603353500366,
"learning_rate": 5.326460481099656e-05,
"loss": 0.0781,
"step": 310
},
{
"epoch": 0.5498281786941581,
"grad_norm": 1.0171607732772827,
"learning_rate": 5.498281786941582e-05,
"loss": 0.0794,
"step": 320
},
{
"epoch": 0.5670103092783505,
"grad_norm": 1.2694463729858398,
"learning_rate": 5.670103092783505e-05,
"loss": 0.0828,
"step": 330
},
{
"epoch": 0.584192439862543,
"grad_norm": 1.6448224782943726,
"learning_rate": 5.84192439862543e-05,
"loss": 0.0738,
"step": 340
},
{
"epoch": 0.6013745704467354,
"grad_norm": 1.3152124881744385,
"learning_rate": 6.013745704467354e-05,
"loss": 0.0805,
"step": 350
},
{
"epoch": 0.6185567010309279,
"grad_norm": 0.9917396306991577,
"learning_rate": 6.185567010309279e-05,
"loss": 0.0719,
"step": 360
},
{
"epoch": 0.6357388316151202,
"grad_norm": 1.0059962272644043,
"learning_rate": 6.357388316151203e-05,
"loss": 0.0648,
"step": 370
},
{
"epoch": 0.6529209621993127,
"grad_norm": 0.8844972848892212,
"learning_rate": 6.529209621993127e-05,
"loss": 0.0778,
"step": 380
},
{
"epoch": 0.6701030927835051,
"grad_norm": 0.7589945793151855,
"learning_rate": 6.701030927835051e-05,
"loss": 0.0924,
"step": 390
},
{
"epoch": 0.6872852233676976,
"grad_norm": 1.062225580215454,
"learning_rate": 6.872852233676977e-05,
"loss": 0.0637,
"step": 400
},
{
"epoch": 0.7044673539518901,
"grad_norm": 1.1478430032730103,
"learning_rate": 7.044673539518901e-05,
"loss": 0.0878,
"step": 410
},
{
"epoch": 0.7216494845360825,
"grad_norm": 1.2245433330535889,
"learning_rate": 7.216494845360825e-05,
"loss": 0.0856,
"step": 420
},
{
"epoch": 0.738831615120275,
"grad_norm": 0.8035943508148193,
"learning_rate": 7.38831615120275e-05,
"loss": 0.0789,
"step": 430
},
{
"epoch": 0.7560137457044673,
"grad_norm": 1.5007230043411255,
"learning_rate": 7.560137457044673e-05,
"loss": 0.099,
"step": 440
},
{
"epoch": 0.7731958762886598,
"grad_norm": 0.8082581162452698,
"learning_rate": 7.731958762886599e-05,
"loss": 0.0818,
"step": 450
},
{
"epoch": 0.7903780068728522,
"grad_norm": 1.0343904495239258,
"learning_rate": 7.903780068728523e-05,
"loss": 0.0622,
"step": 460
},
{
"epoch": 0.8075601374570447,
"grad_norm": 0.7941983342170715,
"learning_rate": 8.075601374570447e-05,
"loss": 0.077,
"step": 470
},
{
"epoch": 0.8247422680412371,
"grad_norm": 0.7006020545959473,
"learning_rate": 8.247422680412371e-05,
"loss": 0.056,
"step": 480
},
{
"epoch": 0.8419243986254296,
"grad_norm": 0.5468656420707703,
"learning_rate": 8.419243986254296e-05,
"loss": 0.0611,
"step": 490
},
{
"epoch": 0.8591065292096219,
"grad_norm": 0.581874668598175,
"learning_rate": 8.591065292096219e-05,
"loss": 0.0544,
"step": 500
},
{
"epoch": 0.8762886597938144,
"grad_norm": 0.7868462800979614,
"learning_rate": 8.762886597938145e-05,
"loss": 0.0639,
"step": 510
},
{
"epoch": 0.8934707903780069,
"grad_norm": 0.9123062491416931,
"learning_rate": 8.93470790378007e-05,
"loss": 0.0529,
"step": 520
},
{
"epoch": 0.9106529209621993,
"grad_norm": 0.9630204439163208,
"learning_rate": 9.106529209621993e-05,
"loss": 0.0599,
"step": 530
},
{
"epoch": 0.9278350515463918,
"grad_norm": 1.0028278827667236,
"learning_rate": 9.278350515463918e-05,
"loss": 0.0746,
"step": 540
},
{
"epoch": 0.9450171821305842,
"grad_norm": 0.8045145869255066,
"learning_rate": 9.450171821305843e-05,
"loss": 0.0604,
"step": 550
},
{
"epoch": 0.9621993127147767,
"grad_norm": 0.5860382914543152,
"learning_rate": 9.621993127147767e-05,
"loss": 0.0635,
"step": 560
},
{
"epoch": 0.979381443298969,
"grad_norm": 0.9446794986724854,
"learning_rate": 9.793814432989691e-05,
"loss": 0.0711,
"step": 570
},
{
"epoch": 0.9965635738831615,
"grad_norm": 0.9152433276176453,
"learning_rate": 9.965635738831616e-05,
"loss": 0.0651,
"step": 580
},
{
"epoch": 1.013745704467354,
"grad_norm": 0.7524177432060242,
"learning_rate": 9.999987081161148e-05,
"loss": 0.0593,
"step": 590
},
{
"epoch": 1.0309278350515463,
"grad_norm": 1.0932648181915283,
"learning_rate": 9.999934598492723e-05,
"loss": 0.0585,
"step": 600
},
{
"epoch": 1.0481099656357389,
"grad_norm": 0.5448580384254456,
"learning_rate": 9.999841744990731e-05,
"loss": 0.0705,
"step": 610
},
{
"epoch": 1.0652920962199313,
"grad_norm": 0.8481371402740479,
"learning_rate": 9.999708521404896e-05,
"loss": 0.0763,
"step": 620
},
{
"epoch": 1.0824742268041236,
"grad_norm": 0.8610166311264038,
"learning_rate": 9.999534928810904e-05,
"loss": 0.0598,
"step": 630
},
{
"epoch": 1.0996563573883162,
"grad_norm": 0.807761549949646,
"learning_rate": 9.999320968610386e-05,
"loss": 0.0567,
"step": 640
},
{
"epoch": 1.1168384879725086,
"grad_norm": 0.4783917963504791,
"learning_rate": 9.999066642530917e-05,
"loss": 0.056,
"step": 650
},
{
"epoch": 1.134020618556701,
"grad_norm": 0.6751272678375244,
"learning_rate": 9.998771952625992e-05,
"loss": 0.0498,
"step": 660
},
{
"epoch": 1.1512027491408934,
"grad_norm": 0.8272377848625183,
"learning_rate": 9.998436901275022e-05,
"loss": 0.0449,
"step": 670
},
{
"epoch": 1.168384879725086,
"grad_norm": 0.8059535026550293,
"learning_rate": 9.998061491183297e-05,
"loss": 0.0624,
"step": 680
},
{
"epoch": 1.1855670103092784,
"grad_norm": 0.7479894757270813,
"learning_rate": 9.997645725381986e-05,
"loss": 0.0471,
"step": 690
},
{
"epoch": 1.2027491408934707,
"grad_norm": 0.6483791470527649,
"learning_rate": 9.997189607228092e-05,
"loss": 0.0497,
"step": 700
},
{
"epoch": 1.2199312714776633,
"grad_norm": 0.8845646381378174,
"learning_rate": 9.99669314040444e-05,
"loss": 0.0617,
"step": 710
},
{
"epoch": 1.2371134020618557,
"grad_norm": 0.8434107303619385,
"learning_rate": 9.996156328919635e-05,
"loss": 0.0447,
"step": 720
},
{
"epoch": 1.254295532646048,
"grad_norm": 0.6829891800880432,
"learning_rate": 9.995579177108041e-05,
"loss": 0.059,
"step": 730
},
{
"epoch": 1.2714776632302405,
"grad_norm": 0.5923603773117065,
"learning_rate": 9.994961689629738e-05,
"loss": 0.0483,
"step": 740
},
{
"epoch": 1.2886597938144329,
"grad_norm": 0.48384591937065125,
"learning_rate": 9.994303871470489e-05,
"loss": 0.0565,
"step": 750
},
{
"epoch": 1.3058419243986255,
"grad_norm": 0.7825417518615723,
"learning_rate": 9.993605727941697e-05,
"loss": 0.0545,
"step": 760
},
{
"epoch": 1.3230240549828178,
"grad_norm": 0.9657111167907715,
"learning_rate": 9.992867264680361e-05,
"loss": 0.0532,
"step": 770
},
{
"epoch": 1.3402061855670104,
"grad_norm": 1.0996328592300415,
"learning_rate": 9.992088487649038e-05,
"loss": 0.0637,
"step": 780
},
{
"epoch": 1.3573883161512028,
"grad_norm": 0.8697621822357178,
"learning_rate": 9.991269403135783e-05,
"loss": 0.0445,
"step": 790
},
{
"epoch": 1.3745704467353952,
"grad_norm": 0.4780273735523224,
"learning_rate": 9.990410017754108e-05,
"loss": 0.0509,
"step": 800
},
{
"epoch": 1.3917525773195876,
"grad_norm": 0.386453777551651,
"learning_rate": 9.989510338442925e-05,
"loss": 0.0465,
"step": 810
},
{
"epoch": 1.40893470790378,
"grad_norm": 0.7011645436286926,
"learning_rate": 9.98857037246649e-05,
"loss": 0.0659,
"step": 820
},
{
"epoch": 1.4261168384879725,
"grad_norm": 0.47305113077163696,
"learning_rate": 9.987590127414344e-05,
"loss": 0.0391,
"step": 830
},
{
"epoch": 1.443298969072165,
"grad_norm": 0.6128239035606384,
"learning_rate": 9.986569611201251e-05,
"loss": 0.0433,
"step": 840
},
{
"epoch": 1.4604810996563573,
"grad_norm": 0.6045581698417664,
"learning_rate": 9.985508832067139e-05,
"loss": 0.0485,
"step": 850
},
{
"epoch": 1.47766323024055,
"grad_norm": 0.6033497452735901,
"learning_rate": 9.984407798577027e-05,
"loss": 0.049,
"step": 860
},
{
"epoch": 1.4948453608247423,
"grad_norm": 0.47953736782073975,
"learning_rate": 9.98326651962096e-05,
"loss": 0.0539,
"step": 870
},
{
"epoch": 1.5120274914089347,
"grad_norm": 0.8113358020782471,
"learning_rate": 9.982085004413933e-05,
"loss": 0.0481,
"step": 880
},
{
"epoch": 1.529209621993127,
"grad_norm": 0.5726741552352905,
"learning_rate": 9.980863262495821e-05,
"loss": 0.0512,
"step": 890
},
{
"epoch": 1.5463917525773194,
"grad_norm": 0.6560239195823669,
"learning_rate": 9.979601303731306e-05,
"loss": 0.0464,
"step": 900
},
{
"epoch": 1.563573883161512,
"grad_norm": 0.5235106348991394,
"learning_rate": 9.978299138309781e-05,
"loss": 0.0486,
"step": 910
},
{
"epoch": 1.5807560137457046,
"grad_norm": 0.6439309120178223,
"learning_rate": 9.976956776745287e-05,
"loss": 0.0536,
"step": 920
},
{
"epoch": 1.597938144329897,
"grad_norm": 0.8001301884651184,
"learning_rate": 9.975574229876417e-05,
"loss": 0.0641,
"step": 930
},
{
"epoch": 1.6151202749140894,
"grad_norm": 0.6167306900024414,
"learning_rate": 9.974151508866231e-05,
"loss": 0.0372,
"step": 940
},
{
"epoch": 1.6323024054982818,
"grad_norm": 0.5872222781181335,
"learning_rate": 9.972688625202164e-05,
"loss": 0.0452,
"step": 950
},
{
"epoch": 1.6494845360824741,
"grad_norm": 0.4873111844062805,
"learning_rate": 9.97118559069594e-05,
"loss": 0.0524,
"step": 960
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.4051572382450104,
"learning_rate": 9.969642417483466e-05,
"loss": 0.0407,
"step": 970
},
{
"epoch": 1.6838487972508591,
"grad_norm": 0.43837177753448486,
"learning_rate": 9.968059118024744e-05,
"loss": 0.0471,
"step": 980
},
{
"epoch": 1.7010309278350515,
"grad_norm": 0.5712767243385315,
"learning_rate": 9.966435705103765e-05,
"loss": 0.0398,
"step": 990
},
{
"epoch": 1.718213058419244,
"grad_norm": 0.6849848628044128,
"learning_rate": 9.964772191828407e-05,
"loss": 0.0428,
"step": 1000
},
{
"epoch": 1.7353951890034365,
"grad_norm": 0.3726734220981598,
"learning_rate": 9.96306859163033e-05,
"loss": 0.0584,
"step": 1010
},
{
"epoch": 1.7525773195876289,
"grad_norm": 0.3805288076400757,
"learning_rate": 9.961324918264865e-05,
"loss": 0.0414,
"step": 1020
},
{
"epoch": 1.7697594501718212,
"grad_norm": 0.45931509137153625,
"learning_rate": 9.959541185810906e-05,
"loss": 0.0438,
"step": 1030
},
{
"epoch": 1.7869415807560136,
"grad_norm": 0.4892406761646271,
"learning_rate": 9.957717408670793e-05,
"loss": 0.0402,
"step": 1040
},
{
"epoch": 1.8041237113402062,
"grad_norm": 0.6873617768287659,
"learning_rate": 9.955853601570202e-05,
"loss": 0.049,
"step": 1050
},
{
"epoch": 1.8213058419243986,
"grad_norm": 0.8491326570510864,
"learning_rate": 9.953949779558017e-05,
"loss": 0.0532,
"step": 1060
},
{
"epoch": 1.8384879725085912,
"grad_norm": 0.45153722167015076,
"learning_rate": 9.952005958006217e-05,
"loss": 0.0403,
"step": 1070
},
{
"epoch": 1.8556701030927836,
"grad_norm": 0.608630359172821,
"learning_rate": 9.950022152609745e-05,
"loss": 0.0479,
"step": 1080
},
{
"epoch": 1.872852233676976,
"grad_norm": 0.5155346989631653,
"learning_rate": 9.947998379386388e-05,
"loss": 0.045,
"step": 1090
},
{
"epoch": 1.8900343642611683,
"grad_norm": 0.2339087724685669,
"learning_rate": 9.945934654676639e-05,
"loss": 0.0361,
"step": 1100
},
{
"epoch": 1.9072164948453607,
"grad_norm": 0.4478403329849243,
"learning_rate": 9.943830995143577e-05,
"loss": 0.0355,
"step": 1110
},
{
"epoch": 1.9243986254295533,
"grad_norm": 0.3183349072933197,
"learning_rate": 9.941687417772718e-05,
"loss": 0.0349,
"step": 1120
},
{
"epoch": 1.9415807560137457,
"grad_norm": 0.3854424059391022,
"learning_rate": 9.939503939871893e-05,
"loss": 0.0329,
"step": 1130
},
{
"epoch": 1.9587628865979383,
"grad_norm": 0.7198600172996521,
"learning_rate": 9.937280579071095e-05,
"loss": 0.0372,
"step": 1140
},
{
"epoch": 1.9759450171821307,
"grad_norm": 0.5371730923652649,
"learning_rate": 9.935017353322347e-05,
"loss": 0.0388,
"step": 1150
},
{
"epoch": 1.993127147766323,
"grad_norm": 0.6025398373603821,
"learning_rate": 9.932714280899547e-05,
"loss": 0.0334,
"step": 1160
},
{
"epoch": 2.0103092783505154,
"grad_norm": 0.4118864834308624,
"learning_rate": 9.930371380398331e-05,
"loss": 0.0429,
"step": 1170
},
{
"epoch": 2.027491408934708,
"grad_norm": 0.5828579664230347,
"learning_rate": 9.927988670735915e-05,
"loss": 0.0377,
"step": 1180
},
{
"epoch": 2.0446735395189,
"grad_norm": 0.45602017641067505,
"learning_rate": 9.925566171150945e-05,
"loss": 0.0364,
"step": 1190
},
{
"epoch": 2.0618556701030926,
"grad_norm": 0.236759752035141,
"learning_rate": 9.923103901203343e-05,
"loss": 0.0456,
"step": 1200
},
{
"epoch": 2.0790378006872854,
"grad_norm": 0.5670115947723389,
"learning_rate": 9.920601880774148e-05,
"loss": 0.0514,
"step": 1210
},
{
"epoch": 2.0962199312714778,
"grad_norm": 0.5565935373306274,
"learning_rate": 9.918060130065354e-05,
"loss": 0.0413,
"step": 1220
},
{
"epoch": 2.11340206185567,
"grad_norm": 0.28620976209640503,
"learning_rate": 9.915478669599747e-05,
"loss": 0.0345,
"step": 1230
},
{
"epoch": 2.1305841924398625,
"grad_norm": 0.598374605178833,
"learning_rate": 9.912857520220743e-05,
"loss": 0.0409,
"step": 1240
},
{
"epoch": 2.147766323024055,
"grad_norm": 0.4782467186450958,
"learning_rate": 9.910196703092216e-05,
"loss": 0.0341,
"step": 1250
},
{
"epoch": 2.1649484536082473,
"grad_norm": 0.3740648925304413,
"learning_rate": 9.907496239698327e-05,
"loss": 0.0334,
"step": 1260
},
{
"epoch": 2.1821305841924397,
"grad_norm": 0.514352560043335,
"learning_rate": 9.904756151843353e-05,
"loss": 0.033,
"step": 1270
},
{
"epoch": 2.1993127147766325,
"grad_norm": 0.6190779209136963,
"learning_rate": 9.90197646165151e-05,
"loss": 0.0382,
"step": 1280
},
{
"epoch": 2.216494845360825,
"grad_norm": 0.45846354961395264,
"learning_rate": 9.899157191566775e-05,
"loss": 0.0352,
"step": 1290
},
{
"epoch": 2.2336769759450172,
"grad_norm": 0.3824189603328705,
"learning_rate": 9.8962983643527e-05,
"loss": 0.0401,
"step": 1300
},
{
"epoch": 2.2508591065292096,
"grad_norm": 0.3505632281303406,
"learning_rate": 9.893400003092237e-05,
"loss": 0.0335,
"step": 1310
},
{
"epoch": 2.268041237113402,
"grad_norm": 0.55964595079422,
"learning_rate": 9.890462131187543e-05,
"loss": 0.0349,
"step": 1320
},
{
"epoch": 2.2852233676975944,
"grad_norm": 0.41852259635925293,
"learning_rate": 9.887484772359795e-05,
"loss": 0.0489,
"step": 1330
},
{
"epoch": 2.3024054982817868,
"grad_norm": 0.7132606506347656,
"learning_rate": 9.884467950648998e-05,
"loss": 0.0531,
"step": 1340
},
{
"epoch": 2.319587628865979,
"grad_norm": 0.5213425755500793,
"learning_rate": 9.881411690413796e-05,
"loss": 0.034,
"step": 1350
},
{
"epoch": 2.336769759450172,
"grad_norm": 0.6458540558815002,
"learning_rate": 9.878316016331262e-05,
"loss": 0.0473,
"step": 1360
},
{
"epoch": 2.3539518900343643,
"grad_norm": 0.41432708501815796,
"learning_rate": 9.875180953396714e-05,
"loss": 0.0321,
"step": 1370
},
{
"epoch": 2.3711340206185567,
"grad_norm": 0.3965621888637543,
"learning_rate": 9.872006526923503e-05,
"loss": 0.0351,
"step": 1380
},
{
"epoch": 2.388316151202749,
"grad_norm": 0.2506723999977112,
"learning_rate": 9.868792762542814e-05,
"loss": 0.0402,
"step": 1390
},
{
"epoch": 2.4054982817869415,
"grad_norm": 0.2948648929595947,
"learning_rate": 9.865539686203455e-05,
"loss": 0.0335,
"step": 1400
},
{
"epoch": 2.422680412371134,
"grad_norm": 0.5881168842315674,
"learning_rate": 9.862247324171652e-05,
"loss": 0.0473,
"step": 1410
},
{
"epoch": 2.4398625429553267,
"grad_norm": 0.5597307085990906,
"learning_rate": 9.858915703030829e-05,
"loss": 0.0387,
"step": 1420
},
{
"epoch": 2.457044673539519,
"grad_norm": 0.3447171449661255,
"learning_rate": 9.855544849681404e-05,
"loss": 0.0395,
"step": 1430
},
{
"epoch": 2.4742268041237114,
"grad_norm": 0.675528347492218,
"learning_rate": 9.852134791340567e-05,
"loss": 0.0303,
"step": 1440
},
{
"epoch": 2.491408934707904,
"grad_norm": 0.4080379903316498,
"learning_rate": 9.848685555542055e-05,
"loss": 0.0414,
"step": 1450
},
{
"epoch": 2.508591065292096,
"grad_norm": 0.34045320749282837,
"learning_rate": 9.845197170135939e-05,
"loss": 0.0291,
"step": 1460
},
{
"epoch": 2.5257731958762886,
"grad_norm": 0.34041810035705566,
"learning_rate": 9.841669663288391e-05,
"loss": 0.0287,
"step": 1470
},
{
"epoch": 2.542955326460481,
"grad_norm": 0.35550206899642944,
"learning_rate": 9.838103063481464e-05,
"loss": 0.035,
"step": 1480
},
{
"epoch": 2.5601374570446733,
"grad_norm": 0.5085458755493164,
"learning_rate": 9.834497399512855e-05,
"loss": 0.0286,
"step": 1490
},
{
"epoch": 2.5773195876288657,
"grad_norm": 0.3794465959072113,
"learning_rate": 9.830852700495676e-05,
"loss": 0.0383,
"step": 1500
},
{
"epoch": 2.5945017182130585,
"grad_norm": 0.20820270478725433,
"learning_rate": 9.82716899585822e-05,
"loss": 0.0229,
"step": 1510
},
{
"epoch": 2.611683848797251,
"grad_norm": 0.31715983152389526,
"learning_rate": 9.823446315343723e-05,
"loss": 0.0267,
"step": 1520
},
{
"epoch": 2.6288659793814433,
"grad_norm": 0.518182635307312,
"learning_rate": 9.819684689010119e-05,
"loss": 0.0328,
"step": 1530
},
{
"epoch": 2.6460481099656357,
"grad_norm": 0.3830466568470001,
"learning_rate": 9.815884147229804e-05,
"loss": 0.0289,
"step": 1540
},
{
"epoch": 2.663230240549828,
"grad_norm": 0.4509371817111969,
"learning_rate": 9.812044720689387e-05,
"loss": 0.0369,
"step": 1550
},
{
"epoch": 2.680412371134021,
"grad_norm": 0.5616033673286438,
"learning_rate": 9.808166440389446e-05,
"loss": 0.0264,
"step": 1560
},
{
"epoch": 2.6975945017182132,
"grad_norm": 0.5223531723022461,
"learning_rate": 9.80424933764427e-05,
"loss": 0.0265,
"step": 1570
},
{
"epoch": 2.7147766323024056,
"grad_norm": 0.5588796734809875,
"learning_rate": 9.800293444081612e-05,
"loss": 0.0298,
"step": 1580
},
{
"epoch": 2.731958762886598,
"grad_norm": 0.5224287509918213,
"learning_rate": 9.796298791642435e-05,
"loss": 0.0334,
"step": 1590
},
{
"epoch": 2.7491408934707904,
"grad_norm": 0.510735809803009,
"learning_rate": 9.792265412580654e-05,
"loss": 0.0344,
"step": 1600
},
{
"epoch": 2.7663230240549828,
"grad_norm": 0.46988189220428467,
"learning_rate": 9.788193339462866e-05,
"loss": 0.034,
"step": 1610
},
{
"epoch": 2.783505154639175,
"grad_norm": 0.43194422125816345,
"learning_rate": 9.7840826051681e-05,
"loss": 0.033,
"step": 1620
},
{
"epoch": 2.8006872852233675,
"grad_norm": 0.5727249383926392,
"learning_rate": 9.779933242887542e-05,
"loss": 0.0321,
"step": 1630
},
{
"epoch": 2.81786941580756,
"grad_norm": 0.3941832482814789,
"learning_rate": 9.775745286124277e-05,
"loss": 0.0286,
"step": 1640
},
{
"epoch": 2.8350515463917527,
"grad_norm": 0.5706576704978943,
"learning_rate": 9.771518768693004e-05,
"loss": 0.0271,
"step": 1650
},
{
"epoch": 2.852233676975945,
"grad_norm": 0.5128160715103149,
"learning_rate": 9.76725372471978e-05,
"loss": 0.0434,
"step": 1660
},
{
"epoch": 2.8694158075601375,
"grad_norm": 0.34409016370773315,
"learning_rate": 9.762950188641728e-05,
"loss": 0.0314,
"step": 1670
},
{
"epoch": 2.88659793814433,
"grad_norm": 0.532747209072113,
"learning_rate": 9.758608195206771e-05,
"loss": 0.0369,
"step": 1680
},
{
"epoch": 2.9037800687285222,
"grad_norm": 0.5421701073646545,
"learning_rate": 9.754227779473349e-05,
"loss": 0.0404,
"step": 1690
},
{
"epoch": 2.9209621993127146,
"grad_norm": 0.36500459909439087,
"learning_rate": 9.749808976810128e-05,
"loss": 0.0332,
"step": 1700
},
{
"epoch": 2.9381443298969074,
"grad_norm": 0.5636774897575378,
"learning_rate": 9.745351822895727e-05,
"loss": 0.0309,
"step": 1710
},
{
"epoch": 2.9553264604811,
"grad_norm": 0.408263236284256,
"learning_rate": 9.740856353718419e-05,
"loss": 0.033,
"step": 1720
},
{
"epoch": 2.972508591065292,
"grad_norm": 0.4448431432247162,
"learning_rate": 9.736322605575845e-05,
"loss": 0.0248,
"step": 1730
},
{
"epoch": 2.9896907216494846,
"grad_norm": 0.3676033020019531,
"learning_rate": 9.731750615074724e-05,
"loss": 0.036,
"step": 1740
},
{
"epoch": 3.006872852233677,
"grad_norm": 0.3884856104850769,
"learning_rate": 9.727140419130553e-05,
"loss": 0.0256,
"step": 1750
},
{
"epoch": 3.0240549828178693,
"grad_norm": 0.4114404320716858,
"learning_rate": 9.72249205496731e-05,
"loss": 0.0273,
"step": 1760
},
{
"epoch": 3.0412371134020617,
"grad_norm": 0.5628842711448669,
"learning_rate": 9.717805560117149e-05,
"loss": 0.0254,
"step": 1770
},
{
"epoch": 3.058419243986254,
"grad_norm": 0.34935763478279114,
"learning_rate": 9.71308097242011e-05,
"loss": 0.0246,
"step": 1780
},
{
"epoch": 3.075601374570447,
"grad_norm": 0.8378509283065796,
"learning_rate": 9.708318330023798e-05,
"loss": 0.0358,
"step": 1790
},
{
"epoch": 3.0927835051546393,
"grad_norm": 0.4501832127571106,
"learning_rate": 9.703517671383086e-05,
"loss": 0.0314,
"step": 1800
},
{
"epoch": 3.1099656357388317,
"grad_norm": 0.5251947641372681,
"learning_rate": 9.698679035259801e-05,
"loss": 0.0291,
"step": 1810
},
{
"epoch": 3.127147766323024,
"grad_norm": 0.36063244938850403,
"learning_rate": 9.693802460722405e-05,
"loss": 0.0244,
"step": 1820
},
{
"epoch": 3.1443298969072164,
"grad_norm": 0.2640397548675537,
"learning_rate": 9.688887987145691e-05,
"loss": 0.0291,
"step": 1830
},
{
"epoch": 3.161512027491409,
"grad_norm": 0.35009852051734924,
"learning_rate": 9.683935654210457e-05,
"loss": 0.0355,
"step": 1840
},
{
"epoch": 3.178694158075601,
"grad_norm": 0.455991268157959,
"learning_rate": 9.678945501903188e-05,
"loss": 0.0244,
"step": 1850
},
{
"epoch": 3.195876288659794,
"grad_norm": 0.2577104866504669,
"learning_rate": 9.673917570515732e-05,
"loss": 0.0277,
"step": 1860
},
{
"epoch": 3.2130584192439864,
"grad_norm": 0.46351000666618347,
"learning_rate": 9.668851900644975e-05,
"loss": 0.0249,
"step": 1870
},
{
"epoch": 3.2302405498281788,
"grad_norm": 0.4203677773475647,
"learning_rate": 9.663748533192516e-05,
"loss": 0.0251,
"step": 1880
},
{
"epoch": 3.247422680412371,
"grad_norm": 0.24778026342391968,
"learning_rate": 9.658607509364337e-05,
"loss": 0.0286,
"step": 1890
},
{
"epoch": 3.2646048109965635,
"grad_norm": 0.5941663980484009,
"learning_rate": 9.653428870670459e-05,
"loss": 0.0375,
"step": 1900
},
{
"epoch": 3.281786941580756,
"grad_norm": 0.6710448265075684,
"learning_rate": 9.648212658924625e-05,
"loss": 0.0268,
"step": 1910
},
{
"epoch": 3.2989690721649483,
"grad_norm": 0.40934911370277405,
"learning_rate": 9.642958916243946e-05,
"loss": 0.0187,
"step": 1920
},
{
"epoch": 3.3161512027491407,
"grad_norm": 0.3697362542152405,
"learning_rate": 9.637667685048575e-05,
"loss": 0.0286,
"step": 1930
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.40777045488357544,
"learning_rate": 9.63233900806135e-05,
"loss": 0.0232,
"step": 1940
},
{
"epoch": 3.350515463917526,
"grad_norm": 0.2753160893917084,
"learning_rate": 9.62697292830746e-05,
"loss": 0.0305,
"step": 1950
},
{
"epoch": 3.3676975945017182,
"grad_norm": 0.5245633721351624,
"learning_rate": 9.6215694891141e-05,
"loss": 0.0268,
"step": 1960
},
{
"epoch": 3.3848797250859106,
"grad_norm": 0.4454520344734192,
"learning_rate": 9.616128734110103e-05,
"loss": 0.0334,
"step": 1970
},
{
"epoch": 3.402061855670103,
"grad_norm": 0.40832188725471497,
"learning_rate": 9.61065070722561e-05,
"loss": 0.0375,
"step": 1980
},
{
"epoch": 3.4192439862542954,
"grad_norm": 0.4421581029891968,
"learning_rate": 9.6051354526917e-05,
"loss": 0.0291,
"step": 1990
},
{
"epoch": 3.436426116838488,
"grad_norm": 0.3832218050956726,
"learning_rate": 9.59958301504004e-05,
"loss": 0.0348,
"step": 2000
},
{
"epoch": 3.4536082474226806,
"grad_norm": 0.2825784683227539,
"learning_rate": 9.593993439102526e-05,
"loss": 0.0285,
"step": 2010
},
{
"epoch": 3.470790378006873,
"grad_norm": 0.4989912211894989,
"learning_rate": 9.588366770010914e-05,
"loss": 0.0298,
"step": 2020
},
{
"epoch": 3.4879725085910653,
"grad_norm": 0.38946759700775146,
"learning_rate": 9.582703053196464e-05,
"loss": 0.0294,
"step": 2030
},
{
"epoch": 3.5051546391752577,
"grad_norm": 0.3553588092327118,
"learning_rate": 9.577002334389569e-05,
"loss": 0.0281,
"step": 2040
},
{
"epoch": 3.52233676975945,
"grad_norm": 0.48752427101135254,
"learning_rate": 9.571264659619382e-05,
"loss": 0.026,
"step": 2050
},
{
"epoch": 3.5395189003436425,
"grad_norm": 0.3820585310459137,
"learning_rate": 9.565490075213452e-05,
"loss": 0.0259,
"step": 2060
},
{
"epoch": 3.556701030927835,
"grad_norm": 0.35598281025886536,
"learning_rate": 9.55967862779735e-05,
"loss": 0.0343,
"step": 2070
},
{
"epoch": 3.5738831615120272,
"grad_norm": 0.4193035364151001,
"learning_rate": 9.55383036429428e-05,
"loss": 0.0296,
"step": 2080
},
{
"epoch": 3.59106529209622,
"grad_norm": 0.4993601441383362,
"learning_rate": 9.547945331924717e-05,
"loss": 0.0236,
"step": 2090
},
{
"epoch": 3.6082474226804124,
"grad_norm": 0.3591003119945526,
"learning_rate": 9.542023578206015e-05,
"loss": 0.0301,
"step": 2100
},
{
"epoch": 3.625429553264605,
"grad_norm": 0.30369478464126587,
"learning_rate": 9.536065150952025e-05,
"loss": 0.0327,
"step": 2110
},
{
"epoch": 3.642611683848797,
"grad_norm": 0.37964117527008057,
"learning_rate": 9.530070098272712e-05,
"loss": 0.0351,
"step": 2120
},
{
"epoch": 3.6597938144329896,
"grad_norm": 0.4031108617782593,
"learning_rate": 9.524038468573764e-05,
"loss": 0.0334,
"step": 2130
},
{
"epoch": 3.6769759450171824,
"grad_norm": 0.24876996874809265,
"learning_rate": 9.517970310556202e-05,
"loss": 0.0238,
"step": 2140
},
{
"epoch": 3.6941580756013748,
"grad_norm": 0.5632336139678955,
"learning_rate": 9.511865673215986e-05,
"loss": 0.0245,
"step": 2150
},
{
"epoch": 3.711340206185567,
"grad_norm": 0.4374890625476837,
"learning_rate": 9.50572460584362e-05,
"loss": 0.0364,
"step": 2160
},
{
"epoch": 3.7285223367697595,
"grad_norm": 0.4703497588634491,
"learning_rate": 9.499547158023755e-05,
"loss": 0.0248,
"step": 2170
},
{
"epoch": 3.745704467353952,
"grad_norm": 0.3067072927951813,
"learning_rate": 9.493333379634786e-05,
"loss": 0.0203,
"step": 2180
},
{
"epoch": 3.7628865979381443,
"grad_norm": 0.5396534204483032,
"learning_rate": 9.487083320848454e-05,
"loss": 0.0296,
"step": 2190
},
{
"epoch": 3.7800687285223367,
"grad_norm": 0.2977238595485687,
"learning_rate": 9.480797032129432e-05,
"loss": 0.0243,
"step": 2200
},
{
"epoch": 3.797250859106529,
"grad_norm": 0.35456737875938416,
"learning_rate": 9.474474564234931e-05,
"loss": 0.0331,
"step": 2210
},
{
"epoch": 3.8144329896907214,
"grad_norm": 0.4030454456806183,
"learning_rate": 9.468115968214276e-05,
"loss": 0.0271,
"step": 2220
},
{
"epoch": 3.8316151202749142,
"grad_norm": 0.4009501039981842,
"learning_rate": 9.461721295408505e-05,
"loss": 0.025,
"step": 2230
},
{
"epoch": 3.8487972508591066,
"grad_norm": 0.34113046526908875,
"learning_rate": 9.455290597449945e-05,
"loss": 0.0298,
"step": 2240
},
{
"epoch": 3.865979381443299,
"grad_norm": 0.4473305344581604,
"learning_rate": 9.448823926261805e-05,
"loss": 0.0293,
"step": 2250
},
{
"epoch": 3.8831615120274914,
"grad_norm": 0.4152556359767914,
"learning_rate": 9.442321334057748e-05,
"loss": 0.0365,
"step": 2260
},
{
"epoch": 3.9003436426116838,
"grad_norm": 0.5801966190338135,
"learning_rate": 9.435782873341474e-05,
"loss": 0.0283,
"step": 2270
},
{
"epoch": 3.917525773195876,
"grad_norm": 0.5143575668334961,
"learning_rate": 9.429208596906296e-05,
"loss": 0.0276,
"step": 2280
},
{
"epoch": 3.934707903780069,
"grad_norm": 0.28408244252204895,
"learning_rate": 9.422598557834712e-05,
"loss": 0.0266,
"step": 2290
},
{
"epoch": 3.9518900343642613,
"grad_norm": 0.30861398577690125,
"learning_rate": 9.415952809497979e-05,
"loss": 0.0307,
"step": 2300
},
{
"epoch": 3.9690721649484537,
"grad_norm": 0.5015305280685425,
"learning_rate": 9.409271405555677e-05,
"loss": 0.0238,
"step": 2310
},
{
"epoch": 3.986254295532646,
"grad_norm": 0.42114853858947754,
"learning_rate": 9.402554399955281e-05,
"loss": 0.0297,
"step": 2320
},
{
"epoch": 4.0034364261168385,
"grad_norm": 0.38618704676628113,
"learning_rate": 9.395801846931726e-05,
"loss": 0.0274,
"step": 2330
},
{
"epoch": 4.020618556701031,
"grad_norm": 0.44997620582580566,
"learning_rate": 9.389013801006961e-05,
"loss": 0.0294,
"step": 2340
},
{
"epoch": 4.037800687285223,
"grad_norm": 0.4600159227848053,
"learning_rate": 9.382190316989518e-05,
"loss": 0.0286,
"step": 2350
},
{
"epoch": 4.054982817869416,
"grad_norm": 0.35218673944473267,
"learning_rate": 9.375331449974066e-05,
"loss": 0.0248,
"step": 2360
},
{
"epoch": 4.072164948453608,
"grad_norm": 0.26790571212768555,
"learning_rate": 9.368437255340965e-05,
"loss": 0.0287,
"step": 2370
},
{
"epoch": 4.0893470790378,
"grad_norm": 0.356351375579834,
"learning_rate": 9.361507788755818e-05,
"loss": 0.0207,
"step": 2380
},
{
"epoch": 4.106529209621993,
"grad_norm": 0.3762167692184448,
"learning_rate": 9.354543106169029e-05,
"loss": 0.0303,
"step": 2390
},
{
"epoch": 4.123711340206185,
"grad_norm": 0.21644559502601624,
"learning_rate": 9.347543263815339e-05,
"loss": 0.0262,
"step": 2400
},
{
"epoch": 4.140893470790378,
"grad_norm": 0.3905271887779236,
"learning_rate": 9.340508318213383e-05,
"loss": 0.0267,
"step": 2410
},
{
"epoch": 4.158075601374571,
"grad_norm": 0.25276127457618713,
"learning_rate": 9.333438326165227e-05,
"loss": 0.0292,
"step": 2420
},
{
"epoch": 4.175257731958763,
"grad_norm": 0.21106575429439545,
"learning_rate": 9.326333344755912e-05,
"loss": 0.0218,
"step": 2430
},
{
"epoch": 4.1924398625429555,
"grad_norm": 0.37403470277786255,
"learning_rate": 9.319193431352993e-05,
"loss": 0.0261,
"step": 2440
},
{
"epoch": 4.209621993127148,
"grad_norm": 0.23083224892616272,
"learning_rate": 9.312018643606074e-05,
"loss": 0.0268,
"step": 2450
},
{
"epoch": 4.22680412371134,
"grad_norm": 0.29775136709213257,
"learning_rate": 9.304809039446347e-05,
"loss": 0.0286,
"step": 2460
},
{
"epoch": 4.243986254295533,
"grad_norm": 0.39073804020881653,
"learning_rate": 9.297564677086118e-05,
"loss": 0.0231,
"step": 2470
},
{
"epoch": 4.261168384879725,
"grad_norm": 0.3536919951438904,
"learning_rate": 9.290285615018342e-05,
"loss": 0.0269,
"step": 2480
},
{
"epoch": 4.278350515463917,
"grad_norm": 0.37961915135383606,
"learning_rate": 9.282971912016149e-05,
"loss": 0.0312,
"step": 2490
},
{
"epoch": 4.29553264604811,
"grad_norm": 0.444950670003891,
"learning_rate": 9.275623627132368e-05,
"loss": 0.0275,
"step": 2500
},
{
"epoch": 4.312714776632302,
"grad_norm": 0.3781861364841461,
"learning_rate": 9.268240819699054e-05,
"loss": 0.0285,
"step": 2510
},
{
"epoch": 4.329896907216495,
"grad_norm": 0.2931497395038605,
"learning_rate": 9.260823549327002e-05,
"loss": 0.0258,
"step": 2520
},
{
"epoch": 4.347079037800687,
"grad_norm": 0.26529255509376526,
"learning_rate": 9.253371875905274e-05,
"loss": 0.026,
"step": 2530
},
{
"epoch": 4.364261168384879,
"grad_norm": 0.6221164464950562,
"learning_rate": 9.245885859600712e-05,
"loss": 0.0366,
"step": 2540
},
{
"epoch": 4.381443298969073,
"grad_norm": 0.39402952790260315,
"learning_rate": 9.238365560857447e-05,
"loss": 0.0237,
"step": 2550
},
{
"epoch": 4.398625429553265,
"grad_norm": 0.33800095319747925,
"learning_rate": 9.230811040396423e-05,
"loss": 0.0328,
"step": 2560
},
{
"epoch": 4.415807560137457,
"grad_norm": 0.21751320362091064,
"learning_rate": 9.223222359214891e-05,
"loss": 0.0315,
"step": 2570
},
{
"epoch": 4.43298969072165,
"grad_norm": 0.34266844391822815,
"learning_rate": 9.215599578585936e-05,
"loss": 0.0374,
"step": 2580
},
{
"epoch": 4.450171821305842,
"grad_norm": 0.3306879997253418,
"learning_rate": 9.207942760057958e-05,
"loss": 0.023,
"step": 2590
},
{
"epoch": 4.4673539518900345,
"grad_norm": 0.3142191767692566,
"learning_rate": 9.200251965454199e-05,
"loss": 0.0263,
"step": 2600
},
{
"epoch": 4.484536082474227,
"grad_norm": 0.4040457010269165,
"learning_rate": 9.192527256872226e-05,
"loss": 0.0269,
"step": 2610
},
{
"epoch": 4.501718213058419,
"grad_norm": 0.3506450951099396,
"learning_rate": 9.184768696683443e-05,
"loss": 0.0227,
"step": 2620
},
{
"epoch": 4.518900343642612,
"grad_norm": 0.4573422074317932,
"learning_rate": 9.176976347532575e-05,
"loss": 0.0312,
"step": 2630
},
{
"epoch": 4.536082474226804,
"grad_norm": 0.2337106168270111,
"learning_rate": 9.169150272337172e-05,
"loss": 0.028,
"step": 2640
},
{
"epoch": 4.553264604810996,
"grad_norm": 0.28833648562431335,
"learning_rate": 9.161290534287099e-05,
"loss": 0.0245,
"step": 2650
},
{
"epoch": 4.570446735395189,
"grad_norm": 0.24830326437950134,
"learning_rate": 9.153397196844017e-05,
"loss": 0.0218,
"step": 2660
},
{
"epoch": 4.587628865979381,
"grad_norm": 0.55474853515625,
"learning_rate": 9.145470323740885e-05,
"loss": 0.0247,
"step": 2670
},
{
"epoch": 4.6048109965635735,
"grad_norm": 0.49275097250938416,
"learning_rate": 9.137509978981435e-05,
"loss": 0.0276,
"step": 2680
},
{
"epoch": 4.621993127147766,
"grad_norm": 0.30603882670402527,
"learning_rate": 9.129516226839658e-05,
"loss": 0.0208,
"step": 2690
},
{
"epoch": 4.639175257731958,
"grad_norm": 0.32763534784317017,
"learning_rate": 9.121489131859286e-05,
"loss": 0.0267,
"step": 2700
},
{
"epoch": 4.6563573883161515,
"grad_norm": 0.41010305285453796,
"learning_rate": 9.113428758853268e-05,
"loss": 0.0223,
"step": 2710
},
{
"epoch": 4.673539518900344,
"grad_norm": 0.2709559500217438,
"learning_rate": 9.105335172903253e-05,
"loss": 0.0253,
"step": 2720
},
{
"epoch": 4.690721649484536,
"grad_norm": 0.23412011563777924,
"learning_rate": 9.097208439359057e-05,
"loss": 0.0146,
"step": 2730
},
{
"epoch": 4.707903780068729,
"grad_norm": 0.4020395576953888,
"learning_rate": 9.08904862383814e-05,
"loss": 0.0188,
"step": 2740
},
{
"epoch": 4.725085910652921,
"grad_norm": 0.2301657497882843,
"learning_rate": 9.080855792225076e-05,
"loss": 0.0227,
"step": 2750
},
{
"epoch": 4.742268041237113,
"grad_norm": 0.3554215133190155,
"learning_rate": 9.072630010671015e-05,
"loss": 0.0213,
"step": 2760
},
{
"epoch": 4.759450171821306,
"grad_norm": 0.39176028966903687,
"learning_rate": 9.064371345593161e-05,
"loss": 0.0208,
"step": 2770
},
{
"epoch": 4.776632302405498,
"grad_norm": 0.304556280374527,
"learning_rate": 9.056079863674223e-05,
"loss": 0.0211,
"step": 2780
},
{
"epoch": 4.793814432989691,
"grad_norm": 0.3643367290496826,
"learning_rate": 9.047755631861884e-05,
"loss": 0.0237,
"step": 2790
},
{
"epoch": 4.810996563573883,
"grad_norm": 0.379891961812973,
"learning_rate": 9.039398717368259e-05,
"loss": 0.025,
"step": 2800
},
{
"epoch": 4.828178694158075,
"grad_norm": 0.3559863269329071,
"learning_rate": 9.031009187669353e-05,
"loss": 0.0204,
"step": 2810
},
{
"epoch": 4.845360824742268,
"grad_norm": 0.43560439348220825,
"learning_rate": 9.02258711050451e-05,
"loss": 0.0254,
"step": 2820
},
{
"epoch": 4.862542955326461,
"grad_norm": 0.28781425952911377,
"learning_rate": 9.014132553875878e-05,
"loss": 0.0319,
"step": 2830
},
{
"epoch": 4.879725085910653,
"grad_norm": 0.2920217514038086,
"learning_rate": 9.005645586047847e-05,
"loss": 0.0259,
"step": 2840
},
{
"epoch": 4.896907216494846,
"grad_norm": 0.28286901116371155,
"learning_rate": 8.997126275546509e-05,
"loss": 0.0213,
"step": 2850
},
{
"epoch": 4.914089347079038,
"grad_norm": 0.3962228298187256,
"learning_rate": 8.988574691159095e-05,
"loss": 0.0257,
"step": 2860
},
{
"epoch": 4.9312714776632305,
"grad_norm": 0.43689677119255066,
"learning_rate": 8.979990901933428e-05,
"loss": 0.0296,
"step": 2870
},
{
"epoch": 4.948453608247423,
"grad_norm": 0.35005268454551697,
"learning_rate": 8.971374977177356e-05,
"loss": 0.0259,
"step": 2880
},
{
"epoch": 4.965635738831615,
"grad_norm": 0.6228940486907959,
"learning_rate": 8.962726986458207e-05,
"loss": 0.0307,
"step": 2890
},
{
"epoch": 4.982817869415808,
"grad_norm": 0.2236226499080658,
"learning_rate": 8.954046999602211e-05,
"loss": 0.0249,
"step": 2900
},
{
"epoch": 5.0,
"grad_norm": 0.42723384499549866,
"learning_rate": 8.945335086693942e-05,
"loss": 0.0244,
"step": 2910
},
{
"epoch": 5.017182130584192,
"grad_norm": 0.2944800853729248,
"learning_rate": 8.936591318075764e-05,
"loss": 0.0248,
"step": 2920
},
{
"epoch": 5.034364261168385,
"grad_norm": 0.30557361245155334,
"learning_rate": 8.927815764347242e-05,
"loss": 0.0204,
"step": 2930
},
{
"epoch": 5.051546391752577,
"grad_norm": 0.3732447922229767,
"learning_rate": 8.919008496364587e-05,
"loss": 0.0308,
"step": 2940
},
{
"epoch": 5.0687285223367695,
"grad_norm": 0.30933091044425964,
"learning_rate": 8.910169585240078e-05,
"loss": 0.029,
"step": 2950
},
{
"epoch": 5.085910652920962,
"grad_norm": 0.554576575756073,
"learning_rate": 8.901299102341494e-05,
"loss": 0.03,
"step": 2960
},
{
"epoch": 5.103092783505154,
"grad_norm": 0.43166640400886536,
"learning_rate": 8.892397119291526e-05,
"loss": 0.0241,
"step": 2970
},
{
"epoch": 5.120274914089347,
"grad_norm": 0.4924606680870056,
"learning_rate": 8.883463707967211e-05,
"loss": 0.0224,
"step": 2980
},
{
"epoch": 5.13745704467354,
"grad_norm": 0.3466743230819702,
"learning_rate": 8.874498940499346e-05,
"loss": 0.0234,
"step": 2990
},
{
"epoch": 5.154639175257732,
"grad_norm": 0.5283942222595215,
"learning_rate": 8.865502889271901e-05,
"loss": 0.0416,
"step": 3000
},
{
"epoch": 5.171821305841925,
"grad_norm": 0.3286329209804535,
"learning_rate": 8.85647562692145e-05,
"loss": 0.0249,
"step": 3010
},
{
"epoch": 5.189003436426117,
"grad_norm": 0.5245858430862427,
"learning_rate": 8.847417226336561e-05,
"loss": 0.0272,
"step": 3020
},
{
"epoch": 5.206185567010309,
"grad_norm": 0.3810178339481354,
"learning_rate": 8.83832776065723e-05,
"loss": 0.0238,
"step": 3030
},
{
"epoch": 5.223367697594502,
"grad_norm": 0.48333027958869934,
"learning_rate": 8.829207303274279e-05,
"loss": 0.0262,
"step": 3040
},
{
"epoch": 5.240549828178694,
"grad_norm": 0.4119150638580322,
"learning_rate": 8.820055927828762e-05,
"loss": 0.0243,
"step": 3050
},
{
"epoch": 5.257731958762887,
"grad_norm": 0.26957616209983826,
"learning_rate": 8.810873708211383e-05,
"loss": 0.0228,
"step": 3060
},
{
"epoch": 5.274914089347079,
"grad_norm": 0.502048134803772,
"learning_rate": 8.801660718561875e-05,
"loss": 0.0215,
"step": 3070
},
{
"epoch": 5.292096219931271,
"grad_norm": 0.3506264090538025,
"learning_rate": 8.79241703326843e-05,
"loss": 0.0335,
"step": 3080
},
{
"epoch": 5.309278350515464,
"grad_norm": 0.42758750915527344,
"learning_rate": 8.78314272696708e-05,
"loss": 0.0294,
"step": 3090
},
{
"epoch": 5.326460481099656,
"grad_norm": 0.25186318159103394,
"learning_rate": 8.773837874541099e-05,
"loss": 0.0319,
"step": 3100
},
{
"epoch": 5.3436426116838485,
"grad_norm": 0.310088187456131,
"learning_rate": 8.7645025511204e-05,
"loss": 0.0263,
"step": 3110
},
{
"epoch": 5.360824742268041,
"grad_norm": 0.3250679075717926,
"learning_rate": 8.755136832080927e-05,
"loss": 0.027,
"step": 3120
},
{
"epoch": 5.378006872852234,
"grad_norm": 0.3429087698459625,
"learning_rate": 8.745740793044046e-05,
"loss": 0.024,
"step": 3130
},
{
"epoch": 5.3951890034364265,
"grad_norm": 0.2694869637489319,
"learning_rate": 8.736314509875934e-05,
"loss": 0.0256,
"step": 3140
},
{
"epoch": 5.412371134020619,
"grad_norm": 0.32267141342163086,
"learning_rate": 8.726858058686968e-05,
"loss": 0.0269,
"step": 3150
},
{
"epoch": 5.429553264604811,
"grad_norm": 0.3753204643726349,
"learning_rate": 8.717371515831112e-05,
"loss": 0.0209,
"step": 3160
},
{
"epoch": 5.446735395189004,
"grad_norm": 0.258056640625,
"learning_rate": 8.707854957905294e-05,
"loss": 0.0236,
"step": 3170
},
{
"epoch": 5.463917525773196,
"grad_norm": 0.19171011447906494,
"learning_rate": 8.698308461748799e-05,
"loss": 0.0241,
"step": 3180
},
{
"epoch": 5.481099656357388,
"grad_norm": 0.20204898715019226,
"learning_rate": 8.688732104442632e-05,
"loss": 0.0166,
"step": 3190
},
{
"epoch": 5.498281786941581,
"grad_norm": 0.4373096227645874,
"learning_rate": 8.679125963308909e-05,
"loss": 0.0271,
"step": 3200
},
{
"epoch": 5.515463917525773,
"grad_norm": 0.24470694363117218,
"learning_rate": 8.669490115910234e-05,
"loss": 0.0193,
"step": 3210
},
{
"epoch": 5.5326460481099655,
"grad_norm": 0.3045484125614166,
"learning_rate": 8.659824640049063e-05,
"loss": 0.0203,
"step": 3220
},
{
"epoch": 5.549828178694158,
"grad_norm": 0.18822188675403595,
"learning_rate": 8.650129613767075e-05,
"loss": 0.0179,
"step": 3230
},
{
"epoch": 5.56701030927835,
"grad_norm": 0.2681497037410736,
"learning_rate": 8.640405115344557e-05,
"loss": 0.0179,
"step": 3240
},
{
"epoch": 5.584192439862543,
"grad_norm": 0.2358640730381012,
"learning_rate": 8.630651223299755e-05,
"loss": 0.0231,
"step": 3250
},
{
"epoch": 5.601374570446735,
"grad_norm": 0.33891239762306213,
"learning_rate": 8.620868016388252e-05,
"loss": 0.02,
"step": 3260
},
{
"epoch": 5.618556701030927,
"grad_norm": 0.5138821005821228,
"learning_rate": 8.611055573602323e-05,
"loss": 0.0258,
"step": 3270
},
{
"epoch": 5.63573883161512,
"grad_norm": 0.20864839851856232,
"learning_rate": 8.601213974170303e-05,
"loss": 0.0178,
"step": 3280
},
{
"epoch": 5.652920962199313,
"grad_norm": 0.1877431720495224,
"learning_rate": 8.591343297555947e-05,
"loss": 0.0208,
"step": 3290
},
{
"epoch": 5.670103092783505,
"grad_norm": 0.38843151926994324,
"learning_rate": 8.581443623457785e-05,
"loss": 0.0277,
"step": 3300
},
{
"epoch": 5.687285223367698,
"grad_norm": 0.22977161407470703,
"learning_rate": 8.571515031808484e-05,
"loss": 0.0169,
"step": 3310
},
{
"epoch": 5.70446735395189,
"grad_norm": 0.39261528849601746,
"learning_rate": 8.561557602774196e-05,
"loss": 0.0151,
"step": 3320
},
{
"epoch": 5.721649484536083,
"grad_norm": 0.20939397811889648,
"learning_rate": 8.551571416753912e-05,
"loss": 0.0247,
"step": 3330
},
{
"epoch": 5.738831615120275,
"grad_norm": 0.3138323724269867,
"learning_rate": 8.54155655437882e-05,
"loss": 0.0202,
"step": 3340
},
{
"epoch": 5.756013745704467,
"grad_norm": 0.2958749830722809,
"learning_rate": 8.531513096511646e-05,
"loss": 0.0239,
"step": 3350
},
{
"epoch": 5.77319587628866,
"grad_norm": 0.43186619877815247,
"learning_rate": 8.521441124246002e-05,
"loss": 0.028,
"step": 3360
},
{
"epoch": 5.790378006872852,
"grad_norm": 0.3215327262878418,
"learning_rate": 8.511340718905737e-05,
"loss": 0.0273,
"step": 3370
},
{
"epoch": 5.8075601374570445,
"grad_norm": 0.584010899066925,
"learning_rate": 8.501211962044275e-05,
"loss": 0.0241,
"step": 3380
},
{
"epoch": 5.824742268041237,
"grad_norm": 0.4198577404022217,
"learning_rate": 8.491054935443954e-05,
"loss": 0.0197,
"step": 3390
},
{
"epoch": 5.841924398625429,
"grad_norm": 0.4115603566169739,
"learning_rate": 8.480869721115375e-05,
"loss": 0.0201,
"step": 3400
},
{
"epoch": 5.859106529209622,
"grad_norm": 0.19503287971019745,
"learning_rate": 8.470656401296732e-05,
"loss": 0.0226,
"step": 3410
},
{
"epoch": 5.876288659793815,
"grad_norm": 0.3533823490142822,
"learning_rate": 8.460415058453153e-05,
"loss": 0.0245,
"step": 3420
},
{
"epoch": 5.893470790378007,
"grad_norm": 0.22459329664707184,
"learning_rate": 8.450145775276024e-05,
"loss": 0.0203,
"step": 3430
},
{
"epoch": 5.9106529209622,
"grad_norm": 0.5531524419784546,
"learning_rate": 8.439848634682337e-05,
"loss": 0.0347,
"step": 3440
},
{
"epoch": 5.927835051546392,
"grad_norm": 0.3939720690250397,
"learning_rate": 8.429523719814008e-05,
"loss": 0.0217,
"step": 3450
},
{
"epoch": 5.945017182130584,
"grad_norm": 0.26560521125793457,
"learning_rate": 8.419171114037214e-05,
"loss": 0.0249,
"step": 3460
},
{
"epoch": 5.962199312714777,
"grad_norm": 0.26765570044517517,
"learning_rate": 8.40879090094171e-05,
"loss": 0.022,
"step": 3470
},
{
"epoch": 5.979381443298969,
"grad_norm": 0.32663553953170776,
"learning_rate": 8.398383164340167e-05,
"loss": 0.0234,
"step": 3480
},
{
"epoch": 5.9965635738831615,
"grad_norm": 0.3831205368041992,
"learning_rate": 8.387947988267482e-05,
"loss": 0.0265,
"step": 3490
},
{
"epoch": 6.013745704467354,
"grad_norm": 0.35195666551589966,
"learning_rate": 8.37748545698011e-05,
"loss": 0.0182,
"step": 3500
},
{
"epoch": 6.030927835051546,
"grad_norm": 0.3900887370109558,
"learning_rate": 8.366995654955375e-05,
"loss": 0.0234,
"step": 3510
},
{
"epoch": 6.048109965635739,
"grad_norm": 0.41412341594696045,
"learning_rate": 8.356478666890798e-05,
"loss": 0.0254,
"step": 3520
},
{
"epoch": 6.065292096219931,
"grad_norm": 0.2592662572860718,
"learning_rate": 8.345934577703403e-05,
"loss": 0.0163,
"step": 3530
},
{
"epoch": 6.082474226804123,
"grad_norm": 0.3936319947242737,
"learning_rate": 8.335363472529038e-05,
"loss": 0.0266,
"step": 3540
},
{
"epoch": 6.099656357388316,
"grad_norm": 0.3583790957927704,
"learning_rate": 8.324765436721688e-05,
"loss": 0.0178,
"step": 3550
},
{
"epoch": 6.116838487972508,
"grad_norm": 0.4558425843715668,
"learning_rate": 8.314140555852777e-05,
"loss": 0.0259,
"step": 3560
},
{
"epoch": 6.134020618556701,
"grad_norm": 0.3604467213153839,
"learning_rate": 8.303488915710484e-05,
"loss": 0.027,
"step": 3570
},
{
"epoch": 6.151202749140894,
"grad_norm": 0.22830836474895477,
"learning_rate": 8.292810602299059e-05,
"loss": 0.0239,
"step": 3580
},
{
"epoch": 6.168384879725086,
"grad_norm": 0.18954436480998993,
"learning_rate": 8.282105701838106e-05,
"loss": 0.0203,
"step": 3590
},
{
"epoch": 6.185567010309279,
"grad_norm": 0.25453153252601624,
"learning_rate": 8.271374300761911e-05,
"loss": 0.0247,
"step": 3600
},
{
"epoch": 6.202749140893471,
"grad_norm": 0.3951425552368164,
"learning_rate": 8.260616485718727e-05,
"loss": 0.0256,
"step": 3610
},
{
"epoch": 6.219931271477663,
"grad_norm": 0.3867959976196289,
"learning_rate": 8.249832343570082e-05,
"loss": 0.0218,
"step": 3620
},
{
"epoch": 6.237113402061856,
"grad_norm": 0.24521775543689728,
"learning_rate": 8.239021961390078e-05,
"loss": 0.0258,
"step": 3630
},
{
"epoch": 6.254295532646048,
"grad_norm": 0.3367408215999603,
"learning_rate": 8.228185426464684e-05,
"loss": 0.0184,
"step": 3640
},
{
"epoch": 6.2714776632302405,
"grad_norm": 0.28449004888534546,
"learning_rate": 8.217322826291032e-05,
"loss": 0.0235,
"step": 3650
},
{
"epoch": 6.288659793814433,
"grad_norm": 0.23285141587257385,
"learning_rate": 8.206434248576718e-05,
"loss": 0.0249,
"step": 3660
},
{
"epoch": 6.305841924398625,
"grad_norm": 0.4478093087673187,
"learning_rate": 8.195519781239079e-05,
"loss": 0.023,
"step": 3670
},
{
"epoch": 6.323024054982818,
"grad_norm": 0.3469564914703369,
"learning_rate": 8.1845795124045e-05,
"loss": 0.022,
"step": 3680
},
{
"epoch": 6.34020618556701,
"grad_norm": 0.24919480085372925,
"learning_rate": 8.173613530407691e-05,
"loss": 0.0191,
"step": 3690
},
{
"epoch": 6.357388316151202,
"grad_norm": 0.27461591362953186,
"learning_rate": 8.162621923790974e-05,
"loss": 0.0222,
"step": 3700
},
{
"epoch": 6.374570446735396,
"grad_norm": 0.35929545760154724,
"learning_rate": 8.151604781303577e-05,
"loss": 0.021,
"step": 3710
},
{
"epoch": 6.391752577319588,
"grad_norm": 0.4438592791557312,
"learning_rate": 8.140562191900909e-05,
"loss": 0.0266,
"step": 3720
},
{
"epoch": 6.40893470790378,
"grad_norm": 0.35622140765190125,
"learning_rate": 8.129494244743842e-05,
"loss": 0.0227,
"step": 3730
},
{
"epoch": 6.426116838487973,
"grad_norm": 0.3558623790740967,
"learning_rate": 8.118401029197996e-05,
"loss": 0.0294,
"step": 3740
},
{
"epoch": 6.443298969072165,
"grad_norm": 0.30522775650024414,
"learning_rate": 8.107282634833015e-05,
"loss": 0.0221,
"step": 3750
},
{
"epoch": 6.4604810996563575,
"grad_norm": 0.3831705152988434,
"learning_rate": 8.096139151421842e-05,
"loss": 0.0198,
"step": 3760
},
{
"epoch": 6.47766323024055,
"grad_norm": 0.2840515673160553,
"learning_rate": 8.084970668939998e-05,
"loss": 0.0215,
"step": 3770
},
{
"epoch": 6.494845360824742,
"grad_norm": 0.30647212266921997,
"learning_rate": 8.07377727756485e-05,
"loss": 0.0179,
"step": 3780
},
{
"epoch": 6.512027491408935,
"grad_norm": 0.2785893380641937,
"learning_rate": 8.06255906767489e-05,
"loss": 0.0205,
"step": 3790
},
{
"epoch": 6.529209621993127,
"grad_norm": 0.30499890446662903,
"learning_rate": 8.051316129849e-05,
"loss": 0.0273,
"step": 3800
},
{
"epoch": 6.546391752577319,
"grad_norm": 0.26266801357269287,
"learning_rate": 8.04004855486572e-05,
"loss": 0.0264,
"step": 3810
},
{
"epoch": 6.563573883161512,
"grad_norm": 0.24231982231140137,
"learning_rate": 8.02875643370252e-05,
"loss": 0.0213,
"step": 3820
},
{
"epoch": 6.580756013745704,
"grad_norm": 0.21040430665016174,
"learning_rate": 8.01743985753506e-05,
"loss": 0.0182,
"step": 3830
},
{
"epoch": 6.597938144329897,
"grad_norm": 0.30289342999458313,
"learning_rate": 8.006098917736461e-05,
"loss": 0.0231,
"step": 3840
},
{
"epoch": 6.615120274914089,
"grad_norm": 0.3555678427219391,
"learning_rate": 7.994733705876558e-05,
"loss": 0.0188,
"step": 3850
},
{
"epoch": 6.632302405498281,
"grad_norm": 0.2579226493835449,
"learning_rate": 7.983344313721166e-05,
"loss": 0.0218,
"step": 3860
},
{
"epoch": 6.649484536082475,
"grad_norm": 0.3365667164325714,
"learning_rate": 7.971930833231338e-05,
"loss": 0.0202,
"step": 3870
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.14933505654335022,
"learning_rate": 7.960493356562624e-05,
"loss": 0.0208,
"step": 3880
},
{
"epoch": 6.683848797250859,
"grad_norm": 0.1804179847240448,
"learning_rate": 7.949031976064327e-05,
"loss": 0.0237,
"step": 3890
},
{
"epoch": 6.701030927835052,
"grad_norm": 0.3504364788532257,
"learning_rate": 7.937546784278753e-05,
"loss": 0.0185,
"step": 3900
},
{
"epoch": 6.718213058419244,
"grad_norm": 0.2884169816970825,
"learning_rate": 7.926037873940469e-05,
"loss": 0.0166,
"step": 3910
},
{
"epoch": 6.7353951890034365,
"grad_norm": 0.3002106547355652,
"learning_rate": 7.91450533797555e-05,
"loss": 0.0198,
"step": 3920
},
{
"epoch": 6.752577319587629,
"grad_norm": 0.4687197506427765,
"learning_rate": 7.902949269500835e-05,
"loss": 0.0222,
"step": 3930
},
{
"epoch": 6.769759450171821,
"grad_norm": 0.3774946331977844,
"learning_rate": 7.891369761823164e-05,
"loss": 0.0245,
"step": 3940
},
{
"epoch": 6.786941580756014,
"grad_norm": 0.43464595079421997,
"learning_rate": 7.879766908438638e-05,
"loss": 0.0238,
"step": 3950
},
{
"epoch": 6.804123711340206,
"grad_norm": 0.338309109210968,
"learning_rate": 7.868140803031853e-05,
"loss": 0.0322,
"step": 3960
},
{
"epoch": 6.821305841924398,
"grad_norm": 0.4015257954597473,
"learning_rate": 7.85649153947515e-05,
"loss": 0.0219,
"step": 3970
},
{
"epoch": 6.838487972508591,
"grad_norm": 0.45906925201416016,
"learning_rate": 7.844819211827861e-05,
"loss": 0.0219,
"step": 3980
},
{
"epoch": 6.855670103092783,
"grad_norm": 0.25680992007255554,
"learning_rate": 7.83312391433553e-05,
"loss": 0.0191,
"step": 3990
},
{
"epoch": 6.872852233676976,
"grad_norm": 0.35143017768859863,
"learning_rate": 7.821405741429179e-05,
"loss": 0.0172,
"step": 4000
},
{
"epoch": 6.890034364261169,
"grad_norm": 0.2712146043777466,
"learning_rate": 7.809664787724527e-05,
"loss": 0.0207,
"step": 4010
},
{
"epoch": 6.907216494845361,
"grad_norm": 0.29727092385292053,
"learning_rate": 7.79790114802123e-05,
"loss": 0.0156,
"step": 4020
},
{
"epoch": 6.9243986254295535,
"grad_norm": 0.2169165462255478,
"learning_rate": 7.786114917302118e-05,
"loss": 0.0213,
"step": 4030
},
{
"epoch": 6.941580756013746,
"grad_norm": 0.27955862879753113,
"learning_rate": 7.77430619073243e-05,
"loss": 0.0205,
"step": 4040
},
{
"epoch": 6.958762886597938,
"grad_norm": 0.1645563244819641,
"learning_rate": 7.762475063659038e-05,
"loss": 0.0233,
"step": 4050
},
{
"epoch": 6.975945017182131,
"grad_norm": 0.22075164318084717,
"learning_rate": 7.750621631609684e-05,
"loss": 0.0229,
"step": 4060
},
{
"epoch": 6.993127147766323,
"grad_norm": 0.19816423952579498,
"learning_rate": 7.738745990292208e-05,
"loss": 0.0269,
"step": 4070
},
{
"epoch": 7.010309278350515,
"grad_norm": 0.2696300148963928,
"learning_rate": 7.726848235593771e-05,
"loss": 0.0209,
"step": 4080
},
{
"epoch": 7.027491408934708,
"grad_norm": 0.3369583785533905,
"learning_rate": 7.714928463580084e-05,
"loss": 0.0192,
"step": 4090
},
{
"epoch": 7.0446735395189,
"grad_norm": 0.20987503230571747,
"learning_rate": 7.702986770494633e-05,
"loss": 0.0203,
"step": 4100
},
{
"epoch": 7.061855670103093,
"grad_norm": 0.220789834856987,
"learning_rate": 7.691023252757901e-05,
"loss": 0.0176,
"step": 4110
},
{
"epoch": 7.079037800687285,
"grad_norm": 0.37767869234085083,
"learning_rate": 7.679038006966587e-05,
"loss": 0.0208,
"step": 4120
},
{
"epoch": 7.096219931271477,
"grad_norm": 0.3345067799091339,
"learning_rate": 7.66703112989283e-05,
"loss": 0.0168,
"step": 4130
},
{
"epoch": 7.11340206185567,
"grad_norm": 0.3052999973297119,
"learning_rate": 7.655002718483424e-05,
"loss": 0.0143,
"step": 4140
},
{
"epoch": 7.130584192439863,
"grad_norm": 0.3365825414657593,
"learning_rate": 7.64295286985904e-05,
"loss": 0.0206,
"step": 4150
},
{
"epoch": 7.147766323024055,
"grad_norm": 0.19445881247520447,
"learning_rate": 7.630881681313436e-05,
"loss": 0.0164,
"step": 4160
},
{
"epoch": 7.164948453608248,
"grad_norm": 0.3136243224143982,
"learning_rate": 7.618789250312675e-05,
"loss": 0.0141,
"step": 4170
},
{
"epoch": 7.18213058419244,
"grad_norm": 0.19267341494560242,
"learning_rate": 7.606675674494341e-05,
"loss": 0.0178,
"step": 4180
},
{
"epoch": 7.1993127147766325,
"grad_norm": 0.1413758099079132,
"learning_rate": 7.594541051666742e-05,
"loss": 0.0179,
"step": 4190
},
{
"epoch": 7.216494845360825,
"grad_norm": 0.19496262073516846,
"learning_rate": 7.582385479808127e-05,
"loss": 0.0141,
"step": 4200
},
{
"epoch": 7.233676975945017,
"grad_norm": 0.27552464604377747,
"learning_rate": 7.570209057065894e-05,
"loss": 0.0184,
"step": 4210
},
{
"epoch": 7.25085910652921,
"grad_norm": 0.19228124618530273,
"learning_rate": 7.558011881755797e-05,
"loss": 0.0144,
"step": 4220
},
{
"epoch": 7.268041237113402,
"grad_norm": 0.29144996404647827,
"learning_rate": 7.545794052361149e-05,
"loss": 0.0172,
"step": 4230
},
{
"epoch": 7.285223367697594,
"grad_norm": 0.21185532212257385,
"learning_rate": 7.533555667532035e-05,
"loss": 0.0126,
"step": 4240
},
{
"epoch": 7.302405498281787,
"grad_norm": 0.4170054495334625,
"learning_rate": 7.521296826084503e-05,
"loss": 0.0231,
"step": 4250
},
{
"epoch": 7.319587628865979,
"grad_norm": 0.26662755012512207,
"learning_rate": 7.50901762699978e-05,
"loss": 0.0161,
"step": 4260
},
{
"epoch": 7.3367697594501715,
"grad_norm": 0.323034405708313,
"learning_rate": 7.496718169423462e-05,
"loss": 0.0172,
"step": 4270
},
{
"epoch": 7.353951890034364,
"grad_norm": 0.22829285264015198,
"learning_rate": 7.484398552664722e-05,
"loss": 0.018,
"step": 4280
},
{
"epoch": 7.371134020618557,
"grad_norm": 0.4607219099998474,
"learning_rate": 7.472058876195496e-05,
"loss": 0.0302,
"step": 4290
},
{
"epoch": 7.3883161512027495,
"grad_norm": 0.3345796763896942,
"learning_rate": 7.459699239649696e-05,
"loss": 0.0177,
"step": 4300
},
{
"epoch": 7.405498281786942,
"grad_norm": 0.35420554876327515,
"learning_rate": 7.447319742822392e-05,
"loss": 0.0166,
"step": 4310
},
{
"epoch": 7.422680412371134,
"grad_norm": 0.2642367482185364,
"learning_rate": 7.43492048566901e-05,
"loss": 0.0186,
"step": 4320
},
{
"epoch": 7.439862542955327,
"grad_norm": 0.3100736141204834,
"learning_rate": 7.422501568304535e-05,
"loss": 0.0242,
"step": 4330
},
{
"epoch": 7.457044673539519,
"grad_norm": 0.34664222598075867,
"learning_rate": 7.410063091002682e-05,
"loss": 0.0139,
"step": 4340
},
{
"epoch": 7.474226804123711,
"grad_norm": 0.2938918471336365,
"learning_rate": 7.397605154195106e-05,
"loss": 0.016,
"step": 4350
},
{
"epoch": 7.491408934707904,
"grad_norm": 0.34358811378479004,
"learning_rate": 7.385127858470582e-05,
"loss": 0.0178,
"step": 4360
},
{
"epoch": 7.508591065292096,
"grad_norm": 0.2687462568283081,
"learning_rate": 7.372631304574194e-05,
"loss": 0.0186,
"step": 4370
},
{
"epoch": 7.525773195876289,
"grad_norm": 0.24897870421409607,
"learning_rate": 7.36011559340652e-05,
"loss": 0.0178,
"step": 4380
},
{
"epoch": 7.542955326460481,
"grad_norm": 0.36554020643234253,
"learning_rate": 7.347580826022821e-05,
"loss": 0.0218,
"step": 4390
},
{
"epoch": 7.560137457044673,
"grad_norm": 0.25389084219932556,
"learning_rate": 7.335027103632223e-05,
"loss": 0.016,
"step": 4400
},
{
"epoch": 7.577319587628866,
"grad_norm": 0.3902638256549835,
"learning_rate": 7.322454527596898e-05,
"loss": 0.0179,
"step": 4410
},
{
"epoch": 7.594501718213058,
"grad_norm": 0.220624178647995,
"learning_rate": 7.30986319943125e-05,
"loss": 0.0147,
"step": 4420
},
{
"epoch": 7.6116838487972505,
"grad_norm": 0.3258158266544342,
"learning_rate": 7.29725322080109e-05,
"loss": 0.0179,
"step": 4430
},
{
"epoch": 7.628865979381443,
"grad_norm": 0.34806087613105774,
"learning_rate": 7.28462469352282e-05,
"loss": 0.0218,
"step": 4440
},
{
"epoch": 7.646048109965636,
"grad_norm": 0.28883498907089233,
"learning_rate": 7.271977719562611e-05,
"loss": 0.017,
"step": 4450
},
{
"epoch": 7.6632302405498285,
"grad_norm": 0.17664246261119843,
"learning_rate": 7.259312401035572e-05,
"loss": 0.0154,
"step": 4460
},
{
"epoch": 7.680412371134021,
"grad_norm": 0.3173231780529022,
"learning_rate": 7.246628840204935e-05,
"loss": 0.0209,
"step": 4470
},
{
"epoch": 7.697594501718213,
"grad_norm": 0.34185221791267395,
"learning_rate": 7.233927139481224e-05,
"loss": 0.0174,
"step": 4480
},
{
"epoch": 7.714776632302406,
"grad_norm": 0.3024695813655853,
"learning_rate": 7.221207401421428e-05,
"loss": 0.021,
"step": 4490
},
{
"epoch": 7.731958762886598,
"grad_norm": 0.3330129086971283,
"learning_rate": 7.208469728728178e-05,
"loss": 0.0295,
"step": 4500
},
{
"epoch": 7.74914089347079,
"grad_norm": 0.29602715373039246,
"learning_rate": 7.195714224248912e-05,
"loss": 0.0182,
"step": 4510
},
{
"epoch": 7.766323024054983,
"grad_norm": 0.18014559149742126,
"learning_rate": 7.182940990975048e-05,
"loss": 0.0196,
"step": 4520
},
{
"epoch": 7.783505154639175,
"grad_norm": 0.2823367714881897,
"learning_rate": 7.170150132041146e-05,
"loss": 0.0233,
"step": 4530
},
{
"epoch": 7.8006872852233675,
"grad_norm": 0.31760045886039734,
"learning_rate": 7.15734175072409e-05,
"loss": 0.0155,
"step": 4540
},
{
"epoch": 7.81786941580756,
"grad_norm": 0.2565371096134186,
"learning_rate": 7.144515950442232e-05,
"loss": 0.0221,
"step": 4550
},
{
"epoch": 7.835051546391752,
"grad_norm": 0.39871808886528015,
"learning_rate": 7.131672834754582e-05,
"loss": 0.0184,
"step": 4560
},
{
"epoch": 7.852233676975945,
"grad_norm": 0.3175216615200043,
"learning_rate": 7.11881250735995e-05,
"loss": 0.0184,
"step": 4570
},
{
"epoch": 7.869415807560138,
"grad_norm": 0.37690746784210205,
"learning_rate": 7.105935072096125e-05,
"loss": 0.0198,
"step": 4580
},
{
"epoch": 7.88659793814433,
"grad_norm": 0.24447882175445557,
"learning_rate": 7.093040632939023e-05,
"loss": 0.0151,
"step": 4590
},
{
"epoch": 7.903780068728523,
"grad_norm": 0.2845030725002289,
"learning_rate": 7.08012929400186e-05,
"loss": 0.0168,
"step": 4600
},
{
"epoch": 7.920962199312715,
"grad_norm": 0.2334176003932953,
"learning_rate": 7.067201159534299e-05,
"loss": 0.0238,
"step": 4610
},
{
"epoch": 7.938144329896907,
"grad_norm": 0.2806495726108551,
"learning_rate": 7.054256333921623e-05,
"loss": 0.021,
"step": 4620
},
{
"epoch": 7.9553264604811,
"grad_norm": 0.25240814685821533,
"learning_rate": 7.041294921683876e-05,
"loss": 0.0153,
"step": 4630
},
{
"epoch": 7.972508591065292,
"grad_norm": 0.4571000337600708,
"learning_rate": 7.02831702747503e-05,
"loss": 0.0175,
"step": 4640
},
{
"epoch": 7.989690721649485,
"grad_norm": 0.27207332849502563,
"learning_rate": 7.01532275608214e-05,
"loss": 0.0198,
"step": 4650
},
{
"epoch": 8.006872852233677,
"grad_norm": 0.3235473930835724,
"learning_rate": 7.002312212424488e-05,
"loss": 0.0243,
"step": 4660
},
{
"epoch": 8.02405498281787,
"grad_norm": 0.20169375836849213,
"learning_rate": 6.989285501552751e-05,
"loss": 0.0254,
"step": 4670
},
{
"epoch": 8.041237113402062,
"grad_norm": 0.23877793550491333,
"learning_rate": 6.976242728648137e-05,
"loss": 0.0125,
"step": 4680
},
{
"epoch": 8.058419243986254,
"grad_norm": 0.2380063384771347,
"learning_rate": 6.963183999021546e-05,
"loss": 0.0293,
"step": 4690
},
{
"epoch": 8.075601374570446,
"grad_norm": 0.27434396743774414,
"learning_rate": 6.95010941811272e-05,
"loss": 0.0203,
"step": 4700
},
{
"epoch": 8.092783505154639,
"grad_norm": 0.24492555856704712,
"learning_rate": 6.93701909148938e-05,
"loss": 0.0196,
"step": 4710
},
{
"epoch": 8.109965635738831,
"grad_norm": 0.22814416885375977,
"learning_rate": 6.923913124846397e-05,
"loss": 0.0174,
"step": 4720
},
{
"epoch": 8.127147766323024,
"grad_norm": 0.2595348358154297,
"learning_rate": 6.910791624004907e-05,
"loss": 0.0151,
"step": 4730
},
{
"epoch": 8.144329896907216,
"grad_norm": 0.40572383999824524,
"learning_rate": 6.897654694911486e-05,
"loss": 0.021,
"step": 4740
},
{
"epoch": 8.161512027491408,
"grad_norm": 0.36821913719177246,
"learning_rate": 6.884502443637273e-05,
"loss": 0.0167,
"step": 4750
},
{
"epoch": 8.1786941580756,
"grad_norm": 0.2500125467777252,
"learning_rate": 6.871334976377132e-05,
"loss": 0.016,
"step": 4760
},
{
"epoch": 8.195876288659793,
"grad_norm": 0.2473415732383728,
"learning_rate": 6.858152399448773e-05,
"loss": 0.0187,
"step": 4770
},
{
"epoch": 8.213058419243985,
"grad_norm": 0.2067149579524994,
"learning_rate": 6.844954819291918e-05,
"loss": 0.0264,
"step": 4780
},
{
"epoch": 8.230240549828178,
"grad_norm": 0.24544283747673035,
"learning_rate": 6.831742342467418e-05,
"loss": 0.0207,
"step": 4790
},
{
"epoch": 8.24742268041237,
"grad_norm": 0.30843910574913025,
"learning_rate": 6.818515075656412e-05,
"loss": 0.017,
"step": 4800
},
{
"epoch": 8.264604810996564,
"grad_norm": 0.3309854567050934,
"learning_rate": 6.805273125659455e-05,
"loss": 0.0179,
"step": 4810
},
{
"epoch": 8.281786941580757,
"grad_norm": 0.21837979555130005,
"learning_rate": 6.792016599395655e-05,
"loss": 0.011,
"step": 4820
},
{
"epoch": 8.29896907216495,
"grad_norm": 0.3258560597896576,
"learning_rate": 6.778745603901817e-05,
"loss": 0.0168,
"step": 4830
},
{
"epoch": 8.316151202749142,
"grad_norm": 0.3291252553462982,
"learning_rate": 6.765460246331573e-05,
"loss": 0.0197,
"step": 4840
},
{
"epoch": 8.333333333333334,
"grad_norm": 0.33732980489730835,
"learning_rate": 6.752160633954515e-05,
"loss": 0.0138,
"step": 4850
},
{
"epoch": 8.350515463917526,
"grad_norm": 0.2825522720813751,
"learning_rate": 6.73884687415534e-05,
"loss": 0.0156,
"step": 4860
},
{
"epoch": 8.367697594501719,
"grad_norm": 0.28338858485221863,
"learning_rate": 6.725519074432965e-05,
"loss": 0.0215,
"step": 4870
},
{
"epoch": 8.384879725085911,
"grad_norm": 0.258777916431427,
"learning_rate": 6.712177342399679e-05,
"loss": 0.0197,
"step": 4880
},
{
"epoch": 8.402061855670103,
"grad_norm": 0.3022059202194214,
"learning_rate": 6.698821785780257e-05,
"loss": 0.0177,
"step": 4890
},
{
"epoch": 8.419243986254296,
"grad_norm": 0.23812155425548553,
"learning_rate": 6.685452512411102e-05,
"loss": 0.0179,
"step": 4900
},
{
"epoch": 8.436426116838488,
"grad_norm": 0.1747688353061676,
"learning_rate": 6.672069630239366e-05,
"loss": 0.0207,
"step": 4910
},
{
"epoch": 8.45360824742268,
"grad_norm": 0.38623926043510437,
"learning_rate": 6.658673247322086e-05,
"loss": 0.02,
"step": 4920
},
{
"epoch": 8.470790378006873,
"grad_norm": 0.2657296657562256,
"learning_rate": 6.645263471825303e-05,
"loss": 0.0139,
"step": 4930
},
{
"epoch": 8.487972508591065,
"grad_norm": 0.3186751902103424,
"learning_rate": 6.631840412023201e-05,
"loss": 0.0163,
"step": 4940
},
{
"epoch": 8.505154639175258,
"grad_norm": 0.22730350494384766,
"learning_rate": 6.618404176297217e-05,
"loss": 0.015,
"step": 4950
},
{
"epoch": 8.52233676975945,
"grad_norm": 0.4089230000972748,
"learning_rate": 6.604954873135178e-05,
"loss": 0.017,
"step": 4960
},
{
"epoch": 8.539518900343642,
"grad_norm": 0.2689635753631592,
"learning_rate": 6.591492611130421e-05,
"loss": 0.0166,
"step": 4970
},
{
"epoch": 8.556701030927835,
"grad_norm": 0.2454978972673416,
"learning_rate": 6.578017498980913e-05,
"loss": 0.0133,
"step": 4980
},
{
"epoch": 8.573883161512027,
"grad_norm": 0.17714907228946686,
"learning_rate": 6.564529645488383e-05,
"loss": 0.018,
"step": 4990
},
{
"epoch": 8.59106529209622,
"grad_norm": 0.26513901352882385,
"learning_rate": 6.551029159557431e-05,
"loss": 0.0194,
"step": 5000
},
{
"epoch": 8.608247422680412,
"grad_norm": 0.20744717121124268,
"learning_rate": 6.537516150194656e-05,
"loss": 0.0187,
"step": 5010
},
{
"epoch": 8.625429553264604,
"grad_norm": 0.21573315560817719,
"learning_rate": 6.523990726507777e-05,
"loss": 0.0178,
"step": 5020
},
{
"epoch": 8.642611683848797,
"grad_norm": 0.1612836867570877,
"learning_rate": 6.510452997704748e-05,
"loss": 0.0195,
"step": 5030
},
{
"epoch": 8.65979381443299,
"grad_norm": 0.23485371470451355,
"learning_rate": 6.496903073092878e-05,
"loss": 0.016,
"step": 5040
},
{
"epoch": 8.676975945017182,
"grad_norm": 0.25484392046928406,
"learning_rate": 6.483341062077948e-05,
"loss": 0.0141,
"step": 5050
},
{
"epoch": 8.694158075601374,
"grad_norm": 0.24695904552936554,
"learning_rate": 6.46976707416333e-05,
"loss": 0.0122,
"step": 5060
},
{
"epoch": 8.711340206185566,
"grad_norm": 0.10241147875785828,
"learning_rate": 6.456181218949096e-05,
"loss": 0.0175,
"step": 5070
},
{
"epoch": 8.728522336769759,
"grad_norm": 0.31217408180236816,
"learning_rate": 6.442583606131143e-05,
"loss": 0.0151,
"step": 5080
},
{
"epoch": 8.745704467353953,
"grad_norm": 0.3693694472312927,
"learning_rate": 6.428974345500299e-05,
"loss": 0.0199,
"step": 5090
},
{
"epoch": 8.762886597938145,
"grad_norm": 0.23745276033878326,
"learning_rate": 6.415353546941441e-05,
"loss": 0.0221,
"step": 5100
},
{
"epoch": 8.780068728522338,
"grad_norm": 0.20179122686386108,
"learning_rate": 6.401721320432604e-05,
"loss": 0.0155,
"step": 5110
},
{
"epoch": 8.79725085910653,
"grad_norm": 0.36349353194236755,
"learning_rate": 6.388077776044102e-05,
"loss": 0.0183,
"step": 5120
},
{
"epoch": 8.814432989690722,
"grad_norm": 0.274783194065094,
"learning_rate": 6.374423023937621e-05,
"loss": 0.0153,
"step": 5130
},
{
"epoch": 8.831615120274915,
"grad_norm": 0.29849973320961,
"learning_rate": 6.360757174365355e-05,
"loss": 0.0174,
"step": 5140
},
{
"epoch": 8.848797250859107,
"grad_norm": 0.21367676556110382,
"learning_rate": 6.34708033766909e-05,
"loss": 0.0181,
"step": 5150
},
{
"epoch": 8.8659793814433,
"grad_norm": 0.23595260083675385,
"learning_rate": 6.333392624279333e-05,
"loss": 0.0174,
"step": 5160
},
{
"epoch": 8.883161512027492,
"grad_norm": 0.16049842536449432,
"learning_rate": 6.319694144714407e-05,
"loss": 0.0151,
"step": 5170
},
{
"epoch": 8.900343642611684,
"grad_norm": 0.2062782198190689,
"learning_rate": 6.30598500957957e-05,
"loss": 0.0211,
"step": 5180
},
{
"epoch": 8.917525773195877,
"grad_norm": 0.3576521873474121,
"learning_rate": 6.292265329566108e-05,
"loss": 0.0149,
"step": 5190
},
{
"epoch": 8.934707903780069,
"grad_norm": 0.28101012110710144,
"learning_rate": 6.278535215450458e-05,
"loss": 0.0162,
"step": 5200
},
{
"epoch": 8.951890034364261,
"grad_norm": 0.2609540522098541,
"learning_rate": 6.264794778093297e-05,
"loss": 0.0171,
"step": 5210
},
{
"epoch": 8.969072164948454,
"grad_norm": 0.27727997303009033,
"learning_rate": 6.25104412843866e-05,
"loss": 0.0132,
"step": 5220
},
{
"epoch": 8.986254295532646,
"grad_norm": 0.21067747473716736,
"learning_rate": 6.237283377513036e-05,
"loss": 0.0168,
"step": 5230
},
{
"epoch": 9.003436426116838,
"grad_norm": 0.41480588912963867,
"learning_rate": 6.223512636424478e-05,
"loss": 0.0197,
"step": 5240
},
{
"epoch": 9.02061855670103,
"grad_norm": 0.2617255449295044,
"learning_rate": 6.209732016361696e-05,
"loss": 0.0106,
"step": 5250
},
{
"epoch": 9.037800687285223,
"grad_norm": 0.1343929022550583,
"learning_rate": 6.19594162859317e-05,
"loss": 0.0186,
"step": 5260
},
{
"epoch": 9.054982817869416,
"grad_norm": 0.22022658586502075,
"learning_rate": 6.182141584466247e-05,
"loss": 0.0152,
"step": 5270
},
{
"epoch": 9.072164948453608,
"grad_norm": 0.19647003710269928,
"learning_rate": 6.168331995406244e-05,
"loss": 0.0124,
"step": 5280
},
{
"epoch": 9.0893470790378,
"grad_norm": 0.225993350148201,
"learning_rate": 6.154512972915542e-05,
"loss": 0.0182,
"step": 5290
},
{
"epoch": 9.106529209621993,
"grad_norm": 0.2652854919433594,
"learning_rate": 6.140684628572688e-05,
"loss": 0.0203,
"step": 5300
},
{
"epoch": 9.123711340206185,
"grad_norm": 0.18200494349002838,
"learning_rate": 6.126847074031507e-05,
"loss": 0.0241,
"step": 5310
},
{
"epoch": 9.140893470790378,
"grad_norm": 0.24488599598407745,
"learning_rate": 6.113000421020176e-05,
"loss": 0.0178,
"step": 5320
},
{
"epoch": 9.15807560137457,
"grad_norm": 0.28431186079978943,
"learning_rate": 6.099144781340347e-05,
"loss": 0.0231,
"step": 5330
},
{
"epoch": 9.175257731958762,
"grad_norm": 0.2814132869243622,
"learning_rate": 6.0852802668662256e-05,
"loss": 0.0191,
"step": 5340
},
{
"epoch": 9.192439862542955,
"grad_norm": 0.33205386996269226,
"learning_rate": 6.071406989543678e-05,
"loss": 0.0177,
"step": 5350
},
{
"epoch": 9.209621993127147,
"grad_norm": 0.24390940368175507,
"learning_rate": 6.057525061389324e-05,
"loss": 0.0217,
"step": 5360
},
{
"epoch": 9.22680412371134,
"grad_norm": 0.18197228014469147,
"learning_rate": 6.04363459448963e-05,
"loss": 0.0126,
"step": 5370
},
{
"epoch": 9.243986254295532,
"grad_norm": 0.2006153017282486,
"learning_rate": 6.0297357010000124e-05,
"loss": 0.0171,
"step": 5380
},
{
"epoch": 9.261168384879726,
"grad_norm": 0.199944868683815,
"learning_rate": 6.0158284931439177e-05,
"loss": 0.0165,
"step": 5390
},
{
"epoch": 9.278350515463918,
"grad_norm": 0.1962256133556366,
"learning_rate": 6.001913083211932e-05,
"loss": 0.0198,
"step": 5400
},
{
"epoch": 9.29553264604811,
"grad_norm": 0.2808385491371155,
"learning_rate": 5.987989583560864e-05,
"loss": 0.0164,
"step": 5410
},
{
"epoch": 9.312714776632303,
"grad_norm": 0.24396586418151855,
"learning_rate": 5.9740581066128435e-05,
"loss": 0.0202,
"step": 5420
},
{
"epoch": 9.329896907216495,
"grad_norm": 0.28668099641799927,
"learning_rate": 5.9601187648544056e-05,
"loss": 0.0156,
"step": 5430
},
{
"epoch": 9.347079037800688,
"grad_norm": 0.25964459776878357,
"learning_rate": 5.946171670835594e-05,
"loss": 0.0197,
"step": 5440
},
{
"epoch": 9.36426116838488,
"grad_norm": 0.3509371876716614,
"learning_rate": 5.932216937169044e-05,
"loss": 0.0229,
"step": 5450
},
{
"epoch": 9.381443298969073,
"grad_norm": 0.29809918999671936,
"learning_rate": 5.918254676529076e-05,
"loss": 0.0134,
"step": 5460
},
{
"epoch": 9.398625429553265,
"grad_norm": 0.20090153813362122,
"learning_rate": 5.904285001650783e-05,
"loss": 0.0184,
"step": 5470
},
{
"epoch": 9.415807560137457,
"grad_norm": 0.3226790726184845,
"learning_rate": 5.890308025329125e-05,
"loss": 0.017,
"step": 5480
},
{
"epoch": 9.43298969072165,
"grad_norm": 0.2159719467163086,
"learning_rate": 5.876323860418016e-05,
"loss": 0.0133,
"step": 5490
},
{
"epoch": 9.450171821305842,
"grad_norm": 0.2575574219226837,
"learning_rate": 5.8623326198294116e-05,
"loss": 0.0156,
"step": 5500
},
{
"epoch": 9.467353951890034,
"grad_norm": 0.2184896171092987,
"learning_rate": 5.8483344165323975e-05,
"loss": 0.0156,
"step": 5510
},
{
"epoch": 9.484536082474227,
"grad_norm": 0.2843054533004761,
"learning_rate": 5.834329363552279e-05,
"loss": 0.0163,
"step": 5520
},
{
"epoch": 9.50171821305842,
"grad_norm": 0.3006589710712433,
"learning_rate": 5.820317573969669e-05,
"loss": 0.0155,
"step": 5530
},
{
"epoch": 9.518900343642612,
"grad_norm": 0.23060756921768188,
"learning_rate": 5.806299160919573e-05,
"loss": 0.0127,
"step": 5540
},
{
"epoch": 9.536082474226804,
"grad_norm": 0.23474593460559845,
"learning_rate": 5.792274237590471e-05,
"loss": 0.0151,
"step": 5550
},
{
"epoch": 9.553264604810996,
"grad_norm": 0.2775484621524811,
"learning_rate": 5.7782429172234206e-05,
"loss": 0.0194,
"step": 5560
},
{
"epoch": 9.570446735395189,
"grad_norm": 0.20381006598472595,
"learning_rate": 5.7642053131111186e-05,
"loss": 0.0205,
"step": 5570
},
{
"epoch": 9.587628865979381,
"grad_norm": 0.2642858028411865,
"learning_rate": 5.7501615385970044e-05,
"loss": 0.012,
"step": 5580
},
{
"epoch": 9.604810996563574,
"grad_norm": 0.14698222279548645,
"learning_rate": 5.7361117070743374e-05,
"loss": 0.0151,
"step": 5590
},
{
"epoch": 9.621993127147766,
"grad_norm": 0.2586089074611664,
"learning_rate": 5.722055931985285e-05,
"loss": 0.0173,
"step": 5600
},
{
"epoch": 9.639175257731958,
"grad_norm": 0.2857683002948761,
"learning_rate": 5.707994326820002e-05,
"loss": 0.0173,
"step": 5610
},
{
"epoch": 9.65635738831615,
"grad_norm": 0.25553369522094727,
"learning_rate": 5.693927005115719e-05,
"loss": 0.0193,
"step": 5620
},
{
"epoch": 9.673539518900343,
"grad_norm": 0.2712913453578949,
"learning_rate": 5.679854080455821e-05,
"loss": 0.0104,
"step": 5630
},
{
"epoch": 9.690721649484535,
"grad_norm": 0.2559773325920105,
"learning_rate": 5.665775666468933e-05,
"loss": 0.0144,
"step": 5640
},
{
"epoch": 9.707903780068728,
"grad_norm": 0.3383992910385132,
"learning_rate": 5.651691876828007e-05,
"loss": 0.0144,
"step": 5650
},
{
"epoch": 9.72508591065292,
"grad_norm": 0.3301098048686981,
"learning_rate": 5.637602825249394e-05,
"loss": 0.0157,
"step": 5660
},
{
"epoch": 9.742268041237114,
"grad_norm": 0.22163395583629608,
"learning_rate": 5.6235086254919324e-05,
"loss": 0.0112,
"step": 5670
},
{
"epoch": 9.759450171821307,
"grad_norm": 0.10947784781455994,
"learning_rate": 5.609409391356031e-05,
"loss": 0.0212,
"step": 5680
},
{
"epoch": 9.776632302405499,
"grad_norm": 0.17621196806430817,
"learning_rate": 5.595305236682743e-05,
"loss": 0.0099,
"step": 5690
},
{
"epoch": 9.793814432989691,
"grad_norm": 0.1827089488506317,
"learning_rate": 5.581196275352858e-05,
"loss": 0.018,
"step": 5700
},
{
"epoch": 9.810996563573884,
"grad_norm": 0.20134034752845764,
"learning_rate": 5.567082621285969e-05,
"loss": 0.02,
"step": 5710
},
{
"epoch": 9.828178694158076,
"grad_norm": 0.2766471803188324,
"learning_rate": 5.5529643884395654e-05,
"loss": 0.0125,
"step": 5720
},
{
"epoch": 9.845360824742269,
"grad_norm": 0.1543634682893753,
"learning_rate": 5.538841690808101e-05,
"loss": 0.0166,
"step": 5730
},
{
"epoch": 9.862542955326461,
"grad_norm": 0.2152809351682663,
"learning_rate": 5.524714642422084e-05,
"loss": 0.01,
"step": 5740
},
{
"epoch": 9.879725085910653,
"grad_norm": 0.32943928241729736,
"learning_rate": 5.510583357347149e-05,
"loss": 0.0166,
"step": 5750
},
{
"epoch": 9.896907216494846,
"grad_norm": 0.24659444391727448,
"learning_rate": 5.4964479496831425e-05,
"loss": 0.0173,
"step": 5760
},
{
"epoch": 9.914089347079038,
"grad_norm": 0.173888698220253,
"learning_rate": 5.482308533563193e-05,
"loss": 0.0094,
"step": 5770
},
{
"epoch": 9.93127147766323,
"grad_norm": 0.19505925476551056,
"learning_rate": 5.468165223152798e-05,
"loss": 0.0142,
"step": 5780
},
{
"epoch": 9.948453608247423,
"grad_norm": 0.25433164834976196,
"learning_rate": 5.454018132648897e-05,
"loss": 0.015,
"step": 5790
},
{
"epoch": 9.965635738831615,
"grad_norm": 0.26114964485168457,
"learning_rate": 5.439867376278952e-05,
"loss": 0.0136,
"step": 5800
},
{
"epoch": 9.982817869415808,
"grad_norm": 0.36945995688438416,
"learning_rate": 5.425713068300022e-05,
"loss": 0.0213,
"step": 5810
},
{
"epoch": 10.0,
"grad_norm": 0.4136284291744232,
"learning_rate": 5.411555322997846e-05,
"loss": 0.0235,
"step": 5820
},
{
"epoch": 10.017182130584192,
"grad_norm": 0.2530066967010498,
"learning_rate": 5.3973942546859145e-05,
"loss": 0.0159,
"step": 5830
},
{
"epoch": 10.034364261168385,
"grad_norm": 0.3279346227645874,
"learning_rate": 5.3832299777045495e-05,
"loss": 0.0123,
"step": 5840
},
{
"epoch": 10.051546391752577,
"grad_norm": 0.2813730239868164,
"learning_rate": 5.36906260641998e-05,
"loss": 0.0152,
"step": 5850
},
{
"epoch": 10.06872852233677,
"grad_norm": 0.2074098438024521,
"learning_rate": 5.354892255223421e-05,
"loss": 0.0134,
"step": 5860
},
{
"epoch": 10.085910652920962,
"grad_norm": 0.2736356854438782,
"learning_rate": 5.3407190385301456e-05,
"loss": 0.0104,
"step": 5870
},
{
"epoch": 10.103092783505154,
"grad_norm": 0.25040575861930847,
"learning_rate": 5.3265430707785666e-05,
"loss": 0.0172,
"step": 5880
},
{
"epoch": 10.120274914089347,
"grad_norm": 0.3141660988330841,
"learning_rate": 5.312364466429307e-05,
"loss": 0.0125,
"step": 5890
},
{
"epoch": 10.137457044673539,
"grad_norm": 0.16908888518810272,
"learning_rate": 5.298183339964281e-05,
"loss": 0.0117,
"step": 5900
},
{
"epoch": 10.154639175257731,
"grad_norm": 0.2192607969045639,
"learning_rate": 5.283999805885764e-05,
"loss": 0.0212,
"step": 5910
},
{
"epoch": 10.171821305841924,
"grad_norm": 0.14075499773025513,
"learning_rate": 5.269813978715474e-05,
"loss": 0.0171,
"step": 5920
},
{
"epoch": 10.189003436426116,
"grad_norm": 0.15797455608844757,
"learning_rate": 5.255625972993642e-05,
"loss": 0.0143,
"step": 5930
},
{
"epoch": 10.206185567010309,
"grad_norm": 0.22639349102973938,
"learning_rate": 5.24143590327809e-05,
"loss": 0.0174,
"step": 5940
},
{
"epoch": 10.223367697594501,
"grad_norm": 0.22572936117649078,
"learning_rate": 5.227243884143306e-05,
"loss": 0.0123,
"step": 5950
},
{
"epoch": 10.240549828178693,
"grad_norm": 0.24433186650276184,
"learning_rate": 5.213050030179515e-05,
"loss": 0.0152,
"step": 5960
},
{
"epoch": 10.257731958762886,
"grad_norm": 0.2180275022983551,
"learning_rate": 5.198854455991763e-05,
"loss": 0.0136,
"step": 5970
},
{
"epoch": 10.27491408934708,
"grad_norm": 0.1412176787853241,
"learning_rate": 5.184657276198978e-05,
"loss": 0.0083,
"step": 5980
},
{
"epoch": 10.292096219931272,
"grad_norm": 0.23186911642551422,
"learning_rate": 5.170458605433059e-05,
"loss": 0.0128,
"step": 5990
},
{
"epoch": 10.309278350515465,
"grad_norm": 0.2739560604095459,
"learning_rate": 5.15625855833794e-05,
"loss": 0.0212,
"step": 6000
},
{
"epoch": 10.326460481099657,
"grad_norm": 0.2591661512851715,
"learning_rate": 5.1420572495686646e-05,
"loss": 0.0153,
"step": 6010
},
{
"epoch": 10.34364261168385,
"grad_norm": 0.301039457321167,
"learning_rate": 5.127854793790473e-05,
"loss": 0.0128,
"step": 6020
},
{
"epoch": 10.360824742268042,
"grad_norm": 0.30792465806007385,
"learning_rate": 5.113651305677856e-05,
"loss": 0.0206,
"step": 6030
},
{
"epoch": 10.378006872852234,
"grad_norm": 0.20730407536029816,
"learning_rate": 5.099446899913648e-05,
"loss": 0.0184,
"step": 6040
},
{
"epoch": 10.395189003436426,
"grad_norm": 0.2361646145582199,
"learning_rate": 5.085241691188086e-05,
"loss": 0.0142,
"step": 6050
},
{
"epoch": 10.412371134020619,
"grad_norm": 0.15994442999362946,
"learning_rate": 5.071035794197898e-05,
"loss": 0.0128,
"step": 6060
},
{
"epoch": 10.429553264604811,
"grad_norm": 0.1956380158662796,
"learning_rate": 5.0568293236453614e-05,
"loss": 0.0139,
"step": 6070
},
{
"epoch": 10.446735395189004,
"grad_norm": 0.14793916046619415,
"learning_rate": 5.042622394237391e-05,
"loss": 0.01,
"step": 6080
},
{
"epoch": 10.463917525773196,
"grad_norm": 0.23033088445663452,
"learning_rate": 5.0284151206845996e-05,
"loss": 0.0104,
"step": 6090
},
{
"epoch": 10.481099656357388,
"grad_norm": 0.21595941483974457,
"learning_rate": 5.014207617700388e-05,
"loss": 0.0208,
"step": 6100
},
{
"epoch": 10.49828178694158,
"grad_norm": 0.325511634349823,
"learning_rate": 5e-05,
"loss": 0.0226,
"step": 6110
},
{
"epoch": 10.515463917525773,
"grad_norm": 0.24100159108638763,
"learning_rate": 4.985792382299614e-05,
"loss": 0.013,
"step": 6120
},
{
"epoch": 10.532646048109966,
"grad_norm": 0.2464800477027893,
"learning_rate": 4.9715848793154e-05,
"loss": 0.0171,
"step": 6130
},
{
"epoch": 10.549828178694158,
"grad_norm": 0.24693673849105835,
"learning_rate": 4.957377605762611e-05,
"loss": 0.015,
"step": 6140
},
{
"epoch": 10.56701030927835,
"grad_norm": 0.13398700952529907,
"learning_rate": 4.94317067635464e-05,
"loss": 0.0126,
"step": 6150
},
{
"epoch": 10.584192439862543,
"grad_norm": 0.2720285654067993,
"learning_rate": 4.9289642058021043e-05,
"loss": 0.0161,
"step": 6160
},
{
"epoch": 10.601374570446735,
"grad_norm": 0.2861359119415283,
"learning_rate": 4.914758308811913e-05,
"loss": 0.0137,
"step": 6170
},
{
"epoch": 10.618556701030927,
"grad_norm": 0.15878301858901978,
"learning_rate": 4.900553100086353e-05,
"loss": 0.0173,
"step": 6180
},
{
"epoch": 10.63573883161512,
"grad_norm": 0.33061495423316956,
"learning_rate": 4.886348694322145e-05,
"loss": 0.013,
"step": 6190
},
{
"epoch": 10.652920962199312,
"grad_norm": 0.30866488814353943,
"learning_rate": 4.8721452062095294e-05,
"loss": 0.0168,
"step": 6200
},
{
"epoch": 10.670103092783505,
"grad_norm": 0.24568206071853638,
"learning_rate": 4.8579427504313366e-05,
"loss": 0.0152,
"step": 6210
},
{
"epoch": 10.687285223367697,
"grad_norm": 0.24803771078586578,
"learning_rate": 4.843741441662062e-05,
"loss": 0.0178,
"step": 6220
},
{
"epoch": 10.70446735395189,
"grad_norm": 0.17046746611595154,
"learning_rate": 4.829541394566942e-05,
"loss": 0.0124,
"step": 6230
},
{
"epoch": 10.721649484536082,
"grad_norm": 0.22589251399040222,
"learning_rate": 4.8153427238010227e-05,
"loss": 0.014,
"step": 6240
},
{
"epoch": 10.738831615120276,
"grad_norm": 0.24486307799816132,
"learning_rate": 4.801145544008239e-05,
"loss": 0.014,
"step": 6250
},
{
"epoch": 10.756013745704468,
"grad_norm": 0.13196790218353271,
"learning_rate": 4.7869499698204864e-05,
"loss": 0.0144,
"step": 6260
},
{
"epoch": 10.77319587628866,
"grad_norm": 0.20505741238594055,
"learning_rate": 4.772756115856695e-05,
"loss": 0.0146,
"step": 6270
},
{
"epoch": 10.790378006872853,
"grad_norm": 0.22166849672794342,
"learning_rate": 4.758564096721911e-05,
"loss": 0.0143,
"step": 6280
},
{
"epoch": 10.807560137457045,
"grad_norm": 0.27348771691322327,
"learning_rate": 4.7443740270063584e-05,
"loss": 0.0137,
"step": 6290
},
{
"epoch": 10.824742268041238,
"grad_norm": 0.2516573667526245,
"learning_rate": 4.7301860212845264e-05,
"loss": 0.0142,
"step": 6300
},
{
"epoch": 10.84192439862543,
"grad_norm": 0.28733956813812256,
"learning_rate": 4.7160001941142365e-05,
"loss": 0.0123,
"step": 6310
},
{
"epoch": 10.859106529209622,
"grad_norm": 0.3413456678390503,
"learning_rate": 4.7018166600357204e-05,
"loss": 0.0142,
"step": 6320
},
{
"epoch": 10.876288659793815,
"grad_norm": 0.3347049653530121,
"learning_rate": 4.687635533570693e-05,
"loss": 0.0137,
"step": 6330
},
{
"epoch": 10.893470790378007,
"grad_norm": 0.3175305426120758,
"learning_rate": 4.673456929221434e-05,
"loss": 0.0205,
"step": 6340
},
{
"epoch": 10.9106529209622,
"grad_norm": 0.1658443957567215,
"learning_rate": 4.6592809614698556e-05,
"loss": 0.013,
"step": 6350
},
{
"epoch": 10.927835051546392,
"grad_norm": 0.12746182084083557,
"learning_rate": 4.645107744776581e-05,
"loss": 0.0126,
"step": 6360
},
{
"epoch": 10.945017182130584,
"grad_norm": 0.20812661945819855,
"learning_rate": 4.6309373935800205e-05,
"loss": 0.0149,
"step": 6370
},
{
"epoch": 10.962199312714777,
"grad_norm": 0.18740630149841309,
"learning_rate": 4.616770022295451e-05,
"loss": 0.0115,
"step": 6380
},
{
"epoch": 10.97938144329897,
"grad_norm": 0.18948382139205933,
"learning_rate": 4.602605745314087e-05,
"loss": 0.0197,
"step": 6390
},
{
"epoch": 10.996563573883162,
"grad_norm": 0.4297175109386444,
"learning_rate": 4.5884446770021555e-05,
"loss": 0.016,
"step": 6400
},
{
"epoch": 11.013745704467354,
"grad_norm": 0.2623024880886078,
"learning_rate": 4.574286931699978e-05,
"loss": 0.0142,
"step": 6410
},
{
"epoch": 11.030927835051546,
"grad_norm": 0.2243795096874237,
"learning_rate": 4.560132623721049e-05,
"loss": 0.0156,
"step": 6420
},
{
"epoch": 11.048109965635739,
"grad_norm": 0.20103001594543457,
"learning_rate": 4.545981867351104e-05,
"loss": 0.0116,
"step": 6430
},
{
"epoch": 11.065292096219931,
"grad_norm": 0.11890780925750732,
"learning_rate": 4.5318347768472035e-05,
"loss": 0.0081,
"step": 6440
},
{
"epoch": 11.082474226804123,
"grad_norm": 0.26694929599761963,
"learning_rate": 4.517691466436807e-05,
"loss": 0.0155,
"step": 6450
},
{
"epoch": 11.099656357388316,
"grad_norm": 0.18821591138839722,
"learning_rate": 4.5035520503168586e-05,
"loss": 0.0104,
"step": 6460
},
{
"epoch": 11.116838487972508,
"grad_norm": 0.27548283338546753,
"learning_rate": 4.4894166426528524e-05,
"loss": 0.0114,
"step": 6470
},
{
"epoch": 11.1340206185567,
"grad_norm": 0.1965043544769287,
"learning_rate": 4.4752853575779185e-05,
"loss": 0.0104,
"step": 6480
},
{
"epoch": 11.151202749140893,
"grad_norm": 0.21741580963134766,
"learning_rate": 4.4611583091919e-05,
"loss": 0.0117,
"step": 6490
},
{
"epoch": 11.168384879725085,
"grad_norm": 0.1215846836566925,
"learning_rate": 4.4470356115604364e-05,
"loss": 0.0093,
"step": 6500
},
{
"epoch": 11.185567010309278,
"grad_norm": 0.1946978121995926,
"learning_rate": 4.432917378714032e-05,
"loss": 0.0194,
"step": 6510
},
{
"epoch": 11.20274914089347,
"grad_norm": 0.22516775131225586,
"learning_rate": 4.418803724647144e-05,
"loss": 0.0149,
"step": 6520
},
{
"epoch": 11.219931271477662,
"grad_norm": 0.22346094250679016,
"learning_rate": 4.4046947633172566e-05,
"loss": 0.0091,
"step": 6530
},
{
"epoch": 11.237113402061855,
"grad_norm": 0.23928742110729218,
"learning_rate": 4.3905906086439704e-05,
"loss": 0.0164,
"step": 6540
},
{
"epoch": 11.254295532646047,
"grad_norm": 0.34528031945228577,
"learning_rate": 4.3764913745080695e-05,
"loss": 0.0145,
"step": 6550
},
{
"epoch": 11.271477663230241,
"grad_norm": 0.134693905711174,
"learning_rate": 4.362397174750608e-05,
"loss": 0.0076,
"step": 6560
},
{
"epoch": 11.288659793814434,
"grad_norm": 0.35505372285842896,
"learning_rate": 4.348308123171994e-05,
"loss": 0.0138,
"step": 6570
},
{
"epoch": 11.305841924398626,
"grad_norm": 0.17052118480205536,
"learning_rate": 4.334224333531068e-05,
"loss": 0.012,
"step": 6580
},
{
"epoch": 11.323024054982818,
"grad_norm": 0.19103099405765533,
"learning_rate": 4.32014591954418e-05,
"loss": 0.013,
"step": 6590
},
{
"epoch": 11.34020618556701,
"grad_norm": 0.20789751410484314,
"learning_rate": 4.306072994884282e-05,
"loss": 0.0091,
"step": 6600
},
{
"epoch": 11.357388316151203,
"grad_norm": 0.2590029537677765,
"learning_rate": 4.292005673179998e-05,
"loss": 0.008,
"step": 6610
},
{
"epoch": 11.374570446735396,
"grad_norm": 0.16030985116958618,
"learning_rate": 4.277944068014716e-05,
"loss": 0.0142,
"step": 6620
},
{
"epoch": 11.391752577319588,
"grad_norm": 0.34259387850761414,
"learning_rate": 4.263888292925664e-05,
"loss": 0.0115,
"step": 6630
},
{
"epoch": 11.40893470790378,
"grad_norm": 0.24973253905773163,
"learning_rate": 4.249838461402997e-05,
"loss": 0.0112,
"step": 6640
},
{
"epoch": 11.426116838487973,
"grad_norm": 0.40062564611434937,
"learning_rate": 4.235794686888882e-05,
"loss": 0.0111,
"step": 6650
},
{
"epoch": 11.443298969072165,
"grad_norm": 0.23818433284759521,
"learning_rate": 4.22175708277658e-05,
"loss": 0.0124,
"step": 6660
},
{
"epoch": 11.460481099656358,
"grad_norm": 0.17521892488002777,
"learning_rate": 4.207725762409529e-05,
"loss": 0.0186,
"step": 6670
},
{
"epoch": 11.47766323024055,
"grad_norm": 0.2232678085565567,
"learning_rate": 4.19370083908043e-05,
"loss": 0.012,
"step": 6680
},
{
"epoch": 11.494845360824742,
"grad_norm": 0.1600189507007599,
"learning_rate": 4.179682426030331e-05,
"loss": 0.0107,
"step": 6690
},
{
"epoch": 11.512027491408935,
"grad_norm": 0.3540445566177368,
"learning_rate": 4.1656706364477214e-05,
"loss": 0.0182,
"step": 6700
},
{
"epoch": 11.529209621993127,
"grad_norm": 0.39657342433929443,
"learning_rate": 4.151665583467604e-05,
"loss": 0.0157,
"step": 6710
},
{
"epoch": 11.54639175257732,
"grad_norm": 0.35762307047843933,
"learning_rate": 4.137667380170591e-05,
"loss": 0.0115,
"step": 6720
},
{
"epoch": 11.563573883161512,
"grad_norm": 0.28293389081954956,
"learning_rate": 4.123676139581984e-05,
"loss": 0.0194,
"step": 6730
},
{
"epoch": 11.580756013745704,
"grad_norm": 0.1835634410381317,
"learning_rate": 4.1096919746708754e-05,
"loss": 0.0143,
"step": 6740
},
{
"epoch": 11.597938144329897,
"grad_norm": 0.1975705772638321,
"learning_rate": 4.095714998349218e-05,
"loss": 0.016,
"step": 6750
},
{
"epoch": 11.615120274914089,
"grad_norm": 0.17618152499198914,
"learning_rate": 4.081745323470926e-05,
"loss": 0.0198,
"step": 6760
},
{
"epoch": 11.632302405498281,
"grad_norm": 0.1503658890724182,
"learning_rate": 4.067783062830955e-05,
"loss": 0.0156,
"step": 6770
},
{
"epoch": 11.649484536082474,
"grad_norm": 0.28605377674102783,
"learning_rate": 4.053828329164407e-05,
"loss": 0.0146,
"step": 6780
},
{
"epoch": 11.666666666666666,
"grad_norm": 0.3132267892360687,
"learning_rate": 4.0398812351455955e-05,
"loss": 0.0102,
"step": 6790
},
{
"epoch": 11.683848797250858,
"grad_norm": 0.2057536542415619,
"learning_rate": 4.025941893387159e-05,
"loss": 0.0176,
"step": 6800
},
{
"epoch": 11.70103092783505,
"grad_norm": 0.2427815943956375,
"learning_rate": 4.012010416439136e-05,
"loss": 0.0132,
"step": 6810
},
{
"epoch": 11.718213058419243,
"grad_norm": 0.2931414246559143,
"learning_rate": 3.998086916788069e-05,
"loss": 0.0108,
"step": 6820
},
{
"epoch": 11.735395189003437,
"grad_norm": 0.2122270166873932,
"learning_rate": 3.9841715068560835e-05,
"loss": 0.0146,
"step": 6830
},
{
"epoch": 11.75257731958763,
"grad_norm": 0.3742753565311432,
"learning_rate": 3.970264298999991e-05,
"loss": 0.0128,
"step": 6840
},
{
"epoch": 11.769759450171822,
"grad_norm": 0.13350647687911987,
"learning_rate": 3.956365405510369e-05,
"loss": 0.0105,
"step": 6850
},
{
"epoch": 11.786941580756015,
"grad_norm": 0.2694711685180664,
"learning_rate": 3.942474938610677e-05,
"loss": 0.0117,
"step": 6860
},
{
"epoch": 11.804123711340207,
"grad_norm": 0.2818795144557953,
"learning_rate": 3.9285930104563234e-05,
"loss": 0.0086,
"step": 6870
},
{
"epoch": 11.8213058419244,
"grad_norm": 0.2870750427246094,
"learning_rate": 3.914719733133776e-05,
"loss": 0.012,
"step": 6880
},
{
"epoch": 11.838487972508592,
"grad_norm": 0.14688880741596222,
"learning_rate": 3.900855218659655e-05,
"loss": 0.0169,
"step": 6890
},
{
"epoch": 11.855670103092784,
"grad_norm": 0.1673170030117035,
"learning_rate": 3.886999578979824e-05,
"loss": 0.011,
"step": 6900
},
{
"epoch": 11.872852233676976,
"grad_norm": 0.3427187502384186,
"learning_rate": 3.873152925968495e-05,
"loss": 0.0172,
"step": 6910
},
{
"epoch": 11.890034364261169,
"grad_norm": 0.32928958535194397,
"learning_rate": 3.859315371427312e-05,
"loss": 0.0157,
"step": 6920
},
{
"epoch": 11.907216494845361,
"grad_norm": 0.2496093362569809,
"learning_rate": 3.8454870270844593e-05,
"loss": 0.0119,
"step": 6930
},
{
"epoch": 11.924398625429554,
"grad_norm": 0.15401820838451385,
"learning_rate": 3.831668004593756e-05,
"loss": 0.0115,
"step": 6940
},
{
"epoch": 11.941580756013746,
"grad_norm": 0.14115320146083832,
"learning_rate": 3.8178584155337525e-05,
"loss": 0.0106,
"step": 6950
},
{
"epoch": 11.958762886597938,
"grad_norm": 0.20622394979000092,
"learning_rate": 3.804058371406831e-05,
"loss": 0.0138,
"step": 6960
},
{
"epoch": 11.97594501718213,
"grad_norm": 0.11186587810516357,
"learning_rate": 3.790267983638305e-05,
"loss": 0.0152,
"step": 6970
},
{
"epoch": 11.993127147766323,
"grad_norm": 0.18001288175582886,
"learning_rate": 3.776487363575524e-05,
"loss": 0.0098,
"step": 6980
},
{
"epoch": 12.010309278350515,
"grad_norm": 0.3391369879245758,
"learning_rate": 3.762716622486965e-05,
"loss": 0.0234,
"step": 6990
},
{
"epoch": 12.027491408934708,
"grad_norm": 0.19333554804325104,
"learning_rate": 3.748955871561341e-05,
"loss": 0.0127,
"step": 7000
},
{
"epoch": 12.0446735395189,
"grad_norm": 0.2803151607513428,
"learning_rate": 3.735205221906703e-05,
"loss": 0.0141,
"step": 7010
},
{
"epoch": 12.061855670103093,
"grad_norm": 0.28076592087745667,
"learning_rate": 3.721464784549543e-05,
"loss": 0.0116,
"step": 7020
},
{
"epoch": 12.079037800687285,
"grad_norm": 0.3014523386955261,
"learning_rate": 3.7077346704338935e-05,
"loss": 0.014,
"step": 7030
},
{
"epoch": 12.096219931271477,
"grad_norm": 0.15294674038887024,
"learning_rate": 3.694014990420433e-05,
"loss": 0.0133,
"step": 7040
},
{
"epoch": 12.11340206185567,
"grad_norm": 0.21652719378471375,
"learning_rate": 3.680305855285593e-05,
"loss": 0.0106,
"step": 7050
},
{
"epoch": 12.130584192439862,
"grad_norm": 0.24568617343902588,
"learning_rate": 3.6666073757206686e-05,
"loss": 0.0114,
"step": 7060
},
{
"epoch": 12.147766323024054,
"grad_norm": 0.2690240144729614,
"learning_rate": 3.6529196623309115e-05,
"loss": 0.016,
"step": 7070
},
{
"epoch": 12.164948453608247,
"grad_norm": 0.21522220969200134,
"learning_rate": 3.6392428256346475e-05,
"loss": 0.0136,
"step": 7080
},
{
"epoch": 12.18213058419244,
"grad_norm": 0.25682464241981506,
"learning_rate": 3.625576976062379e-05,
"loss": 0.0119,
"step": 7090
},
{
"epoch": 12.199312714776632,
"grad_norm": 0.15618295967578888,
"learning_rate": 3.6119222239559e-05,
"loss": 0.0131,
"step": 7100
},
{
"epoch": 12.216494845360824,
"grad_norm": 0.21718665957450867,
"learning_rate": 3.598278679567397e-05,
"loss": 0.0153,
"step": 7110
},
{
"epoch": 12.233676975945016,
"grad_norm": 0.17358386516571045,
"learning_rate": 3.5846464530585624e-05,
"loss": 0.0105,
"step": 7120
},
{
"epoch": 12.250859106529209,
"grad_norm": 0.2519778907299042,
"learning_rate": 3.571025654499702e-05,
"loss": 0.0157,
"step": 7130
},
{
"epoch": 12.268041237113403,
"grad_norm": 0.26433685421943665,
"learning_rate": 3.557416393868859e-05,
"loss": 0.0131,
"step": 7140
},
{
"epoch": 12.285223367697595,
"grad_norm": 0.2645297646522522,
"learning_rate": 3.543818781050906e-05,
"loss": 0.0098,
"step": 7150
},
{
"epoch": 12.302405498281788,
"grad_norm": 0.23010118305683136,
"learning_rate": 3.530232925836673e-05,
"loss": 0.018,
"step": 7160
},
{
"epoch": 12.31958762886598,
"grad_norm": 0.08610416948795319,
"learning_rate": 3.516658937922051e-05,
"loss": 0.0095,
"step": 7170
},
{
"epoch": 12.336769759450172,
"grad_norm": 0.14161959290504456,
"learning_rate": 3.503096926907123e-05,
"loss": 0.0153,
"step": 7180
},
{
"epoch": 12.353951890034365,
"grad_norm": 0.3274645209312439,
"learning_rate": 3.4895470022952536e-05,
"loss": 0.0118,
"step": 7190
},
{
"epoch": 12.371134020618557,
"grad_norm": 0.16021353006362915,
"learning_rate": 3.476009273492225e-05,
"loss": 0.0138,
"step": 7200
},
{
"epoch": 12.38831615120275,
"grad_norm": 0.2030124068260193,
"learning_rate": 3.462483849805346e-05,
"loss": 0.0106,
"step": 7210
},
{
"epoch": 12.405498281786942,
"grad_norm": 0.15385638177394867,
"learning_rate": 3.4489708404425704e-05,
"loss": 0.0102,
"step": 7220
},
{
"epoch": 12.422680412371134,
"grad_norm": 0.10668976604938507,
"learning_rate": 3.4354703545116185e-05,
"loss": 0.0109,
"step": 7230
},
{
"epoch": 12.439862542955327,
"grad_norm": 0.16402071714401245,
"learning_rate": 3.421982501019087e-05,
"loss": 0.0108,
"step": 7240
},
{
"epoch": 12.457044673539519,
"grad_norm": 0.10426975041627884,
"learning_rate": 3.4085073888695804e-05,
"loss": 0.0103,
"step": 7250
},
{
"epoch": 12.474226804123711,
"grad_norm": 0.23913106322288513,
"learning_rate": 3.3950451268648235e-05,
"loss": 0.0103,
"step": 7260
},
{
"epoch": 12.491408934707904,
"grad_norm": 0.1630750596523285,
"learning_rate": 3.381595823702784e-05,
"loss": 0.018,
"step": 7270
},
{
"epoch": 12.508591065292096,
"grad_norm": 0.3311632573604584,
"learning_rate": 3.368159587976799e-05,
"loss": 0.0089,
"step": 7280
},
{
"epoch": 12.525773195876289,
"grad_norm": 0.45006489753723145,
"learning_rate": 3.354736528174696e-05,
"loss": 0.0124,
"step": 7290
},
{
"epoch": 12.542955326460481,
"grad_norm": 0.23996764421463013,
"learning_rate": 3.341326752677916e-05,
"loss": 0.0179,
"step": 7300
},
{
"epoch": 12.560137457044673,
"grad_norm": 0.14841718971729279,
"learning_rate": 3.3279303697606354e-05,
"loss": 0.0063,
"step": 7310
},
{
"epoch": 12.577319587628866,
"grad_norm": 0.09983796626329422,
"learning_rate": 3.314547487588901e-05,
"loss": 0.0096,
"step": 7320
},
{
"epoch": 12.594501718213058,
"grad_norm": 0.17602872848510742,
"learning_rate": 3.301178214219744e-05,
"loss": 0.009,
"step": 7330
},
{
"epoch": 12.61168384879725,
"grad_norm": 0.24939224123954773,
"learning_rate": 3.2878226576003225e-05,
"loss": 0.013,
"step": 7340
},
{
"epoch": 12.628865979381443,
"grad_norm": 0.17927074432373047,
"learning_rate": 3.274480925567036e-05,
"loss": 0.011,
"step": 7350
},
{
"epoch": 12.646048109965635,
"grad_norm": 0.17862512171268463,
"learning_rate": 3.261153125844663e-05,
"loss": 0.0149,
"step": 7360
},
{
"epoch": 12.663230240549828,
"grad_norm": 0.2447875738143921,
"learning_rate": 3.247839366045485e-05,
"loss": 0.0137,
"step": 7370
},
{
"epoch": 12.68041237113402,
"grad_norm": 0.2494247555732727,
"learning_rate": 3.2345397536684286e-05,
"loss": 0.0126,
"step": 7380
},
{
"epoch": 12.697594501718212,
"grad_norm": 0.1975736767053604,
"learning_rate": 3.2212543960981845e-05,
"loss": 0.0104,
"step": 7390
},
{
"epoch": 12.714776632302405,
"grad_norm": 0.18755072355270386,
"learning_rate": 3.207983400604347e-05,
"loss": 0.009,
"step": 7400
},
{
"epoch": 12.731958762886597,
"grad_norm": 0.2701716423034668,
"learning_rate": 3.1947268743405457e-05,
"loss": 0.0136,
"step": 7410
},
{
"epoch": 12.749140893470791,
"grad_norm": 0.2599675953388214,
"learning_rate": 3.1814849243435886e-05,
"loss": 0.0217,
"step": 7420
},
{
"epoch": 12.766323024054984,
"grad_norm": 0.25833481550216675,
"learning_rate": 3.168257657532584e-05,
"loss": 0.0135,
"step": 7430
},
{
"epoch": 12.783505154639176,
"grad_norm": 0.33644336462020874,
"learning_rate": 3.155045180708085e-05,
"loss": 0.0098,
"step": 7440
},
{
"epoch": 12.800687285223368,
"grad_norm": 0.12960873544216156,
"learning_rate": 3.1418476005512265e-05,
"loss": 0.0099,
"step": 7450
},
{
"epoch": 12.81786941580756,
"grad_norm": 0.09624414145946503,
"learning_rate": 3.1286650236228696e-05,
"loss": 0.0084,
"step": 7460
},
{
"epoch": 12.835051546391753,
"grad_norm": 0.175624817609787,
"learning_rate": 3.115497556362727e-05,
"loss": 0.0137,
"step": 7470
},
{
"epoch": 12.852233676975946,
"grad_norm": 0.11060360819101334,
"learning_rate": 3.102345305088516e-05,
"loss": 0.0136,
"step": 7480
},
{
"epoch": 12.869415807560138,
"grad_norm": 0.1332932859659195,
"learning_rate": 3.089208375995092e-05,
"loss": 0.0141,
"step": 7490
},
{
"epoch": 12.88659793814433,
"grad_norm": 0.1730755716562271,
"learning_rate": 3.0760868751536045e-05,
"loss": 0.0111,
"step": 7500
},
{
"epoch": 12.903780068728523,
"grad_norm": 0.16571182012557983,
"learning_rate": 3.06298090851062e-05,
"loss": 0.0078,
"step": 7510
},
{
"epoch": 12.920962199312715,
"grad_norm": 0.2591513395309448,
"learning_rate": 3.0498905818872836e-05,
"loss": 0.0148,
"step": 7520
},
{
"epoch": 12.938144329896907,
"grad_norm": 0.1701243668794632,
"learning_rate": 3.036816000978455e-05,
"loss": 0.0159,
"step": 7530
},
{
"epoch": 12.9553264604811,
"grad_norm": 0.29323557019233704,
"learning_rate": 3.0237572713518647e-05,
"loss": 0.0127,
"step": 7540
},
{
"epoch": 12.972508591065292,
"grad_norm": 0.2534872889518738,
"learning_rate": 3.0107144984472502e-05,
"loss": 0.0163,
"step": 7550
},
{
"epoch": 12.989690721649485,
"grad_norm": 0.1676417738199234,
"learning_rate": 2.9976877875755128e-05,
"loss": 0.0083,
"step": 7560
},
{
"epoch": 13.006872852233677,
"grad_norm": 0.11713390052318573,
"learning_rate": 2.984677243917861e-05,
"loss": 0.0082,
"step": 7570
},
{
"epoch": 13.02405498281787,
"grad_norm": 0.35955625772476196,
"learning_rate": 2.9716829725249707e-05,
"loss": 0.0125,
"step": 7580
},
{
"epoch": 13.041237113402062,
"grad_norm": 0.1874362677335739,
"learning_rate": 2.9587050783161252e-05,
"loss": 0.0112,
"step": 7590
},
{
"epoch": 13.058419243986254,
"grad_norm": 0.06738214194774628,
"learning_rate": 2.9457436660783784e-05,
"loss": 0.0138,
"step": 7600
},
{
"epoch": 13.075601374570446,
"grad_norm": 0.22004689276218414,
"learning_rate": 2.9327988404657002e-05,
"loss": 0.0105,
"step": 7610
},
{
"epoch": 13.092783505154639,
"grad_norm": 0.11634822189807892,
"learning_rate": 2.9198707059981413e-05,
"loss": 0.0073,
"step": 7620
},
{
"epoch": 13.109965635738831,
"grad_norm": 0.08798322826623917,
"learning_rate": 2.9069593670609775e-05,
"loss": 0.018,
"step": 7630
},
{
"epoch": 13.127147766323024,
"grad_norm": 0.11149155348539352,
"learning_rate": 2.8940649279038768e-05,
"loss": 0.0091,
"step": 7640
},
{
"epoch": 13.144329896907216,
"grad_norm": 0.1387196183204651,
"learning_rate": 2.8811874926400483e-05,
"loss": 0.0101,
"step": 7650
},
{
"epoch": 13.161512027491408,
"grad_norm": 0.10784903913736343,
"learning_rate": 2.868327165245419e-05,
"loss": 0.0125,
"step": 7660
},
{
"epoch": 13.1786941580756,
"grad_norm": 0.293300598859787,
"learning_rate": 2.8554840495577682e-05,
"loss": 0.0099,
"step": 7670
},
{
"epoch": 13.195876288659793,
"grad_norm": 0.1339499056339264,
"learning_rate": 2.8426582492759134e-05,
"loss": 0.0089,
"step": 7680
},
{
"epoch": 13.213058419243985,
"grad_norm": 0.1549367606639862,
"learning_rate": 2.8298498679588525e-05,
"loss": 0.0108,
"step": 7690
},
{
"epoch": 13.230240549828178,
"grad_norm": 0.20458447933197021,
"learning_rate": 2.817059009024953e-05,
"loss": 0.0081,
"step": 7700
},
{
"epoch": 13.24742268041237,
"grad_norm": 0.17270691692829132,
"learning_rate": 2.8042857757510877e-05,
"loss": 0.0094,
"step": 7710
},
{
"epoch": 13.264604810996564,
"grad_norm": 0.17686305940151215,
"learning_rate": 2.7915302712718227e-05,
"loss": 0.0143,
"step": 7720
},
{
"epoch": 13.281786941580757,
"grad_norm": 0.2391350120306015,
"learning_rate": 2.7787925985785733e-05,
"loss": 0.0127,
"step": 7730
},
{
"epoch": 13.29896907216495,
"grad_norm": 0.21285896003246307,
"learning_rate": 2.7660728605187776e-05,
"loss": 0.0092,
"step": 7740
},
{
"epoch": 13.316151202749142,
"grad_norm": 0.2621266841888428,
"learning_rate": 2.753371159795065e-05,
"loss": 0.0128,
"step": 7750
},
{
"epoch": 13.333333333333334,
"grad_norm": 0.13718031346797943,
"learning_rate": 2.740687598964429e-05,
"loss": 0.0113,
"step": 7760
},
{
"epoch": 13.350515463917526,
"grad_norm": 0.10009155422449112,
"learning_rate": 2.7280222804373895e-05,
"loss": 0.0088,
"step": 7770
},
{
"epoch": 13.367697594501719,
"grad_norm": 0.18854975700378418,
"learning_rate": 2.7153753064771792e-05,
"loss": 0.0102,
"step": 7780
},
{
"epoch": 13.384879725085911,
"grad_norm": 0.3908763825893402,
"learning_rate": 2.702746779198912e-05,
"loss": 0.0139,
"step": 7790
},
{
"epoch": 13.402061855670103,
"grad_norm": 0.08939257264137268,
"learning_rate": 2.690136800568752e-05,
"loss": 0.0083,
"step": 7800
},
{
"epoch": 13.419243986254296,
"grad_norm": 0.2188216745853424,
"learning_rate": 2.6775454724031036e-05,
"loss": 0.0114,
"step": 7810
},
{
"epoch": 13.436426116838488,
"grad_norm": 0.13271217048168182,
"learning_rate": 2.6649728963677783e-05,
"loss": 0.0088,
"step": 7820
},
{
"epoch": 13.45360824742268,
"grad_norm": 0.2332095205783844,
"learning_rate": 2.6524191739771815e-05,
"loss": 0.0105,
"step": 7830
},
{
"epoch": 13.470790378006873,
"grad_norm": 0.11752445995807648,
"learning_rate": 2.639884406593482e-05,
"loss": 0.0099,
"step": 7840
},
{
"epoch": 13.487972508591065,
"grad_norm": 0.22209575772285461,
"learning_rate": 2.627368695425808e-05,
"loss": 0.0098,
"step": 7850
},
{
"epoch": 13.505154639175258,
"grad_norm": 0.18378068506717682,
"learning_rate": 2.6148721415294186e-05,
"loss": 0.0099,
"step": 7860
},
{
"epoch": 13.52233676975945,
"grad_norm": 0.14152808487415314,
"learning_rate": 2.6023948458048965e-05,
"loss": 0.0102,
"step": 7870
},
{
"epoch": 13.539518900343642,
"grad_norm": 0.17036782205104828,
"learning_rate": 2.589936908997321e-05,
"loss": 0.0096,
"step": 7880
},
{
"epoch": 13.556701030927835,
"grad_norm": 0.16514594852924347,
"learning_rate": 2.5774984316954676e-05,
"loss": 0.0117,
"step": 7890
},
{
"epoch": 13.573883161512027,
"grad_norm": 0.23391450941562653,
"learning_rate": 2.5650795143309902e-05,
"loss": 0.0136,
"step": 7900
},
{
"epoch": 13.59106529209622,
"grad_norm": 0.22688448429107666,
"learning_rate": 2.552680257177611e-05,
"loss": 0.0099,
"step": 7910
},
{
"epoch": 13.608247422680412,
"grad_norm": 0.15916913747787476,
"learning_rate": 2.5403007603503053e-05,
"loss": 0.0089,
"step": 7920
},
{
"epoch": 13.625429553264604,
"grad_norm": 0.31960004568099976,
"learning_rate": 2.527941123804504e-05,
"loss": 0.0105,
"step": 7930
},
{
"epoch": 13.642611683848797,
"grad_norm": 0.17471857368946075,
"learning_rate": 2.5156014473352785e-05,
"loss": 0.0133,
"step": 7940
},
{
"epoch": 13.65979381443299,
"grad_norm": 0.16793015599250793,
"learning_rate": 2.5032818305765383e-05,
"loss": 0.0084,
"step": 7950
},
{
"epoch": 13.676975945017182,
"grad_norm": 0.21041658520698547,
"learning_rate": 2.4909823730002203e-05,
"loss": 0.0088,
"step": 7960
},
{
"epoch": 13.694158075601374,
"grad_norm": 0.2167925089597702,
"learning_rate": 2.478703173915497e-05,
"loss": 0.0097,
"step": 7970
},
{
"epoch": 13.711340206185566,
"grad_norm": 0.16766490042209625,
"learning_rate": 2.4664443324679653e-05,
"loss": 0.0059,
"step": 7980
},
{
"epoch": 13.728522336769759,
"grad_norm": 0.16163405776023865,
"learning_rate": 2.454205947638852e-05,
"loss": 0.0122,
"step": 7990
},
{
"epoch": 13.745704467353953,
"grad_norm": 0.2345849871635437,
"learning_rate": 2.4419881182442038e-05,
"loss": 0.0115,
"step": 8000
},
{
"epoch": 13.762886597938145,
"grad_norm": 0.10330498963594437,
"learning_rate": 2.429790942934106e-05,
"loss": 0.0097,
"step": 8010
},
{
"epoch": 13.780068728522338,
"grad_norm": 0.1268969476222992,
"learning_rate": 2.4176145201918726e-05,
"loss": 0.0094,
"step": 8020
},
{
"epoch": 13.79725085910653,
"grad_norm": 0.160488098859787,
"learning_rate": 2.4054589483332597e-05,
"loss": 0.0067,
"step": 8030
},
{
"epoch": 13.814432989690722,
"grad_norm": 0.26570194959640503,
"learning_rate": 2.3933243255056597e-05,
"loss": 0.0092,
"step": 8040
},
{
"epoch": 13.831615120274915,
"grad_norm": 0.3354252278804779,
"learning_rate": 2.3812107496873248e-05,
"loss": 0.0101,
"step": 8050
},
{
"epoch": 13.848797250859107,
"grad_norm": 0.1483275294303894,
"learning_rate": 2.3691183186865668e-05,
"loss": 0.0101,
"step": 8060
},
{
"epoch": 13.8659793814433,
"grad_norm": 0.26341909170150757,
"learning_rate": 2.3570471301409618e-05,
"loss": 0.0097,
"step": 8070
},
{
"epoch": 13.883161512027492,
"grad_norm": 0.16232207417488098,
"learning_rate": 2.3449972815165773e-05,
"loss": 0.0154,
"step": 8080
},
{
"epoch": 13.900343642611684,
"grad_norm": 0.19188156723976135,
"learning_rate": 2.332968870107171e-05,
"loss": 0.0069,
"step": 8090
},
{
"epoch": 13.917525773195877,
"grad_norm": 0.14537520706653595,
"learning_rate": 2.320961993033415e-05,
"loss": 0.0079,
"step": 8100
},
{
"epoch": 13.934707903780069,
"grad_norm": 0.10598124563694,
"learning_rate": 2.3089767472421e-05,
"loss": 0.0117,
"step": 8110
},
{
"epoch": 13.951890034364261,
"grad_norm": 0.15896451473236084,
"learning_rate": 2.297013229505367e-05,
"loss": 0.0134,
"step": 8120
},
{
"epoch": 13.969072164948454,
"grad_norm": 0.21453918516635895,
"learning_rate": 2.285071536419916e-05,
"loss": 0.0091,
"step": 8130
},
{
"epoch": 13.986254295532646,
"grad_norm": 0.17623427510261536,
"learning_rate": 2.2731517644062312e-05,
"loss": 0.012,
"step": 8140
},
{
"epoch": 14.003436426116838,
"grad_norm": 0.10579323023557663,
"learning_rate": 2.2612540097077935e-05,
"loss": 0.0067,
"step": 8150
},
{
"epoch": 14.02061855670103,
"grad_norm": 0.1269347220659256,
"learning_rate": 2.2493783683903185e-05,
"loss": 0.0092,
"step": 8160
},
{
"epoch": 14.037800687285223,
"grad_norm": 0.11808303743600845,
"learning_rate": 2.237524936340963e-05,
"loss": 0.0105,
"step": 8170
},
{
"epoch": 14.054982817869416,
"grad_norm": 0.10431456565856934,
"learning_rate": 2.2256938092675722e-05,
"loss": 0.0069,
"step": 8180
},
{
"epoch": 14.072164948453608,
"grad_norm": 0.3295063078403473,
"learning_rate": 2.213885082697883e-05,
"loss": 0.0107,
"step": 8190
},
{
"epoch": 14.0893470790378,
"grad_norm": 0.19644266366958618,
"learning_rate": 2.2020988519787733e-05,
"loss": 0.0109,
"step": 8200
},
{
"epoch": 14.106529209621993,
"grad_norm": 0.14295251667499542,
"learning_rate": 2.1903352122754732e-05,
"loss": 0.0095,
"step": 8210
},
{
"epoch": 14.123711340206185,
"grad_norm": 0.1610773205757141,
"learning_rate": 2.178594258570822e-05,
"loss": 0.0092,
"step": 8220
},
{
"epoch": 14.140893470790378,
"grad_norm": 0.18880592286586761,
"learning_rate": 2.1668760856644703e-05,
"loss": 0.0082,
"step": 8230
},
{
"epoch": 14.15807560137457,
"grad_norm": 0.1384887844324112,
"learning_rate": 2.1551807881721425e-05,
"loss": 0.0087,
"step": 8240
},
{
"epoch": 14.175257731958762,
"grad_norm": 0.19572345912456512,
"learning_rate": 2.1435084605248484e-05,
"loss": 0.0122,
"step": 8250
},
{
"epoch": 14.192439862542955,
"grad_norm": 0.11073683947324753,
"learning_rate": 2.131859196968149e-05,
"loss": 0.0079,
"step": 8260
},
{
"epoch": 14.209621993127147,
"grad_norm": 0.1309373676776886,
"learning_rate": 2.1202330915613638e-05,
"loss": 0.0089,
"step": 8270
},
{
"epoch": 14.22680412371134,
"grad_norm": 0.11186233907938004,
"learning_rate": 2.1086302381768385e-05,
"loss": 0.0109,
"step": 8280
},
{
"epoch": 14.243986254295532,
"grad_norm": 0.23258423805236816,
"learning_rate": 2.0970507304991656e-05,
"loss": 0.0145,
"step": 8290
},
{
"epoch": 14.261168384879726,
"grad_norm": 0.195637047290802,
"learning_rate": 2.0854946620244502e-05,
"loss": 0.0054,
"step": 8300
},
{
"epoch": 14.278350515463918,
"grad_norm": 0.17508986592292786,
"learning_rate": 2.0739621260595315e-05,
"loss": 0.0071,
"step": 8310
},
{
"epoch": 14.29553264604811,
"grad_norm": 0.07197950035333633,
"learning_rate": 2.0624532157212483e-05,
"loss": 0.005,
"step": 8320
},
{
"epoch": 14.312714776632303,
"grad_norm": 0.167429119348526,
"learning_rate": 2.0509680239356728e-05,
"loss": 0.0119,
"step": 8330
},
{
"epoch": 14.329896907216495,
"grad_norm": 0.1402851641178131,
"learning_rate": 2.0395066434373767e-05,
"loss": 0.0142,
"step": 8340
},
{
"epoch": 14.347079037800688,
"grad_norm": 0.08017238229513168,
"learning_rate": 2.028069166768663e-05,
"loss": 0.0086,
"step": 8350
},
{
"epoch": 14.36426116838488,
"grad_norm": 0.3312987685203552,
"learning_rate": 2.016655686278836e-05,
"loss": 0.0123,
"step": 8360
},
{
"epoch": 14.381443298969073,
"grad_norm": 0.1905941665172577,
"learning_rate": 2.005266294123443e-05,
"loss": 0.0119,
"step": 8370
},
{
"epoch": 14.398625429553265,
"grad_norm": 0.17473655939102173,
"learning_rate": 1.9939010822635384e-05,
"loss": 0.0098,
"step": 8380
},
{
"epoch": 14.415807560137457,
"grad_norm": 0.2841387689113617,
"learning_rate": 1.982560142464939e-05,
"loss": 0.007,
"step": 8390
},
{
"epoch": 14.43298969072165,
"grad_norm": 0.15247130393981934,
"learning_rate": 1.9712435662974816e-05,
"loss": 0.008,
"step": 8400
},
{
"epoch": 14.450171821305842,
"grad_norm": 0.11507634073495865,
"learning_rate": 1.9599514451342816e-05,
"loss": 0.0076,
"step": 8410
},
{
"epoch": 14.467353951890034,
"grad_norm": 0.20362359285354614,
"learning_rate": 1.9486838701510012e-05,
"loss": 0.0072,
"step": 8420
},
{
"epoch": 14.484536082474227,
"grad_norm": 0.30303680896759033,
"learning_rate": 1.937440932325112e-05,
"loss": 0.0061,
"step": 8430
},
{
"epoch": 14.50171821305842,
"grad_norm": 0.18510393798351288,
"learning_rate": 1.926222722435152e-05,
"loss": 0.008,
"step": 8440
},
{
"epoch": 14.518900343642612,
"grad_norm": 0.20415428280830383,
"learning_rate": 1.9150293310600042e-05,
"loss": 0.0094,
"step": 8450
},
{
"epoch": 14.536082474226804,
"grad_norm": 0.17331425845623016,
"learning_rate": 1.903860848578159e-05,
"loss": 0.006,
"step": 8460
},
{
"epoch": 14.553264604810996,
"grad_norm": 0.18844066560268402,
"learning_rate": 1.8927173651669877e-05,
"loss": 0.008,
"step": 8470
},
{
"epoch": 14.570446735395189,
"grad_norm": 0.14389431476593018,
"learning_rate": 1.8815989708020055e-05,
"loss": 0.0097,
"step": 8480
},
{
"epoch": 14.587628865979381,
"grad_norm": 0.10053612291812897,
"learning_rate": 1.8705057552561595e-05,
"loss": 0.0081,
"step": 8490
},
{
"epoch": 14.604810996563574,
"grad_norm": 0.1731092631816864,
"learning_rate": 1.8594378080990915e-05,
"loss": 0.0077,
"step": 8500
},
{
"epoch": 14.621993127147766,
"grad_norm": 0.1470867097377777,
"learning_rate": 1.8483952186964237e-05,
"loss": 0.0165,
"step": 8510
},
{
"epoch": 14.639175257731958,
"grad_norm": 0.1546664535999298,
"learning_rate": 1.8373780762090266e-05,
"loss": 0.012,
"step": 8520
},
{
"epoch": 14.65635738831615,
"grad_norm": 0.2409757524728775,
"learning_rate": 1.82638646959231e-05,
"loss": 0.0074,
"step": 8530
},
{
"epoch": 14.673539518900343,
"grad_norm": 0.1771342009305954,
"learning_rate": 1.8154204875955e-05,
"loss": 0.013,
"step": 8540
},
{
"epoch": 14.690721649484535,
"grad_norm": 0.11487053334712982,
"learning_rate": 1.804480218760922e-05,
"loss": 0.0074,
"step": 8550
},
{
"epoch": 14.707903780068728,
"grad_norm": 0.18041536211967468,
"learning_rate": 1.793565751423284e-05,
"loss": 0.0074,
"step": 8560
},
{
"epoch": 14.72508591065292,
"grad_norm": 0.1591220200061798,
"learning_rate": 1.782677173708968e-05,
"loss": 0.0066,
"step": 8570
},
{
"epoch": 14.742268041237114,
"grad_norm": 0.21568642556667328,
"learning_rate": 1.771814573535317e-05,
"loss": 0.0079,
"step": 8580
},
{
"epoch": 14.759450171821307,
"grad_norm": 0.12885995209217072,
"learning_rate": 1.7609780386099234e-05,
"loss": 0.0092,
"step": 8590
},
{
"epoch": 14.776632302405499,
"grad_norm": 0.158493772149086,
"learning_rate": 1.750167656429918e-05,
"loss": 0.0073,
"step": 8600
},
{
"epoch": 14.793814432989691,
"grad_norm": 0.14277073740959167,
"learning_rate": 1.739383514281273e-05,
"loss": 0.0096,
"step": 8610
},
{
"epoch": 14.810996563573884,
"grad_norm": 0.1386091113090515,
"learning_rate": 1.7286256992380888e-05,
"loss": 0.0069,
"step": 8620
},
{
"epoch": 14.828178694158076,
"grad_norm": 0.12749871611595154,
"learning_rate": 1.7178942981618945e-05,
"loss": 0.0127,
"step": 8630
},
{
"epoch": 14.845360824742269,
"grad_norm": 0.18113838136196136,
"learning_rate": 1.707189397700943e-05,
"loss": 0.0057,
"step": 8640
},
{
"epoch": 14.862542955326461,
"grad_norm": 0.2324298769235611,
"learning_rate": 1.696511084289516e-05,
"loss": 0.0078,
"step": 8650
},
{
"epoch": 14.879725085910653,
"grad_norm": 0.09419187903404236,
"learning_rate": 1.6858594441472242e-05,
"loss": 0.0049,
"step": 8660
},
{
"epoch": 14.896907216494846,
"grad_norm": 0.17267560958862305,
"learning_rate": 1.6752345632783135e-05,
"loss": 0.0081,
"step": 8670
},
{
"epoch": 14.914089347079038,
"grad_norm": 0.24412667751312256,
"learning_rate": 1.664636527470961e-05,
"loss": 0.0075,
"step": 8680
},
{
"epoch": 14.93127147766323,
"grad_norm": 0.10746461153030396,
"learning_rate": 1.6540654222965973e-05,
"loss": 0.0081,
"step": 8690
},
{
"epoch": 14.948453608247423,
"grad_norm": 0.2459520846605301,
"learning_rate": 1.6435213331092027e-05,
"loss": 0.0083,
"step": 8700
},
{
"epoch": 14.965635738831615,
"grad_norm": 0.26834601163864136,
"learning_rate": 1.6330043450446265e-05,
"loss": 0.0061,
"step": 8710
},
{
"epoch": 14.982817869415808,
"grad_norm": 0.20477992296218872,
"learning_rate": 1.6225145430198918e-05,
"loss": 0.008,
"step": 8720
},
{
"epoch": 15.0,
"grad_norm": 0.19616572558879852,
"learning_rate": 1.6120520117325184e-05,
"loss": 0.0074,
"step": 8730
},
{
"epoch": 15.017182130584192,
"grad_norm": 0.14601653814315796,
"learning_rate": 1.6016168356598343e-05,
"loss": 0.005,
"step": 8740
},
{
"epoch": 15.034364261168385,
"grad_norm": 0.11024006456136703,
"learning_rate": 1.59120909905829e-05,
"loss": 0.0068,
"step": 8750
},
{
"epoch": 15.051546391752577,
"grad_norm": 0.21115554869174957,
"learning_rate": 1.580828885962787e-05,
"loss": 0.0092,
"step": 8760
},
{
"epoch": 15.06872852233677,
"grad_norm": 0.23726613819599152,
"learning_rate": 1.5704762801859916e-05,
"loss": 0.0091,
"step": 8770
},
{
"epoch": 15.085910652920962,
"grad_norm": 0.08648664504289627,
"learning_rate": 1.560151365317665e-05,
"loss": 0.0112,
"step": 8780
},
{
"epoch": 15.103092783505154,
"grad_norm": 0.14515356719493866,
"learning_rate": 1.549854224723978e-05,
"loss": 0.0142,
"step": 8790
},
{
"epoch": 15.120274914089347,
"grad_norm": 0.28362399339675903,
"learning_rate": 1.5395849415468505e-05,
"loss": 0.0138,
"step": 8800
},
{
"epoch": 15.137457044673539,
"grad_norm": 0.16832397878170013,
"learning_rate": 1.529343598703267e-05,
"loss": 0.0146,
"step": 8810
},
{
"epoch": 15.154639175257731,
"grad_norm": 0.16828805208206177,
"learning_rate": 1.5191302788846256e-05,
"loss": 0.0061,
"step": 8820
},
{
"epoch": 15.171821305841924,
"grad_norm": 0.12565237283706665,
"learning_rate": 1.508945064556047e-05,
"loss": 0.0054,
"step": 8830
},
{
"epoch": 15.189003436426116,
"grad_norm": 0.06071079894900322,
"learning_rate": 1.498788037955728e-05,
"loss": 0.0131,
"step": 8840
},
{
"epoch": 15.206185567010309,
"grad_norm": 0.10003683716058731,
"learning_rate": 1.4886592810942629e-05,
"loss": 0.0128,
"step": 8850
},
{
"epoch": 15.223367697594501,
"grad_norm": 0.30496570467948914,
"learning_rate": 1.4785588757539991e-05,
"loss": 0.0083,
"step": 8860
},
{
"epoch": 15.240549828178693,
"grad_norm": 0.24895748496055603,
"learning_rate": 1.4684869034883554e-05,
"loss": 0.0106,
"step": 8870
},
{
"epoch": 15.257731958762886,
"grad_norm": 0.21190665662288666,
"learning_rate": 1.458443445621182e-05,
"loss": 0.0116,
"step": 8880
},
{
"epoch": 15.27491408934708,
"grad_norm": 0.29413625597953796,
"learning_rate": 1.448428583246088e-05,
"loss": 0.0078,
"step": 8890
},
{
"epoch": 15.292096219931272,
"grad_norm": 0.11148206889629364,
"learning_rate": 1.4384423972258055e-05,
"loss": 0.0073,
"step": 8900
},
{
"epoch": 15.309278350515465,
"grad_norm": 0.12171918898820877,
"learning_rate": 1.4284849681915158e-05,
"loss": 0.0073,
"step": 8910
},
{
"epoch": 15.326460481099657,
"grad_norm": 0.13666512072086334,
"learning_rate": 1.4185563765422155e-05,
"loss": 0.0072,
"step": 8920
},
{
"epoch": 15.34364261168385,
"grad_norm": 0.10317881405353546,
"learning_rate": 1.4086567024440527e-05,
"loss": 0.0059,
"step": 8930
},
{
"epoch": 15.360824742268042,
"grad_norm": 0.12942399084568024,
"learning_rate": 1.398786025829698e-05,
"loss": 0.006,
"step": 8940
},
{
"epoch": 15.378006872852234,
"grad_norm": 0.22688932716846466,
"learning_rate": 1.3889444263976786e-05,
"loss": 0.0074,
"step": 8950
},
{
"epoch": 15.395189003436426,
"grad_norm": 0.184346541762352,
"learning_rate": 1.3791319836117506e-05,
"loss": 0.0094,
"step": 8960
},
{
"epoch": 15.412371134020619,
"grad_norm": 0.3060619533061981,
"learning_rate": 1.3693487767002445e-05,
"loss": 0.0066,
"step": 8970
},
{
"epoch": 15.429553264604811,
"grad_norm": 0.2948829233646393,
"learning_rate": 1.3595948846554446e-05,
"loss": 0.0098,
"step": 8980
},
{
"epoch": 15.446735395189004,
"grad_norm": 0.22396834194660187,
"learning_rate": 1.3498703862329254e-05,
"loss": 0.0088,
"step": 8990
},
{
"epoch": 15.463917525773196,
"grad_norm": 0.10084215551614761,
"learning_rate": 1.3401753599509397e-05,
"loss": 0.0075,
"step": 9000
},
{
"epoch": 15.481099656357388,
"grad_norm": 0.24087756872177124,
"learning_rate": 1.3305098840897646e-05,
"loss": 0.0113,
"step": 9010
},
{
"epoch": 15.49828178694158,
"grad_norm": 0.10929588228464127,
"learning_rate": 1.3208740366910904e-05,
"loss": 0.0079,
"step": 9020
},
{
"epoch": 15.515463917525773,
"grad_norm": 0.10467175394296646,
"learning_rate": 1.3112678955573693e-05,
"loss": 0.0078,
"step": 9030
},
{
"epoch": 15.532646048109966,
"grad_norm": 0.16169194877147675,
"learning_rate": 1.3016915382512029e-05,
"loss": 0.0073,
"step": 9040
},
{
"epoch": 15.549828178694158,
"grad_norm": 0.23535805940628052,
"learning_rate": 1.2921450420947057e-05,
"loss": 0.009,
"step": 9050
},
{
"epoch": 15.56701030927835,
"grad_norm": 0.14566457271575928,
"learning_rate": 1.2826284841688885e-05,
"loss": 0.0091,
"step": 9060
},
{
"epoch": 15.584192439862543,
"grad_norm": 0.05960577726364136,
"learning_rate": 1.2731419413130325e-05,
"loss": 0.0077,
"step": 9070
},
{
"epoch": 15.601374570446735,
"grad_norm": 0.12475630640983582,
"learning_rate": 1.2636854901240681e-05,
"loss": 0.0071,
"step": 9080
},
{
"epoch": 15.618556701030927,
"grad_norm": 0.08590589463710785,
"learning_rate": 1.2542592069559556e-05,
"loss": 0.0058,
"step": 9090
},
{
"epoch": 15.63573883161512,
"grad_norm": 0.13284145295619965,
"learning_rate": 1.2448631679190736e-05,
"loss": 0.0061,
"step": 9100
},
{
"epoch": 15.652920962199312,
"grad_norm": 0.13162577152252197,
"learning_rate": 1.2354974488796017e-05,
"loss": 0.0059,
"step": 9110
},
{
"epoch": 15.670103092783505,
"grad_norm": 0.09567181766033173,
"learning_rate": 1.2261621254589022e-05,
"loss": 0.0066,
"step": 9120
},
{
"epoch": 15.687285223367697,
"grad_norm": 0.0930383950471878,
"learning_rate": 1.2168572730329214e-05,
"loss": 0.0095,
"step": 9130
},
{
"epoch": 15.70446735395189,
"grad_norm": 0.09626749902963638,
"learning_rate": 1.2075829667315708e-05,
"loss": 0.0073,
"step": 9140
},
{
"epoch": 15.721649484536082,
"grad_norm": 0.18719489872455597,
"learning_rate": 1.1983392814381273e-05,
"loss": 0.007,
"step": 9150
},
{
"epoch": 15.738831615120276,
"grad_norm": 0.1957959532737732,
"learning_rate": 1.1891262917886198e-05,
"loss": 0.0083,
"step": 9160
},
{
"epoch": 15.756013745704468,
"grad_norm": 0.21053771674633026,
"learning_rate": 1.1799440721712368e-05,
"loss": 0.0083,
"step": 9170
},
{
"epoch": 15.77319587628866,
"grad_norm": 0.2002599984407425,
"learning_rate": 1.170792696725721e-05,
"loss": 0.0065,
"step": 9180
},
{
"epoch": 15.790378006872853,
"grad_norm": 0.19797247648239136,
"learning_rate": 1.1616722393427704e-05,
"loss": 0.0062,
"step": 9190
},
{
"epoch": 15.807560137457045,
"grad_norm": 0.24263660609722137,
"learning_rate": 1.1525827736634398e-05,
"loss": 0.0067,
"step": 9200
},
{
"epoch": 15.824742268041238,
"grad_norm": 0.30493855476379395,
"learning_rate": 1.1435243730785511e-05,
"loss": 0.0128,
"step": 9210
},
{
"epoch": 15.84192439862543,
"grad_norm": 0.09943480044603348,
"learning_rate": 1.1344971107280978e-05,
"loss": 0.0042,
"step": 9220
},
{
"epoch": 15.859106529209622,
"grad_norm": 0.08174432069063187,
"learning_rate": 1.125501059500656e-05,
"loss": 0.0062,
"step": 9230
},
{
"epoch": 15.876288659793815,
"grad_norm": 0.15180669724941254,
"learning_rate": 1.1165362920327898e-05,
"loss": 0.012,
"step": 9240
},
{
"epoch": 15.893470790378007,
"grad_norm": 0.09688429534435272,
"learning_rate": 1.1076028807084748e-05,
"loss": 0.0074,
"step": 9250
},
{
"epoch": 15.9106529209622,
"grad_norm": 0.0951455757021904,
"learning_rate": 1.0987008976585073e-05,
"loss": 0.0083,
"step": 9260
},
{
"epoch": 15.927835051546392,
"grad_norm": 0.08295347541570663,
"learning_rate": 1.0898304147599231e-05,
"loss": 0.0066,
"step": 9270
},
{
"epoch": 15.945017182130584,
"grad_norm": 0.20688819885253906,
"learning_rate": 1.0809915036354152e-05,
"loss": 0.0095,
"step": 9280
},
{
"epoch": 15.962199312714777,
"grad_norm": 0.09953152388334274,
"learning_rate": 1.0721842356527595e-05,
"loss": 0.0052,
"step": 9290
},
{
"epoch": 15.97938144329897,
"grad_norm": 0.27744606137275696,
"learning_rate": 1.063408681924236e-05,
"loss": 0.0105,
"step": 9300
},
{
"epoch": 15.996563573883162,
"grad_norm": 0.217054545879364,
"learning_rate": 1.0546649133060583e-05,
"loss": 0.0076,
"step": 9310
},
{
"epoch": 16.013745704467354,
"grad_norm": 0.15550248324871063,
"learning_rate": 1.0459530003977908e-05,
"loss": 0.0195,
"step": 9320
},
{
"epoch": 16.030927835051546,
"grad_norm": 0.3384007513523102,
"learning_rate": 1.0372730135417936e-05,
"loss": 0.0066,
"step": 9330
},
{
"epoch": 16.04810996563574,
"grad_norm": 0.23240487277507782,
"learning_rate": 1.0286250228226434e-05,
"loss": 0.0064,
"step": 9340
},
{
"epoch": 16.06529209621993,
"grad_norm": 0.13849616050720215,
"learning_rate": 1.0200090980665739e-05,
"loss": 0.006,
"step": 9350
},
{
"epoch": 16.082474226804123,
"grad_norm": 0.09946257621049881,
"learning_rate": 1.0114253088409054e-05,
"loss": 0.0058,
"step": 9360
},
{
"epoch": 16.099656357388316,
"grad_norm": 0.17231334745883942,
"learning_rate": 1.0028737244534914e-05,
"loss": 0.0123,
"step": 9370
},
{
"epoch": 16.116838487972508,
"grad_norm": 0.23527809977531433,
"learning_rate": 9.943544139521521e-06,
"loss": 0.0047,
"step": 9380
},
{
"epoch": 16.1340206185567,
"grad_norm": 0.11022651195526123,
"learning_rate": 9.858674461241229e-06,
"loss": 0.0127,
"step": 9390
},
{
"epoch": 16.151202749140893,
"grad_norm": 0.09669523686170578,
"learning_rate": 9.774128894954904e-06,
"loss": 0.0137,
"step": 9400
},
{
"epoch": 16.168384879725085,
"grad_norm": 0.21540790796279907,
"learning_rate": 9.68990812330648e-06,
"loss": 0.0063,
"step": 9410
},
{
"epoch": 16.185567010309278,
"grad_norm": 0.10020115226507187,
"learning_rate": 9.606012826317417e-06,
"loss": 0.0051,
"step": 9420
},
{
"epoch": 16.20274914089347,
"grad_norm": 0.21819765865802765,
"learning_rate": 9.522443681381172e-06,
"loss": 0.0073,
"step": 9430
},
{
"epoch": 16.219931271477662,
"grad_norm": 0.15638329088687897,
"learning_rate": 9.439201363257778e-06,
"loss": 0.008,
"step": 9440
},
{
"epoch": 16.237113402061855,
"grad_norm": 0.23262475430965424,
"learning_rate": 9.356286544068394e-06,
"loss": 0.0093,
"step": 9450
},
{
"epoch": 16.254295532646047,
"grad_norm": 0.05870979651808739,
"learning_rate": 9.273699893289862e-06,
"loss": 0.0041,
"step": 9460
},
{
"epoch": 16.27147766323024,
"grad_norm": 0.1267542541027069,
"learning_rate": 9.191442077749257e-06,
"loss": 0.0068,
"step": 9470
},
{
"epoch": 16.288659793814432,
"grad_norm": 0.11667685955762863,
"learning_rate": 9.10951376161861e-06,
"loss": 0.0037,
"step": 9480
},
{
"epoch": 16.305841924398624,
"grad_norm": 0.13009728491306305,
"learning_rate": 9.027915606409427e-06,
"loss": 0.0088,
"step": 9490
},
{
"epoch": 16.323024054982817,
"grad_norm": 0.2052125632762909,
"learning_rate": 8.946648270967473e-06,
"loss": 0.0058,
"step": 9500
},
{
"epoch": 16.34020618556701,
"grad_norm": 0.06314068287611008,
"learning_rate": 8.86571241146732e-06,
"loss": 0.0074,
"step": 9510
},
{
"epoch": 16.3573883161512,
"grad_norm": 0.1686325967311859,
"learning_rate": 8.785108681407156e-06,
"loss": 0.0075,
"step": 9520
},
{
"epoch": 16.374570446735394,
"grad_norm": 0.07260609418153763,
"learning_rate": 8.704837731603415e-06,
"loss": 0.0055,
"step": 9530
},
{
"epoch": 16.391752577319586,
"grad_norm": 0.1643422245979309,
"learning_rate": 8.624900210185648e-06,
"loss": 0.0089,
"step": 9540
},
{
"epoch": 16.40893470790378,
"grad_norm": 0.20184507966041565,
"learning_rate": 8.545296762591144e-06,
"loss": 0.01,
"step": 9550
},
{
"epoch": 16.42611683848797,
"grad_norm": 0.07979606091976166,
"learning_rate": 8.466028031559836e-06,
"loss": 0.0054,
"step": 9560
},
{
"epoch": 16.443298969072163,
"grad_norm": 0.10305003076791763,
"learning_rate": 8.387094657129013e-06,
"loss": 0.0101,
"step": 9570
},
{
"epoch": 16.460481099656356,
"grad_norm": 0.14057329297065735,
"learning_rate": 8.308497276628279e-06,
"loss": 0.0038,
"step": 9580
},
{
"epoch": 16.477663230240548,
"grad_norm": 0.20156535506248474,
"learning_rate": 8.230236524674256e-06,
"loss": 0.0077,
"step": 9590
},
{
"epoch": 16.49484536082474,
"grad_norm": 0.12216826528310776,
"learning_rate": 8.152313033165592e-06,
"loss": 0.0114,
"step": 9600
},
{
"epoch": 16.512027491408936,
"grad_norm": 0.22437122464179993,
"learning_rate": 8.074727431277745e-06,
"loss": 0.006,
"step": 9610
},
{
"epoch": 16.52920962199313,
"grad_norm": 0.21979045867919922,
"learning_rate": 7.99748034545803e-06,
"loss": 0.0063,
"step": 9620
},
{
"epoch": 16.54639175257732,
"grad_norm": 0.13207381963729858,
"learning_rate": 7.920572399420428e-06,
"loss": 0.0069,
"step": 9630
},
{
"epoch": 16.563573883161514,
"grad_norm": 0.3022846579551697,
"learning_rate": 7.844004214140665e-06,
"loss": 0.0051,
"step": 9640
},
{
"epoch": 16.580756013745706,
"grad_norm": 0.16219815611839294,
"learning_rate": 7.76777640785108e-06,
"loss": 0.0094,
"step": 9650
},
{
"epoch": 16.5979381443299,
"grad_norm": 0.2360786646604538,
"learning_rate": 7.691889596035784e-06,
"loss": 0.0075,
"step": 9660
},
{
"epoch": 16.61512027491409,
"grad_norm": 0.2144838273525238,
"learning_rate": 7.616344391425534e-06,
"loss": 0.0158,
"step": 9670
},
{
"epoch": 16.632302405498283,
"grad_norm": 0.17959503829479218,
"learning_rate": 7.541141403992902e-06,
"loss": 0.0168,
"step": 9680
},
{
"epoch": 16.649484536082475,
"grad_norm": 0.06356091052293777,
"learning_rate": 7.4662812409472705e-06,
"loss": 0.0054,
"step": 9690
},
{
"epoch": 16.666666666666668,
"grad_norm": 0.16825449466705322,
"learning_rate": 7.391764506729992e-06,
"loss": 0.0133,
"step": 9700
},
{
"epoch": 16.68384879725086,
"grad_norm": 0.06855875998735428,
"learning_rate": 7.317591803009472e-06,
"loss": 0.0053,
"step": 9710
},
{
"epoch": 16.701030927835053,
"grad_norm": 0.07593529671430588,
"learning_rate": 7.243763728676328e-06,
"loss": 0.0102,
"step": 9720
},
{
"epoch": 16.718213058419245,
"grad_norm": 0.2639977037906647,
"learning_rate": 7.170280879838515e-06,
"loss": 0.0063,
"step": 9730
},
{
"epoch": 16.735395189003437,
"grad_norm": 0.1285523623228073,
"learning_rate": 7.097143849816584e-06,
"loss": 0.0066,
"step": 9740
},
{
"epoch": 16.75257731958763,
"grad_norm": 0.07977993786334991,
"learning_rate": 7.024353229138836e-06,
"loss": 0.0098,
"step": 9750
},
{
"epoch": 16.769759450171822,
"grad_norm": 0.08578211069107056,
"learning_rate": 6.951909605536544e-06,
"loss": 0.0085,
"step": 9760
},
{
"epoch": 16.786941580756015,
"grad_norm": 0.12852418422698975,
"learning_rate": 6.879813563939269e-06,
"loss": 0.0053,
"step": 9770
},
{
"epoch": 16.804123711340207,
"grad_norm": 0.16339828073978424,
"learning_rate": 6.808065686470083e-06,
"loss": 0.0065,
"step": 9780
},
{
"epoch": 16.8213058419244,
"grad_norm": 0.14644475281238556,
"learning_rate": 6.736666552440896e-06,
"loss": 0.0091,
"step": 9790
},
{
"epoch": 16.83848797250859,
"grad_norm": 0.1779240071773529,
"learning_rate": 6.665616738347741e-06,
"loss": 0.0051,
"step": 9800
},
{
"epoch": 16.855670103092784,
"grad_norm": 0.2106473445892334,
"learning_rate": 6.5949168178661755e-06,
"loss": 0.0063,
"step": 9810
},
{
"epoch": 16.872852233676976,
"grad_norm": 0.09443774074316025,
"learning_rate": 6.524567361846612e-06,
"loss": 0.007,
"step": 9820
},
{
"epoch": 16.89003436426117,
"grad_norm": 0.1403152197599411,
"learning_rate": 6.454568938309724e-06,
"loss": 0.0055,
"step": 9830
},
{
"epoch": 16.90721649484536,
"grad_norm": 0.2038644254207611,
"learning_rate": 6.384922112441821e-06,
"loss": 0.0047,
"step": 9840
},
{
"epoch": 16.924398625429554,
"grad_norm": 0.22020995616912842,
"learning_rate": 6.315627446590367e-06,
"loss": 0.0052,
"step": 9850
},
{
"epoch": 16.941580756013746,
"grad_norm": 0.07635272294282913,
"learning_rate": 6.246685500259352e-06,
"loss": 0.0073,
"step": 9860
},
{
"epoch": 16.95876288659794,
"grad_norm": 0.18454794585704803,
"learning_rate": 6.1780968301048406e-06,
"loss": 0.0072,
"step": 9870
},
{
"epoch": 16.97594501718213,
"grad_norm": 0.3649043142795563,
"learning_rate": 6.10986198993041e-06,
"loss": 0.0048,
"step": 9880
},
{
"epoch": 16.993127147766323,
"grad_norm": 0.09747838973999023,
"learning_rate": 6.041981530682756e-06,
"loss": 0.0071,
"step": 9890
},
{
"epoch": 17.010309278350515,
"grad_norm": 0.170127272605896,
"learning_rate": 5.9744560004471874e-06,
"loss": 0.0089,
"step": 9900
},
{
"epoch": 17.027491408934708,
"grad_norm": 0.11401594430208206,
"learning_rate": 5.907285944443241e-06,
"loss": 0.0064,
"step": 9910
},
{
"epoch": 17.0446735395189,
"grad_norm": 0.09278935939073563,
"learning_rate": 5.840471905020223e-06,
"loss": 0.0038,
"step": 9920
},
{
"epoch": 17.061855670103093,
"grad_norm": 0.09396050870418549,
"learning_rate": 5.774014421652879e-06,
"loss": 0.0088,
"step": 9930
},
{
"epoch": 17.079037800687285,
"grad_norm": 0.22349518537521362,
"learning_rate": 5.707914030937045e-06,
"loss": 0.0116,
"step": 9940
},
{
"epoch": 17.096219931271477,
"grad_norm": 0.11371111124753952,
"learning_rate": 5.642171266585272e-06,
"loss": 0.0071,
"step": 9950
},
{
"epoch": 17.11340206185567,
"grad_norm": 0.11897526681423187,
"learning_rate": 5.576786659422534e-06,
"loss": 0.0124,
"step": 9960
},
{
"epoch": 17.130584192439862,
"grad_norm": 0.11207327991724014,
"learning_rate": 5.51176073738196e-06,
"loss": 0.0056,
"step": 9970
},
{
"epoch": 17.147766323024054,
"grad_norm": 0.11493603140115738,
"learning_rate": 5.447094025500554e-06,
"loss": 0.0057,
"step": 9980
},
{
"epoch": 17.164948453608247,
"grad_norm": 0.15954163670539856,
"learning_rate": 5.3827870459149665e-06,
"loss": 0.0062,
"step": 9990
},
{
"epoch": 17.18213058419244,
"grad_norm": 0.14759565889835358,
"learning_rate": 5.318840317857248e-06,
"loss": 0.0072,
"step": 10000
},
{
"epoch": 17.19931271477663,
"grad_norm": 0.12291218340396881,
"learning_rate": 5.2552543576506965e-06,
"loss": 0.0049,
"step": 10010
},
{
"epoch": 17.216494845360824,
"grad_norm": 0.15934151411056519,
"learning_rate": 5.192029678705679e-06,
"loss": 0.0117,
"step": 10020
},
{
"epoch": 17.233676975945016,
"grad_norm": 0.06648825109004974,
"learning_rate": 5.1291667915154774e-06,
"loss": 0.0056,
"step": 10030
},
{
"epoch": 17.25085910652921,
"grad_norm": 0.08082503825426102,
"learning_rate": 5.066666203652148e-06,
"loss": 0.0076,
"step": 10040
},
{
"epoch": 17.2680412371134,
"grad_norm": 0.06852155178785324,
"learning_rate": 5.004528419762455e-06,
"loss": 0.0075,
"step": 10050
},
{
"epoch": 17.285223367697593,
"grad_norm": 0.22112947702407837,
"learning_rate": 4.9427539415638106e-06,
"loss": 0.0064,
"step": 10060
},
{
"epoch": 17.302405498281786,
"grad_norm": 0.11393307894468307,
"learning_rate": 4.88134326784015e-06,
"loss": 0.0054,
"step": 10070
},
{
"epoch": 17.31958762886598,
"grad_norm": 0.07953692972660065,
"learning_rate": 4.8202968944379865e-06,
"loss": 0.0069,
"step": 10080
},
{
"epoch": 17.33676975945017,
"grad_norm": 0.12307379394769669,
"learning_rate": 4.759615314262361e-06,
"loss": 0.0086,
"step": 10090
},
{
"epoch": 17.353951890034363,
"grad_norm": 0.2868382930755615,
"learning_rate": 4.6992990172728846e-06,
"loss": 0.0067,
"step": 10100
},
{
"epoch": 17.371134020618555,
"grad_norm": 0.13150477409362793,
"learning_rate": 4.639348490479755e-06,
"loss": 0.0052,
"step": 10110
},
{
"epoch": 17.388316151202748,
"grad_norm": 0.13385729491710663,
"learning_rate": 4.579764217939863e-06,
"loss": 0.0085,
"step": 10120
},
{
"epoch": 17.40549828178694,
"grad_norm": 0.05345318466424942,
"learning_rate": 4.5205466807528294e-06,
"loss": 0.0053,
"step": 10130
},
{
"epoch": 17.422680412371133,
"grad_norm": 0.09881814569234848,
"learning_rate": 4.4616963570572105e-06,
"loss": 0.0037,
"step": 10140
},
{
"epoch": 17.439862542955325,
"grad_norm": 0.15886837244033813,
"learning_rate": 4.403213722026516e-06,
"loss": 0.0094,
"step": 10150
},
{
"epoch": 17.457044673539517,
"grad_norm": 0.046078938990831375,
"learning_rate": 4.345099247865486e-06,
"loss": 0.0066,
"step": 10160
},
{
"epoch": 17.47422680412371,
"grad_norm": 0.31892022490501404,
"learning_rate": 4.287353403806188e-06,
"loss": 0.0071,
"step": 10170
},
{
"epoch": 17.491408934707902,
"grad_norm": 0.23924608528614044,
"learning_rate": 4.229976656104323e-06,
"loss": 0.0065,
"step": 10180
},
{
"epoch": 17.508591065292094,
"grad_norm": 0.10865090042352676,
"learning_rate": 4.172969468035359e-06,
"loss": 0.0059,
"step": 10190
},
{
"epoch": 17.52577319587629,
"grad_norm": 0.10951688885688782,
"learning_rate": 4.116332299890868e-06,
"loss": 0.0114,
"step": 10200
},
{
"epoch": 17.542955326460483,
"grad_norm": 0.08340981602668762,
"learning_rate": 4.060065608974744e-06,
"loss": 0.006,
"step": 10210
},
{
"epoch": 17.560137457044675,
"grad_norm": 0.2836835980415344,
"learning_rate": 4.0041698495996095e-06,
"loss": 0.006,
"step": 10220
},
{
"epoch": 17.577319587628867,
"grad_norm": 0.34195101261138916,
"learning_rate": 3.948645473083018e-06,
"loss": 0.0111,
"step": 10230
},
{
"epoch": 17.59450171821306,
"grad_norm": 0.136323943734169,
"learning_rate": 3.893492927743925e-06,
"loss": 0.0095,
"step": 10240
},
{
"epoch": 17.611683848797252,
"grad_norm": 0.19916993379592896,
"learning_rate": 3.838712658898974e-06,
"loss": 0.0065,
"step": 10250
},
{
"epoch": 17.628865979381445,
"grad_norm": 0.10761203616857529,
"learning_rate": 3.7843051088590153e-06,
"loss": 0.0042,
"step": 10260
},
{
"epoch": 17.646048109965637,
"grad_norm": 0.08006154000759125,
"learning_rate": 3.730270716925394e-06,
"loss": 0.0062,
"step": 10270
},
{
"epoch": 17.66323024054983,
"grad_norm": 0.07104959338903427,
"learning_rate": 3.67660991938652e-06,
"loss": 0.005,
"step": 10280
},
{
"epoch": 17.68041237113402,
"grad_norm": 0.19979846477508545,
"learning_rate": 3.6233231495142626e-06,
"loss": 0.0053,
"step": 10290
},
{
"epoch": 17.697594501718214,
"grad_norm": 0.20565828680992126,
"learning_rate": 3.5704108375605448e-06,
"loss": 0.0055,
"step": 10300
},
{
"epoch": 17.714776632302407,
"grad_norm": 0.17104719579219818,
"learning_rate": 3.5178734107537637e-06,
"loss": 0.0105,
"step": 10310
},
{
"epoch": 17.7319587628866,
"grad_norm": 0.08832945674657822,
"learning_rate": 3.4657112932954204e-06,
"loss": 0.0056,
"step": 10320
},
{
"epoch": 17.74914089347079,
"grad_norm": 0.053497232496738434,
"learning_rate": 3.4139249063566415e-06,
"loss": 0.0037,
"step": 10330
},
{
"epoch": 17.766323024054984,
"grad_norm": 0.05350416153669357,
"learning_rate": 3.36251466807484e-06,
"loss": 0.0058,
"step": 10340
},
{
"epoch": 17.783505154639176,
"grad_norm": 0.19712647795677185,
"learning_rate": 3.311480993550259e-06,
"loss": 0.0059,
"step": 10350
},
{
"epoch": 17.80068728522337,
"grad_norm": 0.1699119359254837,
"learning_rate": 3.2608242948427017e-06,
"loss": 0.008,
"step": 10360
},
{
"epoch": 17.81786941580756,
"grad_norm": 0.09976794570684433,
"learning_rate": 3.2105449809681334e-06,
"loss": 0.01,
"step": 10370
},
{
"epoch": 17.835051546391753,
"grad_norm": 0.12520617246627808,
"learning_rate": 3.160643457895435e-06,
"loss": 0.0057,
"step": 10380
},
{
"epoch": 17.852233676975946,
"grad_norm": 0.23560845851898193,
"learning_rate": 3.111120128543088e-06,
"loss": 0.0127,
"step": 10390
},
{
"epoch": 17.869415807560138,
"grad_norm": 0.134457528591156,
"learning_rate": 3.0619753927759565e-06,
"loss": 0.0067,
"step": 10400
},
{
"epoch": 17.88659793814433,
"grad_norm": 0.20700249075889587,
"learning_rate": 3.013209647401999e-06,
"loss": 0.0053,
"step": 10410
},
{
"epoch": 17.903780068728523,
"grad_norm": 0.1971113532781601,
"learning_rate": 2.964823286169133e-06,
"loss": 0.0073,
"step": 10420
},
{
"epoch": 17.920962199312715,
"grad_norm": 0.2505052089691162,
"learning_rate": 2.9168166997620263e-06,
"loss": 0.0066,
"step": 10430
},
{
"epoch": 17.938144329896907,
"grad_norm": 0.1978893280029297,
"learning_rate": 2.869190275798911e-06,
"loss": 0.0067,
"step": 10440
},
{
"epoch": 17.9553264604811,
"grad_norm": 0.09993084520101547,
"learning_rate": 2.821944398828519e-06,
"loss": 0.005,
"step": 10450
},
{
"epoch": 17.972508591065292,
"grad_norm": 0.0836280807852745,
"learning_rate": 2.775079450326917e-06,
"loss": 0.0076,
"step": 10460
},
{
"epoch": 17.989690721649485,
"grad_norm": 0.20113223791122437,
"learning_rate": 2.7285958086944786e-06,
"loss": 0.0039,
"step": 10470
},
{
"epoch": 18.006872852233677,
"grad_norm": 0.10885003954172134,
"learning_rate": 2.6824938492527595e-06,
"loss": 0.0093,
"step": 10480
},
{
"epoch": 18.02405498281787,
"grad_norm": 0.05460460111498833,
"learning_rate": 2.636773944241555e-06,
"loss": 0.003,
"step": 10490
},
{
"epoch": 18.04123711340206,
"grad_norm": 0.08809126913547516,
"learning_rate": 2.5914364628158217e-06,
"loss": 0.0044,
"step": 10500
},
{
"epoch": 18.058419243986254,
"grad_norm": 0.1788802295923233,
"learning_rate": 2.5464817710427414e-06,
"loss": 0.0072,
"step": 10510
},
{
"epoch": 18.075601374570446,
"grad_norm": 0.08905819058418274,
"learning_rate": 2.501910231898724e-06,
"loss": 0.0066,
"step": 10520
},
{
"epoch": 18.09278350515464,
"grad_norm": 0.08635038882493973,
"learning_rate": 2.457722205266516e-06,
"loss": 0.0048,
"step": 10530
},
{
"epoch": 18.10996563573883,
"grad_norm": 0.3244889974594116,
"learning_rate": 2.413918047932284e-06,
"loss": 0.0084,
"step": 10540
},
{
"epoch": 18.127147766323024,
"grad_norm": 0.1447770893573761,
"learning_rate": 2.370498113582731e-06,
"loss": 0.0054,
"step": 10550
},
{
"epoch": 18.144329896907216,
"grad_norm": 0.13908503949642181,
"learning_rate": 2.327462752802212e-06,
"loss": 0.0045,
"step": 10560
},
{
"epoch": 18.16151202749141,
"grad_norm": 0.15273341536521912,
"learning_rate": 2.2848123130699562e-06,
"loss": 0.0074,
"step": 10570
},
{
"epoch": 18.1786941580756,
"grad_norm": 0.09530580788850784,
"learning_rate": 2.2425471387572337e-06,
"loss": 0.0054,
"step": 10580
},
{
"epoch": 18.195876288659793,
"grad_norm": 0.0629500150680542,
"learning_rate": 2.2006675711245818e-06,
"loss": 0.0053,
"step": 10590
},
{
"epoch": 18.213058419243985,
"grad_norm": 0.13536952435970306,
"learning_rate": 2.15917394831901e-06,
"loss": 0.0076,
"step": 10600
},
{
"epoch": 18.230240549828178,
"grad_norm": 0.1433546394109726,
"learning_rate": 2.118066605371344e-06,
"loss": 0.0062,
"step": 10610
},
{
"epoch": 18.24742268041237,
"grad_norm": 0.07788240164518356,
"learning_rate": 2.0773458741934646e-06,
"loss": 0.0043,
"step": 10620
},
{
"epoch": 18.264604810996563,
"grad_norm": 0.2006695419549942,
"learning_rate": 2.0370120835756513e-06,
"loss": 0.0064,
"step": 10630
},
{
"epoch": 18.281786941580755,
"grad_norm": 0.09646660089492798,
"learning_rate": 1.9970655591838917e-06,
"loss": 0.0066,
"step": 10640
},
{
"epoch": 18.298969072164947,
"grad_norm": 0.08818981796503067,
"learning_rate": 1.9575066235573205e-06,
"loss": 0.0061,
"step": 10650
},
{
"epoch": 18.31615120274914,
"grad_norm": 0.10929910838603973,
"learning_rate": 1.918335596105553e-06,
"loss": 0.0054,
"step": 10660
},
{
"epoch": 18.333333333333332,
"grad_norm": 0.23234823346138,
"learning_rate": 1.8795527931061374e-06,
"loss": 0.0088,
"step": 10670
},
{
"epoch": 18.350515463917525,
"grad_norm": 0.06758160889148712,
"learning_rate": 1.841158527701975e-06,
"loss": 0.0074,
"step": 10680
},
{
"epoch": 18.367697594501717,
"grad_norm": 0.186759814620018,
"learning_rate": 1.8031531098988252e-06,
"loss": 0.0064,
"step": 10690
},
{
"epoch": 18.38487972508591,
"grad_norm": 0.12347927689552307,
"learning_rate": 1.765536846562782e-06,
"loss": 0.0066,
"step": 10700
},
{
"epoch": 18.4020618556701,
"grad_norm": 0.05850611999630928,
"learning_rate": 1.7283100414178078e-06,
"loss": 0.0071,
"step": 10710
},
{
"epoch": 18.419243986254294,
"grad_norm": 0.22844308614730835,
"learning_rate": 1.6914729950432474e-06,
"loss": 0.0052,
"step": 10720
},
{
"epoch": 18.436426116838486,
"grad_norm": 0.23077848553657532,
"learning_rate": 1.6550260048714628e-06,
"loss": 0.0094,
"step": 10730
},
{
"epoch": 18.45360824742268,
"grad_norm": 0.04616402089595795,
"learning_rate": 1.6189693651853687e-06,
"loss": 0.0079,
"step": 10740
},
{
"epoch": 18.47079037800687,
"grad_norm": 0.1235852912068367,
"learning_rate": 1.58330336711609e-06,
"loss": 0.0055,
"step": 10750
},
{
"epoch": 18.487972508591064,
"grad_norm": 0.06739755719900131,
"learning_rate": 1.5480282986406136e-06,
"loss": 0.0088,
"step": 10760
},
{
"epoch": 18.50515463917526,
"grad_norm": 0.08690838515758514,
"learning_rate": 1.5131444445794506e-06,
"loss": 0.004,
"step": 10770
},
{
"epoch": 18.522336769759452,
"grad_norm": 0.12189892679452896,
"learning_rate": 1.4786520865943344e-06,
"loss": 0.0053,
"step": 10780
},
{
"epoch": 18.539518900343644,
"grad_norm": 0.10416446626186371,
"learning_rate": 1.4445515031859591e-06,
"loss": 0.0042,
"step": 10790
},
{
"epoch": 18.556701030927837,
"grad_norm": 0.13094037771224976,
"learning_rate": 1.4108429696917225e-06,
"loss": 0.0063,
"step": 10800
},
{
"epoch": 18.57388316151203,
"grad_norm": 0.07218027114868164,
"learning_rate": 1.3775267582834928e-06,
"loss": 0.0068,
"step": 10810
},
{
"epoch": 18.59106529209622,
"grad_norm": 0.10305195301771164,
"learning_rate": 1.34460313796545e-06,
"loss": 0.0059,
"step": 10820
},
{
"epoch": 18.608247422680414,
"grad_norm": 0.18571843206882477,
"learning_rate": 1.31207237457186e-06,
"loss": 0.0036,
"step": 10830
},
{
"epoch": 18.625429553264606,
"grad_norm": 0.09622086584568024,
"learning_rate": 1.2799347307649756e-06,
"loss": 0.0031,
"step": 10840
},
{
"epoch": 18.6426116838488,
"grad_norm": 0.06238604336977005,
"learning_rate": 1.248190466032867e-06,
"loss": 0.0041,
"step": 10850
},
{
"epoch": 18.65979381443299,
"grad_norm": 0.07316755503416061,
"learning_rate": 1.2168398366873946e-06,
"loss": 0.0048,
"step": 10860
},
{
"epoch": 18.676975945017183,
"grad_norm": 0.28695550560951233,
"learning_rate": 1.1858830958620559e-06,
"loss": 0.0064,
"step": 10870
},
{
"epoch": 18.694158075601376,
"grad_norm": 0.18788480758666992,
"learning_rate": 1.155320493510026e-06,
"loss": 0.008,
"step": 10880
},
{
"epoch": 18.711340206185568,
"grad_norm": 0.06678231060504913,
"learning_rate": 1.1251522764020638e-06,
"loss": 0.0058,
"step": 10890
},
{
"epoch": 18.72852233676976,
"grad_norm": 0.0773329809308052,
"learning_rate": 1.095378688124582e-06,
"loss": 0.0093,
"step": 10900
},
{
"epoch": 18.745704467353953,
"grad_norm": 0.12073255330324173,
"learning_rate": 1.0659999690776302e-06,
"loss": 0.0047,
"step": 10910
},
{
"epoch": 18.762886597938145,
"grad_norm": 0.09898494184017181,
"learning_rate": 1.0370163564729974e-06,
"loss": 0.007,
"step": 10920
},
{
"epoch": 18.780068728522338,
"grad_norm": 0.0713253766298294,
"learning_rate": 1.008428084332247e-06,
"loss": 0.0061,
"step": 10930
},
{
"epoch": 18.79725085910653,
"grad_norm": 0.1340664029121399,
"learning_rate": 9.802353834848953e-07,
"loss": 0.0045,
"step": 10940
},
{
"epoch": 18.814432989690722,
"grad_norm": 0.08532940596342087,
"learning_rate": 9.524384815664699e-07,
"loss": 0.0038,
"step": 10950
},
{
"epoch": 18.831615120274915,
"grad_norm": 0.12806546688079834,
"learning_rate": 9.250376030167429e-07,
"loss": 0.009,
"step": 10960
},
{
"epoch": 18.848797250859107,
"grad_norm": 0.07002566009759903,
"learning_rate": 8.980329690778499e-07,
"loss": 0.0035,
"step": 10970
},
{
"epoch": 18.8659793814433,
"grad_norm": 0.09975923597812653,
"learning_rate": 8.714247977925749e-07,
"loss": 0.004,
"step": 10980
},
{
"epoch": 18.883161512027492,
"grad_norm": 0.1369011402130127,
"learning_rate": 8.452133040025345e-07,
"loss": 0.004,
"step": 10990
},
{
"epoch": 18.900343642611684,
"grad_norm": 0.16670043766498566,
"learning_rate": 8.193986993464686e-07,
"loss": 0.0049,
"step": 11000
}
],
"logging_steps": 10,
"max_steps": 11638,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.7686827385256666e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}