{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 780,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002564102564102564,
"grad_norm": 0.13979803025722504,
"learning_rate": 2.564102564102564e-07,
"loss": 0.9999,
"step": 1
},
{
"epoch": 0.005128205128205128,
"grad_norm": 0.15988187491893768,
"learning_rate": 5.128205128205128e-07,
"loss": 1.0694,
"step": 2
},
{
"epoch": 0.007692307692307693,
"grad_norm": 0.1669122576713562,
"learning_rate": 7.692307692307694e-07,
"loss": 1.1319,
"step": 3
},
{
"epoch": 0.010256410256410256,
"grad_norm": 0.15094897150993347,
"learning_rate": 1.0256410256410257e-06,
"loss": 0.956,
"step": 4
},
{
"epoch": 0.01282051282051282,
"grad_norm": 0.15886467695236206,
"learning_rate": 1.2820512820512822e-06,
"loss": 1.2478,
"step": 5
},
{
"epoch": 0.015384615384615385,
"grad_norm": 0.13728941977024078,
"learning_rate": 1.5384615384615387e-06,
"loss": 1.0633,
"step": 6
},
{
"epoch": 0.017948717948717947,
"grad_norm": 0.14830218255519867,
"learning_rate": 1.794871794871795e-06,
"loss": 0.9022,
"step": 7
},
{
"epoch": 0.020512820512820513,
"grad_norm": 0.17115703225135803,
"learning_rate": 2.0512820512820513e-06,
"loss": 1.1275,
"step": 8
},
{
"epoch": 0.023076923076923078,
"grad_norm": 0.15854498744010925,
"learning_rate": 2.3076923076923077e-06,
"loss": 1.0972,
"step": 9
},
{
"epoch": 0.02564102564102564,
"grad_norm": 0.15408478677272797,
"learning_rate": 2.5641025641025644e-06,
"loss": 1.0453,
"step": 10
},
{
"epoch": 0.028205128205128206,
"grad_norm": 0.15272922813892365,
"learning_rate": 2.8205128205128207e-06,
"loss": 1.0399,
"step": 11
},
{
"epoch": 0.03076923076923077,
"grad_norm": 0.14716756343841553,
"learning_rate": 3.0769230769230774e-06,
"loss": 1.0765,
"step": 12
},
{
"epoch": 0.03333333333333333,
"grad_norm": 0.14600983262062073,
"learning_rate": 3.3333333333333337e-06,
"loss": 1.0616,
"step": 13
},
{
"epoch": 0.035897435897435895,
"grad_norm": 0.14107345044612885,
"learning_rate": 3.58974358974359e-06,
"loss": 1.1168,
"step": 14
},
{
"epoch": 0.038461538461538464,
"grad_norm": 0.14267683029174805,
"learning_rate": 3.846153846153847e-06,
"loss": 1.0462,
"step": 15
},
{
"epoch": 0.041025641025641026,
"grad_norm": 0.1449165940284729,
"learning_rate": 4.102564102564103e-06,
"loss": 1.1341,
"step": 16
},
{
"epoch": 0.04358974358974359,
"grad_norm": 0.14054569602012634,
"learning_rate": 4.358974358974359e-06,
"loss": 1.0704,
"step": 17
},
{
"epoch": 0.046153846153846156,
"grad_norm": 0.14372889697551727,
"learning_rate": 4.615384615384615e-06,
"loss": 1.1511,
"step": 18
},
{
"epoch": 0.04871794871794872,
"grad_norm": 0.13227999210357666,
"learning_rate": 4.871794871794872e-06,
"loss": 1.009,
"step": 19
},
{
"epoch": 0.05128205128205128,
"grad_norm": 0.15212635695934296,
"learning_rate": 5.128205128205129e-06,
"loss": 1.1836,
"step": 20
},
{
"epoch": 0.05384615384615385,
"grad_norm": 0.1289321482181549,
"learning_rate": 5.384615384615385e-06,
"loss": 1.1769,
"step": 21
},
{
"epoch": 0.05641025641025641,
"grad_norm": 0.12752899527549744,
"learning_rate": 5.641025641025641e-06,
"loss": 1.0759,
"step": 22
},
{
"epoch": 0.05897435897435897,
"grad_norm": 0.12242157757282257,
"learning_rate": 5.897435897435897e-06,
"loss": 1.0826,
"step": 23
},
{
"epoch": 0.06153846153846154,
"grad_norm": 0.1286514699459076,
"learning_rate": 6.153846153846155e-06,
"loss": 1.05,
"step": 24
},
{
"epoch": 0.0641025641025641,
"grad_norm": 0.13825258612632751,
"learning_rate": 6.410256410256411e-06,
"loss": 1.0944,
"step": 25
},
{
"epoch": 0.06666666666666667,
"grad_norm": 0.13136117160320282,
"learning_rate": 6.6666666666666675e-06,
"loss": 1.0472,
"step": 26
},
{
"epoch": 0.06923076923076923,
"grad_norm": 0.11351772397756577,
"learning_rate": 6.923076923076923e-06,
"loss": 1.071,
"step": 27
},
{
"epoch": 0.07179487179487179,
"grad_norm": 0.12361160665750504,
"learning_rate": 7.17948717948718e-06,
"loss": 1.0955,
"step": 28
},
{
"epoch": 0.07435897435897436,
"grad_norm": 0.13740238547325134,
"learning_rate": 7.435897435897436e-06,
"loss": 1.1061,
"step": 29
},
{
"epoch": 0.07692307692307693,
"grad_norm": 0.11295292526483536,
"learning_rate": 7.692307692307694e-06,
"loss": 1.0159,
"step": 30
},
{
"epoch": 0.07948717948717948,
"grad_norm": 0.12402593344449997,
"learning_rate": 7.948717948717949e-06,
"loss": 1.0859,
"step": 31
},
{
"epoch": 0.08205128205128205,
"grad_norm": 0.1193617656826973,
"learning_rate": 8.205128205128205e-06,
"loss": 1.1554,
"step": 32
},
{
"epoch": 0.08461538461538462,
"grad_norm": 0.12157738953828812,
"learning_rate": 8.461538461538462e-06,
"loss": 1.0643,
"step": 33
},
{
"epoch": 0.08717948717948718,
"grad_norm": 0.11561132967472076,
"learning_rate": 8.717948717948719e-06,
"loss": 0.9329,
"step": 34
},
{
"epoch": 0.08974358974358974,
"grad_norm": 0.1199595183134079,
"learning_rate": 8.974358974358976e-06,
"loss": 0.9432,
"step": 35
},
{
"epoch": 0.09230769230769231,
"grad_norm": 0.27351143956184387,
"learning_rate": 9.23076923076923e-06,
"loss": 0.9671,
"step": 36
},
{
"epoch": 0.09487179487179487,
"grad_norm": 0.11380849033594131,
"learning_rate": 9.487179487179489e-06,
"loss": 1.0776,
"step": 37
},
{
"epoch": 0.09743589743589744,
"grad_norm": 0.12026315927505493,
"learning_rate": 9.743589743589744e-06,
"loss": 0.9837,
"step": 38
},
{
"epoch": 0.1,
"grad_norm": 0.11509953439235687,
"learning_rate": 1e-05,
"loss": 1.0578,
"step": 39
},
{
"epoch": 0.10256410256410256,
"grad_norm": 0.12789179384708405,
"learning_rate": 9.99878394811512e-06,
"loss": 1.0436,
"step": 40
},
{
"epoch": 0.10512820512820513,
"grad_norm": 0.10106956213712692,
"learning_rate": 9.997564935064936e-06,
"loss": 0.934,
"step": 41
},
{
"epoch": 0.1076923076923077,
"grad_norm": 0.11464275419712067,
"learning_rate": 9.996342950020318e-06,
"loss": 1.0297,
"step": 42
},
{
"epoch": 0.11025641025641025,
"grad_norm": 0.11068426072597504,
"learning_rate": 9.995117982099268e-06,
"loss": 1.1004,
"step": 43
},
{
"epoch": 0.11282051282051282,
"grad_norm": 0.10913486778736115,
"learning_rate": 9.993890020366601e-06,
"loss": 0.92,
"step": 44
},
{
"epoch": 0.11538461538461539,
"grad_norm": 0.11235719919204712,
"learning_rate": 9.992659053833607e-06,
"loss": 1.0534,
"step": 45
},
{
"epoch": 0.11794871794871795,
"grad_norm": 0.10900150239467621,
"learning_rate": 9.991425071457738e-06,
"loss": 1.0011,
"step": 46
},
{
"epoch": 0.12051282051282051,
"grad_norm": 0.11291161179542542,
"learning_rate": 9.990188062142274e-06,
"loss": 0.9889,
"step": 47
},
{
"epoch": 0.12307692307692308,
"grad_norm": 0.12270451337099075,
"learning_rate": 9.988948014735981e-06,
"loss": 1.1178,
"step": 48
},
{
"epoch": 0.12564102564102564,
"grad_norm": 0.109133280813694,
"learning_rate": 9.987704918032787e-06,
"loss": 1.0422,
"step": 49
},
{
"epoch": 0.1282051282051282,
"grad_norm": 0.11073730140924454,
"learning_rate": 9.98645876077144e-06,
"loss": 1.1226,
"step": 50
},
{
"epoch": 0.13076923076923078,
"grad_norm": 0.10467839986085892,
"learning_rate": 9.98520953163517e-06,
"loss": 1.089,
"step": 51
},
{
"epoch": 0.13333333333333333,
"grad_norm": 0.10366383194923401,
"learning_rate": 9.983957219251336e-06,
"loss": 1.1206,
"step": 52
},
{
"epoch": 0.1358974358974359,
"grad_norm": 0.10720381140708923,
"learning_rate": 9.982701812191105e-06,
"loss": 1.091,
"step": 53
},
{
"epoch": 0.13846153846153847,
"grad_norm": 0.2592061161994934,
"learning_rate": 9.981443298969074e-06,
"loss": 0.964,
"step": 54
},
{
"epoch": 0.14102564102564102,
"grad_norm": 0.10395167022943497,
"learning_rate": 9.98018166804294e-06,
"loss": 1.1305,
"step": 55
},
{
"epoch": 0.14358974358974358,
"grad_norm": 0.10875218361616135,
"learning_rate": 9.978916907813147e-06,
"loss": 1.1347,
"step": 56
},
{
"epoch": 0.14615384615384616,
"grad_norm": 0.10331016033887863,
"learning_rate": 9.977649006622518e-06,
"loss": 1.1921,
"step": 57
},
{
"epoch": 0.14871794871794872,
"grad_norm": 0.10478100180625916,
"learning_rate": 9.976377952755907e-06,
"loss": 1.0031,
"step": 58
},
{
"epoch": 0.15128205128205127,
"grad_norm": 0.09711793065071106,
"learning_rate": 9.975103734439834e-06,
"loss": 0.9949,
"step": 59
},
{
"epoch": 0.15384615384615385,
"grad_norm": 0.10558706521987915,
"learning_rate": 9.973826339842128e-06,
"loss": 1.0029,
"step": 60
},
{
"epoch": 0.1564102564102564,
"grad_norm": 0.09912573546171188,
"learning_rate": 9.972545757071548e-06,
"loss": 0.933,
"step": 61
},
{
"epoch": 0.15897435897435896,
"grad_norm": 0.11607331037521362,
"learning_rate": 9.971261974177426e-06,
"loss": 0.9942,
"step": 62
},
{
"epoch": 0.16153846153846155,
"grad_norm": 0.10281538218259811,
"learning_rate": 9.969974979149292e-06,
"loss": 1.0307,
"step": 63
},
{
"epoch": 0.1641025641025641,
"grad_norm": 0.10646649450063705,
"learning_rate": 9.968684759916494e-06,
"loss": 1.0052,
"step": 64
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.1755123883485794,
"learning_rate": 9.967391304347826e-06,
"loss": 0.9506,
"step": 65
},
{
"epoch": 0.16923076923076924,
"grad_norm": 0.11006496846675873,
"learning_rate": 9.966094600251151e-06,
"loss": 0.9679,
"step": 66
},
{
"epoch": 0.1717948717948718,
"grad_norm": 0.11007404327392578,
"learning_rate": 9.96479463537301e-06,
"loss": 1.0251,
"step": 67
},
{
"epoch": 0.17435897435897435,
"grad_norm": 0.1030791848897934,
"learning_rate": 9.963491397398239e-06,
"loss": 1.0104,
"step": 68
},
{
"epoch": 0.17692307692307693,
"grad_norm": 0.10340573638677597,
"learning_rate": 9.962184873949581e-06,
"loss": 1.1273,
"step": 69
},
{
"epoch": 0.1794871794871795,
"grad_norm": 0.10667295008897781,
"learning_rate": 9.960875052587295e-06,
"loss": 1.1031,
"step": 70
},
{
"epoch": 0.18205128205128204,
"grad_norm": 0.10353393852710724,
"learning_rate": 9.959561920808762e-06,
"loss": 0.9595,
"step": 71
},
{
"epoch": 0.18461538461538463,
"grad_norm": 0.10161738842725754,
"learning_rate": 9.95824546604808e-06,
"loss": 0.9629,
"step": 72
},
{
"epoch": 0.18717948717948718,
"grad_norm": 0.11324603855609894,
"learning_rate": 9.956925675675678e-06,
"loss": 1.2039,
"step": 73
},
{
"epoch": 0.18974358974358974,
"grad_norm": 0.11005936563014984,
"learning_rate": 9.955602536997886e-06,
"loss": 1.0425,
"step": 74
},
{
"epoch": 0.19230769230769232,
"grad_norm": 0.10767950117588043,
"learning_rate": 9.954276037256563e-06,
"loss": 1.0281,
"step": 75
},
{
"epoch": 0.19487179487179487,
"grad_norm": 0.10422754287719727,
"learning_rate": 9.952946163628658e-06,
"loss": 1.0155,
"step": 76
},
{
"epoch": 0.19743589743589743,
"grad_norm": 0.10673552006483078,
"learning_rate": 9.951612903225807e-06,
"loss": 1.184,
"step": 77
},
{
"epoch": 0.2,
"grad_norm": 0.11334969103336334,
"learning_rate": 9.950276243093924e-06,
"loss": 0.9366,
"step": 78
},
{
"epoch": 0.20256410256410257,
"grad_norm": 0.10220715403556824,
"learning_rate": 9.948936170212767e-06,
"loss": 1.0855,
"step": 79
},
{
"epoch": 0.20512820512820512,
"grad_norm": 0.10753922909498215,
"learning_rate": 9.947592671495527e-06,
"loss": 0.946,
"step": 80
},
{
"epoch": 0.2076923076923077,
"grad_norm": 0.1163082867860794,
"learning_rate": 9.946245733788397e-06,
"loss": 1.0971,
"step": 81
},
{
"epoch": 0.21025641025641026,
"grad_norm": 0.11111017316579819,
"learning_rate": 9.944895343870142e-06,
"loss": 1.0869,
"step": 82
},
{
"epoch": 0.2128205128205128,
"grad_norm": 0.10907071083784103,
"learning_rate": 9.943541488451669e-06,
"loss": 0.9786,
"step": 83
},
{
"epoch": 0.2153846153846154,
"grad_norm": 0.10257716476917267,
"learning_rate": 9.94218415417559e-06,
"loss": 0.9889,
"step": 84
},
{
"epoch": 0.21794871794871795,
"grad_norm": 0.10978135466575623,
"learning_rate": 9.94082332761578e-06,
"loss": 1.1028,
"step": 85
},
{
"epoch": 0.2205128205128205,
"grad_norm": 0.110615074634552,
"learning_rate": 9.939458995276944e-06,
"loss": 1.019,
"step": 86
},
{
"epoch": 0.2230769230769231,
"grad_norm": 0.11058582365512848,
"learning_rate": 9.938091143594154e-06,
"loss": 0.9996,
"step": 87
},
{
"epoch": 0.22564102564102564,
"grad_norm": 0.11037719249725342,
"learning_rate": 9.936719758932415e-06,
"loss": 1.0338,
"step": 88
},
{
"epoch": 0.2282051282051282,
"grad_norm": 0.10798349976539612,
"learning_rate": 9.935344827586207e-06,
"loss": 0.9677,
"step": 89
},
{
"epoch": 0.23076923076923078,
"grad_norm": 0.10694784671068192,
"learning_rate": 9.933966335779024e-06,
"loss": 1.0419,
"step": 90
},
{
"epoch": 0.23333333333333334,
"grad_norm": 0.13677257299423218,
"learning_rate": 9.932584269662922e-06,
"loss": 1.0015,
"step": 91
},
{
"epoch": 0.2358974358974359,
"grad_norm": 0.11084003746509552,
"learning_rate": 9.931198615318045e-06,
"loss": 1.0031,
"step": 92
},
{
"epoch": 0.23846153846153847,
"grad_norm": 0.10615186393260956,
"learning_rate": 9.929809358752167e-06,
"loss": 0.9214,
"step": 93
},
{
"epoch": 0.24102564102564103,
"grad_norm": 0.10620255023241043,
"learning_rate": 9.928416485900218e-06,
"loss": 0.9185,
"step": 94
},
{
"epoch": 0.24358974358974358,
"grad_norm": 0.11533376574516296,
"learning_rate": 9.927019982623805e-06,
"loss": 1.0308,
"step": 95
},
{
"epoch": 0.24615384615384617,
"grad_norm": 0.1096138209104538,
"learning_rate": 9.925619834710745e-06,
"loss": 1.0478,
"step": 96
},
{
"epoch": 0.24871794871794872,
"grad_norm": 0.11876872926950455,
"learning_rate": 9.924216027874566e-06,
"loss": 1.0602,
"step": 97
},
{
"epoch": 0.2512820512820513,
"grad_norm": 0.10606134682893753,
"learning_rate": 9.922808547754035e-06,
"loss": 0.901,
"step": 98
},
{
"epoch": 0.25384615384615383,
"grad_norm": 0.11515390872955322,
"learning_rate": 9.921397379912666e-06,
"loss": 1.0333,
"step": 99
},
{
"epoch": 0.2564102564102564,
"grad_norm": 0.11910593509674072,
"learning_rate": 9.919982509838217e-06,
"loss": 0.9956,
"step": 100
},
{
"epoch": 0.258974358974359,
"grad_norm": 0.12178193032741547,
"learning_rate": 9.918563922942208e-06,
"loss": 1.071,
"step": 101
},
{
"epoch": 0.26153846153846155,
"grad_norm": 0.1089189425110817,
"learning_rate": 9.917141604559404e-06,
"loss": 0.9349,
"step": 102
},
{
"epoch": 0.2641025641025641,
"grad_norm": 0.1138150617480278,
"learning_rate": 9.915715539947322e-06,
"loss": 1.0169,
"step": 103
},
{
"epoch": 0.26666666666666666,
"grad_norm": 0.1112711951136589,
"learning_rate": 9.914285714285713e-06,
"loss": 0.9022,
"step": 104
},
{
"epoch": 0.2692307692307692,
"grad_norm": 0.11421187967061996,
"learning_rate": 9.912852112676058e-06,
"loss": 0.9711,
"step": 105
},
{
"epoch": 0.2717948717948718,
"grad_norm": 0.10921610891819,
"learning_rate": 9.911414720141032e-06,
"loss": 0.9388,
"step": 106
},
{
"epoch": 0.2743589743589744,
"grad_norm": 0.11643636971712112,
"learning_rate": 9.909973521624008e-06,
"loss": 1.003,
"step": 107
},
{
"epoch": 0.27692307692307694,
"grad_norm": 0.13560256361961365,
"learning_rate": 9.908528501988513e-06,
"loss": 0.9955,
"step": 108
},
{
"epoch": 0.2794871794871795,
"grad_norm": 0.11191970109939575,
"learning_rate": 9.9070796460177e-06,
"loss": 1.0105,
"step": 109
},
{
"epoch": 0.28205128205128205,
"grad_norm": 0.1319538950920105,
"learning_rate": 9.905626938413824e-06,
"loss": 1.0841,
"step": 110
},
{
"epoch": 0.2846153846153846,
"grad_norm": 0.11922305077314377,
"learning_rate": 9.904170363797693e-06,
"loss": 0.9614,
"step": 111
},
{
"epoch": 0.28717948717948716,
"grad_norm": 0.11520028859376907,
"learning_rate": 9.902709906708132e-06,
"loss": 0.9648,
"step": 112
},
{
"epoch": 0.28974358974358977,
"grad_norm": 0.10564184933900833,
"learning_rate": 9.901245551601424e-06,
"loss": 0.9224,
"step": 113
},
{
"epoch": 0.2923076923076923,
"grad_norm": 0.11402294784784317,
"learning_rate": 9.89977728285078e-06,
"loss": 1.0839,
"step": 114
},
{
"epoch": 0.2948717948717949,
"grad_norm": 0.1240580752491951,
"learning_rate": 9.898305084745763e-06,
"loss": 0.9802,
"step": 115
},
{
"epoch": 0.29743589743589743,
"grad_norm": 0.12268956750631332,
"learning_rate": 9.896828941491739e-06,
"loss": 1.0552,
"step": 116
},
{
"epoch": 0.3,
"grad_norm": 0.11103710532188416,
"learning_rate": 9.895348837209303e-06,
"loss": 0.9337,
"step": 117
},
{
"epoch": 0.30256410256410254,
"grad_norm": 0.1281978338956833,
"learning_rate": 9.893864755933724e-06,
"loss": 1.0919,
"step": 118
},
{
"epoch": 0.30512820512820515,
"grad_norm": 0.11921875178813934,
"learning_rate": 9.892376681614351e-06,
"loss": 1.0019,
"step": 119
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.11357328295707703,
"learning_rate": 9.890884598114054e-06,
"loss": 0.9391,
"step": 120
},
{
"epoch": 0.31025641025641026,
"grad_norm": 0.12437216937541962,
"learning_rate": 9.889388489208635e-06,
"loss": 1.0949,
"step": 121
},
{
"epoch": 0.3128205128205128,
"grad_norm": 0.11032367497682571,
"learning_rate": 9.887888338586223e-06,
"loss": 1.0712,
"step": 122
},
{
"epoch": 0.3153846153846154,
"grad_norm": 0.10982154309749603,
"learning_rate": 9.886384129846709e-06,
"loss": 1.0995,
"step": 123
},
{
"epoch": 0.31794871794871793,
"grad_norm": 0.11465884000062943,
"learning_rate": 9.88487584650113e-06,
"loss": 1.1997,
"step": 124
},
{
"epoch": 0.32051282051282054,
"grad_norm": 0.11689360439777374,
"learning_rate": 9.883363471971068e-06,
"loss": 0.9555,
"step": 125
},
{
"epoch": 0.3230769230769231,
"grad_norm": 0.12228330969810486,
"learning_rate": 9.88184698958805e-06,
"loss": 1.0185,
"step": 126
},
{
"epoch": 0.32564102564102565,
"grad_norm": 0.11858666688203812,
"learning_rate": 9.88032638259293e-06,
"loss": 0.9995,
"step": 127
},
{
"epoch": 0.3282051282051282,
"grad_norm": 0.107363760471344,
"learning_rate": 9.87880163413527e-06,
"loss": 0.9283,
"step": 128
},
{
"epoch": 0.33076923076923076,
"grad_norm": 0.1286807507276535,
"learning_rate": 9.877272727272727e-06,
"loss": 0.9635,
"step": 129
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.13207103312015533,
"learning_rate": 9.875739644970415e-06,
"loss": 1.0259,
"step": 130
},
{
"epoch": 0.33589743589743587,
"grad_norm": 0.12238481640815735,
"learning_rate": 9.874202370100275e-06,
"loss": 0.9458,
"step": 131
},
{
"epoch": 0.3384615384615385,
"grad_norm": 0.12218200415372849,
"learning_rate": 9.87266088544044e-06,
"loss": 0.9856,
"step": 132
},
{
"epoch": 0.34102564102564104,
"grad_norm": 0.192653626203537,
"learning_rate": 9.871115173674589e-06,
"loss": 1.1864,
"step": 133
},
{
"epoch": 0.3435897435897436,
"grad_norm": 0.12083520740270615,
"learning_rate": 9.869565217391306e-06,
"loss": 1.0003,
"step": 134
},
{
"epoch": 0.34615384615384615,
"grad_norm": 0.11669037491083145,
"learning_rate": 9.86801099908341e-06,
"loss": 1.0615,
"step": 135
},
{
"epoch": 0.3487179487179487,
"grad_norm": 0.11430592834949493,
"learning_rate": 9.866452501147316e-06,
"loss": 0.927,
"step": 136
},
{
"epoch": 0.35128205128205126,
"grad_norm": 0.10570185631513596,
"learning_rate": 9.864889705882355e-06,
"loss": 0.9244,
"step": 137
},
{
"epoch": 0.35384615384615387,
"grad_norm": 0.12382305413484573,
"learning_rate": 9.863322595490108e-06,
"loss": 1.0086,
"step": 138
},
{
"epoch": 0.3564102564102564,
"grad_norm": 0.11679980158805847,
"learning_rate": 9.861751152073734e-06,
"loss": 0.9317,
"step": 139
},
{
"epoch": 0.358974358974359,
"grad_norm": 0.12016775459051132,
"learning_rate": 9.860175357637288e-06,
"loss": 0.9946,
"step": 140
},
{
"epoch": 0.36153846153846153,
"grad_norm": 0.11385658383369446,
"learning_rate": 9.858595194085029e-06,
"loss": 0.9507,
"step": 141
},
{
"epoch": 0.3641025641025641,
"grad_norm": 0.11400415003299713,
"learning_rate": 9.857010643220733e-06,
"loss": 0.977,
"step": 142
},
{
"epoch": 0.36666666666666664,
"grad_norm": 0.12075012922286987,
"learning_rate": 9.855421686746988e-06,
"loss": 1.1277,
"step": 143
},
{
"epoch": 0.36923076923076925,
"grad_norm": 0.12377439439296722,
"learning_rate": 9.853828306264502e-06,
"loss": 1.0703,
"step": 144
},
{
"epoch": 0.3717948717948718,
"grad_norm": 0.11683501303195953,
"learning_rate": 9.852230483271376e-06,
"loss": 0.9773,
"step": 145
},
{
"epoch": 0.37435897435897436,
"grad_norm": 0.11950255930423737,
"learning_rate": 9.850628199162401e-06,
"loss": 0.9428,
"step": 146
},
{
"epoch": 0.3769230769230769,
"grad_norm": 0.12005724757909775,
"learning_rate": 9.849021435228333e-06,
"loss": 1.1703,
"step": 147
},
{
"epoch": 0.37948717948717947,
"grad_norm": 0.11797571182250977,
"learning_rate": 9.847410172655158e-06,
"loss": 0.8719,
"step": 148
},
{
"epoch": 0.382051282051282,
"grad_norm": 0.1225227490067482,
"learning_rate": 9.845794392523365e-06,
"loss": 1.1311,
"step": 149
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.12024562805891037,
"learning_rate": 9.844174075807208e-06,
"loss": 1.1669,
"step": 150
},
{
"epoch": 0.3871794871794872,
"grad_norm": 0.12286081910133362,
"learning_rate": 9.842549203373947e-06,
"loss": 1.1541,
"step": 151
},
{
"epoch": 0.38974358974358975,
"grad_norm": 0.1257518231868744,
"learning_rate": 9.840919755983107e-06,
"loss": 0.9313,
"step": 152
},
{
"epoch": 0.3923076923076923,
"grad_norm": 0.12524078786373138,
"learning_rate": 9.839285714285715e-06,
"loss": 0.904,
"step": 153
},
{
"epoch": 0.39487179487179486,
"grad_norm": 0.1183227151632309,
"learning_rate": 9.83764705882353e-06,
"loss": 1.0132,
"step": 154
},
{
"epoch": 0.3974358974358974,
"grad_norm": 0.12392973154783249,
"learning_rate": 9.836003770028276e-06,
"loss": 1.069,
"step": 155
},
{
"epoch": 0.4,
"grad_norm": 0.13140057027339935,
"learning_rate": 9.83435582822086e-06,
"loss": 1.1211,
"step": 156
},
{
"epoch": 0.4025641025641026,
"grad_norm": 0.1755838245153427,
"learning_rate": 9.832703213610588e-06,
"loss": 1.053,
"step": 157
},
{
"epoch": 0.40512820512820513,
"grad_norm": 0.12414582073688507,
"learning_rate": 9.831045906294368e-06,
"loss": 1.0483,
"step": 158
},
{
"epoch": 0.4076923076923077,
"grad_norm": 0.13171876966953278,
"learning_rate": 9.829383886255924e-06,
"loss": 1.0296,
"step": 159
},
{
"epoch": 0.41025641025641024,
"grad_norm": 0.12738922238349915,
"learning_rate": 9.827717133364974e-06,
"loss": 1.0102,
"step": 160
},
{
"epoch": 0.4128205128205128,
"grad_norm": 0.1904231309890747,
"learning_rate": 9.826045627376427e-06,
"loss": 1.0314,
"step": 161
},
{
"epoch": 0.4153846153846154,
"grad_norm": 0.12011483311653137,
"learning_rate": 9.824369347929558e-06,
"loss": 0.9475,
"step": 162
},
{
"epoch": 0.41794871794871796,
"grad_norm": 0.1304839700460434,
"learning_rate": 9.822688274547189e-06,
"loss": 1.0456,
"step": 163
},
{
"epoch": 0.4205128205128205,
"grad_norm": 0.131229430437088,
"learning_rate": 9.821002386634847e-06,
"loss": 1.1589,
"step": 164
},
{
"epoch": 0.4230769230769231,
"grad_norm": 0.12201635539531708,
"learning_rate": 9.819311663479923e-06,
"loss": 0.966,
"step": 165
},
{
"epoch": 0.4256410256410256,
"grad_norm": 0.12963519990444183,
"learning_rate": 9.81761608425084e-06,
"loss": 1.0354,
"step": 166
},
{
"epoch": 0.4282051282051282,
"grad_norm": 0.12793965637683868,
"learning_rate": 9.815915627996166e-06,
"loss": 1.0306,
"step": 167
},
{
"epoch": 0.4307692307692308,
"grad_norm": 0.17451830208301544,
"learning_rate": 9.814210273643783e-06,
"loss": 1.0694,
"step": 168
},
{
"epoch": 0.43333333333333335,
"grad_norm": 0.14709219336509705,
"learning_rate": 9.812500000000001e-06,
"loss": 1.1155,
"step": 169
},
{
"epoch": 0.4358974358974359,
"grad_norm": 0.205572709441185,
"learning_rate": 9.810784785748676e-06,
"loss": 1.0661,
"step": 170
},
{
"epoch": 0.43846153846153846,
"grad_norm": 0.13123241066932678,
"learning_rate": 9.809064609450338e-06,
"loss": 0.9847,
"step": 171
},
{
"epoch": 0.441025641025641,
"grad_norm": 0.14202538132667542,
"learning_rate": 9.807339449541285e-06,
"loss": 1.0204,
"step": 172
},
{
"epoch": 0.44358974358974357,
"grad_norm": 0.15248540043830872,
"learning_rate": 9.80560928433269e-06,
"loss": 0.9468,
"step": 173
},
{
"epoch": 0.4461538461538462,
"grad_norm": 0.296898752450943,
"learning_rate": 9.803874092009686e-06,
"loss": 1.0747,
"step": 174
},
{
"epoch": 0.44871794871794873,
"grad_norm": 0.12776528298854828,
"learning_rate": 9.802133850630456e-06,
"loss": 1.0828,
"step": 175
},
{
"epoch": 0.4512820512820513,
"grad_norm": 0.1273936629295349,
"learning_rate": 9.800388538125306e-06,
"loss": 0.93,
"step": 176
},
{
"epoch": 0.45384615384615384,
"grad_norm": 0.1328604370355606,
"learning_rate": 9.79863813229572e-06,
"loss": 0.905,
"step": 177
},
{
"epoch": 0.4564102564102564,
"grad_norm": 0.14861007034778595,
"learning_rate": 9.796882610813444e-06,
"loss": 0.9129,
"step": 178
},
{
"epoch": 0.45897435897435895,
"grad_norm": 0.12911070883274078,
"learning_rate": 9.795121951219514e-06,
"loss": 0.9161,
"step": 179
},
{
"epoch": 0.46153846153846156,
"grad_norm": 0.12798583507537842,
"learning_rate": 9.793356130923302e-06,
"loss": 1.015,
"step": 180
},
{
"epoch": 0.4641025641025641,
"grad_norm": 0.14299742877483368,
"learning_rate": 9.791585127201565e-06,
"loss": 0.9986,
"step": 181
},
{
"epoch": 0.4666666666666667,
"grad_norm": 0.16951002180576324,
"learning_rate": 9.789808917197453e-06,
"loss": 1.0231,
"step": 182
},
{
"epoch": 0.46923076923076923,
"grad_norm": 0.1452597677707672,
"learning_rate": 9.78802747791953e-06,
"loss": 0.9805,
"step": 183
},
{
"epoch": 0.4717948717948718,
"grad_norm": 0.12342038750648499,
"learning_rate": 9.786240786240787e-06,
"loss": 0.9625,
"step": 184
},
{
"epoch": 0.47435897435897434,
"grad_norm": 0.13525085151195526,
"learning_rate": 9.784448818897639e-06,
"loss": 1.0564,
"step": 185
},
{
"epoch": 0.47692307692307695,
"grad_norm": 0.14499512314796448,
"learning_rate": 9.782651552488912e-06,
"loss": 1.0676,
"step": 186
},
{
"epoch": 0.4794871794871795,
"grad_norm": 0.13853202760219574,
"learning_rate": 9.780848963474828e-06,
"loss": 0.9944,
"step": 187
},
{
"epoch": 0.48205128205128206,
"grad_norm": 0.1524648517370224,
"learning_rate": 9.779041028175976e-06,
"loss": 1.0957,
"step": 188
},
{
"epoch": 0.4846153846153846,
"grad_norm": 0.13356293737888336,
"learning_rate": 9.77722772277228e-06,
"loss": 1.0397,
"step": 189
},
{
"epoch": 0.48717948717948717,
"grad_norm": 0.1403387039899826,
"learning_rate": 9.775409023301933e-06,
"loss": 1.1755,
"step": 190
},
{
"epoch": 0.4897435897435897,
"grad_norm": 0.13895130157470703,
"learning_rate": 9.773584905660379e-06,
"loss": 1.0341,
"step": 191
},
{
"epoch": 0.49230769230769234,
"grad_norm": 0.1415233463048935,
"learning_rate": 9.771755345599206e-06,
"loss": 1.092,
"step": 192
},
{
"epoch": 0.4948717948717949,
"grad_norm": 0.1509629786014557,
"learning_rate": 9.7699203187251e-06,
"loss": 0.9921,
"step": 193
},
{
"epoch": 0.49743589743589745,
"grad_norm": 0.13306330144405365,
"learning_rate": 9.768079800498753e-06,
"loss": 1.0274,
"step": 194
},
{
"epoch": 0.5,
"grad_norm": 0.1483563780784607,
"learning_rate": 9.766233766233768e-06,
"loss": 0.977,
"step": 195
},
{
"epoch": 0.5025641025641026,
"grad_norm": 0.13634060323238373,
"learning_rate": 9.764382191095549e-06,
"loss": 0.8818,
"step": 196
},
{
"epoch": 0.5051282051282051,
"grad_norm": 0.13927966356277466,
"learning_rate": 9.762525050100202e-06,
"loss": 0.9694,
"step": 197
},
{
"epoch": 0.5076923076923077,
"grad_norm": 0.13205285370349884,
"learning_rate": 9.760662318113397e-06,
"loss": 1.0713,
"step": 198
},
{
"epoch": 0.5102564102564102,
"grad_norm": 0.1272955685853958,
"learning_rate": 9.758793969849248e-06,
"loss": 0.9304,
"step": 199
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.14206095039844513,
"learning_rate": 9.75691997986915e-06,
"loss": 1.0151,
"step": 200
},
{
"epoch": 0.5153846153846153,
"grad_norm": 0.13040238618850708,
"learning_rate": 9.755040322580646e-06,
"loss": 1.056,
"step": 201
},
{
"epoch": 0.517948717948718,
"grad_norm": 0.13569800555706024,
"learning_rate": 9.753154972236246e-06,
"loss": 1.0058,
"step": 202
},
{
"epoch": 0.5205128205128206,
"grad_norm": 0.12954074144363403,
"learning_rate": 9.751263902932256e-06,
"loss": 1.0232,
"step": 203
},
{
"epoch": 0.5230769230769231,
"grad_norm": 0.1352427899837494,
"learning_rate": 9.749367088607595e-06,
"loss": 1.1409,
"step": 204
},
{
"epoch": 0.5256410256410257,
"grad_norm": 0.13935823738574982,
"learning_rate": 9.747464503042597e-06,
"loss": 0.9364,
"step": 205
},
{
"epoch": 0.5282051282051282,
"grad_norm": 0.14334161579608917,
"learning_rate": 9.745556119857798e-06,
"loss": 1.0412,
"step": 206
},
{
"epoch": 0.5307692307692308,
"grad_norm": 0.13454332947731018,
"learning_rate": 9.743641912512716e-06,
"loss": 1.0093,
"step": 207
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.1333240121603012,
"learning_rate": 9.741721854304638e-06,
"loss": 1.0294,
"step": 208
},
{
"epoch": 0.5358974358974359,
"grad_norm": 0.1349434107542038,
"learning_rate": 9.739795918367347e-06,
"loss": 1.0395,
"step": 209
},
{
"epoch": 0.5384615384615384,
"grad_norm": 0.12995462119579315,
"learning_rate": 9.737864077669904e-06,
"loss": 1.0675,
"step": 210
},
{
"epoch": 0.541025641025641,
"grad_norm": 0.13981810212135315,
"learning_rate": 9.735926305015354e-06,
"loss": 1.0434,
"step": 211
},
{
"epoch": 0.5435897435897435,
"grad_norm": 0.13797558844089508,
"learning_rate": 9.733982573039467e-06,
"loss": 1.1582,
"step": 212
},
{
"epoch": 0.5461538461538461,
"grad_norm": 0.136617973446846,
"learning_rate": 9.732032854209446e-06,
"loss": 0.9319,
"step": 213
},
{
"epoch": 0.5487179487179488,
"grad_norm": 0.15038198232650757,
"learning_rate": 9.730077120822623e-06,
"loss": 1.0026,
"step": 214
},
{
"epoch": 0.5512820512820513,
"grad_norm": 0.1529029756784439,
"learning_rate": 9.728115345005151e-06,
"loss": 1.0784,
"step": 215
},
{
"epoch": 0.5538461538461539,
"grad_norm": 0.13984259963035583,
"learning_rate": 9.726147498710677e-06,
"loss": 1.1533,
"step": 216
},
{
"epoch": 0.5564102564102564,
"grad_norm": 0.14129801094532013,
"learning_rate": 9.724173553719009e-06,
"loss": 0.9703,
"step": 217
},
{
"epoch": 0.558974358974359,
"grad_norm": 0.14009319245815277,
"learning_rate": 9.722193481634766e-06,
"loss": 1.1317,
"step": 218
},
{
"epoch": 0.5615384615384615,
"grad_norm": 0.13649149239063263,
"learning_rate": 9.720207253886011e-06,
"loss": 1.0083,
"step": 219
},
{
"epoch": 0.5641025641025641,
"grad_norm": 0.13949915766716003,
"learning_rate": 9.718214841722885e-06,
"loss": 0.9941,
"step": 220
},
{
"epoch": 0.5666666666666667,
"grad_norm": 0.17557266354560852,
"learning_rate": 9.716216216216216e-06,
"loss": 1.1887,
"step": 221
},
{
"epoch": 0.5692307692307692,
"grad_norm": 0.132981538772583,
"learning_rate": 9.714211348256117e-06,
"loss": 0.9682,
"step": 222
},
{
"epoch": 0.5717948717948718,
"grad_norm": 0.15944674611091614,
"learning_rate": 9.712200208550574e-06,
"loss": 0.9442,
"step": 223
},
{
"epoch": 0.5743589743589743,
"grad_norm": 0.15149790048599243,
"learning_rate": 9.710182767624022e-06,
"loss": 1.1134,
"step": 224
},
{
"epoch": 0.5769230769230769,
"grad_norm": 0.13614985346794128,
"learning_rate": 9.7081589958159e-06,
"loss": 0.8473,
"step": 225
},
{
"epoch": 0.5794871794871795,
"grad_norm": 0.1307866871356964,
"learning_rate": 9.706128863279205e-06,
"loss": 0.9158,
"step": 226
},
{
"epoch": 0.5820512820512821,
"grad_norm": 0.14745928347110748,
"learning_rate": 9.704092339979015e-06,
"loss": 0.9988,
"step": 227
},
{
"epoch": 0.5846153846153846,
"grad_norm": 0.14534030854701996,
"learning_rate": 9.702049395691015e-06,
"loss": 1.0931,
"step": 228
},
{
"epoch": 0.5871794871794872,
"grad_norm": 0.146283358335495,
"learning_rate": 9.7e-06,
"loss": 1.2088,
"step": 229
},
{
"epoch": 0.5897435897435898,
"grad_norm": 0.16283774375915527,
"learning_rate": 9.697944122298367e-06,
"loss": 0.9512,
"step": 230
},
{
"epoch": 0.5923076923076923,
"grad_norm": 0.1303090751171112,
"learning_rate": 9.695881731784583e-06,
"loss": 0.9181,
"step": 231
},
{
"epoch": 0.5948717948717949,
"grad_norm": 0.14575974643230438,
"learning_rate": 9.693812797461662e-06,
"loss": 1.0348,
"step": 232
},
{
"epoch": 0.5974358974358974,
"grad_norm": 0.14711220562458038,
"learning_rate": 9.691737288135593e-06,
"loss": 1.0401,
"step": 233
},
{
"epoch": 0.6,
"grad_norm": 0.14356166124343872,
"learning_rate": 9.689655172413794e-06,
"loss": 0.9484,
"step": 234
},
{
"epoch": 0.6025641025641025,
"grad_norm": 0.14533978700637817,
"learning_rate": 9.687566418703508e-06,
"loss": 0.9985,
"step": 235
},
{
"epoch": 0.6051282051282051,
"grad_norm": 0.14926594495773315,
"learning_rate": 9.685470995210218e-06,
"loss": 1.1273,
"step": 236
},
{
"epoch": 0.6076923076923076,
"grad_norm": 0.15611067414283752,
"learning_rate": 9.683368869936036e-06,
"loss": 1.0847,
"step": 237
},
{
"epoch": 0.6102564102564103,
"grad_norm": 0.15448501706123352,
"learning_rate": 9.681260010678057e-06,
"loss": 1.0333,
"step": 238
},
{
"epoch": 0.6128205128205129,
"grad_norm": 0.16903169453144073,
"learning_rate": 9.679144385026738e-06,
"loss": 0.9972,
"step": 239
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.14398221671581268,
"learning_rate": 9.677021960364222e-06,
"loss": 0.9373,
"step": 240
},
{
"epoch": 0.617948717948718,
"grad_norm": 0.16799390316009521,
"learning_rate": 9.674892703862662e-06,
"loss": 0.9425,
"step": 241
},
{
"epoch": 0.6205128205128205,
"grad_norm": 0.16503410041332245,
"learning_rate": 9.672756582482538e-06,
"loss": 0.991,
"step": 242
},
{
"epoch": 0.6230769230769231,
"grad_norm": 0.13837389647960663,
"learning_rate": 9.670613562970937e-06,
"loss": 1.0057,
"step": 243
},
{
"epoch": 0.6256410256410256,
"grad_norm": 0.15482862293720245,
"learning_rate": 9.66846361185984e-06,
"loss": 1.2884,
"step": 244
},
{
"epoch": 0.6282051282051282,
"grad_norm": 0.17946982383728027,
"learning_rate": 9.666306695464364e-06,
"loss": 1.0498,
"step": 245
},
{
"epoch": 0.6307692307692307,
"grad_norm": 0.18409568071365356,
"learning_rate": 9.664142779881018e-06,
"loss": 1.0383,
"step": 246
},
{
"epoch": 0.6333333333333333,
"grad_norm": 0.142312690615654,
"learning_rate": 9.661971830985917e-06,
"loss": 1.1336,
"step": 247
},
{
"epoch": 0.6358974358974359,
"grad_norm": 0.15140476822853088,
"learning_rate": 9.659793814432991e-06,
"loss": 1.0985,
"step": 248
},
{
"epoch": 0.6384615384615384,
"grad_norm": 0.1846708357334137,
"learning_rate": 9.657608695652173e-06,
"loss": 0.9555,
"step": 249
},
{
"epoch": 0.6410256410256411,
"grad_norm": 0.16700689494609833,
"learning_rate": 9.655416439847578e-06,
"loss": 0.9279,
"step": 250
},
{
"epoch": 0.6435897435897436,
"grad_norm": 0.1423652619123459,
"learning_rate": 9.653217011995637e-06,
"loss": 1.073,
"step": 251
},
{
"epoch": 0.6461538461538462,
"grad_norm": 0.16811993718147278,
"learning_rate": 9.651010376843254e-06,
"loss": 1.0924,
"step": 252
},
{
"epoch": 0.6487179487179487,
"grad_norm": 0.15465322136878967,
"learning_rate": 9.64879649890591e-06,
"loss": 1.078,
"step": 253
},
{
"epoch": 0.6512820512820513,
"grad_norm": 0.1621096432209015,
"learning_rate": 9.646575342465754e-06,
"loss": 1.0095,
"step": 254
},
{
"epoch": 0.6538461538461539,
"grad_norm": 0.16749772429466248,
"learning_rate": 9.644346871569704e-06,
"loss": 0.948,
"step": 255
},
{
"epoch": 0.6564102564102564,
"grad_norm": 0.1397644579410553,
"learning_rate": 9.64211105002749e-06,
"loss": 1.0232,
"step": 256
},
{
"epoch": 0.658974358974359,
"grad_norm": 0.179872065782547,
"learning_rate": 9.639867841409692e-06,
"loss": 1.0781,
"step": 257
},
{
"epoch": 0.6615384615384615,
"grad_norm": 0.18746939301490784,
"learning_rate": 9.63761720904578e-06,
"loss": 0.9158,
"step": 258
},
{
"epoch": 0.6641025641025641,
"grad_norm": 0.17777228355407715,
"learning_rate": 9.635359116022101e-06,
"loss": 1.0611,
"step": 259
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.15664607286453247,
"learning_rate": 9.633093525179857e-06,
"loss": 1.0061,
"step": 260
},
{
"epoch": 0.6692307692307692,
"grad_norm": 0.2109006941318512,
"learning_rate": 9.630820399113084e-06,
"loss": 1.0238,
"step": 261
},
{
"epoch": 0.6717948717948717,
"grad_norm": 0.16149462759494781,
"learning_rate": 9.628539700166575e-06,
"loss": 1.1312,
"step": 262
},
{
"epoch": 0.6743589743589744,
"grad_norm": 0.13910357654094696,
"learning_rate": 9.626251390433817e-06,
"loss": 0.9808,
"step": 263
},
{
"epoch": 0.676923076923077,
"grad_norm": 0.15184703469276428,
"learning_rate": 9.623955431754875e-06,
"loss": 0.9458,
"step": 264
},
{
"epoch": 0.6794871794871795,
"grad_norm": 0.17896021902561188,
"learning_rate": 9.621651785714285e-06,
"loss": 1.0504,
"step": 265
},
{
"epoch": 0.6820512820512821,
"grad_norm": 0.15594998002052307,
"learning_rate": 9.619340413638905e-06,
"loss": 1.1288,
"step": 266
},
{
"epoch": 0.6846153846153846,
"grad_norm": 0.14313378930091858,
"learning_rate": 9.617021276595746e-06,
"loss": 0.9985,
"step": 267
},
{
"epoch": 0.6871794871794872,
"grad_norm": 0.14239932596683502,
"learning_rate": 9.614694335389792e-06,
"loss": 0.9776,
"step": 268
},
{
"epoch": 0.6897435897435897,
"grad_norm": 0.1516118049621582,
"learning_rate": 9.612359550561798e-06,
"loss": 1.0086,
"step": 269
},
{
"epoch": 0.6923076923076923,
"grad_norm": 0.1527036875486374,
"learning_rate": 9.610016882386046e-06,
"loss": 1.1663,
"step": 270
},
{
"epoch": 0.6948717948717948,
"grad_norm": 0.1713275909423828,
"learning_rate": 9.607666290868095e-06,
"loss": 0.977,
"step": 271
},
{
"epoch": 0.6974358974358974,
"grad_norm": 0.13157938420772552,
"learning_rate": 9.60530773574252e-06,
"loss": 0.9617,
"step": 272
},
{
"epoch": 0.7,
"grad_norm": 0.17953188717365265,
"learning_rate": 9.60294117647059e-06,
"loss": 1.0687,
"step": 273
},
{
"epoch": 0.7025641025641025,
"grad_norm": 0.17509308457374573,
"learning_rate": 9.60056657223796e-06,
"loss": 0.9238,
"step": 274
},
{
"epoch": 0.7051282051282052,
"grad_norm": 0.16777881979942322,
"learning_rate": 9.598183881952327e-06,
"loss": 1.0203,
"step": 275
},
{
"epoch": 0.7076923076923077,
"grad_norm": 0.1494888812303543,
"learning_rate": 9.595793064241049e-06,
"loss": 1.0675,
"step": 276
},
{
"epoch": 0.7102564102564103,
"grad_norm": 0.1649765521287918,
"learning_rate": 9.593394077448748e-06,
"loss": 0.8814,
"step": 277
},
{
"epoch": 0.7128205128205128,
"grad_norm": 0.17049697041511536,
"learning_rate": 9.590986879634912e-06,
"loss": 0.891,
"step": 278
},
{
"epoch": 0.7153846153846154,
"grad_norm": 0.14463086426258087,
"learning_rate": 9.58857142857143e-06,
"loss": 0.957,
"step": 279
},
{
"epoch": 0.717948717948718,
"grad_norm": 0.15929310023784637,
"learning_rate": 9.586147681740127e-06,
"loss": 1.017,
"step": 280
},
{
"epoch": 0.7205128205128205,
"grad_norm": 0.17474216222763062,
"learning_rate": 9.583715596330276e-06,
"loss": 0.9516,
"step": 281
},
{
"epoch": 0.7230769230769231,
"grad_norm": 0.1831640750169754,
"learning_rate": 9.581275129236071e-06,
"loss": 1.0512,
"step": 282
},
{
"epoch": 0.7256410256410256,
"grad_norm": 0.1618429571390152,
"learning_rate": 9.578826237054085e-06,
"loss": 0.9551,
"step": 283
},
{
"epoch": 0.7282051282051282,
"grad_norm": 0.24447672069072723,
"learning_rate": 9.576368876080691e-06,
"loss": 1.0373,
"step": 284
},
{
"epoch": 0.7307692307692307,
"grad_norm": 0.16472192108631134,
"learning_rate": 9.57390300230947e-06,
"loss": 0.9911,
"step": 285
},
{
"epoch": 0.7333333333333333,
"grad_norm": 0.1695912629365921,
"learning_rate": 9.571428571428573e-06,
"loss": 0.9008,
"step": 286
},
{
"epoch": 0.735897435897436,
"grad_norm": 0.1703156977891922,
"learning_rate": 9.568945538818077e-06,
"loss": 1.0863,
"step": 287
},
{
"epoch": 0.7384615384615385,
"grad_norm": 0.14874251186847687,
"learning_rate": 9.566453859547304e-06,
"loss": 0.9512,
"step": 288
},
{
"epoch": 0.7410256410256411,
"grad_norm": 0.1689365655183792,
"learning_rate": 9.563953488372094e-06,
"loss": 1.0063,
"step": 289
},
{
"epoch": 0.7435897435897436,
"grad_norm": 0.17003223299980164,
"learning_rate": 9.56144437973209e-06,
"loss": 0.9397,
"step": 290
},
{
"epoch": 0.7461538461538462,
"grad_norm": 0.20850569009780884,
"learning_rate": 9.55892648774796e-06,
"loss": 1.2076,
"step": 291
},
{
"epoch": 0.7487179487179487,
"grad_norm": 0.15689845383167267,
"learning_rate": 9.556399766218587e-06,
"loss": 1.1031,
"step": 292
},
{
"epoch": 0.7512820512820513,
"grad_norm": 0.160260871052742,
"learning_rate": 9.553864168618268e-06,
"loss": 1.0275,
"step": 293
},
{
"epoch": 0.7538461538461538,
"grad_norm": 0.1513524055480957,
"learning_rate": 9.551319648093842e-06,
"loss": 1.008,
"step": 294
},
{
"epoch": 0.7564102564102564,
"grad_norm": 0.17547191679477692,
"learning_rate": 9.54876615746181e-06,
"loss": 1.0009,
"step": 295
},
{
"epoch": 0.7589743589743589,
"grad_norm": 0.15460693836212158,
"learning_rate": 9.546203649205416e-06,
"loss": 1.1096,
"step": 296
},
{
"epoch": 0.7615384615384615,
"grad_norm": 0.17146429419517517,
"learning_rate": 9.543632075471698e-06,
"loss": 0.9587,
"step": 297
},
{
"epoch": 0.764102564102564,
"grad_norm": 0.15998685359954834,
"learning_rate": 9.54105138806852e-06,
"loss": 0.9264,
"step": 298
},
{
"epoch": 0.7666666666666667,
"grad_norm": 0.17196176946163177,
"learning_rate": 9.538461538461538e-06,
"loss": 1.0,
"step": 299
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.18652167916297913,
"learning_rate": 9.535862477771191e-06,
"loss": 1.1239,
"step": 300
},
{
"epoch": 0.7717948717948718,
"grad_norm": 0.1686553657054901,
"learning_rate": 9.533254156769596e-06,
"loss": 1.1414,
"step": 301
},
{
"epoch": 0.7743589743589744,
"grad_norm": 0.15988533198833466,
"learning_rate": 9.530636525877454e-06,
"loss": 0.9027,
"step": 302
},
{
"epoch": 0.7769230769230769,
"grad_norm": 0.15526551008224487,
"learning_rate": 9.528009535160905e-06,
"loss": 1.0569,
"step": 303
},
{
"epoch": 0.7794871794871795,
"grad_norm": 0.1854647994041443,
"learning_rate": 9.52537313432836e-06,
"loss": 1.1064,
"step": 304
},
{
"epoch": 0.782051282051282,
"grad_norm": 0.20110487937927246,
"learning_rate": 9.522727272727274e-06,
"loss": 1.0231,
"step": 305
},
{
"epoch": 0.7846153846153846,
"grad_norm": 0.15321309864521027,
"learning_rate": 9.520071899340924e-06,
"loss": 1.0294,
"step": 306
},
{
"epoch": 0.7871794871794872,
"grad_norm": 0.1512340009212494,
"learning_rate": 9.517406962785115e-06,
"loss": 0.9112,
"step": 307
},
{
"epoch": 0.7897435897435897,
"grad_norm": 0.19026243686676025,
"learning_rate": 9.514732411304872e-06,
"loss": 1.0988,
"step": 308
},
{
"epoch": 0.7923076923076923,
"grad_norm": 0.15860332548618317,
"learning_rate": 9.512048192771085e-06,
"loss": 0.8795,
"step": 309
},
{
"epoch": 0.7948717948717948,
"grad_norm": 0.2282475382089615,
"learning_rate": 9.509354254677129e-06,
"loss": 1.0206,
"step": 310
},
{
"epoch": 0.7974358974358975,
"grad_norm": 0.16409388184547424,
"learning_rate": 9.50665054413543e-06,
"loss": 1.01,
"step": 311
},
{
"epoch": 0.8,
"grad_norm": 0.15974940359592438,
"learning_rate": 9.503937007874017e-06,
"loss": 0.989,
"step": 312
},
{
"epoch": 0.8025641025641026,
"grad_norm": 0.16357499361038208,
"learning_rate": 9.50121359223301e-06,
"loss": 0.9589,
"step": 313
},
{
"epoch": 0.8051282051282052,
"grad_norm": 0.1798093467950821,
"learning_rate": 9.498480243161095e-06,
"loss": 0.9949,
"step": 314
},
{
"epoch": 0.8076923076923077,
"grad_norm": 0.18792827427387238,
"learning_rate": 9.495736906211937e-06,
"loss": 1.0233,
"step": 315
},
{
"epoch": 0.8102564102564103,
"grad_norm": 0.19793489575386047,
"learning_rate": 9.492983526540575e-06,
"loss": 1.0853,
"step": 316
},
{
"epoch": 0.8128205128205128,
"grad_norm": 0.148494690656662,
"learning_rate": 9.490220048899757e-06,
"loss": 1.0836,
"step": 317
},
{
"epoch": 0.8153846153846154,
"grad_norm": 0.20617227256298065,
"learning_rate": 9.487446417636253e-06,
"loss": 0.9434,
"step": 318
},
{
"epoch": 0.8179487179487179,
"grad_norm": 0.2122315913438797,
"learning_rate": 9.484662576687117e-06,
"loss": 0.9936,
"step": 319
},
{
"epoch": 0.8205128205128205,
"grad_norm": 0.19928601384162903,
"learning_rate": 9.481868469575908e-06,
"loss": 0.9267,
"step": 320
},
{
"epoch": 0.823076923076923,
"grad_norm": 0.1821938306093216,
"learning_rate": 9.479064039408867e-06,
"loss": 0.9761,
"step": 321
},
{
"epoch": 0.8256410256410256,
"grad_norm": 0.1640615314245224,
"learning_rate": 9.476249228871069e-06,
"loss": 0.9753,
"step": 322
},
{
"epoch": 0.8282051282051283,
"grad_norm": 0.2164408564567566,
"learning_rate": 9.4734239802225e-06,
"loss": 1.009,
"step": 323
},
{
"epoch": 0.8307692307692308,
"grad_norm": 0.17672689259052277,
"learning_rate": 9.470588235294119e-06,
"loss": 0.9376,
"step": 324
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.15482589602470398,
"learning_rate": 9.467741935483871e-06,
"loss": 1.0459,
"step": 325
},
{
"epoch": 0.8358974358974359,
"grad_norm": 0.1468273401260376,
"learning_rate": 9.464885021752642e-06,
"loss": 0.9625,
"step": 326
},
{
"epoch": 0.8384615384615385,
"grad_norm": 0.1662525236606598,
"learning_rate": 9.462017434620176e-06,
"loss": 0.9609,
"step": 327
},
{
"epoch": 0.841025641025641,
"grad_norm": 0.20608198642730713,
"learning_rate": 9.45913911416095e-06,
"loss": 0.9673,
"step": 328
},
{
"epoch": 0.8435897435897436,
"grad_norm": 0.18384887278079987,
"learning_rate": 9.45625e-06,
"loss": 1.02,
"step": 329
},
{
"epoch": 0.8461538461538461,
"grad_norm": 0.1636429876089096,
"learning_rate": 9.453350031308706e-06,
"loss": 1.0665,
"step": 330
},
{
"epoch": 0.8487179487179487,
"grad_norm": 0.16819709539413452,
"learning_rate": 9.450439146800503e-06,
"loss": 0.9494,
"step": 331
},
{
"epoch": 0.8512820512820513,
"grad_norm": 0.16869591176509857,
"learning_rate": 9.447517284726587e-06,
"loss": 0.9362,
"step": 332
},
{
"epoch": 0.8538461538461538,
"grad_norm": 0.17681722342967987,
"learning_rate": 9.444584382871537e-06,
"loss": 0.9596,
"step": 333
},
{
"epoch": 0.8564102564102564,
"grad_norm": 0.1720973253250122,
"learning_rate": 9.441640378548898e-06,
"loss": 0.9371,
"step": 334
},
{
"epoch": 0.8589743589743589,
"grad_norm": 0.1684177815914154,
"learning_rate": 9.438685208596712e-06,
"loss": 0.8739,
"step": 335
},
{
"epoch": 0.8615384615384616,
"grad_norm": 0.15152432024478912,
"learning_rate": 9.435718809373022e-06,
"loss": 1.0489,
"step": 336
},
{
"epoch": 0.8641025641025641,
"grad_norm": 0.16250121593475342,
"learning_rate": 9.43274111675127e-06,
"loss": 0.9873,
"step": 337
},
{
"epoch": 0.8666666666666667,
"grad_norm": 0.20848555862903595,
"learning_rate": 9.429752066115703e-06,
"loss": 1.1004,
"step": 338
},
{
"epoch": 0.8692307692307693,
"grad_norm": 0.1785045713186264,
"learning_rate": 9.426751592356688e-06,
"loss": 1.0217,
"step": 339
},
{
"epoch": 0.8717948717948718,
"grad_norm": 0.14325062930583954,
"learning_rate": 9.423739629865986e-06,
"loss": 1.0522,
"step": 340
},
{
"epoch": 0.8743589743589744,
"grad_norm": 0.17043401300907135,
"learning_rate": 9.420716112531971e-06,
"loss": 0.9412,
"step": 341
},
{
"epoch": 0.8769230769230769,
"grad_norm": 0.17380043864250183,
"learning_rate": 9.417680973734785e-06,
"loss": 1.0975,
"step": 342
},
{
"epoch": 0.8794871794871795,
"grad_norm": 0.15861521661281586,
"learning_rate": 9.414634146341465e-06,
"loss": 0.9772,
"step": 343
},
{
"epoch": 0.882051282051282,
"grad_norm": 0.17549242079257965,
"learning_rate": 9.411575562700965e-06,
"loss": 1.0992,
"step": 344
},
{
"epoch": 0.8846153846153846,
"grad_norm": 0.20035730302333832,
"learning_rate": 9.408505154639175e-06,
"loss": 0.9817,
"step": 345
},
{
"epoch": 0.8871794871794871,
"grad_norm": 0.16777153313159943,
"learning_rate": 9.405422853453843e-06,
"loss": 0.9846,
"step": 346
},
{
"epoch": 0.8897435897435897,
"grad_norm": 0.17003268003463745,
"learning_rate": 9.402328589909445e-06,
"loss": 1.022,
"step": 347
},
{
"epoch": 0.8923076923076924,
"grad_norm": 0.17670491337776184,
"learning_rate": 9.399222294232017e-06,
"loss": 1.1117,
"step": 348
},
{
"epoch": 0.8948717948717949,
"grad_norm": 0.18102656304836273,
"learning_rate": 9.396103896103898e-06,
"loss": 1.0409,
"step": 349
},
{
"epoch": 0.8974358974358975,
"grad_norm": 0.15800277888774872,
"learning_rate": 9.392973324658426e-06,
"loss": 0.8649,
"step": 350
},
{
"epoch": 0.9,
"grad_norm": 0.17744143307209015,
"learning_rate": 9.389830508474576e-06,
"loss": 0.9096,
"step": 351
},
{
"epoch": 0.9025641025641026,
"grad_norm": 0.16629469394683838,
"learning_rate": 9.386675375571523e-06,
"loss": 1.0208,
"step": 352
},
{
"epoch": 0.9051282051282051,
"grad_norm": 0.15229813754558563,
"learning_rate": 9.383507853403142e-06,
"loss": 0.9423,
"step": 353
},
{
"epoch": 0.9076923076923077,
"grad_norm": 0.40435972809791565,
"learning_rate": 9.38032786885246e-06,
"loss": 1.0585,
"step": 354
},
{
"epoch": 0.9102564102564102,
"grad_norm": 0.20711062848567963,
"learning_rate": 9.377135348226017e-06,
"loss": 1.0715,
"step": 355
},
{
"epoch": 0.9128205128205128,
"grad_norm": 0.20200040936470032,
"learning_rate": 9.37393021724819e-06,
"loss": 1.0494,
"step": 356
},
{
"epoch": 0.9153846153846154,
"grad_norm": 0.1699497103691101,
"learning_rate": 9.370712401055409e-06,
"loss": 1.0363,
"step": 357
},
{
"epoch": 0.9179487179487179,
"grad_norm": 0.17281508445739746,
"learning_rate": 9.36748182419035e-06,
"loss": 1.0238,
"step": 358
},
{
"epoch": 0.9205128205128205,
"grad_norm": 0.18644343316555023,
"learning_rate": 9.364238410596028e-06,
"loss": 1.0655,
"step": 359
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.16731494665145874,
"learning_rate": 9.360982083609822e-06,
"loss": 0.9855,
"step": 360
},
{
"epoch": 0.9256410256410257,
"grad_norm": 0.17364031076431274,
"learning_rate": 9.357712765957447e-06,
"loss": 0.9505,
"step": 361
},
{
"epoch": 0.9282051282051282,
"grad_norm": 0.1894925981760025,
"learning_rate": 9.354430379746837e-06,
"loss": 0.9665,
"step": 362
},
{
"epoch": 0.9307692307692308,
"grad_norm": 0.1816585510969162,
"learning_rate": 9.35113484646195e-06,
"loss": 0.8783,
"step": 363
},
{
"epoch": 0.9333333333333333,
"grad_norm": 0.177452951669693,
"learning_rate": 9.347826086956523e-06,
"loss": 1.0144,
"step": 364
},
{
"epoch": 0.9358974358974359,
"grad_norm": 0.1650353968143463,
"learning_rate": 9.344504021447722e-06,
"loss": 0.9883,
"step": 365
},
{
"epoch": 0.9384615384615385,
"grad_norm": 0.155875101685524,
"learning_rate": 9.341168569509738e-06,
"loss": 1.0195,
"step": 366
},
{
"epoch": 0.941025641025641,
"grad_norm": 0.19764171540737152,
"learning_rate": 9.337819650067296e-06,
"loss": 1.0666,
"step": 367
},
{
"epoch": 0.9435897435897436,
"grad_norm": 0.15158161520957947,
"learning_rate": 9.334457181389078e-06,
"loss": 1.1297,
"step": 368
},
{
"epoch": 0.9461538461538461,
"grad_norm": 0.16357675194740295,
"learning_rate": 9.331081081081083e-06,
"loss": 0.9474,
"step": 369
},
{
"epoch": 0.9487179487179487,
"grad_norm": 0.17813360691070557,
"learning_rate": 9.327691266079892e-06,
"loss": 1.0161,
"step": 370
},
{
"epoch": 0.9512820512820512,
"grad_norm": 0.15733803808689117,
"learning_rate": 9.324287652645861e-06,
"loss": 0.8616,
"step": 371
},
{
"epoch": 0.9538461538461539,
"grad_norm": 0.16512970626354218,
"learning_rate": 9.320870156356221e-06,
"loss": 0.906,
"step": 372
},
{
"epoch": 0.9564102564102565,
"grad_norm": 0.16653649508953094,
"learning_rate": 9.317438692098092e-06,
"loss": 0.9861,
"step": 373
},
{
"epoch": 0.958974358974359,
"grad_norm": 0.15374256670475006,
"learning_rate": 9.313993174061434e-06,
"loss": 0.9238,
"step": 374
},
{
"epoch": 0.9615384615384616,
"grad_norm": 0.1622532606124878,
"learning_rate": 9.310533515731875e-06,
"loss": 0.9759,
"step": 375
},
{
"epoch": 0.9641025641025641,
"grad_norm": 0.19126266241073608,
"learning_rate": 9.307059629883482e-06,
"loss": 0.9745,
"step": 376
},
{
"epoch": 0.9666666666666667,
"grad_norm": 0.1595565676689148,
"learning_rate": 9.303571428571428e-06,
"loss": 0.9289,
"step": 377
},
{
"epoch": 0.9692307692307692,
"grad_norm": 0.17021501064300537,
"learning_rate": 9.30006882312457e-06,
"loss": 1.0675,
"step": 378
},
{
"epoch": 0.9717948717948718,
"grad_norm": 0.17533089220523834,
"learning_rate": 9.296551724137932e-06,
"loss": 1.0388,
"step": 379
},
{
"epoch": 0.9743589743589743,
"grad_norm": 0.16750235855579376,
"learning_rate": 9.2930200414651e-06,
"loss": 1.0436,
"step": 380
},
{
"epoch": 0.9769230769230769,
"grad_norm": 0.15664179623126984,
"learning_rate": 9.289473684210525e-06,
"loss": 0.977,
"step": 381
},
{
"epoch": 0.9794871794871794,
"grad_norm": 0.16151364147663116,
"learning_rate": 9.285912560721721e-06,
"loss": 1.0423,
"step": 382
},
{
"epoch": 0.982051282051282,
"grad_norm": 0.16488024592399597,
"learning_rate": 9.282336578581363e-06,
"loss": 0.9958,
"step": 383
},
{
"epoch": 0.9846153846153847,
"grad_norm": 0.2102440744638443,
"learning_rate": 9.278745644599303e-06,
"loss": 0.9997,
"step": 384
},
{
"epoch": 0.9871794871794872,
"grad_norm": 0.18937990069389343,
"learning_rate": 9.275139664804471e-06,
"loss": 1.0116,
"step": 385
},
{
"epoch": 0.9897435897435898,
"grad_norm": 0.17054639756679535,
"learning_rate": 9.27151854443667e-06,
"loss": 0.9257,
"step": 386
},
{
"epoch": 0.9923076923076923,
"grad_norm": 0.16185376048088074,
"learning_rate": 9.26788218793829e-06,
"loss": 0.8911,
"step": 387
},
{
"epoch": 0.9948717948717949,
"grad_norm": 0.18112531304359436,
"learning_rate": 9.26423049894589e-06,
"loss": 1.0187,
"step": 388
},
{
"epoch": 0.9974358974358974,
"grad_norm": 0.17097817361354828,
"learning_rate": 9.26056338028169e-06,
"loss": 0.9781,
"step": 389
},
{
"epoch": 1.0,
"grad_norm": 0.33144575357437134,
"learning_rate": 9.256880733944955e-06,
"loss": 0.8241,
"step": 390
},
{
"epoch": 1.0025641025641026,
"grad_norm": 0.1937527358531952,
"learning_rate": 9.253182461103253e-06,
"loss": 1.0755,
"step": 391
},
{
"epoch": 1.005128205128205,
"grad_norm": 0.15882588922977448,
"learning_rate": 9.24946846208363e-06,
"loss": 0.9799,
"step": 392
},
{
"epoch": 1.0076923076923077,
"grad_norm": 0.16072547435760498,
"learning_rate": 9.245738636363637e-06,
"loss": 0.9872,
"step": 393
},
{
"epoch": 1.0102564102564102,
"grad_norm": 0.15416628122329712,
"learning_rate": 9.241992882562277e-06,
"loss": 1.0529,
"step": 394
},
{
"epoch": 1.0128205128205128,
"grad_norm": 0.16685126721858978,
"learning_rate": 9.238231098430814e-06,
"loss": 0.8965,
"step": 395
},
{
"epoch": 1.0153846153846153,
"grad_norm": 0.17164798080921173,
"learning_rate": 9.23445318084346e-06,
"loss": 1.0228,
"step": 396
},
{
"epoch": 1.0179487179487179,
"grad_norm": 0.1579882949590683,
"learning_rate": 9.230659025787966e-06,
"loss": 0.9462,
"step": 397
},
{
"epoch": 1.0205128205128204,
"grad_norm": 0.16084755957126617,
"learning_rate": 9.226848528356067e-06,
"loss": 1.0358,
"step": 398
},
{
"epoch": 1.023076923076923,
"grad_norm": 0.1533387154340744,
"learning_rate": 9.223021582733813e-06,
"loss": 0.9388,
"step": 399
},
{
"epoch": 1.0256410256410255,
"grad_norm": 0.16010062396526337,
"learning_rate": 9.21917808219178e-06,
"loss": 1.053,
"step": 400
},
{
"epoch": 1.028205128205128,
"grad_norm": 0.16239330172538757,
"learning_rate": 9.215317919075145e-06,
"loss": 0.9812,
"step": 401
},
{
"epoch": 1.0307692307692307,
"grad_norm": 0.1662273406982422,
"learning_rate": 9.211440984793628e-06,
"loss": 1.0265,
"step": 402
},
{
"epoch": 1.0333333333333334,
"grad_norm": 0.18184800446033478,
"learning_rate": 9.20754716981132e-06,
"loss": 0.8951,
"step": 403
},
{
"epoch": 1.035897435897436,
"grad_norm": 0.17101332545280457,
"learning_rate": 9.203636363636365e-06,
"loss": 1.0448,
"step": 404
},
{
"epoch": 1.0384615384615385,
"grad_norm": 0.16575555503368378,
"learning_rate": 9.199708454810497e-06,
"loss": 0.974,
"step": 405
},
{
"epoch": 1.041025641025641,
"grad_norm": 0.18276239931583405,
"learning_rate": 9.195763330898466e-06,
"loss": 1.0457,
"step": 406
},
{
"epoch": 1.0435897435897437,
"grad_norm": 0.1637968271970749,
"learning_rate": 9.191800878477306e-06,
"loss": 0.9338,
"step": 407
},
{
"epoch": 1.0461538461538462,
"grad_norm": 0.20241133868694305,
"learning_rate": 9.18782098312546e-06,
"loss": 1.0354,
"step": 408
},
{
"epoch": 1.0487179487179488,
"grad_norm": 0.161885067820549,
"learning_rate": 9.183823529411765e-06,
"loss": 0.9205,
"step": 409
},
{
"epoch": 1.0512820512820513,
"grad_norm": 0.14989648759365082,
"learning_rate": 9.179808400884306e-06,
"loss": 0.9291,
"step": 410
},
{
"epoch": 1.0538461538461539,
"grad_norm": 0.1757401078939438,
"learning_rate": 9.175775480059083e-06,
"loss": 0.9478,
"step": 411
},
{
"epoch": 1.0564102564102564,
"grad_norm": 0.1715121567249298,
"learning_rate": 9.171724648408586e-06,
"loss": 0.9509,
"step": 412
},
{
"epoch": 1.058974358974359,
"grad_norm": 0.1794794499874115,
"learning_rate": 9.167655786350149e-06,
"loss": 0.9052,
"step": 413
},
{
"epoch": 1.0615384615384615,
"grad_norm": 0.1767176240682602,
"learning_rate": 9.163568773234201e-06,
"loss": 0.8936,
"step": 414
},
{
"epoch": 1.064102564102564,
"grad_norm": 0.16628186404705048,
"learning_rate": 9.15946348733234e-06,
"loss": 0.9326,
"step": 415
},
{
"epoch": 1.0666666666666667,
"grad_norm": 0.21831132471561432,
"learning_rate": 9.155339805825244e-06,
"loss": 1.0548,
"step": 416
},
{
"epoch": 1.0692307692307692,
"grad_norm": 0.19534535706043243,
"learning_rate": 9.15119760479042e-06,
"loss": 0.9584,
"step": 417
},
{
"epoch": 1.0717948717948718,
"grad_norm": 0.16996845602989197,
"learning_rate": 9.147036759189797e-06,
"loss": 0.9573,
"step": 418
},
{
"epoch": 1.0743589743589743,
"grad_norm": 0.16845408082008362,
"learning_rate": 9.142857142857144e-06,
"loss": 0.8904,
"step": 419
},
{
"epoch": 1.0769230769230769,
"grad_norm": 0.1751023381948471,
"learning_rate": 9.138658628485306e-06,
"loss": 1.0049,
"step": 420
},
{
"epoch": 1.0794871794871794,
"grad_norm": 0.17877094447612762,
"learning_rate": 9.134441087613294e-06,
"loss": 1.0011,
"step": 421
},
{
"epoch": 1.082051282051282,
"grad_norm": 0.17528527975082397,
"learning_rate": 9.130204390613173e-06,
"loss": 1.0129,
"step": 422
},
{
"epoch": 1.0846153846153845,
"grad_norm": 0.18258242309093475,
"learning_rate": 9.125948406676782e-06,
"loss": 1.098,
"step": 423
},
{
"epoch": 1.087179487179487,
"grad_norm": 0.19751989841461182,
"learning_rate": 9.121673003802282e-06,
"loss": 0.9406,
"step": 424
},
{
"epoch": 1.0897435897435896,
"grad_norm": 0.1751803457736969,
"learning_rate": 9.117378048780488e-06,
"loss": 0.9812,
"step": 425
},
{
"epoch": 1.0923076923076924,
"grad_norm": 0.16526196897029877,
"learning_rate": 9.113063407181055e-06,
"loss": 0.9926,
"step": 426
},
{
"epoch": 1.094871794871795,
"grad_norm": 0.17794327437877655,
"learning_rate": 9.108728943338438e-06,
"loss": 0.8838,
"step": 427
},
{
"epoch": 1.0974358974358975,
"grad_norm": 0.16672289371490479,
"learning_rate": 9.104374520337684e-06,
"loss": 0.9296,
"step": 428
},
{
"epoch": 1.1,
"grad_norm": 0.1750965267419815,
"learning_rate": 9.100000000000001e-06,
"loss": 0.9608,
"step": 429
},
{
"epoch": 1.1025641025641026,
"grad_norm": 0.31471797823905945,
"learning_rate": 9.095605242868158e-06,
"loss": 1.113,
"step": 430
},
{
"epoch": 1.1051282051282052,
"grad_norm": 0.2636878192424774,
"learning_rate": 9.091190108191653e-06,
"loss": 1.0369,
"step": 431
},
{
"epoch": 1.1076923076923078,
"grad_norm": 0.16675381362438202,
"learning_rate": 9.086754453911697e-06,
"loss": 0.9234,
"step": 432
},
{
"epoch": 1.1102564102564103,
"grad_norm": 0.19578181207180023,
"learning_rate": 9.082298136645965e-06,
"loss": 0.8915,
"step": 433
},
{
"epoch": 1.1128205128205129,
"grad_norm": 0.15622036159038544,
"learning_rate": 9.077821011673151e-06,
"loss": 0.834,
"step": 434
},
{
"epoch": 1.1153846153846154,
"grad_norm": 0.1723235547542572,
"learning_rate": 9.073322932917318e-06,
"loss": 1.1672,
"step": 435
},
{
"epoch": 1.117948717948718,
"grad_norm": 0.16886036098003387,
"learning_rate": 9.068803752931978e-06,
"loss": 1.0947,
"step": 436
},
{
"epoch": 1.1205128205128205,
"grad_norm": 0.1680499017238617,
"learning_rate": 9.064263322884012e-06,
"loss": 1.0609,
"step": 437
},
{
"epoch": 1.123076923076923,
"grad_norm": 0.16996805369853973,
"learning_rate": 9.059701492537314e-06,
"loss": 0.9502,
"step": 438
},
{
"epoch": 1.1256410256410256,
"grad_norm": 0.17708763480186462,
"learning_rate": 9.05511811023622e-06,
"loss": 0.9767,
"step": 439
},
{
"epoch": 1.1282051282051282,
"grad_norm": 0.19685539603233337,
"learning_rate": 9.050513022888715e-06,
"loss": 0.957,
"step": 440
},
{
"epoch": 1.1307692307692307,
"grad_norm": 0.18247175216674805,
"learning_rate": 9.045886075949368e-06,
"loss": 0.9098,
"step": 441
},
{
"epoch": 1.1333333333333333,
"grad_norm": 0.1733454316854477,
"learning_rate": 9.041237113402062e-06,
"loss": 0.9007,
"step": 442
},
{
"epoch": 1.1358974358974359,
"grad_norm": 0.19381284713745117,
"learning_rate": 9.036565977742447e-06,
"loss": 0.8715,
"step": 443
},
{
"epoch": 1.1384615384615384,
"grad_norm": 0.19639398157596588,
"learning_rate": 9.031872509960161e-06,
"loss": 1.0139,
"step": 444
},
{
"epoch": 1.141025641025641,
"grad_norm": 0.17254269123077393,
"learning_rate": 9.027156549520768e-06,
"loss": 1.0033,
"step": 445
},
{
"epoch": 1.1435897435897435,
"grad_norm": 0.16717708110809326,
"learning_rate": 9.022417934347479e-06,
"loss": 0.9119,
"step": 446
},
{
"epoch": 1.146153846153846,
"grad_norm": 0.19381357729434967,
"learning_rate": 9.01765650080257e-06,
"loss": 1.058,
"step": 447
},
{
"epoch": 1.1487179487179486,
"grad_norm": 0.1698828637599945,
"learning_rate": 9.012872083668544e-06,
"loss": 1.0112,
"step": 448
},
{
"epoch": 1.1512820512820512,
"grad_norm": 0.1623694747686386,
"learning_rate": 9.008064516129034e-06,
"loss": 0.9848,
"step": 449
},
{
"epoch": 1.1538461538461537,
"grad_norm": 0.22092927992343903,
"learning_rate": 9.003233629749395e-06,
"loss": 0.9878,
"step": 450
},
{
"epoch": 1.1564102564102563,
"grad_norm": 0.17867566645145416,
"learning_rate": 8.998379254457052e-06,
"loss": 1.0226,
"step": 451
},
{
"epoch": 1.1589743589743589,
"grad_norm": 0.1954340934753418,
"learning_rate": 8.993501218521528e-06,
"loss": 1.044,
"step": 452
},
{
"epoch": 1.1615384615384616,
"grad_norm": 0.1862252950668335,
"learning_rate": 8.988599348534203e-06,
"loss": 0.9865,
"step": 453
},
{
"epoch": 1.1641025641025642,
"grad_norm": 0.2021472156047821,
"learning_rate": 8.983673469387756e-06,
"loss": 0.9522,
"step": 454
},
{
"epoch": 1.1666666666666667,
"grad_norm": 0.179282084107399,
"learning_rate": 8.97872340425532e-06,
"loss": 0.9702,
"step": 455
},
{
"epoch": 1.1692307692307693,
"grad_norm": 0.16460926830768585,
"learning_rate": 8.973748974569319e-06,
"loss": 1.031,
"step": 456
},
{
"epoch": 1.1717948717948719,
"grad_norm": 0.17866218090057373,
"learning_rate": 8.968750000000001e-06,
"loss": 1.0303,
"step": 457
},
{
"epoch": 1.1743589743589744,
"grad_norm": 0.21034833788871765,
"learning_rate": 8.963726298433634e-06,
"loss": 0.9513,
"step": 458
},
{
"epoch": 1.176923076923077,
"grad_norm": 0.24306456744670868,
"learning_rate": 8.958677685950415e-06,
"loss": 0.9149,
"step": 459
},
{
"epoch": 1.1794871794871795,
"grad_norm": 0.21025407314300537,
"learning_rate": 8.95360397680199e-06,
"loss": 1.097,
"step": 460
},
{
"epoch": 1.182051282051282,
"grad_norm": 0.18221627175807953,
"learning_rate": 8.948504983388704e-06,
"loss": 0.9259,
"step": 461
},
{
"epoch": 1.1846153846153846,
"grad_norm": 0.18507330119609833,
"learning_rate": 8.94338051623647e-06,
"loss": 1.075,
"step": 462
},
{
"epoch": 1.1871794871794872,
"grad_norm": 0.20780031383037567,
"learning_rate": 8.938230383973288e-06,
"loss": 1.0373,
"step": 463
},
{
"epoch": 1.1897435897435897,
"grad_norm": 0.18843470513820648,
"learning_rate": 8.93305439330544e-06,
"loss": 0.9142,
"step": 464
},
{
"epoch": 1.1923076923076923,
"grad_norm": 0.20828303694725037,
"learning_rate": 8.92785234899329e-06,
"loss": 0.9692,
"step": 465
},
{
"epoch": 1.1948717948717948,
"grad_norm": 0.15796665847301483,
"learning_rate": 8.922624053826745e-06,
"loss": 0.936,
"step": 466
},
{
"epoch": 1.1974358974358974,
"grad_norm": 0.17404665052890778,
"learning_rate": 8.917369308600336e-06,
"loss": 0.9466,
"step": 467
},
{
"epoch": 1.2,
"grad_norm": 0.20071259140968323,
"learning_rate": 8.912087912087912e-06,
"loss": 0.9609,
"step": 468
},
{
"epoch": 1.2025641025641025,
"grad_norm": 0.21828190982341766,
"learning_rate": 8.90677966101695e-06,
"loss": 0.9867,
"step": 469
},
{
"epoch": 1.205128205128205,
"grad_norm": 0.17222779989242554,
"learning_rate": 8.901444350042482e-06,
"loss": 1.055,
"step": 470
},
{
"epoch": 1.2076923076923076,
"grad_norm": 0.17729543149471283,
"learning_rate": 8.896081771720614e-06,
"loss": 1.0886,
"step": 471
},
{
"epoch": 1.2102564102564102,
"grad_norm": 0.17077124118804932,
"learning_rate": 8.89069171648164e-06,
"loss": 0.9318,
"step": 472
},
{
"epoch": 1.2128205128205127,
"grad_norm": 0.1938466727733612,
"learning_rate": 8.88527397260274e-06,
"loss": 0.9199,
"step": 473
},
{
"epoch": 1.2153846153846155,
"grad_norm": 0.21834906935691833,
"learning_rate": 8.879828326180258e-06,
"loss": 1.0522,
"step": 474
},
{
"epoch": 1.217948717948718,
"grad_norm": 0.18741706013679504,
"learning_rate": 8.87435456110155e-06,
"loss": 0.9653,
"step": 475
},
{
"epoch": 1.2205128205128206,
"grad_norm": 0.18282394111156464,
"learning_rate": 8.868852459016393e-06,
"loss": 1.0237,
"step": 476
},
{
"epoch": 1.2230769230769232,
"grad_norm": 0.17378900945186615,
"learning_rate": 8.86332179930796e-06,
"loss": 1.055,
"step": 477
},
{
"epoch": 1.2256410256410257,
"grad_norm": 0.21999238431453705,
"learning_rate": 8.857762359063314e-06,
"loss": 0.97,
"step": 478
},
{
"epoch": 1.2282051282051283,
"grad_norm": 0.18467235565185547,
"learning_rate": 8.852173913043478e-06,
"loss": 1.0498,
"step": 479
},
{
"epoch": 1.2307692307692308,
"grad_norm": 0.16720303893089294,
"learning_rate": 8.846556233653009e-06,
"loss": 0.9951,
"step": 480
},
{
"epoch": 1.2333333333333334,
"grad_norm": 0.16237983107566833,
"learning_rate": 8.840909090909092e-06,
"loss": 1.0361,
"step": 481
},
{
"epoch": 1.235897435897436,
"grad_norm": 0.15937431156635284,
"learning_rate": 8.835232252410167e-06,
"loss": 0.9251,
"step": 482
},
{
"epoch": 1.2384615384615385,
"grad_norm": 0.17901954054832458,
"learning_rate": 8.829525483304043e-06,
"loss": 1.0279,
"step": 483
},
{
"epoch": 1.241025641025641,
"grad_norm": 0.21782898902893066,
"learning_rate": 8.823788546255506e-06,
"loss": 1.0042,
"step": 484
},
{
"epoch": 1.2435897435897436,
"grad_norm": 0.172428160905838,
"learning_rate": 8.818021201413429e-06,
"loss": 1.069,
"step": 485
},
{
"epoch": 1.2461538461538462,
"grad_norm": 0.1770864725112915,
"learning_rate": 8.812223206377328e-06,
"loss": 1.0291,
"step": 486
},
{
"epoch": 1.2487179487179487,
"grad_norm": 0.1773838996887207,
"learning_rate": 8.80639431616341e-06,
"loss": 1.0329,
"step": 487
},
{
"epoch": 1.2512820512820513,
"grad_norm": 0.20391307771205902,
"learning_rate": 8.80053428317008e-06,
"loss": 0.9828,
"step": 488
},
{
"epoch": 1.2538461538461538,
"grad_norm": 0.16360723972320557,
"learning_rate": 8.794642857142858e-06,
"loss": 0.9086,
"step": 489
},
{
"epoch": 1.2564102564102564,
"grad_norm": 0.21982532739639282,
"learning_rate": 8.788719785138765e-06,
"loss": 1.097,
"step": 490
},
{
"epoch": 1.258974358974359,
"grad_norm": 0.19650976359844208,
"learning_rate": 8.782764811490128e-06,
"loss": 1.014,
"step": 491
},
{
"epoch": 1.2615384615384615,
"grad_norm": 0.1792370080947876,
"learning_rate": 8.776777677767778e-06,
"loss": 0.9578,
"step": 492
},
{
"epoch": 1.264102564102564,
"grad_norm": 0.2550472617149353,
"learning_rate": 8.770758122743683e-06,
"loss": 0.985,
"step": 493
},
{
"epoch": 1.2666666666666666,
"grad_norm": 0.18580225110054016,
"learning_rate": 8.764705882352942e-06,
"loss": 0.9796,
"step": 494
},
{
"epoch": 1.2692307692307692,
"grad_norm": 0.2560383975505829,
"learning_rate": 8.758620689655173e-06,
"loss": 1.0939,
"step": 495
},
{
"epoch": 1.2717948717948717,
"grad_norm": 0.18652617931365967,
"learning_rate": 8.752502274795269e-06,
"loss": 0.9389,
"step": 496
},
{
"epoch": 1.2743589743589743,
"grad_norm": 0.170726478099823,
"learning_rate": 8.746350364963505e-06,
"loss": 1.0236,
"step": 497
},
{
"epoch": 1.2769230769230768,
"grad_norm": 0.20897836983203888,
"learning_rate": 8.740164684354986e-06,
"loss": 0.9533,
"step": 498
},
{
"epoch": 1.2794871794871794,
"grad_norm": 0.19295988976955414,
"learning_rate": 8.73394495412844e-06,
"loss": 0.9425,
"step": 499
},
{
"epoch": 1.282051282051282,
"grad_norm": 0.20471826195716858,
"learning_rate": 8.727690892364306e-06,
"loss": 0.9558,
"step": 500
},
{
"epoch": 1.2846153846153845,
"grad_norm": 0.18632952868938446,
"learning_rate": 8.72140221402214e-06,
"loss": 0.9701,
"step": 501
},
{
"epoch": 1.287179487179487,
"grad_norm": 0.20619980990886688,
"learning_rate": 8.715078630897317e-06,
"loss": 0.9442,
"step": 502
},
{
"epoch": 1.2897435897435898,
"grad_norm": 0.16518618166446686,
"learning_rate": 8.708719851576993e-06,
"loss": 1.0207,
"step": 503
},
{
"epoch": 1.2923076923076924,
"grad_norm": 0.1911863535642624,
"learning_rate": 8.70232558139535e-06,
"loss": 1.0024,
"step": 504
},
{
"epoch": 1.294871794871795,
"grad_norm": 1.0833367109298706,
"learning_rate": 8.695895522388062e-06,
"loss": 0.9605,
"step": 505
},
{
"epoch": 1.2974358974358975,
"grad_norm": 0.18326595425605774,
"learning_rate": 8.689429373246025e-06,
"loss": 0.9348,
"step": 506
},
{
"epoch": 1.3,
"grad_norm": 0.18599998950958252,
"learning_rate": 8.682926829268294e-06,
"loss": 1.2229,
"step": 507
},
{
"epoch": 1.3025641025641026,
"grad_norm": 0.19638995826244354,
"learning_rate": 8.676387582314206e-06,
"loss": 0.9343,
"step": 508
},
{
"epoch": 1.3051282051282052,
"grad_norm": 0.1773020327091217,
"learning_rate": 8.669811320754717e-06,
"loss": 0.9836,
"step": 509
},
{
"epoch": 1.3076923076923077,
"grad_norm": 0.19725504517555237,
"learning_rate": 8.663197729422896e-06,
"loss": 0.9532,
"step": 510
},
{
"epoch": 1.3102564102564103,
"grad_norm": 0.18866512179374695,
"learning_rate": 8.656546489563568e-06,
"loss": 0.9729,
"step": 511
},
{
"epoch": 1.3128205128205128,
"grad_norm": 0.18089522421360016,
"learning_rate": 8.649857278782113e-06,
"loss": 1.0848,
"step": 512
},
{
"epoch": 1.3153846153846154,
"grad_norm": 0.18652409315109253,
"learning_rate": 8.643129770992367e-06,
"loss": 0.9687,
"step": 513
},
{
"epoch": 1.317948717948718,
"grad_norm": 0.19303199648857117,
"learning_rate": 8.636363636363635e-06,
"loss": 1.0083,
"step": 514
},
{
"epoch": 1.3205128205128205,
"grad_norm": 0.207601860165596,
"learning_rate": 8.629558541266796e-06,
"loss": 0.9553,
"step": 515
},
{
"epoch": 1.323076923076923,
"grad_norm": 0.18684937059879303,
"learning_rate": 8.622714148219442e-06,
"loss": 1.0599,
"step": 516
},
{
"epoch": 1.3256410256410256,
"grad_norm": 0.1821713149547577,
"learning_rate": 8.615830115830118e-06,
"loss": 1.0457,
"step": 517
},
{
"epoch": 1.3282051282051281,
"grad_norm": 0.1726110726594925,
"learning_rate": 8.608906098741529e-06,
"loss": 0.8477,
"step": 518
},
{
"epoch": 1.3307692307692307,
"grad_norm": 0.17926542460918427,
"learning_rate": 8.601941747572816e-06,
"loss": 0.9752,
"step": 519
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.1952233761548996,
"learning_rate": 8.59493670886076e-06,
"loss": 0.9636,
"step": 520
},
{
"epoch": 1.3358974358974358,
"grad_norm": 0.1748773604631424,
"learning_rate": 8.587890625000001e-06,
"loss": 0.9876,
"step": 521
},
{
"epoch": 1.3384615384615386,
"grad_norm": 0.1747111827135086,
"learning_rate": 8.580803134182175e-06,
"loss": 0.9405,
"step": 522
},
{
"epoch": 1.3410256410256411,
"grad_norm": 0.18903814256191254,
"learning_rate": 8.573673870333989e-06,
"loss": 1.0787,
"step": 523
},
{
"epoch": 1.3435897435897437,
"grad_norm": 0.16885128617286682,
"learning_rate": 8.566502463054187e-06,
"loss": 0.9353,
"step": 524
},
{
"epoch": 1.3461538461538463,
"grad_norm": 0.19254456460475922,
"learning_rate": 8.559288537549409e-06,
"loss": 0.9973,
"step": 525
},
{
"epoch": 1.3487179487179488,
"grad_norm": 0.18546819686889648,
"learning_rate": 8.552031714568882e-06,
"loss": 0.9529,
"step": 526
},
{
"epoch": 1.3512820512820514,
"grad_norm": 0.17594410479068756,
"learning_rate": 8.544731610337974e-06,
"loss": 1.0129,
"step": 527
},
{
"epoch": 1.353846153846154,
"grad_norm": 0.19100527465343475,
"learning_rate": 8.537387836490528e-06,
"loss": 1.043,
"step": 528
},
{
"epoch": 1.3564102564102565,
"grad_norm": 0.18892578780651093,
"learning_rate": 8.53e-06,
"loss": 0.953,
"step": 529
},
{
"epoch": 1.358974358974359,
"grad_norm": 0.1750698834657669,
"learning_rate": 8.522567703109327e-06,
"loss": 1.0616,
"step": 530
},
{
"epoch": 1.3615384615384616,
"grad_norm": 0.17712536454200745,
"learning_rate": 8.515090543259558e-06,
"loss": 0.8927,
"step": 531
},
{
"epoch": 1.3641025641025641,
"grad_norm": 0.1855439990758896,
"learning_rate": 8.507568113017155e-06,
"loss": 0.995,
"step": 532
},
{
"epoch": 1.3666666666666667,
"grad_norm": 0.17967894673347473,
"learning_rate": 8.5e-06,
"loss": 0.8988,
"step": 533
},
{
"epoch": 1.3692307692307693,
"grad_norm": 0.167103573679924,
"learning_rate": 8.492385786802031e-06,
"loss": 0.9392,
"step": 534
},
{
"epoch": 1.3717948717948718,
"grad_norm": 0.1761719435453415,
"learning_rate": 8.484725050916498e-06,
"loss": 0.9431,
"step": 535
},
{
"epoch": 1.3743589743589744,
"grad_norm": 0.19669947028160095,
"learning_rate": 8.477017364657814e-06,
"loss": 0.956,
"step": 536
},
{
"epoch": 1.376923076923077,
"grad_norm": 0.17305508255958557,
"learning_rate": 8.469262295081969e-06,
"loss": 1.0234,
"step": 537
},
{
"epoch": 1.3794871794871795,
"grad_norm": 0.18830622732639313,
"learning_rate": 8.461459403905446e-06,
"loss": 0.9498,
"step": 538
},
{
"epoch": 1.382051282051282,
"grad_norm": 0.20369920134544373,
"learning_rate": 8.453608247422681e-06,
"loss": 1.1387,
"step": 539
},
{
"epoch": 1.3846153846153846,
"grad_norm": 0.18848799169063568,
"learning_rate": 8.445708376421923e-06,
"loss": 0.9122,
"step": 540
},
{
"epoch": 1.3871794871794871,
"grad_norm": 0.17956501245498657,
"learning_rate": 8.437759336099585e-06,
"loss": 0.9424,
"step": 541
},
{
"epoch": 1.3897435897435897,
"grad_norm": 0.19759565591812134,
"learning_rate": 8.429760665972945e-06,
"loss": 0.9336,
"step": 542
},
{
"epoch": 1.3923076923076922,
"grad_norm": 0.20953185856342316,
"learning_rate": 8.421711899791232e-06,
"loss": 0.9995,
"step": 543
},
{
"epoch": 1.3948717948717948,
"grad_norm": 0.1723688542842865,
"learning_rate": 8.413612565445026e-06,
"loss": 0.9328,
"step": 544
},
{
"epoch": 1.3974358974358974,
"grad_norm": 0.16942423582077026,
"learning_rate": 8.405462184873949e-06,
"loss": 0.9179,
"step": 545
},
{
"epoch": 1.4,
"grad_norm": 0.16917023062705994,
"learning_rate": 8.397260273972604e-06,
"loss": 1.0107,
"step": 546
},
{
"epoch": 1.4025641025641025,
"grad_norm": 0.18283595144748688,
"learning_rate": 8.389006342494715e-06,
"loss": 0.8784,
"step": 547
},
{
"epoch": 1.405128205128205,
"grad_norm": 0.17370331287384033,
"learning_rate": 8.380699893955462e-06,
"loss": 1.1566,
"step": 548
},
{
"epoch": 1.4076923076923076,
"grad_norm": 0.21643978357315063,
"learning_rate": 8.372340425531915e-06,
"loss": 1.045,
"step": 549
},
{
"epoch": 1.4102564102564101,
"grad_norm": 0.18621404469013214,
"learning_rate": 8.36392742796158e-06,
"loss": 0.9533,
"step": 550
},
{
"epoch": 1.4128205128205127,
"grad_norm": 0.1949056088924408,
"learning_rate": 8.355460385438972e-06,
"loss": 1.0161,
"step": 551
},
{
"epoch": 1.4153846153846155,
"grad_norm": 0.1903102844953537,
"learning_rate": 8.346938775510205e-06,
"loss": 1.047,
"step": 552
},
{
"epoch": 1.417948717948718,
"grad_norm": 0.17839354276657104,
"learning_rate": 8.338362068965518e-06,
"loss": 0.9366,
"step": 553
},
{
"epoch": 1.4205128205128206,
"grad_norm": 0.18962249159812927,
"learning_rate": 8.32972972972973e-06,
"loss": 0.9484,
"step": 554
},
{
"epoch": 1.4230769230769231,
"grad_norm": 0.17600049078464508,
"learning_rate": 8.321041214750544e-06,
"loss": 0.9337,
"step": 555
},
{
"epoch": 1.4256410256410257,
"grad_norm": 0.20685282349586487,
"learning_rate": 8.312295973884657e-06,
"loss": 0.9831,
"step": 556
},
{
"epoch": 1.4282051282051282,
"grad_norm": 0.20490646362304688,
"learning_rate": 8.303493449781661e-06,
"loss": 1.0035,
"step": 557
},
{
"epoch": 1.4307692307692308,
"grad_norm": 0.17430691421031952,
"learning_rate": 8.294633077765607e-06,
"loss": 0.8622,
"step": 558
},
{
"epoch": 1.4333333333333333,
"grad_norm": 0.2322288304567337,
"learning_rate": 8.285714285714285e-06,
"loss": 0.9546,
"step": 559
},
{
"epoch": 1.435897435897436,
"grad_norm": 0.19194380939006805,
"learning_rate": 8.276736493936054e-06,
"loss": 1.0656,
"step": 560
},
{
"epoch": 1.4384615384615385,
"grad_norm": 0.1931033879518509,
"learning_rate": 8.267699115044248e-06,
"loss": 0.9399,
"step": 561
},
{
"epoch": 1.441025641025641,
"grad_norm": 0.184538334608078,
"learning_rate": 8.25860155382908e-06,
"loss": 0.9948,
"step": 562
},
{
"epoch": 1.4435897435897436,
"grad_norm": 0.19109323620796204,
"learning_rate": 8.249443207126949e-06,
"loss": 0.9874,
"step": 563
},
{
"epoch": 1.4461538461538461,
"grad_norm": 0.1646609902381897,
"learning_rate": 8.24022346368715e-06,
"loss": 0.9157,
"step": 564
},
{
"epoch": 1.4487179487179487,
"grad_norm": 0.19419412314891815,
"learning_rate": 8.230941704035874e-06,
"loss": 0.9856,
"step": 565
},
{
"epoch": 1.4512820512820512,
"grad_norm": 0.18451392650604248,
"learning_rate": 8.221597300337459e-06,
"loss": 0.9303,
"step": 566
},
{
"epoch": 1.4538461538461538,
"grad_norm": 0.20760126411914825,
"learning_rate": 8.212189616252821e-06,
"loss": 1.1956,
"step": 567
},
{
"epoch": 1.4564102564102563,
"grad_norm": 0.2049357295036316,
"learning_rate": 8.202718006795016e-06,
"loss": 0.9186,
"step": 568
},
{
"epoch": 1.458974358974359,
"grad_norm": 0.18056929111480713,
"learning_rate": 8.193181818181819e-06,
"loss": 0.9874,
"step": 569
},
{
"epoch": 1.4615384615384617,
"grad_norm": 0.2029920220375061,
"learning_rate": 8.18358038768529e-06,
"loss": 1.0627,
"step": 570
},
{
"epoch": 1.4641025641025642,
"grad_norm": 0.1772759109735489,
"learning_rate": 8.173913043478263e-06,
"loss": 0.9109,
"step": 571
},
{
"epoch": 1.4666666666666668,
"grad_norm": 0.2249906063079834,
"learning_rate": 8.164179104477612e-06,
"loss": 0.92,
"step": 572
},
{
"epoch": 1.4692307692307693,
"grad_norm": 0.1960502713918686,
"learning_rate": 8.154377880184333e-06,
"loss": 1.07,
"step": 573
},
{
"epoch": 1.471794871794872,
"grad_norm": 0.17915907502174377,
"learning_rate": 8.14450867052023e-06,
"loss": 1.0765,
"step": 574
},
{
"epoch": 1.4743589743589745,
"grad_norm": 0.214991495013237,
"learning_rate": 8.134570765661253e-06,
"loss": 0.9856,
"step": 575
},
{
"epoch": 1.476923076923077,
"grad_norm": 0.19141773879528046,
"learning_rate": 8.124563445867288e-06,
"loss": 1.0069,
"step": 576
},
{
"epoch": 1.4794871794871796,
"grad_norm": 0.18558935821056366,
"learning_rate": 8.114485981308412e-06,
"loss": 0.9061,
"step": 577
},
{
"epoch": 1.4820512820512821,
"grad_norm": 0.2104201316833496,
"learning_rate": 8.104337631887457e-06,
"loss": 0.9805,
"step": 578
},
{
"epoch": 1.4846153846153847,
"grad_norm": 0.18049705028533936,
"learning_rate": 8.094117647058823e-06,
"loss": 0.9658,
"step": 579
},
{
"epoch": 1.4871794871794872,
"grad_norm": 0.22525040805339813,
"learning_rate": 8.083825265643448e-06,
"loss": 1.0575,
"step": 580
},
{
"epoch": 1.4897435897435898,
"grad_norm": 0.20596688985824585,
"learning_rate": 8.07345971563981e-06,
"loss": 0.8823,
"step": 581
},
{
"epoch": 1.4923076923076923,
"grad_norm": 0.24059003591537476,
"learning_rate": 8.063020214030916e-06,
"loss": 0.9827,
"step": 582
},
{
"epoch": 1.494871794871795,
"grad_norm": 0.18533092737197876,
"learning_rate": 8.052505966587113e-06,
"loss": 1.0123,
"step": 583
},
{
"epoch": 1.4974358974358974,
"grad_norm": 0.20136979222297668,
"learning_rate": 8.04191616766467e-06,
"loss": 1.0111,
"step": 584
},
{
"epoch": 1.5,
"grad_norm": 0.19839423894882202,
"learning_rate": 8.03125e-06,
"loss": 1.0131,
"step": 585
},
{
"epoch": 1.5025641025641026,
"grad_norm": 0.18837936222553253,
"learning_rate": 8.020506634499398e-06,
"loss": 1.166,
"step": 586
},
{
"epoch": 1.505128205128205,
"grad_norm": 0.18904945254325867,
"learning_rate": 8.009685230024214e-06,
"loss": 0.9491,
"step": 587
},
{
"epoch": 1.5076923076923077,
"grad_norm": 0.17879720032215118,
"learning_rate": 7.998784933171326e-06,
"loss": 1.0575,
"step": 588
},
{
"epoch": 1.5102564102564102,
"grad_norm": 0.19607414305210114,
"learning_rate": 7.98780487804878e-06,
"loss": 1.0261,
"step": 589
},
{
"epoch": 1.5128205128205128,
"grad_norm": 0.23364603519439697,
"learning_rate": 7.97674418604651e-06,
"loss": 0.951,
"step": 590
},
{
"epoch": 1.5153846153846153,
"grad_norm": 0.20051056146621704,
"learning_rate": 7.965601965601966e-06,
"loss": 1.2431,
"step": 591
},
{
"epoch": 1.5179487179487179,
"grad_norm": 0.19472134113311768,
"learning_rate": 7.954377311960544e-06,
"loss": 1.0066,
"step": 592
},
{
"epoch": 1.5205128205128204,
"grad_norm": 0.21720701456069946,
"learning_rate": 7.943069306930693e-06,
"loss": 0.9888,
"step": 593
},
{
"epoch": 1.523076923076923,
"grad_norm": 0.18797412514686584,
"learning_rate": 7.93167701863354e-06,
"loss": 0.9282,
"step": 594
},
{
"epoch": 1.5256410256410255,
"grad_norm": 0.18229195475578308,
"learning_rate": 7.920199501246883e-06,
"loss": 1.1075,
"step": 595
},
{
"epoch": 1.528205128205128,
"grad_norm": 0.20988033711910248,
"learning_rate": 7.90863579474343e-06,
"loss": 0.9834,
"step": 596
},
{
"epoch": 1.5307692307692307,
"grad_norm": 0.18902920186519623,
"learning_rate": 7.896984924623117e-06,
"loss": 1.0285,
"step": 597
},
{
"epoch": 1.5333333333333332,
"grad_norm": 0.2365204393863678,
"learning_rate": 7.885245901639344e-06,
"loss": 1.0288,
"step": 598
},
{
"epoch": 1.5358974358974358,
"grad_norm": 0.18257446587085724,
"learning_rate": 7.873417721518988e-06,
"loss": 1.0293,
"step": 599
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.17291095852851868,
"learning_rate": 7.861499364675985e-06,
"loss": 0.9189,
"step": 600
},
{
"epoch": 1.5410256410256409,
"grad_norm": 0.1902029812335968,
"learning_rate": 7.849489795918368e-06,
"loss": 0.8937,
"step": 601
},
{
"epoch": 1.5435897435897434,
"grad_norm": 0.17989574372768402,
"learning_rate": 7.837387964148529e-06,
"loss": 1.0091,
"step": 602
},
{
"epoch": 1.546153846153846,
"grad_norm": 0.19586458802223206,
"learning_rate": 7.825192802056556e-06,
"loss": 0.9092,
"step": 603
},
{
"epoch": 1.5487179487179488,
"grad_norm": 0.2133467197418213,
"learning_rate": 7.812903225806452e-06,
"loss": 0.957,
"step": 604
},
{
"epoch": 1.5512820512820513,
"grad_norm": 0.22505982220172882,
"learning_rate": 7.800518134715025e-06,
"loss": 1.0118,
"step": 605
},
{
"epoch": 1.5538461538461539,
"grad_norm": 0.20532438158988953,
"learning_rate": 7.788036410923278e-06,
"loss": 0.9181,
"step": 606
},
{
"epoch": 1.5564102564102564,
"grad_norm": 0.17881132662296295,
"learning_rate": 7.775456919060053e-06,
"loss": 1.0308,
"step": 607
},
{
"epoch": 1.558974358974359,
"grad_norm": 0.21090662479400635,
"learning_rate": 7.762778505897773e-06,
"loss": 1.1082,
"step": 608
},
{
"epoch": 1.5615384615384615,
"grad_norm": 0.223121777176857,
"learning_rate": 7.75e-06,
"loss": 0.9349,
"step": 609
},
{
"epoch": 1.564102564102564,
"grad_norm": 0.20706158876419067,
"learning_rate": 7.737120211360633e-06,
"loss": 1.0369,
"step": 610
},
{
"epoch": 1.5666666666666667,
"grad_norm": 0.19180113077163696,
"learning_rate": 7.724137931034483e-06,
"loss": 0.917,
"step": 611
},
{
"epoch": 1.5692307692307692,
"grad_norm": 0.19626112282276154,
"learning_rate": 7.711051930758989e-06,
"loss": 1.0926,
"step": 612
},
{
"epoch": 1.5717948717948718,
"grad_norm": 0.19783137738704681,
"learning_rate": 7.697860962566846e-06,
"loss": 0.9433,
"step": 613
},
{
"epoch": 1.5743589743589743,
"grad_norm": 0.21266983449459076,
"learning_rate": 7.684563758389262e-06,
"loss": 1.0266,
"step": 614
},
{
"epoch": 1.5769230769230769,
"grad_norm": 0.1945042610168457,
"learning_rate": 7.671159029649595e-06,
"loss": 0.9966,
"step": 615
},
{
"epoch": 1.5794871794871796,
"grad_norm": 0.1982981264591217,
"learning_rate": 7.657645466847092e-06,
"loss": 0.9904,
"step": 616
},
{
"epoch": 1.5820512820512822,
"grad_norm": 0.1927499920129776,
"learning_rate": 7.644021739130436e-06,
"loss": 1.0763,
"step": 617
},
{
"epoch": 1.5846153846153848,
"grad_norm": 0.19995129108428955,
"learning_rate": 7.630286493860846e-06,
"loss": 0.9884,
"step": 618
},
{
"epoch": 1.5871794871794873,
"grad_norm": 0.17647652328014374,
"learning_rate": 7.616438356164383e-06,
"loss": 1.0011,
"step": 619
},
{
"epoch": 1.5897435897435899,
"grad_norm": 0.1947464793920517,
"learning_rate": 7.6024759284731776e-06,
"loss": 1.0027,
"step": 620
},
{
"epoch": 1.5923076923076924,
"grad_norm": 0.20255906879901886,
"learning_rate": 7.5883977900552484e-06,
"loss": 0.9758,
"step": 621
},
{
"epoch": 1.594871794871795,
"grad_norm": 0.21405860781669617,
"learning_rate": 7.574202496532593e-06,
"loss": 1.1207,
"step": 622
},
{
"epoch": 1.5974358974358975,
"grad_norm": 0.16839265823364258,
"learning_rate": 7.559888579387188e-06,
"loss": 0.9939,
"step": 623
},
{
"epoch": 1.6,
"grad_norm": 0.19284895062446594,
"learning_rate": 7.545454545454545e-06,
"loss": 1.0519,
"step": 624
},
{
"epoch": 1.6025641025641026,
"grad_norm": 0.1863621473312378,
"learning_rate": 7.5308988764044946e-06,
"loss": 0.9174,
"step": 625
},
{
"epoch": 1.6051282051282052,
"grad_norm": 0.2013963907957077,
"learning_rate": 7.516220028208745e-06,
"loss": 0.9832,
"step": 626
},
{
"epoch": 1.6076923076923078,
"grad_norm": 0.18340826034545898,
"learning_rate": 7.501416430594901e-06,
"loss": 0.8789,
"step": 627
},
{
"epoch": 1.6102564102564103,
"grad_norm": 0.1774785965681076,
"learning_rate": 7.486486486486487e-06,
"loss": 1.1,
"step": 628
},
{
"epoch": 1.6128205128205129,
"grad_norm": 0.18885089457035065,
"learning_rate": 7.471428571428571e-06,
"loss": 0.9079,
"step": 629
},
{
"epoch": 1.6153846153846154,
"grad_norm": 0.19248345494270325,
"learning_rate": 7.456241032998566e-06,
"loss": 0.9162,
"step": 630
},
{
"epoch": 1.617948717948718,
"grad_norm": 0.17770878970623016,
"learning_rate": 7.440922190201729e-06,
"loss": 0.9829,
"step": 631
},
{
"epoch": 1.6205128205128205,
"grad_norm": 0.19071798026561737,
"learning_rate": 7.42547033285094e-06,
"loss": 0.9304,
"step": 632
},
{
"epoch": 1.623076923076923,
"grad_norm": 0.1921025514602661,
"learning_rate": 7.409883720930233e-06,
"loss": 0.9004,
"step": 633
},
{
"epoch": 1.6256410256410256,
"grad_norm": 0.21452969312667847,
"learning_rate": 7.394160583941606e-06,
"loss": 0.9945,
"step": 634
},
{
"epoch": 1.6282051282051282,
"grad_norm": 0.16074183583259583,
"learning_rate": 7.378299120234605e-06,
"loss": 0.925,
"step": 635
},
{
"epoch": 1.6307692307692307,
"grad_norm": 0.1816839724779129,
"learning_rate": 7.362297496318113e-06,
"loss": 0.9164,
"step": 636
},
{
"epoch": 1.6333333333333333,
"grad_norm": 0.19317786395549774,
"learning_rate": 7.346153846153847e-06,
"loss": 0.943,
"step": 637
},
{
"epoch": 1.6358974358974359,
"grad_norm": 0.21708151698112488,
"learning_rate": 7.329866270430907e-06,
"loss": 0.9576,
"step": 638
},
{
"epoch": 1.6384615384615384,
"grad_norm": 0.200921893119812,
"learning_rate": 7.313432835820895e-06,
"loss": 0.9782,
"step": 639
},
{
"epoch": 1.641025641025641,
"grad_norm": 0.1886773556470871,
"learning_rate": 7.2968515742128935e-06,
"loss": 0.989,
"step": 640
},
{
"epoch": 1.6435897435897435,
"grad_norm": 0.19396939873695374,
"learning_rate": 7.280120481927711e-06,
"loss": 1.026,
"step": 641
},
{
"epoch": 1.646153846153846,
"grad_norm": 0.21198135614395142,
"learning_rate": 7.263237518910742e-06,
"loss": 1.1656,
"step": 642
},
{
"epoch": 1.6487179487179486,
"grad_norm": 0.22808434069156647,
"learning_rate": 7.246200607902737e-06,
"loss": 1.0588,
"step": 643
},
{
"epoch": 1.6512820512820512,
"grad_norm": 0.19930703938007355,
"learning_rate": 7.229007633587788e-06,
"loss": 0.9826,
"step": 644
},
{
"epoch": 1.6538461538461537,
"grad_norm": 0.188712477684021,
"learning_rate": 7.211656441717792e-06,
"loss": 1.0296,
"step": 645
},
{
"epoch": 1.6564102564102563,
"grad_norm": 0.2129139006137848,
"learning_rate": 7.194144838212635e-06,
"loss": 1.1525,
"step": 646
},
{
"epoch": 1.6589743589743589,
"grad_norm": 0.20264121890068054,
"learning_rate": 7.176470588235295e-06,
"loss": 0.9398,
"step": 647
},
{
"epoch": 1.6615384615384614,
"grad_norm": 0.18227992951869965,
"learning_rate": 7.1586314152410585e-06,
"loss": 1.0267,
"step": 648
},
{
"epoch": 1.664102564102564,
"grad_norm": 0.1936773806810379,
"learning_rate": 7.140625e-06,
"loss": 0.8565,
"step": 649
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.1949433982372284,
"learning_rate": 7.122448979591836e-06,
"loss": 1.0448,
"step": 650
},
{
"epoch": 1.669230769230769,
"grad_norm": 0.19935406744480133,
"learning_rate": 7.104100946372239e-06,
"loss": 0.9539,
"step": 651
},
{
"epoch": 1.6717948717948716,
"grad_norm": 0.2263110727071762,
"learning_rate": 7.085578446909666e-06,
"loss": 0.9838,
"step": 652
},
{
"epoch": 1.6743589743589744,
"grad_norm": 0.2188168615102768,
"learning_rate": 7.06687898089172e-06,
"loss": 0.9958,
"step": 653
},
{
"epoch": 1.676923076923077,
"grad_norm": 0.2003227025270462,
"learning_rate": 7.048e-06,
"loss": 1.0545,
"step": 654
},
{
"epoch": 1.6794871794871795,
"grad_norm": 0.18214313685894012,
"learning_rate": 7.028938906752412e-06,
"loss": 0.9917,
"step": 655
},
{
"epoch": 1.682051282051282,
"grad_norm": 0.1975235790014267,
"learning_rate": 7.009693053311792e-06,
"loss": 1.0347,
"step": 656
},
{
"epoch": 1.6846153846153846,
"grad_norm": 0.2402488738298416,
"learning_rate": 6.990259740259741e-06,
"loss": 0.8716,
"step": 657
},
{
"epoch": 1.6871794871794872,
"grad_norm": 0.20426467061042786,
"learning_rate": 6.970636215334422e-06,
"loss": 0.9865,
"step": 658
},
{
"epoch": 1.6897435897435897,
"grad_norm": 0.22642633318901062,
"learning_rate": 6.950819672131147e-06,
"loss": 0.9125,
"step": 659
},
{
"epoch": 1.6923076923076923,
"grad_norm": 0.23084644973278046,
"learning_rate": 6.930807248764415e-06,
"loss": 0.9856,
"step": 660
},
{
"epoch": 1.6948717948717948,
"grad_norm": 0.22062422335147858,
"learning_rate": 6.910596026490067e-06,
"loss": 1.055,
"step": 661
},
{
"epoch": 1.6974358974358974,
"grad_norm": 0.19819729030132294,
"learning_rate": 6.89018302828619e-06,
"loss": 1.0142,
"step": 662
},
{
"epoch": 1.7,
"grad_norm": 0.2273205816745758,
"learning_rate": 6.869565217391305e-06,
"loss": 0.9948,
"step": 663
},
{
"epoch": 1.7025641025641025,
"grad_norm": 0.2149330973625183,
"learning_rate": 6.848739495798319e-06,
"loss": 0.9062,
"step": 664
},
{
"epoch": 1.7051282051282053,
"grad_norm": 0.17445141077041626,
"learning_rate": 6.827702702702703e-06,
"loss": 1.0166,
"step": 665
},
{
"epoch": 1.7076923076923078,
"grad_norm": 0.21047592163085938,
"learning_rate": 6.806451612903226e-06,
"loss": 1.0253,
"step": 666
},
{
"epoch": 1.7102564102564104,
"grad_norm": 0.19377169013023376,
"learning_rate": 6.784982935153583e-06,
"loss": 0.9729,
"step": 667
},
{
"epoch": 1.712820512820513,
"grad_norm": 0.18540802597999573,
"learning_rate": 6.763293310463122e-06,
"loss": 0.8732,
"step": 668
},
{
"epoch": 1.7153846153846155,
"grad_norm": 0.2677832841873169,
"learning_rate": 6.741379310344829e-06,
"loss": 1.0237,
"step": 669
},
{
"epoch": 1.717948717948718,
"grad_norm": 0.20734448730945587,
"learning_rate": 6.719237435008666e-06,
"loss": 1.0527,
"step": 670
},
{
"epoch": 1.7205128205128206,
"grad_norm": 0.1992000937461853,
"learning_rate": 6.696864111498258e-06,
"loss": 0.9951,
"step": 671
},
{
"epoch": 1.7230769230769232,
"grad_norm": 0.19159814715385437,
"learning_rate": 6.6742556917688265e-06,
"loss": 1.1233,
"step": 672
},
{
"epoch": 1.7256410256410257,
"grad_norm": 0.2154679298400879,
"learning_rate": 6.651408450704226e-06,
"loss": 1.0262,
"step": 673
},
{
"epoch": 1.7282051282051283,
"grad_norm": 0.1996496319770813,
"learning_rate": 6.628318584070796e-06,
"loss": 1.0542,
"step": 674
},
{
"epoch": 1.7307692307692308,
"grad_norm": 0.18427924811840057,
"learning_rate": 6.604982206405694e-06,
"loss": 0.8912,
"step": 675
},
{
"epoch": 1.7333333333333334,
"grad_norm": 0.1896672397851944,
"learning_rate": 6.58139534883721e-06,
"loss": 1.1042,
"step": 676
},
{
"epoch": 1.735897435897436,
"grad_norm": 0.2349502146244049,
"learning_rate": 6.557553956834534e-06,
"loss": 1.0842,
"step": 677
},
{
"epoch": 1.7384615384615385,
"grad_norm": 0.21734175086021423,
"learning_rate": 6.533453887884268e-06,
"loss": 1.0086,
"step": 678
},
{
"epoch": 1.741025641025641,
"grad_norm": 0.19185325503349304,
"learning_rate": 6.5090909090909095e-06,
"loss": 0.9509,
"step": 679
},
{
"epoch": 1.7435897435897436,
"grad_norm": 0.18834951519966125,
"learning_rate": 6.484460694698354e-06,
"loss": 1.0333,
"step": 680
},
{
"epoch": 1.7461538461538462,
"grad_norm": 0.19962508976459503,
"learning_rate": 6.459558823529412e-06,
"loss": 1.0062,
"step": 681
},
{
"epoch": 1.7487179487179487,
"grad_norm": 0.1968788206577301,
"learning_rate": 6.434380776340111e-06,
"loss": 0.9652,
"step": 682
},
{
"epoch": 1.7512820512820513,
"grad_norm": 0.19386903941631317,
"learning_rate": 6.408921933085502e-06,
"loss": 0.9253,
"step": 683
},
{
"epoch": 1.7538461538461538,
"grad_norm": 0.18816019594669342,
"learning_rate": 6.38317757009346e-06,
"loss": 0.998,
"step": 684
},
{
"epoch": 1.7564102564102564,
"grad_norm": 0.19842685759067535,
"learning_rate": 6.357142857142856e-06,
"loss": 1.0057,
"step": 685
},
{
"epoch": 1.758974358974359,
"grad_norm": 0.19914638996124268,
"learning_rate": 6.330812854442344e-06,
"loss": 0.9429,
"step": 686
},
{
"epoch": 1.7615384615384615,
"grad_norm": 0.1913568377494812,
"learning_rate": 6.304182509505703e-06,
"loss": 0.8765,
"step": 687
},
{
"epoch": 1.764102564102564,
"grad_norm": 0.20887283980846405,
"learning_rate": 6.277246653919694e-06,
"loss": 1.0437,
"step": 688
},
{
"epoch": 1.7666666666666666,
"grad_norm": 0.1883188635110855,
"learning_rate": 6.25e-06,
"loss": 0.9215,
"step": 689
},
{
"epoch": 1.7692307692307692,
"grad_norm": 0.1730821281671524,
"learning_rate": 6.222437137330755e-06,
"loss": 1.0435,
"step": 690
},
{
"epoch": 1.7717948717948717,
"grad_norm": 0.18366935849189758,
"learning_rate": 6.194552529182879e-06,
"loss": 1.0434,
"step": 691
},
{
"epoch": 1.7743589743589743,
"grad_norm": 0.16954126954078674,
"learning_rate": 6.166340508806262e-06,
"loss": 0.9655,
"step": 692
},
{
"epoch": 1.7769230769230768,
"grad_norm": 0.20240214467048645,
"learning_rate": 6.137795275590551e-06,
"loss": 1.0522,
"step": 693
},
{
"epoch": 1.7794871794871794,
"grad_norm": 0.1842651218175888,
"learning_rate": 6.1089108910891094e-06,
"loss": 0.9502,
"step": 694
},
{
"epoch": 1.782051282051282,
"grad_norm": 0.19008156657218933,
"learning_rate": 6.079681274900399e-06,
"loss": 0.9194,
"step": 695
},
{
"epoch": 1.7846153846153845,
"grad_norm": 0.18955452740192413,
"learning_rate": 6.050100200400802e-06,
"loss": 0.9153,
"step": 696
},
{
"epoch": 1.787179487179487,
"grad_norm": 0.18745142221450806,
"learning_rate": 6.020161290322582e-06,
"loss": 1.0557,
"step": 697
},
{
"epoch": 1.7897435897435896,
"grad_norm": 0.19175171852111816,
"learning_rate": 5.9898580121703855e-06,
"loss": 1.0186,
"step": 698
},
{
"epoch": 1.7923076923076922,
"grad_norm": 0.21348226070404053,
"learning_rate": 5.9591836734693876e-06,
"loss": 1.1486,
"step": 699
},
{
"epoch": 1.7948717948717947,
"grad_norm": 0.19678117334842682,
"learning_rate": 5.928131416837782e-06,
"loss": 1.0151,
"step": 700
},
{
"epoch": 1.7974358974358975,
"grad_norm": 0.17783664166927338,
"learning_rate": 5.896694214876034e-06,
"loss": 0.9591,
"step": 701
},
{
"epoch": 1.8,
"grad_norm": 0.21038049459457397,
"learning_rate": 5.8648648648648655e-06,
"loss": 1.0256,
"step": 702
},
{
"epoch": 1.8025641025641026,
"grad_norm": 0.2131882607936859,
"learning_rate": 5.832635983263598e-06,
"loss": 1.0602,
"step": 703
},
{
"epoch": 1.8051282051282052,
"grad_norm": 0.2531805634498596,
"learning_rate": 5.8e-06,
"loss": 0.9224,
"step": 704
},
{
"epoch": 1.8076923076923077,
"grad_norm": 0.20389708876609802,
"learning_rate": 5.766949152542372e-06,
"loss": 0.926,
"step": 705
},
{
"epoch": 1.8102564102564103,
"grad_norm": 0.19052807986736298,
"learning_rate": 5.733475479744137e-06,
"loss": 1.0813,
"step": 706
},
{
"epoch": 1.8128205128205128,
"grad_norm": 0.23358896374702454,
"learning_rate": 5.6995708154506445e-06,
"loss": 1.1185,
"step": 707
},
{
"epoch": 1.8153846153846154,
"grad_norm": 0.19410401582717896,
"learning_rate": 5.6652267818574515e-06,
"loss": 0.9472,
"step": 708
},
{
"epoch": 1.817948717948718,
"grad_norm": 0.20657074451446533,
"learning_rate": 5.630434782608696e-06,
"loss": 0.9792,
"step": 709
},
{
"epoch": 1.8205128205128205,
"grad_norm": 0.19862636923789978,
"learning_rate": 5.5951859956236334e-06,
"loss": 1.0459,
"step": 710
},
{
"epoch": 1.823076923076923,
"grad_norm": 0.19714003801345825,
"learning_rate": 5.559471365638766e-06,
"loss": 1.0864,
"step": 711
},
{
"epoch": 1.8256410256410256,
"grad_norm": 0.2103991061449051,
"learning_rate": 5.523281596452329e-06,
"loss": 1.0025,
"step": 712
},
{
"epoch": 1.8282051282051284,
"grad_norm": 0.21029628813266754,
"learning_rate": 5.486607142857143e-06,
"loss": 1.0526,
"step": 713
},
{
"epoch": 1.830769230769231,
"grad_norm": 0.207773357629776,
"learning_rate": 5.4494382022471915e-06,
"loss": 0.9971,
"step": 714
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.21237727999687195,
"learning_rate": 5.411764705882353e-06,
"loss": 0.9528,
"step": 715
},
{
"epoch": 1.835897435897436,
"grad_norm": 0.1775677651166916,
"learning_rate": 5.373576309794989e-06,
"loss": 0.8888,
"step": 716
},
{
"epoch": 1.8384615384615386,
"grad_norm": 0.21109408140182495,
"learning_rate": 5.3348623853211015e-06,
"loss": 0.9526,
"step": 717
},
{
"epoch": 1.8410256410256411,
"grad_norm": 0.20082655549049377,
"learning_rate": 5.295612009237876e-06,
"loss": 0.989,
"step": 718
},
{
"epoch": 1.8435897435897437,
"grad_norm": 0.18796475231647491,
"learning_rate": 5.255813953488372e-06,
"loss": 1.1235,
"step": 719
},
{
"epoch": 1.8461538461538463,
"grad_norm": 0.19870947301387787,
"learning_rate": 5.215456674473068e-06,
"loss": 0.9476,
"step": 720
},
{
"epoch": 1.8487179487179488,
"grad_norm": 0.20163416862487793,
"learning_rate": 5.174528301886793e-06,
"loss": 1.0574,
"step": 721
},
{
"epoch": 1.8512820512820514,
"grad_norm": 0.1803264170885086,
"learning_rate": 5.133016627078385e-06,
"loss": 0.9504,
"step": 722
},
{
"epoch": 1.853846153846154,
"grad_norm": 0.19215236604213715,
"learning_rate": 5.090909090909091e-06,
"loss": 1.1705,
"step": 723
},
{
"epoch": 1.8564102564102565,
"grad_norm": 0.2053728848695755,
"learning_rate": 5.048192771084337e-06,
"loss": 1.0616,
"step": 724
},
{
"epoch": 1.858974358974359,
"grad_norm": 0.18856436014175415,
"learning_rate": 5.004854368932039e-06,
"loss": 1.1064,
"step": 725
},
{
"epoch": 1.8615384615384616,
"grad_norm": 0.22481724619865417,
"learning_rate": 4.960880195599021e-06,
"loss": 0.9446,
"step": 726
},
{
"epoch": 1.8641025641025641,
"grad_norm": 0.19489426910877228,
"learning_rate": 4.916256157635469e-06,
"loss": 0.9482,
"step": 727
},
{
"epoch": 1.8666666666666667,
"grad_norm": 0.18249572813510895,
"learning_rate": 4.870967741935484e-06,
"loss": 0.9447,
"step": 728
},
{
"epoch": 1.8692307692307693,
"grad_norm": 0.1984269618988037,
"learning_rate": 4.825e-06,
"loss": 0.8603,
"step": 729
},
{
"epoch": 1.8717948717948718,
"grad_norm": 0.1892971694469452,
"learning_rate": 4.778337531486147e-06,
"loss": 0.9162,
"step": 730
},
{
"epoch": 1.8743589743589744,
"grad_norm": 0.19022035598754883,
"learning_rate": 4.7309644670050755e-06,
"loss": 0.915,
"step": 731
},
{
"epoch": 1.876923076923077,
"grad_norm": 0.20524592697620392,
"learning_rate": 4.6828644501278775e-06,
"loss": 0.9754,
"step": 732
},
{
"epoch": 1.8794871794871795,
"grad_norm": 0.19411511719226837,
"learning_rate": 4.6340206185567015e-06,
"loss": 0.9368,
"step": 733
},
{
"epoch": 1.882051282051282,
"grad_norm": 0.19343458116054535,
"learning_rate": 4.584415584415584e-06,
"loss": 1.0132,
"step": 734
},
{
"epoch": 1.8846153846153846,
"grad_norm": 0.197899729013443,
"learning_rate": 4.534031413612565e-06,
"loss": 1.0597,
"step": 735
},
{
"epoch": 1.8871794871794871,
"grad_norm": 0.22261539101600647,
"learning_rate": 4.482849604221636e-06,
"loss": 0.9934,
"step": 736
},
{
"epoch": 1.8897435897435897,
"grad_norm": 0.21835994720458984,
"learning_rate": 4.430851063829788e-06,
"loss": 0.9954,
"step": 737
},
{
"epoch": 1.8923076923076922,
"grad_norm": 0.1972758173942566,
"learning_rate": 4.378016085790885e-06,
"loss": 0.9468,
"step": 738
},
{
"epoch": 1.8948717948717948,
"grad_norm": 0.18865923583507538,
"learning_rate": 4.324324324324325e-06,
"loss": 0.9226,
"step": 739
},
{
"epoch": 1.8974358974358974,
"grad_norm": 0.19400173425674438,
"learning_rate": 4.2697547683923715e-06,
"loss": 1.1236,
"step": 740
},
{
"epoch": 1.9,
"grad_norm": 0.18145526945590973,
"learning_rate": 4.2142857142857145e-06,
"loss": 0.8993,
"step": 741
},
{
"epoch": 1.9025641025641025,
"grad_norm": 0.2090071588754654,
"learning_rate": 4.157894736842105e-06,
"loss": 0.9357,
"step": 742
},
{
"epoch": 1.905128205128205,
"grad_norm": 0.240007683634758,
"learning_rate": 4.100558659217877e-06,
"loss": 0.9786,
"step": 743
},
{
"epoch": 1.9076923076923076,
"grad_norm": 0.19722330570220947,
"learning_rate": 4.04225352112676e-06,
"loss": 1.0765,
"step": 744
},
{
"epoch": 1.9102564102564101,
"grad_norm": 0.18485118448734283,
"learning_rate": 3.982954545454546e-06,
"loss": 0.8595,
"step": 745
},
{
"epoch": 1.9128205128205127,
"grad_norm": 0.2154824435710907,
"learning_rate": 3.922636103151863e-06,
"loss": 1.0258,
"step": 746
},
{
"epoch": 1.9153846153846152,
"grad_norm": 0.2018478512763977,
"learning_rate": 3.861271676300577e-06,
"loss": 0.9515,
"step": 747
},
{
"epoch": 1.9179487179487178,
"grad_norm": 0.2598324716091156,
"learning_rate": 3.798833819241983e-06,
"loss": 1.1186,
"step": 748
},
{
"epoch": 1.9205128205128204,
"grad_norm": 0.21484240889549255,
"learning_rate": 3.735294117647058e-06,
"loss": 0.9179,
"step": 749
},
{
"epoch": 1.9230769230769231,
"grad_norm": 0.20729656517505646,
"learning_rate": 3.6706231454005937e-06,
"loss": 0.9008,
"step": 750
},
{
"epoch": 1.9256410256410257,
"grad_norm": 0.19938671588897705,
"learning_rate": 3.604790419161677e-06,
"loss": 0.9061,
"step": 751
},
{
"epoch": 1.9282051282051282,
"grad_norm": 0.19618763029575348,
"learning_rate": 3.5377643504531735e-06,
"loss": 0.9478,
"step": 752
},
{
"epoch": 1.9307692307692308,
"grad_norm": 0.20993918180465698,
"learning_rate": 3.4695121951219514e-06,
"loss": 1.091,
"step": 753
},
{
"epoch": 1.9333333333333333,
"grad_norm": 0.19574132561683655,
"learning_rate": 3.4e-06,
"loss": 1.0154,
"step": 754
},
{
"epoch": 1.935897435897436,
"grad_norm": 0.2107248604297638,
"learning_rate": 3.329192546583851e-06,
"loss": 0.997,
"step": 755
},
{
"epoch": 1.9384615384615385,
"grad_norm": 0.19578175246715546,
"learning_rate": 3.2570532915360505e-06,
"loss": 0.9224,
"step": 756
},
{
"epoch": 1.941025641025641,
"grad_norm": 0.20714713633060455,
"learning_rate": 3.183544303797469e-06,
"loss": 0.9749,
"step": 757
},
{
"epoch": 1.9435897435897436,
"grad_norm": 0.1808098554611206,
"learning_rate": 3.1086261980830674e-06,
"loss": 0.898,
"step": 758
},
{
"epoch": 1.9461538461538461,
"grad_norm": 0.20211873948574066,
"learning_rate": 3.0322580645161295e-06,
"loss": 0.9319,
"step": 759
},
{
"epoch": 1.9487179487179487,
"grad_norm": 0.17889924347400665,
"learning_rate": 2.9543973941368082e-06,
"loss": 1.0142,
"step": 760
},
{
"epoch": 1.9512820512820512,
"grad_norm": 0.20043864846229553,
"learning_rate": 2.875e-06,
"loss": 1.0167,
"step": 761
},
{
"epoch": 1.953846153846154,
"grad_norm": 0.18134412169456482,
"learning_rate": 2.794019933554818e-06,
"loss": 0.8732,
"step": 762
},
{
"epoch": 1.9564102564102566,
"grad_norm": 0.19279873371124268,
"learning_rate": 2.7114093959731548e-06,
"loss": 0.9885,
"step": 763
},
{
"epoch": 1.9589743589743591,
"grad_norm": 0.1957969218492508,
"learning_rate": 2.627118644067797e-06,
"loss": 0.9453,
"step": 764
},
{
"epoch": 1.9615384615384617,
"grad_norm": 0.2282707840204239,
"learning_rate": 2.5410958904109595e-06,
"loss": 0.91,
"step": 765
},
{
"epoch": 1.9641025641025642,
"grad_norm": 0.20508873462677002,
"learning_rate": 2.453287197231834e-06,
"loss": 0.9894,
"step": 766
},
{
"epoch": 1.9666666666666668,
"grad_norm": 0.19494283199310303,
"learning_rate": 2.363636363636364e-06,
"loss": 1.0989,
"step": 767
},
{
"epoch": 1.9692307692307693,
"grad_norm": 0.19367046654224396,
"learning_rate": 2.2720848056537104e-06,
"loss": 1.075,
"step": 768
},
{
"epoch": 1.971794871794872,
"grad_norm": 0.1860765963792801,
"learning_rate": 2.1785714285714286e-06,
"loss": 0.9745,
"step": 769
},
{
"epoch": 1.9743589743589745,
"grad_norm": 0.1922086477279663,
"learning_rate": 2.0830324909747296e-06,
"loss": 0.9443,
"step": 770
},
{
"epoch": 1.976923076923077,
"grad_norm": 0.20211626589298248,
"learning_rate": 1.9854014598540146e-06,
"loss": 0.9371,
"step": 771
},
{
"epoch": 1.9794871794871796,
"grad_norm": 0.21594083309173584,
"learning_rate": 1.885608856088561e-06,
"loss": 1.0045,
"step": 772
},
{
"epoch": 1.9820512820512821,
"grad_norm": 0.18539482355117798,
"learning_rate": 1.7835820895522391e-06,
"loss": 0.9609,
"step": 773
},
{
"epoch": 1.9846153846153847,
"grad_norm": 0.19419516623020172,
"learning_rate": 1.6792452830188683e-06,
"loss": 0.9157,
"step": 774
},
{
"epoch": 1.9871794871794872,
"grad_norm": 0.19369378685951233,
"learning_rate": 1.572519083969466e-06,
"loss": 0.9668,
"step": 775
},
{
"epoch": 1.9897435897435898,
"grad_norm": 0.19599087536334991,
"learning_rate": 1.4633204633204633e-06,
"loss": 0.9275,
"step": 776
},
{
"epoch": 1.9923076923076923,
"grad_norm": 0.20087149739265442,
"learning_rate": 1.3515625000000002e-06,
"loss": 1.057,
"step": 777
},
{
"epoch": 1.994871794871795,
"grad_norm": 0.19752490520477295,
"learning_rate": 1.2371541501976286e-06,
"loss": 1.0176,
"step": 778
},
{
"epoch": 1.9974358974358974,
"grad_norm": 0.17145206034183502,
"learning_rate": 1.12e-06,
"loss": 0.982,
"step": 779
},
{
"epoch": 2.0,
"grad_norm": 0.435234934091568,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.8781,
"step": 780
}
],
"logging_steps": 1,
"max_steps": 780,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 195,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.846660370087018e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}