Safetensors
English
qwen2
GenPRM-1.5B / trainer_state.json
Zhisheng000's picture
Upload folder using huggingface_hub
3fbbd07 verified
raw
history blame
236 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 19.795497185741088,
"eval_steps": 66,
"global_step": 1320,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0150093808630394,
"grad_norm": 0.45099504621179437,
"learning_rate": 3.0303030303030305e-08,
"loss": 0.8363,
"step": 1
},
{
"epoch": 0.0150093808630394,
"eval_loss": 0.8522398471832275,
"eval_runtime": 13.8139,
"eval_samples_per_second": 32.359,
"eval_steps_per_second": 2.027,
"step": 1
},
{
"epoch": 0.0300187617260788,
"grad_norm": 0.441134394511529,
"learning_rate": 6.060606060606061e-08,
"loss": 0.8152,
"step": 2
},
{
"epoch": 0.0450281425891182,
"grad_norm": 0.44058980813366744,
"learning_rate": 9.09090909090909e-08,
"loss": 0.8263,
"step": 3
},
{
"epoch": 0.0600375234521576,
"grad_norm": 0.4412989069973729,
"learning_rate": 1.2121212121212122e-07,
"loss": 0.8285,
"step": 4
},
{
"epoch": 0.075046904315197,
"grad_norm": 0.4411021457996664,
"learning_rate": 1.5151515151515152e-07,
"loss": 0.8294,
"step": 5
},
{
"epoch": 0.0900562851782364,
"grad_norm": 0.4512982125032984,
"learning_rate": 1.818181818181818e-07,
"loss": 0.827,
"step": 6
},
{
"epoch": 0.1050656660412758,
"grad_norm": 0.4487759970382494,
"learning_rate": 2.121212121212121e-07,
"loss": 0.831,
"step": 7
},
{
"epoch": 0.1200750469043152,
"grad_norm": 0.45274790304085666,
"learning_rate": 2.4242424242424244e-07,
"loss": 0.8266,
"step": 8
},
{
"epoch": 0.1350844277673546,
"grad_norm": 0.4452059179334573,
"learning_rate": 2.727272727272727e-07,
"loss": 0.8278,
"step": 9
},
{
"epoch": 0.150093808630394,
"grad_norm": 0.4447347665120929,
"learning_rate": 3.0303030303030305e-07,
"loss": 0.828,
"step": 10
},
{
"epoch": 0.1651031894934334,
"grad_norm": 0.44996814286667713,
"learning_rate": 3.333333333333333e-07,
"loss": 0.8321,
"step": 11
},
{
"epoch": 0.1801125703564728,
"grad_norm": 0.4400767238276578,
"learning_rate": 3.636363636363636e-07,
"loss": 0.8188,
"step": 12
},
{
"epoch": 0.1951219512195122,
"grad_norm": 0.4714681881384513,
"learning_rate": 3.939393939393939e-07,
"loss": 0.8295,
"step": 13
},
{
"epoch": 0.2101313320825516,
"grad_norm": 0.444385872298255,
"learning_rate": 4.242424242424242e-07,
"loss": 0.8163,
"step": 14
},
{
"epoch": 0.225140712945591,
"grad_norm": 0.4403917468130588,
"learning_rate": 4.545454545454545e-07,
"loss": 0.829,
"step": 15
},
{
"epoch": 0.2401500938086304,
"grad_norm": 0.4463075871861068,
"learning_rate": 4.848484848484849e-07,
"loss": 0.8301,
"step": 16
},
{
"epoch": 0.2551594746716698,
"grad_norm": 0.4517876122481777,
"learning_rate": 5.151515151515151e-07,
"loss": 0.8237,
"step": 17
},
{
"epoch": 0.2701688555347092,
"grad_norm": 0.4194271488424739,
"learning_rate": 5.454545454545454e-07,
"loss": 0.828,
"step": 18
},
{
"epoch": 0.2851782363977486,
"grad_norm": 0.4385859199926406,
"learning_rate": 5.757575757575758e-07,
"loss": 0.8313,
"step": 19
},
{
"epoch": 0.300187617260788,
"grad_norm": 0.43935758099705285,
"learning_rate": 6.060606060606061e-07,
"loss": 0.8135,
"step": 20
},
{
"epoch": 0.3151969981238274,
"grad_norm": 0.42349119651358025,
"learning_rate": 6.363636363636363e-07,
"loss": 0.814,
"step": 21
},
{
"epoch": 0.3302063789868668,
"grad_norm": 0.42862096475156763,
"learning_rate": 6.666666666666666e-07,
"loss": 0.8107,
"step": 22
},
{
"epoch": 0.3452157598499062,
"grad_norm": 0.41027437311847303,
"learning_rate": 6.96969696969697e-07,
"loss": 0.8093,
"step": 23
},
{
"epoch": 0.3602251407129456,
"grad_norm": 0.41506365946047097,
"learning_rate": 7.272727272727272e-07,
"loss": 0.8007,
"step": 24
},
{
"epoch": 0.37523452157598497,
"grad_norm": 0.35818533786374307,
"learning_rate": 7.575757575757575e-07,
"loss": 0.7935,
"step": 25
},
{
"epoch": 0.3902439024390244,
"grad_norm": 0.36820244566867855,
"learning_rate": 7.878787878787878e-07,
"loss": 0.7956,
"step": 26
},
{
"epoch": 0.4052532833020638,
"grad_norm": 0.3554347415386222,
"learning_rate": 8.181818181818182e-07,
"loss": 0.7965,
"step": 27
},
{
"epoch": 0.4202626641651032,
"grad_norm": 0.34816729354565595,
"learning_rate": 8.484848484848484e-07,
"loss": 0.7893,
"step": 28
},
{
"epoch": 0.4352720450281426,
"grad_norm": 0.3492723930636243,
"learning_rate": 8.787878787878787e-07,
"loss": 0.7927,
"step": 29
},
{
"epoch": 0.450281425891182,
"grad_norm": 0.3524378456441126,
"learning_rate": 9.09090909090909e-07,
"loss": 0.7805,
"step": 30
},
{
"epoch": 0.4652908067542214,
"grad_norm": 0.3364134835289249,
"learning_rate": 9.393939393939395e-07,
"loss": 0.7901,
"step": 31
},
{
"epoch": 0.4803001876172608,
"grad_norm": 0.34952134401579665,
"learning_rate": 9.696969696969698e-07,
"loss": 0.7848,
"step": 32
},
{
"epoch": 0.49530956848030017,
"grad_norm": 0.34379662697051444,
"learning_rate": 1e-06,
"loss": 0.7766,
"step": 33
},
{
"epoch": 0.5103189493433395,
"grad_norm": 0.25380437737254385,
"learning_rate": 1.0303030303030302e-06,
"loss": 0.7506,
"step": 34
},
{
"epoch": 0.525328330206379,
"grad_norm": 0.2160315736548007,
"learning_rate": 1.0606060606060606e-06,
"loss": 0.7296,
"step": 35
},
{
"epoch": 0.5403377110694184,
"grad_norm": 0.21519653463861005,
"learning_rate": 1.0909090909090908e-06,
"loss": 0.7429,
"step": 36
},
{
"epoch": 0.5553470919324578,
"grad_norm": 0.2118091773645455,
"learning_rate": 1.121212121212121e-06,
"loss": 0.7341,
"step": 37
},
{
"epoch": 0.5703564727954972,
"grad_norm": 0.2133974139017253,
"learning_rate": 1.1515151515151516e-06,
"loss": 0.7336,
"step": 38
},
{
"epoch": 0.5853658536585366,
"grad_norm": 0.21183205584010478,
"learning_rate": 1.1818181818181818e-06,
"loss": 0.7406,
"step": 39
},
{
"epoch": 0.600375234521576,
"grad_norm": 0.20612576338064367,
"learning_rate": 1.2121212121212122e-06,
"loss": 0.7172,
"step": 40
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.20009218157286937,
"learning_rate": 1.2424242424242424e-06,
"loss": 0.7331,
"step": 41
},
{
"epoch": 0.6303939962476548,
"grad_norm": 0.20086489901884286,
"learning_rate": 1.2727272727272726e-06,
"loss": 0.7206,
"step": 42
},
{
"epoch": 0.6454033771106942,
"grad_norm": 0.19327701765033134,
"learning_rate": 1.303030303030303e-06,
"loss": 0.7264,
"step": 43
},
{
"epoch": 0.6604127579737336,
"grad_norm": 0.18735374384890305,
"learning_rate": 1.3333333333333332e-06,
"loss": 0.7073,
"step": 44
},
{
"epoch": 0.6754221388367729,
"grad_norm": 0.17683867740993736,
"learning_rate": 1.3636363636363634e-06,
"loss": 0.6931,
"step": 45
},
{
"epoch": 0.6904315196998124,
"grad_norm": 0.17198229254906564,
"learning_rate": 1.393939393939394e-06,
"loss": 0.698,
"step": 46
},
{
"epoch": 0.7054409005628518,
"grad_norm": 0.16380634624432175,
"learning_rate": 1.4242424242424242e-06,
"loss": 0.6903,
"step": 47
},
{
"epoch": 0.7204502814258912,
"grad_norm": 0.14953817712425876,
"learning_rate": 1.4545454545454544e-06,
"loss": 0.6771,
"step": 48
},
{
"epoch": 0.7354596622889306,
"grad_norm": 0.14120367016713395,
"learning_rate": 1.4848484848484848e-06,
"loss": 0.6689,
"step": 49
},
{
"epoch": 0.7504690431519699,
"grad_norm": 0.13232673022559538,
"learning_rate": 1.515151515151515e-06,
"loss": 0.6748,
"step": 50
},
{
"epoch": 0.7654784240150094,
"grad_norm": 0.12723197101176636,
"learning_rate": 1.5454545454545454e-06,
"loss": 0.6612,
"step": 51
},
{
"epoch": 0.7804878048780488,
"grad_norm": 0.12474022700537914,
"learning_rate": 1.5757575757575756e-06,
"loss": 0.6458,
"step": 52
},
{
"epoch": 0.7954971857410882,
"grad_norm": 0.12420274477384924,
"learning_rate": 1.6060606060606058e-06,
"loss": 0.6529,
"step": 53
},
{
"epoch": 0.8105065666041276,
"grad_norm": 0.12270466802134104,
"learning_rate": 1.6363636363636365e-06,
"loss": 0.6475,
"step": 54
},
{
"epoch": 0.8255159474671669,
"grad_norm": 0.12049286207469485,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.6359,
"step": 55
},
{
"epoch": 0.8405253283302064,
"grad_norm": 0.11526479585742994,
"learning_rate": 1.6969696969696969e-06,
"loss": 0.6261,
"step": 56
},
{
"epoch": 0.8555347091932458,
"grad_norm": 0.11592626217416292,
"learning_rate": 1.7272727272727273e-06,
"loss": 0.627,
"step": 57
},
{
"epoch": 0.8705440900562852,
"grad_norm": 0.11594477634938592,
"learning_rate": 1.7575757575757575e-06,
"loss": 0.6244,
"step": 58
},
{
"epoch": 0.8855534709193246,
"grad_norm": 0.11313778567858399,
"learning_rate": 1.7878787878787877e-06,
"loss": 0.6317,
"step": 59
},
{
"epoch": 0.900562851782364,
"grad_norm": 0.11023173423057069,
"learning_rate": 1.818181818181818e-06,
"loss": 0.6248,
"step": 60
},
{
"epoch": 0.9155722326454033,
"grad_norm": 0.10740667281307065,
"learning_rate": 1.8484848484848483e-06,
"loss": 0.621,
"step": 61
},
{
"epoch": 0.9305816135084428,
"grad_norm": 0.10061348969269865,
"learning_rate": 1.878787878787879e-06,
"loss": 0.6182,
"step": 62
},
{
"epoch": 0.9455909943714822,
"grad_norm": 0.09404279395367166,
"learning_rate": 1.909090909090909e-06,
"loss": 0.6068,
"step": 63
},
{
"epoch": 0.9606003752345216,
"grad_norm": 0.09335512170262361,
"learning_rate": 1.9393939393939395e-06,
"loss": 0.6114,
"step": 64
},
{
"epoch": 0.975609756097561,
"grad_norm": 0.08836932387148118,
"learning_rate": 1.9696969696969695e-06,
"loss": 0.5959,
"step": 65
},
{
"epoch": 0.9906191369606003,
"grad_norm": 0.08549247801026265,
"learning_rate": 2e-06,
"loss": 0.6113,
"step": 66
},
{
"epoch": 0.9906191369606003,
"eval_loss": 0.5765168070793152,
"eval_runtime": 13.7554,
"eval_samples_per_second": 32.496,
"eval_steps_per_second": 2.036,
"step": 66
},
{
"epoch": 1.0,
"grad_norm": 0.08549247801026265,
"learning_rate": 1.999996861844573e-06,
"loss": 0.4985,
"step": 67
},
{
"epoch": 1.0150093808630394,
"grad_norm": 0.09984581949422254,
"learning_rate": 1.999987447397988e-06,
"loss": 0.6663,
"step": 68
},
{
"epoch": 1.0300187617260788,
"grad_norm": 0.07132682896345384,
"learning_rate": 1.9999717567193325e-06,
"loss": 0.5697,
"step": 69
},
{
"epoch": 1.0450281425891181,
"grad_norm": 0.06981020986612589,
"learning_rate": 1.999949789907087e-06,
"loss": 0.5768,
"step": 70
},
{
"epoch": 1.0600375234521575,
"grad_norm": 0.06754090898889059,
"learning_rate": 1.9999215470991215e-06,
"loss": 0.5828,
"step": 71
},
{
"epoch": 1.075046904315197,
"grad_norm": 0.06243964561211776,
"learning_rate": 1.9998870284726965e-06,
"loss": 0.5694,
"step": 72
},
{
"epoch": 1.0900562851782365,
"grad_norm": 0.061009810010117634,
"learning_rate": 1.999846234244462e-06,
"loss": 0.5727,
"step": 73
},
{
"epoch": 1.1050656660412759,
"grad_norm": 0.056185784091092324,
"learning_rate": 1.999799164670455e-06,
"loss": 0.5607,
"step": 74
},
{
"epoch": 1.1200750469043153,
"grad_norm": 0.055842108965366594,
"learning_rate": 1.9997458200460992e-06,
"loss": 0.5521,
"step": 75
},
{
"epoch": 1.1350844277673546,
"grad_norm": 0.05187749252022605,
"learning_rate": 1.999686200706201e-06,
"loss": 0.5724,
"step": 76
},
{
"epoch": 1.150093808630394,
"grad_norm": 0.051625347997863995,
"learning_rate": 1.9996203070249514e-06,
"loss": 0.5566,
"step": 77
},
{
"epoch": 1.1651031894934334,
"grad_norm": 0.04854987918067284,
"learning_rate": 1.9995481394159185e-06,
"loss": 0.5444,
"step": 78
},
{
"epoch": 1.1801125703564728,
"grad_norm": 0.046725939692484605,
"learning_rate": 1.999469698332049e-06,
"loss": 0.5452,
"step": 79
},
{
"epoch": 1.1951219512195121,
"grad_norm": 0.04651989075072082,
"learning_rate": 1.9993849842656634e-06,
"loss": 0.5533,
"step": 80
},
{
"epoch": 1.2101313320825515,
"grad_norm": 0.04530133654545522,
"learning_rate": 1.9992939977484538e-06,
"loss": 0.5446,
"step": 81
},
{
"epoch": 1.225140712945591,
"grad_norm": 0.04348117767209829,
"learning_rate": 1.99919673935148e-06,
"loss": 0.5518,
"step": 82
},
{
"epoch": 1.2401500938086305,
"grad_norm": 0.041929135982438165,
"learning_rate": 1.999093209685165e-06,
"loss": 0.5669,
"step": 83
},
{
"epoch": 1.2551594746716699,
"grad_norm": 0.04245022581761594,
"learning_rate": 1.9989834093992944e-06,
"loss": 0.5217,
"step": 84
},
{
"epoch": 1.2701688555347093,
"grad_norm": 0.03980799680014109,
"learning_rate": 1.998867339183008e-06,
"loss": 0.5429,
"step": 85
},
{
"epoch": 1.2851782363977486,
"grad_norm": 0.04050731565010284,
"learning_rate": 1.9987449997647986e-06,
"loss": 0.5277,
"step": 86
},
{
"epoch": 1.300187617260788,
"grad_norm": 0.03868146463623856,
"learning_rate": 1.9986163919125074e-06,
"loss": 0.5471,
"step": 87
},
{
"epoch": 1.3151969981238274,
"grad_norm": 0.038497979471297274,
"learning_rate": 1.998481516433316e-06,
"loss": 0.5444,
"step": 88
},
{
"epoch": 1.3302063789868668,
"grad_norm": 0.03793331778602445,
"learning_rate": 1.998340374173746e-06,
"loss": 0.5443,
"step": 89
},
{
"epoch": 1.3452157598499062,
"grad_norm": 0.037330687993704544,
"learning_rate": 1.998192966019649e-06,
"loss": 0.5397,
"step": 90
},
{
"epoch": 1.3602251407129455,
"grad_norm": 0.036291421015624784,
"learning_rate": 1.998039292896205e-06,
"loss": 0.5275,
"step": 91
},
{
"epoch": 1.375234521575985,
"grad_norm": 0.035032791533978855,
"learning_rate": 1.9978793557679143e-06,
"loss": 0.5219,
"step": 92
},
{
"epoch": 1.3902439024390243,
"grad_norm": 0.034627794718902295,
"learning_rate": 1.9977131556385916e-06,
"loss": 0.5383,
"step": 93
},
{
"epoch": 1.4052532833020637,
"grad_norm": 0.03451383360226329,
"learning_rate": 1.9975406935513613e-06,
"loss": 0.5301,
"step": 94
},
{
"epoch": 1.4202626641651033,
"grad_norm": 0.03375702739265765,
"learning_rate": 1.9973619705886486e-06,
"loss": 0.5358,
"step": 95
},
{
"epoch": 1.4352720450281427,
"grad_norm": 0.03420334157092594,
"learning_rate": 1.9971769878721743e-06,
"loss": 0.5308,
"step": 96
},
{
"epoch": 1.450281425891182,
"grad_norm": 0.032291163299197616,
"learning_rate": 1.9969857465629473e-06,
"loss": 0.5318,
"step": 97
},
{
"epoch": 1.4652908067542214,
"grad_norm": 0.03307719877411306,
"learning_rate": 1.996788247861258e-06,
"loss": 0.5304,
"step": 98
},
{
"epoch": 1.4803001876172608,
"grad_norm": 0.032040033737229634,
"learning_rate": 1.9965844930066696e-06,
"loss": 0.5132,
"step": 99
},
{
"epoch": 1.4953095684803002,
"grad_norm": 0.030182728005005464,
"learning_rate": 1.9963744832780105e-06,
"loss": 0.5148,
"step": 100
},
{
"epoch": 1.5103189493433395,
"grad_norm": 0.029981506115315602,
"learning_rate": 1.996158219993368e-06,
"loss": 0.5229,
"step": 101
},
{
"epoch": 1.5253283302063791,
"grad_norm": 0.030487016777282053,
"learning_rate": 1.995935704510076e-06,
"loss": 0.5105,
"step": 102
},
{
"epoch": 1.5403377110694185,
"grad_norm": 0.029784594581146837,
"learning_rate": 1.995706938224712e-06,
"loss": 0.5204,
"step": 103
},
{
"epoch": 1.555347091932458,
"grad_norm": 0.029409093062428185,
"learning_rate": 1.9954719225730845e-06,
"loss": 0.5192,
"step": 104
},
{
"epoch": 1.5703564727954973,
"grad_norm": 0.028177419957318213,
"learning_rate": 1.995230659030224e-06,
"loss": 0.5163,
"step": 105
},
{
"epoch": 1.5853658536585367,
"grad_norm": 0.02727561948703477,
"learning_rate": 1.994983149110376e-06,
"loss": 0.5088,
"step": 106
},
{
"epoch": 1.600375234521576,
"grad_norm": 0.027757151756443717,
"learning_rate": 1.99472939436699e-06,
"loss": 0.5239,
"step": 107
},
{
"epoch": 1.6153846153846154,
"grad_norm": 0.02681869602530872,
"learning_rate": 1.994469396392709e-06,
"loss": 0.5193,
"step": 108
},
{
"epoch": 1.6303939962476548,
"grad_norm": 0.02805316201412264,
"learning_rate": 1.9942031568193616e-06,
"loss": 0.508,
"step": 109
},
{
"epoch": 1.6454033771106942,
"grad_norm": 0.026206675896448955,
"learning_rate": 1.9939306773179494e-06,
"loss": 0.5161,
"step": 110
},
{
"epoch": 1.6604127579737336,
"grad_norm": 0.02633552856537573,
"learning_rate": 1.9936519595986392e-06,
"loss": 0.513,
"step": 111
},
{
"epoch": 1.675422138836773,
"grad_norm": 0.02671752302721344,
"learning_rate": 1.9933670054107495e-06,
"loss": 0.5059,
"step": 112
},
{
"epoch": 1.6904315196998123,
"grad_norm": 0.02608289502544131,
"learning_rate": 1.993075816542742e-06,
"loss": 0.5155,
"step": 113
},
{
"epoch": 1.7054409005628517,
"grad_norm": 0.025066926457861842,
"learning_rate": 1.992778394822208e-06,
"loss": 0.5159,
"step": 114
},
{
"epoch": 1.720450281425891,
"grad_norm": 0.025549312802280857,
"learning_rate": 1.992474742115859e-06,
"loss": 0.5069,
"step": 115
},
{
"epoch": 1.7354596622889304,
"grad_norm": 0.024567185837752966,
"learning_rate": 1.9921648603295138e-06,
"loss": 0.5088,
"step": 116
},
{
"epoch": 1.7504690431519698,
"grad_norm": 0.024409380976663946,
"learning_rate": 1.9918487514080866e-06,
"loss": 0.5065,
"step": 117
},
{
"epoch": 1.7654784240150094,
"grad_norm": 0.024237188144454417,
"learning_rate": 1.991526417335575e-06,
"loss": 0.5185,
"step": 118
},
{
"epoch": 1.7804878048780488,
"grad_norm": 0.024430013424038537,
"learning_rate": 1.9911978601350483e-06,
"loss": 0.4929,
"step": 119
},
{
"epoch": 1.7954971857410882,
"grad_norm": 0.024004085443342052,
"learning_rate": 1.9908630818686336e-06,
"loss": 0.4931,
"step": 120
},
{
"epoch": 1.8105065666041276,
"grad_norm": 0.023061733754567375,
"learning_rate": 1.990522084637503e-06,
"loss": 0.497,
"step": 121
},
{
"epoch": 1.825515947467167,
"grad_norm": 0.02280013753274718,
"learning_rate": 1.990174870581862e-06,
"loss": 0.5013,
"step": 122
},
{
"epoch": 1.8405253283302065,
"grad_norm": 0.022535127183177055,
"learning_rate": 1.9898214418809326e-06,
"loss": 0.4992,
"step": 123
},
{
"epoch": 1.855534709193246,
"grad_norm": 0.02279479434548679,
"learning_rate": 1.989461800752944e-06,
"loss": 0.5034,
"step": 124
},
{
"epoch": 1.8705440900562853,
"grad_norm": 0.022737894727696523,
"learning_rate": 1.989095949455116e-06,
"loss": 0.5041,
"step": 125
},
{
"epoch": 1.8855534709193247,
"grad_norm": 0.02213733247427209,
"learning_rate": 1.988723890283645e-06,
"loss": 0.5064,
"step": 126
},
{
"epoch": 1.900562851782364,
"grad_norm": 0.022001719966808008,
"learning_rate": 1.988345625573689e-06,
"loss": 0.4938,
"step": 127
},
{
"epoch": 1.9155722326454034,
"grad_norm": 0.021997816422623415,
"learning_rate": 1.9879611576993556e-06,
"loss": 0.4975,
"step": 128
},
{
"epoch": 1.9305816135084428,
"grad_norm": 0.021825994539949378,
"learning_rate": 1.987570489073685e-06,
"loss": 0.4953,
"step": 129
},
{
"epoch": 1.9455909943714822,
"grad_norm": 0.021569132172669345,
"learning_rate": 1.9871736221486344e-06,
"loss": 0.4866,
"step": 130
},
{
"epoch": 1.9606003752345216,
"grad_norm": 0.021163833610375136,
"learning_rate": 1.9867705594150646e-06,
"loss": 0.489,
"step": 131
},
{
"epoch": 1.975609756097561,
"grad_norm": 0.020950382903110992,
"learning_rate": 1.9863613034027223e-06,
"loss": 0.4911,
"step": 132
},
{
"epoch": 1.975609756097561,
"eval_loss": 0.47668132185935974,
"eval_runtime": 13.9051,
"eval_samples_per_second": 32.146,
"eval_steps_per_second": 2.014,
"step": 132
},
{
"epoch": 1.9906191369606003,
"grad_norm": 0.020557758708440663,
"learning_rate": 1.9859458566802253e-06,
"loss": 0.4948,
"step": 133
},
{
"epoch": 2.0,
"grad_norm": 0.023846854436759164,
"learning_rate": 1.9855242218550463e-06,
"loss": 0.479,
"step": 134
},
{
"epoch": 2.0150093808630394,
"grad_norm": 0.02517578701828216,
"learning_rate": 1.9850964015734966e-06,
"loss": 0.5028,
"step": 135
},
{
"epoch": 2.0300187617260788,
"grad_norm": 0.02174554797483234,
"learning_rate": 1.9846623985207097e-06,
"loss": 0.5053,
"step": 136
},
{
"epoch": 2.045028142589118,
"grad_norm": 0.02084416259303054,
"learning_rate": 1.9842222154206232e-06,
"loss": 0.4962,
"step": 137
},
{
"epoch": 2.0600375234521575,
"grad_norm": 0.019786665839116563,
"learning_rate": 1.9837758550359635e-06,
"loss": 0.4891,
"step": 138
},
{
"epoch": 2.075046904315197,
"grad_norm": 0.02046650546146805,
"learning_rate": 1.9833233201682263e-06,
"loss": 0.4989,
"step": 139
},
{
"epoch": 2.0900562851782363,
"grad_norm": 0.019907238202250363,
"learning_rate": 1.982864613657662e-06,
"loss": 0.4775,
"step": 140
},
{
"epoch": 2.1050656660412757,
"grad_norm": 0.019815490776120104,
"learning_rate": 1.982399738383255e-06,
"loss": 0.4897,
"step": 141
},
{
"epoch": 2.120075046904315,
"grad_norm": 0.01918425758934218,
"learning_rate": 1.9819286972627067e-06,
"loss": 0.4972,
"step": 142
},
{
"epoch": 2.1350844277673544,
"grad_norm": 0.019994680552468825,
"learning_rate": 1.9814514932524176e-06,
"loss": 0.4951,
"step": 143
},
{
"epoch": 2.150093808630394,
"grad_norm": 0.019278973829129135,
"learning_rate": 1.980968129347469e-06,
"loss": 0.4809,
"step": 144
},
{
"epoch": 2.1651031894934336,
"grad_norm": 0.0192577035975134,
"learning_rate": 1.9804786085816027e-06,
"loss": 0.4909,
"step": 145
},
{
"epoch": 2.180112570356473,
"grad_norm": 0.018971731946795373,
"learning_rate": 1.979982934027203e-06,
"loss": 0.4804,
"step": 146
},
{
"epoch": 2.1951219512195124,
"grad_norm": 0.019218520844467814,
"learning_rate": 1.979481108795278e-06,
"loss": 0.4886,
"step": 147
},
{
"epoch": 2.2101313320825517,
"grad_norm": 0.018888083378158605,
"learning_rate": 1.9789731360354377e-06,
"loss": 0.4884,
"step": 148
},
{
"epoch": 2.225140712945591,
"grad_norm": 0.019795240765748786,
"learning_rate": 1.9784590189358786e-06,
"loss": 0.4918,
"step": 149
},
{
"epoch": 2.2401500938086305,
"grad_norm": 0.018701780342971427,
"learning_rate": 1.9779387607233582e-06,
"loss": 0.4837,
"step": 150
},
{
"epoch": 2.25515947467167,
"grad_norm": 0.01883414126738354,
"learning_rate": 1.9774123646631797e-06,
"loss": 0.4856,
"step": 151
},
{
"epoch": 2.2701688555347093,
"grad_norm": 0.018749906068093083,
"learning_rate": 1.9768798340591678e-06,
"loss": 0.4765,
"step": 152
},
{
"epoch": 2.2851782363977486,
"grad_norm": 0.018730040963488046,
"learning_rate": 1.9763411722536503e-06,
"loss": 0.4845,
"step": 153
},
{
"epoch": 2.300187617260788,
"grad_norm": 0.018684796471871466,
"learning_rate": 1.9757963826274354e-06,
"loss": 0.4822,
"step": 154
},
{
"epoch": 2.3151969981238274,
"grad_norm": 0.018465641278047874,
"learning_rate": 1.9752454685997933e-06,
"loss": 0.4828,
"step": 155
},
{
"epoch": 2.3302063789868668,
"grad_norm": 0.018220085507859175,
"learning_rate": 1.9746884336284313e-06,
"loss": 0.4838,
"step": 156
},
{
"epoch": 2.345215759849906,
"grad_norm": 0.01849417235430084,
"learning_rate": 1.974125281209474e-06,
"loss": 0.4953,
"step": 157
},
{
"epoch": 2.3602251407129455,
"grad_norm": 0.018786012040010656,
"learning_rate": 1.973556014877441e-06,
"loss": 0.4928,
"step": 158
},
{
"epoch": 2.375234521575985,
"grad_norm": 0.017733412017590936,
"learning_rate": 1.972980638205225e-06,
"loss": 0.4798,
"step": 159
},
{
"epoch": 2.3902439024390243,
"grad_norm": 0.017824181403243117,
"learning_rate": 1.972399154804068e-06,
"loss": 0.4844,
"step": 160
},
{
"epoch": 2.4052532833020637,
"grad_norm": 0.018065420440970443,
"learning_rate": 1.9718115683235415e-06,
"loss": 0.4666,
"step": 161
},
{
"epoch": 2.420262664165103,
"grad_norm": 0.017368916708408812,
"learning_rate": 1.971217882451521e-06,
"loss": 0.4875,
"step": 162
},
{
"epoch": 2.4352720450281424,
"grad_norm": 0.018070325703733577,
"learning_rate": 1.9706181009141627e-06,
"loss": 0.474,
"step": 163
},
{
"epoch": 2.450281425891182,
"grad_norm": 0.017332167220279988,
"learning_rate": 1.9700122274758824e-06,
"loss": 0.48,
"step": 164
},
{
"epoch": 2.465290806754221,
"grad_norm": 0.017274769746170343,
"learning_rate": 1.9694002659393305e-06,
"loss": 0.4771,
"step": 165
},
{
"epoch": 2.480300187617261,
"grad_norm": 0.01814722046626225,
"learning_rate": 1.9687822201453674e-06,
"loss": 0.4848,
"step": 166
},
{
"epoch": 2.4953095684803,
"grad_norm": 0.017408741457802464,
"learning_rate": 1.9681580939730405e-06,
"loss": 0.4827,
"step": 167
},
{
"epoch": 2.5103189493433398,
"grad_norm": 0.01768240964560969,
"learning_rate": 1.96752789133956e-06,
"loss": 0.4794,
"step": 168
},
{
"epoch": 2.525328330206379,
"grad_norm": 0.017154935951961867,
"learning_rate": 1.9668916162002736e-06,
"loss": 0.4693,
"step": 169
},
{
"epoch": 2.5403377110694185,
"grad_norm": 0.01802153159416841,
"learning_rate": 1.966249272548642e-06,
"loss": 0.4777,
"step": 170
},
{
"epoch": 2.555347091932458,
"grad_norm": 0.01695693908189586,
"learning_rate": 1.965600864416213e-06,
"loss": 0.4759,
"step": 171
},
{
"epoch": 2.5703564727954973,
"grad_norm": 0.017778123189151847,
"learning_rate": 1.964946395872598e-06,
"loss": 0.4741,
"step": 172
},
{
"epoch": 2.5853658536585367,
"grad_norm": 0.01689429975438397,
"learning_rate": 1.964285871025445e-06,
"loss": 0.478,
"step": 173
},
{
"epoch": 2.600375234521576,
"grad_norm": 0.01649235549328338,
"learning_rate": 1.963619294020413e-06,
"loss": 0.4813,
"step": 174
},
{
"epoch": 2.6153846153846154,
"grad_norm": 0.017089151903788,
"learning_rate": 1.9629466690411472e-06,
"loss": 0.4655,
"step": 175
},
{
"epoch": 2.630393996247655,
"grad_norm": 0.01640933754335135,
"learning_rate": 1.9622680003092503e-06,
"loss": 0.4707,
"step": 176
},
{
"epoch": 2.645403377110694,
"grad_norm": 0.016910827126253334,
"learning_rate": 1.9615832920842585e-06,
"loss": 0.4746,
"step": 177
},
{
"epoch": 2.6604127579737336,
"grad_norm": 0.016866233985154282,
"learning_rate": 1.9608925486636137e-06,
"loss": 0.4779,
"step": 178
},
{
"epoch": 2.675422138836773,
"grad_norm": 0.016909921106817608,
"learning_rate": 1.9601957743826357e-06,
"loss": 0.4746,
"step": 179
},
{
"epoch": 2.6904315196998123,
"grad_norm": 0.0168852778181624,
"learning_rate": 1.9594929736144973e-06,
"loss": 0.4689,
"step": 180
},
{
"epoch": 2.7054409005628517,
"grad_norm": 0.016401176940009726,
"learning_rate": 1.958784150770194e-06,
"loss": 0.4797,
"step": 181
},
{
"epoch": 2.720450281425891,
"grad_norm": 0.016798403098782076,
"learning_rate": 1.9580693102985183e-06,
"loss": 0.4857,
"step": 182
},
{
"epoch": 2.7354596622889304,
"grad_norm": 0.016774376394434458,
"learning_rate": 1.9573484566860318e-06,
"loss": 0.4778,
"step": 183
},
{
"epoch": 2.75046904315197,
"grad_norm": 0.01629889660789357,
"learning_rate": 1.956621594457035e-06,
"loss": 0.4732,
"step": 184
},
{
"epoch": 2.7654784240150097,
"grad_norm": 0.016664693825275887,
"learning_rate": 1.955888728173542e-06,
"loss": 0.4748,
"step": 185
},
{
"epoch": 2.7804878048780486,
"grad_norm": 0.016438663495088306,
"learning_rate": 1.9551498624352495e-06,
"loss": 0.4692,
"step": 186
},
{
"epoch": 2.7954971857410884,
"grad_norm": 0.01622929505747816,
"learning_rate": 1.9544050018795075e-06,
"loss": 0.4596,
"step": 187
},
{
"epoch": 2.8105065666041273,
"grad_norm": 0.01617655541567404,
"learning_rate": 1.953654151181293e-06,
"loss": 0.4692,
"step": 188
},
{
"epoch": 2.825515947467167,
"grad_norm": 0.016422497381573444,
"learning_rate": 1.9528973150531785e-06,
"loss": 0.4816,
"step": 189
},
{
"epoch": 2.8405253283302065,
"grad_norm": 0.016242133515846205,
"learning_rate": 1.9521344982453028e-06,
"loss": 0.461,
"step": 190
},
{
"epoch": 2.855534709193246,
"grad_norm": 0.01614541338103813,
"learning_rate": 1.951365705545341e-06,
"loss": 0.4648,
"step": 191
},
{
"epoch": 2.8705440900562853,
"grad_norm": 0.016610969075841926,
"learning_rate": 1.9505909417784754e-06,
"loss": 0.4821,
"step": 192
},
{
"epoch": 2.8855534709193247,
"grad_norm": 0.01603749843836089,
"learning_rate": 1.949810211807364e-06,
"loss": 0.4683,
"step": 193
},
{
"epoch": 2.900562851782364,
"grad_norm": 0.016182649712645568,
"learning_rate": 1.9490235205321113e-06,
"loss": 0.4711,
"step": 194
},
{
"epoch": 2.9155722326454034,
"grad_norm": 0.015691775043731006,
"learning_rate": 1.9482308728902354e-06,
"loss": 0.4679,
"step": 195
},
{
"epoch": 2.930581613508443,
"grad_norm": 0.01562069940201586,
"learning_rate": 1.94743227385664e-06,
"loss": 0.4635,
"step": 196
},
{
"epoch": 2.945590994371482,
"grad_norm": 0.016502000964567342,
"learning_rate": 1.946627728443581e-06,
"loss": 0.4846,
"step": 197
},
{
"epoch": 2.9606003752345216,
"grad_norm": 0.016314776580001914,
"learning_rate": 1.9458172417006346e-06,
"loss": 0.4687,
"step": 198
},
{
"epoch": 2.9606003752345216,
"eval_loss": 0.45064201951026917,
"eval_runtime": 13.829,
"eval_samples_per_second": 32.323,
"eval_steps_per_second": 2.025,
"step": 198
},
{
"epoch": 2.975609756097561,
"grad_norm": 0.016032669050809613,
"learning_rate": 1.945000818714668e-06,
"loss": 0.4671,
"step": 199
},
{
"epoch": 2.9906191369606003,
"grad_norm": 0.015653715448159893,
"learning_rate": 1.9441784646098063e-06,
"loss": 0.4711,
"step": 200
},
{
"epoch": 3.0,
"grad_norm": 0.02223419252139366,
"learning_rate": 1.9433501845473993e-06,
"loss": 0.4737,
"step": 201
},
{
"epoch": 3.0150093808630394,
"grad_norm": 0.01514004662536423,
"learning_rate": 1.942515983725989e-06,
"loss": 0.4623,
"step": 202
},
{
"epoch": 3.0300187617260788,
"grad_norm": 0.015881834319376502,
"learning_rate": 1.9416758673812807e-06,
"loss": 0.4644,
"step": 203
},
{
"epoch": 3.045028142589118,
"grad_norm": 0.015532270172868397,
"learning_rate": 1.940829840786104e-06,
"loss": 0.4661,
"step": 204
},
{
"epoch": 3.0600375234521575,
"grad_norm": 0.015406068622430446,
"learning_rate": 1.9399779092503866e-06,
"loss": 0.4739,
"step": 205
},
{
"epoch": 3.075046904315197,
"grad_norm": 0.015806639463125247,
"learning_rate": 1.9391200781211143e-06,
"loss": 0.4663,
"step": 206
},
{
"epoch": 3.0900562851782363,
"grad_norm": 0.015442763548195685,
"learning_rate": 1.9382563527823025e-06,
"loss": 0.4618,
"step": 207
},
{
"epoch": 3.1050656660412757,
"grad_norm": 0.016087586162934313,
"learning_rate": 1.93738673865496e-06,
"loss": 0.4768,
"step": 208
},
{
"epoch": 3.120075046904315,
"grad_norm": 0.015086642867178588,
"learning_rate": 1.9365112411970546e-06,
"loss": 0.4527,
"step": 209
},
{
"epoch": 3.1350844277673544,
"grad_norm": 0.015544622423445547,
"learning_rate": 1.9356298659034817e-06,
"loss": 0.4633,
"step": 210
},
{
"epoch": 3.150093808630394,
"grad_norm": 0.015639897610521446,
"learning_rate": 1.934742618306026e-06,
"loss": 0.4647,
"step": 211
},
{
"epoch": 3.1651031894934336,
"grad_norm": 0.015476405998347507,
"learning_rate": 1.9338495039733286e-06,
"loss": 0.4758,
"step": 212
},
{
"epoch": 3.180112570356473,
"grad_norm": 0.015338735369002602,
"learning_rate": 1.932950528510854e-06,
"loss": 0.4713,
"step": 213
},
{
"epoch": 3.1951219512195124,
"grad_norm": 0.015887835721317498,
"learning_rate": 1.932045697560851e-06,
"loss": 0.488,
"step": 214
},
{
"epoch": 3.2101313320825517,
"grad_norm": 0.015551083745535117,
"learning_rate": 1.9311350168023193e-06,
"loss": 0.4712,
"step": 215
},
{
"epoch": 3.225140712945591,
"grad_norm": 0.015325879160020314,
"learning_rate": 1.9302184919509753e-06,
"loss": 0.4608,
"step": 216
},
{
"epoch": 3.2401500938086305,
"grad_norm": 0.014851862570601724,
"learning_rate": 1.9292961287592137e-06,
"loss": 0.4584,
"step": 217
},
{
"epoch": 3.25515947467167,
"grad_norm": 0.01524320818707719,
"learning_rate": 1.9283679330160725e-06,
"loss": 0.4563,
"step": 218
},
{
"epoch": 3.2701688555347093,
"grad_norm": 0.01574419013401929,
"learning_rate": 1.9274339105471968e-06,
"loss": 0.4637,
"step": 219
},
{
"epoch": 3.2851782363977486,
"grad_norm": 0.014929249445401652,
"learning_rate": 1.9264940672148015e-06,
"loss": 0.4536,
"step": 220
},
{
"epoch": 3.300187617260788,
"grad_norm": 0.015275176183210167,
"learning_rate": 1.9255484089176364e-06,
"loss": 0.477,
"step": 221
},
{
"epoch": 3.3151969981238274,
"grad_norm": 0.014832301389772685,
"learning_rate": 1.924596941590946e-06,
"loss": 0.4545,
"step": 222
},
{
"epoch": 3.3302063789868668,
"grad_norm": 0.014806138439426077,
"learning_rate": 1.9236396712064356e-06,
"loss": 0.4564,
"step": 223
},
{
"epoch": 3.345215759849906,
"grad_norm": 0.015216973896443366,
"learning_rate": 1.9226766037722316e-06,
"loss": 0.4775,
"step": 224
},
{
"epoch": 3.3602251407129455,
"grad_norm": 0.015830800852046037,
"learning_rate": 1.9217077453328448e-06,
"loss": 0.4655,
"step": 225
},
{
"epoch": 3.375234521575985,
"grad_norm": 0.014954797566724949,
"learning_rate": 1.9207331019691313e-06,
"loss": 0.4683,
"step": 226
},
{
"epoch": 3.3902439024390243,
"grad_norm": 0.014824714915179381,
"learning_rate": 1.9197526797982563e-06,
"loss": 0.468,
"step": 227
},
{
"epoch": 3.4052532833020637,
"grad_norm": 0.015070859630471943,
"learning_rate": 1.918766484973654e-06,
"loss": 0.4508,
"step": 228
},
{
"epoch": 3.420262664165103,
"grad_norm": 0.01499772386383127,
"learning_rate": 1.9177745236849897e-06,
"loss": 0.4607,
"step": 229
},
{
"epoch": 3.4352720450281424,
"grad_norm": 0.01470349681548681,
"learning_rate": 1.9167768021581207e-06,
"loss": 0.4545,
"step": 230
},
{
"epoch": 3.450281425891182,
"grad_norm": 0.014758480658159613,
"learning_rate": 1.915773326655057e-06,
"loss": 0.453,
"step": 231
},
{
"epoch": 3.465290806754221,
"grad_norm": 0.01473967663449321,
"learning_rate": 1.9147641034739244e-06,
"loss": 0.4561,
"step": 232
},
{
"epoch": 3.480300187617261,
"grad_norm": 0.01500366500390119,
"learning_rate": 1.9137491389489197e-06,
"loss": 0.468,
"step": 233
},
{
"epoch": 3.4953095684803,
"grad_norm": 0.014848085370275077,
"learning_rate": 1.912728439450276e-06,
"loss": 0.4578,
"step": 234
},
{
"epoch": 3.5103189493433398,
"grad_norm": 0.014381703396358053,
"learning_rate": 1.9117020113842214e-06,
"loss": 0.454,
"step": 235
},
{
"epoch": 3.525328330206379,
"grad_norm": 0.015112883990438414,
"learning_rate": 1.910669861192937e-06,
"loss": 0.4568,
"step": 236
},
{
"epoch": 3.5403377110694185,
"grad_norm": 0.014957316535602687,
"learning_rate": 1.9096319953545185e-06,
"loss": 0.4587,
"step": 237
},
{
"epoch": 3.555347091932458,
"grad_norm": 0.014931146009771568,
"learning_rate": 1.908588420382934e-06,
"loss": 0.4611,
"step": 238
},
{
"epoch": 3.5703564727954973,
"grad_norm": 0.014775742620555714,
"learning_rate": 1.9075391428279847e-06,
"loss": 0.4639,
"step": 239
},
{
"epoch": 3.5853658536585367,
"grad_norm": 0.014136489795191264,
"learning_rate": 1.906484169275263e-06,
"loss": 0.4479,
"step": 240
},
{
"epoch": 3.600375234521576,
"grad_norm": 0.014789494140792015,
"learning_rate": 1.9054235063461103e-06,
"loss": 0.4695,
"step": 241
},
{
"epoch": 3.6153846153846154,
"grad_norm": 0.014464423814392216,
"learning_rate": 1.9043571606975775e-06,
"loss": 0.4527,
"step": 242
},
{
"epoch": 3.630393996247655,
"grad_norm": 0.014608804721408216,
"learning_rate": 1.903285139022381e-06,
"loss": 0.464,
"step": 243
},
{
"epoch": 3.645403377110694,
"grad_norm": 0.014617662641472032,
"learning_rate": 1.9022074480488616e-06,
"loss": 0.4605,
"step": 244
},
{
"epoch": 3.6604127579737336,
"grad_norm": 0.01474231509935561,
"learning_rate": 1.901124094540944e-06,
"loss": 0.4494,
"step": 245
},
{
"epoch": 3.675422138836773,
"grad_norm": 0.014438782822529975,
"learning_rate": 1.9000350852980907e-06,
"loss": 0.4501,
"step": 246
},
{
"epoch": 3.6904315196998123,
"grad_norm": 0.015011699255171157,
"learning_rate": 1.8989404271552628e-06,
"loss": 0.474,
"step": 247
},
{
"epoch": 3.7054409005628517,
"grad_norm": 0.014342680013044425,
"learning_rate": 1.8978401269828743e-06,
"loss": 0.4448,
"step": 248
},
{
"epoch": 3.720450281425891,
"grad_norm": 0.014862003016937061,
"learning_rate": 1.8967341916867517e-06,
"loss": 0.4627,
"step": 249
},
{
"epoch": 3.7354596622889304,
"grad_norm": 0.014657009740811595,
"learning_rate": 1.8956226282080887e-06,
"loss": 0.4695,
"step": 250
},
{
"epoch": 3.75046904315197,
"grad_norm": 0.01399652480743858,
"learning_rate": 1.8945054435234032e-06,
"loss": 0.4485,
"step": 251
},
{
"epoch": 3.7654784240150097,
"grad_norm": 0.01482138592733111,
"learning_rate": 1.893382644644493e-06,
"loss": 0.4541,
"step": 252
},
{
"epoch": 3.7804878048780486,
"grad_norm": 0.014805529769546168,
"learning_rate": 1.8922542386183939e-06,
"loss": 0.4574,
"step": 253
},
{
"epoch": 3.7954971857410884,
"grad_norm": 0.014571190288480682,
"learning_rate": 1.8911202325273323e-06,
"loss": 0.4494,
"step": 254
},
{
"epoch": 3.8105065666041273,
"grad_norm": 0.0147815449251929,
"learning_rate": 1.8899806334886828e-06,
"loss": 0.4587,
"step": 255
},
{
"epoch": 3.825515947467167,
"grad_norm": 0.014250828964641306,
"learning_rate": 1.8888354486549234e-06,
"loss": 0.461,
"step": 256
},
{
"epoch": 3.8405253283302065,
"grad_norm": 0.014756398846796545,
"learning_rate": 1.8876846852135901e-06,
"loss": 0.4454,
"step": 257
},
{
"epoch": 3.855534709193246,
"grad_norm": 0.014425887354809732,
"learning_rate": 1.8865283503872323e-06,
"loss": 0.4514,
"step": 258
},
{
"epoch": 3.8705440900562853,
"grad_norm": 0.014515933982418967,
"learning_rate": 1.8853664514333661e-06,
"loss": 0.4674,
"step": 259
},
{
"epoch": 3.8855534709193247,
"grad_norm": 0.015102016577158591,
"learning_rate": 1.8841989956444309e-06,
"loss": 0.4681,
"step": 260
},
{
"epoch": 3.900562851782364,
"grad_norm": 0.01429047197261286,
"learning_rate": 1.8830259903477424e-06,
"loss": 0.4478,
"step": 261
},
{
"epoch": 3.9155722326454034,
"grad_norm": 0.014175055891186608,
"learning_rate": 1.881847442905446e-06,
"loss": 0.4466,
"step": 262
},
{
"epoch": 3.930581613508443,
"grad_norm": 0.014199706390703304,
"learning_rate": 1.8806633607144724e-06,
"loss": 0.4633,
"step": 263
},
{
"epoch": 3.945590994371482,
"grad_norm": 0.014134069587260203,
"learning_rate": 1.8794737512064888e-06,
"loss": 0.4622,
"step": 264
},
{
"epoch": 3.945590994371482,
"eval_loss": 0.4357408583164215,
"eval_runtime": 13.9645,
"eval_samples_per_second": 32.01,
"eval_steps_per_second": 2.005,
"step": 264
},
{
"epoch": 3.9606003752345216,
"grad_norm": 0.014023531184218459,
"learning_rate": 1.878278621847855e-06,
"loss": 0.4515,
"step": 265
},
{
"epoch": 3.975609756097561,
"grad_norm": 0.014386733226076575,
"learning_rate": 1.8770779801395738e-06,
"loss": 0.4509,
"step": 266
},
{
"epoch": 3.9906191369606003,
"grad_norm": 0.014531318566438902,
"learning_rate": 1.875871833617246e-06,
"loss": 0.4532,
"step": 267
},
{
"epoch": 4.01500938086304,
"grad_norm": 0.022435033190231806,
"learning_rate": 1.874660189851022e-06,
"loss": 0.901,
"step": 268
},
{
"epoch": 4.030018761726079,
"grad_norm": 0.014231420288400026,
"learning_rate": 1.8734430564455548e-06,
"loss": 0.4498,
"step": 269
},
{
"epoch": 4.045028142589119,
"grad_norm": 0.01459887414261473,
"learning_rate": 1.872220441039952e-06,
"loss": 0.4623,
"step": 270
},
{
"epoch": 4.0600375234521575,
"grad_norm": 0.014035710638968229,
"learning_rate": 1.870992351307728e-06,
"loss": 0.4531,
"step": 271
},
{
"epoch": 4.075046904315197,
"grad_norm": 0.01398312713702392,
"learning_rate": 1.8697587949567556e-06,
"loss": 0.4583,
"step": 272
},
{
"epoch": 4.090056285178236,
"grad_norm": 0.014204886625475947,
"learning_rate": 1.868519779729218e-06,
"loss": 0.4564,
"step": 273
},
{
"epoch": 4.105065666041276,
"grad_norm": 0.013937605618481593,
"learning_rate": 1.8672753134015595e-06,
"loss": 0.45,
"step": 274
},
{
"epoch": 4.120075046904315,
"grad_norm": 0.014035968722853485,
"learning_rate": 1.8660254037844386e-06,
"loss": 0.4539,
"step": 275
},
{
"epoch": 4.135084427767355,
"grad_norm": 0.013953280133352524,
"learning_rate": 1.8647700587226757e-06,
"loss": 0.4355,
"step": 276
},
{
"epoch": 4.150093808630394,
"grad_norm": 0.014026635079346822,
"learning_rate": 1.863509286095207e-06,
"loss": 0.4597,
"step": 277
},
{
"epoch": 4.165103189493434,
"grad_norm": 0.013907777893968047,
"learning_rate": 1.8622430938150336e-06,
"loss": 0.4572,
"step": 278
},
{
"epoch": 4.1801125703564725,
"grad_norm": 0.014404008458403212,
"learning_rate": 1.8609714898291714e-06,
"loss": 0.4463,
"step": 279
},
{
"epoch": 4.195121951219512,
"grad_norm": 0.014119288813333237,
"learning_rate": 1.8596944821186025e-06,
"loss": 0.4559,
"step": 280
},
{
"epoch": 4.210131332082551,
"grad_norm": 0.014044205401061443,
"learning_rate": 1.8584120786982243e-06,
"loss": 0.4456,
"step": 281
},
{
"epoch": 4.225140712945591,
"grad_norm": 0.014403222487556044,
"learning_rate": 1.8571242876167993e-06,
"loss": 0.4574,
"step": 282
},
{
"epoch": 4.24015009380863,
"grad_norm": 0.014536903944634817,
"learning_rate": 1.8558311169569046e-06,
"loss": 0.4509,
"step": 283
},
{
"epoch": 4.25515947467167,
"grad_norm": 0.013763721532947899,
"learning_rate": 1.8545325748348816e-06,
"loss": 0.4461,
"step": 284
},
{
"epoch": 4.270168855534709,
"grad_norm": 0.013895967925956643,
"learning_rate": 1.8532286694007836e-06,
"loss": 0.4554,
"step": 285
},
{
"epoch": 4.285178236397749,
"grad_norm": 0.01393914459390315,
"learning_rate": 1.851919408838327e-06,
"loss": 0.4397,
"step": 286
},
{
"epoch": 4.300187617260788,
"grad_norm": 0.013537134276235397,
"learning_rate": 1.850604801364838e-06,
"loss": 0.4562,
"step": 287
},
{
"epoch": 4.315196998123827,
"grad_norm": 0.014143471820575505,
"learning_rate": 1.8492848552312013e-06,
"loss": 0.4535,
"step": 288
},
{
"epoch": 4.330206378986867,
"grad_norm": 0.013836631457896802,
"learning_rate": 1.8479595787218098e-06,
"loss": 0.4429,
"step": 289
},
{
"epoch": 4.345215759849906,
"grad_norm": 0.013594205543129362,
"learning_rate": 1.8466289801545104e-06,
"loss": 0.4403,
"step": 290
},
{
"epoch": 4.360225140712946,
"grad_norm": 0.013860702615794385,
"learning_rate": 1.8452930678805533e-06,
"loss": 0.4474,
"step": 291
},
{
"epoch": 4.375234521575985,
"grad_norm": 0.014225127650714606,
"learning_rate": 1.8439518502845396e-06,
"loss": 0.4477,
"step": 292
},
{
"epoch": 4.390243902439025,
"grad_norm": 0.014938943538672861,
"learning_rate": 1.8426053357843677e-06,
"loss": 0.449,
"step": 293
},
{
"epoch": 4.405253283302064,
"grad_norm": 0.014757581146971215,
"learning_rate": 1.8412535328311812e-06,
"loss": 0.4296,
"step": 294
},
{
"epoch": 4.4202626641651035,
"grad_norm": 0.013546168269445344,
"learning_rate": 1.8398964499093152e-06,
"loss": 0.4582,
"step": 295
},
{
"epoch": 4.435272045028142,
"grad_norm": 0.01407135906976384,
"learning_rate": 1.8385340955362445e-06,
"loss": 0.4526,
"step": 296
},
{
"epoch": 4.450281425891182,
"grad_norm": 0.014172740828230739,
"learning_rate": 1.8371664782625285e-06,
"loss": 0.4488,
"step": 297
},
{
"epoch": 4.465290806754221,
"grad_norm": 0.01412396469645932,
"learning_rate": 1.8357936066717583e-06,
"loss": 0.444,
"step": 298
},
{
"epoch": 4.480300187617261,
"grad_norm": 0.014088947324034075,
"learning_rate": 1.8344154893805026e-06,
"loss": 0.4381,
"step": 299
},
{
"epoch": 4.4953095684803,
"grad_norm": 0.014071918393412349,
"learning_rate": 1.8330321350382542e-06,
"loss": 0.4564,
"step": 300
},
{
"epoch": 4.51031894934334,
"grad_norm": 0.01403461284817204,
"learning_rate": 1.831643552327375e-06,
"loss": 0.4526,
"step": 301
},
{
"epoch": 4.525328330206379,
"grad_norm": 0.014031997266113018,
"learning_rate": 1.8302497499630413e-06,
"loss": 0.436,
"step": 302
},
{
"epoch": 4.5403377110694185,
"grad_norm": 0.013947785382431976,
"learning_rate": 1.8288507366931904e-06,
"loss": 0.4543,
"step": 303
},
{
"epoch": 4.5553470919324575,
"grad_norm": 0.013628442283807026,
"learning_rate": 1.8274465212984645e-06,
"loss": 0.4493,
"step": 304
},
{
"epoch": 4.570356472795497,
"grad_norm": 0.013780622075694634,
"learning_rate": 1.8260371125921558e-06,
"loss": 0.4541,
"step": 305
},
{
"epoch": 4.585365853658536,
"grad_norm": 0.01367940406253203,
"learning_rate": 1.8246225194201513e-06,
"loss": 0.4497,
"step": 306
},
{
"epoch": 4.600375234521576,
"grad_norm": 0.014142512306958784,
"learning_rate": 1.8232027506608778e-06,
"loss": 0.4499,
"step": 307
},
{
"epoch": 4.615384615384615,
"grad_norm": 0.013824356322542732,
"learning_rate": 1.821777815225245e-06,
"loss": 0.4451,
"step": 308
},
{
"epoch": 4.630393996247655,
"grad_norm": 0.013296623575056241,
"learning_rate": 1.820347722056591e-06,
"loss": 0.4512,
"step": 309
},
{
"epoch": 4.645403377110695,
"grad_norm": 0.01368390848987212,
"learning_rate": 1.818912480130625e-06,
"loss": 0.4451,
"step": 310
},
{
"epoch": 4.6604127579737336,
"grad_norm": 0.013358544096664208,
"learning_rate": 1.8174720984553712e-06,
"loss": 0.4454,
"step": 311
},
{
"epoch": 4.6754221388367725,
"grad_norm": 0.014400294321699255,
"learning_rate": 1.8160265860711132e-06,
"loss": 0.4425,
"step": 312
},
{
"epoch": 4.690431519699812,
"grad_norm": 0.013756109715941985,
"learning_rate": 1.8145759520503357e-06,
"loss": 0.4531,
"step": 313
},
{
"epoch": 4.705440900562852,
"grad_norm": 0.013628586470860909,
"learning_rate": 1.8131202054976687e-06,
"loss": 0.4425,
"step": 314
},
{
"epoch": 4.720450281425891,
"grad_norm": 0.013358205799936783,
"learning_rate": 1.8116593555498305e-06,
"loss": 0.4508,
"step": 315
},
{
"epoch": 4.735459662288931,
"grad_norm": 0.014217584611552704,
"learning_rate": 1.810193411375569e-06,
"loss": 0.4537,
"step": 316
},
{
"epoch": 4.75046904315197,
"grad_norm": 0.013645086599534274,
"learning_rate": 1.808722382175606e-06,
"loss": 0.4478,
"step": 317
},
{
"epoch": 4.76547842401501,
"grad_norm": 0.014072448637602094,
"learning_rate": 1.8072462771825778e-06,
"loss": 0.4518,
"step": 318
},
{
"epoch": 4.780487804878049,
"grad_norm": 0.013395441388107984,
"learning_rate": 1.8057651056609782e-06,
"loss": 0.4428,
"step": 319
},
{
"epoch": 4.795497185741088,
"grad_norm": 0.013975384875639913,
"learning_rate": 1.8042788769070997e-06,
"loss": 0.4451,
"step": 320
},
{
"epoch": 4.810506566604127,
"grad_norm": 0.013421942808358008,
"learning_rate": 1.802787600248977e-06,
"loss": 0.4375,
"step": 321
},
{
"epoch": 4.825515947467167,
"grad_norm": 0.014009319367795352,
"learning_rate": 1.8012912850463247e-06,
"loss": 0.454,
"step": 322
},
{
"epoch": 4.840525328330206,
"grad_norm": 0.013790459864287155,
"learning_rate": 1.7997899406904833e-06,
"loss": 0.4454,
"step": 323
},
{
"epoch": 4.855534709193246,
"grad_norm": 0.013608907293000358,
"learning_rate": 1.7982835766043558e-06,
"loss": 0.4428,
"step": 324
},
{
"epoch": 4.870544090056285,
"grad_norm": 0.013990463928312293,
"learning_rate": 1.7967722022423519e-06,
"loss": 0.4442,
"step": 325
},
{
"epoch": 4.885553470919325,
"grad_norm": 0.013540266851849116,
"learning_rate": 1.795255827090327e-06,
"loss": 0.4498,
"step": 326
},
{
"epoch": 4.900562851782364,
"grad_norm": 0.013504467127123212,
"learning_rate": 1.7937344606655226e-06,
"loss": 0.4484,
"step": 327
},
{
"epoch": 4.915572232645403,
"grad_norm": 0.013720590668768537,
"learning_rate": 1.7922081125165075e-06,
"loss": 0.4375,
"step": 328
},
{
"epoch": 4.930581613508442,
"grad_norm": 0.013786451177835153,
"learning_rate": 1.7906767922231171e-06,
"loss": 0.4366,
"step": 329
},
{
"epoch": 4.945590994371482,
"grad_norm": 0.013176942655191242,
"learning_rate": 1.7891405093963937e-06,
"loss": 0.4505,
"step": 330
},
{
"epoch": 4.945590994371482,
"eval_loss": 0.4254697263240814,
"eval_runtime": 13.8629,
"eval_samples_per_second": 32.244,
"eval_steps_per_second": 2.02,
"step": 330
},
{
"epoch": 4.960600375234522,
"grad_norm": 0.013540995677048154,
"learning_rate": 1.7875992736785255e-06,
"loss": 0.4364,
"step": 331
},
{
"epoch": 4.975609756097561,
"grad_norm": 0.013148307270088234,
"learning_rate": 1.7860530947427874e-06,
"loss": 0.4234,
"step": 332
},
{
"epoch": 4.9906191369606,
"grad_norm": 0.013580201172669656,
"learning_rate": 1.7845019822934787e-06,
"loss": 0.4341,
"step": 333
},
{
"epoch": 5.0,
"grad_norm": 0.018296171270344976,
"learning_rate": 1.7829459460658637e-06,
"loss": 0.4486,
"step": 334
},
{
"epoch": 5.01500938086304,
"grad_norm": 0.014697785626980316,
"learning_rate": 1.7813849958261094e-06,
"loss": 0.4341,
"step": 335
},
{
"epoch": 5.030018761726079,
"grad_norm": 0.014043055899901864,
"learning_rate": 1.7798191413712242e-06,
"loss": 0.4479,
"step": 336
},
{
"epoch": 5.045028142589119,
"grad_norm": 0.013424222741471816,
"learning_rate": 1.778248392528998e-06,
"loss": 0.4489,
"step": 337
},
{
"epoch": 5.0600375234521575,
"grad_norm": 0.012943130776201624,
"learning_rate": 1.7766727591579387e-06,
"loss": 0.4288,
"step": 338
},
{
"epoch": 5.075046904315197,
"grad_norm": 0.013826788871450798,
"learning_rate": 1.7750922511472108e-06,
"loss": 0.4431,
"step": 339
},
{
"epoch": 5.090056285178236,
"grad_norm": 0.01434896451317387,
"learning_rate": 1.7735068784165744e-06,
"loss": 0.4298,
"step": 340
},
{
"epoch": 5.105065666041276,
"grad_norm": 0.013543345476736609,
"learning_rate": 1.7719166509163208e-06,
"loss": 0.4443,
"step": 341
},
{
"epoch": 5.120075046904315,
"grad_norm": 0.013654867873190223,
"learning_rate": 1.7703215786272128e-06,
"loss": 0.4471,
"step": 342
},
{
"epoch": 5.135084427767355,
"grad_norm": 0.013298753867087855,
"learning_rate": 1.76872167156042e-06,
"loss": 0.4344,
"step": 343
},
{
"epoch": 5.150093808630394,
"grad_norm": 0.013653938903598364,
"learning_rate": 1.767116939757456e-06,
"loss": 0.4365,
"step": 344
},
{
"epoch": 5.165103189493434,
"grad_norm": 0.013329024847758256,
"learning_rate": 1.7655073932901165e-06,
"loss": 0.4312,
"step": 345
},
{
"epoch": 5.1801125703564725,
"grad_norm": 0.013615866345533169,
"learning_rate": 1.763893042260416e-06,
"loss": 0.4402,
"step": 346
},
{
"epoch": 5.195121951219512,
"grad_norm": 0.013772143243209281,
"learning_rate": 1.7622738968005226e-06,
"loss": 0.4416,
"step": 347
},
{
"epoch": 5.210131332082551,
"grad_norm": 0.013480347503805534,
"learning_rate": 1.7606499670726968e-06,
"loss": 0.4472,
"step": 348
},
{
"epoch": 5.225140712945591,
"grad_norm": 0.013157923235962578,
"learning_rate": 1.759021263269227e-06,
"loss": 0.4317,
"step": 349
},
{
"epoch": 5.24015009380863,
"grad_norm": 0.013273852412143418,
"learning_rate": 1.7573877956123637e-06,
"loss": 0.4334,
"step": 350
},
{
"epoch": 5.25515947467167,
"grad_norm": 0.013606909673657986,
"learning_rate": 1.7557495743542582e-06,
"loss": 0.4371,
"step": 351
},
{
"epoch": 5.270168855534709,
"grad_norm": 0.013941507400326235,
"learning_rate": 1.754106609776896e-06,
"loss": 0.4431,
"step": 352
},
{
"epoch": 5.285178236397749,
"grad_norm": 0.013436310560981397,
"learning_rate": 1.7524589121920342e-06,
"loss": 0.442,
"step": 353
},
{
"epoch": 5.300187617260788,
"grad_norm": 0.01329009884820222,
"learning_rate": 1.7508064919411343e-06,
"loss": 0.4497,
"step": 354
},
{
"epoch": 5.315196998123827,
"grad_norm": 0.01367800689726424,
"learning_rate": 1.7491493593952996e-06,
"loss": 0.4393,
"step": 355
},
{
"epoch": 5.330206378986867,
"grad_norm": 0.01354483521655564,
"learning_rate": 1.747487524955209e-06,
"loss": 0.4364,
"step": 356
},
{
"epoch": 5.345215759849906,
"grad_norm": 0.01352972460550207,
"learning_rate": 1.7458209990510527e-06,
"loss": 0.4333,
"step": 357
},
{
"epoch": 5.360225140712946,
"grad_norm": 0.013745956104063744,
"learning_rate": 1.7441497921424645e-06,
"loss": 0.4328,
"step": 358
},
{
"epoch": 5.375234521575985,
"grad_norm": 0.01319303150078731,
"learning_rate": 1.7424739147184591e-06,
"loss": 0.4333,
"step": 359
},
{
"epoch": 5.390243902439025,
"grad_norm": 0.013764119164539975,
"learning_rate": 1.7407933772973635e-06,
"loss": 0.4518,
"step": 360
},
{
"epoch": 5.405253283302064,
"grad_norm": 0.013090437609724701,
"learning_rate": 1.7391081904267537e-06,
"loss": 0.4392,
"step": 361
},
{
"epoch": 5.4202626641651035,
"grad_norm": 0.013571191725851636,
"learning_rate": 1.7374183646833858e-06,
"loss": 0.442,
"step": 362
},
{
"epoch": 5.435272045028142,
"grad_norm": 0.013136763063661159,
"learning_rate": 1.7357239106731317e-06,
"loss": 0.4321,
"step": 363
},
{
"epoch": 5.450281425891182,
"grad_norm": 0.013174996076392734,
"learning_rate": 1.734024839030911e-06,
"loss": 0.4351,
"step": 364
},
{
"epoch": 5.465290806754221,
"grad_norm": 0.013584392079284755,
"learning_rate": 1.7323211604206264e-06,
"loss": 0.4336,
"step": 365
},
{
"epoch": 5.480300187617261,
"grad_norm": 0.013238150722927616,
"learning_rate": 1.7306128855350938e-06,
"loss": 0.4499,
"step": 366
},
{
"epoch": 5.4953095684803,
"grad_norm": 0.012966538449872765,
"learning_rate": 1.728900025095978e-06,
"loss": 0.439,
"step": 367
},
{
"epoch": 5.51031894934334,
"grad_norm": 0.013097706772820898,
"learning_rate": 1.7271825898537226e-06,
"loss": 0.4405,
"step": 368
},
{
"epoch": 5.525328330206379,
"grad_norm": 0.013139456397187858,
"learning_rate": 1.725460590587486e-06,
"loss": 0.4322,
"step": 369
},
{
"epoch": 5.5403377110694185,
"grad_norm": 0.01320832233373905,
"learning_rate": 1.72373403810507e-06,
"loss": 0.4453,
"step": 370
},
{
"epoch": 5.5553470919324575,
"grad_norm": 0.013313022883952532,
"learning_rate": 1.7220029432428555e-06,
"loss": 0.4369,
"step": 371
},
{
"epoch": 5.570356472795497,
"grad_norm": 0.013035779228992055,
"learning_rate": 1.7202673168657315e-06,
"loss": 0.43,
"step": 372
},
{
"epoch": 5.585365853658536,
"grad_norm": 0.013457833726421647,
"learning_rate": 1.7185271698670292e-06,
"loss": 0.4329,
"step": 373
},
{
"epoch": 5.600375234521576,
"grad_norm": 0.013116857431759213,
"learning_rate": 1.7167825131684511e-06,
"loss": 0.4313,
"step": 374
},
{
"epoch": 5.615384615384615,
"grad_norm": 0.013758827471993697,
"learning_rate": 1.715033357720006e-06,
"loss": 0.4476,
"step": 375
},
{
"epoch": 5.630393996247655,
"grad_norm": 0.013455290328633535,
"learning_rate": 1.7132797144999367e-06,
"loss": 0.4477,
"step": 376
},
{
"epoch": 5.645403377110695,
"grad_norm": 0.013424149696678331,
"learning_rate": 1.7115215945146532e-06,
"loss": 0.4382,
"step": 377
},
{
"epoch": 5.6604127579737336,
"grad_norm": 0.013961218093918704,
"learning_rate": 1.709759008798663e-06,
"loss": 0.4429,
"step": 378
},
{
"epoch": 5.6754221388367725,
"grad_norm": 0.013358780836199631,
"learning_rate": 1.7079919684145026e-06,
"loss": 0.4405,
"step": 379
},
{
"epoch": 5.690431519699812,
"grad_norm": 0.013336158263698005,
"learning_rate": 1.7062204844526657e-06,
"loss": 0.4289,
"step": 380
},
{
"epoch": 5.705440900562852,
"grad_norm": 0.013355419310096175,
"learning_rate": 1.7044445680315372e-06,
"loss": 0.44,
"step": 381
},
{
"epoch": 5.720450281425891,
"grad_norm": 0.013326753688393343,
"learning_rate": 1.7026642302973203e-06,
"loss": 0.4383,
"step": 382
},
{
"epoch": 5.735459662288931,
"grad_norm": 0.013248321811620164,
"learning_rate": 1.7008794824239673e-06,
"loss": 0.4385,
"step": 383
},
{
"epoch": 5.75046904315197,
"grad_norm": 0.013440251864610129,
"learning_rate": 1.6990903356131123e-06,
"loss": 0.4447,
"step": 384
},
{
"epoch": 5.76547842401501,
"grad_norm": 0.013356165325694516,
"learning_rate": 1.6972968010939952e-06,
"loss": 0.4395,
"step": 385
},
{
"epoch": 5.780487804878049,
"grad_norm": 0.013697957995084815,
"learning_rate": 1.6954988901233974e-06,
"loss": 0.4445,
"step": 386
},
{
"epoch": 5.795497185741088,
"grad_norm": 0.01352896933498215,
"learning_rate": 1.6936966139855661e-06,
"loss": 0.4497,
"step": 387
},
{
"epoch": 5.810506566604127,
"grad_norm": 0.013017644966079026,
"learning_rate": 1.6918899839921473e-06,
"loss": 0.4427,
"step": 388
},
{
"epoch": 5.825515947467167,
"grad_norm": 0.0130227730372479,
"learning_rate": 1.690079011482112e-06,
"loss": 0.4353,
"step": 389
},
{
"epoch": 5.840525328330206,
"grad_norm": 0.013588479837256142,
"learning_rate": 1.6882637078216865e-06,
"loss": 0.4309,
"step": 390
},
{
"epoch": 5.855534709193246,
"grad_norm": 0.013099801786314374,
"learning_rate": 1.6864440844042815e-06,
"loss": 0.4259,
"step": 391
},
{
"epoch": 5.870544090056285,
"grad_norm": 0.013097214237123593,
"learning_rate": 1.6846201526504186e-06,
"loss": 0.4302,
"step": 392
},
{
"epoch": 5.885553470919325,
"grad_norm": 0.013065212723171151,
"learning_rate": 1.682791924007661e-06,
"loss": 0.4398,
"step": 393
},
{
"epoch": 5.900562851782364,
"grad_norm": 0.014045965247674342,
"learning_rate": 1.6809594099505392e-06,
"loss": 0.434,
"step": 394
},
{
"epoch": 5.915572232645403,
"grad_norm": 0.013248206942276757,
"learning_rate": 1.6791226219804819e-06,
"loss": 0.4319,
"step": 395
},
{
"epoch": 5.930581613508442,
"grad_norm": 0.013171561529992387,
"learning_rate": 1.6772815716257411e-06,
"loss": 0.4477,
"step": 396
},
{
"epoch": 5.930581613508442,
"eval_loss": 0.41817715764045715,
"eval_runtime": 14.0502,
"eval_samples_per_second": 31.815,
"eval_steps_per_second": 1.993,
"step": 396
},
{
"epoch": 5.945590994371482,
"grad_norm": 0.013426647961607079,
"learning_rate": 1.6754362704413208e-06,
"loss": 0.4338,
"step": 397
},
{
"epoch": 5.960600375234522,
"grad_norm": 0.01345417807854897,
"learning_rate": 1.673586730008905e-06,
"loss": 0.4439,
"step": 398
},
{
"epoch": 5.975609756097561,
"grad_norm": 0.013431711439659157,
"learning_rate": 1.6717329619367848e-06,
"loss": 0.4319,
"step": 399
},
{
"epoch": 5.9906191369606,
"grad_norm": 0.013005069733149226,
"learning_rate": 1.6698749778597842e-06,
"loss": 0.4455,
"step": 400
},
{
"epoch": 6.01500938086304,
"grad_norm": 0.020617486128169053,
"learning_rate": 1.6680127894391894e-06,
"loss": 0.869,
"step": 401
},
{
"epoch": 6.030018761726079,
"grad_norm": 0.013176861969478369,
"learning_rate": 1.6661464083626733e-06,
"loss": 0.4394,
"step": 402
},
{
"epoch": 6.045028142589119,
"grad_norm": 0.012886044431831887,
"learning_rate": 1.6642758463442244e-06,
"loss": 0.4352,
"step": 403
},
{
"epoch": 6.0600375234521575,
"grad_norm": 0.012593283091799695,
"learning_rate": 1.6624011151240707e-06,
"loss": 0.4352,
"step": 404
},
{
"epoch": 6.075046904315197,
"grad_norm": 0.013119523155248839,
"learning_rate": 1.6605222264686082e-06,
"loss": 0.4456,
"step": 405
},
{
"epoch": 6.090056285178236,
"grad_norm": 0.013135474034204285,
"learning_rate": 1.6586391921703266e-06,
"loss": 0.4372,
"step": 406
},
{
"epoch": 6.105065666041276,
"grad_norm": 0.013337941205936685,
"learning_rate": 1.6567520240477343e-06,
"loss": 0.4327,
"step": 407
},
{
"epoch": 6.120075046904315,
"grad_norm": 0.013170367591114004,
"learning_rate": 1.6548607339452852e-06,
"loss": 0.4368,
"step": 408
},
{
"epoch": 6.135084427767355,
"grad_norm": 0.013634531162252857,
"learning_rate": 1.6529653337333031e-06,
"loss": 0.4328,
"step": 409
},
{
"epoch": 6.150093808630394,
"grad_norm": 0.013706123614988762,
"learning_rate": 1.65106583530791e-06,
"loss": 0.4375,
"step": 410
},
{
"epoch": 6.165103189493434,
"grad_norm": 0.0135254442542636,
"learning_rate": 1.649162250590948e-06,
"loss": 0.4354,
"step": 411
},
{
"epoch": 6.1801125703564725,
"grad_norm": 0.013527120604353708,
"learning_rate": 1.6472545915299066e-06,
"loss": 0.4364,
"step": 412
},
{
"epoch": 6.195121951219512,
"grad_norm": 0.012809333866597397,
"learning_rate": 1.645342870097847e-06,
"loss": 0.424,
"step": 413
},
{
"epoch": 6.210131332082551,
"grad_norm": 0.013350184383574616,
"learning_rate": 1.6434270982933271e-06,
"loss": 0.4456,
"step": 414
},
{
"epoch": 6.225140712945591,
"grad_norm": 0.013238123537924208,
"learning_rate": 1.6415072881403263e-06,
"loss": 0.4277,
"step": 415
},
{
"epoch": 6.24015009380863,
"grad_norm": 0.01289362750174883,
"learning_rate": 1.6395834516881702e-06,
"loss": 0.4303,
"step": 416
},
{
"epoch": 6.25515947467167,
"grad_norm": 0.013557747567476165,
"learning_rate": 1.637655601011454e-06,
"loss": 0.4372,
"step": 417
},
{
"epoch": 6.270168855534709,
"grad_norm": 0.012808118264155776,
"learning_rate": 1.6357237482099683e-06,
"loss": 0.4288,
"step": 418
},
{
"epoch": 6.285178236397749,
"grad_norm": 0.013405549925460917,
"learning_rate": 1.6337879054086208e-06,
"loss": 0.4389,
"step": 419
},
{
"epoch": 6.300187617260788,
"grad_norm": 0.013505737436808054,
"learning_rate": 1.6318480847573638e-06,
"loss": 0.4328,
"step": 420
},
{
"epoch": 6.315196998123827,
"grad_norm": 0.013081956141683409,
"learning_rate": 1.6299042984311143e-06,
"loss": 0.4344,
"step": 421
},
{
"epoch": 6.330206378986867,
"grad_norm": 0.013113716625865569,
"learning_rate": 1.6279565586296797e-06,
"loss": 0.4367,
"step": 422
},
{
"epoch": 6.345215759849906,
"grad_norm": 0.013235475707455984,
"learning_rate": 1.6260048775776803e-06,
"loss": 0.4286,
"step": 423
},
{
"epoch": 6.360225140712946,
"grad_norm": 0.013512582434807428,
"learning_rate": 1.6240492675244726e-06,
"loss": 0.4428,
"step": 424
},
{
"epoch": 6.375234521575985,
"grad_norm": 0.013410143400651279,
"learning_rate": 1.6220897407440741e-06,
"loss": 0.4358,
"step": 425
},
{
"epoch": 6.390243902439025,
"grad_norm": 0.013327451309648946,
"learning_rate": 1.6201263095350832e-06,
"loss": 0.4301,
"step": 426
},
{
"epoch": 6.405253283302064,
"grad_norm": 0.013025559977297553,
"learning_rate": 1.6181589862206052e-06,
"loss": 0.4359,
"step": 427
},
{
"epoch": 6.4202626641651035,
"grad_norm": 0.013348709279455499,
"learning_rate": 1.6161877831481722e-06,
"loss": 0.434,
"step": 428
},
{
"epoch": 6.435272045028142,
"grad_norm": 0.01262545802365253,
"learning_rate": 1.6142127126896679e-06,
"loss": 0.4263,
"step": 429
},
{
"epoch": 6.450281425891182,
"grad_norm": 0.013172427598095178,
"learning_rate": 1.612233787241248e-06,
"loss": 0.4221,
"step": 430
},
{
"epoch": 6.465290806754221,
"grad_norm": 0.013481870181090533,
"learning_rate": 1.610251019223264e-06,
"loss": 0.435,
"step": 431
},
{
"epoch": 6.480300187617261,
"grad_norm": 0.013037104332200075,
"learning_rate": 1.6082644210801843e-06,
"loss": 0.4311,
"step": 432
},
{
"epoch": 6.4953095684803,
"grad_norm": 0.013514919736678848,
"learning_rate": 1.6062740052805168e-06,
"loss": 0.4406,
"step": 433
},
{
"epoch": 6.51031894934334,
"grad_norm": 0.013036697288445677,
"learning_rate": 1.6042797843167289e-06,
"loss": 0.4215,
"step": 434
},
{
"epoch": 6.525328330206379,
"grad_norm": 0.012753548020225313,
"learning_rate": 1.6022817707051721e-06,
"loss": 0.4393,
"step": 435
},
{
"epoch": 6.5403377110694185,
"grad_norm": 0.01288860891440036,
"learning_rate": 1.6002799769860005e-06,
"loss": 0.4248,
"step": 436
},
{
"epoch": 6.5553470919324575,
"grad_norm": 0.013672060679601777,
"learning_rate": 1.5982744157230937e-06,
"loss": 0.4385,
"step": 437
},
{
"epoch": 6.570356472795497,
"grad_norm": 0.013035441611515807,
"learning_rate": 1.5962650995039782e-06,
"loss": 0.4422,
"step": 438
},
{
"epoch": 6.585365853658536,
"grad_norm": 0.013122421954394383,
"learning_rate": 1.5942520409397462e-06,
"loss": 0.4365,
"step": 439
},
{
"epoch": 6.600375234521576,
"grad_norm": 0.01319951174301247,
"learning_rate": 1.5922352526649801e-06,
"loss": 0.4307,
"step": 440
},
{
"epoch": 6.615384615384615,
"grad_norm": 0.013085414297184095,
"learning_rate": 1.5902147473376693e-06,
"loss": 0.4312,
"step": 441
},
{
"epoch": 6.630393996247655,
"grad_norm": 0.012791438384207784,
"learning_rate": 1.5881905376391336e-06,
"loss": 0.4211,
"step": 442
},
{
"epoch": 6.645403377110695,
"grad_norm": 0.012892370389429274,
"learning_rate": 1.5861626362739423e-06,
"loss": 0.4238,
"step": 443
},
{
"epoch": 6.6604127579737336,
"grad_norm": 0.012426989025538739,
"learning_rate": 1.5841310559698342e-06,
"loss": 0.4274,
"step": 444
},
{
"epoch": 6.6754221388367725,
"grad_norm": 0.012707681199612911,
"learning_rate": 1.5820958094776398e-06,
"loss": 0.429,
"step": 445
},
{
"epoch": 6.690431519699812,
"grad_norm": 0.013157713008872291,
"learning_rate": 1.5800569095711981e-06,
"loss": 0.4215,
"step": 446
},
{
"epoch": 6.705440900562852,
"grad_norm": 0.013168651131606283,
"learning_rate": 1.578014369047279e-06,
"loss": 0.4385,
"step": 447
},
{
"epoch": 6.720450281425891,
"grad_norm": 0.013114532064645967,
"learning_rate": 1.5759682007255016e-06,
"loss": 0.4448,
"step": 448
},
{
"epoch": 6.735459662288931,
"grad_norm": 0.013082990504026984,
"learning_rate": 1.573918417448254e-06,
"loss": 0.4275,
"step": 449
},
{
"epoch": 6.75046904315197,
"grad_norm": 0.013151892866998883,
"learning_rate": 1.5718650320806142e-06,
"loss": 0.4337,
"step": 450
},
{
"epoch": 6.76547842401501,
"grad_norm": 0.013101096876612917,
"learning_rate": 1.569808057510266e-06,
"loss": 0.4293,
"step": 451
},
{
"epoch": 6.780487804878049,
"grad_norm": 0.012754754751734624,
"learning_rate": 1.567747506647422e-06,
"loss": 0.4257,
"step": 452
},
{
"epoch": 6.795497185741088,
"grad_norm": 0.012418281228569954,
"learning_rate": 1.5656833924247396e-06,
"loss": 0.4194,
"step": 453
},
{
"epoch": 6.810506566604127,
"grad_norm": 0.012677644542908185,
"learning_rate": 1.5636157277972413e-06,
"loss": 0.4203,
"step": 454
},
{
"epoch": 6.825515947467167,
"grad_norm": 0.012978451911838326,
"learning_rate": 1.5615445257422332e-06,
"loss": 0.4236,
"step": 455
},
{
"epoch": 6.840525328330206,
"grad_norm": 0.012837439507680473,
"learning_rate": 1.5594697992592229e-06,
"loss": 0.4331,
"step": 456
},
{
"epoch": 6.855534709193246,
"grad_norm": 0.01252259688640816,
"learning_rate": 1.5573915613698393e-06,
"loss": 0.4378,
"step": 457
},
{
"epoch": 6.870544090056285,
"grad_norm": 0.013197054542329851,
"learning_rate": 1.5553098251177485e-06,
"loss": 0.4206,
"step": 458
},
{
"epoch": 6.885553470919325,
"grad_norm": 0.012853395885087531,
"learning_rate": 1.5532246035685755e-06,
"loss": 0.4268,
"step": 459
},
{
"epoch": 6.900562851782364,
"grad_norm": 0.012990308489818332,
"learning_rate": 1.5511359098098183e-06,
"loss": 0.4291,
"step": 460
},
{
"epoch": 6.915572232645403,
"grad_norm": 0.012921940170193533,
"learning_rate": 1.549043756950768e-06,
"loss": 0.4339,
"step": 461
},
{
"epoch": 6.930581613508442,
"grad_norm": 0.013109887397593497,
"learning_rate": 1.5469481581224271e-06,
"loss": 0.4358,
"step": 462
},
{
"epoch": 6.930581613508442,
"eval_loss": 0.4126039445400238,
"eval_runtime": 13.7392,
"eval_samples_per_second": 32.535,
"eval_steps_per_second": 2.038,
"step": 462
},
{
"epoch": 6.945590994371482,
"grad_norm": 0.012598723095405159,
"learning_rate": 1.5448491264774241e-06,
"loss": 0.4263,
"step": 463
},
{
"epoch": 6.960600375234522,
"grad_norm": 0.012861189060759321,
"learning_rate": 1.5427466751899352e-06,
"loss": 0.427,
"step": 464
},
{
"epoch": 6.975609756097561,
"grad_norm": 0.013200569881254022,
"learning_rate": 1.5406408174555977e-06,
"loss": 0.4259,
"step": 465
},
{
"epoch": 6.9906191369606,
"grad_norm": 0.012616437259491003,
"learning_rate": 1.5385315664914292e-06,
"loss": 0.436,
"step": 466
},
{
"epoch": 7.0,
"grad_norm": 0.016057681180086946,
"learning_rate": 1.536418935535745e-06,
"loss": 0.4215,
"step": 467
},
{
"epoch": 7.01500938086304,
"grad_norm": 0.015147696984116419,
"learning_rate": 1.534302937848073e-06,
"loss": 0.4299,
"step": 468
},
{
"epoch": 7.030018761726079,
"grad_norm": 0.012645368174521793,
"learning_rate": 1.5321835867090732e-06,
"loss": 0.4322,
"step": 469
},
{
"epoch": 7.045028142589119,
"grad_norm": 0.01320195840723717,
"learning_rate": 1.5300608954204514e-06,
"loss": 0.4202,
"step": 470
},
{
"epoch": 7.0600375234521575,
"grad_norm": 0.012876528475684408,
"learning_rate": 1.5279348773048785e-06,
"loss": 0.4234,
"step": 471
},
{
"epoch": 7.075046904315197,
"grad_norm": 0.012414131318572394,
"learning_rate": 1.5258055457059052e-06,
"loss": 0.4286,
"step": 472
},
{
"epoch": 7.090056285178236,
"grad_norm": 0.013424481910424807,
"learning_rate": 1.5236729139878778e-06,
"loss": 0.4363,
"step": 473
},
{
"epoch": 7.105065666041276,
"grad_norm": 0.013082970732005126,
"learning_rate": 1.5215369955358566e-06,
"loss": 0.4307,
"step": 474
},
{
"epoch": 7.120075046904315,
"grad_norm": 0.013012675401740906,
"learning_rate": 1.5193978037555292e-06,
"loss": 0.4281,
"step": 475
},
{
"epoch": 7.135084427767355,
"grad_norm": 0.01386296810810948,
"learning_rate": 1.517255352073129e-06,
"loss": 0.4359,
"step": 476
},
{
"epoch": 7.150093808630394,
"grad_norm": 0.012959933299681203,
"learning_rate": 1.5151096539353479e-06,
"loss": 0.4267,
"step": 477
},
{
"epoch": 7.165103189493434,
"grad_norm": 0.013273365230097464,
"learning_rate": 1.5129607228092548e-06,
"loss": 0.4225,
"step": 478
},
{
"epoch": 7.1801125703564725,
"grad_norm": 0.013069822188325222,
"learning_rate": 1.5108085721822097e-06,
"loss": 0.434,
"step": 479
},
{
"epoch": 7.195121951219512,
"grad_norm": 0.013059257995761383,
"learning_rate": 1.5086532155617784e-06,
"loss": 0.4337,
"step": 480
},
{
"epoch": 7.210131332082551,
"grad_norm": 0.012530361479891216,
"learning_rate": 1.506494666475649e-06,
"loss": 0.4288,
"step": 481
},
{
"epoch": 7.225140712945591,
"grad_norm": 0.01275554693816723,
"learning_rate": 1.5043329384715473e-06,
"loss": 0.4267,
"step": 482
},
{
"epoch": 7.24015009380863,
"grad_norm": 0.012727944808238934,
"learning_rate": 1.5021680451171498e-06,
"loss": 0.4227,
"step": 483
},
{
"epoch": 7.25515947467167,
"grad_norm": 0.012512871850066665,
"learning_rate": 1.5e-06,
"loss": 0.4347,
"step": 484
},
{
"epoch": 7.270168855534709,
"grad_norm": 0.012879847459317067,
"learning_rate": 1.4978288167274232e-06,
"loss": 0.4238,
"step": 485
},
{
"epoch": 7.285178236397749,
"grad_norm": 0.013267371965589561,
"learning_rate": 1.4956545089264405e-06,
"loss": 0.4258,
"step": 486
},
{
"epoch": 7.300187617260788,
"grad_norm": 0.012473072491095367,
"learning_rate": 1.4934770902436834e-06,
"loss": 0.4299,
"step": 487
},
{
"epoch": 7.315196998123827,
"grad_norm": 0.012782011764426037,
"learning_rate": 1.4912965743453087e-06,
"loss": 0.4182,
"step": 488
},
{
"epoch": 7.330206378986867,
"grad_norm": 0.01311348055027308,
"learning_rate": 1.4891129749169118e-06,
"loss": 0.4296,
"step": 489
},
{
"epoch": 7.345215759849906,
"grad_norm": 0.012765085462581965,
"learning_rate": 1.4869263056634417e-06,
"loss": 0.4289,
"step": 490
},
{
"epoch": 7.360225140712946,
"grad_norm": 0.012412381150431463,
"learning_rate": 1.4847365803091144e-06,
"loss": 0.4334,
"step": 491
},
{
"epoch": 7.375234521575985,
"grad_norm": 0.01271837862802656,
"learning_rate": 1.4825438125973263e-06,
"loss": 0.425,
"step": 492
},
{
"epoch": 7.390243902439025,
"grad_norm": 0.012352812019462122,
"learning_rate": 1.4803480162905695e-06,
"loss": 0.4207,
"step": 493
},
{
"epoch": 7.405253283302064,
"grad_norm": 0.012746296584207542,
"learning_rate": 1.4781492051703448e-06,
"loss": 0.4215,
"step": 494
},
{
"epoch": 7.4202626641651035,
"grad_norm": 0.012721837448907083,
"learning_rate": 1.4759473930370736e-06,
"loss": 0.4225,
"step": 495
},
{
"epoch": 7.435272045028142,
"grad_norm": 0.012889669681319507,
"learning_rate": 1.4737425937100135e-06,
"loss": 0.4261,
"step": 496
},
{
"epoch": 7.450281425891182,
"grad_norm": 0.012544026246862405,
"learning_rate": 1.4715348210271703e-06,
"loss": 0.4189,
"step": 497
},
{
"epoch": 7.465290806754221,
"grad_norm": 0.012601175719615424,
"learning_rate": 1.4693240888452118e-06,
"loss": 0.4188,
"step": 498
},
{
"epoch": 7.480300187617261,
"grad_norm": 0.012911814515041583,
"learning_rate": 1.4671104110393808e-06,
"loss": 0.445,
"step": 499
},
{
"epoch": 7.4953095684803,
"grad_norm": 0.012900962528470759,
"learning_rate": 1.4648938015034067e-06,
"loss": 0.4271,
"step": 500
},
{
"epoch": 7.51031894934334,
"grad_norm": 0.012640868695431564,
"learning_rate": 1.4626742741494205e-06,
"loss": 0.4345,
"step": 501
},
{
"epoch": 7.525328330206379,
"grad_norm": 0.012451005837885486,
"learning_rate": 1.4604518429078652e-06,
"loss": 0.429,
"step": 502
},
{
"epoch": 7.5403377110694185,
"grad_norm": 0.013230360834140219,
"learning_rate": 1.4582265217274103e-06,
"loss": 0.4161,
"step": 503
},
{
"epoch": 7.5553470919324575,
"grad_norm": 0.01283686066958734,
"learning_rate": 1.4559983245748637e-06,
"loss": 0.4251,
"step": 504
},
{
"epoch": 7.570356472795497,
"grad_norm": 0.012527190536013916,
"learning_rate": 1.4537672654350832e-06,
"loss": 0.4137,
"step": 505
},
{
"epoch": 7.585365853658536,
"grad_norm": 0.012828676017273635,
"learning_rate": 1.4515333583108893e-06,
"loss": 0.4373,
"step": 506
},
{
"epoch": 7.600375234521576,
"grad_norm": 0.013137450736113143,
"learning_rate": 1.4492966172229778e-06,
"loss": 0.4314,
"step": 507
},
{
"epoch": 7.615384615384615,
"grad_norm": 0.012681479750471514,
"learning_rate": 1.4470570562098306e-06,
"loss": 0.4191,
"step": 508
},
{
"epoch": 7.630393996247655,
"grad_norm": 0.01283115430539013,
"learning_rate": 1.4448146893276295e-06,
"loss": 0.4293,
"step": 509
},
{
"epoch": 7.645403377110695,
"grad_norm": 0.012923054058866432,
"learning_rate": 1.4425695306501655e-06,
"loss": 0.4202,
"step": 510
},
{
"epoch": 7.6604127579737336,
"grad_norm": 0.013121242158230989,
"learning_rate": 1.4403215942687525e-06,
"loss": 0.4373,
"step": 511
},
{
"epoch": 7.6754221388367725,
"grad_norm": 0.012479459230005806,
"learning_rate": 1.4380708942921382e-06,
"loss": 0.4242,
"step": 512
},
{
"epoch": 7.690431519699812,
"grad_norm": 0.012387034220238928,
"learning_rate": 1.4358174448464153e-06,
"loss": 0.414,
"step": 513
},
{
"epoch": 7.705440900562852,
"grad_norm": 0.012574345486950885,
"learning_rate": 1.433561260074933e-06,
"loss": 0.4272,
"step": 514
},
{
"epoch": 7.720450281425891,
"grad_norm": 0.01290052005130462,
"learning_rate": 1.4313023541382079e-06,
"loss": 0.4298,
"step": 515
},
{
"epoch": 7.735459662288931,
"grad_norm": 0.012668792696732748,
"learning_rate": 1.4290407412138363e-06,
"loss": 0.425,
"step": 516
},
{
"epoch": 7.75046904315197,
"grad_norm": 0.01225012968171437,
"learning_rate": 1.4267764354964037e-06,
"loss": 0.4233,
"step": 517
},
{
"epoch": 7.76547842401501,
"grad_norm": 0.012255924007790624,
"learning_rate": 1.4245094511973967e-06,
"loss": 0.4165,
"step": 518
},
{
"epoch": 7.780487804878049,
"grad_norm": 0.012942200297671827,
"learning_rate": 1.4222398025451134e-06,
"loss": 0.4179,
"step": 519
},
{
"epoch": 7.795497185741088,
"grad_norm": 0.012669086030337824,
"learning_rate": 1.4199675037845743e-06,
"loss": 0.4273,
"step": 520
},
{
"epoch": 7.810506566604127,
"grad_norm": 0.012438089901438005,
"learning_rate": 1.4176925691774333e-06,
"loss": 0.4229,
"step": 521
},
{
"epoch": 7.825515947467167,
"grad_norm": 0.013016106458461985,
"learning_rate": 1.4154150130018865e-06,
"loss": 0.4342,
"step": 522
},
{
"epoch": 7.840525328330206,
"grad_norm": 0.012676013909183275,
"learning_rate": 1.4131348495525846e-06,
"loss": 0.43,
"step": 523
},
{
"epoch": 7.855534709193246,
"grad_norm": 0.012244101315314318,
"learning_rate": 1.4108520931405421e-06,
"loss": 0.4124,
"step": 524
},
{
"epoch": 7.870544090056285,
"grad_norm": 0.012356566669634342,
"learning_rate": 1.4085667580930481e-06,
"loss": 0.4253,
"step": 525
},
{
"epoch": 7.885553470919325,
"grad_norm": 0.013545664104386693,
"learning_rate": 1.4062788587535757e-06,
"loss": 0.4336,
"step": 526
},
{
"epoch": 7.900562851782364,
"grad_norm": 0.012958461771861544,
"learning_rate": 1.403988409481692e-06,
"loss": 0.4256,
"step": 527
},
{
"epoch": 7.915572232645403,
"grad_norm": 0.012515973222970926,
"learning_rate": 1.4016954246529694e-06,
"loss": 0.4258,
"step": 528
},
{
"epoch": 7.915572232645403,
"eval_loss": 0.40825632214546204,
"eval_runtime": 13.931,
"eval_samples_per_second": 32.087,
"eval_steps_per_second": 2.01,
"step": 528
},
{
"epoch": 7.930581613508442,
"grad_norm": 0.013163236106306738,
"learning_rate": 1.399399918658893e-06,
"loss": 0.4261,
"step": 529
},
{
"epoch": 7.945590994371482,
"grad_norm": 0.013156945923845514,
"learning_rate": 1.3971019059067716e-06,
"loss": 0.4282,
"step": 530
},
{
"epoch": 7.960600375234522,
"grad_norm": 0.012975296664314717,
"learning_rate": 1.3948014008196485e-06,
"loss": 0.4178,
"step": 531
},
{
"epoch": 7.975609756097561,
"grad_norm": 0.012620579592042935,
"learning_rate": 1.3924984178362077e-06,
"loss": 0.4315,
"step": 532
},
{
"epoch": 7.9906191369606,
"grad_norm": 0.012990444728543174,
"learning_rate": 1.390192971410687e-06,
"loss": 0.425,
"step": 533
},
{
"epoch": 8.0,
"grad_norm": 0.012990444728543174,
"learning_rate": 1.3878850760127846e-06,
"loss": 0.3523,
"step": 534
},
{
"epoch": 8.01500938086304,
"grad_norm": 0.012443192719573455,
"learning_rate": 1.3855747461275697e-06,
"loss": 0.4906,
"step": 535
},
{
"epoch": 8.03001876172608,
"grad_norm": 0.012368209559504574,
"learning_rate": 1.3832619962553905e-06,
"loss": 0.4227,
"step": 536
},
{
"epoch": 8.045028142589118,
"grad_norm": 0.012874491584688213,
"learning_rate": 1.3809468409117844e-06,
"loss": 0.423,
"step": 537
},
{
"epoch": 8.060037523452158,
"grad_norm": 0.012955402213050558,
"learning_rate": 1.3786292946273859e-06,
"loss": 0.4301,
"step": 538
},
{
"epoch": 8.075046904315197,
"grad_norm": 0.012397892892618072,
"learning_rate": 1.3763093719478357e-06,
"loss": 0.4213,
"step": 539
},
{
"epoch": 8.090056285178237,
"grad_norm": 0.012234259749931429,
"learning_rate": 1.3739870874336897e-06,
"loss": 0.4193,
"step": 540
},
{
"epoch": 8.105065666041275,
"grad_norm": 0.012209969001234834,
"learning_rate": 1.3716624556603274e-06,
"loss": 0.4234,
"step": 541
},
{
"epoch": 8.120075046904315,
"grad_norm": 0.012750032029844028,
"learning_rate": 1.3693354912178607e-06,
"loss": 0.4286,
"step": 542
},
{
"epoch": 8.135084427767355,
"grad_norm": 0.012498344856428782,
"learning_rate": 1.367006208711042e-06,
"loss": 0.4162,
"step": 543
},
{
"epoch": 8.150093808630395,
"grad_norm": 0.012551160489829018,
"learning_rate": 1.3646746227591718e-06,
"loss": 0.423,
"step": 544
},
{
"epoch": 8.165103189493433,
"grad_norm": 0.012836057883823809,
"learning_rate": 1.3623407479960086e-06,
"loss": 0.4183,
"step": 545
},
{
"epoch": 8.180112570356473,
"grad_norm": 0.012832136571966581,
"learning_rate": 1.360004599069676e-06,
"loss": 0.4255,
"step": 546
},
{
"epoch": 8.195121951219512,
"grad_norm": 0.012674899783215083,
"learning_rate": 1.3576661906425705e-06,
"loss": 0.4154,
"step": 547
},
{
"epoch": 8.210131332082552,
"grad_norm": 0.012904907272715635,
"learning_rate": 1.3553255373912707e-06,
"loss": 0.4221,
"step": 548
},
{
"epoch": 8.22514071294559,
"grad_norm": 0.012553161887151092,
"learning_rate": 1.3529826540064438e-06,
"loss": 0.4197,
"step": 549
},
{
"epoch": 8.24015009380863,
"grad_norm": 0.01258506653039211,
"learning_rate": 1.3506375551927544e-06,
"loss": 0.4323,
"step": 550
},
{
"epoch": 8.25515947467167,
"grad_norm": 0.013006243593294749,
"learning_rate": 1.3482902556687715e-06,
"loss": 0.4301,
"step": 551
},
{
"epoch": 8.27016885553471,
"grad_norm": 0.012621778538523186,
"learning_rate": 1.345940770166876e-06,
"loss": 0.4273,
"step": 552
},
{
"epoch": 8.285178236397748,
"grad_norm": 0.012547789104974505,
"learning_rate": 1.3435891134331705e-06,
"loss": 0.4255,
"step": 553
},
{
"epoch": 8.300187617260788,
"grad_norm": 0.012362863077543909,
"learning_rate": 1.3412353002273827e-06,
"loss": 0.4274,
"step": 554
},
{
"epoch": 8.315196998123827,
"grad_norm": 0.013130546014588162,
"learning_rate": 1.3388793453227765e-06,
"loss": 0.4245,
"step": 555
},
{
"epoch": 8.330206378986867,
"grad_norm": 0.012375388484720671,
"learning_rate": 1.3365212635060569e-06,
"loss": 0.4182,
"step": 556
},
{
"epoch": 8.345215759849907,
"grad_norm": 0.012314392169435896,
"learning_rate": 1.3341610695772784e-06,
"loss": 0.4128,
"step": 557
},
{
"epoch": 8.360225140712945,
"grad_norm": 0.012928110986283681,
"learning_rate": 1.3317987783497519e-06,
"loss": 0.4251,
"step": 558
},
{
"epoch": 8.375234521575985,
"grad_norm": 0.013081254426541622,
"learning_rate": 1.3294344046499515e-06,
"loss": 0.4288,
"step": 559
},
{
"epoch": 8.390243902439025,
"grad_norm": 0.012679568106310851,
"learning_rate": 1.3270679633174217e-06,
"loss": 0.4181,
"step": 560
},
{
"epoch": 8.405253283302065,
"grad_norm": 0.012837821286797968,
"learning_rate": 1.3246994692046835e-06,
"loss": 0.4221,
"step": 561
},
{
"epoch": 8.420262664165103,
"grad_norm": 0.012683953345995792,
"learning_rate": 1.3223289371771424e-06,
"loss": 0.4342,
"step": 562
},
{
"epoch": 8.435272045028142,
"grad_norm": 0.012324190260690752,
"learning_rate": 1.3199563821129944e-06,
"loss": 0.4143,
"step": 563
},
{
"epoch": 8.450281425891182,
"grad_norm": 0.012704009069056542,
"learning_rate": 1.3175818189031326e-06,
"loss": 0.4139,
"step": 564
},
{
"epoch": 8.465290806754222,
"grad_norm": 0.012664000146649987,
"learning_rate": 1.3152052624510535e-06,
"loss": 0.421,
"step": 565
},
{
"epoch": 8.48030018761726,
"grad_norm": 0.013174322443423665,
"learning_rate": 1.3128267276727644e-06,
"loss": 0.4172,
"step": 566
},
{
"epoch": 8.4953095684803,
"grad_norm": 0.012481267748429541,
"learning_rate": 1.3104462294966894e-06,
"loss": 0.4256,
"step": 567
},
{
"epoch": 8.51031894934334,
"grad_norm": 0.012926931305574265,
"learning_rate": 1.3080637828635744e-06,
"loss": 0.4236,
"step": 568
},
{
"epoch": 8.52532833020638,
"grad_norm": 0.012594343237208048,
"learning_rate": 1.3056794027263948e-06,
"loss": 0.424,
"step": 569
},
{
"epoch": 8.540337711069418,
"grad_norm": 0.013005429097167703,
"learning_rate": 1.3032931040502626e-06,
"loss": 0.4262,
"step": 570
},
{
"epoch": 8.555347091932457,
"grad_norm": 0.012700849998308944,
"learning_rate": 1.300904901812329e-06,
"loss": 0.4112,
"step": 571
},
{
"epoch": 8.570356472795497,
"grad_norm": 0.01234859544446316,
"learning_rate": 1.2985148110016947e-06,
"loss": 0.4234,
"step": 572
},
{
"epoch": 8.585365853658537,
"grad_norm": 0.012647777310344478,
"learning_rate": 1.2961228466193116e-06,
"loss": 0.4298,
"step": 573
},
{
"epoch": 8.600375234521575,
"grad_norm": 0.012976806863275401,
"learning_rate": 1.293729023677892e-06,
"loss": 0.4104,
"step": 574
},
{
"epoch": 8.615384615384615,
"grad_norm": 0.013316957653669519,
"learning_rate": 1.2913333572018132e-06,
"loss": 0.4277,
"step": 575
},
{
"epoch": 8.630393996247655,
"grad_norm": 0.012701811346435813,
"learning_rate": 1.2889358622270223e-06,
"loss": 0.4194,
"step": 576
},
{
"epoch": 8.645403377110695,
"grad_norm": 0.012852571501030714,
"learning_rate": 1.2865365538009432e-06,
"loss": 0.4225,
"step": 577
},
{
"epoch": 8.660412757973734,
"grad_norm": 0.012874594988687248,
"learning_rate": 1.2841354469823814e-06,
"loss": 0.4124,
"step": 578
},
{
"epoch": 8.675422138836772,
"grad_norm": 0.013235216009797502,
"learning_rate": 1.2817325568414297e-06,
"loss": 0.4319,
"step": 579
},
{
"epoch": 8.690431519699812,
"grad_norm": 0.012795063248840513,
"learning_rate": 1.2793278984593734e-06,
"loss": 0.4231,
"step": 580
},
{
"epoch": 8.705440900562852,
"grad_norm": 0.012789613479480306,
"learning_rate": 1.2769214869285963e-06,
"loss": 0.4174,
"step": 581
},
{
"epoch": 8.720450281425892,
"grad_norm": 0.012313758378033874,
"learning_rate": 1.2745133373524852e-06,
"loss": 0.4294,
"step": 582
},
{
"epoch": 8.73545966228893,
"grad_norm": 0.01301988888876861,
"learning_rate": 1.272103464845335e-06,
"loss": 0.4265,
"step": 583
},
{
"epoch": 8.75046904315197,
"grad_norm": 0.012794500405563685,
"learning_rate": 1.269691884532255e-06,
"loss": 0.4169,
"step": 584
},
{
"epoch": 8.76547842401501,
"grad_norm": 0.012906369892351301,
"learning_rate": 1.2672786115490727e-06,
"loss": 0.4235,
"step": 585
},
{
"epoch": 8.78048780487805,
"grad_norm": 0.012698038298790544,
"learning_rate": 1.26486366104224e-06,
"loss": 0.4198,
"step": 586
},
{
"epoch": 8.795497185741088,
"grad_norm": 0.012324390315305763,
"learning_rate": 1.2624470481687368e-06,
"loss": 0.4222,
"step": 587
},
{
"epoch": 8.810506566604127,
"grad_norm": 0.012471895547561243,
"learning_rate": 1.260028788095976e-06,
"loss": 0.4121,
"step": 588
},
{
"epoch": 8.825515947467167,
"grad_norm": 0.013053246144026396,
"learning_rate": 1.2576088960017107e-06,
"loss": 0.423,
"step": 589
},
{
"epoch": 8.840525328330207,
"grad_norm": 0.013057771093177609,
"learning_rate": 1.255187387073935e-06,
"loss": 0.4195,
"step": 590
},
{
"epoch": 8.855534709193245,
"grad_norm": 0.012992648432035044,
"learning_rate": 1.2527642765107917e-06,
"loss": 0.4148,
"step": 591
},
{
"epoch": 8.870544090056285,
"grad_norm": 0.0130532629815524,
"learning_rate": 1.2503395795204766e-06,
"loss": 0.4309,
"step": 592
},
{
"epoch": 8.885553470919325,
"grad_norm": 0.012368890664965363,
"learning_rate": 1.2479133113211412e-06,
"loss": 0.4158,
"step": 593
},
{
"epoch": 8.900562851782365,
"grad_norm": 0.012966982165331422,
"learning_rate": 1.245485487140799e-06,
"loss": 0.4207,
"step": 594
},
{
"epoch": 8.900562851782365,
"eval_loss": 0.4048081934452057,
"eval_runtime": 13.9142,
"eval_samples_per_second": 32.126,
"eval_steps_per_second": 2.012,
"step": 594
},
{
"epoch": 8.915572232645403,
"grad_norm": 0.01282686305166402,
"learning_rate": 1.2430561222172295e-06,
"loss": 0.4342,
"step": 595
},
{
"epoch": 8.930581613508442,
"grad_norm": 0.012073531377116312,
"learning_rate": 1.2406252317978821e-06,
"loss": 0.4225,
"step": 596
},
{
"epoch": 8.945590994371482,
"grad_norm": 0.012136707320216203,
"learning_rate": 1.2381928311397806e-06,
"loss": 0.42,
"step": 597
},
{
"epoch": 8.960600375234522,
"grad_norm": 0.012992601219686593,
"learning_rate": 1.2357589355094273e-06,
"loss": 0.4294,
"step": 598
},
{
"epoch": 8.975609756097562,
"grad_norm": 0.012433755901529724,
"learning_rate": 1.2333235601827084e-06,
"loss": 0.4135,
"step": 599
},
{
"epoch": 8.9906191369606,
"grad_norm": 0.012871323298772467,
"learning_rate": 1.2308867204447957e-06,
"loss": 0.4227,
"step": 600
},
{
"epoch": 9.0,
"grad_norm": 0.012871323298772467,
"learning_rate": 1.228448431590054e-06,
"loss": 0.4203,
"step": 601
},
{
"epoch": 9.01500938086304,
"grad_norm": 0.017461692067906185,
"learning_rate": 1.2260087089219414e-06,
"loss": 0.4263,
"step": 602
},
{
"epoch": 9.03001876172608,
"grad_norm": 0.012676685293751521,
"learning_rate": 1.2235675677529155e-06,
"loss": 0.4206,
"step": 603
},
{
"epoch": 9.045028142589118,
"grad_norm": 0.012796622016032069,
"learning_rate": 1.2211250234043382e-06,
"loss": 0.4263,
"step": 604
},
{
"epoch": 9.060037523452158,
"grad_norm": 0.013183001174916187,
"learning_rate": 1.2186810912063758e-06,
"loss": 0.42,
"step": 605
},
{
"epoch": 9.075046904315197,
"grad_norm": 0.012633953173363561,
"learning_rate": 1.216235786497907e-06,
"loss": 0.4163,
"step": 606
},
{
"epoch": 9.090056285178237,
"grad_norm": 0.012022016291928495,
"learning_rate": 1.213789124626425e-06,
"loss": 0.4185,
"step": 607
},
{
"epoch": 9.105065666041275,
"grad_norm": 0.012893256202566969,
"learning_rate": 1.211341120947939e-06,
"loss": 0.4098,
"step": 608
},
{
"epoch": 9.120075046904315,
"grad_norm": 0.012317279779981451,
"learning_rate": 1.208891790826882e-06,
"loss": 0.4269,
"step": 609
},
{
"epoch": 9.135084427767355,
"grad_norm": 0.012580486012471572,
"learning_rate": 1.2064411496360107e-06,
"loss": 0.4144,
"step": 610
},
{
"epoch": 9.150093808630395,
"grad_norm": 0.012564068366617366,
"learning_rate": 1.2039892127563116e-06,
"loss": 0.4088,
"step": 611
},
{
"epoch": 9.165103189493433,
"grad_norm": 0.011856953897715697,
"learning_rate": 1.201535995576902e-06,
"loss": 0.4283,
"step": 612
},
{
"epoch": 9.180112570356473,
"grad_norm": 0.01293799333411114,
"learning_rate": 1.199081513494936e-06,
"loss": 0.4165,
"step": 613
},
{
"epoch": 9.195121951219512,
"grad_norm": 0.013093089279946351,
"learning_rate": 1.1966257819155062e-06,
"loss": 0.4164,
"step": 614
},
{
"epoch": 9.210131332082552,
"grad_norm": 0.012895898134142643,
"learning_rate": 1.1941688162515467e-06,
"loss": 0.4248,
"step": 615
},
{
"epoch": 9.22514071294559,
"grad_norm": 0.01250661926372622,
"learning_rate": 1.1917106319237384e-06,
"loss": 0.4303,
"step": 616
},
{
"epoch": 9.24015009380863,
"grad_norm": 0.013540155741539446,
"learning_rate": 1.1892512443604101e-06,
"loss": 0.4167,
"step": 617
},
{
"epoch": 9.25515947467167,
"grad_norm": 0.012245135490446384,
"learning_rate": 1.1867906689974427e-06,
"loss": 0.4234,
"step": 618
},
{
"epoch": 9.27016885553471,
"grad_norm": 0.012654858934040628,
"learning_rate": 1.1843289212781722e-06,
"loss": 0.4078,
"step": 619
},
{
"epoch": 9.285178236397748,
"grad_norm": 0.012581564643630807,
"learning_rate": 1.1818660166532924e-06,
"loss": 0.404,
"step": 620
},
{
"epoch": 9.300187617260788,
"grad_norm": 0.012446126751038225,
"learning_rate": 1.1794019705807582e-06,
"loss": 0.4256,
"step": 621
},
{
"epoch": 9.315196998123827,
"grad_norm": 0.012225067090946798,
"learning_rate": 1.1769367985256885e-06,
"loss": 0.4195,
"step": 622
},
{
"epoch": 9.330206378986867,
"grad_norm": 0.012493128506126407,
"learning_rate": 1.1744705159602698e-06,
"loss": 0.4219,
"step": 623
},
{
"epoch": 9.345215759849907,
"grad_norm": 0.012675214454940823,
"learning_rate": 1.1720031383636585e-06,
"loss": 0.4212,
"step": 624
},
{
"epoch": 9.360225140712945,
"grad_norm": 0.012438903850514627,
"learning_rate": 1.1695346812218825e-06,
"loss": 0.4168,
"step": 625
},
{
"epoch": 9.375234521575985,
"grad_norm": 0.01254096913931574,
"learning_rate": 1.167065160027747e-06,
"loss": 0.4149,
"step": 626
},
{
"epoch": 9.390243902439025,
"grad_norm": 0.012506400410392516,
"learning_rate": 1.164594590280734e-06,
"loss": 0.4169,
"step": 627
},
{
"epoch": 9.405253283302065,
"grad_norm": 0.013067474713690975,
"learning_rate": 1.1621229874869075e-06,
"loss": 0.4127,
"step": 628
},
{
"epoch": 9.420262664165103,
"grad_norm": 0.012626791768751198,
"learning_rate": 1.159650367158815e-06,
"loss": 0.4291,
"step": 629
},
{
"epoch": 9.435272045028142,
"grad_norm": 0.012395862427797645,
"learning_rate": 1.15717674481539e-06,
"loss": 0.4099,
"step": 630
},
{
"epoch": 9.450281425891182,
"grad_norm": 0.01277375372458372,
"learning_rate": 1.1547021359818558e-06,
"loss": 0.4123,
"step": 631
},
{
"epoch": 9.465290806754222,
"grad_norm": 0.0123342923006372,
"learning_rate": 1.1522265561896263e-06,
"loss": 0.4154,
"step": 632
},
{
"epoch": 9.48030018761726,
"grad_norm": 0.012429900682600912,
"learning_rate": 1.14975002097621e-06,
"loss": 0.4152,
"step": 633
},
{
"epoch": 9.4953095684803,
"grad_norm": 0.01280375207676722,
"learning_rate": 1.1472725458851116e-06,
"loss": 0.415,
"step": 634
},
{
"epoch": 9.51031894934334,
"grad_norm": 0.012687397063652189,
"learning_rate": 1.144794146465735e-06,
"loss": 0.4304,
"step": 635
},
{
"epoch": 9.52532833020638,
"grad_norm": 0.012179956092863778,
"learning_rate": 1.1423148382732853e-06,
"loss": 0.4093,
"step": 636
},
{
"epoch": 9.540337711069418,
"grad_norm": 0.012093011406295692,
"learning_rate": 1.1398346368686714e-06,
"loss": 0.418,
"step": 637
},
{
"epoch": 9.555347091932457,
"grad_norm": 0.013454735147744316,
"learning_rate": 1.1373535578184082e-06,
"loss": 0.4264,
"step": 638
},
{
"epoch": 9.570356472795497,
"grad_norm": 0.012555914733497363,
"learning_rate": 1.1348716166945195e-06,
"loss": 0.4212,
"step": 639
},
{
"epoch": 9.585365853658537,
"grad_norm": 0.01309842650785753,
"learning_rate": 1.1323888290744385e-06,
"loss": 0.4229,
"step": 640
},
{
"epoch": 9.600375234521575,
"grad_norm": 0.013049394375582246,
"learning_rate": 1.1299052105409134e-06,
"loss": 0.4235,
"step": 641
},
{
"epoch": 9.615384615384615,
"grad_norm": 0.012029514118628914,
"learning_rate": 1.127420776681905e-06,
"loss": 0.4132,
"step": 642
},
{
"epoch": 9.630393996247655,
"grad_norm": 0.012689737627309502,
"learning_rate": 1.1249355430904929e-06,
"loss": 0.4234,
"step": 643
},
{
"epoch": 9.645403377110695,
"grad_norm": 0.012003229452039505,
"learning_rate": 1.1224495253647754e-06,
"loss": 0.4166,
"step": 644
},
{
"epoch": 9.660412757973734,
"grad_norm": 0.012066969097037491,
"learning_rate": 1.119962739107773e-06,
"loss": 0.4092,
"step": 645
},
{
"epoch": 9.675422138836772,
"grad_norm": 0.012338956794216918,
"learning_rate": 1.117475199927329e-06,
"loss": 0.4282,
"step": 646
},
{
"epoch": 9.690431519699812,
"grad_norm": 0.012204286663618873,
"learning_rate": 1.1149869234360126e-06,
"loss": 0.4314,
"step": 647
},
{
"epoch": 9.705440900562852,
"grad_norm": 0.01259052031260883,
"learning_rate": 1.1124979252510207e-06,
"loss": 0.4305,
"step": 648
},
{
"epoch": 9.720450281425892,
"grad_norm": 0.013022893825570933,
"learning_rate": 1.1100082209940793e-06,
"loss": 0.4198,
"step": 649
},
{
"epoch": 9.73545966228893,
"grad_norm": 0.012690810978915846,
"learning_rate": 1.1075178262913466e-06,
"loss": 0.4109,
"step": 650
},
{
"epoch": 9.75046904315197,
"grad_norm": 0.012631087513351836,
"learning_rate": 1.1050267567733138e-06,
"loss": 0.4161,
"step": 651
},
{
"epoch": 9.76547842401501,
"grad_norm": 0.012804454368901522,
"learning_rate": 1.1025350280747073e-06,
"loss": 0.4196,
"step": 652
},
{
"epoch": 9.78048780487805,
"grad_norm": 0.0126217418321894,
"learning_rate": 1.1000426558343909e-06,
"loss": 0.421,
"step": 653
},
{
"epoch": 9.795497185741088,
"grad_norm": 0.012742956495133224,
"learning_rate": 1.097549655695268e-06,
"loss": 0.4175,
"step": 654
},
{
"epoch": 9.810506566604127,
"grad_norm": 0.012689525456554308,
"learning_rate": 1.0950560433041825e-06,
"loss": 0.4078,
"step": 655
},
{
"epoch": 9.825515947467167,
"grad_norm": 0.012319356429640735,
"learning_rate": 1.0925618343118207e-06,
"loss": 0.4249,
"step": 656
},
{
"epoch": 9.840525328330207,
"grad_norm": 0.012463815882183473,
"learning_rate": 1.0900670443726134e-06,
"loss": 0.416,
"step": 657
},
{
"epoch": 9.855534709193245,
"grad_norm": 0.012494008236093816,
"learning_rate": 1.087571689144638e-06,
"loss": 0.4094,
"step": 658
},
{
"epoch": 9.870544090056285,
"grad_norm": 0.012788296107781863,
"learning_rate": 1.0850757842895193e-06,
"loss": 0.4134,
"step": 659
},
{
"epoch": 9.885553470919325,
"grad_norm": 0.011949790784482477,
"learning_rate": 1.0825793454723324e-06,
"loss": 0.4123,
"step": 660
},
{
"epoch": 9.885553470919325,
"eval_loss": 0.40209999680519104,
"eval_runtime": 13.8285,
"eval_samples_per_second": 32.324,
"eval_steps_per_second": 2.025,
"step": 660
},
{
"epoch": 9.900562851782365,
"grad_norm": 0.012120763384507233,
"learning_rate": 1.0800823883615032e-06,
"loss": 0.418,
"step": 661
},
{
"epoch": 9.915572232645403,
"grad_norm": 0.01272550451559427,
"learning_rate": 1.0775849286287104e-06,
"loss": 0.4255,
"step": 662
},
{
"epoch": 9.930581613508442,
"grad_norm": 0.012569887569723945,
"learning_rate": 1.0750869819487883e-06,
"loss": 0.4181,
"step": 663
},
{
"epoch": 9.945590994371482,
"grad_norm": 0.012900893185061444,
"learning_rate": 1.0725885639996262e-06,
"loss": 0.4256,
"step": 664
},
{
"epoch": 9.960600375234522,
"grad_norm": 0.012186922057616576,
"learning_rate": 1.0700896904620722e-06,
"loss": 0.4239,
"step": 665
},
{
"epoch": 9.975609756097562,
"grad_norm": 0.012546185280174902,
"learning_rate": 1.0675903770198332e-06,
"loss": 0.4096,
"step": 666
},
{
"epoch": 9.9906191369606,
"grad_norm": 0.012308385784068267,
"learning_rate": 1.0650906393593768e-06,
"loss": 0.417,
"step": 667
},
{
"epoch": 10.01500938086304,
"grad_norm": 0.014542424040829479,
"learning_rate": 1.0625904931698345e-06,
"loss": 0.8235,
"step": 668
},
{
"epoch": 10.03001876172608,
"grad_norm": 0.012357617955558025,
"learning_rate": 1.0600899541429002e-06,
"loss": 0.4132,
"step": 669
},
{
"epoch": 10.045028142589118,
"grad_norm": 0.01238382198584047,
"learning_rate": 1.057589037972735e-06,
"loss": 0.409,
"step": 670
},
{
"epoch": 10.060037523452158,
"grad_norm": 0.012545757227630114,
"learning_rate": 1.0550877603558654e-06,
"loss": 0.4202,
"step": 671
},
{
"epoch": 10.075046904315197,
"grad_norm": 0.012296729178528745,
"learning_rate": 1.0525861369910876e-06,
"loss": 0.4118,
"step": 672
},
{
"epoch": 10.090056285178237,
"grad_norm": 0.01233936400904742,
"learning_rate": 1.0500841835793676e-06,
"loss": 0.4186,
"step": 673
},
{
"epoch": 10.105065666041275,
"grad_norm": 0.01239644213592733,
"learning_rate": 1.0475819158237424e-06,
"loss": 0.4211,
"step": 674
},
{
"epoch": 10.120075046904315,
"grad_norm": 0.01238165563747052,
"learning_rate": 1.0450793494292222e-06,
"loss": 0.4192,
"step": 675
},
{
"epoch": 10.135084427767355,
"grad_norm": 0.01228638867675189,
"learning_rate": 1.0425765001026922e-06,
"loss": 0.4122,
"step": 676
},
{
"epoch": 10.150093808630395,
"grad_norm": 0.012503182702259674,
"learning_rate": 1.0400733835528124e-06,
"loss": 0.4257,
"step": 677
},
{
"epoch": 10.165103189493433,
"grad_norm": 0.012016184893121317,
"learning_rate": 1.0375700154899207e-06,
"loss": 0.3982,
"step": 678
},
{
"epoch": 10.180112570356473,
"grad_norm": 0.012647564056457507,
"learning_rate": 1.0350664116259326e-06,
"loss": 0.4247,
"step": 679
},
{
"epoch": 10.195121951219512,
"grad_norm": 0.012624563972519114,
"learning_rate": 1.032562587674245e-06,
"loss": 0.4226,
"step": 680
},
{
"epoch": 10.210131332082552,
"grad_norm": 0.01266422823244068,
"learning_rate": 1.0300585593496347e-06,
"loss": 0.4236,
"step": 681
},
{
"epoch": 10.22514071294559,
"grad_norm": 0.013120899008077981,
"learning_rate": 1.0275543423681621e-06,
"loss": 0.4267,
"step": 682
},
{
"epoch": 10.24015009380863,
"grad_norm": 0.012469850454087042,
"learning_rate": 1.0250499524470713e-06,
"loss": 0.4185,
"step": 683
},
{
"epoch": 10.25515947467167,
"grad_norm": 0.013032775148789266,
"learning_rate": 1.022545405304692e-06,
"loss": 0.4173,
"step": 684
},
{
"epoch": 10.27016885553471,
"grad_norm": 0.012130912167583757,
"learning_rate": 1.020040716660341e-06,
"loss": 0.4174,
"step": 685
},
{
"epoch": 10.285178236397748,
"grad_norm": 0.012694584356580994,
"learning_rate": 1.0175359022342224e-06,
"loss": 0.4201,
"step": 686
},
{
"epoch": 10.300187617260788,
"grad_norm": 0.012738648311210665,
"learning_rate": 1.0150309777473304e-06,
"loss": 0.4246,
"step": 687
},
{
"epoch": 10.315196998123827,
"grad_norm": 0.012541064779257812,
"learning_rate": 1.0125259589213495e-06,
"loss": 0.4237,
"step": 688
},
{
"epoch": 10.330206378986867,
"grad_norm": 0.012159573126019833,
"learning_rate": 1.0100208614785565e-06,
"loss": 0.4236,
"step": 689
},
{
"epoch": 10.345215759849907,
"grad_norm": 0.012655017443475823,
"learning_rate": 1.007515701141722e-06,
"loss": 0.414,
"step": 690
},
{
"epoch": 10.360225140712945,
"grad_norm": 0.01201000687630199,
"learning_rate": 1.0050104936340107e-06,
"loss": 0.4185,
"step": 691
},
{
"epoch": 10.375234521575985,
"grad_norm": 0.012362021002323115,
"learning_rate": 1.002505254678884e-06,
"loss": 0.4226,
"step": 692
},
{
"epoch": 10.390243902439025,
"grad_norm": 0.012831699499431226,
"learning_rate": 1e-06,
"loss": 0.4267,
"step": 693
},
{
"epoch": 10.405253283302065,
"grad_norm": 0.01274780819591858,
"learning_rate": 9.97494745321116e-07,
"loss": 0.4109,
"step": 694
},
{
"epoch": 10.420262664165103,
"grad_norm": 0.013774541103795192,
"learning_rate": 9.949895063659892e-07,
"loss": 0.4125,
"step": 695
},
{
"epoch": 10.435272045028142,
"grad_norm": 0.012506222777518427,
"learning_rate": 9.924842988582782e-07,
"loss": 0.4214,
"step": 696
},
{
"epoch": 10.450281425891182,
"grad_norm": 0.01244429874279497,
"learning_rate": 9.899791385214436e-07,
"loss": 0.4051,
"step": 697
},
{
"epoch": 10.465290806754222,
"grad_norm": 0.012191950069124378,
"learning_rate": 9.874740410786506e-07,
"loss": 0.4118,
"step": 698
},
{
"epoch": 10.48030018761726,
"grad_norm": 0.013197523206823903,
"learning_rate": 9.849690222526697e-07,
"loss": 0.416,
"step": 699
},
{
"epoch": 10.4953095684803,
"grad_norm": 0.012181682595547577,
"learning_rate": 9.824640977657773e-07,
"loss": 0.4105,
"step": 700
},
{
"epoch": 10.51031894934334,
"grad_norm": 0.01234029745826439,
"learning_rate": 9.79959283339659e-07,
"loss": 0.4198,
"step": 701
},
{
"epoch": 10.52532833020638,
"grad_norm": 0.012785453706742769,
"learning_rate": 9.77454594695308e-07,
"loss": 0.4143,
"step": 702
},
{
"epoch": 10.540337711069418,
"grad_norm": 0.012575428471071192,
"learning_rate": 9.749500475529289e-07,
"loss": 0.411,
"step": 703
},
{
"epoch": 10.555347091932457,
"grad_norm": 0.012318721406990746,
"learning_rate": 9.72445657631838e-07,
"loss": 0.3997,
"step": 704
},
{
"epoch": 10.570356472795497,
"grad_norm": 0.012789956349847692,
"learning_rate": 9.699414406503652e-07,
"loss": 0.4176,
"step": 705
},
{
"epoch": 10.585365853658537,
"grad_norm": 0.01328367201839251,
"learning_rate": 9.674374123257553e-07,
"loss": 0.4202,
"step": 706
},
{
"epoch": 10.600375234521575,
"grad_norm": 0.013024092725146954,
"learning_rate": 9.649335883740673e-07,
"loss": 0.4158,
"step": 707
},
{
"epoch": 10.615384615384615,
"grad_norm": 0.013090434740472322,
"learning_rate": 9.624299845100794e-07,
"loss": 0.4101,
"step": 708
},
{
"epoch": 10.630393996247655,
"grad_norm": 0.012176392973424509,
"learning_rate": 9.599266164471873e-07,
"loss": 0.4073,
"step": 709
},
{
"epoch": 10.645403377110695,
"grad_norm": 0.012247800521725064,
"learning_rate": 9.574234998973075e-07,
"loss": 0.4126,
"step": 710
},
{
"epoch": 10.660412757973734,
"grad_norm": 0.012220616675445717,
"learning_rate": 9.549206505707777e-07,
"loss": 0.4145,
"step": 711
},
{
"epoch": 10.675422138836772,
"grad_norm": 0.01252656468465896,
"learning_rate": 9.524180841762576e-07,
"loss": 0.4226,
"step": 712
},
{
"epoch": 10.690431519699812,
"grad_norm": 0.01239436211827446,
"learning_rate": 9.499158164206324e-07,
"loss": 0.4181,
"step": 713
},
{
"epoch": 10.705440900562852,
"grad_norm": 0.012506887073625421,
"learning_rate": 9.474138630089123e-07,
"loss": 0.4114,
"step": 714
},
{
"epoch": 10.720450281425892,
"grad_norm": 0.012310549268342016,
"learning_rate": 9.449122396441343e-07,
"loss": 0.4048,
"step": 715
},
{
"epoch": 10.73545966228893,
"grad_norm": 0.012239695816293078,
"learning_rate": 9.424109620272652e-07,
"loss": 0.4208,
"step": 716
},
{
"epoch": 10.75046904315197,
"grad_norm": 0.012441047937013722,
"learning_rate": 9.399100458570996e-07,
"loss": 0.4039,
"step": 717
},
{
"epoch": 10.76547842401501,
"grad_norm": 0.01206547884076615,
"learning_rate": 9.374095068301656e-07,
"loss": 0.4103,
"step": 718
},
{
"epoch": 10.78048780487805,
"grad_norm": 0.012386755042071106,
"learning_rate": 9.349093606406231e-07,
"loss": 0.4163,
"step": 719
},
{
"epoch": 10.795497185741088,
"grad_norm": 0.012498396538373382,
"learning_rate": 9.324096229801673e-07,
"loss": 0.4196,
"step": 720
},
{
"epoch": 10.810506566604127,
"grad_norm": 0.012178639675100663,
"learning_rate": 9.299103095379281e-07,
"loss": 0.4135,
"step": 721
},
{
"epoch": 10.825515947467167,
"grad_norm": 0.012463279304890761,
"learning_rate": 9.274114360003737e-07,
"loss": 0.421,
"step": 722
},
{
"epoch": 10.840525328330207,
"grad_norm": 0.012419698179239286,
"learning_rate": 9.249130180512116e-07,
"loss": 0.4138,
"step": 723
},
{
"epoch": 10.855534709193245,
"grad_norm": 0.012609451632788022,
"learning_rate": 9.224150713712894e-07,
"loss": 0.4111,
"step": 724
},
{
"epoch": 10.870544090056285,
"grad_norm": 0.01264688365836291,
"learning_rate": 9.199176116384973e-07,
"loss": 0.4101,
"step": 725
},
{
"epoch": 10.885553470919325,
"grad_norm": 0.012922477609598963,
"learning_rate": 9.174206545276677e-07,
"loss": 0.4103,
"step": 726
},
{
"epoch": 10.885553470919325,
"eval_loss": 0.3998468220233917,
"eval_runtime": 13.6727,
"eval_samples_per_second": 32.693,
"eval_steps_per_second": 2.048,
"step": 726
},
{
"epoch": 10.900562851782365,
"grad_norm": 0.012222567401425159,
"learning_rate": 9.149242157104806e-07,
"loss": 0.4028,
"step": 727
},
{
"epoch": 10.915572232645403,
"grad_norm": 0.012431994303602674,
"learning_rate": 9.12428310855362e-07,
"loss": 0.4252,
"step": 728
},
{
"epoch": 10.930581613508442,
"grad_norm": 0.01224992043513614,
"learning_rate": 9.099329556273865e-07,
"loss": 0.4194,
"step": 729
},
{
"epoch": 10.945590994371482,
"grad_norm": 0.012591228897490484,
"learning_rate": 9.074381656881796e-07,
"loss": 0.4162,
"step": 730
},
{
"epoch": 10.960600375234522,
"grad_norm": 0.01281665111811267,
"learning_rate": 9.049439566958175e-07,
"loss": 0.4168,
"step": 731
},
{
"epoch": 10.975609756097562,
"grad_norm": 0.012052834238919263,
"learning_rate": 9.024503443047318e-07,
"loss": 0.4195,
"step": 732
},
{
"epoch": 10.9906191369606,
"grad_norm": 0.012847179302409485,
"learning_rate": 8.999573441656089e-07,
"loss": 0.4158,
"step": 733
},
{
"epoch": 11.0,
"grad_norm": 0.012847179302409485,
"learning_rate": 8.974649719252928e-07,
"loss": 0.4147,
"step": 734
},
{
"epoch": 11.01500938086304,
"grad_norm": 0.016922785752928025,
"learning_rate": 8.949732432266866e-07,
"loss": 0.4058,
"step": 735
},
{
"epoch": 11.03001876172608,
"grad_norm": 0.012040148173995293,
"learning_rate": 8.924821737086535e-07,
"loss": 0.4197,
"step": 736
},
{
"epoch": 11.045028142589118,
"grad_norm": 0.01243791973891847,
"learning_rate": 8.899917790059207e-07,
"loss": 0.4225,
"step": 737
},
{
"epoch": 11.060037523452158,
"grad_norm": 0.01195989075560471,
"learning_rate": 8.875020747489793e-07,
"loss": 0.4163,
"step": 738
},
{
"epoch": 11.075046904315197,
"grad_norm": 0.012609607466383803,
"learning_rate": 8.850130765639872e-07,
"loss": 0.4229,
"step": 739
},
{
"epoch": 11.090056285178237,
"grad_norm": 0.012478853178643596,
"learning_rate": 8.825248000726713e-07,
"loss": 0.4203,
"step": 740
},
{
"epoch": 11.105065666041275,
"grad_norm": 0.012533296902894696,
"learning_rate": 8.80037260892227e-07,
"loss": 0.4097,
"step": 741
},
{
"epoch": 11.120075046904315,
"grad_norm": 0.012463619425894766,
"learning_rate": 8.775504746352246e-07,
"loss": 0.4099,
"step": 742
},
{
"epoch": 11.135084427767355,
"grad_norm": 0.012572037460674756,
"learning_rate": 8.750644569095072e-07,
"loss": 0.4207,
"step": 743
},
{
"epoch": 11.150093808630395,
"grad_norm": 0.012817195911965876,
"learning_rate": 8.72579223318095e-07,
"loss": 0.4139,
"step": 744
},
{
"epoch": 11.165103189493433,
"grad_norm": 0.013023585144428015,
"learning_rate": 8.70094789459087e-07,
"loss": 0.4181,
"step": 745
},
{
"epoch": 11.180112570356473,
"grad_norm": 0.012339182343402779,
"learning_rate": 8.676111709255614e-07,
"loss": 0.4186,
"step": 746
},
{
"epoch": 11.195121951219512,
"grad_norm": 0.012231541298556112,
"learning_rate": 8.651283833054808e-07,
"loss": 0.4087,
"step": 747
},
{
"epoch": 11.210131332082552,
"grad_norm": 0.012477160954476265,
"learning_rate": 8.626464421815918e-07,
"loss": 0.4223,
"step": 748
},
{
"epoch": 11.22514071294559,
"grad_norm": 0.01247166784817762,
"learning_rate": 8.601653631313287e-07,
"loss": 0.4218,
"step": 749
},
{
"epoch": 11.24015009380863,
"grad_norm": 0.012436312283189853,
"learning_rate": 8.576851617267149e-07,
"loss": 0.4148,
"step": 750
},
{
"epoch": 11.25515947467167,
"grad_norm": 0.012388511629756823,
"learning_rate": 8.552058535342652e-07,
"loss": 0.4127,
"step": 751
},
{
"epoch": 11.27016885553471,
"grad_norm": 0.012312475936173602,
"learning_rate": 8.527274541148884e-07,
"loss": 0.4085,
"step": 752
},
{
"epoch": 11.285178236397748,
"grad_norm": 0.011970201855727553,
"learning_rate": 8.502499790237899e-07,
"loss": 0.4007,
"step": 753
},
{
"epoch": 11.300187617260788,
"grad_norm": 0.012623949917184176,
"learning_rate": 8.477734438103735e-07,
"loss": 0.4119,
"step": 754
},
{
"epoch": 11.315196998123827,
"grad_norm": 0.012212246192249409,
"learning_rate": 8.452978640181444e-07,
"loss": 0.4018,
"step": 755
},
{
"epoch": 11.330206378986867,
"grad_norm": 0.012416638149246855,
"learning_rate": 8.428232551846101e-07,
"loss": 0.4088,
"step": 756
},
{
"epoch": 11.345215759849907,
"grad_norm": 0.012132111502099707,
"learning_rate": 8.40349632841185e-07,
"loss": 0.4114,
"step": 757
},
{
"epoch": 11.360225140712945,
"grad_norm": 0.012267161024372699,
"learning_rate": 8.378770125130924e-07,
"loss": 0.4111,
"step": 758
},
{
"epoch": 11.375234521575985,
"grad_norm": 0.01283364696949894,
"learning_rate": 8.354054097192659e-07,
"loss": 0.4191,
"step": 759
},
{
"epoch": 11.390243902439025,
"grad_norm": 0.012709567548725487,
"learning_rate": 8.329348399722533e-07,
"loss": 0.4128,
"step": 760
},
{
"epoch": 11.405253283302065,
"grad_norm": 0.012340710769494431,
"learning_rate": 8.304653187781175e-07,
"loss": 0.4011,
"step": 761
},
{
"epoch": 11.420262664165103,
"grad_norm": 0.012429569405680763,
"learning_rate": 8.279968616363417e-07,
"loss": 0.4074,
"step": 762
},
{
"epoch": 11.435272045028142,
"grad_norm": 0.011810395153086212,
"learning_rate": 8.2552948403973e-07,
"loss": 0.4134,
"step": 763
},
{
"epoch": 11.450281425891182,
"grad_norm": 0.012235707581225837,
"learning_rate": 8.230632014743114e-07,
"loss": 0.4209,
"step": 764
},
{
"epoch": 11.465290806754222,
"grad_norm": 0.012394322030979758,
"learning_rate": 8.205980294192421e-07,
"loss": 0.4156,
"step": 765
},
{
"epoch": 11.48030018761726,
"grad_norm": 0.01233224667407161,
"learning_rate": 8.181339833467078e-07,
"loss": 0.4129,
"step": 766
},
{
"epoch": 11.4953095684803,
"grad_norm": 0.012894820199788238,
"learning_rate": 8.156710787218277e-07,
"loss": 0.4022,
"step": 767
},
{
"epoch": 11.51031894934334,
"grad_norm": 0.012115396789355592,
"learning_rate": 8.132093310025571e-07,
"loss": 0.4227,
"step": 768
},
{
"epoch": 11.52532833020638,
"grad_norm": 0.01250946865021327,
"learning_rate": 8.107487556395901e-07,
"loss": 0.4167,
"step": 769
},
{
"epoch": 11.540337711069418,
"grad_norm": 0.01235958458093249,
"learning_rate": 8.082893680762618e-07,
"loss": 0.4159,
"step": 770
},
{
"epoch": 11.555347091932457,
"grad_norm": 0.01251555314637417,
"learning_rate": 8.058311837484535e-07,
"loss": 0.4179,
"step": 771
},
{
"epoch": 11.570356472795497,
"grad_norm": 0.012567617246515256,
"learning_rate": 8.03374218084494e-07,
"loss": 0.4139,
"step": 772
},
{
"epoch": 11.585365853658537,
"grad_norm": 0.012363452875523725,
"learning_rate": 8.009184865050639e-07,
"loss": 0.4125,
"step": 773
},
{
"epoch": 11.600375234521575,
"grad_norm": 0.012315468666145416,
"learning_rate": 7.984640044230983e-07,
"loss": 0.4125,
"step": 774
},
{
"epoch": 11.615384615384615,
"grad_norm": 0.012111170617275159,
"learning_rate": 7.960107872436887e-07,
"loss": 0.4082,
"step": 775
},
{
"epoch": 11.630393996247655,
"grad_norm": 0.012314813108895683,
"learning_rate": 7.935588503639891e-07,
"loss": 0.4205,
"step": 776
},
{
"epoch": 11.645403377110695,
"grad_norm": 0.012195711107959282,
"learning_rate": 7.91108209173118e-07,
"loss": 0.4175,
"step": 777
},
{
"epoch": 11.660412757973734,
"grad_norm": 0.012204558616190958,
"learning_rate": 7.886588790520608e-07,
"loss": 0.4176,
"step": 778
},
{
"epoch": 11.675422138836772,
"grad_norm": 0.012574502223338559,
"learning_rate": 7.862108753735752e-07,
"loss": 0.4141,
"step": 779
},
{
"epoch": 11.690431519699812,
"grad_norm": 0.012138755761975865,
"learning_rate": 7.837642135020928e-07,
"loss": 0.4144,
"step": 780
},
{
"epoch": 11.705440900562852,
"grad_norm": 0.012892259806495502,
"learning_rate": 7.813189087936242e-07,
"loss": 0.4165,
"step": 781
},
{
"epoch": 11.720450281425892,
"grad_norm": 0.012462979592728919,
"learning_rate": 7.788749765956619e-07,
"loss": 0.4018,
"step": 782
},
{
"epoch": 11.73545966228893,
"grad_norm": 0.012473208835669184,
"learning_rate": 7.764324322470841e-07,
"loss": 0.4136,
"step": 783
},
{
"epoch": 11.75046904315197,
"grad_norm": 0.01289776168996197,
"learning_rate": 7.739912910780589e-07,
"loss": 0.4199,
"step": 784
},
{
"epoch": 11.76547842401501,
"grad_norm": 0.01217754538151564,
"learning_rate": 7.715515684099462e-07,
"loss": 0.4151,
"step": 785
},
{
"epoch": 11.78048780487805,
"grad_norm": 0.012374436487471281,
"learning_rate": 7.691132795552042e-07,
"loss": 0.4076,
"step": 786
},
{
"epoch": 11.795497185741088,
"grad_norm": 0.012391305303445135,
"learning_rate": 7.666764398172917e-07,
"loss": 0.4241,
"step": 787
},
{
"epoch": 11.810506566604127,
"grad_norm": 0.012727898092571818,
"learning_rate": 7.642410644905726e-07,
"loss": 0.4066,
"step": 788
},
{
"epoch": 11.825515947467167,
"grad_norm": 0.012072572789924537,
"learning_rate": 7.618071688602198e-07,
"loss": 0.411,
"step": 789
},
{
"epoch": 11.840525328330207,
"grad_norm": 0.012477736514295582,
"learning_rate": 7.593747682021181e-07,
"loss": 0.4162,
"step": 790
},
{
"epoch": 11.855534709193245,
"grad_norm": 0.01223133703780588,
"learning_rate": 7.569438777827705e-07,
"loss": 0.4139,
"step": 791
},
{
"epoch": 11.870544090056285,
"grad_norm": 0.012134907953234304,
"learning_rate": 7.545145128592008e-07,
"loss": 0.4143,
"step": 792
},
{
"epoch": 11.870544090056285,
"eval_loss": 0.3981357216835022,
"eval_runtime": 13.8587,
"eval_samples_per_second": 32.254,
"eval_steps_per_second": 2.02,
"step": 792
},
{
"epoch": 11.885553470919325,
"grad_norm": 0.012556981627557074,
"learning_rate": 7.520866886788587e-07,
"loss": 0.4137,
"step": 793
},
{
"epoch": 11.900562851782365,
"grad_norm": 0.012816523723577786,
"learning_rate": 7.496604204795234e-07,
"loss": 0.4035,
"step": 794
},
{
"epoch": 11.915572232645403,
"grad_norm": 0.012143845555033037,
"learning_rate": 7.472357234892081e-07,
"loss": 0.4006,
"step": 795
},
{
"epoch": 11.930581613508442,
"grad_norm": 0.012405667588160222,
"learning_rate": 7.448126129260651e-07,
"loss": 0.4086,
"step": 796
},
{
"epoch": 11.945590994371482,
"grad_norm": 0.012494666356775595,
"learning_rate": 7.423911039982893e-07,
"loss": 0.4188,
"step": 797
},
{
"epoch": 11.960600375234522,
"grad_norm": 0.012828825056010124,
"learning_rate": 7.399712119040236e-07,
"loss": 0.4253,
"step": 798
},
{
"epoch": 11.975609756097562,
"grad_norm": 0.01248045961928763,
"learning_rate": 7.375529518312636e-07,
"loss": 0.4094,
"step": 799
},
{
"epoch": 11.9906191369606,
"grad_norm": 0.011879777000295733,
"learning_rate": 7.3513633895776e-07,
"loss": 0.4089,
"step": 800
},
{
"epoch": 12.0,
"grad_norm": 0.01480589004238864,
"learning_rate": 7.327213884509272e-07,
"loss": 0.4039,
"step": 801
},
{
"epoch": 12.01500938086304,
"grad_norm": 0.015597993205706172,
"learning_rate": 7.303081154677451e-07,
"loss": 0.4125,
"step": 802
},
{
"epoch": 12.03001876172608,
"grad_norm": 0.01238599658294667,
"learning_rate": 7.278965351546648e-07,
"loss": 0.4199,
"step": 803
},
{
"epoch": 12.045028142589118,
"grad_norm": 0.012326415759353096,
"learning_rate": 7.254866626475152e-07,
"loss": 0.4065,
"step": 804
},
{
"epoch": 12.060037523452158,
"grad_norm": 0.01249682081721596,
"learning_rate": 7.230785130714037e-07,
"loss": 0.4188,
"step": 805
},
{
"epoch": 12.075046904315197,
"grad_norm": 0.012342465641942092,
"learning_rate": 7.206721015406266e-07,
"loss": 0.4051,
"step": 806
},
{
"epoch": 12.090056285178237,
"grad_norm": 0.012375842848816656,
"learning_rate": 7.182674431585702e-07,
"loss": 0.4144,
"step": 807
},
{
"epoch": 12.105065666041275,
"grad_norm": 0.01235742358239482,
"learning_rate": 7.158645530176184e-07,
"loss": 0.4153,
"step": 808
},
{
"epoch": 12.120075046904315,
"grad_norm": 0.01242364276675938,
"learning_rate": 7.134634461990569e-07,
"loss": 0.4198,
"step": 809
},
{
"epoch": 12.135084427767355,
"grad_norm": 0.011962847590866504,
"learning_rate": 7.110641377729777e-07,
"loss": 0.4115,
"step": 810
},
{
"epoch": 12.150093808630395,
"grad_norm": 0.012325487589974966,
"learning_rate": 7.086666427981868e-07,
"loss": 0.4125,
"step": 811
},
{
"epoch": 12.165103189493433,
"grad_norm": 0.012286797408210898,
"learning_rate": 7.062709763221078e-07,
"loss": 0.4087,
"step": 812
},
{
"epoch": 12.180112570356473,
"grad_norm": 0.012087334628339325,
"learning_rate": 7.038771533806883e-07,
"loss": 0.4183,
"step": 813
},
{
"epoch": 12.195121951219512,
"grad_norm": 0.01206278765338631,
"learning_rate": 7.014851889983057e-07,
"loss": 0.4171,
"step": 814
},
{
"epoch": 12.210131332082552,
"grad_norm": 0.01214504139796841,
"learning_rate": 6.990950981876709e-07,
"loss": 0.4016,
"step": 815
},
{
"epoch": 12.22514071294559,
"grad_norm": 0.012463474933901226,
"learning_rate": 6.967068959497376e-07,
"loss": 0.4138,
"step": 816
},
{
"epoch": 12.24015009380863,
"grad_norm": 0.012038537835947886,
"learning_rate": 6.94320597273605e-07,
"loss": 0.4072,
"step": 817
},
{
"epoch": 12.25515947467167,
"grad_norm": 0.012375869455876078,
"learning_rate": 6.919362171364261e-07,
"loss": 0.4187,
"step": 818
},
{
"epoch": 12.27016885553471,
"grad_norm": 0.012447453448819253,
"learning_rate": 6.895537705033107e-07,
"loss": 0.4072,
"step": 819
},
{
"epoch": 12.285178236397748,
"grad_norm": 0.012624813232774283,
"learning_rate": 6.871732723272354e-07,
"loss": 0.4084,
"step": 820
},
{
"epoch": 12.300187617260788,
"grad_norm": 0.012070464285206625,
"learning_rate": 6.847947375489464e-07,
"loss": 0.4091,
"step": 821
},
{
"epoch": 12.315196998123827,
"grad_norm": 0.012323123920528901,
"learning_rate": 6.824181810968674e-07,
"loss": 0.4077,
"step": 822
},
{
"epoch": 12.330206378986867,
"grad_norm": 0.012423328528856116,
"learning_rate": 6.800436178870057e-07,
"loss": 0.4176,
"step": 823
},
{
"epoch": 12.345215759849907,
"grad_norm": 0.011822785847084157,
"learning_rate": 6.776710628228576e-07,
"loss": 0.4072,
"step": 824
},
{
"epoch": 12.360225140712945,
"grad_norm": 0.01263193125503588,
"learning_rate": 6.753005307953165e-07,
"loss": 0.4125,
"step": 825
},
{
"epoch": 12.375234521575985,
"grad_norm": 0.011667379545734729,
"learning_rate": 6.729320366825783e-07,
"loss": 0.4113,
"step": 826
},
{
"epoch": 12.390243902439025,
"grad_norm": 0.012237311305074636,
"learning_rate": 6.705655953500483e-07,
"loss": 0.413,
"step": 827
},
{
"epoch": 12.405253283302065,
"grad_norm": 0.012427319712453859,
"learning_rate": 6.682012216502483e-07,
"loss": 0.4189,
"step": 828
},
{
"epoch": 12.420262664165103,
"grad_norm": 0.012117640495853378,
"learning_rate": 6.658389304227219e-07,
"loss": 0.4157,
"step": 829
},
{
"epoch": 12.435272045028142,
"grad_norm": 0.012064176015325582,
"learning_rate": 6.634787364939434e-07,
"loss": 0.4048,
"step": 830
},
{
"epoch": 12.450281425891182,
"grad_norm": 0.012531719531208345,
"learning_rate": 6.611206546772237e-07,
"loss": 0.426,
"step": 831
},
{
"epoch": 12.465290806754222,
"grad_norm": 0.012359334486955045,
"learning_rate": 6.587646997726173e-07,
"loss": 0.4065,
"step": 832
},
{
"epoch": 12.48030018761726,
"grad_norm": 0.011957899962627493,
"learning_rate": 6.564108865668297e-07,
"loss": 0.4048,
"step": 833
},
{
"epoch": 12.4953095684803,
"grad_norm": 0.01170019242610937,
"learning_rate": 6.540592298331238e-07,
"loss": 0.4126,
"step": 834
},
{
"epoch": 12.51031894934334,
"grad_norm": 0.012331576031846944,
"learning_rate": 6.517097443312288e-07,
"loss": 0.4049,
"step": 835
},
{
"epoch": 12.52532833020638,
"grad_norm": 0.012366827738055758,
"learning_rate": 6.493624448072457e-07,
"loss": 0.4127,
"step": 836
},
{
"epoch": 12.540337711069418,
"grad_norm": 0.011860523908193414,
"learning_rate": 6.470173459935559e-07,
"loss": 0.4172,
"step": 837
},
{
"epoch": 12.555347091932457,
"grad_norm": 0.012415007915885625,
"learning_rate": 6.446744626087293e-07,
"loss": 0.4137,
"step": 838
},
{
"epoch": 12.570356472795497,
"grad_norm": 0.012431394669758795,
"learning_rate": 6.423338093574293e-07,
"loss": 0.4136,
"step": 839
},
{
"epoch": 12.585365853658537,
"grad_norm": 0.012176504783944421,
"learning_rate": 6.399954009303239e-07,
"loss": 0.411,
"step": 840
},
{
"epoch": 12.600375234521575,
"grad_norm": 0.01197249057193435,
"learning_rate": 6.376592520039912e-07,
"loss": 0.4141,
"step": 841
},
{
"epoch": 12.615384615384615,
"grad_norm": 0.012336864192816516,
"learning_rate": 6.35325377240828e-07,
"loss": 0.4138,
"step": 842
},
{
"epoch": 12.630393996247655,
"grad_norm": 0.011861502798104129,
"learning_rate": 6.329937912889581e-07,
"loss": 0.3931,
"step": 843
},
{
"epoch": 12.645403377110695,
"grad_norm": 0.012391892541325432,
"learning_rate": 6.306645087821392e-07,
"loss": 0.4241,
"step": 844
},
{
"epoch": 12.660412757973734,
"grad_norm": 0.012478980233980006,
"learning_rate": 6.283375443396726e-07,
"loss": 0.4161,
"step": 845
},
{
"epoch": 12.675422138836772,
"grad_norm": 0.012527299390245597,
"learning_rate": 6.260129125663105e-07,
"loss": 0.41,
"step": 846
},
{
"epoch": 12.690431519699812,
"grad_norm": 0.011927959930610983,
"learning_rate": 6.236906280521646e-07,
"loss": 0.4055,
"step": 847
},
{
"epoch": 12.705440900562852,
"grad_norm": 0.012720010931915432,
"learning_rate": 6.213707053726145e-07,
"loss": 0.4173,
"step": 848
},
{
"epoch": 12.720450281425892,
"grad_norm": 0.012556965039986982,
"learning_rate": 6.190531590882158e-07,
"loss": 0.4176,
"step": 849
},
{
"epoch": 12.73545966228893,
"grad_norm": 0.012646171657061266,
"learning_rate": 6.167380037446094e-07,
"loss": 0.4037,
"step": 850
},
{
"epoch": 12.75046904315197,
"grad_norm": 0.012363455660222528,
"learning_rate": 6.144252538724302e-07,
"loss": 0.4069,
"step": 851
},
{
"epoch": 12.76547842401501,
"grad_norm": 0.012481896036275676,
"learning_rate": 6.12114923987215e-07,
"loss": 0.4098,
"step": 852
},
{
"epoch": 12.78048780487805,
"grad_norm": 0.01219025499093808,
"learning_rate": 6.098070285893128e-07,
"loss": 0.4128,
"step": 853
},
{
"epoch": 12.795497185741088,
"grad_norm": 0.012221753177996873,
"learning_rate": 6.075015821637922e-07,
"loss": 0.4124,
"step": 854
},
{
"epoch": 12.810506566604127,
"grad_norm": 0.01242451231763906,
"learning_rate": 6.051985991803517e-07,
"loss": 0.4055,
"step": 855
},
{
"epoch": 12.825515947467167,
"grad_norm": 0.012105488715900472,
"learning_rate": 6.028980940932282e-07,
"loss": 0.398,
"step": 856
},
{
"epoch": 12.840525328330207,
"grad_norm": 0.012102745982651956,
"learning_rate": 6.006000813411069e-07,
"loss": 0.4152,
"step": 857
},
{
"epoch": 12.855534709193245,
"grad_norm": 0.012200334931675806,
"learning_rate": 5.983045753470307e-07,
"loss": 0.4146,
"step": 858
},
{
"epoch": 12.855534709193245,
"eval_loss": 0.39677873253822327,
"eval_runtime": 13.689,
"eval_samples_per_second": 32.654,
"eval_steps_per_second": 2.045,
"step": 858
},
{
"epoch": 12.870544090056285,
"grad_norm": 0.012649382929243192,
"learning_rate": 5.960115905183078e-07,
"loss": 0.4081,
"step": 859
},
{
"epoch": 12.885553470919325,
"grad_norm": 0.011929152112251899,
"learning_rate": 5.937211412464245e-07,
"loss": 0.4031,
"step": 860
},
{
"epoch": 12.900562851782365,
"grad_norm": 0.011962658774633357,
"learning_rate": 5.914332419069519e-07,
"loss": 0.4111,
"step": 861
},
{
"epoch": 12.915572232645403,
"grad_norm": 0.012120445434134606,
"learning_rate": 5.89147906859458e-07,
"loss": 0.4057,
"step": 862
},
{
"epoch": 12.930581613508442,
"grad_norm": 0.012223863205227052,
"learning_rate": 5.868651504474156e-07,
"loss": 0.422,
"step": 863
},
{
"epoch": 12.945590994371482,
"grad_norm": 0.012093907000714669,
"learning_rate": 5.845849869981136e-07,
"loss": 0.4063,
"step": 864
},
{
"epoch": 12.960600375234522,
"grad_norm": 0.011744441936770404,
"learning_rate": 5.823074308225668e-07,
"loss": 0.4214,
"step": 865
},
{
"epoch": 12.975609756097562,
"grad_norm": 0.012588359080999381,
"learning_rate": 5.800324962154251e-07,
"loss": 0.4104,
"step": 866
},
{
"epoch": 12.9906191369606,
"grad_norm": 0.012041813356469647,
"learning_rate": 5.777601974548866e-07,
"loss": 0.409,
"step": 867
},
{
"epoch": 13.0,
"grad_norm": 0.017454775927059043,
"learning_rate": 5.754905488026034e-07,
"loss": 0.4009,
"step": 868
},
{
"epoch": 13.01500938086304,
"grad_norm": 0.012590573823027078,
"learning_rate": 5.732235645035963e-07,
"loss": 0.4077,
"step": 869
},
{
"epoch": 13.03001876172608,
"grad_norm": 0.012476473937542754,
"learning_rate": 5.709592587861637e-07,
"loss": 0.409,
"step": 870
},
{
"epoch": 13.045028142589118,
"grad_norm": 0.01272863688069376,
"learning_rate": 5.686976458617921e-07,
"loss": 0.4203,
"step": 871
},
{
"epoch": 13.060037523452158,
"grad_norm": 0.012212260852439769,
"learning_rate": 5.664387399250672e-07,
"loss": 0.4052,
"step": 872
},
{
"epoch": 13.075046904315197,
"grad_norm": 0.012019289466009251,
"learning_rate": 5.641825551535848e-07,
"loss": 0.4038,
"step": 873
},
{
"epoch": 13.090056285178237,
"grad_norm": 0.011953777292384806,
"learning_rate": 5.619291057078618e-07,
"loss": 0.3931,
"step": 874
},
{
"epoch": 13.105065666041275,
"grad_norm": 0.01214865490238678,
"learning_rate": 5.596784057312474e-07,
"loss": 0.4007,
"step": 875
},
{
"epoch": 13.120075046904315,
"grad_norm": 0.011912792450191237,
"learning_rate": 5.574304693498345e-07,
"loss": 0.4192,
"step": 876
},
{
"epoch": 13.135084427767355,
"grad_norm": 0.012090316577047174,
"learning_rate": 5.551853106723709e-07,
"loss": 0.4073,
"step": 877
},
{
"epoch": 13.150093808630395,
"grad_norm": 0.013010062600398912,
"learning_rate": 5.529429437901696e-07,
"loss": 0.4227,
"step": 878
},
{
"epoch": 13.165103189493433,
"grad_norm": 0.01280067493337377,
"learning_rate": 5.507033827770225e-07,
"loss": 0.4126,
"step": 879
},
{
"epoch": 13.180112570356473,
"grad_norm": 0.012398125964523131,
"learning_rate": 5.484666416891108e-07,
"loss": 0.4023,
"step": 880
},
{
"epoch": 13.195121951219512,
"grad_norm": 0.012420941719772401,
"learning_rate": 5.462327345649165e-07,
"loss": 0.4044,
"step": 881
},
{
"epoch": 13.210131332082552,
"grad_norm": 0.012049668556132256,
"learning_rate": 5.440016754251364e-07,
"loss": 0.4175,
"step": 882
},
{
"epoch": 13.22514071294559,
"grad_norm": 0.012258701638030365,
"learning_rate": 5.417734782725896e-07,
"loss": 0.416,
"step": 883
},
{
"epoch": 13.24015009380863,
"grad_norm": 0.01192198170753603,
"learning_rate": 5.395481570921349e-07,
"loss": 0.4039,
"step": 884
},
{
"epoch": 13.25515947467167,
"grad_norm": 0.012554184751834683,
"learning_rate": 5.373257258505796e-07,
"loss": 0.4156,
"step": 885
},
{
"epoch": 13.27016885553471,
"grad_norm": 0.012851533427761994,
"learning_rate": 5.351061984965931e-07,
"loss": 0.4197,
"step": 886
},
{
"epoch": 13.285178236397748,
"grad_norm": 0.012316726606129447,
"learning_rate": 5.328895889606193e-07,
"loss": 0.4236,
"step": 887
},
{
"epoch": 13.300187617260788,
"grad_norm": 0.013029382216239293,
"learning_rate": 5.306759111547881e-07,
"loss": 0.427,
"step": 888
},
{
"epoch": 13.315196998123827,
"grad_norm": 0.012480929561426931,
"learning_rate": 5.284651789728296e-07,
"loss": 0.4107,
"step": 889
},
{
"epoch": 13.330206378986867,
"grad_norm": 0.01199215605719895,
"learning_rate": 5.262574062899866e-07,
"loss": 0.3977,
"step": 890
},
{
"epoch": 13.345215759849907,
"grad_norm": 0.011780966276621337,
"learning_rate": 5.240526069629264e-07,
"loss": 0.4078,
"step": 891
},
{
"epoch": 13.360225140712945,
"grad_norm": 0.012128143743191527,
"learning_rate": 5.218507948296556e-07,
"loss": 0.4143,
"step": 892
},
{
"epoch": 13.375234521575985,
"grad_norm": 0.011698499209161491,
"learning_rate": 5.196519837094306e-07,
"loss": 0.3999,
"step": 893
},
{
"epoch": 13.390243902439025,
"grad_norm": 0.011777330802458462,
"learning_rate": 5.174561874026741e-07,
"loss": 0.4202,
"step": 894
},
{
"epoch": 13.405253283302065,
"grad_norm": 0.012154703240758565,
"learning_rate": 5.152634196908861e-07,
"loss": 0.411,
"step": 895
},
{
"epoch": 13.420262664165103,
"grad_norm": 0.012477581253436483,
"learning_rate": 5.13073694336558e-07,
"loss": 0.3935,
"step": 896
},
{
"epoch": 13.435272045028142,
"grad_norm": 0.012300889865186484,
"learning_rate": 5.108870250830881e-07,
"loss": 0.4173,
"step": 897
},
{
"epoch": 13.450281425891182,
"grad_norm": 0.01236137621122289,
"learning_rate": 5.087034256546912e-07,
"loss": 0.4063,
"step": 898
},
{
"epoch": 13.465290806754222,
"grad_norm": 0.01178006897188101,
"learning_rate": 5.065229097563164e-07,
"loss": 0.4027,
"step": 899
},
{
"epoch": 13.48030018761726,
"grad_norm": 0.012622903840672843,
"learning_rate": 5.043454910735593e-07,
"loss": 0.4007,
"step": 900
},
{
"epoch": 13.4953095684803,
"grad_norm": 0.01232943004197537,
"learning_rate": 5.021711832725767e-07,
"loss": 0.4101,
"step": 901
},
{
"epoch": 13.51031894934334,
"grad_norm": 0.012650338163967687,
"learning_rate": 5.000000000000002e-07,
"loss": 0.4109,
"step": 902
},
{
"epoch": 13.52532833020638,
"grad_norm": 0.012308627141887482,
"learning_rate": 4.978319548828504e-07,
"loss": 0.4167,
"step": 903
},
{
"epoch": 13.540337711069418,
"grad_norm": 0.01247431058858993,
"learning_rate": 4.956670615284528e-07,
"loss": 0.4083,
"step": 904
},
{
"epoch": 13.555347091932457,
"grad_norm": 0.012521896733928029,
"learning_rate": 4.935053335243508e-07,
"loss": 0.41,
"step": 905
},
{
"epoch": 13.570356472795497,
"grad_norm": 0.01224071942990429,
"learning_rate": 4.913467844382217e-07,
"loss": 0.411,
"step": 906
},
{
"epoch": 13.585365853658537,
"grad_norm": 0.011911018144128583,
"learning_rate": 4.891914278177907e-07,
"loss": 0.4131,
"step": 907
},
{
"epoch": 13.600375234521575,
"grad_norm": 0.012173795572423684,
"learning_rate": 4.870392771907454e-07,
"loss": 0.4172,
"step": 908
},
{
"epoch": 13.615384615384615,
"grad_norm": 0.012787639508096029,
"learning_rate": 4.848903460646522e-07,
"loss": 0.4082,
"step": 909
},
{
"epoch": 13.630393996247655,
"grad_norm": 0.012810162920205651,
"learning_rate": 4.827446479268712e-07,
"loss": 0.4156,
"step": 910
},
{
"epoch": 13.645403377110695,
"grad_norm": 0.012158067835970099,
"learning_rate": 4.806021962444707e-07,
"loss": 0.4066,
"step": 911
},
{
"epoch": 13.660412757973734,
"grad_norm": 0.012243835560067891,
"learning_rate": 4.784630044641435e-07,
"loss": 0.4141,
"step": 912
},
{
"epoch": 13.675422138836772,
"grad_norm": 0.012182441628748585,
"learning_rate": 4.7632708601212215e-07,
"loss": 0.4132,
"step": 913
},
{
"epoch": 13.690431519699812,
"grad_norm": 0.012064707690036225,
"learning_rate": 4.7419445429409487e-07,
"loss": 0.4004,
"step": 914
},
{
"epoch": 13.705440900562852,
"grad_norm": 0.012058309173201142,
"learning_rate": 4.7206512269512125e-07,
"loss": 0.4065,
"step": 915
},
{
"epoch": 13.720450281425892,
"grad_norm": 0.011914352483260126,
"learning_rate": 4.6993910457954864e-07,
"loss": 0.4074,
"step": 916
},
{
"epoch": 13.73545966228893,
"grad_norm": 0.01218029087555133,
"learning_rate": 4.6781641329092705e-07,
"loss": 0.4167,
"step": 917
},
{
"epoch": 13.75046904315197,
"grad_norm": 0.013057166343761247,
"learning_rate": 4.6569706215192693e-07,
"loss": 0.4068,
"step": 918
},
{
"epoch": 13.76547842401501,
"grad_norm": 0.012604582774669597,
"learning_rate": 4.635810644642552e-07,
"loss": 0.4144,
"step": 919
},
{
"epoch": 13.78048780487805,
"grad_norm": 0.011935794848686154,
"learning_rate": 4.614684335085708e-07,
"loss": 0.4009,
"step": 920
},
{
"epoch": 13.795497185741088,
"grad_norm": 0.012666727346839646,
"learning_rate": 4.5935918254440274e-07,
"loss": 0.4102,
"step": 921
},
{
"epoch": 13.810506566604127,
"grad_norm": 0.01205052533335544,
"learning_rate": 4.572533248100652e-07,
"loss": 0.4107,
"step": 922
},
{
"epoch": 13.825515947467167,
"grad_norm": 0.012515947617199258,
"learning_rate": 4.5515087352257606e-07,
"loss": 0.4058,
"step": 923
},
{
"epoch": 13.840525328330207,
"grad_norm": 0.01178275512252074,
"learning_rate": 4.530518418775733e-07,
"loss": 0.3981,
"step": 924
},
{
"epoch": 13.840525328330207,
"eval_loss": 0.39569905400276184,
"eval_runtime": 13.9015,
"eval_samples_per_second": 32.155,
"eval_steps_per_second": 2.014,
"step": 924
},
{
"epoch": 13.855534709193245,
"grad_norm": 0.012172425812137781,
"learning_rate": 4.50956243049232e-07,
"loss": 0.41,
"step": 925
},
{
"epoch": 13.870544090056285,
"grad_norm": 0.01225111633980482,
"learning_rate": 4.488640901901818e-07,
"loss": 0.4132,
"step": 926
},
{
"epoch": 13.885553470919325,
"grad_norm": 0.012388855766636281,
"learning_rate": 4.467753964314245e-07,
"loss": 0.4108,
"step": 927
},
{
"epoch": 13.900562851782365,
"grad_norm": 0.012447467722629292,
"learning_rate": 4.4469017488225124e-07,
"loss": 0.4181,
"step": 928
},
{
"epoch": 13.915572232645403,
"grad_norm": 0.0118810129924204,
"learning_rate": 4.426084386301607e-07,
"loss": 0.4139,
"step": 929
},
{
"epoch": 13.930581613508442,
"grad_norm": 0.0118987273245147,
"learning_rate": 4.40530200740777e-07,
"loss": 0.4192,
"step": 930
},
{
"epoch": 13.945590994371482,
"grad_norm": 0.012530895292621011,
"learning_rate": 4.3845547425776707e-07,
"loss": 0.4098,
"step": 931
},
{
"epoch": 13.960600375234522,
"grad_norm": 0.011523586359779753,
"learning_rate": 4.3638427220275876e-07,
"loss": 0.4048,
"step": 932
},
{
"epoch": 13.975609756097562,
"grad_norm": 0.012130640942285414,
"learning_rate": 4.3431660757526043e-07,
"loss": 0.4003,
"step": 933
},
{
"epoch": 13.9906191369606,
"grad_norm": 0.01306375560358271,
"learning_rate": 4.3225249335257795e-07,
"loss": 0.419,
"step": 934
},
{
"epoch": 14.0,
"grad_norm": 0.01306375560358271,
"learning_rate": 4.3019194248973377e-07,
"loss": 0.4085,
"step": 935
},
{
"epoch": 14.01500938086304,
"grad_norm": 0.01702987966177615,
"learning_rate": 4.281349679193861e-07,
"loss": 0.4086,
"step": 936
},
{
"epoch": 14.03001876172608,
"grad_norm": 0.01210923339196911,
"learning_rate": 4.2608158255174597e-07,
"loss": 0.4112,
"step": 937
},
{
"epoch": 14.045028142589118,
"grad_norm": 0.011866596152432489,
"learning_rate": 4.2403179927449864e-07,
"loss": 0.4109,
"step": 938
},
{
"epoch": 14.060037523452158,
"grad_norm": 0.012607746156689865,
"learning_rate": 4.219856309527211e-07,
"loss": 0.4221,
"step": 939
},
{
"epoch": 14.075046904315197,
"grad_norm": 0.012486558528938796,
"learning_rate": 4.1994309042880193e-07,
"loss": 0.4103,
"step": 940
},
{
"epoch": 14.090056285178237,
"grad_norm": 0.012303061756963003,
"learning_rate": 4.1790419052236025e-07,
"loss": 0.4104,
"step": 941
},
{
"epoch": 14.105065666041275,
"grad_norm": 0.012176253504823318,
"learning_rate": 4.158689440301657e-07,
"loss": 0.4156,
"step": 942
},
{
"epoch": 14.120075046904315,
"grad_norm": 0.012798739423948427,
"learning_rate": 4.138373637260579e-07,
"loss": 0.4094,
"step": 943
},
{
"epoch": 14.135084427767355,
"grad_norm": 0.012149574296456156,
"learning_rate": 4.1180946236086646e-07,
"loss": 0.4153,
"step": 944
},
{
"epoch": 14.150093808630395,
"grad_norm": 0.012028612599444181,
"learning_rate": 4.0978525266233064e-07,
"loss": 0.4054,
"step": 945
},
{
"epoch": 14.165103189493433,
"grad_norm": 0.012231549955693293,
"learning_rate": 4.0776474733502007e-07,
"loss": 0.416,
"step": 946
},
{
"epoch": 14.180112570356473,
"grad_norm": 0.012421110011924136,
"learning_rate": 4.0574795906025374e-07,
"loss": 0.4016,
"step": 947
},
{
"epoch": 14.195121951219512,
"grad_norm": 0.011730159482603666,
"learning_rate": 4.03734900496022e-07,
"loss": 0.4013,
"step": 948
},
{
"epoch": 14.210131332082552,
"grad_norm": 0.012688603636352387,
"learning_rate": 4.017255842769062e-07,
"loss": 0.415,
"step": 949
},
{
"epoch": 14.22514071294559,
"grad_norm": 0.012404842948789518,
"learning_rate": 3.9972002301399956e-07,
"loss": 0.4169,
"step": 950
},
{
"epoch": 14.24015009380863,
"grad_norm": 0.012149123886554853,
"learning_rate": 3.977182292948282e-07,
"loss": 0.3949,
"step": 951
},
{
"epoch": 14.25515947467167,
"grad_norm": 0.012045312561245865,
"learning_rate": 3.957202156832713e-07,
"loss": 0.4134,
"step": 952
},
{
"epoch": 14.27016885553471,
"grad_norm": 0.01209269757907142,
"learning_rate": 3.9372599471948354e-07,
"loss": 0.414,
"step": 953
},
{
"epoch": 14.285178236397748,
"grad_norm": 0.011918842634659655,
"learning_rate": 3.9173557891981567e-07,
"loss": 0.4014,
"step": 954
},
{
"epoch": 14.300187617260788,
"grad_norm": 0.01220439396332339,
"learning_rate": 3.89748980776736e-07,
"loss": 0.4018,
"step": 955
},
{
"epoch": 14.315196998123827,
"grad_norm": 0.011998622448288693,
"learning_rate": 3.877662127587521e-07,
"loss": 0.4174,
"step": 956
},
{
"epoch": 14.330206378986867,
"grad_norm": 0.012295617446559496,
"learning_rate": 3.8578728731033214e-07,
"loss": 0.4102,
"step": 957
},
{
"epoch": 14.345215759849907,
"grad_norm": 0.011980609920463275,
"learning_rate": 3.838122168518276e-07,
"loss": 0.4006,
"step": 958
},
{
"epoch": 14.360225140712945,
"grad_norm": 0.01234999901307482,
"learning_rate": 3.818410137793947e-07,
"loss": 0.4083,
"step": 959
},
{
"epoch": 14.375234521575985,
"grad_norm": 0.01204318922158648,
"learning_rate": 3.798736904649168e-07,
"loss": 0.416,
"step": 960
},
{
"epoch": 14.390243902439025,
"grad_norm": 0.012342974076721963,
"learning_rate": 3.77910259255926e-07,
"loss": 0.4042,
"step": 961
},
{
"epoch": 14.405253283302065,
"grad_norm": 0.012080518503284057,
"learning_rate": 3.7595073247552735e-07,
"loss": 0.4148,
"step": 962
},
{
"epoch": 14.420262664165103,
"grad_norm": 0.012401425685947038,
"learning_rate": 3.739951224223199e-07,
"loss": 0.4166,
"step": 963
},
{
"epoch": 14.435272045028142,
"grad_norm": 0.012113429871714052,
"learning_rate": 3.720434413703202e-07,
"loss": 0.4031,
"step": 964
},
{
"epoch": 14.450281425891182,
"grad_norm": 0.012196068177764108,
"learning_rate": 3.700957015688858e-07,
"loss": 0.4115,
"step": 965
},
{
"epoch": 14.465290806754222,
"grad_norm": 0.011860016831751172,
"learning_rate": 3.681519152426362e-07,
"loss": 0.4212,
"step": 966
},
{
"epoch": 14.48030018761726,
"grad_norm": 0.012149681720096986,
"learning_rate": 3.6621209459137926e-07,
"loss": 0.4126,
"step": 967
},
{
"epoch": 14.4953095684803,
"grad_norm": 0.011922991193016708,
"learning_rate": 3.6427625179003217e-07,
"loss": 0.404,
"step": 968
},
{
"epoch": 14.51031894934334,
"grad_norm": 0.012235166739834057,
"learning_rate": 3.623443989885462e-07,
"loss": 0.4008,
"step": 969
},
{
"epoch": 14.52532833020638,
"grad_norm": 0.01261366907306786,
"learning_rate": 3.604165483118299e-07,
"loss": 0.4157,
"step": 970
},
{
"epoch": 14.540337711069418,
"grad_norm": 0.011959411543845642,
"learning_rate": 3.5849271185967366e-07,
"loss": 0.4087,
"step": 971
},
{
"epoch": 14.555347091932457,
"grad_norm": 0.011886511893275631,
"learning_rate": 3.565729017066729e-07,
"loss": 0.4073,
"step": 972
},
{
"epoch": 14.570356472795497,
"grad_norm": 0.012266040637982558,
"learning_rate": 3.546571299021529e-07,
"loss": 0.4002,
"step": 973
},
{
"epoch": 14.585365853658537,
"grad_norm": 0.011946836948162617,
"learning_rate": 3.527454084700933e-07,
"loss": 0.4113,
"step": 974
},
{
"epoch": 14.600375234521575,
"grad_norm": 0.012884422251256487,
"learning_rate": 3.508377494090521e-07,
"loss": 0.411,
"step": 975
},
{
"epoch": 14.615384615384615,
"grad_norm": 0.012401104679477528,
"learning_rate": 3.4893416469208993e-07,
"loss": 0.405,
"step": 976
},
{
"epoch": 14.630393996247655,
"grad_norm": 0.012397424222628246,
"learning_rate": 3.4703466626669673e-07,
"loss": 0.4009,
"step": 977
},
{
"epoch": 14.645403377110695,
"grad_norm": 0.01211105873440178,
"learning_rate": 3.45139266054715e-07,
"loss": 0.4211,
"step": 978
},
{
"epoch": 14.660412757973734,
"grad_norm": 0.012745867091250343,
"learning_rate": 3.4324797595226564e-07,
"loss": 0.4133,
"step": 979
},
{
"epoch": 14.675422138836772,
"grad_norm": 0.012089095003607657,
"learning_rate": 3.413608078296735e-07,
"loss": 0.4052,
"step": 980
},
{
"epoch": 14.690431519699812,
"grad_norm": 0.012182824041581471,
"learning_rate": 3.394777735313918e-07,
"loss": 0.4043,
"step": 981
},
{
"epoch": 14.705440900562852,
"grad_norm": 0.012109620293361477,
"learning_rate": 3.3759888487592946e-07,
"loss": 0.4059,
"step": 982
},
{
"epoch": 14.720450281425892,
"grad_norm": 0.012382313496190551,
"learning_rate": 3.357241536557758e-07,
"loss": 0.4086,
"step": 983
},
{
"epoch": 14.73545966228893,
"grad_norm": 0.012198508778120983,
"learning_rate": 3.3385359163732664e-07,
"loss": 0.4136,
"step": 984
},
{
"epoch": 14.75046904315197,
"grad_norm": 0.01280406441595411,
"learning_rate": 3.319872105608107e-07,
"loss": 0.4068,
"step": 985
},
{
"epoch": 14.76547842401501,
"grad_norm": 0.012130187940707679,
"learning_rate": 3.3012502214021577e-07,
"loss": 0.4145,
"step": 986
},
{
"epoch": 14.78048780487805,
"grad_norm": 0.011810045886264997,
"learning_rate": 3.282670380632152e-07,
"loss": 0.4003,
"step": 987
},
{
"epoch": 14.795497185741088,
"grad_norm": 0.01226997027635051,
"learning_rate": 3.2641326999109474e-07,
"loss": 0.4181,
"step": 988
},
{
"epoch": 14.810506566604127,
"grad_norm": 0.012662471100966417,
"learning_rate": 3.2456372955867907e-07,
"loss": 0.4058,
"step": 989
},
{
"epoch": 14.825515947467167,
"grad_norm": 0.012554555241937682,
"learning_rate": 3.227184283742591e-07,
"loss": 0.4011,
"step": 990
},
{
"epoch": 14.825515947467167,
"eval_loss": 0.3950127065181732,
"eval_runtime": 13.8589,
"eval_samples_per_second": 32.254,
"eval_steps_per_second": 2.02,
"step": 990
},
{
"epoch": 14.840525328330207,
"grad_norm": 0.01206293666136027,
"learning_rate": 3.20877378019518e-07,
"loss": 0.4098,
"step": 991
},
{
"epoch": 14.855534709193245,
"grad_norm": 0.012160543935788812,
"learning_rate": 3.190405900494606e-07,
"loss": 0.4022,
"step": 992
},
{
"epoch": 14.870544090056285,
"grad_norm": 0.011862384188573029,
"learning_rate": 3.17208075992339e-07,
"loss": 0.4132,
"step": 993
},
{
"epoch": 14.885553470919325,
"grad_norm": 0.012348297087273847,
"learning_rate": 3.153798473495811e-07,
"loss": 0.4063,
"step": 994
},
{
"epoch": 14.900562851782365,
"grad_norm": 0.011971438983032082,
"learning_rate": 3.135559155957186e-07,
"loss": 0.4189,
"step": 995
},
{
"epoch": 14.915572232645403,
"grad_norm": 0.012225913697186104,
"learning_rate": 3.117362921783134e-07,
"loss": 0.4078,
"step": 996
},
{
"epoch": 14.930581613508442,
"grad_norm": 0.012483752970869568,
"learning_rate": 3.0992098851788817e-07,
"loss": 0.4027,
"step": 997
},
{
"epoch": 14.945590994371482,
"grad_norm": 0.012312435583901655,
"learning_rate": 3.081100160078528e-07,
"loss": 0.3964,
"step": 998
},
{
"epoch": 14.960600375234522,
"grad_norm": 0.011733287392883436,
"learning_rate": 3.0630338601443385e-07,
"loss": 0.4077,
"step": 999
},
{
"epoch": 14.975609756097562,
"grad_norm": 0.012352553640401984,
"learning_rate": 3.045011098766026e-07,
"loss": 0.4097,
"step": 1000
},
{
"epoch": 14.9906191369606,
"grad_norm": 0.012691085307760875,
"learning_rate": 3.027031989060046e-07,
"loss": 0.4014,
"step": 1001
},
{
"epoch": 15.01500938086304,
"grad_norm": 0.01450895061621053,
"learning_rate": 3.009096643868877e-07,
"loss": 0.8212,
"step": 1002
},
{
"epoch": 15.03001876172608,
"grad_norm": 0.012442122708447712,
"learning_rate": 2.991205175760322e-07,
"loss": 0.4064,
"step": 1003
},
{
"epoch": 15.045028142589118,
"grad_norm": 0.01177546286369461,
"learning_rate": 2.9733576970267973e-07,
"loss": 0.395,
"step": 1004
},
{
"epoch": 15.060037523452158,
"grad_norm": 0.012583125025593713,
"learning_rate": 2.955554319684629e-07,
"loss": 0.404,
"step": 1005
},
{
"epoch": 15.075046904315197,
"grad_norm": 0.012181333733193132,
"learning_rate": 2.937795155473343e-07,
"loss": 0.4163,
"step": 1006
},
{
"epoch": 15.090056285178237,
"grad_norm": 0.011929578865847229,
"learning_rate": 2.920080315854975e-07,
"loss": 0.4091,
"step": 1007
},
{
"epoch": 15.105065666041275,
"grad_norm": 0.012219751268944107,
"learning_rate": 2.902409912013367e-07,
"loss": 0.4087,
"step": 1008
},
{
"epoch": 15.120075046904315,
"grad_norm": 0.012185881807267762,
"learning_rate": 2.8847840548534695e-07,
"loss": 0.3959,
"step": 1009
},
{
"epoch": 15.135084427767355,
"grad_norm": 0.012095974327974867,
"learning_rate": 2.8672028550006357e-07,
"loss": 0.4142,
"step": 1010
},
{
"epoch": 15.150093808630395,
"grad_norm": 0.012004403180987345,
"learning_rate": 2.8496664227999414e-07,
"loss": 0.4095,
"step": 1011
},
{
"epoch": 15.165103189493433,
"grad_norm": 0.012068360546402585,
"learning_rate": 2.8321748683154887e-07,
"loss": 0.412,
"step": 1012
},
{
"epoch": 15.180112570356473,
"grad_norm": 0.011604065137702905,
"learning_rate": 2.814728301329711e-07,
"loss": 0.4037,
"step": 1013
},
{
"epoch": 15.195121951219512,
"grad_norm": 0.012246442251214433,
"learning_rate": 2.7973268313426835e-07,
"loss": 0.4176,
"step": 1014
},
{
"epoch": 15.210131332082552,
"grad_norm": 0.012835581733332583,
"learning_rate": 2.7799705675714437e-07,
"loss": 0.4142,
"step": 1015
},
{
"epoch": 15.22514071294559,
"grad_norm": 0.012179697043205245,
"learning_rate": 2.762659618949298e-07,
"loss": 0.4074,
"step": 1016
},
{
"epoch": 15.24015009380863,
"grad_norm": 0.012369178745362223,
"learning_rate": 2.745394094125141e-07,
"loss": 0.3992,
"step": 1017
},
{
"epoch": 15.25515947467167,
"grad_norm": 0.012579361492169629,
"learning_rate": 2.7281741014627714e-07,
"loss": 0.4104,
"step": 1018
},
{
"epoch": 15.27016885553471,
"grad_norm": 0.01162764738709065,
"learning_rate": 2.710999749040223e-07,
"loss": 0.4068,
"step": 1019
},
{
"epoch": 15.285178236397748,
"grad_norm": 0.01265468112430842,
"learning_rate": 2.69387114464906e-07,
"loss": 0.4088,
"step": 1020
},
{
"epoch": 15.300187617260788,
"grad_norm": 0.011940035373381468,
"learning_rate": 2.6767883957937344e-07,
"loss": 0.4063,
"step": 1021
},
{
"epoch": 15.315196998123827,
"grad_norm": 0.012255329102484067,
"learning_rate": 2.6597516096908867e-07,
"loss": 0.4069,
"step": 1022
},
{
"epoch": 15.330206378986867,
"grad_norm": 0.012355083411473719,
"learning_rate": 2.642760893268684e-07,
"loss": 0.405,
"step": 1023
},
{
"epoch": 15.345215759849907,
"grad_norm": 0.012128588311350583,
"learning_rate": 2.6258163531661447e-07,
"loss": 0.4085,
"step": 1024
},
{
"epoch": 15.360225140712945,
"grad_norm": 0.012015480810663814,
"learning_rate": 2.6089180957324654e-07,
"loss": 0.4099,
"step": 1025
},
{
"epoch": 15.375234521575985,
"grad_norm": 0.01236965765692212,
"learning_rate": 2.5920662270263647e-07,
"loss": 0.3968,
"step": 1026
},
{
"epoch": 15.390243902439025,
"grad_norm": 0.012236517064876534,
"learning_rate": 2.575260852815411e-07,
"loss": 0.4087,
"step": 1027
},
{
"epoch": 15.405253283302065,
"grad_norm": 0.011933985911797585,
"learning_rate": 2.5585020785753553e-07,
"loss": 0.4057,
"step": 1028
},
{
"epoch": 15.420262664165103,
"grad_norm": 0.012075593886736229,
"learning_rate": 2.541790009489474e-07,
"loss": 0.4015,
"step": 1029
},
{
"epoch": 15.435272045028142,
"grad_norm": 0.012164890137346288,
"learning_rate": 2.525124750447908e-07,
"loss": 0.4168,
"step": 1030
},
{
"epoch": 15.450281425891182,
"grad_norm": 0.012126106786702912,
"learning_rate": 2.508506406047004e-07,
"loss": 0.4058,
"step": 1031
},
{
"epoch": 15.465290806754222,
"grad_norm": 0.012368399052280451,
"learning_rate": 2.4919350805886576e-07,
"loss": 0.406,
"step": 1032
},
{
"epoch": 15.48030018761726,
"grad_norm": 0.012432763679806415,
"learning_rate": 2.475410878079657e-07,
"loss": 0.4056,
"step": 1033
},
{
"epoch": 15.4953095684803,
"grad_norm": 0.012881685474676679,
"learning_rate": 2.458933902231038e-07,
"loss": 0.4085,
"step": 1034
},
{
"epoch": 15.51031894934334,
"grad_norm": 0.01228631484433566,
"learning_rate": 2.4425042564574185e-07,
"loss": 0.4017,
"step": 1035
},
{
"epoch": 15.52532833020638,
"grad_norm": 0.012042020829137407,
"learning_rate": 2.426122043876362e-07,
"loss": 0.4046,
"step": 1036
},
{
"epoch": 15.540337711069418,
"grad_norm": 0.012587414690850632,
"learning_rate": 2.4097873673077296e-07,
"loss": 0.408,
"step": 1037
},
{
"epoch": 15.555347091932457,
"grad_norm": 0.012190544175375687,
"learning_rate": 2.393500329273029e-07,
"loss": 0.4097,
"step": 1038
},
{
"epoch": 15.570356472795497,
"grad_norm": 0.01206814550432668,
"learning_rate": 2.377261031994776e-07,
"loss": 0.4064,
"step": 1039
},
{
"epoch": 15.585365853658537,
"grad_norm": 0.012307556354982051,
"learning_rate": 2.3610695773958434e-07,
"loss": 0.4168,
"step": 1040
},
{
"epoch": 15.600375234521575,
"grad_norm": 0.01222641572619058,
"learning_rate": 2.3449260670988358e-07,
"loss": 0.4022,
"step": 1041
},
{
"epoch": 15.615384615384615,
"grad_norm": 0.012220527683488478,
"learning_rate": 2.3288306024254411e-07,
"loss": 0.3987,
"step": 1042
},
{
"epoch": 15.630393996247655,
"grad_norm": 0.012424430033867473,
"learning_rate": 2.3127832843958007e-07,
"loss": 0.4166,
"step": 1043
},
{
"epoch": 15.645403377110695,
"grad_norm": 0.012366507275888419,
"learning_rate": 2.2967842137278703e-07,
"loss": 0.4115,
"step": 1044
},
{
"epoch": 15.660412757973734,
"grad_norm": 0.012360880870716443,
"learning_rate": 2.2808334908367909e-07,
"loss": 0.4161,
"step": 1045
},
{
"epoch": 15.675422138836772,
"grad_norm": 0.012736823965131592,
"learning_rate": 2.264931215834257e-07,
"loss": 0.4066,
"step": 1046
},
{
"epoch": 15.690431519699812,
"grad_norm": 0.012155204376594498,
"learning_rate": 2.2490774885278907e-07,
"loss": 0.4049,
"step": 1047
},
{
"epoch": 15.705440900562852,
"grad_norm": 0.012646948905131343,
"learning_rate": 2.2332724084206112e-07,
"loss": 0.4102,
"step": 1048
},
{
"epoch": 15.720450281425892,
"grad_norm": 0.012200685840136094,
"learning_rate": 2.2175160747100198e-07,
"loss": 0.4049,
"step": 1049
},
{
"epoch": 15.73545966228893,
"grad_norm": 0.012245324836010893,
"learning_rate": 2.2018085862877566e-07,
"loss": 0.411,
"step": 1050
},
{
"epoch": 15.75046904315197,
"grad_norm": 0.012513647467032053,
"learning_rate": 2.1861500417389056e-07,
"loss": 0.4048,
"step": 1051
},
{
"epoch": 15.76547842401501,
"grad_norm": 0.011903061421256312,
"learning_rate": 2.170540539341361e-07,
"loss": 0.4188,
"step": 1052
},
{
"epoch": 15.78048780487805,
"grad_norm": 0.012394997852371569,
"learning_rate": 2.1549801770652098e-07,
"loss": 0.3948,
"step": 1053
},
{
"epoch": 15.795497185741088,
"grad_norm": 0.012059547174933096,
"learning_rate": 2.139469052572127e-07,
"loss": 0.4074,
"step": 1054
},
{
"epoch": 15.810506566604127,
"grad_norm": 0.012205668055297809,
"learning_rate": 2.1240072632147456e-07,
"loss": 0.421,
"step": 1055
},
{
"epoch": 15.825515947467167,
"grad_norm": 0.0120693595501382,
"learning_rate": 2.1085949060360653e-07,
"loss": 0.4148,
"step": 1056
},
{
"epoch": 15.825515947467167,
"eval_loss": 0.3944862484931946,
"eval_runtime": 14.0894,
"eval_samples_per_second": 31.726,
"eval_steps_per_second": 1.987,
"step": 1056
},
{
"epoch": 15.840525328330207,
"grad_norm": 0.012103676359132986,
"learning_rate": 2.0932320777688296e-07,
"loss": 0.405,
"step": 1057
},
{
"epoch": 15.855534709193245,
"grad_norm": 0.012592718645086199,
"learning_rate": 2.0779188748349252e-07,
"loss": 0.4153,
"step": 1058
},
{
"epoch": 15.870544090056285,
"grad_norm": 0.012256564326088341,
"learning_rate": 2.0626553933447732e-07,
"loss": 0.4097,
"step": 1059
},
{
"epoch": 15.885553470919325,
"grad_norm": 0.011906039730046884,
"learning_rate": 2.0474417290967295e-07,
"loss": 0.3977,
"step": 1060
},
{
"epoch": 15.900562851782365,
"grad_norm": 0.01245313620942531,
"learning_rate": 2.0322779775764787e-07,
"loss": 0.421,
"step": 1061
},
{
"epoch": 15.915572232645403,
"grad_norm": 0.012327925900428386,
"learning_rate": 2.0171642339564398e-07,
"loss": 0.4174,
"step": 1062
},
{
"epoch": 15.930581613508442,
"grad_norm": 0.012067615556392703,
"learning_rate": 2.0021005930951684e-07,
"loss": 0.4047,
"step": 1063
},
{
"epoch": 15.945590994371482,
"grad_norm": 0.011899607848417232,
"learning_rate": 1.9870871495367514e-07,
"loss": 0.4109,
"step": 1064
},
{
"epoch": 15.960600375234522,
"grad_norm": 0.012062481535146134,
"learning_rate": 1.972123997510231e-07,
"loss": 0.4121,
"step": 1065
},
{
"epoch": 15.975609756097562,
"grad_norm": 0.012510881372292592,
"learning_rate": 1.957211230929e-07,
"loss": 0.409,
"step": 1066
},
{
"epoch": 15.9906191369606,
"grad_norm": 0.01227375801391932,
"learning_rate": 1.9423489433902184e-07,
"loss": 0.4076,
"step": 1067
},
{
"epoch": 16.0,
"grad_norm": 0.01227375801391932,
"learning_rate": 1.9275372281742243e-07,
"loss": 0.4065,
"step": 1068
},
{
"epoch": 16.015009380863038,
"grad_norm": 0.016887925794017364,
"learning_rate": 1.91277617824394e-07,
"loss": 0.4033,
"step": 1069
},
{
"epoch": 16.03001876172608,
"grad_norm": 0.012065709393656405,
"learning_rate": 1.8980658862443088e-07,
"loss": 0.4139,
"step": 1070
},
{
"epoch": 16.045028142589118,
"grad_norm": 0.012123882563609396,
"learning_rate": 1.8834064445016951e-07,
"loss": 0.4141,
"step": 1071
},
{
"epoch": 16.06003752345216,
"grad_norm": 0.011970596541082451,
"learning_rate": 1.8687979450233115e-07,
"loss": 0.3953,
"step": 1072
},
{
"epoch": 16.075046904315197,
"grad_norm": 0.0121109332002954,
"learning_rate": 1.8542404794966427e-07,
"loss": 0.4007,
"step": 1073
},
{
"epoch": 16.090056285178235,
"grad_norm": 0.012317267567386788,
"learning_rate": 1.8397341392888676e-07,
"loss": 0.3968,
"step": 1074
},
{
"epoch": 16.105065666041277,
"grad_norm": 0.011954379333899165,
"learning_rate": 1.825279015446286e-07,
"loss": 0.4098,
"step": 1075
},
{
"epoch": 16.120075046904315,
"grad_norm": 0.01207522116296904,
"learning_rate": 1.8108751986937486e-07,
"loss": 0.4101,
"step": 1076
},
{
"epoch": 16.135084427767353,
"grad_norm": 0.012742776596916777,
"learning_rate": 1.7965227794340875e-07,
"loss": 0.4127,
"step": 1077
},
{
"epoch": 16.150093808630395,
"grad_norm": 0.01227951952136753,
"learning_rate": 1.7822218477475494e-07,
"loss": 0.4146,
"step": 1078
},
{
"epoch": 16.165103189493433,
"grad_norm": 0.012201981579291116,
"learning_rate": 1.767972493391222e-07,
"loss": 0.4127,
"step": 1079
},
{
"epoch": 16.180112570356474,
"grad_norm": 0.012271422051618314,
"learning_rate": 1.7537748057984857e-07,
"loss": 0.4039,
"step": 1080
},
{
"epoch": 16.195121951219512,
"grad_norm": 0.012025618790831537,
"learning_rate": 1.7396288740784416e-07,
"loss": 0.402,
"step": 1081
},
{
"epoch": 16.21013133208255,
"grad_norm": 0.012207501752001196,
"learning_rate": 1.7255347870153536e-07,
"loss": 0.4167,
"step": 1082
},
{
"epoch": 16.225140712945592,
"grad_norm": 0.01276307209079064,
"learning_rate": 1.7114926330680957e-07,
"loss": 0.4067,
"step": 1083
},
{
"epoch": 16.24015009380863,
"grad_norm": 0.011866760953989293,
"learning_rate": 1.6975025003695864e-07,
"loss": 0.4025,
"step": 1084
},
{
"epoch": 16.255159474671668,
"grad_norm": 0.012251478648853141,
"learning_rate": 1.6835644767262514e-07,
"loss": 0.3965,
"step": 1085
},
{
"epoch": 16.27016885553471,
"grad_norm": 0.012206207332206386,
"learning_rate": 1.6696786496174575e-07,
"loss": 0.409,
"step": 1086
},
{
"epoch": 16.285178236397748,
"grad_norm": 0.012182631218416822,
"learning_rate": 1.655845106194973e-07,
"loss": 0.4113,
"step": 1087
},
{
"epoch": 16.30018761726079,
"grad_norm": 0.01226161680696391,
"learning_rate": 1.642063933282417e-07,
"loss": 0.4032,
"step": 1088
},
{
"epoch": 16.315196998123827,
"grad_norm": 0.011802893119126617,
"learning_rate": 1.6283352173747146e-07,
"loss": 0.4121,
"step": 1089
},
{
"epoch": 16.330206378986865,
"grad_norm": 0.01205076600250808,
"learning_rate": 1.614659044637553e-07,
"loss": 0.4146,
"step": 1090
},
{
"epoch": 16.345215759849907,
"grad_norm": 0.012403739116953917,
"learning_rate": 1.6010355009068454e-07,
"loss": 0.4127,
"step": 1091
},
{
"epoch": 16.360225140712945,
"grad_norm": 0.011974952494326613,
"learning_rate": 1.5874646716881868e-07,
"loss": 0.4056,
"step": 1092
},
{
"epoch": 16.375234521575987,
"grad_norm": 0.012012554552280001,
"learning_rate": 1.5739466421563218e-07,
"loss": 0.3993,
"step": 1093
},
{
"epoch": 16.390243902439025,
"grad_norm": 0.012068435859544514,
"learning_rate": 1.560481497154602e-07,
"loss": 0.4067,
"step": 1094
},
{
"epoch": 16.405253283302063,
"grad_norm": 0.01241486073176841,
"learning_rate": 1.5470693211944642e-07,
"loss": 0.4168,
"step": 1095
},
{
"epoch": 16.420262664165104,
"grad_norm": 0.012421971155925233,
"learning_rate": 1.5337101984548951e-07,
"loss": 0.4036,
"step": 1096
},
{
"epoch": 16.435272045028142,
"grad_norm": 0.012148607504961384,
"learning_rate": 1.5204042127819018e-07,
"loss": 0.3997,
"step": 1097
},
{
"epoch": 16.45028142589118,
"grad_norm": 0.011893274496745576,
"learning_rate": 1.5071514476879876e-07,
"loss": 0.4075,
"step": 1098
},
{
"epoch": 16.465290806754222,
"grad_norm": 0.012445240319998217,
"learning_rate": 1.4939519863516213e-07,
"loss": 0.4038,
"step": 1099
},
{
"epoch": 16.48030018761726,
"grad_norm": 0.012537068527918893,
"learning_rate": 1.4808059116167303e-07,
"loss": 0.4111,
"step": 1100
},
{
"epoch": 16.4953095684803,
"grad_norm": 0.012304138189345946,
"learning_rate": 1.4677133059921632e-07,
"loss": 0.4151,
"step": 1101
},
{
"epoch": 16.51031894934334,
"grad_norm": 0.012061421440472614,
"learning_rate": 1.4546742516511845e-07,
"loss": 0.3969,
"step": 1102
},
{
"epoch": 16.525328330206378,
"grad_norm": 0.011836675742582285,
"learning_rate": 1.4416888304309515e-07,
"loss": 0.4047,
"step": 1103
},
{
"epoch": 16.54033771106942,
"grad_norm": 0.011817532065112164,
"learning_rate": 1.4287571238320051e-07,
"loss": 0.4107,
"step": 1104
},
{
"epoch": 16.555347091932457,
"grad_norm": 0.011743671409316899,
"learning_rate": 1.4158792130177543e-07,
"loss": 0.4004,
"step": 1105
},
{
"epoch": 16.570356472795496,
"grad_norm": 0.012116572838494573,
"learning_rate": 1.4030551788139721e-07,
"loss": 0.4141,
"step": 1106
},
{
"epoch": 16.585365853658537,
"grad_norm": 0.012425598089451213,
"learning_rate": 1.3902851017082862e-07,
"loss": 0.4118,
"step": 1107
},
{
"epoch": 16.600375234521575,
"grad_norm": 0.011753586649073179,
"learning_rate": 1.377569061849665e-07,
"loss": 0.4082,
"step": 1108
},
{
"epoch": 16.615384615384617,
"grad_norm": 0.012375977275680615,
"learning_rate": 1.3649071390479283e-07,
"loss": 0.4146,
"step": 1109
},
{
"epoch": 16.630393996247655,
"grad_norm": 0.012295332163973069,
"learning_rate": 1.3522994127732412e-07,
"loss": 0.4151,
"step": 1110
},
{
"epoch": 16.645403377110693,
"grad_norm": 0.011963182926471421,
"learning_rate": 1.3397459621556128e-07,
"loss": 0.4151,
"step": 1111
},
{
"epoch": 16.660412757973734,
"grad_norm": 0.011814638769608714,
"learning_rate": 1.327246865984404e-07,
"loss": 0.406,
"step": 1112
},
{
"epoch": 16.675422138836772,
"grad_norm": 0.012205572263242072,
"learning_rate": 1.314802202707822e-07,
"loss": 0.41,
"step": 1113
},
{
"epoch": 16.690431519699814,
"grad_norm": 0.011956127437528156,
"learning_rate": 1.3024120504324454e-07,
"loss": 0.4042,
"step": 1114
},
{
"epoch": 16.705440900562852,
"grad_norm": 0.012425936619786686,
"learning_rate": 1.290076486922722e-07,
"loss": 0.4098,
"step": 1115
},
{
"epoch": 16.72045028142589,
"grad_norm": 0.012195755792282742,
"learning_rate": 1.2777955896004811e-07,
"loss": 0.4123,
"step": 1116
},
{
"epoch": 16.735459662288932,
"grad_norm": 0.012001955272470884,
"learning_rate": 1.2655694355444547e-07,
"loss": 0.4058,
"step": 1117
},
{
"epoch": 16.75046904315197,
"grad_norm": 0.012612779412664959,
"learning_rate": 1.25339810148978e-07,
"loss": 0.4018,
"step": 1118
},
{
"epoch": 16.765478424015008,
"grad_norm": 0.011731053965490292,
"learning_rate": 1.2412816638275402e-07,
"loss": 0.4099,
"step": 1119
},
{
"epoch": 16.78048780487805,
"grad_norm": 0.011942110649578081,
"learning_rate": 1.2292201986042616e-07,
"loss": 0.4159,
"step": 1120
},
{
"epoch": 16.795497185741088,
"grad_norm": 0.01228838930130753,
"learning_rate": 1.2172137815214488e-07,
"loss": 0.4177,
"step": 1121
},
{
"epoch": 16.81050656660413,
"grad_norm": 0.012201958299201607,
"learning_rate": 1.2052624879351103e-07,
"loss": 0.4064,
"step": 1122
},
{
"epoch": 16.81050656660413,
"eval_loss": 0.3941747844219208,
"eval_runtime": 13.7699,
"eval_samples_per_second": 32.462,
"eval_steps_per_second": 2.033,
"step": 1122
},
{
"epoch": 16.825515947467167,
"grad_norm": 0.011987590697607683,
"learning_rate": 1.1933663928552752e-07,
"loss": 0.3976,
"step": 1123
},
{
"epoch": 16.840525328330205,
"grad_norm": 0.012138943362389928,
"learning_rate": 1.1815255709455374e-07,
"loss": 0.4153,
"step": 1124
},
{
"epoch": 16.855534709193247,
"grad_norm": 0.012373458329033428,
"learning_rate": 1.1697400965225745e-07,
"loss": 0.4146,
"step": 1125
},
{
"epoch": 16.870544090056285,
"grad_norm": 0.012389090392038503,
"learning_rate": 1.1580100435556883e-07,
"loss": 0.3946,
"step": 1126
},
{
"epoch": 16.885553470919323,
"grad_norm": 0.011834382886208047,
"learning_rate": 1.1463354856663399e-07,
"loss": 0.4016,
"step": 1127
},
{
"epoch": 16.900562851782365,
"grad_norm": 0.012114418651751394,
"learning_rate": 1.1347164961276789e-07,
"loss": 0.396,
"step": 1128
},
{
"epoch": 16.915572232645403,
"grad_norm": 0.012269351272196211,
"learning_rate": 1.1231531478640987e-07,
"loss": 0.4098,
"step": 1129
},
{
"epoch": 16.930581613508444,
"grad_norm": 0.011614991226426072,
"learning_rate": 1.1116455134507663e-07,
"loss": 0.4122,
"step": 1130
},
{
"epoch": 16.945590994371482,
"grad_norm": 0.012827165860445595,
"learning_rate": 1.1001936651131716e-07,
"loss": 0.4074,
"step": 1131
},
{
"epoch": 16.96060037523452,
"grad_norm": 0.01202261694187779,
"learning_rate": 1.0887976747266791e-07,
"loss": 0.4017,
"step": 1132
},
{
"epoch": 16.975609756097562,
"grad_norm": 0.012092296382532815,
"learning_rate": 1.0774576138160596e-07,
"loss": 0.4114,
"step": 1133
},
{
"epoch": 16.9906191369606,
"grad_norm": 0.012388455511771996,
"learning_rate": 1.0661735535550665e-07,
"loss": 0.4104,
"step": 1134
},
{
"epoch": 17.015009380863038,
"grad_norm": 0.01607381588237361,
"learning_rate": 1.0549455647659677e-07,
"loss": 0.7922,
"step": 1135
},
{
"epoch": 17.03001876172608,
"grad_norm": 0.01182074740531476,
"learning_rate": 1.0437737179191108e-07,
"loss": 0.4057,
"step": 1136
},
{
"epoch": 17.045028142589118,
"grad_norm": 0.012095915652515062,
"learning_rate": 1.0326580831324816e-07,
"loss": 0.4012,
"step": 1137
},
{
"epoch": 17.06003752345216,
"grad_norm": 0.011903831536743867,
"learning_rate": 1.021598730171257e-07,
"loss": 0.4091,
"step": 1138
},
{
"epoch": 17.075046904315197,
"grad_norm": 0.012259225232080817,
"learning_rate": 1.0105957284473732e-07,
"loss": 0.4093,
"step": 1139
},
{
"epoch": 17.090056285178235,
"grad_norm": 0.012083353911997635,
"learning_rate": 9.996491470190915e-08,
"loss": 0.4137,
"step": 1140
},
{
"epoch": 17.105065666041277,
"grad_norm": 0.012224092783758394,
"learning_rate": 9.887590545905589e-08,
"loss": 0.4254,
"step": 1141
},
{
"epoch": 17.120075046904315,
"grad_norm": 0.011652986549382856,
"learning_rate": 9.779255195113823e-08,
"loss": 0.4037,
"step": 1142
},
{
"epoch": 17.135084427767353,
"grad_norm": 0.012465106664410062,
"learning_rate": 9.671486097761917e-08,
"loss": 0.401,
"step": 1143
},
{
"epoch": 17.150093808630395,
"grad_norm": 0.012292307923531054,
"learning_rate": 9.564283930242257e-08,
"loss": 0.4078,
"step": 1144
},
{
"epoch": 17.165103189493433,
"grad_norm": 0.012083496838805925,
"learning_rate": 9.457649365388965e-08,
"loss": 0.4061,
"step": 1145
},
{
"epoch": 17.180112570356474,
"grad_norm": 0.012058613681905764,
"learning_rate": 9.351583072473712e-08,
"loss": 0.4011,
"step": 1146
},
{
"epoch": 17.195121951219512,
"grad_norm": 0.011786553163980374,
"learning_rate": 9.246085717201546e-08,
"loss": 0.4148,
"step": 1147
},
{
"epoch": 17.21013133208255,
"grad_norm": 0.012314917316211435,
"learning_rate": 9.141157961706602e-08,
"loss": 0.4102,
"step": 1148
},
{
"epoch": 17.225140712945592,
"grad_norm": 0.012107777455723816,
"learning_rate": 9.036800464548156e-08,
"loss": 0.3958,
"step": 1149
},
{
"epoch": 17.24015009380863,
"grad_norm": 0.011964321058374836,
"learning_rate": 8.933013880706275e-08,
"loss": 0.4023,
"step": 1150
},
{
"epoch": 17.255159474671668,
"grad_norm": 0.012067958170577502,
"learning_rate": 8.829798861577831e-08,
"loss": 0.4134,
"step": 1151
},
{
"epoch": 17.27016885553471,
"grad_norm": 0.01209478822335003,
"learning_rate": 8.727156054972373e-08,
"loss": 0.416,
"step": 1152
},
{
"epoch": 17.285178236397748,
"grad_norm": 0.012244663184378318,
"learning_rate": 8.625086105108037e-08,
"loss": 0.4005,
"step": 1153
},
{
"epoch": 17.30018761726079,
"grad_norm": 0.011573669129444004,
"learning_rate": 8.523589652607566e-08,
"loss": 0.4041,
"step": 1154
},
{
"epoch": 17.315196998123827,
"grad_norm": 0.01184555083569059,
"learning_rate": 8.422667334494249e-08,
"loss": 0.3999,
"step": 1155
},
{
"epoch": 17.330206378986865,
"grad_norm": 0.01271028447647729,
"learning_rate": 8.322319784187959e-08,
"loss": 0.4113,
"step": 1156
},
{
"epoch": 17.345215759849907,
"grad_norm": 0.012424858123698831,
"learning_rate": 8.222547631501054e-08,
"loss": 0.4073,
"step": 1157
},
{
"epoch": 17.360225140712945,
"grad_norm": 0.012569921108406372,
"learning_rate": 8.123351502634623e-08,
"loss": 0.4176,
"step": 1158
},
{
"epoch": 17.375234521575987,
"grad_norm": 0.012212460256096685,
"learning_rate": 8.024732020174385e-08,
"loss": 0.4163,
"step": 1159
},
{
"epoch": 17.390243902439025,
"grad_norm": 0.012876844421681181,
"learning_rate": 7.926689803086872e-08,
"loss": 0.4137,
"step": 1160
},
{
"epoch": 17.405253283302063,
"grad_norm": 0.012127902236964899,
"learning_rate": 7.82922546671555e-08,
"loss": 0.4108,
"step": 1161
},
{
"epoch": 17.420262664165104,
"grad_norm": 0.012475847191274562,
"learning_rate": 7.732339622776829e-08,
"loss": 0.4119,
"step": 1162
},
{
"epoch": 17.435272045028142,
"grad_norm": 0.012323895374580014,
"learning_rate": 7.636032879356425e-08,
"loss": 0.4064,
"step": 1163
},
{
"epoch": 17.45028142589118,
"grad_norm": 0.01162521749026464,
"learning_rate": 7.540305840905369e-08,
"loss": 0.4099,
"step": 1164
},
{
"epoch": 17.465290806754222,
"grad_norm": 0.012085289434805383,
"learning_rate": 7.445159108236343e-08,
"loss": 0.4014,
"step": 1165
},
{
"epoch": 17.48030018761726,
"grad_norm": 0.011680985710547098,
"learning_rate": 7.350593278519823e-08,
"loss": 0.4118,
"step": 1166
},
{
"epoch": 17.4953095684803,
"grad_norm": 0.01202917040251438,
"learning_rate": 7.256608945280318e-08,
"loss": 0.4085,
"step": 1167
},
{
"epoch": 17.51031894934334,
"grad_norm": 0.011995016219742638,
"learning_rate": 7.163206698392742e-08,
"loss": 0.4121,
"step": 1168
},
{
"epoch": 17.525328330206378,
"grad_norm": 0.012587588608999197,
"learning_rate": 7.070387124078614e-08,
"loss": 0.4134,
"step": 1169
},
{
"epoch": 17.54033771106942,
"grad_norm": 0.011907862956892574,
"learning_rate": 6.978150804902449e-08,
"loss": 0.4149,
"step": 1170
},
{
"epoch": 17.555347091932457,
"grad_norm": 0.012367732761458044,
"learning_rate": 6.886498319768075e-08,
"loss": 0.4123,
"step": 1171
},
{
"epoch": 17.570356472795496,
"grad_norm": 0.012413142793066507,
"learning_rate": 6.795430243914935e-08,
"loss": 0.4099,
"step": 1172
},
{
"epoch": 17.585365853658537,
"grad_norm": 0.01198028253713889,
"learning_rate": 6.704947148914608e-08,
"loss": 0.4053,
"step": 1173
},
{
"epoch": 17.600375234521575,
"grad_norm": 0.01218802781184607,
"learning_rate": 6.615049602667122e-08,
"loss": 0.4116,
"step": 1174
},
{
"epoch": 17.615384615384617,
"grad_norm": 0.012525942652443793,
"learning_rate": 6.52573816939742e-08,
"loss": 0.4123,
"step": 1175
},
{
"epoch": 17.630393996247655,
"grad_norm": 0.012661673801348647,
"learning_rate": 6.437013409651847e-08,
"loss": 0.4108,
"step": 1176
},
{
"epoch": 17.645403377110693,
"grad_norm": 0.012526249345482146,
"learning_rate": 6.348875880294535e-08,
"loss": 0.4129,
"step": 1177
},
{
"epoch": 17.660412757973734,
"grad_norm": 0.012160064393132261,
"learning_rate": 6.26132613450403e-08,
"loss": 0.405,
"step": 1178
},
{
"epoch": 17.675422138836772,
"grad_norm": 0.011856736199021006,
"learning_rate": 6.174364721769742e-08,
"loss": 0.3979,
"step": 1179
},
{
"epoch": 17.690431519699814,
"grad_norm": 0.012209332212864363,
"learning_rate": 6.087992187888557e-08,
"loss": 0.4094,
"step": 1180
},
{
"epoch": 17.705440900562852,
"grad_norm": 0.012164727576505307,
"learning_rate": 6.00220907496135e-08,
"loss": 0.4064,
"step": 1181
},
{
"epoch": 17.72045028142589,
"grad_norm": 0.012094675237917947,
"learning_rate": 5.917015921389568e-08,
"loss": 0.4111,
"step": 1182
},
{
"epoch": 17.735459662288932,
"grad_norm": 0.012301334949674643,
"learning_rate": 5.832413261871938e-08,
"loss": 0.4019,
"step": 1183
},
{
"epoch": 17.75046904315197,
"grad_norm": 0.012225443758653235,
"learning_rate": 5.748401627401067e-08,
"loss": 0.3957,
"step": 1184
},
{
"epoch": 17.765478424015008,
"grad_norm": 0.012458043664162905,
"learning_rate": 5.6649815452600725e-08,
"loss": 0.3975,
"step": 1185
},
{
"epoch": 17.78048780487805,
"grad_norm": 0.011703394783094899,
"learning_rate": 5.5821535390193406e-08,
"loss": 0.4084,
"step": 1186
},
{
"epoch": 17.795497185741088,
"grad_norm": 0.01193572234009368,
"learning_rate": 5.499918128533154e-08,
"loss": 0.4029,
"step": 1187
},
{
"epoch": 17.81050656660413,
"grad_norm": 0.012490715952476177,
"learning_rate": 5.4182758299365364e-08,
"loss": 0.4066,
"step": 1188
},
{
"epoch": 17.81050656660413,
"eval_loss": 0.3940185606479645,
"eval_runtime": 14.0978,
"eval_samples_per_second": 31.707,
"eval_steps_per_second": 1.986,
"step": 1188
},
{
"epoch": 17.825515947467167,
"grad_norm": 0.012227421280696971,
"learning_rate": 5.337227155641921e-08,
"loss": 0.4083,
"step": 1189
},
{
"epoch": 17.840525328330205,
"grad_norm": 0.012367946788776257,
"learning_rate": 5.256772614335991e-08,
"loss": 0.4082,
"step": 1190
},
{
"epoch": 17.855534709193247,
"grad_norm": 0.012324728938463323,
"learning_rate": 5.1769127109764666e-08,
"loss": 0.4009,
"step": 1191
},
{
"epoch": 17.870544090056285,
"grad_norm": 0.011883776326167475,
"learning_rate": 5.0976479467888966e-08,
"loss": 0.3992,
"step": 1192
},
{
"epoch": 17.885553470919323,
"grad_norm": 0.01191459640325878,
"learning_rate": 5.018978819263597e-08,
"loss": 0.4086,
"step": 1193
},
{
"epoch": 17.900562851782365,
"grad_norm": 0.012409893635999196,
"learning_rate": 4.940905822152452e-08,
"loss": 0.3926,
"step": 1194
},
{
"epoch": 17.915572232645403,
"grad_norm": 0.012283886910921115,
"learning_rate": 4.863429445465883e-08,
"loss": 0.4075,
"step": 1195
},
{
"epoch": 17.930581613508444,
"grad_norm": 0.012064735027299723,
"learning_rate": 4.786550175469728e-08,
"loss": 0.3929,
"step": 1196
},
{
"epoch": 17.945590994371482,
"grad_norm": 0.012077507797889245,
"learning_rate": 4.7102684946821456e-08,
"loss": 0.4086,
"step": 1197
},
{
"epoch": 17.96060037523452,
"grad_norm": 0.011773580263745117,
"learning_rate": 4.6345848818706956e-08,
"loss": 0.4195,
"step": 1198
},
{
"epoch": 17.975609756097562,
"grad_norm": 0.012158454333714083,
"learning_rate": 4.55949981204925e-08,
"loss": 0.4147,
"step": 1199
},
{
"epoch": 17.9906191369606,
"grad_norm": 0.011705832981943262,
"learning_rate": 4.4850137564750756e-08,
"loss": 0.4133,
"step": 1200
},
{
"epoch": 18.015009380863038,
"grad_norm": 0.01892041042339661,
"learning_rate": 4.4111271826457684e-08,
"loss": 0.8008,
"step": 1201
},
{
"epoch": 18.03001876172608,
"grad_norm": 0.011875241891989333,
"learning_rate": 4.337840554296468e-08,
"loss": 0.3956,
"step": 1202
},
{
"epoch": 18.045028142589118,
"grad_norm": 0.011518111425783685,
"learning_rate": 4.265154331396814e-08,
"loss": 0.4018,
"step": 1203
},
{
"epoch": 18.06003752345216,
"grad_norm": 0.011921877135567948,
"learning_rate": 4.193068970148139e-08,
"loss": 0.4135,
"step": 1204
},
{
"epoch": 18.075046904315197,
"grad_norm": 0.01196404394665153,
"learning_rate": 4.121584922980603e-08,
"loss": 0.4152,
"step": 1205
},
{
"epoch": 18.090056285178235,
"grad_norm": 0.012254596451346144,
"learning_rate": 4.050702638550274e-08,
"loss": 0.4057,
"step": 1206
},
{
"epoch": 18.105065666041277,
"grad_norm": 0.012184092362587513,
"learning_rate": 3.9804225617364185e-08,
"loss": 0.4042,
"step": 1207
},
{
"epoch": 18.120075046904315,
"grad_norm": 0.012194493098812067,
"learning_rate": 3.910745133638638e-08,
"loss": 0.418,
"step": 1208
},
{
"epoch": 18.135084427767353,
"grad_norm": 0.012555563095673745,
"learning_rate": 3.841670791574136e-08,
"loss": 0.4102,
"step": 1209
},
{
"epoch": 18.150093808630395,
"grad_norm": 0.012169854152024248,
"learning_rate": 3.7731999690749585e-08,
"loss": 0.3893,
"step": 1210
},
{
"epoch": 18.165103189493433,
"grad_norm": 0.01244474180194911,
"learning_rate": 3.705333095885277e-08,
"loss": 0.4044,
"step": 1211
},
{
"epoch": 18.180112570356474,
"grad_norm": 0.012701077259060964,
"learning_rate": 3.6380705979586644e-08,
"loss": 0.4094,
"step": 1212
},
{
"epoch": 18.195121951219512,
"grad_norm": 0.011824029437486382,
"learning_rate": 3.571412897455495e-08,
"loss": 0.4129,
"step": 1213
},
{
"epoch": 18.21013133208255,
"grad_norm": 0.012828647907710029,
"learning_rate": 3.505360412740188e-08,
"loss": 0.4,
"step": 1214
},
{
"epoch": 18.225140712945592,
"grad_norm": 0.01201395104905769,
"learning_rate": 3.439913558378704e-08,
"loss": 0.4207,
"step": 1215
},
{
"epoch": 18.24015009380863,
"grad_norm": 0.012461422730290947,
"learning_rate": 3.3750727451358094e-08,
"loss": 0.3988,
"step": 1216
},
{
"epoch": 18.255159474671668,
"grad_norm": 0.012047772292080035,
"learning_rate": 3.310838379972614e-08,
"loss": 0.4122,
"step": 1217
},
{
"epoch": 18.27016885553471,
"grad_norm": 0.012451921378931235,
"learning_rate": 3.24721086604397e-08,
"loss": 0.4179,
"step": 1218
},
{
"epoch": 18.285178236397748,
"grad_norm": 0.012038541117606516,
"learning_rate": 3.1841906026959356e-08,
"loss": 0.4033,
"step": 1219
},
{
"epoch": 18.30018761726079,
"grad_norm": 0.011860346393252194,
"learning_rate": 3.1217779854632806e-08,
"loss": 0.3957,
"step": 1220
},
{
"epoch": 18.315196998123827,
"grad_norm": 0.012122525135396791,
"learning_rate": 3.0599734060669626e-08,
"loss": 0.408,
"step": 1221
},
{
"epoch": 18.330206378986865,
"grad_norm": 0.01207502168393969,
"learning_rate": 2.998777252411766e-08,
"loss": 0.4165,
"step": 1222
},
{
"epoch": 18.345215759849907,
"grad_norm": 0.012521513750259158,
"learning_rate": 2.9381899085837438e-08,
"loss": 0.4122,
"step": 1223
},
{
"epoch": 18.360225140712945,
"grad_norm": 0.012634009105925964,
"learning_rate": 2.8782117548479258e-08,
"loss": 0.4151,
"step": 1224
},
{
"epoch": 18.375234521575987,
"grad_norm": 0.011959711219096641,
"learning_rate": 2.8188431676458345e-08,
"loss": 0.4078,
"step": 1225
},
{
"epoch": 18.390243902439025,
"grad_norm": 0.011988140892609642,
"learning_rate": 2.7600845195931867e-08,
"loss": 0.4058,
"step": 1226
},
{
"epoch": 18.405253283302063,
"grad_norm": 0.012178000803191502,
"learning_rate": 2.701936179477515e-08,
"loss": 0.4144,
"step": 1227
},
{
"epoch": 18.420262664165104,
"grad_norm": 0.0120582199067019,
"learning_rate": 2.6443985122558855e-08,
"loss": 0.4048,
"step": 1228
},
{
"epoch": 18.435272045028142,
"grad_norm": 0.012256955326780608,
"learning_rate": 2.587471879052572e-08,
"loss": 0.4053,
"step": 1229
},
{
"epoch": 18.45028142589118,
"grad_norm": 0.012220393687074404,
"learning_rate": 2.5311566371568505e-08,
"loss": 0.4166,
"step": 1230
},
{
"epoch": 18.465290806754222,
"grad_norm": 0.012318315655887042,
"learning_rate": 2.4754531400206446e-08,
"loss": 0.4086,
"step": 1231
},
{
"epoch": 18.48030018761726,
"grad_norm": 0.01208759598395148,
"learning_rate": 2.4203617372564378e-08,
"loss": 0.403,
"step": 1232
},
{
"epoch": 18.4953095684803,
"grad_norm": 0.012045700658904396,
"learning_rate": 2.3658827746349974e-08,
"loss": 0.4016,
"step": 1233
},
{
"epoch": 18.51031894934334,
"grad_norm": 0.011950579146901357,
"learning_rate": 2.3120165940832325e-08,
"loss": 0.4111,
"step": 1234
},
{
"epoch": 18.525328330206378,
"grad_norm": 0.012254984423373173,
"learning_rate": 2.2587635336820398e-08,
"loss": 0.4163,
"step": 1235
},
{
"epoch": 18.54033771106942,
"grad_norm": 0.012241032174288546,
"learning_rate": 2.2061239276641607e-08,
"loss": 0.4067,
"step": 1236
},
{
"epoch": 18.555347091932457,
"grad_norm": 0.01207555807762867,
"learning_rate": 2.1540981064121388e-08,
"loss": 0.4155,
"step": 1237
},
{
"epoch": 18.570356472795496,
"grad_norm": 0.012558998024423071,
"learning_rate": 2.102686396456199e-08,
"loss": 0.4131,
"step": 1238
},
{
"epoch": 18.585365853658537,
"grad_norm": 0.012060644512723748,
"learning_rate": 2.0518891204722167e-08,
"loss": 0.4069,
"step": 1239
},
{
"epoch": 18.600375234521575,
"grad_norm": 0.012285463532045451,
"learning_rate": 2.0017065972796843e-08,
"loss": 0.3989,
"step": 1240
},
{
"epoch": 18.615384615384617,
"grad_norm": 0.012025225107120854,
"learning_rate": 1.9521391418397148e-08,
"loss": 0.4034,
"step": 1241
},
{
"epoch": 18.630393996247655,
"grad_norm": 0.012338195232809763,
"learning_rate": 1.9031870652530756e-08,
"loss": 0.4113,
"step": 1242
},
{
"epoch": 18.645403377110693,
"grad_norm": 0.01213275063867394,
"learning_rate": 1.8548506747582128e-08,
"loss": 0.4115,
"step": 1243
},
{
"epoch": 18.660412757973734,
"grad_norm": 0.012004565010333276,
"learning_rate": 1.807130273729329e-08,
"loss": 0.3981,
"step": 1244
},
{
"epoch": 18.675422138836772,
"grad_norm": 0.012366830778200107,
"learning_rate": 1.7600261616745103e-08,
"loss": 0.4113,
"step": 1245
},
{
"epoch": 18.690431519699814,
"grad_norm": 0.012029863257280412,
"learning_rate": 1.713538634233791e-08,
"loss": 0.4086,
"step": 1246
},
{
"epoch": 18.705440900562852,
"grad_norm": 0.011996240644873084,
"learning_rate": 1.6676679831773567e-08,
"loss": 0.4019,
"step": 1247
},
{
"epoch": 18.72045028142589,
"grad_norm": 0.012493637507283401,
"learning_rate": 1.622414496403668e-08,
"loss": 0.4007,
"step": 1248
},
{
"epoch": 18.735459662288932,
"grad_norm": 0.011864242959638414,
"learning_rate": 1.5777784579376728e-08,
"loss": 0.4002,
"step": 1249
},
{
"epoch": 18.75046904315197,
"grad_norm": 0.012004702120412627,
"learning_rate": 1.5337601479290195e-08,
"loss": 0.3977,
"step": 1250
},
{
"epoch": 18.765478424015008,
"grad_norm": 0.012055148241702568,
"learning_rate": 1.4903598426503237e-08,
"loss": 0.4056,
"step": 1251
},
{
"epoch": 18.78048780487805,
"grad_norm": 0.011971783702517814,
"learning_rate": 1.447577814495371e-08,
"loss": 0.404,
"step": 1252
},
{
"epoch": 18.795497185741088,
"grad_norm": 0.011875158436815665,
"learning_rate": 1.4054143319774724e-08,
"loss": 0.4023,
"step": 1253
},
{
"epoch": 18.81050656660413,
"grad_norm": 0.012294967191521976,
"learning_rate": 1.3638696597277677e-08,
"loss": 0.4137,
"step": 1254
},
{
"epoch": 18.81050656660413,
"eval_loss": 0.39393627643585205,
"eval_runtime": 13.7085,
"eval_samples_per_second": 32.607,
"eval_steps_per_second": 2.043,
"step": 1254
},
{
"epoch": 18.825515947467167,
"grad_norm": 0.012150083576670793,
"learning_rate": 1.3229440584935137e-08,
"loss": 0.4042,
"step": 1255
},
{
"epoch": 18.840525328330205,
"grad_norm": 0.012191506353413198,
"learning_rate": 1.28263778513652e-08,
"loss": 0.4135,
"step": 1256
},
{
"epoch": 18.855534709193247,
"grad_norm": 0.012120685160381235,
"learning_rate": 1.2429510926314835e-08,
"loss": 0.4119,
"step": 1257
},
{
"epoch": 18.870544090056285,
"grad_norm": 0.011999394179816762,
"learning_rate": 1.2038842300644225e-08,
"loss": 0.4091,
"step": 1258
},
{
"epoch": 18.885553470919323,
"grad_norm": 0.012229137988916727,
"learning_rate": 1.165437442631112e-08,
"loss": 0.4026,
"step": 1259
},
{
"epoch": 18.900562851782365,
"grad_norm": 0.011773188892312082,
"learning_rate": 1.1276109716355286e-08,
"loss": 0.3999,
"step": 1260
},
{
"epoch": 18.915572232645403,
"grad_norm": 0.011987595281597069,
"learning_rate": 1.0904050544883858e-08,
"loss": 0.4038,
"step": 1261
},
{
"epoch": 18.930581613508444,
"grad_norm": 0.011884116567775816,
"learning_rate": 1.0538199247055678e-08,
"loss": 0.4053,
"step": 1262
},
{
"epoch": 18.945590994371482,
"grad_norm": 0.011817654569350527,
"learning_rate": 1.0178558119067315e-08,
"loss": 0.4057,
"step": 1263
},
{
"epoch": 18.96060037523452,
"grad_norm": 0.012055678102132312,
"learning_rate": 9.825129418138178e-09,
"loss": 0.4078,
"step": 1264
},
{
"epoch": 18.975609756097562,
"grad_norm": 0.012071982250938978,
"learning_rate": 9.477915362496758e-09,
"loss": 0.4203,
"step": 1265
},
{
"epoch": 18.9906191369606,
"grad_norm": 0.012189144844470683,
"learning_rate": 9.13691813136641e-09,
"loss": 0.4162,
"step": 1266
},
{
"epoch": 19.0,
"grad_norm": 0.013032683738861633,
"learning_rate": 8.802139864951596e-09,
"loss": 0.3283,
"step": 1267
},
{
"epoch": 19.015009380863038,
"grad_norm": 0.014335853539907665,
"learning_rate": 8.473582664424995e-09,
"loss": 0.4792,
"step": 1268
},
{
"epoch": 19.03001876172608,
"grad_norm": 0.012484679040138053,
"learning_rate": 8.151248591913518e-09,
"loss": 0.4106,
"step": 1269
},
{
"epoch": 19.045028142589118,
"grad_norm": 0.012105246321067362,
"learning_rate": 7.835139670486212e-09,
"loss": 0.4001,
"step": 1270
},
{
"epoch": 19.06003752345216,
"grad_norm": 0.011751825236754727,
"learning_rate": 7.525257884140823e-09,
"loss": 0.4008,
"step": 1271
},
{
"epoch": 19.075046904315197,
"grad_norm": 0.013175703813983093,
"learning_rate": 7.2216051777916894e-09,
"loss": 0.4093,
"step": 1272
},
{
"epoch": 19.090056285178235,
"grad_norm": 0.011978352610379423,
"learning_rate": 6.924183457257871e-09,
"loss": 0.4121,
"step": 1273
},
{
"epoch": 19.105065666041277,
"grad_norm": 0.011787239977337143,
"learning_rate": 6.632994589250262e-09,
"loss": 0.4121,
"step": 1274
},
{
"epoch": 19.120075046904315,
"grad_norm": 0.012318814698430361,
"learning_rate": 6.3480404013608325e-09,
"loss": 0.4043,
"step": 1275
},
{
"epoch": 19.135084427767353,
"grad_norm": 0.012037273362049511,
"learning_rate": 6.069322682050515e-09,
"loss": 0.4066,
"step": 1276
},
{
"epoch": 19.150093808630395,
"grad_norm": 0.011959073152947426,
"learning_rate": 5.796843180638555e-09,
"loss": 0.4095,
"step": 1277
},
{
"epoch": 19.165103189493433,
"grad_norm": 0.011966460370732191,
"learning_rate": 5.530603607290851e-09,
"loss": 0.4224,
"step": 1278
},
{
"epoch": 19.180112570356474,
"grad_norm": 0.012353059701364673,
"learning_rate": 5.2706056330098505e-09,
"loss": 0.4063,
"step": 1279
},
{
"epoch": 19.195121951219512,
"grad_norm": 0.011930396547934605,
"learning_rate": 5.0168508896235585e-09,
"loss": 0.4033,
"step": 1280
},
{
"epoch": 19.21013133208255,
"grad_norm": 0.011731376690623204,
"learning_rate": 4.769340969775659e-09,
"loss": 0.405,
"step": 1281
},
{
"epoch": 19.225140712945592,
"grad_norm": 0.012283286872202488,
"learning_rate": 4.528077426915411e-09,
"loss": 0.4106,
"step": 1282
},
{
"epoch": 19.24015009380863,
"grad_norm": 0.012574046336957924,
"learning_rate": 4.293061775287654e-09,
"loss": 0.4065,
"step": 1283
},
{
"epoch": 19.255159474671668,
"grad_norm": 0.011951354592048286,
"learning_rate": 4.064295489923819e-09,
"loss": 0.4023,
"step": 1284
},
{
"epoch": 19.27016885553471,
"grad_norm": 0.012132271788831939,
"learning_rate": 3.841780006632267e-09,
"loss": 0.4075,
"step": 1285
},
{
"epoch": 19.285178236397748,
"grad_norm": 0.01221857164582425,
"learning_rate": 3.625516721989075e-09,
"loss": 0.4077,
"step": 1286
},
{
"epoch": 19.30018761726079,
"grad_norm": 0.011863748308289977,
"learning_rate": 3.415506993330153e-09,
"loss": 0.4045,
"step": 1287
},
{
"epoch": 19.315196998123827,
"grad_norm": 0.011741159713343184,
"learning_rate": 3.211752138741697e-09,
"loss": 0.4143,
"step": 1288
},
{
"epoch": 19.330206378986865,
"grad_norm": 0.01191775807178149,
"learning_rate": 3.0142534370524164e-09,
"loss": 0.4062,
"step": 1289
},
{
"epoch": 19.345215759849907,
"grad_norm": 0.012215595050070076,
"learning_rate": 2.8230121278257635e-09,
"loss": 0.4093,
"step": 1290
},
{
"epoch": 19.360225140712945,
"grad_norm": 0.012564655988299225,
"learning_rate": 2.6380294113514943e-09,
"loss": 0.4139,
"step": 1291
},
{
"epoch": 19.375234521575987,
"grad_norm": 0.012477001425241382,
"learning_rate": 2.459306448638676e-09,
"loss": 0.405,
"step": 1292
},
{
"epoch": 19.390243902439025,
"grad_norm": 0.01206388735700181,
"learning_rate": 2.2868443614082468e-09,
"loss": 0.4231,
"step": 1293
},
{
"epoch": 19.405253283302063,
"grad_norm": 0.012208171675223407,
"learning_rate": 2.1206442320858e-09,
"loss": 0.4005,
"step": 1294
},
{
"epoch": 19.420262664165104,
"grad_norm": 0.012153451875566665,
"learning_rate": 1.960707103795034e-09,
"loss": 0.4028,
"step": 1295
},
{
"epoch": 19.435272045028142,
"grad_norm": 0.012279486741819839,
"learning_rate": 1.8070339803509804e-09,
"loss": 0.4057,
"step": 1296
},
{
"epoch": 19.45028142589118,
"grad_norm": 0.012000294004781479,
"learning_rate": 1.6596258262541184e-09,
"loss": 0.4097,
"step": 1297
},
{
"epoch": 19.465290806754222,
"grad_norm": 0.012492834461960383,
"learning_rate": 1.5184835666838258e-09,
"loss": 0.4173,
"step": 1298
},
{
"epoch": 19.48030018761726,
"grad_norm": 0.012299822720232542,
"learning_rate": 1.3836080874926047e-09,
"loss": 0.4092,
"step": 1299
},
{
"epoch": 19.4953095684803,
"grad_norm": 0.01218733011356003,
"learning_rate": 1.2550002352010868e-09,
"loss": 0.4218,
"step": 1300
},
{
"epoch": 19.51031894934334,
"grad_norm": 0.012042960770651666,
"learning_rate": 1.1326608169920371e-09,
"loss": 0.3959,
"step": 1301
},
{
"epoch": 19.525328330206378,
"grad_norm": 0.011997398364275975,
"learning_rate": 1.0165906007056912e-09,
"loss": 0.4166,
"step": 1302
},
{
"epoch": 19.54033771106942,
"grad_norm": 0.012305321109405035,
"learning_rate": 9.067903148348711e-10,
"loss": 0.4087,
"step": 1303
},
{
"epoch": 19.555347091932457,
"grad_norm": 0.012014818503486962,
"learning_rate": 8.032606485200988e-10,
"loss": 0.4154,
"step": 1304
},
{
"epoch": 19.570356472795496,
"grad_norm": 0.01251044844330409,
"learning_rate": 7.060022515460451e-10,
"loss": 0.4102,
"step": 1305
},
{
"epoch": 19.585365853658537,
"grad_norm": 0.012008572784907577,
"learning_rate": 6.150157343364215e-10,
"loss": 0.4152,
"step": 1306
},
{
"epoch": 19.600375234521575,
"grad_norm": 0.012287338325467436,
"learning_rate": 5.303016679509831e-10,
"loss": 0.3981,
"step": 1307
},
{
"epoch": 19.615384615384617,
"grad_norm": 0.012267724291012534,
"learning_rate": 4.518605840815315e-10,
"loss": 0.4135,
"step": 1308
},
{
"epoch": 19.630393996247655,
"grad_norm": 0.011809176076779143,
"learning_rate": 3.7969297504858443e-10,
"loss": 0.3975,
"step": 1309
},
{
"epoch": 19.645403377110693,
"grad_norm": 0.012294547435049158,
"learning_rate": 3.1379929379871104e-10,
"loss": 0.4003,
"step": 1310
},
{
"epoch": 19.660412757973734,
"grad_norm": 0.011704232467770586,
"learning_rate": 2.541799539008682e-10,
"loss": 0.4041,
"step": 1311
},
{
"epoch": 19.675422138836772,
"grad_norm": 0.011850331991569664,
"learning_rate": 2.0083532954484618e-10,
"loss": 0.404,
"step": 1312
},
{
"epoch": 19.690431519699814,
"grad_norm": 0.011944977782669314,
"learning_rate": 1.5376575553793793e-10,
"loss": 0.3907,
"step": 1313
},
{
"epoch": 19.705440900562852,
"grad_norm": 0.012277893048513143,
"learning_rate": 1.1297152730338489e-10,
"loss": 0.4192,
"step": 1314
},
{
"epoch": 19.72045028142589,
"grad_norm": 0.012179141361100891,
"learning_rate": 7.845290087848954e-11,
"loss": 0.4054,
"step": 1315
},
{
"epoch": 19.735459662288932,
"grad_norm": 0.012034350217278049,
"learning_rate": 5.0210092912950087e-11,
"loss": 0.4143,
"step": 1316
},
{
"epoch": 19.75046904315197,
"grad_norm": 0.012006277883402618,
"learning_rate": 2.824328066730608e-11,
"loss": 0.3977,
"step": 1317
},
{
"epoch": 19.765478424015008,
"grad_norm": 0.011954480700150845,
"learning_rate": 1.255260201216135e-11,
"loss": 0.3968,
"step": 1318
},
{
"epoch": 19.78048780487805,
"grad_norm": 0.01235198314980799,
"learning_rate": 3.138155427073741e-12,
"loss": 0.4071,
"step": 1319
},
{
"epoch": 19.795497185741088,
"grad_norm": 0.012063710602907587,
"learning_rate": 0.0,
"loss": 0.4101,
"step": 1320
},
{
"epoch": 19.795497185741088,
"eval_loss": 0.3939184844493866,
"eval_runtime": 14.1078,
"eval_samples_per_second": 31.685,
"eval_steps_per_second": 1.985,
"step": 1320
}
],
"logging_steps": 1,
"max_steps": 1320,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 66,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1201839626256384.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}