Safetensors
Russian
bert
RuMathBERT / trainer_state.json
latushko-anna's picture
Rename model/trainer_state.json to trainer_state.json
a5a7e4d verified
raw
history blame
70.3 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 191805,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013034071061755428,
"grad_norm": 2.9903717041015625,
"learning_rate": 4.9869659289382446e-05,
"loss": 6.0967,
"step": 500
},
{
"epoch": 0.026068142123510857,
"grad_norm": 4.055346965789795,
"learning_rate": 4.973931857876489e-05,
"loss": 5.5622,
"step": 1000
},
{
"epoch": 0.039102213185266285,
"grad_norm": 5.480607032775879,
"learning_rate": 4.960897786814734e-05,
"loss": 5.4371,
"step": 1500
},
{
"epoch": 0.052136284247021714,
"grad_norm": 4.727134704589844,
"learning_rate": 4.9478637157529784e-05,
"loss": 5.3535,
"step": 2000
},
{
"epoch": 0.06517035530877714,
"grad_norm": 4.737260341644287,
"learning_rate": 4.934829644691223e-05,
"loss": 5.2508,
"step": 2500
},
{
"epoch": 0.07820442637053257,
"grad_norm": 5.066771984100342,
"learning_rate": 4.921795573629467e-05,
"loss": 5.199,
"step": 3000
},
{
"epoch": 0.091238497432288,
"grad_norm": 3.627026319503784,
"learning_rate": 4.908761502567713e-05,
"loss": 5.084,
"step": 3500
},
{
"epoch": 0.10427256849404343,
"grad_norm": 4.254016876220703,
"learning_rate": 4.895727431505957e-05,
"loss": 4.9441,
"step": 4000
},
{
"epoch": 0.11730663955579886,
"grad_norm": 6.351306438446045,
"learning_rate": 4.8826933604442015e-05,
"loss": 4.8021,
"step": 4500
},
{
"epoch": 0.13034071061755428,
"grad_norm": 7.492619037628174,
"learning_rate": 4.869659289382446e-05,
"loss": 4.6446,
"step": 5000
},
{
"epoch": 0.14337478167930973,
"grad_norm": 6.017455577850342,
"learning_rate": 4.856625218320691e-05,
"loss": 4.4574,
"step": 5500
},
{
"epoch": 0.15640885274106514,
"grad_norm": 5.2971343994140625,
"learning_rate": 4.843591147258935e-05,
"loss": 4.2184,
"step": 6000
},
{
"epoch": 0.16944292380282058,
"grad_norm": 9.367820739746094,
"learning_rate": 4.8305570761971796e-05,
"loss": 4.101,
"step": 6500
},
{
"epoch": 0.182476994864576,
"grad_norm": 7.676972389221191,
"learning_rate": 4.817523005135424e-05,
"loss": 3.9548,
"step": 7000
},
{
"epoch": 0.19551106592633144,
"grad_norm": 6.3607563972473145,
"learning_rate": 4.804488934073669e-05,
"loss": 3.8584,
"step": 7500
},
{
"epoch": 0.20854513698808685,
"grad_norm": 5.45451021194458,
"learning_rate": 4.7914548630119134e-05,
"loss": 3.7841,
"step": 8000
},
{
"epoch": 0.2215792080498423,
"grad_norm": 16.199485778808594,
"learning_rate": 4.778420791950158e-05,
"loss": 3.6685,
"step": 8500
},
{
"epoch": 0.2346132791115977,
"grad_norm": 6.077032089233398,
"learning_rate": 4.765386720888402e-05,
"loss": 3.6017,
"step": 9000
},
{
"epoch": 0.24764735017335315,
"grad_norm": 11.489569664001465,
"learning_rate": 4.752352649826647e-05,
"loss": 3.5553,
"step": 9500
},
{
"epoch": 0.26068142123510857,
"grad_norm": 4.917782783508301,
"learning_rate": 4.7393185787648915e-05,
"loss": 3.4537,
"step": 10000
},
{
"epoch": 0.273715492296864,
"grad_norm": 5.945028781890869,
"learning_rate": 4.7262845077031366e-05,
"loss": 3.4442,
"step": 10500
},
{
"epoch": 0.28674956335861945,
"grad_norm": 7.648957252502441,
"learning_rate": 4.713250436641381e-05,
"loss": 3.3772,
"step": 11000
},
{
"epoch": 0.29978363442037487,
"grad_norm": 7.488467216491699,
"learning_rate": 4.700216365579625e-05,
"loss": 3.3026,
"step": 11500
},
{
"epoch": 0.3128177054821303,
"grad_norm": 5.8792619705200195,
"learning_rate": 4.68718229451787e-05,
"loss": 3.2446,
"step": 12000
},
{
"epoch": 0.3258517765438857,
"grad_norm": 10.038032531738281,
"learning_rate": 4.674148223456115e-05,
"loss": 3.216,
"step": 12500
},
{
"epoch": 0.33888584760564117,
"grad_norm": 7.69769811630249,
"learning_rate": 4.661114152394359e-05,
"loss": 3.1869,
"step": 13000
},
{
"epoch": 0.3519199186673966,
"grad_norm": 6.179595470428467,
"learning_rate": 4.6480800813326034e-05,
"loss": 3.1464,
"step": 13500
},
{
"epoch": 0.364953989729152,
"grad_norm": 5.665715217590332,
"learning_rate": 4.6350460102708484e-05,
"loss": 3.079,
"step": 14000
},
{
"epoch": 0.3779880607909074,
"grad_norm": 4.681985855102539,
"learning_rate": 4.622011939209093e-05,
"loss": 3.0724,
"step": 14500
},
{
"epoch": 0.3910221318526629,
"grad_norm": 11.111820220947266,
"learning_rate": 4.608977868147337e-05,
"loss": 3.0356,
"step": 15000
},
{
"epoch": 0.4040562029144183,
"grad_norm": 5.951188564300537,
"learning_rate": 4.5959437970855815e-05,
"loss": 3.01,
"step": 15500
},
{
"epoch": 0.4170902739761737,
"grad_norm": 5.438151836395264,
"learning_rate": 4.5829097260238266e-05,
"loss": 2.9605,
"step": 16000
},
{
"epoch": 0.4301243450379291,
"grad_norm": 10.49527645111084,
"learning_rate": 4.569875654962071e-05,
"loss": 2.9453,
"step": 16500
},
{
"epoch": 0.4431584160996846,
"grad_norm": 6.611765384674072,
"learning_rate": 4.556841583900316e-05,
"loss": 2.9529,
"step": 17000
},
{
"epoch": 0.45619248716144,
"grad_norm": 5.289289474487305,
"learning_rate": 4.54380751283856e-05,
"loss": 2.9081,
"step": 17500
},
{
"epoch": 0.4692265582231954,
"grad_norm": 5.65715217590332,
"learning_rate": 4.530773441776805e-05,
"loss": 2.8152,
"step": 18000
},
{
"epoch": 0.48226062928495084,
"grad_norm": 5.513209819793701,
"learning_rate": 4.51773937071505e-05,
"loss": 2.8664,
"step": 18500
},
{
"epoch": 0.4952947003467063,
"grad_norm": 4.413240909576416,
"learning_rate": 4.504705299653294e-05,
"loss": 2.8854,
"step": 19000
},
{
"epoch": 0.5083287714084617,
"grad_norm": 5.602241039276123,
"learning_rate": 4.4916712285915384e-05,
"loss": 2.8295,
"step": 19500
},
{
"epoch": 0.5213628424702171,
"grad_norm": 8.221460342407227,
"learning_rate": 4.478637157529783e-05,
"loss": 2.7826,
"step": 20000
},
{
"epoch": 0.5343969135319726,
"grad_norm": 5.350883483886719,
"learning_rate": 4.465603086468028e-05,
"loss": 2.7846,
"step": 20500
},
{
"epoch": 0.547430984593728,
"grad_norm": 6.6059393882751465,
"learning_rate": 4.452569015406272e-05,
"loss": 2.7562,
"step": 21000
},
{
"epoch": 0.5604650556554834,
"grad_norm": 7.050083637237549,
"learning_rate": 4.4395349443445166e-05,
"loss": 2.7102,
"step": 21500
},
{
"epoch": 0.5734991267172389,
"grad_norm": 6.74811315536499,
"learning_rate": 4.426500873282761e-05,
"loss": 2.7215,
"step": 22000
},
{
"epoch": 0.5865331977789943,
"grad_norm": 7.959073543548584,
"learning_rate": 4.413466802221006e-05,
"loss": 2.7185,
"step": 22500
},
{
"epoch": 0.5995672688407497,
"grad_norm": 7.594911098480225,
"learning_rate": 4.40043273115925e-05,
"loss": 2.6624,
"step": 23000
},
{
"epoch": 0.6126013399025051,
"grad_norm": 5.935075283050537,
"learning_rate": 4.3873986600974954e-05,
"loss": 2.6398,
"step": 23500
},
{
"epoch": 0.6256354109642606,
"grad_norm": 7.0315961837768555,
"learning_rate": 4.37436458903574e-05,
"loss": 2.6571,
"step": 24000
},
{
"epoch": 0.638669482026016,
"grad_norm": 6.930845260620117,
"learning_rate": 4.361330517973984e-05,
"loss": 2.6009,
"step": 24500
},
{
"epoch": 0.6517035530877714,
"grad_norm": 14.607309341430664,
"learning_rate": 4.348296446912229e-05,
"loss": 2.6493,
"step": 25000
},
{
"epoch": 0.6647376241495269,
"grad_norm": 5.613809108734131,
"learning_rate": 4.3352623758504735e-05,
"loss": 2.6042,
"step": 25500
},
{
"epoch": 0.6777716952112823,
"grad_norm": 6.0553693771362305,
"learning_rate": 4.322228304788718e-05,
"loss": 2.6153,
"step": 26000
},
{
"epoch": 0.6908057662730377,
"grad_norm": 8.716107368469238,
"learning_rate": 4.309194233726962e-05,
"loss": 2.5757,
"step": 26500
},
{
"epoch": 0.7038398373347932,
"grad_norm": 7.430722713470459,
"learning_rate": 4.296160162665207e-05,
"loss": 2.5682,
"step": 27000
},
{
"epoch": 0.7168739083965486,
"grad_norm": 9.687034606933594,
"learning_rate": 4.2831260916034516e-05,
"loss": 2.5377,
"step": 27500
},
{
"epoch": 0.729907979458304,
"grad_norm": 3.729767084121704,
"learning_rate": 4.270092020541696e-05,
"loss": 2.5217,
"step": 28000
},
{
"epoch": 0.7429420505200595,
"grad_norm": 9.692636489868164,
"learning_rate": 4.25705794947994e-05,
"loss": 2.4829,
"step": 28500
},
{
"epoch": 0.7559761215818148,
"grad_norm": 8.260266304016113,
"learning_rate": 4.2440238784181854e-05,
"loss": 2.4971,
"step": 29000
},
{
"epoch": 0.7690101926435703,
"grad_norm": 5.885035037994385,
"learning_rate": 4.23098980735643e-05,
"loss": 2.4823,
"step": 29500
},
{
"epoch": 0.7820442637053258,
"grad_norm": 11.001029968261719,
"learning_rate": 4.217955736294674e-05,
"loss": 2.4583,
"step": 30000
},
{
"epoch": 0.7950783347670811,
"grad_norm": 9.69256591796875,
"learning_rate": 4.204921665232919e-05,
"loss": 2.447,
"step": 30500
},
{
"epoch": 0.8081124058288366,
"grad_norm": 15.954379081726074,
"learning_rate": 4.191887594171164e-05,
"loss": 2.4427,
"step": 31000
},
{
"epoch": 0.8211464768905921,
"grad_norm": 5.421440124511719,
"learning_rate": 4.1788535231094085e-05,
"loss": 2.4181,
"step": 31500
},
{
"epoch": 0.8341805479523474,
"grad_norm": 9.169551849365234,
"learning_rate": 4.165819452047653e-05,
"loss": 2.4105,
"step": 32000
},
{
"epoch": 0.8472146190141029,
"grad_norm": 5.778009414672852,
"learning_rate": 4.152785380985897e-05,
"loss": 2.4145,
"step": 32500
},
{
"epoch": 0.8602486900758582,
"grad_norm": 6.441959857940674,
"learning_rate": 4.139751309924142e-05,
"loss": 2.4334,
"step": 33000
},
{
"epoch": 0.8732827611376137,
"grad_norm": 7.385718822479248,
"learning_rate": 4.1267172388623866e-05,
"loss": 2.392,
"step": 33500
},
{
"epoch": 0.8863168321993692,
"grad_norm": 15.347734451293945,
"learning_rate": 4.113683167800631e-05,
"loss": 2.3981,
"step": 34000
},
{
"epoch": 0.8993509032611245,
"grad_norm": 10.47854232788086,
"learning_rate": 4.1006490967388754e-05,
"loss": 2.3511,
"step": 34500
},
{
"epoch": 0.91238497432288,
"grad_norm": 11.82073974609375,
"learning_rate": 4.0876150256771204e-05,
"loss": 2.3632,
"step": 35000
},
{
"epoch": 0.9254190453846355,
"grad_norm": 8.932971954345703,
"learning_rate": 4.074580954615365e-05,
"loss": 2.3272,
"step": 35500
},
{
"epoch": 0.9384531164463908,
"grad_norm": 11.068861961364746,
"learning_rate": 4.061546883553609e-05,
"loss": 2.3321,
"step": 36000
},
{
"epoch": 0.9514871875081463,
"grad_norm": 5.649448871612549,
"learning_rate": 4.0485128124918535e-05,
"loss": 2.3498,
"step": 36500
},
{
"epoch": 0.9645212585699017,
"grad_norm": 9.020928382873535,
"learning_rate": 4.0354787414300985e-05,
"loss": 2.3331,
"step": 37000
},
{
"epoch": 0.9775553296316571,
"grad_norm": 12.966954231262207,
"learning_rate": 4.0224446703683436e-05,
"loss": 2.3095,
"step": 37500
},
{
"epoch": 0.9905894006934126,
"grad_norm": 5.641653060913086,
"learning_rate": 4.009410599306588e-05,
"loss": 2.3127,
"step": 38000
},
{
"epoch": 1.003623471755168,
"grad_norm": 8.139008522033691,
"learning_rate": 3.996376528244832e-05,
"loss": 2.2846,
"step": 38500
},
{
"epoch": 1.0166575428169233,
"grad_norm": 7.005831241607666,
"learning_rate": 3.9833424571830766e-05,
"loss": 2.2518,
"step": 39000
},
{
"epoch": 1.029691613878679,
"grad_norm": 3.906301975250244,
"learning_rate": 3.970308386121322e-05,
"loss": 2.2632,
"step": 39500
},
{
"epoch": 1.0427256849404343,
"grad_norm": 4.201974391937256,
"learning_rate": 3.957274315059566e-05,
"loss": 2.2299,
"step": 40000
},
{
"epoch": 1.0557597560021896,
"grad_norm": 6.107882022857666,
"learning_rate": 3.9442402439978104e-05,
"loss": 2.2016,
"step": 40500
},
{
"epoch": 1.0687938270639452,
"grad_norm": 8.289084434509277,
"learning_rate": 3.931206172936055e-05,
"loss": 2.2227,
"step": 41000
},
{
"epoch": 1.0818278981257006,
"grad_norm": 5.386382102966309,
"learning_rate": 3.9181721018743e-05,
"loss": 2.1849,
"step": 41500
},
{
"epoch": 1.094861969187456,
"grad_norm": 5.536214828491211,
"learning_rate": 3.905138030812544e-05,
"loss": 2.2085,
"step": 42000
},
{
"epoch": 1.1078960402492115,
"grad_norm": 67.06414031982422,
"learning_rate": 3.8921039597507885e-05,
"loss": 2.2039,
"step": 42500
},
{
"epoch": 1.1209301113109669,
"grad_norm": 8.36019229888916,
"learning_rate": 3.879069888689033e-05,
"loss": 2.1925,
"step": 43000
},
{
"epoch": 1.1339641823727222,
"grad_norm": 14.266386985778809,
"learning_rate": 3.866035817627278e-05,
"loss": 2.2101,
"step": 43500
},
{
"epoch": 1.1469982534344778,
"grad_norm": 11.47070598602295,
"learning_rate": 3.853001746565523e-05,
"loss": 2.1402,
"step": 44000
},
{
"epoch": 1.1600323244962332,
"grad_norm": 5.293683052062988,
"learning_rate": 3.839967675503767e-05,
"loss": 2.1872,
"step": 44500
},
{
"epoch": 1.1730663955579885,
"grad_norm": 32.234737396240234,
"learning_rate": 3.826933604442012e-05,
"loss": 2.1357,
"step": 45000
},
{
"epoch": 1.1861004666197439,
"grad_norm": 3.9005160331726074,
"learning_rate": 3.813899533380256e-05,
"loss": 2.1263,
"step": 45500
},
{
"epoch": 1.1991345376814995,
"grad_norm": 9.012932777404785,
"learning_rate": 3.800865462318501e-05,
"loss": 2.1718,
"step": 46000
},
{
"epoch": 1.2121686087432548,
"grad_norm": 8.86204719543457,
"learning_rate": 3.7878313912567454e-05,
"loss": 2.1718,
"step": 46500
},
{
"epoch": 1.2252026798050104,
"grad_norm": 29.908674240112305,
"learning_rate": 3.77479732019499e-05,
"loss": 2.1227,
"step": 47000
},
{
"epoch": 1.2382367508667658,
"grad_norm": 3.599839687347412,
"learning_rate": 3.761763249133234e-05,
"loss": 2.1301,
"step": 47500
},
{
"epoch": 1.2512708219285211,
"grad_norm": 12.039328575134277,
"learning_rate": 3.748729178071479e-05,
"loss": 2.1226,
"step": 48000
},
{
"epoch": 1.2643048929902765,
"grad_norm": 3.92248797416687,
"learning_rate": 3.7356951070097236e-05,
"loss": 2.156,
"step": 48500
},
{
"epoch": 1.277338964052032,
"grad_norm": 22.514301300048828,
"learning_rate": 3.722661035947968e-05,
"loss": 2.1001,
"step": 49000
},
{
"epoch": 1.2903730351137874,
"grad_norm": 4.8082990646362305,
"learning_rate": 3.709626964886212e-05,
"loss": 2.1167,
"step": 49500
},
{
"epoch": 1.303407106175543,
"grad_norm": 7.884994983673096,
"learning_rate": 3.696592893824457e-05,
"loss": 2.1118,
"step": 50000
},
{
"epoch": 1.3164411772372984,
"grad_norm": 4.282125949859619,
"learning_rate": 3.6835588227627024e-05,
"loss": 2.0749,
"step": 50500
},
{
"epoch": 1.3294752482990537,
"grad_norm": 19.30133819580078,
"learning_rate": 3.670524751700947e-05,
"loss": 2.1081,
"step": 51000
},
{
"epoch": 1.342509319360809,
"grad_norm": 3.800236463546753,
"learning_rate": 3.657490680639191e-05,
"loss": 2.0964,
"step": 51500
},
{
"epoch": 1.3555433904225647,
"grad_norm": 5.734689235687256,
"learning_rate": 3.6444566095774355e-05,
"loss": 2.0736,
"step": 52000
},
{
"epoch": 1.36857746148432,
"grad_norm": 7.496071815490723,
"learning_rate": 3.6314225385156805e-05,
"loss": 2.0545,
"step": 52500
},
{
"epoch": 1.3816115325460754,
"grad_norm": 7.645195007324219,
"learning_rate": 3.618388467453925e-05,
"loss": 2.0407,
"step": 53000
},
{
"epoch": 1.394645603607831,
"grad_norm": 22.738969802856445,
"learning_rate": 3.605354396392169e-05,
"loss": 2.0554,
"step": 53500
},
{
"epoch": 1.4076796746695863,
"grad_norm": 9.185379028320312,
"learning_rate": 3.5923203253304136e-05,
"loss": 2.0364,
"step": 54000
},
{
"epoch": 1.4207137457313417,
"grad_norm": 9.092364311218262,
"learning_rate": 3.5792862542686586e-05,
"loss": 2.023,
"step": 54500
},
{
"epoch": 1.433747816793097,
"grad_norm": 3.8213064670562744,
"learning_rate": 3.566252183206903e-05,
"loss": 2.0429,
"step": 55000
},
{
"epoch": 1.4467818878548526,
"grad_norm": 15.87769603729248,
"learning_rate": 3.553218112145147e-05,
"loss": 1.9853,
"step": 55500
},
{
"epoch": 1.459815958916608,
"grad_norm": 8.585647583007812,
"learning_rate": 3.540184041083392e-05,
"loss": 2.0239,
"step": 56000
},
{
"epoch": 1.4728500299783636,
"grad_norm": 4.249543190002441,
"learning_rate": 3.527149970021637e-05,
"loss": 2.0305,
"step": 56500
},
{
"epoch": 1.485884101040119,
"grad_norm": 6.320367336273193,
"learning_rate": 3.514115898959881e-05,
"loss": 2.0173,
"step": 57000
},
{
"epoch": 1.4989181721018743,
"grad_norm": 5.058931350708008,
"learning_rate": 3.501081827898126e-05,
"loss": 1.9641,
"step": 57500
},
{
"epoch": 1.5119522431636296,
"grad_norm": 10.568583488464355,
"learning_rate": 3.4880477568363705e-05,
"loss": 2.035,
"step": 58000
},
{
"epoch": 1.524986314225385,
"grad_norm": 6.535768985748291,
"learning_rate": 3.475013685774615e-05,
"loss": 1.9971,
"step": 58500
},
{
"epoch": 1.5380203852871406,
"grad_norm": 11.262877464294434,
"learning_rate": 3.46197961471286e-05,
"loss": 2.0076,
"step": 59000
},
{
"epoch": 1.5510544563488962,
"grad_norm": 8.998533248901367,
"learning_rate": 3.448945543651104e-05,
"loss": 1.986,
"step": 59500
},
{
"epoch": 1.5640885274106515,
"grad_norm": 5.243868827819824,
"learning_rate": 3.4359114725893486e-05,
"loss": 2.0148,
"step": 60000
},
{
"epoch": 1.5771225984724069,
"grad_norm": 6.43707275390625,
"learning_rate": 3.422877401527593e-05,
"loss": 1.9952,
"step": 60500
},
{
"epoch": 1.5901566695341622,
"grad_norm": 10.8756742477417,
"learning_rate": 3.409843330465838e-05,
"loss": 1.9688,
"step": 61000
},
{
"epoch": 1.6031907405959176,
"grad_norm": 3.6488418579101562,
"learning_rate": 3.3968092594040824e-05,
"loss": 1.9545,
"step": 61500
},
{
"epoch": 1.6162248116576732,
"grad_norm": 3.8945696353912354,
"learning_rate": 3.383775188342327e-05,
"loss": 1.9692,
"step": 62000
},
{
"epoch": 1.6292588827194285,
"grad_norm": 4.477757453918457,
"learning_rate": 3.370741117280571e-05,
"loss": 1.9559,
"step": 62500
},
{
"epoch": 1.6422929537811841,
"grad_norm": 5.086141586303711,
"learning_rate": 3.357707046218816e-05,
"loss": 1.929,
"step": 63000
},
{
"epoch": 1.6553270248429395,
"grad_norm": 5.249891757965088,
"learning_rate": 3.3446729751570605e-05,
"loss": 1.9686,
"step": 63500
},
{
"epoch": 1.6683610959046948,
"grad_norm": 9.6456880569458,
"learning_rate": 3.3316389040953055e-05,
"loss": 1.952,
"step": 64000
},
{
"epoch": 1.6813951669664502,
"grad_norm": 5.007114410400391,
"learning_rate": 3.31860483303355e-05,
"loss": 1.9229,
"step": 64500
},
{
"epoch": 1.6944292380282058,
"grad_norm": 4.589148044586182,
"learning_rate": 3.305570761971795e-05,
"loss": 1.9296,
"step": 65000
},
{
"epoch": 1.7074633090899611,
"grad_norm": 10.281172752380371,
"learning_rate": 3.292536690910039e-05,
"loss": 1.9153,
"step": 65500
},
{
"epoch": 1.7204973801517167,
"grad_norm": 7.041563034057617,
"learning_rate": 3.2795026198482837e-05,
"loss": 1.9276,
"step": 66000
},
{
"epoch": 1.733531451213472,
"grad_norm": 8.523409843444824,
"learning_rate": 3.266468548786528e-05,
"loss": 1.8871,
"step": 66500
},
{
"epoch": 1.7465655222752274,
"grad_norm": 18.92120361328125,
"learning_rate": 3.253434477724773e-05,
"loss": 1.8963,
"step": 67000
},
{
"epoch": 1.7595995933369828,
"grad_norm": 17.547399520874023,
"learning_rate": 3.2404004066630174e-05,
"loss": 1.9069,
"step": 67500
},
{
"epoch": 1.7726336643987382,
"grad_norm": 9.223323822021484,
"learning_rate": 3.227366335601262e-05,
"loss": 1.9232,
"step": 68000
},
{
"epoch": 1.7856677354604937,
"grad_norm": 17.263656616210938,
"learning_rate": 3.214332264539506e-05,
"loss": 1.89,
"step": 68500
},
{
"epoch": 1.7987018065222493,
"grad_norm": 19.6173152923584,
"learning_rate": 3.201298193477751e-05,
"loss": 1.8764,
"step": 69000
},
{
"epoch": 1.8117358775840047,
"grad_norm": 10.714072227478027,
"learning_rate": 3.1882641224159955e-05,
"loss": 1.9165,
"step": 69500
},
{
"epoch": 1.82476994864576,
"grad_norm": 5.039360523223877,
"learning_rate": 3.17523005135424e-05,
"loss": 1.8422,
"step": 70000
},
{
"epoch": 1.8378040197075154,
"grad_norm": 28.72756576538086,
"learning_rate": 3.162195980292485e-05,
"loss": 1.8819,
"step": 70500
},
{
"epoch": 1.8508380907692707,
"grad_norm": 4.069336414337158,
"learning_rate": 3.149161909230729e-05,
"loss": 1.8769,
"step": 71000
},
{
"epoch": 1.8638721618310263,
"grad_norm": 4.223635196685791,
"learning_rate": 3.136127838168974e-05,
"loss": 1.8799,
"step": 71500
},
{
"epoch": 1.8769062328927817,
"grad_norm": 10.401415824890137,
"learning_rate": 3.123093767107219e-05,
"loss": 1.905,
"step": 72000
},
{
"epoch": 1.8899403039545373,
"grad_norm": 5.064211368560791,
"learning_rate": 3.110059696045463e-05,
"loss": 1.827,
"step": 72500
},
{
"epoch": 1.9029743750162926,
"grad_norm": 4.138282299041748,
"learning_rate": 3.0970256249837074e-05,
"loss": 1.8237,
"step": 73000
},
{
"epoch": 1.916008446078048,
"grad_norm": 3.365440845489502,
"learning_rate": 3.0839915539219525e-05,
"loss": 1.8421,
"step": 73500
},
{
"epoch": 1.9290425171398033,
"grad_norm": 7.819665431976318,
"learning_rate": 3.070957482860197e-05,
"loss": 1.8413,
"step": 74000
},
{
"epoch": 1.942076588201559,
"grad_norm": 8.81440544128418,
"learning_rate": 3.057923411798441e-05,
"loss": 1.8633,
"step": 74500
},
{
"epoch": 1.9551106592633143,
"grad_norm": 12.814815521240234,
"learning_rate": 3.044889340736686e-05,
"loss": 1.8255,
"step": 75000
},
{
"epoch": 1.9681447303250699,
"grad_norm": 7.332582950592041,
"learning_rate": 3.0318552696749302e-05,
"loss": 1.8228,
"step": 75500
},
{
"epoch": 1.9811788013868252,
"grad_norm": 6.4567694664001465,
"learning_rate": 3.018821198613175e-05,
"loss": 1.8514,
"step": 76000
},
{
"epoch": 1.9942128724485806,
"grad_norm": 33.37932205200195,
"learning_rate": 3.0057871275514193e-05,
"loss": 1.8347,
"step": 76500
},
{
"epoch": 2.007246943510336,
"grad_norm": 3.908621072769165,
"learning_rate": 2.992753056489664e-05,
"loss": 1.8015,
"step": 77000
},
{
"epoch": 2.0202810145720913,
"grad_norm": 3.9100475311279297,
"learning_rate": 2.979718985427909e-05,
"loss": 1.8148,
"step": 77500
},
{
"epoch": 2.0333150856338467,
"grad_norm": 4.988982200622559,
"learning_rate": 2.9666849143661534e-05,
"loss": 1.7508,
"step": 78000
},
{
"epoch": 2.0463491566956025,
"grad_norm": 5.134647846221924,
"learning_rate": 2.953650843304398e-05,
"loss": 1.7613,
"step": 78500
},
{
"epoch": 2.059383227757358,
"grad_norm": 6.9095845222473145,
"learning_rate": 2.9406167722426425e-05,
"loss": 1.8106,
"step": 79000
},
{
"epoch": 2.072417298819113,
"grad_norm": 14.57297420501709,
"learning_rate": 2.927582701180887e-05,
"loss": 1.7387,
"step": 79500
},
{
"epoch": 2.0854513698808685,
"grad_norm": 46.801937103271484,
"learning_rate": 2.9145486301191315e-05,
"loss": 1.7732,
"step": 80000
},
{
"epoch": 2.098485440942624,
"grad_norm": 10.51559829711914,
"learning_rate": 2.9015145590573762e-05,
"loss": 1.779,
"step": 80500
},
{
"epoch": 2.1115195120043793,
"grad_norm": 3.4089362621307373,
"learning_rate": 2.8884804879956206e-05,
"loss": 1.7613,
"step": 81000
},
{
"epoch": 2.124553583066135,
"grad_norm": 6.211880207061768,
"learning_rate": 2.8754464169338653e-05,
"loss": 1.7656,
"step": 81500
},
{
"epoch": 2.1375876541278904,
"grad_norm": 4.486207962036133,
"learning_rate": 2.8624123458721096e-05,
"loss": 1.7653,
"step": 82000
},
{
"epoch": 2.150621725189646,
"grad_norm": 4.438023090362549,
"learning_rate": 2.8493782748103543e-05,
"loss": 1.758,
"step": 82500
},
{
"epoch": 2.163655796251401,
"grad_norm": 5.200678825378418,
"learning_rate": 2.8363442037485987e-05,
"loss": 1.7487,
"step": 83000
},
{
"epoch": 2.1766898673131565,
"grad_norm": 11.503108024597168,
"learning_rate": 2.8233101326868434e-05,
"loss": 1.7539,
"step": 83500
},
{
"epoch": 2.189723938374912,
"grad_norm": 3.5593841075897217,
"learning_rate": 2.8102760616250884e-05,
"loss": 1.7604,
"step": 84000
},
{
"epoch": 2.2027580094366677,
"grad_norm": 4.380959510803223,
"learning_rate": 2.7972419905633328e-05,
"loss": 1.7688,
"step": 84500
},
{
"epoch": 2.215792080498423,
"grad_norm": 8.921208381652832,
"learning_rate": 2.7842079195015775e-05,
"loss": 1.7414,
"step": 85000
},
{
"epoch": 2.2288261515601784,
"grad_norm": 4.622405529022217,
"learning_rate": 2.771173848439822e-05,
"loss": 1.7623,
"step": 85500
},
{
"epoch": 2.2418602226219337,
"grad_norm": 27.651330947875977,
"learning_rate": 2.7581397773780666e-05,
"loss": 1.7172,
"step": 86000
},
{
"epoch": 2.254894293683689,
"grad_norm": 4.457437992095947,
"learning_rate": 2.745105706316311e-05,
"loss": 1.7444,
"step": 86500
},
{
"epoch": 2.2679283647454445,
"grad_norm": 5.793179988861084,
"learning_rate": 2.7320716352545556e-05,
"loss": 1.7386,
"step": 87000
},
{
"epoch": 2.2809624358072,
"grad_norm": 3.3070342540740967,
"learning_rate": 2.7190375641928e-05,
"loss": 1.7066,
"step": 87500
},
{
"epoch": 2.2939965068689556,
"grad_norm": 4.475468158721924,
"learning_rate": 2.7060034931310447e-05,
"loss": 1.7212,
"step": 88000
},
{
"epoch": 2.307030577930711,
"grad_norm": 4.4862847328186035,
"learning_rate": 2.692969422069289e-05,
"loss": 1.7265,
"step": 88500
},
{
"epoch": 2.3200646489924663,
"grad_norm": 3.608401298522949,
"learning_rate": 2.6799353510075337e-05,
"loss": 1.7324,
"step": 89000
},
{
"epoch": 2.3330987200542217,
"grad_norm": 4.134375095367432,
"learning_rate": 2.666901279945778e-05,
"loss": 1.6866,
"step": 89500
},
{
"epoch": 2.346132791115977,
"grad_norm": 4.030068874359131,
"learning_rate": 2.6538672088840228e-05,
"loss": 1.6955,
"step": 90000
},
{
"epoch": 2.3591668621777324,
"grad_norm": 7.18529748916626,
"learning_rate": 2.640833137822267e-05,
"loss": 1.7119,
"step": 90500
},
{
"epoch": 2.3722009332394878,
"grad_norm": 3.633330821990967,
"learning_rate": 2.6277990667605122e-05,
"loss": 1.737,
"step": 91000
},
{
"epoch": 2.3852350043012436,
"grad_norm": 5.056845188140869,
"learning_rate": 2.614764995698757e-05,
"loss": 1.7121,
"step": 91500
},
{
"epoch": 2.398269075362999,
"grad_norm": 3.203246831893921,
"learning_rate": 2.6017309246370013e-05,
"loss": 1.7096,
"step": 92000
},
{
"epoch": 2.4113031464247543,
"grad_norm": 3.830634355545044,
"learning_rate": 2.588696853575246e-05,
"loss": 1.7047,
"step": 92500
},
{
"epoch": 2.4243372174865097,
"grad_norm": 3.5095880031585693,
"learning_rate": 2.5756627825134903e-05,
"loss": 1.6875,
"step": 93000
},
{
"epoch": 2.437371288548265,
"grad_norm": 13.952683448791504,
"learning_rate": 2.562628711451735e-05,
"loss": 1.727,
"step": 93500
},
{
"epoch": 2.450405359610021,
"grad_norm": 4.152392387390137,
"learning_rate": 2.5495946403899794e-05,
"loss": 1.674,
"step": 94000
},
{
"epoch": 2.463439430671776,
"grad_norm": 28.32253074645996,
"learning_rate": 2.536560569328224e-05,
"loss": 1.6635,
"step": 94500
},
{
"epoch": 2.4764735017335315,
"grad_norm": 37.356117248535156,
"learning_rate": 2.5235264982664684e-05,
"loss": 1.6936,
"step": 95000
},
{
"epoch": 2.489507572795287,
"grad_norm": 11.425202369689941,
"learning_rate": 2.510492427204713e-05,
"loss": 1.6635,
"step": 95500
},
{
"epoch": 2.5025416438570423,
"grad_norm": 3.700289726257324,
"learning_rate": 2.497458356142958e-05,
"loss": 1.7051,
"step": 96000
},
{
"epoch": 2.5155757149187976,
"grad_norm": 16.234506607055664,
"learning_rate": 2.4844242850812025e-05,
"loss": 1.676,
"step": 96500
},
{
"epoch": 2.528609785980553,
"grad_norm": 3.4809882640838623,
"learning_rate": 2.471390214019447e-05,
"loss": 1.6795,
"step": 97000
},
{
"epoch": 2.5416438570423088,
"grad_norm": 4.420949459075928,
"learning_rate": 2.4583561429576916e-05,
"loss": 1.6926,
"step": 97500
},
{
"epoch": 2.554677928104064,
"grad_norm": 24.02429962158203,
"learning_rate": 2.445322071895936e-05,
"loss": 1.6479,
"step": 98000
},
{
"epoch": 2.5677119991658195,
"grad_norm": 4.912638187408447,
"learning_rate": 2.4322880008341807e-05,
"loss": 1.6598,
"step": 98500
},
{
"epoch": 2.580746070227575,
"grad_norm": 22.43536376953125,
"learning_rate": 2.419253929772425e-05,
"loss": 1.6532,
"step": 99000
},
{
"epoch": 2.59378014128933,
"grad_norm": 4.317445755004883,
"learning_rate": 2.40621985871067e-05,
"loss": 1.6554,
"step": 99500
},
{
"epoch": 2.606814212351086,
"grad_norm": 14.290596008300781,
"learning_rate": 2.3931857876489144e-05,
"loss": 1.6265,
"step": 100000
},
{
"epoch": 2.619848283412841,
"grad_norm": 4.331130504608154,
"learning_rate": 2.380151716587159e-05,
"loss": 1.6706,
"step": 100500
},
{
"epoch": 2.6328823544745967,
"grad_norm": 7.016634941101074,
"learning_rate": 2.3671176455254035e-05,
"loss": 1.649,
"step": 101000
},
{
"epoch": 2.645916425536352,
"grad_norm": 5.680657386779785,
"learning_rate": 2.3540835744636482e-05,
"loss": 1.6126,
"step": 101500
},
{
"epoch": 2.6589504965981074,
"grad_norm": 4.337413311004639,
"learning_rate": 2.3410495034018925e-05,
"loss": 1.6317,
"step": 102000
},
{
"epoch": 2.671984567659863,
"grad_norm": 20.466943740844727,
"learning_rate": 2.3280154323401372e-05,
"loss": 1.6348,
"step": 102500
},
{
"epoch": 2.685018638721618,
"grad_norm": 4.808228969573975,
"learning_rate": 2.314981361278382e-05,
"loss": 1.5979,
"step": 103000
},
{
"epoch": 2.698052709783374,
"grad_norm": 4.296200752258301,
"learning_rate": 2.3019472902166263e-05,
"loss": 1.6281,
"step": 103500
},
{
"epoch": 2.7110867808451293,
"grad_norm": 32.726078033447266,
"learning_rate": 2.288913219154871e-05,
"loss": 1.5966,
"step": 104000
},
{
"epoch": 2.7241208519068847,
"grad_norm": 4.275684833526611,
"learning_rate": 2.2758791480931154e-05,
"loss": 1.6108,
"step": 104500
},
{
"epoch": 2.73715492296864,
"grad_norm": 3.496002197265625,
"learning_rate": 2.26284507703136e-05,
"loss": 1.6026,
"step": 105000
},
{
"epoch": 2.7501889940303954,
"grad_norm": 9.172469139099121,
"learning_rate": 2.2498110059696044e-05,
"loss": 1.631,
"step": 105500
},
{
"epoch": 2.7632230650921508,
"grad_norm": 16.79161834716797,
"learning_rate": 2.2367769349078495e-05,
"loss": 1.6357,
"step": 106000
},
{
"epoch": 2.776257136153906,
"grad_norm": 14.198761940002441,
"learning_rate": 2.2237428638460938e-05,
"loss": 1.6423,
"step": 106500
},
{
"epoch": 2.789291207215662,
"grad_norm": 5.301556587219238,
"learning_rate": 2.2107087927843385e-05,
"loss": 1.6125,
"step": 107000
},
{
"epoch": 2.8023252782774173,
"grad_norm": 26.385272979736328,
"learning_rate": 2.197674721722583e-05,
"loss": 1.6334,
"step": 107500
},
{
"epoch": 2.8153593493391726,
"grad_norm": 9.757530212402344,
"learning_rate": 2.1846406506608276e-05,
"loss": 1.586,
"step": 108000
},
{
"epoch": 2.828393420400928,
"grad_norm": 20.982559204101562,
"learning_rate": 2.171606579599072e-05,
"loss": 1.6066,
"step": 108500
},
{
"epoch": 2.8414274914626834,
"grad_norm": 3.695369243621826,
"learning_rate": 2.1585725085373166e-05,
"loss": 1.6307,
"step": 109000
},
{
"epoch": 2.8544615625244387,
"grad_norm": 14.864655494689941,
"learning_rate": 2.1455384374755613e-05,
"loss": 1.5847,
"step": 109500
},
{
"epoch": 2.867495633586194,
"grad_norm": 3.9043121337890625,
"learning_rate": 2.1325043664138057e-05,
"loss": 1.5904,
"step": 110000
},
{
"epoch": 2.88052970464795,
"grad_norm": 4.432578086853027,
"learning_rate": 2.1194702953520504e-05,
"loss": 1.6037,
"step": 110500
},
{
"epoch": 2.8935637757097052,
"grad_norm": 6.775419235229492,
"learning_rate": 2.1064362242902948e-05,
"loss": 1.6052,
"step": 111000
},
{
"epoch": 2.9065978467714606,
"grad_norm": 5.090266227722168,
"learning_rate": 2.0934021532285395e-05,
"loss": 1.5814,
"step": 111500
},
{
"epoch": 2.919631917833216,
"grad_norm": 7.805962085723877,
"learning_rate": 2.0803680821667838e-05,
"loss": 1.6016,
"step": 112000
},
{
"epoch": 2.9326659888949713,
"grad_norm": 6.22263240814209,
"learning_rate": 2.067334011105029e-05,
"loss": 1.564,
"step": 112500
},
{
"epoch": 2.945700059956727,
"grad_norm": 23.055776596069336,
"learning_rate": 2.0542999400432732e-05,
"loss": 1.555,
"step": 113000
},
{
"epoch": 2.958734131018482,
"grad_norm": 20.39297866821289,
"learning_rate": 2.041265868981518e-05,
"loss": 1.5306,
"step": 113500
},
{
"epoch": 2.971768202080238,
"grad_norm": 5.571432113647461,
"learning_rate": 2.0282317979197623e-05,
"loss": 1.577,
"step": 114000
},
{
"epoch": 2.984802273141993,
"grad_norm": 15.77784252166748,
"learning_rate": 2.015197726858007e-05,
"loss": 1.6165,
"step": 114500
},
{
"epoch": 2.9978363442037486,
"grad_norm": 4.388451099395752,
"learning_rate": 2.0021636557962513e-05,
"loss": 1.544,
"step": 115000
},
{
"epoch": 3.010870415265504,
"grad_norm": 2.794743776321411,
"learning_rate": 1.989129584734496e-05,
"loss": 1.561,
"step": 115500
},
{
"epoch": 3.0239044863272593,
"grad_norm": 38.998512268066406,
"learning_rate": 1.9760955136727407e-05,
"loss": 1.5344,
"step": 116000
},
{
"epoch": 3.036938557389015,
"grad_norm": 10.872420310974121,
"learning_rate": 1.9630614426109854e-05,
"loss": 1.5191,
"step": 116500
},
{
"epoch": 3.0499726284507704,
"grad_norm": 4.433558464050293,
"learning_rate": 1.9500273715492298e-05,
"loss": 1.5093,
"step": 117000
},
{
"epoch": 3.063006699512526,
"grad_norm": 3.8315622806549072,
"learning_rate": 1.9369933004874745e-05,
"loss": 1.5344,
"step": 117500
},
{
"epoch": 3.076040770574281,
"grad_norm": 24.29652976989746,
"learning_rate": 1.923959229425719e-05,
"loss": 1.5557,
"step": 118000
},
{
"epoch": 3.0890748416360365,
"grad_norm": 4.876192092895508,
"learning_rate": 1.9109251583639636e-05,
"loss": 1.5381,
"step": 118500
},
{
"epoch": 3.102108912697792,
"grad_norm": 4.730300426483154,
"learning_rate": 1.897891087302208e-05,
"loss": 1.4977,
"step": 119000
},
{
"epoch": 3.1151429837595472,
"grad_norm": 15.773541450500488,
"learning_rate": 1.8848570162404526e-05,
"loss": 1.5262,
"step": 119500
},
{
"epoch": 3.128177054821303,
"grad_norm": 3.4133520126342773,
"learning_rate": 1.8718229451786973e-05,
"loss": 1.5142,
"step": 120000
},
{
"epoch": 3.1412111258830584,
"grad_norm": 4.271722316741943,
"learning_rate": 1.8587888741169417e-05,
"loss": 1.5108,
"step": 120500
},
{
"epoch": 3.1542451969448138,
"grad_norm": 4.478157997131348,
"learning_rate": 1.8457548030551864e-05,
"loss": 1.5111,
"step": 121000
},
{
"epoch": 3.167279268006569,
"grad_norm": 6.74271821975708,
"learning_rate": 1.8327207319934307e-05,
"loss": 1.5359,
"step": 121500
},
{
"epoch": 3.1803133390683245,
"grad_norm": 10.100676536560059,
"learning_rate": 1.8196866609316754e-05,
"loss": 1.4856,
"step": 122000
},
{
"epoch": 3.19334741013008,
"grad_norm": 5.077882289886475,
"learning_rate": 1.8066525898699198e-05,
"loss": 1.5054,
"step": 122500
},
{
"epoch": 3.2063814811918356,
"grad_norm": 4.155623912811279,
"learning_rate": 1.793618518808165e-05,
"loss": 1.5089,
"step": 123000
},
{
"epoch": 3.219415552253591,
"grad_norm": 3.6238481998443604,
"learning_rate": 1.7805844477464092e-05,
"loss": 1.4933,
"step": 123500
},
{
"epoch": 3.2324496233153464,
"grad_norm": 4.119343280792236,
"learning_rate": 1.767550376684654e-05,
"loss": 1.5215,
"step": 124000
},
{
"epoch": 3.2454836943771017,
"grad_norm": 3.789219379425049,
"learning_rate": 1.7545163056228983e-05,
"loss": 1.4686,
"step": 124500
},
{
"epoch": 3.258517765438857,
"grad_norm": 23.477462768554688,
"learning_rate": 1.741482234561143e-05,
"loss": 1.4928,
"step": 125000
},
{
"epoch": 3.2715518365006124,
"grad_norm": 34.81294250488281,
"learning_rate": 1.7284481634993873e-05,
"loss": 1.5147,
"step": 125500
},
{
"epoch": 3.2845859075623682,
"grad_norm": 3.911698579788208,
"learning_rate": 1.715414092437632e-05,
"loss": 1.498,
"step": 126000
},
{
"epoch": 3.2976199786241236,
"grad_norm": 17.540603637695312,
"learning_rate": 1.7023800213758767e-05,
"loss": 1.5224,
"step": 126500
},
{
"epoch": 3.310654049685879,
"grad_norm": 5.028404712677002,
"learning_rate": 1.689345950314121e-05,
"loss": 1.4782,
"step": 127000
},
{
"epoch": 3.3236881207476343,
"grad_norm": 11.53537654876709,
"learning_rate": 1.6763118792523658e-05,
"loss": 1.4837,
"step": 127500
},
{
"epoch": 3.3367221918093897,
"grad_norm": 3.8512253761291504,
"learning_rate": 1.66327780819061e-05,
"loss": 1.4528,
"step": 128000
},
{
"epoch": 3.349756262871145,
"grad_norm": 3.932035207748413,
"learning_rate": 1.650243737128855e-05,
"loss": 1.5026,
"step": 128500
},
{
"epoch": 3.3627903339329004,
"grad_norm": 4.325034141540527,
"learning_rate": 1.6372096660670992e-05,
"loss": 1.4717,
"step": 129000
},
{
"epoch": 3.375824404994656,
"grad_norm": 7.62436580657959,
"learning_rate": 1.6241755950053442e-05,
"loss": 1.4677,
"step": 129500
},
{
"epoch": 3.3888584760564116,
"grad_norm": 4.481779098510742,
"learning_rate": 1.6111415239435886e-05,
"loss": 1.487,
"step": 130000
},
{
"epoch": 3.401892547118167,
"grad_norm": 4.1522536277771,
"learning_rate": 1.5981074528818333e-05,
"loss": 1.4724,
"step": 130500
},
{
"epoch": 3.4149266181799223,
"grad_norm": 22.38875961303711,
"learning_rate": 1.5850733818200777e-05,
"loss": 1.4694,
"step": 131000
},
{
"epoch": 3.4279606892416776,
"grad_norm": 5.144596099853516,
"learning_rate": 1.5720393107583224e-05,
"loss": 1.4792,
"step": 131500
},
{
"epoch": 3.440994760303433,
"grad_norm": 4.0159912109375,
"learning_rate": 1.5590052396965667e-05,
"loss": 1.4535,
"step": 132000
},
{
"epoch": 3.454028831365189,
"grad_norm": 4.164160251617432,
"learning_rate": 1.5459711686348114e-05,
"loss": 1.4516,
"step": 132500
},
{
"epoch": 3.467062902426944,
"grad_norm": 4.1465349197387695,
"learning_rate": 1.532937097573056e-05,
"loss": 1.4383,
"step": 133000
},
{
"epoch": 3.4800969734886995,
"grad_norm": 5.3553466796875,
"learning_rate": 1.5199030265113007e-05,
"loss": 1.4588,
"step": 133500
},
{
"epoch": 3.493131044550455,
"grad_norm": 4.2381110191345215,
"learning_rate": 1.5068689554495452e-05,
"loss": 1.4607,
"step": 134000
},
{
"epoch": 3.5061651156122102,
"grad_norm": 4.227059364318848,
"learning_rate": 1.4938348843877897e-05,
"loss": 1.4855,
"step": 134500
},
{
"epoch": 3.5191991866739656,
"grad_norm": 4.23318338394165,
"learning_rate": 1.4808008133260342e-05,
"loss": 1.4452,
"step": 135000
},
{
"epoch": 3.5322332577357214,
"grad_norm": 4.2789788246154785,
"learning_rate": 1.4677667422642788e-05,
"loss": 1.4471,
"step": 135500
},
{
"epoch": 3.5452673287974767,
"grad_norm": 14.372062683105469,
"learning_rate": 1.4547326712025236e-05,
"loss": 1.4663,
"step": 136000
},
{
"epoch": 3.558301399859232,
"grad_norm": 4.719635963439941,
"learning_rate": 1.4416986001407682e-05,
"loss": 1.4628,
"step": 136500
},
{
"epoch": 3.5713354709209875,
"grad_norm": 4.603359222412109,
"learning_rate": 1.4286645290790127e-05,
"loss": 1.4464,
"step": 137000
},
{
"epoch": 3.584369541982743,
"grad_norm": 4.167656421661377,
"learning_rate": 1.4156304580172572e-05,
"loss": 1.4816,
"step": 137500
},
{
"epoch": 3.597403613044498,
"grad_norm": 3.9802513122558594,
"learning_rate": 1.4025963869555018e-05,
"loss": 1.4404,
"step": 138000
},
{
"epoch": 3.6104376841062535,
"grad_norm": 4.956002235412598,
"learning_rate": 1.3895623158937463e-05,
"loss": 1.4463,
"step": 138500
},
{
"epoch": 3.6234717551680093,
"grad_norm": 4.82868766784668,
"learning_rate": 1.3765282448319908e-05,
"loss": 1.429,
"step": 139000
},
{
"epoch": 3.6365058262297647,
"grad_norm": 9.303766250610352,
"learning_rate": 1.3634941737702355e-05,
"loss": 1.4492,
"step": 139500
},
{
"epoch": 3.64953989729152,
"grad_norm": 4.728789806365967,
"learning_rate": 1.35046010270848e-05,
"loss": 1.4599,
"step": 140000
},
{
"epoch": 3.6625739683532754,
"grad_norm": 4.169735431671143,
"learning_rate": 1.3374260316467246e-05,
"loss": 1.4346,
"step": 140500
},
{
"epoch": 3.675608039415031,
"grad_norm": 4.134032249450684,
"learning_rate": 1.3243919605849691e-05,
"loss": 1.426,
"step": 141000
},
{
"epoch": 3.6886421104767866,
"grad_norm": 7.31259822845459,
"learning_rate": 1.3113578895232136e-05,
"loss": 1.4489,
"step": 141500
},
{
"epoch": 3.7016761815385415,
"grad_norm": 41.01179885864258,
"learning_rate": 1.2983238184614582e-05,
"loss": 1.4594,
"step": 142000
},
{
"epoch": 3.7147102526002973,
"grad_norm": 4.123907566070557,
"learning_rate": 1.2852897473997027e-05,
"loss": 1.4445,
"step": 142500
},
{
"epoch": 3.7277443236620527,
"grad_norm": 12.47805404663086,
"learning_rate": 1.2722556763379476e-05,
"loss": 1.416,
"step": 143000
},
{
"epoch": 3.740778394723808,
"grad_norm": 4.795707702636719,
"learning_rate": 1.2592216052761921e-05,
"loss": 1.449,
"step": 143500
},
{
"epoch": 3.7538124657855634,
"grad_norm": 3.754809856414795,
"learning_rate": 1.2461875342144366e-05,
"loss": 1.4353,
"step": 144000
},
{
"epoch": 3.7668465368473187,
"grad_norm": 4.847051620483398,
"learning_rate": 1.2331534631526812e-05,
"loss": 1.4081,
"step": 144500
},
{
"epoch": 3.7798806079090745,
"grad_norm": 5.240978240966797,
"learning_rate": 1.2201193920909257e-05,
"loss": 1.4497,
"step": 145000
},
{
"epoch": 3.79291467897083,
"grad_norm": 4.278606414794922,
"learning_rate": 1.2070853210291704e-05,
"loss": 1.4296,
"step": 145500
},
{
"epoch": 3.8059487500325853,
"grad_norm": 24.963735580444336,
"learning_rate": 1.194051249967415e-05,
"loss": 1.4273,
"step": 146000
},
{
"epoch": 3.8189828210943406,
"grad_norm": 3.3722941875457764,
"learning_rate": 1.1810171789056595e-05,
"loss": 1.3939,
"step": 146500
},
{
"epoch": 3.832016892156096,
"grad_norm": 3.9926798343658447,
"learning_rate": 1.1679831078439042e-05,
"loss": 1.4149,
"step": 147000
},
{
"epoch": 3.8450509632178513,
"grad_norm": 7.269467353820801,
"learning_rate": 1.1549490367821487e-05,
"loss": 1.4004,
"step": 147500
},
{
"epoch": 3.8580850342796067,
"grad_norm": 5.596455097198486,
"learning_rate": 1.1419149657203932e-05,
"loss": 1.4133,
"step": 148000
},
{
"epoch": 3.8711191053413625,
"grad_norm": 5.81203556060791,
"learning_rate": 1.1288808946586377e-05,
"loss": 1.4313,
"step": 148500
},
{
"epoch": 3.884153176403118,
"grad_norm": 4.842901229858398,
"learning_rate": 1.1158468235968823e-05,
"loss": 1.4139,
"step": 149000
},
{
"epoch": 3.897187247464873,
"grad_norm": 3.6464438438415527,
"learning_rate": 1.1028127525351268e-05,
"loss": 1.4189,
"step": 149500
},
{
"epoch": 3.9102213185266286,
"grad_norm": 5.625620365142822,
"learning_rate": 1.0897786814733713e-05,
"loss": 1.4119,
"step": 150000
},
{
"epoch": 3.923255389588384,
"grad_norm": 3.84614896774292,
"learning_rate": 1.076744610411616e-05,
"loss": 1.4094,
"step": 150500
},
{
"epoch": 3.9362894606501397,
"grad_norm": 5.183802127838135,
"learning_rate": 1.0637105393498606e-05,
"loss": 1.4157,
"step": 151000
},
{
"epoch": 3.9493235317118947,
"grad_norm": 4.6199140548706055,
"learning_rate": 1.0506764682881051e-05,
"loss": 1.4067,
"step": 151500
},
{
"epoch": 3.9623576027736505,
"grad_norm": 5.642277717590332,
"learning_rate": 1.0376423972263498e-05,
"loss": 1.3994,
"step": 152000
},
{
"epoch": 3.975391673835406,
"grad_norm": 4.15669584274292,
"learning_rate": 1.0246083261645943e-05,
"loss": 1.4304,
"step": 152500
},
{
"epoch": 3.988425744897161,
"grad_norm": 4.729000568389893,
"learning_rate": 1.0115742551028389e-05,
"loss": 1.3979,
"step": 153000
},
{
"epoch": 4.001459815958917,
"grad_norm": 3.2223262786865234,
"learning_rate": 9.985401840410834e-06,
"loss": 1.3897,
"step": 153500
},
{
"epoch": 4.014493887020672,
"grad_norm": 4.223217964172363,
"learning_rate": 9.855061129793281e-06,
"loss": 1.3567,
"step": 154000
},
{
"epoch": 4.027527958082428,
"grad_norm": 3.201354742050171,
"learning_rate": 9.724720419175726e-06,
"loss": 1.3796,
"step": 154500
},
{
"epoch": 4.040562029144183,
"grad_norm": 31.99419593811035,
"learning_rate": 9.594379708558171e-06,
"loss": 1.3475,
"step": 155000
},
{
"epoch": 4.053596100205938,
"grad_norm": 19.76371192932129,
"learning_rate": 9.464038997940618e-06,
"loss": 1.3278,
"step": 155500
},
{
"epoch": 4.066630171267693,
"grad_norm": 3.462979316711426,
"learning_rate": 9.333698287323064e-06,
"loss": 1.3632,
"step": 156000
},
{
"epoch": 4.079664242329449,
"grad_norm": 27.641897201538086,
"learning_rate": 9.203357576705509e-06,
"loss": 1.3203,
"step": 156500
},
{
"epoch": 4.092698313391205,
"grad_norm": 3.934295654296875,
"learning_rate": 9.073016866087954e-06,
"loss": 1.3793,
"step": 157000
},
{
"epoch": 4.10573238445296,
"grad_norm": 3.3237240314483643,
"learning_rate": 8.9426761554704e-06,
"loss": 1.3375,
"step": 157500
},
{
"epoch": 4.118766455514716,
"grad_norm": 5.202388286590576,
"learning_rate": 8.812335444852845e-06,
"loss": 1.3852,
"step": 158000
},
{
"epoch": 4.131800526576471,
"grad_norm": 28.595399856567383,
"learning_rate": 8.68199473423529e-06,
"loss": 1.3644,
"step": 158500
},
{
"epoch": 4.144834597638226,
"grad_norm": 3.2022364139556885,
"learning_rate": 8.551654023617737e-06,
"loss": 1.3734,
"step": 159000
},
{
"epoch": 4.157868668699982,
"grad_norm": 4.231220245361328,
"learning_rate": 8.421313313000183e-06,
"loss": 1.349,
"step": 159500
},
{
"epoch": 4.170902739761737,
"grad_norm": 4.515881538391113,
"learning_rate": 8.290972602382628e-06,
"loss": 1.3392,
"step": 160000
},
{
"epoch": 4.183936810823493,
"grad_norm": 3.6497957706451416,
"learning_rate": 8.160631891765075e-06,
"loss": 1.3495,
"step": 160500
},
{
"epoch": 4.196970881885248,
"grad_norm": 16.680282592773438,
"learning_rate": 8.03029118114752e-06,
"loss": 1.3566,
"step": 161000
},
{
"epoch": 4.210004952947004,
"grad_norm": 18.566879272460938,
"learning_rate": 7.899950470529966e-06,
"loss": 1.3248,
"step": 161500
},
{
"epoch": 4.2230390240087585,
"grad_norm": 3.9700820446014404,
"learning_rate": 7.769609759912413e-06,
"loss": 1.3767,
"step": 162000
},
{
"epoch": 4.236073095070514,
"grad_norm": 42.5576286315918,
"learning_rate": 7.639269049294858e-06,
"loss": 1.3346,
"step": 162500
},
{
"epoch": 4.24910716613227,
"grad_norm": 7.013011455535889,
"learning_rate": 7.508928338677302e-06,
"loss": 1.3752,
"step": 163000
},
{
"epoch": 4.262141237194025,
"grad_norm": 12.351140975952148,
"learning_rate": 7.3785876280597476e-06,
"loss": 1.3213,
"step": 163500
},
{
"epoch": 4.275175308255781,
"grad_norm": 48.051631927490234,
"learning_rate": 7.2482469174421946e-06,
"loss": 1.3453,
"step": 164000
},
{
"epoch": 4.288209379317536,
"grad_norm": 3.8004846572875977,
"learning_rate": 7.11790620682464e-06,
"loss": 1.3231,
"step": 164500
},
{
"epoch": 4.301243450379292,
"grad_norm": 3.8865389823913574,
"learning_rate": 6.987565496207085e-06,
"loss": 1.3353,
"step": 165000
},
{
"epoch": 4.3142775214410465,
"grad_norm": 4.471733093261719,
"learning_rate": 6.857224785589532e-06,
"loss": 1.3411,
"step": 165500
},
{
"epoch": 4.327311592502802,
"grad_norm": 4.856067657470703,
"learning_rate": 6.7268840749719775e-06,
"loss": 1.3254,
"step": 166000
},
{
"epoch": 4.340345663564558,
"grad_norm": 4.089067459106445,
"learning_rate": 6.596543364354423e-06,
"loss": 1.3676,
"step": 166500
},
{
"epoch": 4.353379734626313,
"grad_norm": 4.231725215911865,
"learning_rate": 6.466202653736869e-06,
"loss": 1.3331,
"step": 167000
},
{
"epoch": 4.366413805688069,
"grad_norm": 4.140297889709473,
"learning_rate": 6.335861943119314e-06,
"loss": 1.3338,
"step": 167500
},
{
"epoch": 4.379447876749824,
"grad_norm": 3.1667165756225586,
"learning_rate": 6.2055212325017595e-06,
"loss": 1.3658,
"step": 168000
},
{
"epoch": 4.3924819478115795,
"grad_norm": 4.982083797454834,
"learning_rate": 6.075180521884206e-06,
"loss": 1.3098,
"step": 168500
},
{
"epoch": 4.405516018873335,
"grad_norm": 19.951147079467773,
"learning_rate": 5.944839811266651e-06,
"loss": 1.315,
"step": 169000
},
{
"epoch": 4.41855008993509,
"grad_norm": 5.146533489227295,
"learning_rate": 5.814499100649097e-06,
"loss": 1.3322,
"step": 169500
},
{
"epoch": 4.431584160996846,
"grad_norm": 4.29327917098999,
"learning_rate": 5.684158390031543e-06,
"loss": 1.3165,
"step": 170000
},
{
"epoch": 4.444618232058601,
"grad_norm": 4.86635160446167,
"learning_rate": 5.5538176794139886e-06,
"loss": 1.3266,
"step": 170500
},
{
"epoch": 4.457652303120357,
"grad_norm": 5.066024303436279,
"learning_rate": 5.423476968796435e-06,
"loss": 1.3201,
"step": 171000
},
{
"epoch": 4.470686374182112,
"grad_norm": 5.111464500427246,
"learning_rate": 5.293136258178879e-06,
"loss": 1.3188,
"step": 171500
},
{
"epoch": 4.4837204452438675,
"grad_norm": 4.428502082824707,
"learning_rate": 5.162795547561325e-06,
"loss": 1.3162,
"step": 172000
},
{
"epoch": 4.496754516305623,
"grad_norm": 2.84608793258667,
"learning_rate": 5.0324548369437715e-06,
"loss": 1.3052,
"step": 172500
},
{
"epoch": 4.509788587367378,
"grad_norm": 4.425991058349609,
"learning_rate": 4.902114126326217e-06,
"loss": 1.3252,
"step": 173000
},
{
"epoch": 4.522822658429134,
"grad_norm": 21.735198974609375,
"learning_rate": 4.771773415708663e-06,
"loss": 1.3333,
"step": 173500
},
{
"epoch": 4.535856729490889,
"grad_norm": 4.519357204437256,
"learning_rate": 4.641432705091108e-06,
"loss": 1.3115,
"step": 174000
},
{
"epoch": 4.548890800552645,
"grad_norm": 25.662084579467773,
"learning_rate": 4.511091994473554e-06,
"loss": 1.3134,
"step": 174500
},
{
"epoch": 4.5619248716144,
"grad_norm": 3.4979422092437744,
"learning_rate": 4.3807512838560005e-06,
"loss": 1.3202,
"step": 175000
},
{
"epoch": 4.574958942676155,
"grad_norm": 4.444785118103027,
"learning_rate": 4.250410573238446e-06,
"loss": 1.3174,
"step": 175500
},
{
"epoch": 4.587993013737911,
"grad_norm": 6.712714672088623,
"learning_rate": 4.120069862620891e-06,
"loss": 1.3343,
"step": 176000
},
{
"epoch": 4.601027084799666,
"grad_norm": 4.870098114013672,
"learning_rate": 3.9897291520033364e-06,
"loss": 1.3312,
"step": 176500
},
{
"epoch": 4.614061155861422,
"grad_norm": 4.5157928466796875,
"learning_rate": 3.859388441385783e-06,
"loss": 1.3133,
"step": 177000
},
{
"epoch": 4.627095226923177,
"grad_norm": 3.297917366027832,
"learning_rate": 3.7290477307682287e-06,
"loss": 1.34,
"step": 177500
},
{
"epoch": 4.640129297984933,
"grad_norm": 5.5820698738098145,
"learning_rate": 3.598707020150674e-06,
"loss": 1.2856,
"step": 178000
},
{
"epoch": 4.653163369046688,
"grad_norm": 68.55699157714844,
"learning_rate": 3.4683663095331198e-06,
"loss": 1.3293,
"step": 178500
},
{
"epoch": 4.666197440108443,
"grad_norm": 4.395013332366943,
"learning_rate": 3.338025598915565e-06,
"loss": 1.3156,
"step": 179000
},
{
"epoch": 4.679231511170199,
"grad_norm": 4.131389141082764,
"learning_rate": 3.2076848882980112e-06,
"loss": 1.3349,
"step": 179500
},
{
"epoch": 4.692265582231954,
"grad_norm": 3.2444746494293213,
"learning_rate": 3.077344177680457e-06,
"loss": 1.2882,
"step": 180000
},
{
"epoch": 4.70529965329371,
"grad_norm": 6.894190788269043,
"learning_rate": 2.9470034670629027e-06,
"loss": 1.3064,
"step": 180500
},
{
"epoch": 4.718333724355465,
"grad_norm": 4.13007926940918,
"learning_rate": 2.816662756445348e-06,
"loss": 1.3319,
"step": 181000
},
{
"epoch": 4.731367795417221,
"grad_norm": 4.010223388671875,
"learning_rate": 2.686322045827794e-06,
"loss": 1.3289,
"step": 181500
},
{
"epoch": 4.7444018664789755,
"grad_norm": 5.212350845336914,
"learning_rate": 2.55598133521024e-06,
"loss": 1.3052,
"step": 182000
},
{
"epoch": 4.757435937540731,
"grad_norm": 4.112293243408203,
"learning_rate": 2.4256406245926856e-06,
"loss": 1.3178,
"step": 182500
},
{
"epoch": 4.770470008602487,
"grad_norm": 4.711720943450928,
"learning_rate": 2.295299913975131e-06,
"loss": 1.3017,
"step": 183000
},
{
"epoch": 4.783504079664242,
"grad_norm": 4.1918439865112305,
"learning_rate": 2.1649592033575766e-06,
"loss": 1.3368,
"step": 183500
},
{
"epoch": 4.796538150725998,
"grad_norm": 4.53779411315918,
"learning_rate": 2.0346184927400227e-06,
"loss": 1.3103,
"step": 184000
},
{
"epoch": 4.809572221787754,
"grad_norm": 2.9776086807250977,
"learning_rate": 1.9042777821224683e-06,
"loss": 1.3325,
"step": 184500
},
{
"epoch": 4.822606292849509,
"grad_norm": 5.410048007965088,
"learning_rate": 1.773937071504914e-06,
"loss": 1.324,
"step": 185000
},
{
"epoch": 4.835640363911264,
"grad_norm": 5.260219573974609,
"learning_rate": 1.6435963608873595e-06,
"loss": 1.3339,
"step": 185500
},
{
"epoch": 4.848674434973019,
"grad_norm": 5.610768795013428,
"learning_rate": 1.5132556502698054e-06,
"loss": 1.2985,
"step": 186000
},
{
"epoch": 4.861708506034775,
"grad_norm": 6.287191390991211,
"learning_rate": 1.382914939652251e-06,
"loss": 1.2973,
"step": 186500
},
{
"epoch": 4.87474257709653,
"grad_norm": 32.12895202636719,
"learning_rate": 1.2525742290346967e-06,
"loss": 1.2914,
"step": 187000
},
{
"epoch": 4.887776648158286,
"grad_norm": 15.296839714050293,
"learning_rate": 1.1222335184171426e-06,
"loss": 1.3231,
"step": 187500
},
{
"epoch": 4.900810719220042,
"grad_norm": 4.650936126708984,
"learning_rate": 9.918928077995881e-07,
"loss": 1.2902,
"step": 188000
},
{
"epoch": 4.9138447902817965,
"grad_norm": 25.2452335357666,
"learning_rate": 8.615520971820338e-07,
"loss": 1.2964,
"step": 188500
},
{
"epoch": 4.926878861343552,
"grad_norm": 4.3756890296936035,
"learning_rate": 7.312113865644796e-07,
"loss": 1.3137,
"step": 189000
},
{
"epoch": 4.939912932405307,
"grad_norm": 32.994510650634766,
"learning_rate": 6.008706759469253e-07,
"loss": 1.3033,
"step": 189500
},
{
"epoch": 4.952947003467063,
"grad_norm": 3.0575180053710938,
"learning_rate": 4.70529965329371e-07,
"loss": 1.2992,
"step": 190000
},
{
"epoch": 4.965981074528818,
"grad_norm": 4.4134135246276855,
"learning_rate": 3.401892547118167e-07,
"loss": 1.2839,
"step": 190500
},
{
"epoch": 4.979015145590574,
"grad_norm": 40.072750091552734,
"learning_rate": 2.0984854409426243e-07,
"loss": 1.3057,
"step": 191000
},
{
"epoch": 4.99204921665233,
"grad_norm": 19.755613327026367,
"learning_rate": 7.950783347670812e-08,
"loss": 1.2946,
"step": 191500
}
],
"logging_steps": 500,
"max_steps": 191805,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.066567392204288e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}