|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500.0, |
|
"global_step": 13267, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.003768749528906309, |
|
"grad_norm": 0.35509032011032104, |
|
"learning_rate": 9.999649547444612e-05, |
|
"loss": 0.5094, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.007537499057812618, |
|
"grad_norm": 0.3374439477920532, |
|
"learning_rate": 9.998598238905239e-05, |
|
"loss": 0.4888, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.011306248586718927, |
|
"grad_norm": 0.3017200827598572, |
|
"learning_rate": 9.996846221755392e-05, |
|
"loss": 0.4871, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.015074998115625236, |
|
"grad_norm": 0.29322266578674316, |
|
"learning_rate": 9.994393741594623e-05, |
|
"loss": 0.4899, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.018843747644531544, |
|
"grad_norm": 0.3081373870372772, |
|
"learning_rate": 9.99124114221411e-05, |
|
"loss": 0.4896, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.022612497173437853, |
|
"grad_norm": 0.29016199707984924, |
|
"learning_rate": 9.987388865548454e-05, |
|
"loss": 0.4889, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.026381246702344163, |
|
"grad_norm": 0.265391081571579, |
|
"learning_rate": 9.982837451613738e-05, |
|
"loss": 0.4898, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.030149996231250472, |
|
"grad_norm": 0.27272671461105347, |
|
"learning_rate": 9.977587538431816e-05, |
|
"loss": 0.4894, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.03391874576015678, |
|
"grad_norm": 0.28726664185523987, |
|
"learning_rate": 9.971639861940889e-05, |
|
"loss": 0.4869, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.03768749528906309, |
|
"grad_norm": 0.28651759028434753, |
|
"learning_rate": 9.964995255892323e-05, |
|
"loss": 0.4912, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.0414562448179694, |
|
"grad_norm": 0.24553848803043365, |
|
"learning_rate": 9.957654651733788e-05, |
|
"loss": 0.4897, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.045224994346875706, |
|
"grad_norm": 0.24010591208934784, |
|
"learning_rate": 9.949619078478677e-05, |
|
"loss": 0.4866, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.048993743875782016, |
|
"grad_norm": 0.26084381341934204, |
|
"learning_rate": 9.940889662561864e-05, |
|
"loss": 0.4892, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.052762493404688325, |
|
"grad_norm": 0.2248304784297943, |
|
"learning_rate": 9.931467627681792e-05, |
|
"loss": 0.4849, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.056531242933594635, |
|
"grad_norm": 0.23623178899288177, |
|
"learning_rate": 9.921354294628944e-05, |
|
"loss": 0.4852, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.060299992462500944, |
|
"grad_norm": 0.23275640606880188, |
|
"learning_rate": 9.910551081100684e-05, |
|
"loss": 0.4855, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.06406874199140725, |
|
"grad_norm": 0.22925056517124176, |
|
"learning_rate": 9.899059501502526e-05, |
|
"loss": 0.4849, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.06783749152031356, |
|
"grad_norm": 0.23725946247577667, |
|
"learning_rate": 9.886881166735846e-05, |
|
"loss": 0.4839, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.07160624104921987, |
|
"grad_norm": 0.2293645143508911, |
|
"learning_rate": 9.874017783972058e-05, |
|
"loss": 0.486, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.07537499057812617, |
|
"grad_norm": 0.24602073431015015, |
|
"learning_rate": 9.860471156413309e-05, |
|
"loss": 0.4835, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.07914374010703248, |
|
"grad_norm": 0.20568886399269104, |
|
"learning_rate": 9.846243183039694e-05, |
|
"loss": 0.4838, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.0829124896359388, |
|
"grad_norm": 0.2165093570947647, |
|
"learning_rate": 9.831335858343064e-05, |
|
"loss": 0.4827, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.0866812391648451, |
|
"grad_norm": 0.2424800992012024, |
|
"learning_rate": 9.815751272047434e-05, |
|
"loss": 0.4832, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.09044998869375141, |
|
"grad_norm": 0.207057386636734, |
|
"learning_rate": 9.79949160881604e-05, |
|
"loss": 0.4809, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.09421873822265772, |
|
"grad_norm": 0.21975122392177582, |
|
"learning_rate": 9.782559147945094e-05, |
|
"loss": 0.4827, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.09798748775156403, |
|
"grad_norm": 0.21343478560447693, |
|
"learning_rate": 9.76495626304427e-05, |
|
"loss": 0.4812, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.10175623728047034, |
|
"grad_norm": 0.20896418392658234, |
|
"learning_rate": 9.746685421703961e-05, |
|
"loss": 0.4792, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.10552498680937665, |
|
"grad_norm": 0.2270091027021408, |
|
"learning_rate": 9.727749185149388e-05, |
|
"loss": 0.4795, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.10929373633828296, |
|
"grad_norm": 0.2058868557214737, |
|
"learning_rate": 9.708150207881543e-05, |
|
"loss": 0.4794, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.11306248586718927, |
|
"grad_norm": 0.19969668984413147, |
|
"learning_rate": 9.687891237305096e-05, |
|
"loss": 0.4803, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.11683123539609558, |
|
"grad_norm": 0.19804421067237854, |
|
"learning_rate": 9.666975113343246e-05, |
|
"loss": 0.4782, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.12059998492500189, |
|
"grad_norm": 0.19650672376155853, |
|
"learning_rate": 9.645404768039633e-05, |
|
"loss": 0.4773, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.1243687344539082, |
|
"grad_norm": 0.20196650922298431, |
|
"learning_rate": 9.623183225147308e-05, |
|
"loss": 0.4769, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.1281374839828145, |
|
"grad_norm": 0.20083576440811157, |
|
"learning_rate": 9.600313599704869e-05, |
|
"loss": 0.4748, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.13190623351172082, |
|
"grad_norm": 0.19036008417606354, |
|
"learning_rate": 9.576799097599786e-05, |
|
"loss": 0.4751, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.1356749830406271, |
|
"grad_norm": 0.20416900515556335, |
|
"learning_rate": 9.552643015118998e-05, |
|
"loss": 0.4727, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.13944373256953344, |
|
"grad_norm": 0.19550226628780365, |
|
"learning_rate": 9.527848738486842e-05, |
|
"loss": 0.4731, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.14321248209843973, |
|
"grad_norm": 0.20287151634693146, |
|
"learning_rate": 9.502419743390357e-05, |
|
"loss": 0.4745, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.14698123162734605, |
|
"grad_norm": 0.18741615116596222, |
|
"learning_rate": 9.476359594492068e-05, |
|
"loss": 0.4734, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.15074998115625235, |
|
"grad_norm": 0.20605124533176422, |
|
"learning_rate": 9.449671944930288e-05, |
|
"loss": 0.4732, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.15451873068515867, |
|
"grad_norm": 0.20259861648082733, |
|
"learning_rate": 9.422360535807009e-05, |
|
"loss": 0.4745, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.15828748021406497, |
|
"grad_norm": 0.19558943808078766, |
|
"learning_rate": 9.394429195663478e-05, |
|
"loss": 0.4723, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.1620562297429713, |
|
"grad_norm": 0.20177054405212402, |
|
"learning_rate": 9.365881839943508e-05, |
|
"loss": 0.4699, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.1658249792718776, |
|
"grad_norm": 0.20023804903030396, |
|
"learning_rate": 9.336722470444604e-05, |
|
"loss": 0.4719, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.1695937288007839, |
|
"grad_norm": 0.19571995735168457, |
|
"learning_rate": 9.306955174756985e-05, |
|
"loss": 0.4708, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.1733624783296902, |
|
"grad_norm": 0.18980449438095093, |
|
"learning_rate": 9.27658412569059e-05, |
|
"loss": 0.4697, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.17713122785859653, |
|
"grad_norm": 0.18121857941150665, |
|
"learning_rate": 9.24561358069012e-05, |
|
"loss": 0.4692, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.18089997738750282, |
|
"grad_norm": 0.18635448813438416, |
|
"learning_rate": 9.214047881238233e-05, |
|
"loss": 0.4682, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.18466872691640915, |
|
"grad_norm": 0.18292276561260223, |
|
"learning_rate": 9.181891452246937e-05, |
|
"loss": 0.4717, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.18843747644531544, |
|
"grad_norm": 0.4070293605327606, |
|
"learning_rate": 9.149148801437321e-05, |
|
"loss": 0.4685, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.19220622597422174, |
|
"grad_norm": 0.19017393887043, |
|
"learning_rate": 9.115824518707644e-05, |
|
"loss": 0.4675, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.19597497550312806, |
|
"grad_norm": 0.2028086632490158, |
|
"learning_rate": 9.08192327548992e-05, |
|
"loss": 0.4668, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.19974372503203436, |
|
"grad_norm": 0.18879903852939606, |
|
"learning_rate": 9.047449824095075e-05, |
|
"loss": 0.466, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.20351247456094068, |
|
"grad_norm": 0.18708941340446472, |
|
"learning_rate": 9.012408997046766e-05, |
|
"loss": 0.467, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.20728122408984698, |
|
"grad_norm": 0.18148259818553925, |
|
"learning_rate": 8.976805706403942e-05, |
|
"loss": 0.4657, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.2110499736187533, |
|
"grad_norm": 0.18493063747882843, |
|
"learning_rate": 8.94064494307228e-05, |
|
"loss": 0.4638, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.2148187231476596, |
|
"grad_norm": 0.18034948408603668, |
|
"learning_rate": 8.903931776104545e-05, |
|
"loss": 0.4624, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.21858747267656592, |
|
"grad_norm": 0.18979419767856598, |
|
"learning_rate": 8.866671351990007e-05, |
|
"loss": 0.4629, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.22235622220547221, |
|
"grad_norm": 0.18408875167369843, |
|
"learning_rate": 8.82886889393301e-05, |
|
"loss": 0.4638, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.22612497173437854, |
|
"grad_norm": 0.17015992105007172, |
|
"learning_rate": 8.790529701120759e-05, |
|
"loss": 0.4608, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.22989372126328483, |
|
"grad_norm": 0.17827536165714264, |
|
"learning_rate": 8.751659147980493e-05, |
|
"loss": 0.4635, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.23366247079219116, |
|
"grad_norm": 0.1894233673810959, |
|
"learning_rate": 8.712262683426082e-05, |
|
"loss": 0.4593, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.23743122032109745, |
|
"grad_norm": 0.19764114916324615, |
|
"learning_rate": 8.672345830094199e-05, |
|
"loss": 0.4622, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.24119996985000378, |
|
"grad_norm": 0.18290351331233978, |
|
"learning_rate": 8.631914183570143e-05, |
|
"loss": 0.4608, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.24496871937891007, |
|
"grad_norm": 0.18013353645801544, |
|
"learning_rate": 8.590973411603452e-05, |
|
"loss": 0.4601, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.2487374689078164, |
|
"grad_norm": 0.17729552090168, |
|
"learning_rate": 8.549529253313386e-05, |
|
"loss": 0.4611, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.2525062184367227, |
|
"grad_norm": 0.1892414540052414, |
|
"learning_rate": 8.507587518384421e-05, |
|
"loss": 0.4583, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.256274967965629, |
|
"grad_norm": 0.17193005979061127, |
|
"learning_rate": 8.465154086251828e-05, |
|
"loss": 0.4572, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.2600437174945353, |
|
"grad_norm": 0.18148685991764069, |
|
"learning_rate": 8.422234905277495e-05, |
|
"loss": 0.4583, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.26381246702344163, |
|
"grad_norm": 0.19143982231616974, |
|
"learning_rate": 8.378835991916083e-05, |
|
"loss": 0.4582, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.26758121655234796, |
|
"grad_norm": 0.18079186975955963, |
|
"learning_rate": 8.334963429871627e-05, |
|
"loss": 0.4599, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.2713499660812542, |
|
"grad_norm": 0.17887386679649353, |
|
"learning_rate": 8.290623369244721e-05, |
|
"loss": 0.4574, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.27511871561016055, |
|
"grad_norm": 0.17481209337711334, |
|
"learning_rate": 8.245822025670384e-05, |
|
"loss": 0.4588, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.27888746513906687, |
|
"grad_norm": 0.17591702938079834, |
|
"learning_rate": 8.200565679446753e-05, |
|
"loss": 0.4543, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.2826562146679732, |
|
"grad_norm": 0.17434370517730713, |
|
"learning_rate": 8.154860674654698e-05, |
|
"loss": 0.4552, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.28642496419687946, |
|
"grad_norm": 0.17741286754608154, |
|
"learning_rate": 8.108713418268514e-05, |
|
"loss": 0.4551, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.2901937137257858, |
|
"grad_norm": 0.1794031709432602, |
|
"learning_rate": 8.062130379257764e-05, |
|
"loss": 0.4557, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.2939624632546921, |
|
"grad_norm": 0.17624689638614655, |
|
"learning_rate": 8.015118087680477e-05, |
|
"loss": 0.4558, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.2977312127835984, |
|
"grad_norm": 0.173648402094841, |
|
"learning_rate": 7.96768313376774e-05, |
|
"loss": 0.4519, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.3014999623125047, |
|
"grad_norm": 0.17087939381599426, |
|
"learning_rate": 7.919832166999874e-05, |
|
"loss": 0.454, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.305268711841411, |
|
"grad_norm": 0.1744805872440338, |
|
"learning_rate": 7.871571895174316e-05, |
|
"loss": 0.4511, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.30903746137031735, |
|
"grad_norm": 0.1831275224685669, |
|
"learning_rate": 7.822909083465298e-05, |
|
"loss": 0.4537, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.3128062108992236, |
|
"grad_norm": 0.17621232569217682, |
|
"learning_rate": 7.773850553475508e-05, |
|
"loss": 0.4506, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.31657496042812994, |
|
"grad_norm": 0.1809280514717102, |
|
"learning_rate": 7.724403182279823e-05, |
|
"loss": 0.4537, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.32034370995703626, |
|
"grad_norm": 0.18568743765354156, |
|
"learning_rate": 7.674573901461282e-05, |
|
"loss": 0.4484, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.3241124594859426, |
|
"grad_norm": 0.17346200346946716, |
|
"learning_rate": 7.624369696139402e-05, |
|
"loss": 0.4492, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.32788120901484885, |
|
"grad_norm": 0.16987943649291992, |
|
"learning_rate": 7.573797603991004e-05, |
|
"loss": 0.4511, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.3316499585437552, |
|
"grad_norm": 0.1740700751543045, |
|
"learning_rate": 7.522864714263655e-05, |
|
"loss": 0.4504, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.3354187080726615, |
|
"grad_norm": 0.18099980056285858, |
|
"learning_rate": 7.471578166781899e-05, |
|
"loss": 0.4509, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.3391874576015678, |
|
"grad_norm": 0.1742471605539322, |
|
"learning_rate": 7.419945150946386e-05, |
|
"loss": 0.4482, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.3429562071304741, |
|
"grad_norm": 0.17314079403877258, |
|
"learning_rate": 7.367972904726055e-05, |
|
"loss": 0.4497, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.3467249566593804, |
|
"grad_norm": 0.1672036498785019, |
|
"learning_rate": 7.3156687136435e-05, |
|
"loss": 0.4476, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.35049370618828674, |
|
"grad_norm": 0.1716027557849884, |
|
"learning_rate": 7.26303990975369e-05, |
|
"loss": 0.4484, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.35426245571719306, |
|
"grad_norm": 0.16599993407726288, |
|
"learning_rate": 7.210093870616155e-05, |
|
"loss": 0.4478, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.3580312052460993, |
|
"grad_norm": 0.16980785131454468, |
|
"learning_rate": 7.156838018260776e-05, |
|
"loss": 0.4468, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.36179995477500565, |
|
"grad_norm": 0.17415867745876312, |
|
"learning_rate": 7.103279818147371e-05, |
|
"loss": 0.4444, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.365568704303912, |
|
"grad_norm": 0.17735563218593597, |
|
"learning_rate": 7.049426778119179e-05, |
|
"loss": 0.4454, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.3693374538328183, |
|
"grad_norm": 0.1772989183664322, |
|
"learning_rate": 6.995286447350397e-05, |
|
"loss": 0.4456, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.37310620336172456, |
|
"grad_norm": 0.17225749790668488, |
|
"learning_rate": 6.940866415287931e-05, |
|
"loss": 0.4453, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.3768749528906309, |
|
"grad_norm": 0.16734232008457184, |
|
"learning_rate": 6.886174310587501e-05, |
|
"loss": 0.4429, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.3806437024195372, |
|
"grad_norm": 0.17711064219474792, |
|
"learning_rate": 6.831217800044252e-05, |
|
"loss": 0.4455, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.3844124519484435, |
|
"grad_norm": 0.16528938710689545, |
|
"learning_rate": 6.776004587518001e-05, |
|
"loss": 0.4452, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.3881812014773498, |
|
"grad_norm": 0.16722093522548676, |
|
"learning_rate": 6.720542412853319e-05, |
|
"loss": 0.4427, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.3919499510062561, |
|
"grad_norm": 0.16517098248004913, |
|
"learning_rate": 6.66483905079454e-05, |
|
"loss": 0.4424, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.39571870053516245, |
|
"grad_norm": 0.16755063831806183, |
|
"learning_rate": 6.608902309895895e-05, |
|
"loss": 0.4405, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.3994874500640687, |
|
"grad_norm": 0.1688978523015976, |
|
"learning_rate": 6.552740031426902e-05, |
|
"loss": 0.437, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.40325619959297504, |
|
"grad_norm": 0.16191639006137848, |
|
"learning_rate": 6.496360088273161e-05, |
|
"loss": 0.4405, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.40702494912188136, |
|
"grad_norm": 0.1776248961687088, |
|
"learning_rate": 6.439770383832732e-05, |
|
"loss": 0.4405, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.4107936986507877, |
|
"grad_norm": 0.16206714510917664, |
|
"learning_rate": 6.382978850908226e-05, |
|
"loss": 0.44, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.41456244817969395, |
|
"grad_norm": 0.16774949431419373, |
|
"learning_rate": 6.325993450594782e-05, |
|
"loss": 0.4405, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.4183311977086003, |
|
"grad_norm": 0.16804030537605286, |
|
"learning_rate": 6.26882217116406e-05, |
|
"loss": 0.4386, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.4220999472375066, |
|
"grad_norm": 0.16452039778232574, |
|
"learning_rate": 6.211473026944452e-05, |
|
"loss": 0.4369, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.4258686967664129, |
|
"grad_norm": 0.15764504671096802, |
|
"learning_rate": 6.153954057197612e-05, |
|
"loss": 0.438, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.4296374462953192, |
|
"grad_norm": 0.16407234966754913, |
|
"learning_rate": 6.0962733249915135e-05, |
|
"loss": 0.4366, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.4334061958242255, |
|
"grad_norm": 0.16679194569587708, |
|
"learning_rate": 6.038438916070155e-05, |
|
"loss": 0.4381, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.43717494535313184, |
|
"grad_norm": 0.16508112847805023, |
|
"learning_rate": 5.9804589377200946e-05, |
|
"loss": 0.4369, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.44094369488203816, |
|
"grad_norm": 0.16879412531852722, |
|
"learning_rate": 5.922341517633965e-05, |
|
"loss": 0.4382, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.44471244441094443, |
|
"grad_norm": 0.16117092967033386, |
|
"learning_rate": 5.864094802771115e-05, |
|
"loss": 0.4348, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.44848119393985075, |
|
"grad_norm": 0.1632978767156601, |
|
"learning_rate": 5.8057269582155735e-05, |
|
"loss": 0.4371, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.4522499434687571, |
|
"grad_norm": 0.16430360078811646, |
|
"learning_rate": 5.7472461660314504e-05, |
|
"loss": 0.435, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.4560186929976634, |
|
"grad_norm": 0.16830819845199585, |
|
"learning_rate": 5.6886606241159714e-05, |
|
"loss": 0.4337, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.45978744252656967, |
|
"grad_norm": 0.16006672382354736, |
|
"learning_rate": 5.6299785450502853e-05, |
|
"loss": 0.4336, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.463556192055476, |
|
"grad_norm": 0.16786810755729675, |
|
"learning_rate": 5.571208154948218e-05, |
|
"loss": 0.4335, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.4673249415843823, |
|
"grad_norm": 0.16502316296100616, |
|
"learning_rate": 5.5123576923031253e-05, |
|
"loss": 0.433, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.47109369111328864, |
|
"grad_norm": 0.16036230325698853, |
|
"learning_rate": 5.453435406833017e-05, |
|
"loss": 0.4296, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.4748624406421949, |
|
"grad_norm": 0.16277125477790833, |
|
"learning_rate": 5.3944495583240987e-05, |
|
"loss": 0.4349, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.47863119017110123, |
|
"grad_norm": 0.1639643758535385, |
|
"learning_rate": 5.3354084154729034e-05, |
|
"loss": 0.4311, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.48239993970000755, |
|
"grad_norm": 0.1611129492521286, |
|
"learning_rate": 5.276320254727187e-05, |
|
"loss": 0.4315, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.4861686892289138, |
|
"grad_norm": 0.1649327427148819, |
|
"learning_rate": 5.217193359125724e-05, |
|
"loss": 0.433, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.48993743875782014, |
|
"grad_norm": 0.16595204174518585, |
|
"learning_rate": 5.15803601713717e-05, |
|
"loss": 0.432, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.49370618828672647, |
|
"grad_norm": 0.16455797851085663, |
|
"learning_rate": 5.0988565214981976e-05, |
|
"loss": 0.4291, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.4974749378156328, |
|
"grad_norm": 0.16371013224124908, |
|
"learning_rate": 5.0396631680509945e-05, |
|
"loss": 0.4299, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.5012436873445391, |
|
"grad_norm": 0.16786278784275055, |
|
"learning_rate": 4.9804642545803524e-05, |
|
"loss": 0.43, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 0.5050124368734454, |
|
"grad_norm": 0.16633006930351257, |
|
"learning_rate": 4.9212680796504704e-05, |
|
"loss": 0.4289, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.5087811864023517, |
|
"grad_norm": 0.1592586487531662, |
|
"learning_rate": 4.8620829414416615e-05, |
|
"loss": 0.4296, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.512549935931258, |
|
"grad_norm": 0.16653411090373993, |
|
"learning_rate": 4.8029171365870926e-05, |
|
"loss": 0.4282, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.5163186854601644, |
|
"grad_norm": 0.16303293406963348, |
|
"learning_rate": 4.743778959009766e-05, |
|
"loss": 0.4267, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 0.5200874349890706, |
|
"grad_norm": 0.1592382937669754, |
|
"learning_rate": 4.684676698759864e-05, |
|
"loss": 0.4268, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.5238561845179769, |
|
"grad_norm": 0.15816909074783325, |
|
"learning_rate": 4.62561864085264e-05, |
|
"loss": 0.4261, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 0.5276249340468833, |
|
"grad_norm": 0.1695041060447693, |
|
"learning_rate": 4.566613064107015e-05, |
|
"loss": 0.427, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.5313936835757895, |
|
"grad_norm": 0.16515901684761047, |
|
"learning_rate": 4.507668239985055e-05, |
|
"loss": 0.4263, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.5351624331046959, |
|
"grad_norm": 0.15738603472709656, |
|
"learning_rate": 4.448792431432451e-05, |
|
"loss": 0.4277, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.5389311826336022, |
|
"grad_norm": 0.17032016813755035, |
|
"learning_rate": 4.389993891720232e-05, |
|
"loss": 0.4262, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 0.5426999321625084, |
|
"grad_norm": 0.1652156412601471, |
|
"learning_rate": 4.3312808632877924e-05, |
|
"loss": 0.4228, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.5464686816914148, |
|
"grad_norm": 0.161549910902977, |
|
"learning_rate": 4.27266157658747e-05, |
|
"loss": 0.4231, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.5502374312203211, |
|
"grad_norm": 0.15818439424037933, |
|
"learning_rate": 4.214144248930797e-05, |
|
"loss": 0.4238, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.5540061807492274, |
|
"grad_norm": 0.15689703822135925, |
|
"learning_rate": 4.155737083336575e-05, |
|
"loss": 0.4242, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.5577749302781337, |
|
"grad_norm": 0.16835862398147583, |
|
"learning_rate": 4.097448267380979e-05, |
|
"loss": 0.4246, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.56154367980704, |
|
"grad_norm": 0.162080317735672, |
|
"learning_rate": 4.03928597204981e-05, |
|
"loss": 0.4204, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 0.5653124293359464, |
|
"grad_norm": 0.16594427824020386, |
|
"learning_rate": 3.9812583505930786e-05, |
|
"loss": 0.4236, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.5690811788648527, |
|
"grad_norm": 0.1567797213792801, |
|
"learning_rate": 3.923373537382074e-05, |
|
"loss": 0.422, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 0.5728499283937589, |
|
"grad_norm": 0.1608632504940033, |
|
"learning_rate": 3.86563964676908e-05, |
|
"loss": 0.4213, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.5766186779226653, |
|
"grad_norm": 0.16220742464065552, |
|
"learning_rate": 3.808064771949893e-05, |
|
"loss": 0.4208, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.5803874274515716, |
|
"grad_norm": 0.16180914640426636, |
|
"learning_rate": 3.75065698382932e-05, |
|
"loss": 0.4213, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.5841561769804778, |
|
"grad_norm": 0.17086252570152283, |
|
"learning_rate": 3.693424329889776e-05, |
|
"loss": 0.4209, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.5879249265093842, |
|
"grad_norm": 0.1601138710975647, |
|
"learning_rate": 3.636374833063191e-05, |
|
"loss": 0.4206, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.5916936760382905, |
|
"grad_norm": 0.15639857947826385, |
|
"learning_rate": 3.579516490606346e-05, |
|
"loss": 0.4191, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 0.5954624255671968, |
|
"grad_norm": 0.16204357147216797, |
|
"learning_rate": 3.522857272979804e-05, |
|
"loss": 0.4185, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.5992311750961031, |
|
"grad_norm": 0.1710115373134613, |
|
"learning_rate": 3.4664051227306026e-05, |
|
"loss": 0.4178, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.6029999246250094, |
|
"grad_norm": 0.15990346670150757, |
|
"learning_rate": 3.4101679533788734e-05, |
|
"loss": 0.4161, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.6067686741539158, |
|
"grad_norm": 0.15846975147724152, |
|
"learning_rate": 3.354153648308492e-05, |
|
"loss": 0.4168, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 0.610537423682822, |
|
"grad_norm": 0.15800924599170685, |
|
"learning_rate": 3.298370059662004e-05, |
|
"loss": 0.4165, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.6143061732117283, |
|
"grad_norm": 0.1673530787229538, |
|
"learning_rate": 3.2428250072398846e-05, |
|
"loss": 0.4164, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 0.6180749227406347, |
|
"grad_norm": 0.16620007157325745, |
|
"learning_rate": 3.187526277404355e-05, |
|
"loss": 0.4193, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.621843672269541, |
|
"grad_norm": 0.1582447588443756, |
|
"learning_rate": 3.1324816219878903e-05, |
|
"loss": 0.416, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.6256124217984472, |
|
"grad_norm": 0.16663286089897156, |
|
"learning_rate": 3.077698757206552e-05, |
|
"loss": 0.4172, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.6293811713273536, |
|
"grad_norm": 0.16307072341442108, |
|
"learning_rate": 3.0231853625783163e-05, |
|
"loss": 0.4145, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 0.6331499208562599, |
|
"grad_norm": 0.1569572240114212, |
|
"learning_rate": 2.9689490798465698e-05, |
|
"loss": 0.4146, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.6369186703851663, |
|
"grad_norm": 0.1609562188386917, |
|
"learning_rate": 2.9149975119088596e-05, |
|
"loss": 0.4146, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 0.6406874199140725, |
|
"grad_norm": 0.15904489159584045, |
|
"learning_rate": 2.8613382217511265e-05, |
|
"loss": 0.4125, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.6444561694429788, |
|
"grad_norm": 0.16079629957675934, |
|
"learning_rate": 2.807978731387516e-05, |
|
"loss": 0.4151, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 0.6482249189718852, |
|
"grad_norm": 0.16790254414081573, |
|
"learning_rate": 2.754926520805925e-05, |
|
"loss": 0.4141, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.6519936685007914, |
|
"grad_norm": 0.16292473673820496, |
|
"learning_rate": 2.702189026919465e-05, |
|
"loss": 0.4143, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 0.6557624180296977, |
|
"grad_norm": 0.16475141048431396, |
|
"learning_rate": 2.6497736425239315e-05, |
|
"loss": 0.4129, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.6595311675586041, |
|
"grad_norm": 0.1627720594406128, |
|
"learning_rate": 2.597687715261484e-05, |
|
"loss": 0.4127, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.6632999170875103, |
|
"grad_norm": 0.1639036387205124, |
|
"learning_rate": 2.5459385465906517e-05, |
|
"loss": 0.4121, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.6670686666164167, |
|
"grad_norm": 0.16833463311195374, |
|
"learning_rate": 2.4945333907627892e-05, |
|
"loss": 0.4129, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 0.670837416145323, |
|
"grad_norm": 0.15790612995624542, |
|
"learning_rate": 2.443479453805189e-05, |
|
"loss": 0.4098, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.6746061656742293, |
|
"grad_norm": 0.16387607157230377, |
|
"learning_rate": 2.392783892510917e-05, |
|
"loss": 0.411, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 0.6783749152031356, |
|
"grad_norm": 0.16562491655349731, |
|
"learning_rate": 2.3424538134355715e-05, |
|
"loss": 0.4122, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.6821436647320419, |
|
"grad_norm": 0.15516149997711182, |
|
"learning_rate": 2.2924962719010874e-05, |
|
"loss": 0.4112, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 0.6859124142609482, |
|
"grad_norm": 0.16322891414165497, |
|
"learning_rate": 2.242918271006698e-05, |
|
"loss": 0.4109, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.6896811637898546, |
|
"grad_norm": 0.16053235530853271, |
|
"learning_rate": 2.193726760647245e-05, |
|
"loss": 0.4088, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 0.6934499133187608, |
|
"grad_norm": 0.1649434119462967, |
|
"learning_rate": 2.1449286365389342e-05, |
|
"loss": 0.4103, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.6972186628476671, |
|
"grad_norm": 0.1640276312828064, |
|
"learning_rate": 2.0965307392526818e-05, |
|
"loss": 0.409, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.7009874123765735, |
|
"grad_norm": 0.1616964489221573, |
|
"learning_rate": 2.048539853255197e-05, |
|
"loss": 0.4105, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.7047561619054797, |
|
"grad_norm": 0.1595754474401474, |
|
"learning_rate": 2.0009627059579372e-05, |
|
"loss": 0.4108, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 0.7085249114343861, |
|
"grad_norm": 0.1613272726535797, |
|
"learning_rate": 1.953805966774037e-05, |
|
"loss": 0.4114, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.7122936609632924, |
|
"grad_norm": 0.15698370337486267, |
|
"learning_rate": 1.9070762461834018e-05, |
|
"loss": 0.4087, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 0.7160624104921987, |
|
"grad_norm": 0.17342697083950043, |
|
"learning_rate": 1.8607800948060266e-05, |
|
"loss": 0.4072, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.719831160021105, |
|
"grad_norm": 0.16098545491695404, |
|
"learning_rate": 1.8149240024837315e-05, |
|
"loss": 0.4071, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 0.7235999095500113, |
|
"grad_norm": 0.16427302360534668, |
|
"learning_rate": 1.7695143973704143e-05, |
|
"loss": 0.4067, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.7273686590789176, |
|
"grad_norm": 0.1618986576795578, |
|
"learning_rate": 1.7245576450309316e-05, |
|
"loss": 0.4081, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 0.731137408607824, |
|
"grad_norm": 0.15955495834350586, |
|
"learning_rate": 1.6800600475487826e-05, |
|
"loss": 0.4085, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.7349061581367302, |
|
"grad_norm": 0.159365713596344, |
|
"learning_rate": 1.6360278426426624e-05, |
|
"loss": 0.4069, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.7386749076656366, |
|
"grad_norm": 0.1574493795633316, |
|
"learning_rate": 1.5924672027920663e-05, |
|
"loss": 0.4058, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.7424436571945429, |
|
"grad_norm": 0.16413567960262299, |
|
"learning_rate": 1.5493842343720104e-05, |
|
"loss": 0.4047, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 0.7462124067234491, |
|
"grad_norm": 0.1662568747997284, |
|
"learning_rate": 1.5067849767970488e-05, |
|
"loss": 0.4046, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.7499811562523555, |
|
"grad_norm": 0.16023589670658112, |
|
"learning_rate": 1.4646754016746483e-05, |
|
"loss": 0.4072, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 0.7537499057812618, |
|
"grad_norm": 0.16238714754581451, |
|
"learning_rate": 1.4230614119680957e-05, |
|
"loss": 0.4072, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.757518655310168, |
|
"grad_norm": 0.16267696022987366, |
|
"learning_rate": 1.3819488411690018e-05, |
|
"loss": 0.4056, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 0.7612874048390744, |
|
"grad_norm": 0.1596570760011673, |
|
"learning_rate": 1.3413434524795631e-05, |
|
"loss": 0.4049, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.7650561543679807, |
|
"grad_norm": 0.17054887115955353, |
|
"learning_rate": 1.3012509380046745e-05, |
|
"loss": 0.4032, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 0.768824903896887, |
|
"grad_norm": 0.15980112552642822, |
|
"learning_rate": 1.2616769179539944e-05, |
|
"loss": 0.405, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.7725936534257933, |
|
"grad_norm": 0.16071230173110962, |
|
"learning_rate": 1.222626939854103e-05, |
|
"loss": 0.4027, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 0.7763624029546996, |
|
"grad_norm": 0.15903015434741974, |
|
"learning_rate": 1.1841064777708483e-05, |
|
"loss": 0.4043, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.780131152483606, |
|
"grad_norm": 0.16393496096134186, |
|
"learning_rate": 1.1461209315419758e-05, |
|
"loss": 0.4009, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 0.7838999020125123, |
|
"grad_norm": 0.16215142607688904, |
|
"learning_rate": 1.1086756260201859e-05, |
|
"loss": 0.4032, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.7876686515414185, |
|
"grad_norm": 0.16436554491519928, |
|
"learning_rate": 1.0717758103266805e-05, |
|
"loss": 0.4035, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 0.7914374010703249, |
|
"grad_norm": 0.16526173055171967, |
|
"learning_rate": 1.0354266571153399e-05, |
|
"loss": 0.4023, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.7952061505992312, |
|
"grad_norm": 0.1610834300518036, |
|
"learning_rate": 9.996332618476172e-06, |
|
"loss": 0.4031, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 0.7989749001281374, |
|
"grad_norm": 0.1577410101890564, |
|
"learning_rate": 9.644006420782476e-06, |
|
"loss": 0.4006, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.8027436496570438, |
|
"grad_norm": 0.15618577599525452, |
|
"learning_rate": 9.29733736751881e-06, |
|
"loss": 0.4037, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 0.8065123991859501, |
|
"grad_norm": 0.15943607687950134, |
|
"learning_rate": 8.956374055107442e-06, |
|
"loss": 0.4026, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.8102811487148565, |
|
"grad_norm": 0.1561410278081894, |
|
"learning_rate": 8.621164280134004e-06, |
|
"loss": 0.4021, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 0.8140498982437627, |
|
"grad_norm": 0.166486918926239, |
|
"learning_rate": 8.291755032647402e-06, |
|
"loss": 0.4017, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.817818647772669, |
|
"grad_norm": 0.16270950436592102, |
|
"learning_rate": 7.96819248957265e-06, |
|
"loss": 0.4019, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 0.8215873973015754, |
|
"grad_norm": 0.1590346395969391, |
|
"learning_rate": 7.650522008237754e-06, |
|
"loss": 0.4014, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.8253561468304816, |
|
"grad_norm": 0.15762105584144592, |
|
"learning_rate": 7.338788120015522e-06, |
|
"loss": 0.4005, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 0.8291248963593879, |
|
"grad_norm": 0.1578063815832138, |
|
"learning_rate": 7.033034524081023e-06, |
|
"loss": 0.4008, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.8328936458882943, |
|
"grad_norm": 0.1597341001033783, |
|
"learning_rate": 6.733304081285874e-06, |
|
"loss": 0.4005, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 0.8366623954172006, |
|
"grad_norm": 0.16171535849571228, |
|
"learning_rate": 6.439638808149923e-06, |
|
"loss": 0.4018, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.8404311449461069, |
|
"grad_norm": 0.16193453967571259, |
|
"learning_rate": 6.152079870971311e-06, |
|
"loss": 0.3993, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 0.8441998944750132, |
|
"grad_norm": 0.15683096647262573, |
|
"learning_rate": 5.870667580055805e-06, |
|
"loss": 0.4014, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.8479686440039195, |
|
"grad_norm": 0.16238045692443848, |
|
"learning_rate": 5.595441384065986e-06, |
|
"loss": 0.402, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 0.8517373935328258, |
|
"grad_norm": 0.15774257481098175, |
|
"learning_rate": 5.3264398644913114e-06, |
|
"loss": 0.4, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.8555061430617321, |
|
"grad_norm": 0.15233269333839417, |
|
"learning_rate": 5.063700730239784e-06, |
|
"loss": 0.3994, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 0.8592748925906384, |
|
"grad_norm": 0.16699600219726562, |
|
"learning_rate": 4.807260812351793e-06, |
|
"loss": 0.399, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.8630436421195448, |
|
"grad_norm": 0.1617075353860855, |
|
"learning_rate": 4.557156058837137e-06, |
|
"loss": 0.3988, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 0.866812391648451, |
|
"grad_norm": 0.1602969914674759, |
|
"learning_rate": 4.31342152963583e-06, |
|
"loss": 0.3999, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.8705811411773573, |
|
"grad_norm": 0.15863758325576782, |
|
"learning_rate": 4.076091391703302e-06, |
|
"loss": 0.3999, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 0.8743498907062637, |
|
"grad_norm": 0.1571153998374939, |
|
"learning_rate": 3.845198914220871e-06, |
|
"loss": 0.3984, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.87811864023517, |
|
"grad_norm": 0.15634584426879883, |
|
"learning_rate": 3.6207764639320462e-06, |
|
"loss": 0.3989, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 0.8818873897640763, |
|
"grad_norm": 0.16058678925037384, |
|
"learning_rate": 3.4028555006052953e-06, |
|
"loss": 0.4015, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.8856561392929826, |
|
"grad_norm": 0.15872234106063843, |
|
"learning_rate": 3.191466572624019e-06, |
|
"loss": 0.3979, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 0.8894248888218889, |
|
"grad_norm": 0.16169828176498413, |
|
"learning_rate": 2.986639312704209e-06, |
|
"loss": 0.3984, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.8931936383507952, |
|
"grad_norm": 0.16279493272304535, |
|
"learning_rate": 2.788402433740517e-06, |
|
"loss": 0.3982, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 0.8969623878797015, |
|
"grad_norm": 0.1638520359992981, |
|
"learning_rate": 2.596783724781282e-06, |
|
"loss": 0.4002, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.9007311374086078, |
|
"grad_norm": 0.15517061948776245, |
|
"learning_rate": 2.4118100471329787e-06, |
|
"loss": 0.3974, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 0.9044998869375142, |
|
"grad_norm": 0.1604815125465393, |
|
"learning_rate": 2.2335073305948086e-06, |
|
"loss": 0.3992, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.9082686364664204, |
|
"grad_norm": 0.157390296459198, |
|
"learning_rate": 2.0619005698238437e-06, |
|
"loss": 0.3989, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 0.9120373859953268, |
|
"grad_norm": 0.15608523786067963, |
|
"learning_rate": 1.8970138208311949e-06, |
|
"loss": 0.3971, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.9158061355242331, |
|
"grad_norm": 0.15673068165779114, |
|
"learning_rate": 1.7388701976099041e-06, |
|
"loss": 0.3994, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 0.9195748850531393, |
|
"grad_norm": 0.1587488353252411, |
|
"learning_rate": 1.5874918688946972e-06, |
|
"loss": 0.3985, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.9233436345820457, |
|
"grad_norm": 0.16035687923431396, |
|
"learning_rate": 1.4429000550544414e-06, |
|
"loss": 0.399, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 0.927112384110952, |
|
"grad_norm": 0.15816493332386017, |
|
"learning_rate": 1.305115025117387e-06, |
|
"loss": 0.4, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.9308811336398582, |
|
"grad_norm": 0.16530562937259674, |
|
"learning_rate": 1.1741560939298791e-06, |
|
"loss": 0.3995, |
|
"step": 12350 |
|
}, |
|
{ |
|
"epoch": 0.9346498831687646, |
|
"grad_norm": 0.1594778597354889, |
|
"learning_rate": 1.0500416194487384e-06, |
|
"loss": 0.3997, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.9384186326976709, |
|
"grad_norm": 0.15754447877407074, |
|
"learning_rate": 9.327890001678719e-07, |
|
"loss": 0.3972, |
|
"step": 12450 |
|
}, |
|
{ |
|
"epoch": 0.9421873822265773, |
|
"grad_norm": 0.15905898809432983, |
|
"learning_rate": 8.224146726792947e-07, |
|
"loss": 0.3972, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.9459561317554835, |
|
"grad_norm": 0.15947633981704712, |
|
"learning_rate": 7.189341093690627e-07, |
|
"loss": 0.3964, |
|
"step": 12550 |
|
}, |
|
{ |
|
"epoch": 0.9497248812843898, |
|
"grad_norm": 0.16022710502147675, |
|
"learning_rate": 6.223618162483014e-07, |
|
"loss": 0.3993, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.9534936308132962, |
|
"grad_norm": 0.16380661725997925, |
|
"learning_rate": 5.327113309197828e-07, |
|
"loss": 0.4, |
|
"step": 12650 |
|
}, |
|
{ |
|
"epoch": 0.9572623803422025, |
|
"grad_norm": 0.15692880749702454, |
|
"learning_rate": 4.4999522068017164e-07, |
|
"loss": 0.3982, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.9610311298711087, |
|
"grad_norm": 0.16599752008914948, |
|
"learning_rate": 3.7422508075835583e-07, |
|
"loss": 0.397, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 0.9647998794000151, |
|
"grad_norm": 0.15927733480930328, |
|
"learning_rate": 3.05411532689992e-07, |
|
"loss": 0.396, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.9685686289289214, |
|
"grad_norm": 0.15940117835998535, |
|
"learning_rate": 2.435642228285906e-07, |
|
"loss": 0.3983, |
|
"step": 12850 |
|
}, |
|
{ |
|
"epoch": 0.9723373784578276, |
|
"grad_norm": 0.16387763619422913, |
|
"learning_rate": 1.886918209932642e-07, |
|
"loss": 0.398, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.976106127986734, |
|
"grad_norm": 0.15747365355491638, |
|
"learning_rate": 1.4080201925338322e-07, |
|
"loss": 0.3978, |
|
"step": 12950 |
|
}, |
|
{ |
|
"epoch": 0.9798748775156403, |
|
"grad_norm": 0.1579546183347702, |
|
"learning_rate": 9.99015308503215e-08, |
|
"loss": 0.4005, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.9836436270445467, |
|
"grad_norm": 0.15818338096141815, |
|
"learning_rate": 6.599608925633715e-08, |
|
"loss": 0.3978, |
|
"step": 13050 |
|
}, |
|
{ |
|
"epoch": 0.9874123765734529, |
|
"grad_norm": 0.15863798558712006, |
|
"learning_rate": 3.909044737089307e-08, |
|
"loss": 0.3991, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 0.9911811261023592, |
|
"grad_norm": 0.1556527018547058, |
|
"learning_rate": 1.9188376854373246e-08, |
|
"loss": 0.3985, |
|
"step": 13150 |
|
}, |
|
{ |
|
"epoch": 0.9949498756312656, |
|
"grad_norm": 0.16225971281528473, |
|
"learning_rate": 6.292667599366864e-09, |
|
"loss": 0.3979, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.9987186251601718, |
|
"grad_norm": 0.16236484050750732, |
|
"learning_rate": 4.0512733956998837e-10, |
|
"loss": 0.3953, |
|
"step": 13250 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 13267, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.892725356142474e+19, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|