{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 191805, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013034071061755428, "grad_norm": 2.9903717041015625, "learning_rate": 4.9869659289382446e-05, "loss": 6.0967, "step": 500 }, { "epoch": 0.026068142123510857, "grad_norm": 4.055346965789795, "learning_rate": 4.973931857876489e-05, "loss": 5.5622, "step": 1000 }, { "epoch": 0.039102213185266285, "grad_norm": 5.480607032775879, "learning_rate": 4.960897786814734e-05, "loss": 5.4371, "step": 1500 }, { "epoch": 0.052136284247021714, "grad_norm": 4.727134704589844, "learning_rate": 4.9478637157529784e-05, "loss": 5.3535, "step": 2000 }, { "epoch": 0.06517035530877714, "grad_norm": 4.737260341644287, "learning_rate": 4.934829644691223e-05, "loss": 5.2508, "step": 2500 }, { "epoch": 0.07820442637053257, "grad_norm": 5.066771984100342, "learning_rate": 4.921795573629467e-05, "loss": 5.199, "step": 3000 }, { "epoch": 0.091238497432288, "grad_norm": 3.627026319503784, "learning_rate": 4.908761502567713e-05, "loss": 5.084, "step": 3500 }, { "epoch": 0.10427256849404343, "grad_norm": 4.254016876220703, "learning_rate": 4.895727431505957e-05, "loss": 4.9441, "step": 4000 }, { "epoch": 0.11730663955579886, "grad_norm": 6.351306438446045, "learning_rate": 4.8826933604442015e-05, "loss": 4.8021, "step": 4500 }, { "epoch": 0.13034071061755428, "grad_norm": 7.492619037628174, "learning_rate": 4.869659289382446e-05, "loss": 4.6446, "step": 5000 }, { "epoch": 0.14337478167930973, "grad_norm": 6.017455577850342, "learning_rate": 4.856625218320691e-05, "loss": 4.4574, "step": 5500 }, { "epoch": 0.15640885274106514, "grad_norm": 5.2971343994140625, "learning_rate": 4.843591147258935e-05, "loss": 4.2184, "step": 6000 }, { "epoch": 0.16944292380282058, "grad_norm": 9.367820739746094, "learning_rate": 4.8305570761971796e-05, "loss": 4.101, "step": 6500 }, { "epoch": 0.182476994864576, "grad_norm": 7.676972389221191, "learning_rate": 4.817523005135424e-05, "loss": 3.9548, "step": 7000 }, { "epoch": 0.19551106592633144, "grad_norm": 6.3607563972473145, "learning_rate": 4.804488934073669e-05, "loss": 3.8584, "step": 7500 }, { "epoch": 0.20854513698808685, "grad_norm": 5.45451021194458, "learning_rate": 4.7914548630119134e-05, "loss": 3.7841, "step": 8000 }, { "epoch": 0.2215792080498423, "grad_norm": 16.199485778808594, "learning_rate": 4.778420791950158e-05, "loss": 3.6685, "step": 8500 }, { "epoch": 0.2346132791115977, "grad_norm": 6.077032089233398, "learning_rate": 4.765386720888402e-05, "loss": 3.6017, "step": 9000 }, { "epoch": 0.24764735017335315, "grad_norm": 11.489569664001465, "learning_rate": 4.752352649826647e-05, "loss": 3.5553, "step": 9500 }, { "epoch": 0.26068142123510857, "grad_norm": 4.917782783508301, "learning_rate": 4.7393185787648915e-05, "loss": 3.4537, "step": 10000 }, { "epoch": 0.273715492296864, "grad_norm": 5.945028781890869, "learning_rate": 4.7262845077031366e-05, "loss": 3.4442, "step": 10500 }, { "epoch": 0.28674956335861945, "grad_norm": 7.648957252502441, "learning_rate": 4.713250436641381e-05, "loss": 3.3772, "step": 11000 }, { "epoch": 0.29978363442037487, "grad_norm": 7.488467216491699, "learning_rate": 4.700216365579625e-05, "loss": 3.3026, "step": 11500 }, { "epoch": 0.3128177054821303, "grad_norm": 5.8792619705200195, "learning_rate": 4.68718229451787e-05, "loss": 3.2446, "step": 12000 }, { "epoch": 0.3258517765438857, "grad_norm": 10.038032531738281, "learning_rate": 4.674148223456115e-05, "loss": 3.216, "step": 12500 }, { "epoch": 0.33888584760564117, "grad_norm": 7.69769811630249, "learning_rate": 4.661114152394359e-05, "loss": 3.1869, "step": 13000 }, { "epoch": 0.3519199186673966, "grad_norm": 6.179595470428467, "learning_rate": 4.6480800813326034e-05, "loss": 3.1464, "step": 13500 }, { "epoch": 0.364953989729152, "grad_norm": 5.665715217590332, "learning_rate": 4.6350460102708484e-05, "loss": 3.079, "step": 14000 }, { "epoch": 0.3779880607909074, "grad_norm": 4.681985855102539, "learning_rate": 4.622011939209093e-05, "loss": 3.0724, "step": 14500 }, { "epoch": 0.3910221318526629, "grad_norm": 11.111820220947266, "learning_rate": 4.608977868147337e-05, "loss": 3.0356, "step": 15000 }, { "epoch": 0.4040562029144183, "grad_norm": 5.951188564300537, "learning_rate": 4.5959437970855815e-05, "loss": 3.01, "step": 15500 }, { "epoch": 0.4170902739761737, "grad_norm": 5.438151836395264, "learning_rate": 4.5829097260238266e-05, "loss": 2.9605, "step": 16000 }, { "epoch": 0.4301243450379291, "grad_norm": 10.49527645111084, "learning_rate": 4.569875654962071e-05, "loss": 2.9453, "step": 16500 }, { "epoch": 0.4431584160996846, "grad_norm": 6.611765384674072, "learning_rate": 4.556841583900316e-05, "loss": 2.9529, "step": 17000 }, { "epoch": 0.45619248716144, "grad_norm": 5.289289474487305, "learning_rate": 4.54380751283856e-05, "loss": 2.9081, "step": 17500 }, { "epoch": 0.4692265582231954, "grad_norm": 5.65715217590332, "learning_rate": 4.530773441776805e-05, "loss": 2.8152, "step": 18000 }, { "epoch": 0.48226062928495084, "grad_norm": 5.513209819793701, "learning_rate": 4.51773937071505e-05, "loss": 2.8664, "step": 18500 }, { "epoch": 0.4952947003467063, "grad_norm": 4.413240909576416, "learning_rate": 4.504705299653294e-05, "loss": 2.8854, "step": 19000 }, { "epoch": 0.5083287714084617, "grad_norm": 5.602241039276123, "learning_rate": 4.4916712285915384e-05, "loss": 2.8295, "step": 19500 }, { "epoch": 0.5213628424702171, "grad_norm": 8.221460342407227, "learning_rate": 4.478637157529783e-05, "loss": 2.7826, "step": 20000 }, { "epoch": 0.5343969135319726, "grad_norm": 5.350883483886719, "learning_rate": 4.465603086468028e-05, "loss": 2.7846, "step": 20500 }, { "epoch": 0.547430984593728, "grad_norm": 6.6059393882751465, "learning_rate": 4.452569015406272e-05, "loss": 2.7562, "step": 21000 }, { "epoch": 0.5604650556554834, "grad_norm": 7.050083637237549, "learning_rate": 4.4395349443445166e-05, "loss": 2.7102, "step": 21500 }, { "epoch": 0.5734991267172389, "grad_norm": 6.74811315536499, "learning_rate": 4.426500873282761e-05, "loss": 2.7215, "step": 22000 }, { "epoch": 0.5865331977789943, "grad_norm": 7.959073543548584, "learning_rate": 4.413466802221006e-05, "loss": 2.7185, "step": 22500 }, { "epoch": 0.5995672688407497, "grad_norm": 7.594911098480225, "learning_rate": 4.40043273115925e-05, "loss": 2.6624, "step": 23000 }, { "epoch": 0.6126013399025051, "grad_norm": 5.935075283050537, "learning_rate": 4.3873986600974954e-05, "loss": 2.6398, "step": 23500 }, { "epoch": 0.6256354109642606, "grad_norm": 7.0315961837768555, "learning_rate": 4.37436458903574e-05, "loss": 2.6571, "step": 24000 }, { "epoch": 0.638669482026016, "grad_norm": 6.930845260620117, "learning_rate": 4.361330517973984e-05, "loss": 2.6009, "step": 24500 }, { "epoch": 0.6517035530877714, "grad_norm": 14.607309341430664, "learning_rate": 4.348296446912229e-05, "loss": 2.6493, "step": 25000 }, { "epoch": 0.6647376241495269, "grad_norm": 5.613809108734131, "learning_rate": 4.3352623758504735e-05, "loss": 2.6042, "step": 25500 }, { "epoch": 0.6777716952112823, "grad_norm": 6.0553693771362305, "learning_rate": 4.322228304788718e-05, "loss": 2.6153, "step": 26000 }, { "epoch": 0.6908057662730377, "grad_norm": 8.716107368469238, "learning_rate": 4.309194233726962e-05, "loss": 2.5757, "step": 26500 }, { "epoch": 0.7038398373347932, "grad_norm": 7.430722713470459, "learning_rate": 4.296160162665207e-05, "loss": 2.5682, "step": 27000 }, { "epoch": 0.7168739083965486, "grad_norm": 9.687034606933594, "learning_rate": 4.2831260916034516e-05, "loss": 2.5377, "step": 27500 }, { "epoch": 0.729907979458304, "grad_norm": 3.729767084121704, "learning_rate": 4.270092020541696e-05, "loss": 2.5217, "step": 28000 }, { "epoch": 0.7429420505200595, "grad_norm": 9.692636489868164, "learning_rate": 4.25705794947994e-05, "loss": 2.4829, "step": 28500 }, { "epoch": 0.7559761215818148, "grad_norm": 8.260266304016113, "learning_rate": 4.2440238784181854e-05, "loss": 2.4971, "step": 29000 }, { "epoch": 0.7690101926435703, "grad_norm": 5.885035037994385, "learning_rate": 4.23098980735643e-05, "loss": 2.4823, "step": 29500 }, { "epoch": 0.7820442637053258, "grad_norm": 11.001029968261719, "learning_rate": 4.217955736294674e-05, "loss": 2.4583, "step": 30000 }, { "epoch": 0.7950783347670811, "grad_norm": 9.69256591796875, "learning_rate": 4.204921665232919e-05, "loss": 2.447, "step": 30500 }, { "epoch": 0.8081124058288366, "grad_norm": 15.954379081726074, "learning_rate": 4.191887594171164e-05, "loss": 2.4427, "step": 31000 }, { "epoch": 0.8211464768905921, "grad_norm": 5.421440124511719, "learning_rate": 4.1788535231094085e-05, "loss": 2.4181, "step": 31500 }, { "epoch": 0.8341805479523474, "grad_norm": 9.169551849365234, "learning_rate": 4.165819452047653e-05, "loss": 2.4105, "step": 32000 }, { "epoch": 0.8472146190141029, "grad_norm": 5.778009414672852, "learning_rate": 4.152785380985897e-05, "loss": 2.4145, "step": 32500 }, { "epoch": 0.8602486900758582, "grad_norm": 6.441959857940674, "learning_rate": 4.139751309924142e-05, "loss": 2.4334, "step": 33000 }, { "epoch": 0.8732827611376137, "grad_norm": 7.385718822479248, "learning_rate": 4.1267172388623866e-05, "loss": 2.392, "step": 33500 }, { "epoch": 0.8863168321993692, "grad_norm": 15.347734451293945, "learning_rate": 4.113683167800631e-05, "loss": 2.3981, "step": 34000 }, { "epoch": 0.8993509032611245, "grad_norm": 10.47854232788086, "learning_rate": 4.1006490967388754e-05, "loss": 2.3511, "step": 34500 }, { "epoch": 0.91238497432288, "grad_norm": 11.82073974609375, "learning_rate": 4.0876150256771204e-05, "loss": 2.3632, "step": 35000 }, { "epoch": 0.9254190453846355, "grad_norm": 8.932971954345703, "learning_rate": 4.074580954615365e-05, "loss": 2.3272, "step": 35500 }, { "epoch": 0.9384531164463908, "grad_norm": 11.068861961364746, "learning_rate": 4.061546883553609e-05, "loss": 2.3321, "step": 36000 }, { "epoch": 0.9514871875081463, "grad_norm": 5.649448871612549, "learning_rate": 4.0485128124918535e-05, "loss": 2.3498, "step": 36500 }, { "epoch": 0.9645212585699017, "grad_norm": 9.020928382873535, "learning_rate": 4.0354787414300985e-05, "loss": 2.3331, "step": 37000 }, { "epoch": 0.9775553296316571, "grad_norm": 12.966954231262207, "learning_rate": 4.0224446703683436e-05, "loss": 2.3095, "step": 37500 }, { "epoch": 0.9905894006934126, "grad_norm": 5.641653060913086, "learning_rate": 4.009410599306588e-05, "loss": 2.3127, "step": 38000 }, { "epoch": 1.003623471755168, "grad_norm": 8.139008522033691, "learning_rate": 3.996376528244832e-05, "loss": 2.2846, "step": 38500 }, { "epoch": 1.0166575428169233, "grad_norm": 7.005831241607666, "learning_rate": 3.9833424571830766e-05, "loss": 2.2518, "step": 39000 }, { "epoch": 1.029691613878679, "grad_norm": 3.906301975250244, "learning_rate": 3.970308386121322e-05, "loss": 2.2632, "step": 39500 }, { "epoch": 1.0427256849404343, "grad_norm": 4.201974391937256, "learning_rate": 3.957274315059566e-05, "loss": 2.2299, "step": 40000 }, { "epoch": 1.0557597560021896, "grad_norm": 6.107882022857666, "learning_rate": 3.9442402439978104e-05, "loss": 2.2016, "step": 40500 }, { "epoch": 1.0687938270639452, "grad_norm": 8.289084434509277, "learning_rate": 3.931206172936055e-05, "loss": 2.2227, "step": 41000 }, { "epoch": 1.0818278981257006, "grad_norm": 5.386382102966309, "learning_rate": 3.9181721018743e-05, "loss": 2.1849, "step": 41500 }, { "epoch": 1.094861969187456, "grad_norm": 5.536214828491211, "learning_rate": 3.905138030812544e-05, "loss": 2.2085, "step": 42000 }, { "epoch": 1.1078960402492115, "grad_norm": 67.06414031982422, "learning_rate": 3.8921039597507885e-05, "loss": 2.2039, "step": 42500 }, { "epoch": 1.1209301113109669, "grad_norm": 8.36019229888916, "learning_rate": 3.879069888689033e-05, "loss": 2.1925, "step": 43000 }, { "epoch": 1.1339641823727222, "grad_norm": 14.266386985778809, "learning_rate": 3.866035817627278e-05, "loss": 2.2101, "step": 43500 }, { "epoch": 1.1469982534344778, "grad_norm": 11.47070598602295, "learning_rate": 3.853001746565523e-05, "loss": 2.1402, "step": 44000 }, { "epoch": 1.1600323244962332, "grad_norm": 5.293683052062988, "learning_rate": 3.839967675503767e-05, "loss": 2.1872, "step": 44500 }, { "epoch": 1.1730663955579885, "grad_norm": 32.234737396240234, "learning_rate": 3.826933604442012e-05, "loss": 2.1357, "step": 45000 }, { "epoch": 1.1861004666197439, "grad_norm": 3.9005160331726074, "learning_rate": 3.813899533380256e-05, "loss": 2.1263, "step": 45500 }, { "epoch": 1.1991345376814995, "grad_norm": 9.012932777404785, "learning_rate": 3.800865462318501e-05, "loss": 2.1718, "step": 46000 }, { "epoch": 1.2121686087432548, "grad_norm": 8.86204719543457, "learning_rate": 3.7878313912567454e-05, "loss": 2.1718, "step": 46500 }, { "epoch": 1.2252026798050104, "grad_norm": 29.908674240112305, "learning_rate": 3.77479732019499e-05, "loss": 2.1227, "step": 47000 }, { "epoch": 1.2382367508667658, "grad_norm": 3.599839687347412, "learning_rate": 3.761763249133234e-05, "loss": 2.1301, "step": 47500 }, { "epoch": 1.2512708219285211, "grad_norm": 12.039328575134277, "learning_rate": 3.748729178071479e-05, "loss": 2.1226, "step": 48000 }, { "epoch": 1.2643048929902765, "grad_norm": 3.92248797416687, "learning_rate": 3.7356951070097236e-05, "loss": 2.156, "step": 48500 }, { "epoch": 1.277338964052032, "grad_norm": 22.514301300048828, "learning_rate": 3.722661035947968e-05, "loss": 2.1001, "step": 49000 }, { "epoch": 1.2903730351137874, "grad_norm": 4.8082990646362305, "learning_rate": 3.709626964886212e-05, "loss": 2.1167, "step": 49500 }, { "epoch": 1.303407106175543, "grad_norm": 7.884994983673096, "learning_rate": 3.696592893824457e-05, "loss": 2.1118, "step": 50000 }, { "epoch": 1.3164411772372984, "grad_norm": 4.282125949859619, "learning_rate": 3.6835588227627024e-05, "loss": 2.0749, "step": 50500 }, { "epoch": 1.3294752482990537, "grad_norm": 19.30133819580078, "learning_rate": 3.670524751700947e-05, "loss": 2.1081, "step": 51000 }, { "epoch": 1.342509319360809, "grad_norm": 3.800236463546753, "learning_rate": 3.657490680639191e-05, "loss": 2.0964, "step": 51500 }, { "epoch": 1.3555433904225647, "grad_norm": 5.734689235687256, "learning_rate": 3.6444566095774355e-05, "loss": 2.0736, "step": 52000 }, { "epoch": 1.36857746148432, "grad_norm": 7.496071815490723, "learning_rate": 3.6314225385156805e-05, "loss": 2.0545, "step": 52500 }, { "epoch": 1.3816115325460754, "grad_norm": 7.645195007324219, "learning_rate": 3.618388467453925e-05, "loss": 2.0407, "step": 53000 }, { "epoch": 1.394645603607831, "grad_norm": 22.738969802856445, "learning_rate": 3.605354396392169e-05, "loss": 2.0554, "step": 53500 }, { "epoch": 1.4076796746695863, "grad_norm": 9.185379028320312, "learning_rate": 3.5923203253304136e-05, "loss": 2.0364, "step": 54000 }, { "epoch": 1.4207137457313417, "grad_norm": 9.092364311218262, "learning_rate": 3.5792862542686586e-05, "loss": 2.023, "step": 54500 }, { "epoch": 1.433747816793097, "grad_norm": 3.8213064670562744, "learning_rate": 3.566252183206903e-05, "loss": 2.0429, "step": 55000 }, { "epoch": 1.4467818878548526, "grad_norm": 15.87769603729248, "learning_rate": 3.553218112145147e-05, "loss": 1.9853, "step": 55500 }, { "epoch": 1.459815958916608, "grad_norm": 8.585647583007812, "learning_rate": 3.540184041083392e-05, "loss": 2.0239, "step": 56000 }, { "epoch": 1.4728500299783636, "grad_norm": 4.249543190002441, "learning_rate": 3.527149970021637e-05, "loss": 2.0305, "step": 56500 }, { "epoch": 1.485884101040119, "grad_norm": 6.320367336273193, "learning_rate": 3.514115898959881e-05, "loss": 2.0173, "step": 57000 }, { "epoch": 1.4989181721018743, "grad_norm": 5.058931350708008, "learning_rate": 3.501081827898126e-05, "loss": 1.9641, "step": 57500 }, { "epoch": 1.5119522431636296, "grad_norm": 10.568583488464355, "learning_rate": 3.4880477568363705e-05, "loss": 2.035, "step": 58000 }, { "epoch": 1.524986314225385, "grad_norm": 6.535768985748291, "learning_rate": 3.475013685774615e-05, "loss": 1.9971, "step": 58500 }, { "epoch": 1.5380203852871406, "grad_norm": 11.262877464294434, "learning_rate": 3.46197961471286e-05, "loss": 2.0076, "step": 59000 }, { "epoch": 1.5510544563488962, "grad_norm": 8.998533248901367, "learning_rate": 3.448945543651104e-05, "loss": 1.986, "step": 59500 }, { "epoch": 1.5640885274106515, "grad_norm": 5.243868827819824, "learning_rate": 3.4359114725893486e-05, "loss": 2.0148, "step": 60000 }, { "epoch": 1.5771225984724069, "grad_norm": 6.43707275390625, "learning_rate": 3.422877401527593e-05, "loss": 1.9952, "step": 60500 }, { "epoch": 1.5901566695341622, "grad_norm": 10.8756742477417, "learning_rate": 3.409843330465838e-05, "loss": 1.9688, "step": 61000 }, { "epoch": 1.6031907405959176, "grad_norm": 3.6488418579101562, "learning_rate": 3.3968092594040824e-05, "loss": 1.9545, "step": 61500 }, { "epoch": 1.6162248116576732, "grad_norm": 3.8945696353912354, "learning_rate": 3.383775188342327e-05, "loss": 1.9692, "step": 62000 }, { "epoch": 1.6292588827194285, "grad_norm": 4.477757453918457, "learning_rate": 3.370741117280571e-05, "loss": 1.9559, "step": 62500 }, { "epoch": 1.6422929537811841, "grad_norm": 5.086141586303711, "learning_rate": 3.357707046218816e-05, "loss": 1.929, "step": 63000 }, { "epoch": 1.6553270248429395, "grad_norm": 5.249891757965088, "learning_rate": 3.3446729751570605e-05, "loss": 1.9686, "step": 63500 }, { "epoch": 1.6683610959046948, "grad_norm": 9.6456880569458, "learning_rate": 3.3316389040953055e-05, "loss": 1.952, "step": 64000 }, { "epoch": 1.6813951669664502, "grad_norm": 5.007114410400391, "learning_rate": 3.31860483303355e-05, "loss": 1.9229, "step": 64500 }, { "epoch": 1.6944292380282058, "grad_norm": 4.589148044586182, "learning_rate": 3.305570761971795e-05, "loss": 1.9296, "step": 65000 }, { "epoch": 1.7074633090899611, "grad_norm": 10.281172752380371, "learning_rate": 3.292536690910039e-05, "loss": 1.9153, "step": 65500 }, { "epoch": 1.7204973801517167, "grad_norm": 7.041563034057617, "learning_rate": 3.2795026198482837e-05, "loss": 1.9276, "step": 66000 }, { "epoch": 1.733531451213472, "grad_norm": 8.523409843444824, "learning_rate": 3.266468548786528e-05, "loss": 1.8871, "step": 66500 }, { "epoch": 1.7465655222752274, "grad_norm": 18.92120361328125, "learning_rate": 3.253434477724773e-05, "loss": 1.8963, "step": 67000 }, { "epoch": 1.7595995933369828, "grad_norm": 17.547399520874023, "learning_rate": 3.2404004066630174e-05, "loss": 1.9069, "step": 67500 }, { "epoch": 1.7726336643987382, "grad_norm": 9.223323822021484, "learning_rate": 3.227366335601262e-05, "loss": 1.9232, "step": 68000 }, { "epoch": 1.7856677354604937, "grad_norm": 17.263656616210938, "learning_rate": 3.214332264539506e-05, "loss": 1.89, "step": 68500 }, { "epoch": 1.7987018065222493, "grad_norm": 19.6173152923584, "learning_rate": 3.201298193477751e-05, "loss": 1.8764, "step": 69000 }, { "epoch": 1.8117358775840047, "grad_norm": 10.714072227478027, "learning_rate": 3.1882641224159955e-05, "loss": 1.9165, "step": 69500 }, { "epoch": 1.82476994864576, "grad_norm": 5.039360523223877, "learning_rate": 3.17523005135424e-05, "loss": 1.8422, "step": 70000 }, { "epoch": 1.8378040197075154, "grad_norm": 28.72756576538086, "learning_rate": 3.162195980292485e-05, "loss": 1.8819, "step": 70500 }, { "epoch": 1.8508380907692707, "grad_norm": 4.069336414337158, "learning_rate": 3.149161909230729e-05, "loss": 1.8769, "step": 71000 }, { "epoch": 1.8638721618310263, "grad_norm": 4.223635196685791, "learning_rate": 3.136127838168974e-05, "loss": 1.8799, "step": 71500 }, { "epoch": 1.8769062328927817, "grad_norm": 10.401415824890137, "learning_rate": 3.123093767107219e-05, "loss": 1.905, "step": 72000 }, { "epoch": 1.8899403039545373, "grad_norm": 5.064211368560791, "learning_rate": 3.110059696045463e-05, "loss": 1.827, "step": 72500 }, { "epoch": 1.9029743750162926, "grad_norm": 4.138282299041748, "learning_rate": 3.0970256249837074e-05, "loss": 1.8237, "step": 73000 }, { "epoch": 1.916008446078048, "grad_norm": 3.365440845489502, "learning_rate": 3.0839915539219525e-05, "loss": 1.8421, "step": 73500 }, { "epoch": 1.9290425171398033, "grad_norm": 7.819665431976318, "learning_rate": 3.070957482860197e-05, "loss": 1.8413, "step": 74000 }, { "epoch": 1.942076588201559, "grad_norm": 8.81440544128418, "learning_rate": 3.057923411798441e-05, "loss": 1.8633, "step": 74500 }, { "epoch": 1.9551106592633143, "grad_norm": 12.814815521240234, "learning_rate": 3.044889340736686e-05, "loss": 1.8255, "step": 75000 }, { "epoch": 1.9681447303250699, "grad_norm": 7.332582950592041, "learning_rate": 3.0318552696749302e-05, "loss": 1.8228, "step": 75500 }, { "epoch": 1.9811788013868252, "grad_norm": 6.4567694664001465, "learning_rate": 3.018821198613175e-05, "loss": 1.8514, "step": 76000 }, { "epoch": 1.9942128724485806, "grad_norm": 33.37932205200195, "learning_rate": 3.0057871275514193e-05, "loss": 1.8347, "step": 76500 }, { "epoch": 2.007246943510336, "grad_norm": 3.908621072769165, "learning_rate": 2.992753056489664e-05, "loss": 1.8015, "step": 77000 }, { "epoch": 2.0202810145720913, "grad_norm": 3.9100475311279297, "learning_rate": 2.979718985427909e-05, "loss": 1.8148, "step": 77500 }, { "epoch": 2.0333150856338467, "grad_norm": 4.988982200622559, "learning_rate": 2.9666849143661534e-05, "loss": 1.7508, "step": 78000 }, { "epoch": 2.0463491566956025, "grad_norm": 5.134647846221924, "learning_rate": 2.953650843304398e-05, "loss": 1.7613, "step": 78500 }, { "epoch": 2.059383227757358, "grad_norm": 6.9095845222473145, "learning_rate": 2.9406167722426425e-05, "loss": 1.8106, "step": 79000 }, { "epoch": 2.072417298819113, "grad_norm": 14.57297420501709, "learning_rate": 2.927582701180887e-05, "loss": 1.7387, "step": 79500 }, { "epoch": 2.0854513698808685, "grad_norm": 46.801937103271484, "learning_rate": 2.9145486301191315e-05, "loss": 1.7732, "step": 80000 }, { "epoch": 2.098485440942624, "grad_norm": 10.51559829711914, "learning_rate": 2.9015145590573762e-05, "loss": 1.779, "step": 80500 }, { "epoch": 2.1115195120043793, "grad_norm": 3.4089362621307373, "learning_rate": 2.8884804879956206e-05, "loss": 1.7613, "step": 81000 }, { "epoch": 2.124553583066135, "grad_norm": 6.211880207061768, "learning_rate": 2.8754464169338653e-05, "loss": 1.7656, "step": 81500 }, { "epoch": 2.1375876541278904, "grad_norm": 4.486207962036133, "learning_rate": 2.8624123458721096e-05, "loss": 1.7653, "step": 82000 }, { "epoch": 2.150621725189646, "grad_norm": 4.438023090362549, "learning_rate": 2.8493782748103543e-05, "loss": 1.758, "step": 82500 }, { "epoch": 2.163655796251401, "grad_norm": 5.200678825378418, "learning_rate": 2.8363442037485987e-05, "loss": 1.7487, "step": 83000 }, { "epoch": 2.1766898673131565, "grad_norm": 11.503108024597168, "learning_rate": 2.8233101326868434e-05, "loss": 1.7539, "step": 83500 }, { "epoch": 2.189723938374912, "grad_norm": 3.5593841075897217, "learning_rate": 2.8102760616250884e-05, "loss": 1.7604, "step": 84000 }, { "epoch": 2.2027580094366677, "grad_norm": 4.380959510803223, "learning_rate": 2.7972419905633328e-05, "loss": 1.7688, "step": 84500 }, { "epoch": 2.215792080498423, "grad_norm": 8.921208381652832, "learning_rate": 2.7842079195015775e-05, "loss": 1.7414, "step": 85000 }, { "epoch": 2.2288261515601784, "grad_norm": 4.622405529022217, "learning_rate": 2.771173848439822e-05, "loss": 1.7623, "step": 85500 }, { "epoch": 2.2418602226219337, "grad_norm": 27.651330947875977, "learning_rate": 2.7581397773780666e-05, "loss": 1.7172, "step": 86000 }, { "epoch": 2.254894293683689, "grad_norm": 4.457437992095947, "learning_rate": 2.745105706316311e-05, "loss": 1.7444, "step": 86500 }, { "epoch": 2.2679283647454445, "grad_norm": 5.793179988861084, "learning_rate": 2.7320716352545556e-05, "loss": 1.7386, "step": 87000 }, { "epoch": 2.2809624358072, "grad_norm": 3.3070342540740967, "learning_rate": 2.7190375641928e-05, "loss": 1.7066, "step": 87500 }, { "epoch": 2.2939965068689556, "grad_norm": 4.475468158721924, "learning_rate": 2.7060034931310447e-05, "loss": 1.7212, "step": 88000 }, { "epoch": 2.307030577930711, "grad_norm": 4.4862847328186035, "learning_rate": 2.692969422069289e-05, "loss": 1.7265, "step": 88500 }, { "epoch": 2.3200646489924663, "grad_norm": 3.608401298522949, "learning_rate": 2.6799353510075337e-05, "loss": 1.7324, "step": 89000 }, { "epoch": 2.3330987200542217, "grad_norm": 4.134375095367432, "learning_rate": 2.666901279945778e-05, "loss": 1.6866, "step": 89500 }, { "epoch": 2.346132791115977, "grad_norm": 4.030068874359131, "learning_rate": 2.6538672088840228e-05, "loss": 1.6955, "step": 90000 }, { "epoch": 2.3591668621777324, "grad_norm": 7.18529748916626, "learning_rate": 2.640833137822267e-05, "loss": 1.7119, "step": 90500 }, { "epoch": 2.3722009332394878, "grad_norm": 3.633330821990967, "learning_rate": 2.6277990667605122e-05, "loss": 1.737, "step": 91000 }, { "epoch": 2.3852350043012436, "grad_norm": 5.056845188140869, "learning_rate": 2.614764995698757e-05, "loss": 1.7121, "step": 91500 }, { "epoch": 2.398269075362999, "grad_norm": 3.203246831893921, "learning_rate": 2.6017309246370013e-05, "loss": 1.7096, "step": 92000 }, { "epoch": 2.4113031464247543, "grad_norm": 3.830634355545044, "learning_rate": 2.588696853575246e-05, "loss": 1.7047, "step": 92500 }, { "epoch": 2.4243372174865097, "grad_norm": 3.5095880031585693, "learning_rate": 2.5756627825134903e-05, "loss": 1.6875, "step": 93000 }, { "epoch": 2.437371288548265, "grad_norm": 13.952683448791504, "learning_rate": 2.562628711451735e-05, "loss": 1.727, "step": 93500 }, { "epoch": 2.450405359610021, "grad_norm": 4.152392387390137, "learning_rate": 2.5495946403899794e-05, "loss": 1.674, "step": 94000 }, { "epoch": 2.463439430671776, "grad_norm": 28.32253074645996, "learning_rate": 2.536560569328224e-05, "loss": 1.6635, "step": 94500 }, { "epoch": 2.4764735017335315, "grad_norm": 37.356117248535156, "learning_rate": 2.5235264982664684e-05, "loss": 1.6936, "step": 95000 }, { "epoch": 2.489507572795287, "grad_norm": 11.425202369689941, "learning_rate": 2.510492427204713e-05, "loss": 1.6635, "step": 95500 }, { "epoch": 2.5025416438570423, "grad_norm": 3.700289726257324, "learning_rate": 2.497458356142958e-05, "loss": 1.7051, "step": 96000 }, { "epoch": 2.5155757149187976, "grad_norm": 16.234506607055664, "learning_rate": 2.4844242850812025e-05, "loss": 1.676, "step": 96500 }, { "epoch": 2.528609785980553, "grad_norm": 3.4809882640838623, "learning_rate": 2.471390214019447e-05, "loss": 1.6795, "step": 97000 }, { "epoch": 2.5416438570423088, "grad_norm": 4.420949459075928, "learning_rate": 2.4583561429576916e-05, "loss": 1.6926, "step": 97500 }, { "epoch": 2.554677928104064, "grad_norm": 24.02429962158203, "learning_rate": 2.445322071895936e-05, "loss": 1.6479, "step": 98000 }, { "epoch": 2.5677119991658195, "grad_norm": 4.912638187408447, "learning_rate": 2.4322880008341807e-05, "loss": 1.6598, "step": 98500 }, { "epoch": 2.580746070227575, "grad_norm": 22.43536376953125, "learning_rate": 2.419253929772425e-05, "loss": 1.6532, "step": 99000 }, { "epoch": 2.59378014128933, "grad_norm": 4.317445755004883, "learning_rate": 2.40621985871067e-05, "loss": 1.6554, "step": 99500 }, { "epoch": 2.606814212351086, "grad_norm": 14.290596008300781, "learning_rate": 2.3931857876489144e-05, "loss": 1.6265, "step": 100000 }, { "epoch": 2.619848283412841, "grad_norm": 4.331130504608154, "learning_rate": 2.380151716587159e-05, "loss": 1.6706, "step": 100500 }, { "epoch": 2.6328823544745967, "grad_norm": 7.016634941101074, "learning_rate": 2.3671176455254035e-05, "loss": 1.649, "step": 101000 }, { "epoch": 2.645916425536352, "grad_norm": 5.680657386779785, "learning_rate": 2.3540835744636482e-05, "loss": 1.6126, "step": 101500 }, { "epoch": 2.6589504965981074, "grad_norm": 4.337413311004639, "learning_rate": 2.3410495034018925e-05, "loss": 1.6317, "step": 102000 }, { "epoch": 2.671984567659863, "grad_norm": 20.466943740844727, "learning_rate": 2.3280154323401372e-05, "loss": 1.6348, "step": 102500 }, { "epoch": 2.685018638721618, "grad_norm": 4.808228969573975, "learning_rate": 2.314981361278382e-05, "loss": 1.5979, "step": 103000 }, { "epoch": 2.698052709783374, "grad_norm": 4.296200752258301, "learning_rate": 2.3019472902166263e-05, "loss": 1.6281, "step": 103500 }, { "epoch": 2.7110867808451293, "grad_norm": 32.726078033447266, "learning_rate": 2.288913219154871e-05, "loss": 1.5966, "step": 104000 }, { "epoch": 2.7241208519068847, "grad_norm": 4.275684833526611, "learning_rate": 2.2758791480931154e-05, "loss": 1.6108, "step": 104500 }, { "epoch": 2.73715492296864, "grad_norm": 3.496002197265625, "learning_rate": 2.26284507703136e-05, "loss": 1.6026, "step": 105000 }, { "epoch": 2.7501889940303954, "grad_norm": 9.172469139099121, "learning_rate": 2.2498110059696044e-05, "loss": 1.631, "step": 105500 }, { "epoch": 2.7632230650921508, "grad_norm": 16.79161834716797, "learning_rate": 2.2367769349078495e-05, "loss": 1.6357, "step": 106000 }, { "epoch": 2.776257136153906, "grad_norm": 14.198761940002441, "learning_rate": 2.2237428638460938e-05, "loss": 1.6423, "step": 106500 }, { "epoch": 2.789291207215662, "grad_norm": 5.301556587219238, "learning_rate": 2.2107087927843385e-05, "loss": 1.6125, "step": 107000 }, { "epoch": 2.8023252782774173, "grad_norm": 26.385272979736328, "learning_rate": 2.197674721722583e-05, "loss": 1.6334, "step": 107500 }, { "epoch": 2.8153593493391726, "grad_norm": 9.757530212402344, "learning_rate": 2.1846406506608276e-05, "loss": 1.586, "step": 108000 }, { "epoch": 2.828393420400928, "grad_norm": 20.982559204101562, "learning_rate": 2.171606579599072e-05, "loss": 1.6066, "step": 108500 }, { "epoch": 2.8414274914626834, "grad_norm": 3.695369243621826, "learning_rate": 2.1585725085373166e-05, "loss": 1.6307, "step": 109000 }, { "epoch": 2.8544615625244387, "grad_norm": 14.864655494689941, "learning_rate": 2.1455384374755613e-05, "loss": 1.5847, "step": 109500 }, { "epoch": 2.867495633586194, "grad_norm": 3.9043121337890625, "learning_rate": 2.1325043664138057e-05, "loss": 1.5904, "step": 110000 }, { "epoch": 2.88052970464795, "grad_norm": 4.432578086853027, "learning_rate": 2.1194702953520504e-05, "loss": 1.6037, "step": 110500 }, { "epoch": 2.8935637757097052, "grad_norm": 6.775419235229492, "learning_rate": 2.1064362242902948e-05, "loss": 1.6052, "step": 111000 }, { "epoch": 2.9065978467714606, "grad_norm": 5.090266227722168, "learning_rate": 2.0934021532285395e-05, "loss": 1.5814, "step": 111500 }, { "epoch": 2.919631917833216, "grad_norm": 7.805962085723877, "learning_rate": 2.0803680821667838e-05, "loss": 1.6016, "step": 112000 }, { "epoch": 2.9326659888949713, "grad_norm": 6.22263240814209, "learning_rate": 2.067334011105029e-05, "loss": 1.564, "step": 112500 }, { "epoch": 2.945700059956727, "grad_norm": 23.055776596069336, "learning_rate": 2.0542999400432732e-05, "loss": 1.555, "step": 113000 }, { "epoch": 2.958734131018482, "grad_norm": 20.39297866821289, "learning_rate": 2.041265868981518e-05, "loss": 1.5306, "step": 113500 }, { "epoch": 2.971768202080238, "grad_norm": 5.571432113647461, "learning_rate": 2.0282317979197623e-05, "loss": 1.577, "step": 114000 }, { "epoch": 2.984802273141993, "grad_norm": 15.77784252166748, "learning_rate": 2.015197726858007e-05, "loss": 1.6165, "step": 114500 }, { "epoch": 2.9978363442037486, "grad_norm": 4.388451099395752, "learning_rate": 2.0021636557962513e-05, "loss": 1.544, "step": 115000 }, { "epoch": 3.010870415265504, "grad_norm": 2.794743776321411, "learning_rate": 1.989129584734496e-05, "loss": 1.561, "step": 115500 }, { "epoch": 3.0239044863272593, "grad_norm": 38.998512268066406, "learning_rate": 1.9760955136727407e-05, "loss": 1.5344, "step": 116000 }, { "epoch": 3.036938557389015, "grad_norm": 10.872420310974121, "learning_rate": 1.9630614426109854e-05, "loss": 1.5191, "step": 116500 }, { "epoch": 3.0499726284507704, "grad_norm": 4.433558464050293, "learning_rate": 1.9500273715492298e-05, "loss": 1.5093, "step": 117000 }, { "epoch": 3.063006699512526, "grad_norm": 3.8315622806549072, "learning_rate": 1.9369933004874745e-05, "loss": 1.5344, "step": 117500 }, { "epoch": 3.076040770574281, "grad_norm": 24.29652976989746, "learning_rate": 1.923959229425719e-05, "loss": 1.5557, "step": 118000 }, { "epoch": 3.0890748416360365, "grad_norm": 4.876192092895508, "learning_rate": 1.9109251583639636e-05, "loss": 1.5381, "step": 118500 }, { "epoch": 3.102108912697792, "grad_norm": 4.730300426483154, "learning_rate": 1.897891087302208e-05, "loss": 1.4977, "step": 119000 }, { "epoch": 3.1151429837595472, "grad_norm": 15.773541450500488, "learning_rate": 1.8848570162404526e-05, "loss": 1.5262, "step": 119500 }, { "epoch": 3.128177054821303, "grad_norm": 3.4133520126342773, "learning_rate": 1.8718229451786973e-05, "loss": 1.5142, "step": 120000 }, { "epoch": 3.1412111258830584, "grad_norm": 4.271722316741943, "learning_rate": 1.8587888741169417e-05, "loss": 1.5108, "step": 120500 }, { "epoch": 3.1542451969448138, "grad_norm": 4.478157997131348, "learning_rate": 1.8457548030551864e-05, "loss": 1.5111, "step": 121000 }, { "epoch": 3.167279268006569, "grad_norm": 6.74271821975708, "learning_rate": 1.8327207319934307e-05, "loss": 1.5359, "step": 121500 }, { "epoch": 3.1803133390683245, "grad_norm": 10.100676536560059, "learning_rate": 1.8196866609316754e-05, "loss": 1.4856, "step": 122000 }, { "epoch": 3.19334741013008, "grad_norm": 5.077882289886475, "learning_rate": 1.8066525898699198e-05, "loss": 1.5054, "step": 122500 }, { "epoch": 3.2063814811918356, "grad_norm": 4.155623912811279, "learning_rate": 1.793618518808165e-05, "loss": 1.5089, "step": 123000 }, { "epoch": 3.219415552253591, "grad_norm": 3.6238481998443604, "learning_rate": 1.7805844477464092e-05, "loss": 1.4933, "step": 123500 }, { "epoch": 3.2324496233153464, "grad_norm": 4.119343280792236, "learning_rate": 1.767550376684654e-05, "loss": 1.5215, "step": 124000 }, { "epoch": 3.2454836943771017, "grad_norm": 3.789219379425049, "learning_rate": 1.7545163056228983e-05, "loss": 1.4686, "step": 124500 }, { "epoch": 3.258517765438857, "grad_norm": 23.477462768554688, "learning_rate": 1.741482234561143e-05, "loss": 1.4928, "step": 125000 }, { "epoch": 3.2715518365006124, "grad_norm": 34.81294250488281, "learning_rate": 1.7284481634993873e-05, "loss": 1.5147, "step": 125500 }, { "epoch": 3.2845859075623682, "grad_norm": 3.911698579788208, "learning_rate": 1.715414092437632e-05, "loss": 1.498, "step": 126000 }, { "epoch": 3.2976199786241236, "grad_norm": 17.540603637695312, "learning_rate": 1.7023800213758767e-05, "loss": 1.5224, "step": 126500 }, { "epoch": 3.310654049685879, "grad_norm": 5.028404712677002, "learning_rate": 1.689345950314121e-05, "loss": 1.4782, "step": 127000 }, { "epoch": 3.3236881207476343, "grad_norm": 11.53537654876709, "learning_rate": 1.6763118792523658e-05, "loss": 1.4837, "step": 127500 }, { "epoch": 3.3367221918093897, "grad_norm": 3.8512253761291504, "learning_rate": 1.66327780819061e-05, "loss": 1.4528, "step": 128000 }, { "epoch": 3.349756262871145, "grad_norm": 3.932035207748413, "learning_rate": 1.650243737128855e-05, "loss": 1.5026, "step": 128500 }, { "epoch": 3.3627903339329004, "grad_norm": 4.325034141540527, "learning_rate": 1.6372096660670992e-05, "loss": 1.4717, "step": 129000 }, { "epoch": 3.375824404994656, "grad_norm": 7.62436580657959, "learning_rate": 1.6241755950053442e-05, "loss": 1.4677, "step": 129500 }, { "epoch": 3.3888584760564116, "grad_norm": 4.481779098510742, "learning_rate": 1.6111415239435886e-05, "loss": 1.487, "step": 130000 }, { "epoch": 3.401892547118167, "grad_norm": 4.1522536277771, "learning_rate": 1.5981074528818333e-05, "loss": 1.4724, "step": 130500 }, { "epoch": 3.4149266181799223, "grad_norm": 22.38875961303711, "learning_rate": 1.5850733818200777e-05, "loss": 1.4694, "step": 131000 }, { "epoch": 3.4279606892416776, "grad_norm": 5.144596099853516, "learning_rate": 1.5720393107583224e-05, "loss": 1.4792, "step": 131500 }, { "epoch": 3.440994760303433, "grad_norm": 4.0159912109375, "learning_rate": 1.5590052396965667e-05, "loss": 1.4535, "step": 132000 }, { "epoch": 3.454028831365189, "grad_norm": 4.164160251617432, "learning_rate": 1.5459711686348114e-05, "loss": 1.4516, "step": 132500 }, { "epoch": 3.467062902426944, "grad_norm": 4.1465349197387695, "learning_rate": 1.532937097573056e-05, "loss": 1.4383, "step": 133000 }, { "epoch": 3.4800969734886995, "grad_norm": 5.3553466796875, "learning_rate": 1.5199030265113007e-05, "loss": 1.4588, "step": 133500 }, { "epoch": 3.493131044550455, "grad_norm": 4.2381110191345215, "learning_rate": 1.5068689554495452e-05, "loss": 1.4607, "step": 134000 }, { "epoch": 3.5061651156122102, "grad_norm": 4.227059364318848, "learning_rate": 1.4938348843877897e-05, "loss": 1.4855, "step": 134500 }, { "epoch": 3.5191991866739656, "grad_norm": 4.23318338394165, "learning_rate": 1.4808008133260342e-05, "loss": 1.4452, "step": 135000 }, { "epoch": 3.5322332577357214, "grad_norm": 4.2789788246154785, "learning_rate": 1.4677667422642788e-05, "loss": 1.4471, "step": 135500 }, { "epoch": 3.5452673287974767, "grad_norm": 14.372062683105469, "learning_rate": 1.4547326712025236e-05, "loss": 1.4663, "step": 136000 }, { "epoch": 3.558301399859232, "grad_norm": 4.719635963439941, "learning_rate": 1.4416986001407682e-05, "loss": 1.4628, "step": 136500 }, { "epoch": 3.5713354709209875, "grad_norm": 4.603359222412109, "learning_rate": 1.4286645290790127e-05, "loss": 1.4464, "step": 137000 }, { "epoch": 3.584369541982743, "grad_norm": 4.167656421661377, "learning_rate": 1.4156304580172572e-05, "loss": 1.4816, "step": 137500 }, { "epoch": 3.597403613044498, "grad_norm": 3.9802513122558594, "learning_rate": 1.4025963869555018e-05, "loss": 1.4404, "step": 138000 }, { "epoch": 3.6104376841062535, "grad_norm": 4.956002235412598, "learning_rate": 1.3895623158937463e-05, "loss": 1.4463, "step": 138500 }, { "epoch": 3.6234717551680093, "grad_norm": 4.82868766784668, "learning_rate": 1.3765282448319908e-05, "loss": 1.429, "step": 139000 }, { "epoch": 3.6365058262297647, "grad_norm": 9.303766250610352, "learning_rate": 1.3634941737702355e-05, "loss": 1.4492, "step": 139500 }, { "epoch": 3.64953989729152, "grad_norm": 4.728789806365967, "learning_rate": 1.35046010270848e-05, "loss": 1.4599, "step": 140000 }, { "epoch": 3.6625739683532754, "grad_norm": 4.169735431671143, "learning_rate": 1.3374260316467246e-05, "loss": 1.4346, "step": 140500 }, { "epoch": 3.675608039415031, "grad_norm": 4.134032249450684, "learning_rate": 1.3243919605849691e-05, "loss": 1.426, "step": 141000 }, { "epoch": 3.6886421104767866, "grad_norm": 7.31259822845459, "learning_rate": 1.3113578895232136e-05, "loss": 1.4489, "step": 141500 }, { "epoch": 3.7016761815385415, "grad_norm": 41.01179885864258, "learning_rate": 1.2983238184614582e-05, "loss": 1.4594, "step": 142000 }, { "epoch": 3.7147102526002973, "grad_norm": 4.123907566070557, "learning_rate": 1.2852897473997027e-05, "loss": 1.4445, "step": 142500 }, { "epoch": 3.7277443236620527, "grad_norm": 12.47805404663086, "learning_rate": 1.2722556763379476e-05, "loss": 1.416, "step": 143000 }, { "epoch": 3.740778394723808, "grad_norm": 4.795707702636719, "learning_rate": 1.2592216052761921e-05, "loss": 1.449, "step": 143500 }, { "epoch": 3.7538124657855634, "grad_norm": 3.754809856414795, "learning_rate": 1.2461875342144366e-05, "loss": 1.4353, "step": 144000 }, { "epoch": 3.7668465368473187, "grad_norm": 4.847051620483398, "learning_rate": 1.2331534631526812e-05, "loss": 1.4081, "step": 144500 }, { "epoch": 3.7798806079090745, "grad_norm": 5.240978240966797, "learning_rate": 1.2201193920909257e-05, "loss": 1.4497, "step": 145000 }, { "epoch": 3.79291467897083, "grad_norm": 4.278606414794922, "learning_rate": 1.2070853210291704e-05, "loss": 1.4296, "step": 145500 }, { "epoch": 3.8059487500325853, "grad_norm": 24.963735580444336, "learning_rate": 1.194051249967415e-05, "loss": 1.4273, "step": 146000 }, { "epoch": 3.8189828210943406, "grad_norm": 3.3722941875457764, "learning_rate": 1.1810171789056595e-05, "loss": 1.3939, "step": 146500 }, { "epoch": 3.832016892156096, "grad_norm": 3.9926798343658447, "learning_rate": 1.1679831078439042e-05, "loss": 1.4149, "step": 147000 }, { "epoch": 3.8450509632178513, "grad_norm": 7.269467353820801, "learning_rate": 1.1549490367821487e-05, "loss": 1.4004, "step": 147500 }, { "epoch": 3.8580850342796067, "grad_norm": 5.596455097198486, "learning_rate": 1.1419149657203932e-05, "loss": 1.4133, "step": 148000 }, { "epoch": 3.8711191053413625, "grad_norm": 5.81203556060791, "learning_rate": 1.1288808946586377e-05, "loss": 1.4313, "step": 148500 }, { "epoch": 3.884153176403118, "grad_norm": 4.842901229858398, "learning_rate": 1.1158468235968823e-05, "loss": 1.4139, "step": 149000 }, { "epoch": 3.897187247464873, "grad_norm": 3.6464438438415527, "learning_rate": 1.1028127525351268e-05, "loss": 1.4189, "step": 149500 }, { "epoch": 3.9102213185266286, "grad_norm": 5.625620365142822, "learning_rate": 1.0897786814733713e-05, "loss": 1.4119, "step": 150000 }, { "epoch": 3.923255389588384, "grad_norm": 3.84614896774292, "learning_rate": 1.076744610411616e-05, "loss": 1.4094, "step": 150500 }, { "epoch": 3.9362894606501397, "grad_norm": 5.183802127838135, "learning_rate": 1.0637105393498606e-05, "loss": 1.4157, "step": 151000 }, { "epoch": 3.9493235317118947, "grad_norm": 4.6199140548706055, "learning_rate": 1.0506764682881051e-05, "loss": 1.4067, "step": 151500 }, { "epoch": 3.9623576027736505, "grad_norm": 5.642277717590332, "learning_rate": 1.0376423972263498e-05, "loss": 1.3994, "step": 152000 }, { "epoch": 3.975391673835406, "grad_norm": 4.15669584274292, "learning_rate": 1.0246083261645943e-05, "loss": 1.4304, "step": 152500 }, { "epoch": 3.988425744897161, "grad_norm": 4.729000568389893, "learning_rate": 1.0115742551028389e-05, "loss": 1.3979, "step": 153000 }, { "epoch": 4.001459815958917, "grad_norm": 3.2223262786865234, "learning_rate": 9.985401840410834e-06, "loss": 1.3897, "step": 153500 }, { "epoch": 4.014493887020672, "grad_norm": 4.223217964172363, "learning_rate": 9.855061129793281e-06, "loss": 1.3567, "step": 154000 }, { "epoch": 4.027527958082428, "grad_norm": 3.201354742050171, "learning_rate": 9.724720419175726e-06, "loss": 1.3796, "step": 154500 }, { "epoch": 4.040562029144183, "grad_norm": 31.99419593811035, "learning_rate": 9.594379708558171e-06, "loss": 1.3475, "step": 155000 }, { "epoch": 4.053596100205938, "grad_norm": 19.76371192932129, "learning_rate": 9.464038997940618e-06, "loss": 1.3278, "step": 155500 }, { "epoch": 4.066630171267693, "grad_norm": 3.462979316711426, "learning_rate": 9.333698287323064e-06, "loss": 1.3632, "step": 156000 }, { "epoch": 4.079664242329449, "grad_norm": 27.641897201538086, "learning_rate": 9.203357576705509e-06, "loss": 1.3203, "step": 156500 }, { "epoch": 4.092698313391205, "grad_norm": 3.934295654296875, "learning_rate": 9.073016866087954e-06, "loss": 1.3793, "step": 157000 }, { "epoch": 4.10573238445296, "grad_norm": 3.3237240314483643, "learning_rate": 8.9426761554704e-06, "loss": 1.3375, "step": 157500 }, { "epoch": 4.118766455514716, "grad_norm": 5.202388286590576, "learning_rate": 8.812335444852845e-06, "loss": 1.3852, "step": 158000 }, { "epoch": 4.131800526576471, "grad_norm": 28.595399856567383, "learning_rate": 8.68199473423529e-06, "loss": 1.3644, "step": 158500 }, { "epoch": 4.144834597638226, "grad_norm": 3.2022364139556885, "learning_rate": 8.551654023617737e-06, "loss": 1.3734, "step": 159000 }, { "epoch": 4.157868668699982, "grad_norm": 4.231220245361328, "learning_rate": 8.421313313000183e-06, "loss": 1.349, "step": 159500 }, { "epoch": 4.170902739761737, "grad_norm": 4.515881538391113, "learning_rate": 8.290972602382628e-06, "loss": 1.3392, "step": 160000 }, { "epoch": 4.183936810823493, "grad_norm": 3.6497957706451416, "learning_rate": 8.160631891765075e-06, "loss": 1.3495, "step": 160500 }, { "epoch": 4.196970881885248, "grad_norm": 16.680282592773438, "learning_rate": 8.03029118114752e-06, "loss": 1.3566, "step": 161000 }, { "epoch": 4.210004952947004, "grad_norm": 18.566879272460938, "learning_rate": 7.899950470529966e-06, "loss": 1.3248, "step": 161500 }, { "epoch": 4.2230390240087585, "grad_norm": 3.9700820446014404, "learning_rate": 7.769609759912413e-06, "loss": 1.3767, "step": 162000 }, { "epoch": 4.236073095070514, "grad_norm": 42.5576286315918, "learning_rate": 7.639269049294858e-06, "loss": 1.3346, "step": 162500 }, { "epoch": 4.24910716613227, "grad_norm": 7.013011455535889, "learning_rate": 7.508928338677302e-06, "loss": 1.3752, "step": 163000 }, { "epoch": 4.262141237194025, "grad_norm": 12.351140975952148, "learning_rate": 7.3785876280597476e-06, "loss": 1.3213, "step": 163500 }, { "epoch": 4.275175308255781, "grad_norm": 48.051631927490234, "learning_rate": 7.2482469174421946e-06, "loss": 1.3453, "step": 164000 }, { "epoch": 4.288209379317536, "grad_norm": 3.8004846572875977, "learning_rate": 7.11790620682464e-06, "loss": 1.3231, "step": 164500 }, { "epoch": 4.301243450379292, "grad_norm": 3.8865389823913574, "learning_rate": 6.987565496207085e-06, "loss": 1.3353, "step": 165000 }, { "epoch": 4.3142775214410465, "grad_norm": 4.471733093261719, "learning_rate": 6.857224785589532e-06, "loss": 1.3411, "step": 165500 }, { "epoch": 4.327311592502802, "grad_norm": 4.856067657470703, "learning_rate": 6.7268840749719775e-06, "loss": 1.3254, "step": 166000 }, { "epoch": 4.340345663564558, "grad_norm": 4.089067459106445, "learning_rate": 6.596543364354423e-06, "loss": 1.3676, "step": 166500 }, { "epoch": 4.353379734626313, "grad_norm": 4.231725215911865, "learning_rate": 6.466202653736869e-06, "loss": 1.3331, "step": 167000 }, { "epoch": 4.366413805688069, "grad_norm": 4.140297889709473, "learning_rate": 6.335861943119314e-06, "loss": 1.3338, "step": 167500 }, { "epoch": 4.379447876749824, "grad_norm": 3.1667165756225586, "learning_rate": 6.2055212325017595e-06, "loss": 1.3658, "step": 168000 }, { "epoch": 4.3924819478115795, "grad_norm": 4.982083797454834, "learning_rate": 6.075180521884206e-06, "loss": 1.3098, "step": 168500 }, { "epoch": 4.405516018873335, "grad_norm": 19.951147079467773, "learning_rate": 5.944839811266651e-06, "loss": 1.315, "step": 169000 }, { "epoch": 4.41855008993509, "grad_norm": 5.146533489227295, "learning_rate": 5.814499100649097e-06, "loss": 1.3322, "step": 169500 }, { "epoch": 4.431584160996846, "grad_norm": 4.29327917098999, "learning_rate": 5.684158390031543e-06, "loss": 1.3165, "step": 170000 }, { "epoch": 4.444618232058601, "grad_norm": 4.86635160446167, "learning_rate": 5.5538176794139886e-06, "loss": 1.3266, "step": 170500 }, { "epoch": 4.457652303120357, "grad_norm": 5.066024303436279, "learning_rate": 5.423476968796435e-06, "loss": 1.3201, "step": 171000 }, { "epoch": 4.470686374182112, "grad_norm": 5.111464500427246, "learning_rate": 5.293136258178879e-06, "loss": 1.3188, "step": 171500 }, { "epoch": 4.4837204452438675, "grad_norm": 4.428502082824707, "learning_rate": 5.162795547561325e-06, "loss": 1.3162, "step": 172000 }, { "epoch": 4.496754516305623, "grad_norm": 2.84608793258667, "learning_rate": 5.0324548369437715e-06, "loss": 1.3052, "step": 172500 }, { "epoch": 4.509788587367378, "grad_norm": 4.425991058349609, "learning_rate": 4.902114126326217e-06, "loss": 1.3252, "step": 173000 }, { "epoch": 4.522822658429134, "grad_norm": 21.735198974609375, "learning_rate": 4.771773415708663e-06, "loss": 1.3333, "step": 173500 }, { "epoch": 4.535856729490889, "grad_norm": 4.519357204437256, "learning_rate": 4.641432705091108e-06, "loss": 1.3115, "step": 174000 }, { "epoch": 4.548890800552645, "grad_norm": 25.662084579467773, "learning_rate": 4.511091994473554e-06, "loss": 1.3134, "step": 174500 }, { "epoch": 4.5619248716144, "grad_norm": 3.4979422092437744, "learning_rate": 4.3807512838560005e-06, "loss": 1.3202, "step": 175000 }, { "epoch": 4.574958942676155, "grad_norm": 4.444785118103027, "learning_rate": 4.250410573238446e-06, "loss": 1.3174, "step": 175500 }, { "epoch": 4.587993013737911, "grad_norm": 6.712714672088623, "learning_rate": 4.120069862620891e-06, "loss": 1.3343, "step": 176000 }, { "epoch": 4.601027084799666, "grad_norm": 4.870098114013672, "learning_rate": 3.9897291520033364e-06, "loss": 1.3312, "step": 176500 }, { "epoch": 4.614061155861422, "grad_norm": 4.5157928466796875, "learning_rate": 3.859388441385783e-06, "loss": 1.3133, "step": 177000 }, { "epoch": 4.627095226923177, "grad_norm": 3.297917366027832, "learning_rate": 3.7290477307682287e-06, "loss": 1.34, "step": 177500 }, { "epoch": 4.640129297984933, "grad_norm": 5.5820698738098145, "learning_rate": 3.598707020150674e-06, "loss": 1.2856, "step": 178000 }, { "epoch": 4.653163369046688, "grad_norm": 68.55699157714844, "learning_rate": 3.4683663095331198e-06, "loss": 1.3293, "step": 178500 }, { "epoch": 4.666197440108443, "grad_norm": 4.395013332366943, "learning_rate": 3.338025598915565e-06, "loss": 1.3156, "step": 179000 }, { "epoch": 4.679231511170199, "grad_norm": 4.131389141082764, "learning_rate": 3.2076848882980112e-06, "loss": 1.3349, "step": 179500 }, { "epoch": 4.692265582231954, "grad_norm": 3.2444746494293213, "learning_rate": 3.077344177680457e-06, "loss": 1.2882, "step": 180000 }, { "epoch": 4.70529965329371, "grad_norm": 6.894190788269043, "learning_rate": 2.9470034670629027e-06, "loss": 1.3064, "step": 180500 }, { "epoch": 4.718333724355465, "grad_norm": 4.13007926940918, "learning_rate": 2.816662756445348e-06, "loss": 1.3319, "step": 181000 }, { "epoch": 4.731367795417221, "grad_norm": 4.010223388671875, "learning_rate": 2.686322045827794e-06, "loss": 1.3289, "step": 181500 }, { "epoch": 4.7444018664789755, "grad_norm": 5.212350845336914, "learning_rate": 2.55598133521024e-06, "loss": 1.3052, "step": 182000 }, { "epoch": 4.757435937540731, "grad_norm": 4.112293243408203, "learning_rate": 2.4256406245926856e-06, "loss": 1.3178, "step": 182500 }, { "epoch": 4.770470008602487, "grad_norm": 4.711720943450928, "learning_rate": 2.295299913975131e-06, "loss": 1.3017, "step": 183000 }, { "epoch": 4.783504079664242, "grad_norm": 4.1918439865112305, "learning_rate": 2.1649592033575766e-06, "loss": 1.3368, "step": 183500 }, { "epoch": 4.796538150725998, "grad_norm": 4.53779411315918, "learning_rate": 2.0346184927400227e-06, "loss": 1.3103, "step": 184000 }, { "epoch": 4.809572221787754, "grad_norm": 2.9776086807250977, "learning_rate": 1.9042777821224683e-06, "loss": 1.3325, "step": 184500 }, { "epoch": 4.822606292849509, "grad_norm": 5.410048007965088, "learning_rate": 1.773937071504914e-06, "loss": 1.324, "step": 185000 }, { "epoch": 4.835640363911264, "grad_norm": 5.260219573974609, "learning_rate": 1.6435963608873595e-06, "loss": 1.3339, "step": 185500 }, { "epoch": 4.848674434973019, "grad_norm": 5.610768795013428, "learning_rate": 1.5132556502698054e-06, "loss": 1.2985, "step": 186000 }, { "epoch": 4.861708506034775, "grad_norm": 6.287191390991211, "learning_rate": 1.382914939652251e-06, "loss": 1.2973, "step": 186500 }, { "epoch": 4.87474257709653, "grad_norm": 32.12895202636719, "learning_rate": 1.2525742290346967e-06, "loss": 1.2914, "step": 187000 }, { "epoch": 4.887776648158286, "grad_norm": 15.296839714050293, "learning_rate": 1.1222335184171426e-06, "loss": 1.3231, "step": 187500 }, { "epoch": 4.900810719220042, "grad_norm": 4.650936126708984, "learning_rate": 9.918928077995881e-07, "loss": 1.2902, "step": 188000 }, { "epoch": 4.9138447902817965, "grad_norm": 25.2452335357666, "learning_rate": 8.615520971820338e-07, "loss": 1.2964, "step": 188500 }, { "epoch": 4.926878861343552, "grad_norm": 4.3756890296936035, "learning_rate": 7.312113865644796e-07, "loss": 1.3137, "step": 189000 }, { "epoch": 4.939912932405307, "grad_norm": 32.994510650634766, "learning_rate": 6.008706759469253e-07, "loss": 1.3033, "step": 189500 }, { "epoch": 4.952947003467063, "grad_norm": 3.0575180053710938, "learning_rate": 4.70529965329371e-07, "loss": 1.2992, "step": 190000 }, { "epoch": 4.965981074528818, "grad_norm": 4.4134135246276855, "learning_rate": 3.401892547118167e-07, "loss": 1.2839, "step": 190500 }, { "epoch": 4.979015145590574, "grad_norm": 40.072750091552734, "learning_rate": 2.0984854409426243e-07, "loss": 1.3057, "step": 191000 }, { "epoch": 4.99204921665233, "grad_norm": 19.755613327026367, "learning_rate": 7.950783347670812e-08, "loss": 1.2946, "step": 191500 } ], "logging_steps": 500, "max_steps": 191805, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.066567392204288e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }