|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9981024667931688, |
|
"eval_steps": 500, |
|
"global_step": 1580, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01265022137887413, |
|
"grad_norm": 13.099321365356445, |
|
"learning_rate": 6.329113924050633e-08, |
|
"loss": 2.5391, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02530044275774826, |
|
"grad_norm": 27.764171600341797, |
|
"learning_rate": 1.2658227848101266e-07, |
|
"loss": 2.5382, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03795066413662239, |
|
"grad_norm": 13.548388481140137, |
|
"learning_rate": 1.89873417721519e-07, |
|
"loss": 2.5232, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05060088551549652, |
|
"grad_norm": 12.454045295715332, |
|
"learning_rate": 2.5316455696202533e-07, |
|
"loss": 2.4481, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.06325110689437065, |
|
"grad_norm": 8.77104377746582, |
|
"learning_rate": 3.1645569620253163e-07, |
|
"loss": 2.3842, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07590132827324478, |
|
"grad_norm": 6.88944149017334, |
|
"learning_rate": 3.79746835443038e-07, |
|
"loss": 2.2827, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.08855154965211891, |
|
"grad_norm": 6.3905930519104, |
|
"learning_rate": 4.4303797468354424e-07, |
|
"loss": 2.2146, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.10120177103099304, |
|
"grad_norm": 8.130488395690918, |
|
"learning_rate": 5.063291139240507e-07, |
|
"loss": 2.0978, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.11385199240986717, |
|
"grad_norm": 21.30768394470215, |
|
"learning_rate": 5.69620253164557e-07, |
|
"loss": 2.0624, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1265022137887413, |
|
"grad_norm": 6.106363296508789, |
|
"learning_rate": 6.329113924050633e-07, |
|
"loss": 2.0176, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.13915243516761544, |
|
"grad_norm": 6.555318832397461, |
|
"learning_rate": 6.962025316455696e-07, |
|
"loss": 1.9714, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.15180265654648956, |
|
"grad_norm": 6.091899394989014, |
|
"learning_rate": 7.59493670886076e-07, |
|
"loss": 1.8976, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.1644528779253637, |
|
"grad_norm": 8.193241119384766, |
|
"learning_rate": 8.227848101265823e-07, |
|
"loss": 1.8571, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.17710309930423782, |
|
"grad_norm": 7.589028835296631, |
|
"learning_rate": 8.860759493670885e-07, |
|
"loss": 1.845, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.18975332068311196, |
|
"grad_norm": 7.830214023590088, |
|
"learning_rate": 9.493670886075948e-07, |
|
"loss": 1.8197, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.20240354206198607, |
|
"grad_norm": 6.8579535484313965, |
|
"learning_rate": 9.99995119100718e-07, |
|
"loss": 1.8233, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.21505376344086022, |
|
"grad_norm": 6.225603103637695, |
|
"learning_rate": 9.998242976313776e-07, |
|
"loss": 1.7624, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.22770398481973433, |
|
"grad_norm": 7.000970363616943, |
|
"learning_rate": 9.994095264822903e-07, |
|
"loss": 1.7696, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.24035420619860848, |
|
"grad_norm": 10.97808837890625, |
|
"learning_rate": 9.987510080911721e-07, |
|
"loss": 1.7406, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2530044275774826, |
|
"grad_norm": 6.758321762084961, |
|
"learning_rate": 9.97849063861667e-07, |
|
"loss": 1.7793, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2656546489563567, |
|
"grad_norm": 12.631876945495605, |
|
"learning_rate": 9.967041340064793e-07, |
|
"loss": 1.7416, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.2783048703352309, |
|
"grad_norm": 14.51364517211914, |
|
"learning_rate": 9.953167773325195e-07, |
|
"loss": 1.7273, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.290955091714105, |
|
"grad_norm": 9.859718322753906, |
|
"learning_rate": 9.936876709681666e-07, |
|
"loss": 1.7137, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.3036053130929791, |
|
"grad_norm": 8.631091117858887, |
|
"learning_rate": 9.91817610032781e-07, |
|
"loss": 1.7117, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3162555344718533, |
|
"grad_norm": 9.83065414428711, |
|
"learning_rate": 9.897075072486298e-07, |
|
"loss": 1.7011, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3289057558507274, |
|
"grad_norm": 17.304086685180664, |
|
"learning_rate": 9.87358392495415e-07, |
|
"loss": 1.7106, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.3415559772296015, |
|
"grad_norm": 10.813619613647461, |
|
"learning_rate": 9.847714123076173e-07, |
|
"loss": 1.6754, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.35420619860847563, |
|
"grad_norm": 10.511194229125977, |
|
"learning_rate": 9.81947829314908e-07, |
|
"loss": 1.6962, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.3668564199873498, |
|
"grad_norm": 10.410032272338867, |
|
"learning_rate": 9.788890216258938e-07, |
|
"loss": 1.6962, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.3795066413662239, |
|
"grad_norm": 7.6379714012146, |
|
"learning_rate": 9.755964821555046e-07, |
|
"loss": 1.6905, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.39215686274509803, |
|
"grad_norm": 16.694063186645508, |
|
"learning_rate": 9.720718178963446e-07, |
|
"loss": 1.6889, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.40480708412397215, |
|
"grad_norm": 10.317975044250488, |
|
"learning_rate": 9.68316749134364e-07, |
|
"loss": 1.6611, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.4174573055028463, |
|
"grad_norm": 10.705253601074219, |
|
"learning_rate": 9.643331086092404e-07, |
|
"loss": 1.6706, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.43010752688172044, |
|
"grad_norm": 10.81264591217041, |
|
"learning_rate": 9.601228406198703e-07, |
|
"loss": 1.6597, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.44275774826059455, |
|
"grad_norm": 7.797952175140381, |
|
"learning_rate": 9.55688000075414e-07, |
|
"loss": 1.667, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.45540796963946867, |
|
"grad_norm": 11.544702529907227, |
|
"learning_rate": 9.510307514923536e-07, |
|
"loss": 1.6463, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.46805819101834284, |
|
"grad_norm": 8.987153053283691, |
|
"learning_rate": 9.461533679380567e-07, |
|
"loss": 1.6518, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.48070841239721696, |
|
"grad_norm": 8.551813125610352, |
|
"learning_rate": 9.410582299213572e-07, |
|
"loss": 1.645, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.49335863377609107, |
|
"grad_norm": 13.203941345214844, |
|
"learning_rate": 9.357478242306996e-07, |
|
"loss": 1.633, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.5060088551549652, |
|
"grad_norm": 60.77254104614258, |
|
"learning_rate": 9.302247427204087e-07, |
|
"loss": 1.6537, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5186590765338394, |
|
"grad_norm": 23.738460540771484, |
|
"learning_rate": 9.24491681045682e-07, |
|
"loss": 1.6392, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.5313092979127134, |
|
"grad_norm": 12.887944221496582, |
|
"learning_rate": 9.185514373469179e-07, |
|
"loss": 1.6342, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5439595192915876, |
|
"grad_norm": 11.586170196533203, |
|
"learning_rate": 9.124069108840264e-07, |
|
"loss": 1.6542, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5566097406704618, |
|
"grad_norm": 6.504334926605225, |
|
"learning_rate": 9.060611006213832e-07, |
|
"loss": 1.6413, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.5692599620493358, |
|
"grad_norm": 8.936307907104492, |
|
"learning_rate": 8.995171037641234e-07, |
|
"loss": 1.6305, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.58191018342821, |
|
"grad_norm": 10.616469383239746, |
|
"learning_rate": 8.927781142464858e-07, |
|
"loss": 1.6135, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.5945604048070842, |
|
"grad_norm": 6.386105537414551, |
|
"learning_rate": 8.858474211729469e-07, |
|
"loss": 1.6249, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.6072106261859582, |
|
"grad_norm": 7.467748165130615, |
|
"learning_rate": 8.787284072129037e-07, |
|
"loss": 1.6282, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.6198608475648324, |
|
"grad_norm": 11.522847175598145, |
|
"learning_rate": 8.714245469496931e-07, |
|
"loss": 1.633, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.6325110689437066, |
|
"grad_norm": 5.804441928863525, |
|
"learning_rate": 8.639394051847471e-07, |
|
"loss": 1.6011, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6451612903225806, |
|
"grad_norm": 7.378643035888672, |
|
"learning_rate": 8.562766351977181e-07, |
|
"loss": 1.6185, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.6578115117014548, |
|
"grad_norm": 6.906543254852295, |
|
"learning_rate": 8.484399769634203e-07, |
|
"loss": 1.6326, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.6704617330803289, |
|
"grad_norm": 7.340395927429199, |
|
"learning_rate": 8.404332553264546e-07, |
|
"loss": 1.6306, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.683111954459203, |
|
"grad_norm": 13.938148498535156, |
|
"learning_rate": 8.32260378134416e-07, |
|
"loss": 1.6156, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.6957621758380772, |
|
"grad_norm": 9.445398330688477, |
|
"learning_rate": 8.239253343305847e-07, |
|
"loss": 1.6228, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.7084123972169513, |
|
"grad_norm": 6.176075458526611, |
|
"learning_rate": 8.154321920070412e-07, |
|
"loss": 1.5906, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.7210626185958254, |
|
"grad_norm": 8.291935920715332, |
|
"learning_rate": 8.067850964191475e-07, |
|
"loss": 1.6202, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.7337128399746996, |
|
"grad_norm": 11.122963905334473, |
|
"learning_rate": 7.979882679623694e-07, |
|
"loss": 1.6181, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.7463630613535737, |
|
"grad_norm": 8.314069747924805, |
|
"learning_rate": 7.890460001124241e-07, |
|
"loss": 1.5892, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.7590132827324478, |
|
"grad_norm": 6.397180080413818, |
|
"learning_rate": 7.799626573297604e-07, |
|
"loss": 1.5916, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7716635041113219, |
|
"grad_norm": 8.690041542053223, |
|
"learning_rate": 7.707426729293915e-07, |
|
"loss": 1.5919, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.7843137254901961, |
|
"grad_norm": 7.349343776702881, |
|
"learning_rate": 7.613905469171245e-07, |
|
"loss": 1.5858, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.7969639468690702, |
|
"grad_norm": 9.25490665435791, |
|
"learning_rate": 7.519108437932378e-07, |
|
"loss": 1.6114, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.8096141682479443, |
|
"grad_norm": 7.29209566116333, |
|
"learning_rate": 7.423081903246813e-07, |
|
"loss": 1.607, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.8222643896268185, |
|
"grad_norm": 7.68463659286499, |
|
"learning_rate": 7.325872732868869e-07, |
|
"loss": 1.5746, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.8349146110056926, |
|
"grad_norm": 6.869282245635986, |
|
"learning_rate": 7.227528371762896e-07, |
|
"loss": 1.5811, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.8475648323845667, |
|
"grad_norm": 17.074424743652344, |
|
"learning_rate": 7.128096818946769e-07, |
|
"loss": 1.5897, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.8602150537634409, |
|
"grad_norm": 11.504667282104492, |
|
"learning_rate": 7.027626604064969e-07, |
|
"loss": 1.5794, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.872865275142315, |
|
"grad_norm": 8.88005256652832, |
|
"learning_rate": 6.926166763702672e-07, |
|
"loss": 1.6042, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.8855154965211891, |
|
"grad_norm": 24.4627685546875, |
|
"learning_rate": 6.823766817452424e-07, |
|
"loss": 1.5639, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8981657179000633, |
|
"grad_norm": 20.67166519165039, |
|
"learning_rate": 6.720476743745072e-07, |
|
"loss": 1.587, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.9108159392789373, |
|
"grad_norm": 16.13395118713379, |
|
"learning_rate": 6.616346955456742e-07, |
|
"loss": 1.6019, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.9234661606578115, |
|
"grad_norm": 16.747365951538086, |
|
"learning_rate": 6.511428275303785e-07, |
|
"loss": 1.6014, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.9361163820366857, |
|
"grad_norm": 7.6073994636535645, |
|
"learning_rate": 6.405771911037697e-07, |
|
"loss": 1.5753, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.9487666034155597, |
|
"grad_norm": 8.412175178527832, |
|
"learning_rate": 6.299429430452096e-07, |
|
"loss": 1.5632, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.9614168247944339, |
|
"grad_norm": 6.702009677886963, |
|
"learning_rate": 6.192452736213987e-07, |
|
"loss": 1.5773, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.9740670461733081, |
|
"grad_norm": 6.783812046051025, |
|
"learning_rate": 6.084894040531589e-07, |
|
"loss": 1.5662, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.9867172675521821, |
|
"grad_norm": 10.526253700256348, |
|
"learning_rate": 5.976805839671071e-07, |
|
"loss": 1.5854, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.9993674889310563, |
|
"grad_norm": 17.669225692749023, |
|
"learning_rate": 5.868240888334652e-07, |
|
"loss": 1.588, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.0113851992409868, |
|
"grad_norm": 7.780857086181641, |
|
"learning_rate": 5.759252173912572e-07, |
|
"loss": 1.479, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.0240354206198607, |
|
"grad_norm": 49.61854934692383, |
|
"learning_rate": 5.64989289062149e-07, |
|
"loss": 1.5685, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.036685641998735, |
|
"grad_norm": 31.814254760742188, |
|
"learning_rate": 5.540216413541936e-07, |
|
"loss": 1.5346, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.049335863377609, |
|
"grad_norm": 18.944929122924805, |
|
"learning_rate": 5.430276272567485e-07, |
|
"loss": 1.5623, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.0619860847564833, |
|
"grad_norm": 7.601119518280029, |
|
"learning_rate": 5.320126126278379e-07, |
|
"loss": 1.569, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.0746363061353574, |
|
"grad_norm": 6.994897842407227, |
|
"learning_rate": 5.209819735752341e-07, |
|
"loss": 1.555, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.0872865275142316, |
|
"grad_norm": 7.816515922546387, |
|
"learning_rate": 5.09941093832535e-07, |
|
"loss": 1.5482, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.0999367488931056, |
|
"grad_norm": 9.505668640136719, |
|
"learning_rate": 4.988953621315213e-07, |
|
"loss": 1.5455, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.1125869702719797, |
|
"grad_norm": 6.980685234069824, |
|
"learning_rate": 4.87850169572073e-07, |
|
"loss": 1.5679, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.125237191650854, |
|
"grad_norm": 5.63450288772583, |
|
"learning_rate": 4.7681090699093066e-07, |
|
"loss": 1.5502, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.137887413029728, |
|
"grad_norm": 11.722896575927734, |
|
"learning_rate": 4.657829623305859e-07, |
|
"loss": 1.5628, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.1505376344086022, |
|
"grad_norm": 14.06059455871582, |
|
"learning_rate": 4.5477171800958203e-07, |
|
"loss": 1.5144, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.1631878557874762, |
|
"grad_norm": 14.6784029006958, |
|
"learning_rate": 4.437825482955139e-07, |
|
"loss": 1.5457, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.1758380771663504, |
|
"grad_norm": 18.590673446655273, |
|
"learning_rate": 4.3282081668200327e-07, |
|
"loss": 1.5526, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.1884882985452245, |
|
"grad_norm": 27.646364212036133, |
|
"learning_rate": 4.218918732709342e-07, |
|
"loss": 1.5234, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.2011385199240987, |
|
"grad_norm": 8.348926544189453, |
|
"learning_rate": 4.1100105216122496e-07, |
|
"loss": 1.5587, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.2137887413029729, |
|
"grad_norm": 9.07374382019043, |
|
"learning_rate": 4.0015366884540814e-07, |
|
"loss": 1.5576, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.226438962681847, |
|
"grad_norm": 6.855799198150635, |
|
"learning_rate": 3.893550176152954e-07, |
|
"loss": 1.5354, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.239089184060721, |
|
"grad_norm": 8.235871315002441, |
|
"learning_rate": 3.78610368977986e-07, |
|
"loss": 1.5196, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.2517394054395952, |
|
"grad_norm": 8.418612480163574, |
|
"learning_rate": 3.6792496708348774e-07, |
|
"loss": 1.5618, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.2643896268184693, |
|
"grad_norm": 8.189360618591309, |
|
"learning_rate": 3.5730402716519826e-07, |
|
"loss": 1.5453, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.2770398481973435, |
|
"grad_norm": 8.278912544250488, |
|
"learning_rate": 3.4675273299450256e-07, |
|
"loss": 1.5456, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.2896900695762175, |
|
"grad_norm": 5.916414260864258, |
|
"learning_rate": 3.362762343507257e-07, |
|
"loss": 1.5276, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.3023402909550916, |
|
"grad_norm": 7.753712177276611, |
|
"learning_rate": 3.258796445076738e-07, |
|
"loss": 1.5288, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.3149905123339658, |
|
"grad_norm": 6.598722457885742, |
|
"learning_rate": 3.1556803773799613e-07, |
|
"loss": 1.5544, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.32764073371284, |
|
"grad_norm": 13.177520751953125, |
|
"learning_rate": 3.053464468365785e-07, |
|
"loss": 1.5548, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.3402909550917141, |
|
"grad_norm": 8.982342720031738, |
|
"learning_rate": 2.9521986066418446e-07, |
|
"loss": 1.5316, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.3529411764705883, |
|
"grad_norm": 8.048005104064941, |
|
"learning_rate": 2.8519322171253604e-07, |
|
"loss": 1.5499, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.3655913978494625, |
|
"grad_norm": 13.070209503173828, |
|
"learning_rate": 2.7527142369202875e-07, |
|
"loss": 1.5515, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.3782416192283364, |
|
"grad_norm": 10.934873580932617, |
|
"learning_rate": 2.6545930914325374e-07, |
|
"loss": 1.5432, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.3908918406072106, |
|
"grad_norm": 22.81064796447754, |
|
"learning_rate": 2.5576166707349384e-07, |
|
"loss": 1.5591, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.4035420619860848, |
|
"grad_norm": 6.494205474853516, |
|
"learning_rate": 2.4618323061935093e-07, |
|
"loss": 1.5343, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.416192283364959, |
|
"grad_norm": 9.8826322555542, |
|
"learning_rate": 2.3672867473663672e-07, |
|
"loss": 1.5541, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.428842504743833, |
|
"grad_norm": 15.296801567077637, |
|
"learning_rate": 2.2740261391866633e-07, |
|
"loss": 1.521, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.441492726122707, |
|
"grad_norm": 9.621323585510254, |
|
"learning_rate": 2.182095999440552e-07, |
|
"loss": 1.5235, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.4541429475015812, |
|
"grad_norm": 6.374513626098633, |
|
"learning_rate": 2.091541196551318e-07, |
|
"loss": 1.5362, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.4667931688804554, |
|
"grad_norm": 8.73390007019043, |
|
"learning_rate": 2.0024059276803739e-07, |
|
"loss": 1.5475, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.4794433902593296, |
|
"grad_norm": 7.136387348175049, |
|
"learning_rate": 1.9147336971559448e-07, |
|
"loss": 1.5519, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.4920936116382038, |
|
"grad_norm": 15.520442962646484, |
|
"learning_rate": 1.8285672952398446e-07, |
|
"loss": 1.5551, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.504743833017078, |
|
"grad_norm": 8.319755554199219, |
|
"learning_rate": 1.743948777242814e-07, |
|
"loss": 1.5433, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.5173940543959519, |
|
"grad_norm": 41.71631622314453, |
|
"learning_rate": 1.6609194429985436e-07, |
|
"loss": 1.5308, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.530044275774826, |
|
"grad_norm": 19.970256805419922, |
|
"learning_rate": 1.5795198167064249e-07, |
|
"loss": 1.5446, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.5426944971537002, |
|
"grad_norm": 7.31848669052124, |
|
"learning_rate": 1.4997896271528737e-07, |
|
"loss": 1.5417, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.5553447185325742, |
|
"grad_norm": 11.576011657714844, |
|
"learning_rate": 1.4217677883208624e-07, |
|
"loss": 1.5312, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.5679949399114483, |
|
"grad_norm": 6.723977088928223, |
|
"learning_rate": 1.3454923803971418e-07, |
|
"loss": 1.5214, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.5806451612903225, |
|
"grad_norm": 12.752711296081543, |
|
"learning_rate": 1.2710006311864103e-07, |
|
"loss": 1.5196, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.5932953826691967, |
|
"grad_norm": 7.53589391708374, |
|
"learning_rate": 1.1983288979415062e-07, |
|
"loss": 1.5456, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.6059456040480709, |
|
"grad_norm": 26.84853172302246, |
|
"learning_rate": 1.1275126496184917e-07, |
|
"loss": 1.5518, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.618595825426945, |
|
"grad_norm": 25.39905548095703, |
|
"learning_rate": 1.0585864495652896e-07, |
|
"loss": 1.5198, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.6312460468058192, |
|
"grad_norm": 29.904163360595703, |
|
"learning_rate": 9.915839386523211e-08, |
|
"loss": 1.5363, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.6438962681846934, |
|
"grad_norm": 12.930685043334961, |
|
"learning_rate": 9.265378188533696e-08, |
|
"loss": 1.5213, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.6565464895635673, |
|
"grad_norm": 7.995054244995117, |
|
"learning_rate": 8.634798372847146e-08, |
|
"loss": 1.5326, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.6691967109424415, |
|
"grad_norm": 8.757749557495117, |
|
"learning_rate": 8.024407707102698e-08, |
|
"loss": 1.5254, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.6818469323213157, |
|
"grad_norm": 6.595693588256836, |
|
"learning_rate": 7.434504105203621e-08, |
|
"loss": 1.5285, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.6944971537001896, |
|
"grad_norm": 9.962479591369629, |
|
"learning_rate": 6.865375481914015e-08, |
|
"loss": 1.5483, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.7071473750790638, |
|
"grad_norm": 18.681095123291016, |
|
"learning_rate": 6.317299612336146e-08, |
|
"loss": 1.5408, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.719797596457938, |
|
"grad_norm": 9.540229797363281, |
|
"learning_rate": 5.790543996336466e-08, |
|
"loss": 1.5333, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.7324478178368121, |
|
"grad_norm": 18.783493041992188, |
|
"learning_rate": 5.285365727986707e-08, |
|
"loss": 1.5343, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.7450980392156863, |
|
"grad_norm": 31.82489776611328, |
|
"learning_rate": 4.802011370083747e-08, |
|
"loss": 1.5412, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.7577482605945605, |
|
"grad_norm": 5.806077480316162, |
|
"learning_rate": 4.3407168338095325e-08, |
|
"loss": 1.5522, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.7703984819734346, |
|
"grad_norm": 15.603910446166992, |
|
"learning_rate": 3.901707263589671e-08, |
|
"loss": 1.5457, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.7830487033523088, |
|
"grad_norm": 6.486084461212158, |
|
"learning_rate": 3.485196927206985e-08, |
|
"loss": 1.537, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.7956989247311828, |
|
"grad_norm": 7.477235794067383, |
|
"learning_rate": 3.091389111223691e-08, |
|
"loss": 1.5367, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.808349146110057, |
|
"grad_norm": 5.301967620849609, |
|
"learning_rate": 2.7204760217631074e-08, |
|
"loss": 1.555, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.820999367488931, |
|
"grad_norm": 11.517831802368164, |
|
"learning_rate": 2.3726386906994688e-08, |
|
"loss": 1.5269, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.833649588867805, |
|
"grad_norm": 6.416511058807373, |
|
"learning_rate": 2.0480468873015298e-08, |
|
"loss": 1.5494, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.8462998102466792, |
|
"grad_norm": 12.837623596191406, |
|
"learning_rate": 1.7468590353731495e-08, |
|
"loss": 1.517, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.8589500316255534, |
|
"grad_norm": 6.5693159103393555, |
|
"learning_rate": 1.4692221359312196e-08, |
|
"loss": 1.5285, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.8716002530044276, |
|
"grad_norm": 36.55837631225586, |
|
"learning_rate": 1.2152716954587694e-08, |
|
"loss": 1.517, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.8842504743833017, |
|
"grad_norm": 9.189281463623047, |
|
"learning_rate": 9.851316597681959e-09, |
|
"loss": 1.5424, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.896900695762176, |
|
"grad_norm": 5.850296497344971, |
|
"learning_rate": 7.789143535069153e-09, |
|
"loss": 1.5468, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.90955091714105, |
|
"grad_norm": 20.80071258544922, |
|
"learning_rate": 5.9672042533499e-09, |
|
"loss": 1.5156, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.9222011385199242, |
|
"grad_norm": 15.397515296936035, |
|
"learning_rate": 4.386387988014273e-09, |
|
"loss": 1.5257, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.9348513598987982, |
|
"grad_norm": 7.598056316375732, |
|
"learning_rate": 3.0474662894321437e-09, |
|
"loss": 1.5253, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.9475015812776724, |
|
"grad_norm": 17.48052978515625, |
|
"learning_rate": 1.9510926462816823e-09, |
|
"loss": 1.514, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.9601518026565465, |
|
"grad_norm": 9.740424156188965, |
|
"learning_rate": 1.0978021666005476e-09, |
|
"loss": 1.5153, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.9728020240354205, |
|
"grad_norm": 7.099216461181641, |
|
"learning_rate": 4.880113166155774e-10, |
|
"loss": 1.5389, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.9854522454142947, |
|
"grad_norm": 12.578405380249023, |
|
"learning_rate": 1.2201771747727407e-10, |
|
"loss": 1.5433, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.9981024667931688, |
|
"grad_norm": 10.554688453674316, |
|
"learning_rate": 0.0, |
|
"loss": 1.5311, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.9981024667931688, |
|
"step": 1580, |
|
"total_flos": 2.5201109024647414e+18, |
|
"train_loss": 1.6402756485757948, |
|
"train_runtime": 7842.2323, |
|
"train_samples_per_second": 3.225, |
|
"train_steps_per_second": 0.201 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1580, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.5201109024647414e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|