{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
  "global_step": 1312,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007621951219512195,
      "grad_norm": 4.6875,
      "learning_rate": 1.5151515151515152e-06,
      "loss": 1.6918,
      "step": 10
    },
    {
      "epoch": 0.01524390243902439,
      "grad_norm": 5.0625,
      "learning_rate": 3.0303030303030305e-06,
      "loss": 1.6227,
      "step": 20
    },
    {
      "epoch": 0.022865853658536585,
      "grad_norm": 4.34375,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 1.6367,
      "step": 30
    },
    {
      "epoch": 0.03048780487804878,
      "grad_norm": 4.21875,
      "learning_rate": 6.060606060606061e-06,
      "loss": 1.493,
      "step": 40
    },
    {
      "epoch": 0.038109756097560975,
      "grad_norm": 2.921875,
      "learning_rate": 7.5757575757575764e-06,
      "loss": 1.441,
      "step": 50
    },
    {
      "epoch": 0.04573170731707317,
      "grad_norm": 3.078125,
      "learning_rate": 9.090909090909091e-06,
      "loss": 1.3488,
      "step": 60
    },
    {
      "epoch": 0.053353658536585365,
      "grad_norm": 2.828125,
      "learning_rate": 1.0606060606060606e-05,
      "loss": 1.3289,
      "step": 70
    },
    {
      "epoch": 0.06097560975609756,
      "grad_norm": 2.765625,
      "learning_rate": 1.2121212121212122e-05,
      "loss": 1.2742,
      "step": 80
    },
    {
      "epoch": 0.06859756097560976,
      "grad_norm": 2.890625,
      "learning_rate": 1.3636363636363637e-05,
      "loss": 1.2758,
      "step": 90
    },
    {
      "epoch": 0.07621951219512195,
      "grad_norm": 2.859375,
      "learning_rate": 1.5151515151515153e-05,
      "loss": 1.2707,
      "step": 100
    },
    {
      "epoch": 0.08384146341463415,
      "grad_norm": 2.890625,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 1.273,
      "step": 110
    },
    {
      "epoch": 0.09146341463414634,
      "grad_norm": 2.765625,
      "learning_rate": 1.8181818181818182e-05,
      "loss": 1.2414,
      "step": 120
    },
    {
      "epoch": 0.09908536585365854,
      "grad_norm": 2.75,
      "learning_rate": 1.96969696969697e-05,
      "loss": 1.275,
      "step": 130
    },
    {
      "epoch": 0.10670731707317073,
      "grad_norm": 2.78125,
      "learning_rate": 1.986440677966102e-05,
      "loss": 1.2547,
      "step": 140
    },
    {
      "epoch": 0.11432926829268293,
      "grad_norm": 2.578125,
      "learning_rate": 1.969491525423729e-05,
      "loss": 1.2391,
      "step": 150
    },
    {
      "epoch": 0.12195121951219512,
      "grad_norm": 2.75,
      "learning_rate": 1.9525423728813562e-05,
      "loss": 1.2379,
      "step": 160
    },
    {
      "epoch": 0.12957317073170732,
      "grad_norm": 2.65625,
      "learning_rate": 1.9355932203389832e-05,
      "loss": 1.2105,
      "step": 170
    },
    {
      "epoch": 0.13719512195121952,
      "grad_norm": 2.53125,
      "learning_rate": 1.9186440677966102e-05,
      "loss": 1.2359,
      "step": 180
    },
    {
      "epoch": 0.1448170731707317,
      "grad_norm": 2.75,
      "learning_rate": 1.9016949152542375e-05,
      "loss": 1.2867,
      "step": 190
    },
    {
      "epoch": 0.1524390243902439,
      "grad_norm": 2.8125,
      "learning_rate": 1.8847457627118645e-05,
      "loss": 1.1762,
      "step": 200
    },
    {
      "epoch": 0.1600609756097561,
      "grad_norm": 2.765625,
      "learning_rate": 1.8677966101694918e-05,
      "loss": 1.1992,
      "step": 210
    },
    {
      "epoch": 0.1676829268292683,
      "grad_norm": 2.75,
      "learning_rate": 1.8508474576271188e-05,
      "loss": 1.232,
      "step": 220
    },
    {
      "epoch": 0.17530487804878048,
      "grad_norm": 2.65625,
      "learning_rate": 1.8338983050847458e-05,
      "loss": 1.2172,
      "step": 230
    },
    {
      "epoch": 0.18292682926829268,
      "grad_norm": 2.734375,
      "learning_rate": 1.816949152542373e-05,
      "loss": 1.182,
      "step": 240
    },
    {
      "epoch": 0.19054878048780488,
      "grad_norm": 2.875,
      "learning_rate": 1.8e-05,
      "loss": 1.2492,
      "step": 250
    },
    {
      "epoch": 0.19817073170731708,
      "grad_norm": 2.71875,
      "learning_rate": 1.7830508474576274e-05,
      "loss": 1.2312,
      "step": 260
    },
    {
      "epoch": 0.20579268292682926,
      "grad_norm": 2.65625,
      "learning_rate": 1.7661016949152543e-05,
      "loss": 1.241,
      "step": 270
    },
    {
      "epoch": 0.21341463414634146,
      "grad_norm": 2.78125,
      "learning_rate": 1.7491525423728813e-05,
      "loss": 1.1879,
      "step": 280
    },
    {
      "epoch": 0.22103658536585366,
      "grad_norm": 2.734375,
      "learning_rate": 1.7322033898305086e-05,
      "loss": 1.2211,
      "step": 290
    },
    {
      "epoch": 0.22865853658536586,
      "grad_norm": 2.734375,
      "learning_rate": 1.715254237288136e-05,
      "loss": 1.2152,
      "step": 300
    },
    {
      "epoch": 0.23628048780487804,
      "grad_norm": 2.8125,
      "learning_rate": 1.698305084745763e-05,
      "loss": 1.1938,
      "step": 310
    },
    {
      "epoch": 0.24390243902439024,
      "grad_norm": 2.71875,
      "learning_rate": 1.68135593220339e-05,
      "loss": 1.2266,
      "step": 320
    },
    {
      "epoch": 0.25152439024390244,
      "grad_norm": 2.84375,
      "learning_rate": 1.6644067796610172e-05,
      "loss": 1.1672,
      "step": 330
    },
    {
      "epoch": 0.25914634146341464,
      "grad_norm": 2.71875,
      "learning_rate": 1.6474576271186442e-05,
      "loss": 1.2104,
      "step": 340
    },
    {
      "epoch": 0.26676829268292684,
      "grad_norm": 2.640625,
      "learning_rate": 1.6305084745762715e-05,
      "loss": 1.2098,
      "step": 350
    },
    {
      "epoch": 0.27439024390243905,
      "grad_norm": 2.703125,
      "learning_rate": 1.6135593220338985e-05,
      "loss": 1.1684,
      "step": 360
    },
    {
      "epoch": 0.2820121951219512,
      "grad_norm": 2.765625,
      "learning_rate": 1.5966101694915255e-05,
      "loss": 1.1734,
      "step": 370
    },
    {
      "epoch": 0.2896341463414634,
      "grad_norm": 2.546875,
      "learning_rate": 1.5796610169491528e-05,
      "loss": 1.2008,
      "step": 380
    },
    {
      "epoch": 0.2972560975609756,
      "grad_norm": 2.78125,
      "learning_rate": 1.5627118644067798e-05,
      "loss": 1.1848,
      "step": 390
    },
    {
      "epoch": 0.3048780487804878,
      "grad_norm": 2.625,
      "learning_rate": 1.545762711864407e-05,
      "loss": 1.2033,
      "step": 400
    },
    {
      "epoch": 0.3125,
      "grad_norm": 2.625,
      "learning_rate": 1.528813559322034e-05,
      "loss": 1.1781,
      "step": 410
    },
    {
      "epoch": 0.3201219512195122,
      "grad_norm": 2.96875,
      "learning_rate": 1.511864406779661e-05,
      "loss": 1.2043,
      "step": 420
    },
    {
      "epoch": 0.3277439024390244,
      "grad_norm": 2.453125,
      "learning_rate": 1.4949152542372882e-05,
      "loss": 1.1699,
      "step": 430
    },
    {
      "epoch": 0.3353658536585366,
      "grad_norm": 2.5625,
      "learning_rate": 1.4779661016949153e-05,
      "loss": 1.1871,
      "step": 440
    },
    {
      "epoch": 0.3429878048780488,
      "grad_norm": 2.578125,
      "learning_rate": 1.4610169491525426e-05,
      "loss": 1.1418,
      "step": 450
    },
    {
      "epoch": 0.35060975609756095,
      "grad_norm": 2.640625,
      "learning_rate": 1.4440677966101698e-05,
      "loss": 1.1824,
      "step": 460
    },
    {
      "epoch": 0.35823170731707316,
      "grad_norm": 2.828125,
      "learning_rate": 1.4271186440677966e-05,
      "loss": 1.176,
      "step": 470
    },
    {
      "epoch": 0.36585365853658536,
      "grad_norm": 2.96875,
      "learning_rate": 1.4101694915254239e-05,
      "loss": 1.1785,
      "step": 480
    },
    {
      "epoch": 0.37347560975609756,
      "grad_norm": 2.796875,
      "learning_rate": 1.393220338983051e-05,
      "loss": 1.1758,
      "step": 490
    },
    {
      "epoch": 0.38109756097560976,
      "grad_norm": 2.75,
      "learning_rate": 1.3762711864406782e-05,
      "loss": 1.1988,
      "step": 500
    },
    {
      "epoch": 0.38871951219512196,
      "grad_norm": 2.578125,
      "learning_rate": 1.3593220338983053e-05,
      "loss": 1.1822,
      "step": 510
    },
    {
      "epoch": 0.39634146341463417,
      "grad_norm": 2.953125,
      "learning_rate": 1.3423728813559323e-05,
      "loss": 1.1801,
      "step": 520
    },
    {
      "epoch": 0.40396341463414637,
      "grad_norm": 2.640625,
      "learning_rate": 1.3254237288135595e-05,
      "loss": 1.2043,
      "step": 530
    },
    {
      "epoch": 0.4115853658536585,
      "grad_norm": 2.546875,
      "learning_rate": 1.3084745762711866e-05,
      "loss": 1.2098,
      "step": 540
    },
    {
      "epoch": 0.4192073170731707,
      "grad_norm": 2.625,
      "learning_rate": 1.2915254237288137e-05,
      "loss": 1.1973,
      "step": 550
    },
    {
      "epoch": 0.4268292682926829,
      "grad_norm": 2.546875,
      "learning_rate": 1.2745762711864407e-05,
      "loss": 1.2066,
      "step": 560
    },
    {
      "epoch": 0.4344512195121951,
      "grad_norm": 2.4375,
      "learning_rate": 1.2576271186440679e-05,
      "loss": 1.1543,
      "step": 570
    },
    {
      "epoch": 0.4420731707317073,
      "grad_norm": 2.640625,
      "learning_rate": 1.240677966101695e-05,
      "loss": 1.1703,
      "step": 580
    },
    {
      "epoch": 0.4496951219512195,
      "grad_norm": 3.09375,
      "learning_rate": 1.2237288135593222e-05,
      "loss": 1.1566,
      "step": 590
    },
    {
      "epoch": 0.4573170731707317,
      "grad_norm": 2.578125,
      "learning_rate": 1.2067796610169493e-05,
      "loss": 1.1623,
      "step": 600
    },
    {
      "epoch": 0.4649390243902439,
      "grad_norm": 2.734375,
      "learning_rate": 1.1898305084745763e-05,
      "loss": 1.198,
      "step": 610
    },
    {
      "epoch": 0.4725609756097561,
      "grad_norm": 2.78125,
      "learning_rate": 1.1728813559322034e-05,
      "loss": 1.1971,
      "step": 620
    },
    {
      "epoch": 0.4801829268292683,
      "grad_norm": 2.546875,
      "learning_rate": 1.1559322033898306e-05,
      "loss": 1.159,
      "step": 630
    },
    {
      "epoch": 0.4878048780487805,
      "grad_norm": 2.84375,
      "learning_rate": 1.1389830508474577e-05,
      "loss": 1.184,
      "step": 640
    },
    {
      "epoch": 0.4954268292682927,
      "grad_norm": 2.578125,
      "learning_rate": 1.1220338983050849e-05,
      "loss": 1.1859,
      "step": 650
    },
    {
      "epoch": 0.5030487804878049,
      "grad_norm": 2.609375,
      "learning_rate": 1.1050847457627118e-05,
      "loss": 1.1434,
      "step": 660
    },
    {
      "epoch": 0.510670731707317,
      "grad_norm": 2.671875,
      "learning_rate": 1.088135593220339e-05,
      "loss": 1.1775,
      "step": 670
    },
    {
      "epoch": 0.5182926829268293,
      "grad_norm": 2.515625,
      "learning_rate": 1.0711864406779661e-05,
      "loss": 1.15,
      "step": 680
    },
    {
      "epoch": 0.5259146341463414,
      "grad_norm": 2.609375,
      "learning_rate": 1.0542372881355933e-05,
      "loss": 1.1613,
      "step": 690
    },
    {
      "epoch": 0.5335365853658537,
      "grad_norm": 2.5625,
      "learning_rate": 1.0372881355932204e-05,
      "loss": 1.1602,
      "step": 700
    },
    {
      "epoch": 0.5411585365853658,
      "grad_norm": 2.671875,
      "learning_rate": 1.0203389830508474e-05,
      "loss": 1.1879,
      "step": 710
    },
    {
      "epoch": 0.5487804878048781,
      "grad_norm": 2.625,
      "learning_rate": 1.0033898305084746e-05,
      "loss": 1.1941,
      "step": 720
    },
    {
      "epoch": 0.5564024390243902,
      "grad_norm": 2.8125,
      "learning_rate": 9.864406779661017e-06,
      "loss": 1.1574,
      "step": 730
    },
    {
      "epoch": 0.5640243902439024,
      "grad_norm": 2.75,
      "learning_rate": 9.69491525423729e-06,
      "loss": 1.1793,
      "step": 740
    },
    {
      "epoch": 0.5716463414634146,
      "grad_norm": 2.703125,
      "learning_rate": 9.52542372881356e-06,
      "loss": 1.1711,
      "step": 750
    },
    {
      "epoch": 0.5792682926829268,
      "grad_norm": 2.703125,
      "learning_rate": 9.355932203389831e-06,
      "loss": 1.1875,
      "step": 760
    },
    {
      "epoch": 0.586890243902439,
      "grad_norm": 2.8125,
      "learning_rate": 9.186440677966101e-06,
      "loss": 1.19,
      "step": 770
    },
    {
      "epoch": 0.5945121951219512,
      "grad_norm": 2.9375,
      "learning_rate": 9.016949152542374e-06,
      "loss": 1.141,
      "step": 780
    },
    {
      "epoch": 0.6021341463414634,
      "grad_norm": 2.75,
      "learning_rate": 8.847457627118646e-06,
      "loss": 1.1699,
      "step": 790
    },
    {
      "epoch": 0.6097560975609756,
      "grad_norm": 2.65625,
      "learning_rate": 8.677966101694915e-06,
      "loss": 1.1809,
      "step": 800
    },
    {
      "epoch": 0.6173780487804879,
      "grad_norm": 2.703125,
      "learning_rate": 8.508474576271187e-06,
      "loss": 1.1824,
      "step": 810
    },
    {
      "epoch": 0.625,
      "grad_norm": 2.796875,
      "learning_rate": 8.338983050847458e-06,
      "loss": 1.1729,
      "step": 820
    },
    {
      "epoch": 0.6326219512195121,
      "grad_norm": 2.765625,
      "learning_rate": 8.16949152542373e-06,
      "loss": 1.1113,
      "step": 830
    },
    {
      "epoch": 0.6402439024390244,
      "grad_norm": 2.5,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.1434,
      "step": 840
    },
    {
      "epoch": 0.6478658536585366,
      "grad_norm": 2.515625,
      "learning_rate": 7.830508474576271e-06,
      "loss": 1.1643,
      "step": 850
    },
    {
      "epoch": 0.6554878048780488,
      "grad_norm": 2.65625,
      "learning_rate": 7.661016949152543e-06,
      "loss": 1.1738,
      "step": 860
    },
    {
      "epoch": 0.663109756097561,
      "grad_norm": 2.609375,
      "learning_rate": 7.491525423728814e-06,
      "loss": 1.125,
      "step": 870
    },
    {
      "epoch": 0.6707317073170732,
      "grad_norm": 2.5625,
      "learning_rate": 7.3220338983050855e-06,
      "loss": 1.1723,
      "step": 880
    },
    {
      "epoch": 0.6783536585365854,
      "grad_norm": 2.546875,
      "learning_rate": 7.152542372881357e-06,
      "loss": 1.1617,
      "step": 890
    },
    {
      "epoch": 0.6859756097560976,
      "grad_norm": 2.6875,
      "learning_rate": 6.9830508474576275e-06,
      "loss": 1.1445,
      "step": 900
    },
    {
      "epoch": 0.6935975609756098,
      "grad_norm": 2.546875,
      "learning_rate": 6.813559322033899e-06,
      "loss": 1.1807,
      "step": 910
    },
    {
      "epoch": 0.7012195121951219,
      "grad_norm": 2.65625,
      "learning_rate": 6.64406779661017e-06,
      "loss": 1.1865,
      "step": 920
    },
    {
      "epoch": 0.7088414634146342,
      "grad_norm": 2.734375,
      "learning_rate": 6.474576271186441e-06,
      "loss": 1.198,
      "step": 930
    },
    {
      "epoch": 0.7164634146341463,
      "grad_norm": 2.765625,
      "learning_rate": 6.3050847457627125e-06,
      "loss": 1.1734,
      "step": 940
    },
    {
      "epoch": 0.7240853658536586,
      "grad_norm": 2.609375,
      "learning_rate": 6.135593220338983e-06,
      "loss": 1.1633,
      "step": 950
    },
    {
      "epoch": 0.7317073170731707,
      "grad_norm": 2.578125,
      "learning_rate": 5.9661016949152555e-06,
      "loss": 1.1953,
      "step": 960
    },
    {
      "epoch": 0.739329268292683,
      "grad_norm": 2.5,
      "learning_rate": 5.796610169491525e-06,
      "loss": 1.1879,
      "step": 970
    },
    {
      "epoch": 0.7469512195121951,
      "grad_norm": 2.609375,
      "learning_rate": 5.6271186440677975e-06,
      "loss": 1.1664,
      "step": 980
    },
    {
      "epoch": 0.7545731707317073,
      "grad_norm": 2.765625,
      "learning_rate": 5.457627118644067e-06,
      "loss": 1.1789,
      "step": 990
    },
    {
      "epoch": 0.7621951219512195,
      "grad_norm": 2.890625,
      "learning_rate": 5.28813559322034e-06,
      "loss": 1.1738,
      "step": 1000
    },
    {
      "epoch": 0.7698170731707317,
      "grad_norm": 2.6875,
      "learning_rate": 5.118644067796611e-06,
      "loss": 1.1582,
      "step": 1010
    },
    {
      "epoch": 0.7774390243902439,
      "grad_norm": 2.9375,
      "learning_rate": 4.949152542372882e-06,
      "loss": 1.1391,
      "step": 1020
    },
    {
      "epoch": 0.7850609756097561,
      "grad_norm": 2.578125,
      "learning_rate": 4.779661016949153e-06,
      "loss": 1.1652,
      "step": 1030
    },
    {
      "epoch": 0.7926829268292683,
      "grad_norm": 2.703125,
      "learning_rate": 4.610169491525424e-06,
      "loss": 1.1426,
      "step": 1040
    },
    {
      "epoch": 0.8003048780487805,
      "grad_norm": 2.71875,
      "learning_rate": 4.440677966101695e-06,
      "loss": 1.1713,
      "step": 1050
    },
    {
      "epoch": 0.8079268292682927,
      "grad_norm": 2.515625,
      "learning_rate": 4.271186440677967e-06,
      "loss": 1.1727,
      "step": 1060
    },
    {
      "epoch": 0.8155487804878049,
      "grad_norm": 2.59375,
      "learning_rate": 4.101694915254237e-06,
      "loss": 1.1695,
      "step": 1070
    },
    {
      "epoch": 0.823170731707317,
      "grad_norm": 2.671875,
      "learning_rate": 3.932203389830509e-06,
      "loss": 1.1672,
      "step": 1080
    },
    {
      "epoch": 0.8307926829268293,
      "grad_norm": 2.734375,
      "learning_rate": 3.76271186440678e-06,
      "loss": 1.2016,
      "step": 1090
    },
    {
      "epoch": 0.8384146341463414,
      "grad_norm": 2.90625,
      "learning_rate": 3.5932203389830512e-06,
      "loss": 1.1727,
      "step": 1100
    },
    {
      "epoch": 0.8460365853658537,
      "grad_norm": 2.84375,
      "learning_rate": 3.4237288135593223e-06,
      "loss": 1.1799,
      "step": 1110
    },
    {
      "epoch": 0.8536585365853658,
      "grad_norm": 2.8125,
      "learning_rate": 3.2542372881355933e-06,
      "loss": 1.1555,
      "step": 1120
    },
    {
      "epoch": 0.8612804878048781,
      "grad_norm": 2.6875,
      "learning_rate": 3.0847457627118648e-06,
      "loss": 1.1678,
      "step": 1130
    },
    {
      "epoch": 0.8689024390243902,
      "grad_norm": 2.59375,
      "learning_rate": 2.915254237288136e-06,
      "loss": 1.1578,
      "step": 1140
    },
    {
      "epoch": 0.8765243902439024,
      "grad_norm": 2.375,
      "learning_rate": 2.745762711864407e-06,
      "loss": 1.1535,
      "step": 1150
    },
    {
      "epoch": 0.8841463414634146,
      "grad_norm": 2.703125,
      "learning_rate": 2.576271186440678e-06,
      "loss": 1.1602,
      "step": 1160
    },
    {
      "epoch": 0.8917682926829268,
      "grad_norm": 2.78125,
      "learning_rate": 2.4067796610169493e-06,
      "loss": 1.1852,
      "step": 1170
    },
    {
      "epoch": 0.899390243902439,
      "grad_norm": 2.59375,
      "learning_rate": 2.2372881355932204e-06,
      "loss": 1.1605,
      "step": 1180
    },
    {
      "epoch": 0.9070121951219512,
      "grad_norm": 2.59375,
      "learning_rate": 2.0677966101694914e-06,
      "loss": 1.1451,
      "step": 1190
    },
    {
      "epoch": 0.9146341463414634,
      "grad_norm": 2.5625,
      "learning_rate": 1.8983050847457629e-06,
      "loss": 1.1496,
      "step": 1200
    },
    {
      "epoch": 0.9222560975609756,
      "grad_norm": 2.859375,
      "learning_rate": 1.728813559322034e-06,
      "loss": 1.1582,
      "step": 1210
    },
    {
      "epoch": 0.9298780487804879,
      "grad_norm": 2.65625,
      "learning_rate": 1.5593220338983054e-06,
      "loss": 1.227,
      "step": 1220
    },
    {
      "epoch": 0.9375,
      "grad_norm": 2.703125,
      "learning_rate": 1.3898305084745764e-06,
      "loss": 1.1977,
      "step": 1230
    },
    {
      "epoch": 0.9451219512195121,
      "grad_norm": 2.671875,
      "learning_rate": 1.2203389830508477e-06,
      "loss": 1.1885,
      "step": 1240
    },
    {
      "epoch": 0.9527439024390244,
      "grad_norm": 2.6875,
      "learning_rate": 1.0508474576271187e-06,
      "loss": 1.1549,
      "step": 1250
    },
    {
      "epoch": 0.9603658536585366,
      "grad_norm": 2.53125,
      "learning_rate": 8.813559322033899e-07,
      "loss": 1.1781,
      "step": 1260
    },
    {
      "epoch": 0.9679878048780488,
      "grad_norm": 2.5625,
      "learning_rate": 7.118644067796611e-07,
      "loss": 1.1957,
      "step": 1270
    },
    {
      "epoch": 0.975609756097561,
      "grad_norm": 2.640625,
      "learning_rate": 5.423728813559322e-07,
      "loss": 1.1898,
      "step": 1280
    },
    {
      "epoch": 0.9832317073170732,
      "grad_norm": 2.625,
      "learning_rate": 3.7288135593220347e-07,
      "loss": 1.1531,
      "step": 1290
    },
    {
      "epoch": 0.9908536585365854,
      "grad_norm": 2.6875,
      "learning_rate": 2.0338983050847458e-07,
      "loss": 1.1773,
      "step": 1300
    },
    {
      "epoch": 0.9984756097560976,
      "grad_norm": 2.640625,
      "learning_rate": 3.3898305084745764e-08,
      "loss": 1.0754,
      "step": 1310
    },
    {
      "epoch": 1.0,
      "step": 1312,
      "total_flos": 4.7710598211200614e+17,
      "train_loss": 1.2026635146722562,
      "train_runtime": 2875.3707,
      "train_samples_per_second": 14.591,
      "train_steps_per_second": 0.456
    }
  ],
  "logging_steps": 10,
  "max_steps": 1312,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.7710598211200614e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}