|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 50, |
|
"global_step": 485, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008247422680412371, |
|
"grad_norm": 16.75, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.8829, |
|
"num_input_tokens_seen": 1413808, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.016494845360824743, |
|
"grad_norm": 3.90625, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.4033, |
|
"num_input_tokens_seen": 2866496, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.024742268041237112, |
|
"grad_norm": 5.0, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2755, |
|
"num_input_tokens_seen": 4305104, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.032989690721649485, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 2.6666666666666667e-05, |
|
"loss": 0.2531, |
|
"num_input_tokens_seen": 5594128, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.041237113402061855, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.2585, |
|
"num_input_tokens_seen": 6683376, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.049484536082474224, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 4e-05, |
|
"loss": 0.2308, |
|
"num_input_tokens_seen": 8030336, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0577319587628866, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 3.9992569962849926e-05, |
|
"loss": 0.2221, |
|
"num_input_tokens_seen": 9395728, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.06597938144329897, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 3.99702853719449e-05, |
|
"loss": 0.2259, |
|
"num_input_tokens_seen": 10689344, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.07422680412371134, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 3.9933162784818745e-05, |
|
"loss": 0.2201, |
|
"num_input_tokens_seen": 11936704, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.08247422680412371, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 3.988122978369162e-05, |
|
"loss": 0.2242, |
|
"num_input_tokens_seen": 13217248, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.09072164948453608, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 3.981452495497628e-05, |
|
"loss": 0.2213, |
|
"num_input_tokens_seen": 14587328, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.09896907216494845, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 3.973309786060829e-05, |
|
"loss": 0.1958, |
|
"num_input_tokens_seen": 15976464, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.10721649484536082, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 3.963700900122124e-05, |
|
"loss": 0.2136, |
|
"num_input_tokens_seen": 17262576, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1154639175257732, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 3.952632977119465e-05, |
|
"loss": 0.2059, |
|
"num_input_tokens_seen": 18801264, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.12371134020618557, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 3.9401142405607594e-05, |
|
"loss": 0.197, |
|
"num_input_tokens_seen": 20158000, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.13195876288659794, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 3.9261539919137776e-05, |
|
"loss": 0.2273, |
|
"num_input_tokens_seen": 21322240, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.1402061855670103, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 3.9107626036951266e-05, |
|
"loss": 0.1971, |
|
"num_input_tokens_seen": 22631360, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.14845360824742268, |
|
"grad_norm": 1.125, |
|
"learning_rate": 3.8939515117634326e-05, |
|
"loss": 0.2194, |
|
"num_input_tokens_seen": 23848496, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.15670103092783505, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 3.875733206822452e-05, |
|
"loss": 0.2215, |
|
"num_input_tokens_seen": 25148336, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.16494845360824742, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 3.8561212251404406e-05, |
|
"loss": 0.1989, |
|
"num_input_tokens_seen": 26427264, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1731958762886598, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 3.835130138492644e-05, |
|
"loss": 0.2072, |
|
"num_input_tokens_seen": 27833024, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.18144329896907216, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 3.812775543334425e-05, |
|
"loss": 0.2013, |
|
"num_input_tokens_seen": 29273008, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.18969072164948453, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 3.789074049213033e-05, |
|
"loss": 0.2119, |
|
"num_input_tokens_seen": 30628416, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.1979381443298969, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 3.7640432664266514e-05, |
|
"loss": 0.2213, |
|
"num_input_tokens_seen": 31861856, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.20618556701030927, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 3.737701792939881e-05, |
|
"loss": 0.2102, |
|
"num_input_tokens_seen": 33121072, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.21443298969072164, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 3.7100692005653796e-05, |
|
"loss": 0.2052, |
|
"num_input_tokens_seen": 34464560, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.22268041237113403, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 3.681166020421938e-05, |
|
"loss": 0.1942, |
|
"num_input_tokens_seen": 35918800, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2309278350515464, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 3.6510137276797786e-05, |
|
"loss": 0.1946, |
|
"num_input_tokens_seen": 37267616, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.23917525773195877, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 3.6196347256044236e-05, |
|
"loss": 0.2263, |
|
"num_input_tokens_seen": 38542608, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.24742268041237114, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 3.5870523289109886e-05, |
|
"loss": 0.2102, |
|
"num_input_tokens_seen": 39934016, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2556701030927835, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 3.553290746441261e-05, |
|
"loss": 0.2084, |
|
"num_input_tokens_seen": 41070080, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.2639175257731959, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 3.5183750631764406e-05, |
|
"loss": 0.1939, |
|
"num_input_tokens_seen": 42375696, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.2721649484536082, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 3.4823312215989046e-05, |
|
"loss": 0.2027, |
|
"num_input_tokens_seen": 43648368, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.2804123711340206, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 3.445186002416849e-05, |
|
"loss": 0.2093, |
|
"num_input_tokens_seen": 44952352, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.28865979381443296, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 3.4069670046661197e-05, |
|
"loss": 0.1887, |
|
"num_input_tokens_seen": 46407584, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.29690721649484536, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 3.3677026252040306e-05, |
|
"loss": 0.2109, |
|
"num_input_tokens_seen": 47649744, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.30515463917525776, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 3.327422037610389e-05, |
|
"loss": 0.2072, |
|
"num_input_tokens_seen": 49014464, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.3134020618556701, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 3.286155170511419e-05, |
|
"loss": 0.2046, |
|
"num_input_tokens_seen": 50443616, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.3216494845360825, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 3.2439326853426824e-05, |
|
"loss": 0.211, |
|
"num_input_tokens_seen": 51801328, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.32989690721649484, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 3.200785953567517e-05, |
|
"loss": 0.1977, |
|
"num_input_tokens_seen": 53112944, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.33814432989690724, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 3.156747033367922e-05, |
|
"loss": 0.2001, |
|
"num_input_tokens_seen": 54444256, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.3463917525773196, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 3.1118486458252094e-05, |
|
"loss": 0.2, |
|
"num_input_tokens_seen": 55882912, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.354639175257732, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 3.0661241506081236e-05, |
|
"loss": 0.1997, |
|
"num_input_tokens_seen": 57157872, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.3628865979381443, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 3.019607521186475e-05, |
|
"loss": 0.2085, |
|
"num_input_tokens_seen": 58474160, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.3711340206185567, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 2.972333319588736e-05, |
|
"loss": 0.2093, |
|
"num_input_tokens_seen": 59687904, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.37938144329896906, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 2.9243366707223165e-05, |
|
"loss": 0.1963, |
|
"num_input_tokens_seen": 61006320, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.38762886597938145, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 2.875653236275632e-05, |
|
"loss": 0.2001, |
|
"num_input_tokens_seen": 62265552, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.3958762886597938, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 2.8263191882213362e-05, |
|
"loss": 0.1948, |
|
"num_input_tokens_seen": 63682384, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.4041237113402062, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 2.7763711819404098e-05, |
|
"loss": 0.2048, |
|
"num_input_tokens_seen": 64848160, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.41237113402061853, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 2.7258463289870764e-05, |
|
"loss": 0.192, |
|
"num_input_tokens_seen": 66278032, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.42061855670103093, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 2.6747821695147806e-05, |
|
"loss": 0.1933, |
|
"num_input_tokens_seen": 67686560, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.4288659793814433, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 2.623216644383715e-05, |
|
"loss": 0.2094, |
|
"num_input_tokens_seen": 68863776, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.43711340206185567, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 2.5711880669706172e-05, |
|
"loss": 0.1964, |
|
"num_input_tokens_seen": 70186224, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.44536082474226807, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 2.5187350947017918e-05, |
|
"loss": 0.2042, |
|
"num_input_tokens_seen": 71498112, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.4536082474226804, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 2.4658967003304986e-05, |
|
"loss": 0.1908, |
|
"num_input_tokens_seen": 72880736, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.4618556701030928, |
|
"grad_norm": 1.0, |
|
"learning_rate": 2.4127121429800498e-05, |
|
"loss": 0.187, |
|
"num_input_tokens_seen": 74122048, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.47010309278350515, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 2.3592209389741372e-05, |
|
"loss": 0.1778, |
|
"num_input_tokens_seen": 75602400, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.47835051546391755, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 2.30546283247606e-05, |
|
"loss": 0.207, |
|
"num_input_tokens_seen": 76746240, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.4865979381443299, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 2.251477765958655e-05, |
|
"loss": 0.1911, |
|
"num_input_tokens_seen": 78209744, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.4948453608247423, |
|
"grad_norm": 1.125, |
|
"learning_rate": 2.1973058505269007e-05, |
|
"loss": 0.1935, |
|
"num_input_tokens_seen": 79494896, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5030927835051546, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 2.1429873361152124e-05, |
|
"loss": 0.1977, |
|
"num_input_tokens_seen": 80721808, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.511340206185567, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 2.088562581581592e-05, |
|
"loss": 0.1956, |
|
"num_input_tokens_seen": 81918944, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.5195876288659794, |
|
"grad_norm": 1.125, |
|
"learning_rate": 2.0340720247208447e-05, |
|
"loss": 0.1912, |
|
"num_input_tokens_seen": 83184064, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.5278350515463918, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 1.9795561522191523e-05, |
|
"loss": 0.1843, |
|
"num_input_tokens_seen": 84574976, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.5360824742268041, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 1.9250554695723107e-05, |
|
"loss": 0.1942, |
|
"num_input_tokens_seen": 85844768, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5443298969072164, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 1.8706104709899964e-05, |
|
"loss": 0.1922, |
|
"num_input_tokens_seen": 87245056, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.5525773195876289, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 1.816261609308419e-05, |
|
"loss": 0.182, |
|
"num_input_tokens_seen": 88603792, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.5608247422680412, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 1.7620492659337155e-05, |
|
"loss": 0.1879, |
|
"num_input_tokens_seen": 90054816, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.5690721649484536, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 1.7080137208384122e-05, |
|
"loss": 0.1809, |
|
"num_input_tokens_seen": 91432912, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.5773195876288659, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 1.6541951226332565e-05, |
|
"loss": 0.1735, |
|
"num_input_tokens_seen": 92795296, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5855670103092784, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 1.600633458736653e-05, |
|
"loss": 0.1915, |
|
"num_input_tokens_seen": 94071744, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.5938144329896907, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 1.5473685256638572e-05, |
|
"loss": 0.1895, |
|
"num_input_tokens_seen": 95342096, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.6020618556701031, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 1.4944398994580232e-05, |
|
"loss": 0.1869, |
|
"num_input_tokens_seen": 96569312, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.6103092783505155, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 1.4418869062850514e-05, |
|
"loss": 0.2004, |
|
"num_input_tokens_seen": 97849216, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.6185567010309279, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 1.3897485932141042e-05, |
|
"loss": 0.1865, |
|
"num_input_tokens_seen": 99083488, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6268041237113402, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 1.3380636992054878e-05, |
|
"loss": 0.1769, |
|
"num_input_tokens_seen": 100566624, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.6350515463917525, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 1.2868706263274602e-05, |
|
"loss": 0.1969, |
|
"num_input_tokens_seen": 101823872, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.643298969072165, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 1.236207411223353e-05, |
|
"loss": 0.1767, |
|
"num_input_tokens_seen": 103284176, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.6515463917525773, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 1.1861116968502015e-05, |
|
"loss": 0.1799, |
|
"num_input_tokens_seen": 104567360, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.6597938144329897, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 1.136620704509892e-05, |
|
"loss": 0.1856, |
|
"num_input_tokens_seen": 105872848, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.668041237113402, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 1.087771206193593e-05, |
|
"loss": 0.1791, |
|
"num_input_tokens_seen": 107217232, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.6762886597938145, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 1.0395994972600285e-05, |
|
"loss": 0.1806, |
|
"num_input_tokens_seen": 108626976, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.6845360824742268, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 9.921413694678959e-06, |
|
"loss": 0.2018, |
|
"num_input_tokens_seen": 109754000, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.6927835051546392, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 9.454320843824512e-06, |
|
"loss": 0.1848, |
|
"num_input_tokens_seen": 111026592, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.7010309278350515, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 8.995063471760377e-06, |
|
"loss": 0.1885, |
|
"num_input_tokens_seen": 112287760, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.709278350515464, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 8.543982808420156e-06, |
|
"loss": 0.1838, |
|
"num_input_tokens_seen": 113634128, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.7175257731958763, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 8.101414008412469e-06, |
|
"loss": 0.1842, |
|
"num_input_tokens_seen": 114949760, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.7257731958762886, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 7.667685901999875e-06, |
|
"loss": 0.1935, |
|
"num_input_tokens_seen": 116223648, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.734020618556701, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 7.24312075077674e-06, |
|
"loss": 0.1866, |
|
"num_input_tokens_seen": 117618112, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.7422680412371134, |
|
"grad_norm": 0.875, |
|
"learning_rate": 6.828034008227678e-06, |
|
"loss": 0.1751, |
|
"num_input_tokens_seen": 119000256, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.7505154639175258, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 6.422734085344464e-06, |
|
"loss": 0.1796, |
|
"num_input_tokens_seen": 120232672, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.7587628865979381, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 6.027522121475482e-06, |
|
"loss": 0.1783, |
|
"num_input_tokens_seen": 121499376, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.7670103092783506, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 5.642691760578116e-06, |
|
"loss": 0.1856, |
|
"num_input_tokens_seen": 122791312, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.7752577319587629, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 5.268528933040147e-06, |
|
"loss": 0.1674, |
|
"num_input_tokens_seen": 124261040, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.7835051546391752, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 4.905311643232464e-06, |
|
"loss": 0.1773, |
|
"num_input_tokens_seen": 125708848, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.7917525773195876, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.553309762950739e-06, |
|
"loss": 0.1905, |
|
"num_input_tokens_seen": 126865712, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 4.212784830899725e-06, |
|
"loss": 0.1793, |
|
"num_input_tokens_seen": 128157040, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.8082474226804124, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.8839898583689725e-06, |
|
"loss": 0.1812, |
|
"num_input_tokens_seen": 129465312, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.8164948453608247, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 3.567169141244562e-06, |
|
"loss": 0.1813, |
|
"num_input_tokens_seen": 130665504, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.8247422680412371, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 3.262558078496301e-06, |
|
"loss": 0.1727, |
|
"num_input_tokens_seen": 132002896, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8329896907216495, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 2.9703829972754407e-06, |
|
"loss": 0.1858, |
|
"num_input_tokens_seen": 133415088, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.8412371134020619, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 2.69086098475277e-06, |
|
"loss": 0.1707, |
|
"num_input_tokens_seen": 134815184, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.8494845360824742, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 2.4241997268220096e-06, |
|
"loss": 0.1822, |
|
"num_input_tokens_seen": 136261472, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.8577319587628865, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 2.1705973537884615e-06, |
|
"loss": 0.1809, |
|
"num_input_tokens_seen": 137429504, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.865979381443299, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 1.9302422931574183e-06, |
|
"loss": 0.1885, |
|
"num_input_tokens_seen": 138708544, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.8742268041237113, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 1.7033131296318473e-06, |
|
"loss": 0.1687, |
|
"num_input_tokens_seen": 140033024, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.8824742268041237, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 1.4899784724232968e-06, |
|
"loss": 0.1748, |
|
"num_input_tokens_seen": 141348192, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.8907216494845361, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 1.2903968299746094e-06, |
|
"loss": 0.1716, |
|
"num_input_tokens_seen": 142800048, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.8989690721649485, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 1.104716492187574e-06, |
|
"loss": 0.1841, |
|
"num_input_tokens_seen": 144156592, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.9072164948453608, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 9.330754202429726e-07, |
|
"loss": 0.1855, |
|
"num_input_tokens_seen": 145334944, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.9154639175257732, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 7.756011440948996e-07, |
|
"loss": 0.1895, |
|
"num_input_tokens_seen": 146529728, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.9237113402061856, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 6.324106677155573e-07, |
|
"loss": 0.1841, |
|
"num_input_tokens_seen": 147823952, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.931958762886598, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 5.036103821608485e-07, |
|
"loss": 0.1838, |
|
"num_input_tokens_seen": 149194048, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.9402061855670103, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.892959865214363e-07, |
|
"loss": 0.1773, |
|
"num_input_tokens_seen": 150529248, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.9484536082474226, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 2.8955241681795534e-07, |
|
"loss": 0.1863, |
|
"num_input_tokens_seen": 151864336, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.9567010309278351, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 2.044537828932458e-07, |
|
"loss": 0.1803, |
|
"num_input_tokens_seen": 153203872, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.9649484536082474, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 1.3406331334845813e-07, |
|
"loss": 0.1869, |
|
"num_input_tokens_seen": 154513568, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.9731958762886598, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 7.843330856396103e-08, |
|
"loss": 0.1818, |
|
"num_input_tokens_seen": 155739584, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.9814432989690721, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 3.760510183997701e-08, |
|
"loss": 0.1826, |
|
"num_input_tokens_seen": 157087344, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.9896907216494846, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 1.160902868577951e-08, |
|
"loss": 0.1905, |
|
"num_input_tokens_seen": 158352288, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.9979381443298969, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 4.64404280295927e-10, |
|
"loss": 0.1794, |
|
"num_input_tokens_seen": 159671536, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.09546061605215073, |
|
"eval_runtime": 83.6615, |
|
"eval_samples_per_second": 12.419, |
|
"eval_steps_per_second": 0.394, |
|
"num_input_tokens_seen": 160044208, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"num_input_tokens_seen": 160044208, |
|
"step": 485, |
|
"total_flos": 9.013493089079132e+17, |
|
"train_loss": 0.20329311268845784, |
|
"train_runtime": 14400.7959, |
|
"train_samples_per_second": 4.305, |
|
"train_steps_per_second": 0.034, |
|
"train_tokens_per_second": 1385.946 |
|
} |
|
], |
|
"logging_steps": 4, |
|
"max_steps": 485, |
|
"num_input_tokens_seen": 160044208, |
|
"num_train_epochs": 1, |
|
"save_steps": 0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.013493089079132e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|