|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9999842829076621, |
|
"eval_steps": 1590, |
|
"global_step": 15906, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0009430255402750491, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.001, |
|
"loss": 5.5551, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0018860510805500982, |
|
"grad_norm": 0.0791015625, |
|
"learning_rate": 0.001, |
|
"loss": 3.5038, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.002829076620825147, |
|
"grad_norm": 0.1298828125, |
|
"learning_rate": 0.001, |
|
"loss": 3.5068, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0037721021611001964, |
|
"grad_norm": 0.0693359375, |
|
"learning_rate": 0.001, |
|
"loss": 3.4288, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.004715127701375246, |
|
"grad_norm": 0.12255859375, |
|
"learning_rate": 0.001, |
|
"loss": 3.3071, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.005658153241650294, |
|
"grad_norm": 0.091796875, |
|
"learning_rate": 0.001, |
|
"loss": 3.2653, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.006601178781925344, |
|
"grad_norm": 0.1318359375, |
|
"learning_rate": 0.001, |
|
"loss": 3.1297, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.007544204322200393, |
|
"grad_norm": 0.12451171875, |
|
"learning_rate": 0.001, |
|
"loss": 3.0482, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.008487229862475442, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.9037, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.009430255402750491, |
|
"grad_norm": 0.1650390625, |
|
"learning_rate": 0.001, |
|
"loss": 2.8178, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.01037328094302554, |
|
"grad_norm": 0.111328125, |
|
"learning_rate": 0.001, |
|
"loss": 2.687, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.011316306483300589, |
|
"grad_norm": 0.1640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.6247, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.01225933202357564, |
|
"grad_norm": 0.1298828125, |
|
"learning_rate": 0.001, |
|
"loss": 2.5556, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.013202357563850688, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4524, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.014145383104125737, |
|
"grad_norm": 0.1083984375, |
|
"learning_rate": 0.001, |
|
"loss": 2.4904, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.015088408644400786, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 0.001, |
|
"loss": 2.4211, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.016031434184675834, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.419, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.016974459724950885, |
|
"grad_norm": 0.130859375, |
|
"learning_rate": 0.001, |
|
"loss": 2.3542, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.017917485265225932, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2893, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.018860510805500982, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.001, |
|
"loss": 2.2671, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.019803536345776033, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 0.001, |
|
"loss": 2.2644, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.02074656188605108, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2669, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.02168958742632613, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 0.001, |
|
"loss": 2.2009, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.022632612966601177, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.001, |
|
"loss": 2.1569, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.023575638506876228, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.0607, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.02451866404715128, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.001, |
|
"loss": 2.1118, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.025461689587426325, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.001, |
|
"loss": 2.0465, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.026404715127701376, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.001, |
|
"loss": 2.0682, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.027347740667976423, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 0.001, |
|
"loss": 2.014, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.028290766208251474, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 2.0251, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.029233791748526524, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.001, |
|
"loss": 1.991, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.03017681728880157, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 0.001, |
|
"loss": 1.9579, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.03111984282907662, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 0.001, |
|
"loss": 1.9253, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.03206286836935167, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.001, |
|
"loss": 1.9019, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.033005893909626716, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 0.001, |
|
"loss": 1.9208, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.03394891944990177, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.001, |
|
"loss": 1.9165, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.03489194499017682, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 1.8541, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.035834970530451864, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001, |
|
"loss": 1.8854, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.03677799607072692, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.001, |
|
"loss": 1.8651, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.037721021611001965, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 1.8392, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.03866404715127701, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.001, |
|
"loss": 1.843, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.039607072691552066, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.001, |
|
"loss": 1.7958, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.04055009823182711, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.001, |
|
"loss": 1.7849, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.04149312377210216, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 0.001, |
|
"loss": 1.7397, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.04243614931237721, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.001, |
|
"loss": 1.7396, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.04337917485265226, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.001, |
|
"loss": 1.7219, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.04432220039292731, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.001, |
|
"loss": 1.7536, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.045265225933202355, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 0.001, |
|
"loss": 1.697, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.04620825147347741, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.001, |
|
"loss": 1.6725, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.047151277013752456, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 0.001, |
|
"loss": 1.691, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.0480943025540275, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 1.6721, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.04903732809430256, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.001, |
|
"loss": 1.7221, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.049980353634577604, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 0.001, |
|
"loss": 1.6609, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.05092337917485265, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.001, |
|
"loss": 1.6805, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.0518664047151277, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.001, |
|
"loss": 1.6157, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.05280943025540275, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 0.001, |
|
"loss": 1.5996, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.0537524557956778, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.001, |
|
"loss": 1.5686, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.054695481335952846, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.001, |
|
"loss": 1.6021, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.0556385068762279, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.001, |
|
"loss": 1.6159, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.05658153241650295, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.001, |
|
"loss": 1.5456, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.057524557956777994, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.001, |
|
"loss": 1.5764, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.05846758349705305, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.001, |
|
"loss": 1.5426, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.059410609037328095, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.001, |
|
"loss": 1.5535, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.06035363457760314, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.001, |
|
"loss": 1.505, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.06129666011787819, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.001, |
|
"loss": 1.5328, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.06223968565815324, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.001, |
|
"loss": 1.5274, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.06318271119842829, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 1.5246, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.06412573673870334, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.001, |
|
"loss": 1.4633, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.06506876227897838, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.001, |
|
"loss": 1.487, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.06601178781925343, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.001, |
|
"loss": 1.4582, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.06695481335952849, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.001, |
|
"loss": 1.4586, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.06789783889980354, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.001, |
|
"loss": 1.4322, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.06884086444007859, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.001, |
|
"loss": 1.47, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.06978388998035363, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.001, |
|
"loss": 1.4215, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.07072691552062868, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001, |
|
"loss": 1.4569, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.07166994106090373, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.001, |
|
"loss": 1.4428, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.07261296660117879, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.001, |
|
"loss": 1.3861, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.07355599214145384, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.001, |
|
"loss": 1.4478, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.07449901768172888, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.001, |
|
"loss": 1.406, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.07544204322200393, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.001, |
|
"loss": 1.3944, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.07638506876227898, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.001, |
|
"loss": 1.3884, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.07732809430255402, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.001, |
|
"loss": 1.38, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.07827111984282907, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001, |
|
"loss": 1.3446, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.07921414538310413, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.001, |
|
"loss": 1.351, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.08015717092337918, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.001, |
|
"loss": 1.352, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.08110019646365423, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.001, |
|
"loss": 1.3378, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.08204322200392927, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.001, |
|
"loss": 1.3056, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.08298624754420432, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 0.001, |
|
"loss": 1.3099, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.08392927308447937, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.001, |
|
"loss": 1.3364, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.08487229862475441, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.001, |
|
"loss": 1.2865, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.08581532416502947, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.001, |
|
"loss": 1.3022, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.08675834970530452, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.001, |
|
"loss": 1.2641, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.08770137524557957, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 1.291, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.08864440078585462, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.001, |
|
"loss": 1.2947, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.08958742632612966, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.001, |
|
"loss": 1.2626, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.09053045186640471, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001, |
|
"loss": 1.2719, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.09147347740667977, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001, |
|
"loss": 1.2817, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.09241650294695482, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.001, |
|
"loss": 1.2678, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.09335952848722986, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.001, |
|
"loss": 1.2336, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.09430255402750491, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.001, |
|
"loss": 1.2415, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.09524557956777996, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.001, |
|
"loss": 1.2478, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.096188605108055, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.001, |
|
"loss": 1.2475, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.09713163064833005, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.001, |
|
"loss": 1.2128, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.09807465618860511, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.001, |
|
"loss": 1.2292, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.09901768172888016, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.001, |
|
"loss": 1.2015, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.09996070726915521, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 1.2088, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.09996070726915521, |
|
"eval_loss": 1.5537890195846558, |
|
"eval_runtime": 9.6819, |
|
"eval_samples_per_second": 103.285, |
|
"eval_steps_per_second": 1.446, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.10090373280943025, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.001, |
|
"loss": 1.2156, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.1018467583497053, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.001, |
|
"loss": 1.2115, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.10278978388998035, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.001, |
|
"loss": 1.2202, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.1037328094302554, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.001, |
|
"loss": 1.2208, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.10467583497053046, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 1.1911, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.1056188605108055, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 1.2102, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.10656188605108055, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.001, |
|
"loss": 1.1984, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.1075049115913556, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.001, |
|
"loss": 1.2012, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.10844793713163065, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.001, |
|
"loss": 1.1869, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.10939096267190569, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.001, |
|
"loss": 1.1948, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.11033398821218075, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.001, |
|
"loss": 1.1783, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.1112770137524558, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.001, |
|
"loss": 1.1893, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.11222003929273085, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.001, |
|
"loss": 1.1495, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.1131630648330059, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.001, |
|
"loss": 1.175, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.11410609037328094, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.001, |
|
"loss": 1.1588, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.11504911591355599, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.001, |
|
"loss": 1.1376, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.11599214145383104, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.001, |
|
"loss": 1.1511, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.1169351669941061, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 1.1645, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.11787819253438114, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.001, |
|
"loss": 1.1619, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.11882121807465619, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 1.1304, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.11976424361493124, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.001, |
|
"loss": 1.1361, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.12070726915520628, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.001, |
|
"loss": 1.1151, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.12165029469548133, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.001, |
|
"loss": 1.1299, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.12259332023575638, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.001, |
|
"loss": 1.1334, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.12353634577603144, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 1.112, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.12447937131630649, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.001, |
|
"loss": 1.1034, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.12542239685658152, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.001, |
|
"loss": 1.12, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.12636542239685658, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.001, |
|
"loss": 1.0996, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.12730844793713164, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.001, |
|
"loss": 1.1141, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.12825147347740667, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.001, |
|
"loss": 1.1112, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.12919449901768174, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.001, |
|
"loss": 1.1229, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.13013752455795677, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.001, |
|
"loss": 1.074, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.13108055009823183, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.001, |
|
"loss": 1.1199, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.13202357563850686, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.001, |
|
"loss": 1.097, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.13296660117878192, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.001, |
|
"loss": 1.0832, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.13390962671905698, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.001, |
|
"loss": 1.0887, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.13485265225933202, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.001, |
|
"loss": 1.066, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.13579567779960708, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.001, |
|
"loss": 1.0979, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.1367387033398821, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.001, |
|
"loss": 1.101, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.13768172888015717, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.001, |
|
"loss": 1.0761, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.13862475442043223, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 1.0845, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.13956777996070727, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 1.0938, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.14051080550098233, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.001, |
|
"loss": 1.0659, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.14145383104125736, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.001, |
|
"loss": 1.0683, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.14239685658153242, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.001, |
|
"loss": 1.0777, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.14333988212180745, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.001, |
|
"loss": 1.0741, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.14428290766208252, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.001, |
|
"loss": 1.0533, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.14522593320235758, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 1.0655, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.1461689587426326, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.001, |
|
"loss": 1.0541, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.14711198428290767, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 1.0506, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.1480550098231827, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.001, |
|
"loss": 1.0596, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.14899803536345776, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 1.0586, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.1499410609037328, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.001, |
|
"loss": 1.0466, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.15088408644400786, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.001, |
|
"loss": 1.0485, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.15182711198428292, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.001, |
|
"loss": 1.011, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.15277013752455795, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 1.0434, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.153713163064833, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 1.0353, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.15465618860510805, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.001, |
|
"loss": 1.0222, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.1555992141453831, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.001, |
|
"loss": 1.0403, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.15654223968565814, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.001, |
|
"loss": 1.0397, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.1574852652259332, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.001, |
|
"loss": 1.0382, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.15842829076620826, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.001, |
|
"loss": 1.0336, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.1593713163064833, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.001, |
|
"loss": 1.0083, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.16031434184675836, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.001, |
|
"loss": 1.0236, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.1612573673870334, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.001, |
|
"loss": 1.0245, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.16220039292730845, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.001, |
|
"loss": 1.026, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.16314341846758348, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.001, |
|
"loss": 1.0276, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.16408644400785855, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9937, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.1650294695481336, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.001, |
|
"loss": 1.0249, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.16597249508840864, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 1.0096, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.1669155206286837, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 1.0195, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.16785854616895873, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.001, |
|
"loss": 1.018, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.1688015717092338, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 1.0289, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.16974459724950883, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.001, |
|
"loss": 0.9931, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.1706876227897839, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.001, |
|
"loss": 1.0101, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 0.17163064833005895, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 1.0159, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.17257367387033398, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.001, |
|
"loss": 1.0094, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 0.17351669941060904, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 1.0081, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.17445972495088408, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9958, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.17540275049115914, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9909, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.1763457760314342, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9854, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 0.17728880157170923, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9858, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.1782318271119843, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9825, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.17917485265225933, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.001, |
|
"loss": 1.0153, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.1801178781925344, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9984, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 0.18106090373280942, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.9832, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.18200392927308448, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9843, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 0.18294695481335954, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9774, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.18388998035363457, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9824, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.18483300589390964, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9884, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.18577603143418467, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9684, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 0.18671905697445973, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9746, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.18766208251473476, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9831, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 0.18860510805500982, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9868, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.18954813359528488, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9687, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 0.19049115913555992, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9759, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.19143418467583498, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9755, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 0.19237721021611, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9784, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.19332023575638507, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9691, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.1942632612966601, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9851, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.19520628683693517, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9695, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 0.19614931237721023, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.993, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.19709233791748526, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9625, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 0.19803536345776032, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.001, |
|
"loss": 0.9655, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.19897838899803535, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9606, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 0.19992141453831042, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.9608, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.19992141453831042, |
|
"eval_loss": 1.169226050376892, |
|
"eval_runtime": 9.7503, |
|
"eval_samples_per_second": 102.561, |
|
"eval_steps_per_second": 1.436, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.20086444007858545, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9741, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 0.2018074656188605, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9608, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.20275049115913557, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9464, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.2036935166994106, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9683, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.20463654223968566, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9308, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 0.2055795677799607, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9541, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.20652259332023576, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9452, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 0.2074656188605108, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9673, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.20840864440078585, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9508, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 0.2093516699410609, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.955, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.21029469548133595, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9499, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 0.211237721021611, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.001, |
|
"loss": 0.9441, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.21218074656188604, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.9476, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.2131237721021611, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9506, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.21406679764243616, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9546, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 0.2150098231827112, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9488, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.21595284872298626, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9473, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 0.2168958742632613, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9491, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.21783889980353635, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9304, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 0.21878192534381138, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.9482, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.21972495088408645, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9418, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 0.2206679764243615, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9226, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.22161100196463654, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9427, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 0.2225540275049116, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.9261, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.22349705304518663, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9418, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 0.2244400785854617, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9382, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.22538310412573673, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9353, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 0.2263261296660118, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9138, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.22726915520628685, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9033, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 0.22821218074656188, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.9337, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.22915520628683694, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9188, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 0.23009823182711198, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001, |
|
"loss": 0.9407, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.23104125736738704, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9068, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 0.23198428290766207, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9079, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.23292730844793713, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.9095, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 0.2338703339882122, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9148, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.23481335952848723, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9044, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 0.2357563850687623, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9401, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.23669941060903732, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9228, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 0.23764243614931238, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9071, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.2385854616895874, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.92, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 0.23952848722986247, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9323, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.24047151277013754, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.001, |
|
"loss": 0.9013, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 0.24141453831041257, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9045, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.24235756385068763, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9049, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 0.24330058939096266, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8902, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.24424361493123772, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.911, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 0.24518664047151276, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.9092, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.24612966601178782, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.894, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 0.24707269155206288, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9096, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.2480157170923379, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9147, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 0.24895874263261297, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9088, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.249901768172888, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9116, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 0.25084479371316304, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.901, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.2517878192534381, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9013, |
|
"step": 4005 |
|
}, |
|
{ |
|
"epoch": 0.25273084479371316, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.001, |
|
"loss": 0.903, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.2536738703339882, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8916, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 0.2546168958742633, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.001, |
|
"loss": 0.897, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.2555599214145383, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.9015, |
|
"step": 4065 |
|
}, |
|
{ |
|
"epoch": 0.25650294695481335, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.001, |
|
"loss": 0.897, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.2574459724950884, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8936, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 0.25838899803536347, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9048, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.2593320235756385, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8973, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 0.26027504911591354, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.001, |
|
"loss": 0.9053, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.2612180746561886, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.001, |
|
"loss": 0.9121, |
|
"step": 4155 |
|
}, |
|
{ |
|
"epoch": 0.26216110019646366, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.001, |
|
"loss": 0.89, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.2631041257367387, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.9025, |
|
"step": 4185 |
|
}, |
|
{ |
|
"epoch": 0.2640471512770137, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.899, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.2649901768172888, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8793, |
|
"step": 4215 |
|
}, |
|
{ |
|
"epoch": 0.26593320235756385, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8964, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.2668762278978389, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.896, |
|
"step": 4245 |
|
}, |
|
{ |
|
"epoch": 0.26781925343811397, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.886, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.268762278978389, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8861, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 0.26970530451866404, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8864, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.2706483300589391, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8834, |
|
"step": 4305 |
|
}, |
|
{ |
|
"epoch": 0.27159135559921416, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8859, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.2725343811394892, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8953, |
|
"step": 4335 |
|
}, |
|
{ |
|
"epoch": 0.2734774066797642, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8928, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.2744204322200393, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8821, |
|
"step": 4365 |
|
}, |
|
{ |
|
"epoch": 0.27536345776031435, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8872, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.2763064833005894, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8753, |
|
"step": 4395 |
|
}, |
|
{ |
|
"epoch": 0.27724950884086447, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9047, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.2781925343811395, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8876, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 0.27913555992141453, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.001, |
|
"loss": 0.864, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.28007858546168957, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8863, |
|
"step": 4455 |
|
}, |
|
{ |
|
"epoch": 0.28102161100196466, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.9028, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.2819646365422397, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001, |
|
"loss": 0.8684, |
|
"step": 4485 |
|
}, |
|
{ |
|
"epoch": 0.2829076620825147, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8808, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.2838506876227898, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8736, |
|
"step": 4515 |
|
}, |
|
{ |
|
"epoch": 0.28479371316306484, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8729, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.2857367387033399, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8807, |
|
"step": 4545 |
|
}, |
|
{ |
|
"epoch": 0.2866797642436149, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8716, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.28762278978389, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8754, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 0.28856581532416503, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.001, |
|
"loss": 0.866, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.28950884086444006, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8661, |
|
"step": 4605 |
|
}, |
|
{ |
|
"epoch": 0.29045186640471515, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8797, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.2913948919449902, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8523, |
|
"step": 4635 |
|
}, |
|
{ |
|
"epoch": 0.2923379174852652, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8774, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.29328094302554025, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8785, |
|
"step": 4665 |
|
}, |
|
{ |
|
"epoch": 0.29422396856581534, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8648, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.2951669941060904, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8676, |
|
"step": 4695 |
|
}, |
|
{ |
|
"epoch": 0.2961100196463654, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001, |
|
"loss": 0.8557, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.2970530451866405, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8694, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 0.29799607072691553, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8459, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.29893909626719056, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8551, |
|
"step": 4755 |
|
}, |
|
{ |
|
"epoch": 0.2998821218074656, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8717, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.2998821218074656, |
|
"eval_loss": 1.035895824432373, |
|
"eval_runtime": 9.7687, |
|
"eval_samples_per_second": 102.368, |
|
"eval_steps_per_second": 1.433, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.3008251473477407, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8668, |
|
"step": 4785 |
|
}, |
|
{ |
|
"epoch": 0.3017681728880157, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8674, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.30271119842829075, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8886, |
|
"step": 4815 |
|
}, |
|
{ |
|
"epoch": 0.30365422396856584, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.854, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.3045972495088409, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001, |
|
"loss": 0.8513, |
|
"step": 4845 |
|
}, |
|
{ |
|
"epoch": 0.3055402750491159, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8574, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.30648330058939094, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8437, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 0.307426326129666, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8604, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.30836935166994106, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8544, |
|
"step": 4905 |
|
}, |
|
{ |
|
"epoch": 0.3093123772102161, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8607, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.3102554027504912, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8454, |
|
"step": 4935 |
|
}, |
|
{ |
|
"epoch": 0.3111984282907662, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8575, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.31214145383104125, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8401, |
|
"step": 4965 |
|
}, |
|
{ |
|
"epoch": 0.3130844793713163, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8592, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.31402750491159137, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8376, |
|
"step": 4995 |
|
}, |
|
{ |
|
"epoch": 0.3149705304518664, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.853, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.31591355599214144, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8659, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 0.3168565815324165, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8733, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.31779960707269156, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8541, |
|
"step": 5055 |
|
}, |
|
{ |
|
"epoch": 0.3187426326129666, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8474, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.3196856581532416, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8421, |
|
"step": 5085 |
|
}, |
|
{ |
|
"epoch": 0.3206286836935167, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8501, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.32157170923379175, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8596, |
|
"step": 5115 |
|
}, |
|
{ |
|
"epoch": 0.3225147347740668, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8421, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.32345776031434187, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8732, |
|
"step": 5145 |
|
}, |
|
{ |
|
"epoch": 0.3244007858546169, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8549, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.32534381139489194, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8468, |
|
"step": 5175 |
|
}, |
|
{ |
|
"epoch": 0.32628683693516697, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8419, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.32722986247544206, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8531, |
|
"step": 5205 |
|
}, |
|
{ |
|
"epoch": 0.3281728880157171, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.001, |
|
"loss": 0.848, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.3291159135559921, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8367, |
|
"step": 5235 |
|
}, |
|
{ |
|
"epoch": 0.3300589390962672, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8405, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.33100196463654225, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8567, |
|
"step": 5265 |
|
}, |
|
{ |
|
"epoch": 0.3319449901768173, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8572, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.3328880157170923, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8505, |
|
"step": 5295 |
|
}, |
|
{ |
|
"epoch": 0.3338310412573674, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8398, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 0.33477406679764243, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8475, |
|
"step": 5325 |
|
}, |
|
{ |
|
"epoch": 0.33571709233791747, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8267, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.33666011787819256, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8442, |
|
"step": 5355 |
|
}, |
|
{ |
|
"epoch": 0.3376031434184676, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8605, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 0.3385461689587426, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8458, |
|
"step": 5385 |
|
}, |
|
{ |
|
"epoch": 0.33948919449901765, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8474, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.34043222003929274, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8507, |
|
"step": 5415 |
|
}, |
|
{ |
|
"epoch": 0.3413752455795678, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8449, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 0.3423182711198428, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8456, |
|
"step": 5445 |
|
}, |
|
{ |
|
"epoch": 0.3432612966601179, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.834, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.34420432220039293, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8382, |
|
"step": 5475 |
|
}, |
|
{ |
|
"epoch": 0.34514734774066796, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8162, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 0.346090373280943, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8331, |
|
"step": 5505 |
|
}, |
|
{ |
|
"epoch": 0.3470333988212181, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8461, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.3479764243614931, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8277, |
|
"step": 5535 |
|
}, |
|
{ |
|
"epoch": 0.34891944990176815, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8261, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.34986247544204324, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001, |
|
"loss": 0.8368, |
|
"step": 5565 |
|
}, |
|
{ |
|
"epoch": 0.3508055009823183, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.829, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 0.3517485265225933, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8356, |
|
"step": 5595 |
|
}, |
|
{ |
|
"epoch": 0.3526915520628684, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8404, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 0.35363457760314343, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8221, |
|
"step": 5625 |
|
}, |
|
{ |
|
"epoch": 0.35457760314341846, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8336, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.3555206286836935, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8118, |
|
"step": 5655 |
|
}, |
|
{ |
|
"epoch": 0.3564636542239686, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8288, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 0.3574066797642436, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8376, |
|
"step": 5685 |
|
}, |
|
{ |
|
"epoch": 0.35834970530451865, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8426, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.35929273084479374, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8437, |
|
"step": 5715 |
|
}, |
|
{ |
|
"epoch": 0.3602357563850688, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8469, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 0.3611787819253438, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8274, |
|
"step": 5745 |
|
}, |
|
{ |
|
"epoch": 0.36212180746561884, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8306, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.3630648330058939, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8315, |
|
"step": 5775 |
|
}, |
|
{ |
|
"epoch": 0.36400785854616896, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8379, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 0.364950884086444, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8342, |
|
"step": 5805 |
|
}, |
|
{ |
|
"epoch": 0.3658939096267191, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8374, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 0.3668369351669941, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8103, |
|
"step": 5835 |
|
}, |
|
{ |
|
"epoch": 0.36777996070726915, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8053, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.3687229862475442, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8248, |
|
"step": 5865 |
|
}, |
|
{ |
|
"epoch": 0.36966601178781927, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8118, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.3706090373280943, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8289, |
|
"step": 5895 |
|
}, |
|
{ |
|
"epoch": 0.37155206286836934, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8295, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 0.3724950884086444, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8158, |
|
"step": 5925 |
|
}, |
|
{ |
|
"epoch": 0.37343811394891946, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8235, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 0.3743811394891945, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8148, |
|
"step": 5955 |
|
}, |
|
{ |
|
"epoch": 0.3753241650294695, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8161, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 0.3762671905697446, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.812, |
|
"step": 5985 |
|
}, |
|
{ |
|
"epoch": 0.37721021611001965, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8154, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.3781532416502947, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8248, |
|
"step": 6015 |
|
}, |
|
{ |
|
"epoch": 0.37909626719056977, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8104, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 0.3800392927308448, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8228, |
|
"step": 6045 |
|
}, |
|
{ |
|
"epoch": 0.38098231827111984, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8392, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 0.38192534381139487, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8352, |
|
"step": 6075 |
|
}, |
|
{ |
|
"epoch": 0.38286836935166996, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8271, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 0.383811394891945, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8122, |
|
"step": 6105 |
|
}, |
|
{ |
|
"epoch": 0.38475442043222, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001, |
|
"loss": 0.8221, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 0.3856974459724951, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8354, |
|
"step": 6135 |
|
}, |
|
{ |
|
"epoch": 0.38664047151277015, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8277, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.3875834970530452, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8263, |
|
"step": 6165 |
|
}, |
|
{ |
|
"epoch": 0.3885265225933202, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8122, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 0.3894695481335953, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8296, |
|
"step": 6195 |
|
}, |
|
{ |
|
"epoch": 0.39041257367387033, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8171, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 0.39135559921414537, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8127, |
|
"step": 6225 |
|
}, |
|
{ |
|
"epoch": 0.39229862475442046, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.806, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 0.3932416502946955, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8157, |
|
"step": 6255 |
|
}, |
|
{ |
|
"epoch": 0.3941846758349705, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.826, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 0.39512770137524555, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8208, |
|
"step": 6285 |
|
}, |
|
{ |
|
"epoch": 0.39607072691552064, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8041, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.3970137524557957, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8254, |
|
"step": 6315 |
|
}, |
|
{ |
|
"epoch": 0.3979567779960707, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8332, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 0.3988998035363458, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8143, |
|
"step": 6345 |
|
}, |
|
{ |
|
"epoch": 0.39984282907662083, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8087, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 0.39984282907662083, |
|
"eval_loss": 0.9629083871841431, |
|
"eval_runtime": 9.6716, |
|
"eval_samples_per_second": 103.395, |
|
"eval_steps_per_second": 1.448, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 0.40078585461689586, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8169, |
|
"step": 6375 |
|
}, |
|
{ |
|
"epoch": 0.4017288801571709, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8229, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 0.402671905697446, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8108, |
|
"step": 6405 |
|
}, |
|
{ |
|
"epoch": 0.403614931237721, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.814, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 0.40455795677799605, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8077, |
|
"step": 6435 |
|
}, |
|
{ |
|
"epoch": 0.40550098231827114, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8103, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.4064440078585462, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7904, |
|
"step": 6465 |
|
}, |
|
{ |
|
"epoch": 0.4073870333988212, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8006, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 0.40833005893909624, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001, |
|
"loss": 0.8112, |
|
"step": 6495 |
|
}, |
|
{ |
|
"epoch": 0.40927308447937133, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7984, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 0.41021611001964636, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7883, |
|
"step": 6525 |
|
}, |
|
{ |
|
"epoch": 0.4111591355599214, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8196, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 0.4121021611001965, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8274, |
|
"step": 6555 |
|
}, |
|
{ |
|
"epoch": 0.4130451866404715, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7942, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 0.41398821218074655, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7965, |
|
"step": 6585 |
|
}, |
|
{ |
|
"epoch": 0.4149312377210216, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7944, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.4158742632612967, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8055, |
|
"step": 6615 |
|
}, |
|
{ |
|
"epoch": 0.4168172888015717, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8083, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 0.41776031434184674, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8151, |
|
"step": 6645 |
|
}, |
|
{ |
|
"epoch": 0.4187033398821218, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8093, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 0.41964636542239686, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.001, |
|
"loss": 0.807, |
|
"step": 6675 |
|
}, |
|
{ |
|
"epoch": 0.4205893909626719, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7884, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 0.4215324165029469, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7958, |
|
"step": 6705 |
|
}, |
|
{ |
|
"epoch": 0.422475442043222, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8029, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 0.42341846758349705, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.804, |
|
"step": 6735 |
|
}, |
|
{ |
|
"epoch": 0.4243614931237721, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8235, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.42530451866404717, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8105, |
|
"step": 6765 |
|
}, |
|
{ |
|
"epoch": 0.4262475442043222, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8028, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 0.42719056974459724, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8017, |
|
"step": 6795 |
|
}, |
|
{ |
|
"epoch": 0.4281335952848723, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7998, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 0.42907662082514736, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8083, |
|
"step": 6825 |
|
}, |
|
{ |
|
"epoch": 0.4300196463654224, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7701, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 0.4309626719056974, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7922, |
|
"step": 6855 |
|
}, |
|
{ |
|
"epoch": 0.4319056974459725, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7971, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 0.43284872298624755, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.795, |
|
"step": 6885 |
|
}, |
|
{ |
|
"epoch": 0.4337917485265226, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8004, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.43473477406679767, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7965, |
|
"step": 6915 |
|
}, |
|
{ |
|
"epoch": 0.4356777996070727, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7937, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 0.43662082514734774, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8007, |
|
"step": 6945 |
|
}, |
|
{ |
|
"epoch": 0.43756385068762277, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7935, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 0.43850687622789786, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8045, |
|
"step": 6975 |
|
}, |
|
{ |
|
"epoch": 0.4394499017681729, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8055, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 0.4403929273084479, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8005, |
|
"step": 7005 |
|
}, |
|
{ |
|
"epoch": 0.441335952848723, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7881, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 0.44227897838899805, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.001, |
|
"loss": 0.8212, |
|
"step": 7035 |
|
}, |
|
{ |
|
"epoch": 0.4432220039292731, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7984, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.4441650294695481, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.8078, |
|
"step": 7065 |
|
}, |
|
{ |
|
"epoch": 0.4451080550098232, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7773, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 0.44605108055009823, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7884, |
|
"step": 7095 |
|
}, |
|
{ |
|
"epoch": 0.44699410609037327, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7842, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 0.44793713163064836, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7854, |
|
"step": 7125 |
|
}, |
|
{ |
|
"epoch": 0.4488801571709234, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7913, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 0.4498231827111984, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7944, |
|
"step": 7155 |
|
}, |
|
{ |
|
"epoch": 0.45076620825147345, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7935, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 0.45170923379174854, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7915, |
|
"step": 7185 |
|
}, |
|
{ |
|
"epoch": 0.4526522593320236, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7893, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.4535952848722986, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7749, |
|
"step": 7215 |
|
}, |
|
{ |
|
"epoch": 0.4545383104125737, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7738, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 0.45548133595284873, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7832, |
|
"step": 7245 |
|
}, |
|
{ |
|
"epoch": 0.45642436149312376, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7935, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 0.4573673870333988, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7969, |
|
"step": 7275 |
|
}, |
|
{ |
|
"epoch": 0.4583104125736739, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7891, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 0.4592534381139489, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7854, |
|
"step": 7305 |
|
}, |
|
{ |
|
"epoch": 0.46019646365422395, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8013, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 0.46113948919449904, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7864, |
|
"step": 7335 |
|
}, |
|
{ |
|
"epoch": 0.4620825147347741, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7932, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.4630255402750491, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7866, |
|
"step": 7365 |
|
}, |
|
{ |
|
"epoch": 0.46396856581532414, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8011, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 0.46491159135559923, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7743, |
|
"step": 7395 |
|
}, |
|
{ |
|
"epoch": 0.46585461689587426, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7784, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 0.4667976424361493, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7953, |
|
"step": 7425 |
|
}, |
|
{ |
|
"epoch": 0.4677406679764244, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7807, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 0.4686836935166994, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7713, |
|
"step": 7455 |
|
}, |
|
{ |
|
"epoch": 0.46962671905697445, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7636, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 0.4705697445972495, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7773, |
|
"step": 7485 |
|
}, |
|
{ |
|
"epoch": 0.4715127701375246, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8002, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.4724557956777996, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7799, |
|
"step": 7515 |
|
}, |
|
{ |
|
"epoch": 0.47339882121807464, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7776, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 0.47434184675834973, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7823, |
|
"step": 7545 |
|
}, |
|
{ |
|
"epoch": 0.47528487229862476, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8059, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 0.4762278978388998, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7908, |
|
"step": 7575 |
|
}, |
|
{ |
|
"epoch": 0.4771709233791748, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7923, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 0.4781139489194499, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.778, |
|
"step": 7605 |
|
}, |
|
{ |
|
"epoch": 0.47905697445972495, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.8007, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7842, |
|
"step": 7635 |
|
}, |
|
{ |
|
"epoch": 0.48094302554027507, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7968, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.4818860510805501, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7812, |
|
"step": 7665 |
|
}, |
|
{ |
|
"epoch": 0.48282907662082514, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7832, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 0.48377210216110017, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7915, |
|
"step": 7695 |
|
}, |
|
{ |
|
"epoch": 0.48471512770137526, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.8046, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 0.4856581532416503, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7674, |
|
"step": 7725 |
|
}, |
|
{ |
|
"epoch": 0.4866011787819253, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7795, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 0.4875442043222004, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7983, |
|
"step": 7755 |
|
}, |
|
{ |
|
"epoch": 0.48848722986247545, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7897, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 0.4894302554027505, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.001, |
|
"loss": 0.772, |
|
"step": 7785 |
|
}, |
|
{ |
|
"epoch": 0.4903732809430255, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7795, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.4913163064833006, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7739, |
|
"step": 7815 |
|
}, |
|
{ |
|
"epoch": 0.49225933202357564, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7891, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 0.49320235756385067, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7802, |
|
"step": 7845 |
|
}, |
|
{ |
|
"epoch": 0.49414538310412576, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7843, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 0.4950884086444008, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7756, |
|
"step": 7875 |
|
}, |
|
{ |
|
"epoch": 0.4960314341846758, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.001, |
|
"loss": 0.77, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 0.49697445972495086, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7633, |
|
"step": 7905 |
|
}, |
|
{ |
|
"epoch": 0.49791748526522595, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7842, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 0.498860510805501, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7742, |
|
"step": 7935 |
|
}, |
|
{ |
|
"epoch": 0.499803536345776, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7608, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.499803536345776, |
|
"eval_loss": 0.9156466126441956, |
|
"eval_runtime": 9.6921, |
|
"eval_samples_per_second": 103.176, |
|
"eval_steps_per_second": 1.444, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.500746561886051, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7861, |
|
"step": 7965 |
|
}, |
|
{ |
|
"epoch": 0.5016895874263261, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7726, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 0.5026326129666012, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001, |
|
"loss": 0.7749, |
|
"step": 7995 |
|
}, |
|
{ |
|
"epoch": 0.5035756385068763, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7686, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 0.5045186640471513, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7797, |
|
"step": 8025 |
|
}, |
|
{ |
|
"epoch": 0.5054616895874263, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7622, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 0.5064047151277014, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7753, |
|
"step": 8055 |
|
}, |
|
{ |
|
"epoch": 0.5073477406679764, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7744, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 0.5082907662082514, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001, |
|
"loss": 0.7659, |
|
"step": 8085 |
|
}, |
|
{ |
|
"epoch": 0.5092337917485266, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7883, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.5101768172888016, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7809, |
|
"step": 8115 |
|
}, |
|
{ |
|
"epoch": 0.5111198428290766, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7701, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 0.5120628683693517, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7659, |
|
"step": 8145 |
|
}, |
|
{ |
|
"epoch": 0.5130058939096267, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7772, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 0.5139489194499017, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7769, |
|
"step": 8175 |
|
}, |
|
{ |
|
"epoch": 0.5148919449901768, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7706, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 0.5158349705304519, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7645, |
|
"step": 8205 |
|
}, |
|
{ |
|
"epoch": 0.5167779960707269, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7724, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 0.517721021611002, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7651, |
|
"step": 8235 |
|
}, |
|
{ |
|
"epoch": 0.518664047151277, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7703, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.519607072691552, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7709, |
|
"step": 8265 |
|
}, |
|
{ |
|
"epoch": 0.5205500982318271, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7759, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 0.5214931237721021, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7687, |
|
"step": 8295 |
|
}, |
|
{ |
|
"epoch": 0.5224361493123773, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7735, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 0.5233791748526523, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7653, |
|
"step": 8325 |
|
}, |
|
{ |
|
"epoch": 0.5243222003929273, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.766, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 0.5252652259332024, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.001, |
|
"loss": 0.768, |
|
"step": 8355 |
|
}, |
|
{ |
|
"epoch": 0.5262082514734774, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7651, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 0.5271512770137524, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.77, |
|
"step": 8385 |
|
}, |
|
{ |
|
"epoch": 0.5280943025540275, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7671, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.5290373280943026, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7568, |
|
"step": 8415 |
|
}, |
|
{ |
|
"epoch": 0.5299803536345776, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7719, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 0.5309233791748527, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7765, |
|
"step": 8445 |
|
}, |
|
{ |
|
"epoch": 0.5318664047151277, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7713, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 0.5328094302554027, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7779, |
|
"step": 8475 |
|
}, |
|
{ |
|
"epoch": 0.5337524557956778, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7675, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 0.5346954813359528, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7652, |
|
"step": 8505 |
|
}, |
|
{ |
|
"epoch": 0.5356385068762279, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7692, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 0.536581532416503, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7781, |
|
"step": 8535 |
|
}, |
|
{ |
|
"epoch": 0.537524557956778, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.001, |
|
"loss": 0.765, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 0.538467583497053, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7549, |
|
"step": 8565 |
|
}, |
|
{ |
|
"epoch": 0.5394106090373281, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7709, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 0.5403536345776031, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7739, |
|
"step": 8595 |
|
}, |
|
{ |
|
"epoch": 0.5412966601178782, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.769, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 0.5422396856581533, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7737, |
|
"step": 8625 |
|
}, |
|
{ |
|
"epoch": 0.5431827111984283, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7638, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 0.5441257367387033, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7392, |
|
"step": 8655 |
|
}, |
|
{ |
|
"epoch": 0.5450687622789784, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7566, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 0.5460117878192534, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7592, |
|
"step": 8685 |
|
}, |
|
{ |
|
"epoch": 0.5469548133595284, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7485, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.5478978388998036, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7678, |
|
"step": 8715 |
|
}, |
|
{ |
|
"epoch": 0.5488408644400786, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7634, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 0.5497838899803537, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7471, |
|
"step": 8745 |
|
}, |
|
{ |
|
"epoch": 0.5507269155206287, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7561, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 0.5516699410609037, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7622, |
|
"step": 8775 |
|
}, |
|
{ |
|
"epoch": 0.5526129666011788, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7701, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 0.5535559921414538, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7728, |
|
"step": 8805 |
|
}, |
|
{ |
|
"epoch": 0.5544990176817289, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7813, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 0.555442043222004, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7614, |
|
"step": 8835 |
|
}, |
|
{ |
|
"epoch": 0.556385068762279, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7766, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 0.557328094302554, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7735, |
|
"step": 8865 |
|
}, |
|
{ |
|
"epoch": 0.5582711198428291, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7641, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 0.5592141453831041, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7798, |
|
"step": 8895 |
|
}, |
|
{ |
|
"epoch": 0.5601571709233791, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7471, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 0.5611001964636543, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7625, |
|
"step": 8925 |
|
}, |
|
{ |
|
"epoch": 0.5620432220039293, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7631, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 0.5629862475442043, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7679, |
|
"step": 8955 |
|
}, |
|
{ |
|
"epoch": 0.5639292730844794, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7647, |
|
"step": 8970 |
|
}, |
|
{ |
|
"epoch": 0.5648722986247544, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7674, |
|
"step": 8985 |
|
}, |
|
{ |
|
"epoch": 0.5658153241650294, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7735, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.5667583497053045, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001, |
|
"loss": 0.7826, |
|
"step": 9015 |
|
}, |
|
{ |
|
"epoch": 0.5677013752455796, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.001, |
|
"loss": 0.764, |
|
"step": 9030 |
|
}, |
|
{ |
|
"epoch": 0.5686444007858547, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7535, |
|
"step": 9045 |
|
}, |
|
{ |
|
"epoch": 0.5695874263261297, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7588, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 0.5705304518664047, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7622, |
|
"step": 9075 |
|
}, |
|
{ |
|
"epoch": 0.5714734774066798, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7514, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 0.5724165029469548, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7593, |
|
"step": 9105 |
|
}, |
|
{ |
|
"epoch": 0.5733595284872298, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7677, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 0.574302554027505, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7539, |
|
"step": 9135 |
|
}, |
|
{ |
|
"epoch": 0.57524557956778, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7475, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 0.576188605108055, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.741, |
|
"step": 9165 |
|
}, |
|
{ |
|
"epoch": 0.5771316306483301, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7533, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 0.5780746561886051, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.765, |
|
"step": 9195 |
|
}, |
|
{ |
|
"epoch": 0.5790176817288801, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7741, |
|
"step": 9210 |
|
}, |
|
{ |
|
"epoch": 0.5799607072691552, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7598, |
|
"step": 9225 |
|
}, |
|
{ |
|
"epoch": 0.5809037328094303, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7539, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 0.5818467583497053, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7455, |
|
"step": 9255 |
|
}, |
|
{ |
|
"epoch": 0.5827897838899804, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7506, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 0.5837328094302554, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7555, |
|
"step": 9285 |
|
}, |
|
{ |
|
"epoch": 0.5846758349705304, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7635, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.5856188605108055, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7351, |
|
"step": 9315 |
|
}, |
|
{ |
|
"epoch": 0.5865618860510805, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7341, |
|
"step": 9330 |
|
}, |
|
{ |
|
"epoch": 0.5875049115913556, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7525, |
|
"step": 9345 |
|
}, |
|
{ |
|
"epoch": 0.5884479371316307, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7575, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 0.5893909626719057, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7608, |
|
"step": 9375 |
|
}, |
|
{ |
|
"epoch": 0.5903339882121807, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7602, |
|
"step": 9390 |
|
}, |
|
{ |
|
"epoch": 0.5912770137524558, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7615, |
|
"step": 9405 |
|
}, |
|
{ |
|
"epoch": 0.5922200392927308, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.001, |
|
"loss": 0.762, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 0.5931630648330058, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7635, |
|
"step": 9435 |
|
}, |
|
{ |
|
"epoch": 0.594106090373281, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7556, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 0.595049115913556, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7497, |
|
"step": 9465 |
|
}, |
|
{ |
|
"epoch": 0.5959921414538311, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7419, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 0.5969351669941061, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7562, |
|
"step": 9495 |
|
}, |
|
{ |
|
"epoch": 0.5978781925343811, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7468, |
|
"step": 9510 |
|
}, |
|
{ |
|
"epoch": 0.5988212180746562, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7499, |
|
"step": 9525 |
|
}, |
|
{ |
|
"epoch": 0.5997642436149312, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7683, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 0.5997642436149312, |
|
"eval_loss": 0.8865543603897095, |
|
"eval_runtime": 9.6786, |
|
"eval_samples_per_second": 103.32, |
|
"eval_steps_per_second": 1.446, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 0.6007072691552063, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7574, |
|
"step": 9555 |
|
}, |
|
{ |
|
"epoch": 0.6016502946954814, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7518, |
|
"step": 9570 |
|
}, |
|
{ |
|
"epoch": 0.6025933202357564, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7391, |
|
"step": 9585 |
|
}, |
|
{ |
|
"epoch": 0.6035363457760314, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7425, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.6044793713163065, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7606, |
|
"step": 9615 |
|
}, |
|
{ |
|
"epoch": 0.6054223968565815, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7292, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 0.6063654223968565, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7356, |
|
"step": 9645 |
|
}, |
|
{ |
|
"epoch": 0.6073084479371317, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7513, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 0.6082514734774067, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.001, |
|
"loss": 0.7522, |
|
"step": 9675 |
|
}, |
|
{ |
|
"epoch": 0.6091944990176817, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7563, |
|
"step": 9690 |
|
}, |
|
{ |
|
"epoch": 0.6101375245579568, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7473, |
|
"step": 9705 |
|
}, |
|
{ |
|
"epoch": 0.6110805500982318, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.76, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 0.6120235756385068, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7473, |
|
"step": 9735 |
|
}, |
|
{ |
|
"epoch": 0.6129666011787819, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7416, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.613909626719057, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7449, |
|
"step": 9765 |
|
}, |
|
{ |
|
"epoch": 0.614852652259332, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7509, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 0.6157956777996071, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7468, |
|
"step": 9795 |
|
}, |
|
{ |
|
"epoch": 0.6167387033398821, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7632, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 0.6176817288801572, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7586, |
|
"step": 9825 |
|
}, |
|
{ |
|
"epoch": 0.6186247544204322, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7495, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 0.6195677799607072, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7548, |
|
"step": 9855 |
|
}, |
|
{ |
|
"epoch": 0.6205108055009824, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7484, |
|
"step": 9870 |
|
}, |
|
{ |
|
"epoch": 0.6214538310412574, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7683, |
|
"step": 9885 |
|
}, |
|
{ |
|
"epoch": 0.6223968565815324, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7332, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.6233398821218075, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.743, |
|
"step": 9915 |
|
}, |
|
{ |
|
"epoch": 0.6242829076620825, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7527, |
|
"step": 9930 |
|
}, |
|
{ |
|
"epoch": 0.6252259332023575, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7407, |
|
"step": 9945 |
|
}, |
|
{ |
|
"epoch": 0.6261689587426326, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.756, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 0.6271119842829077, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7505, |
|
"step": 9975 |
|
}, |
|
{ |
|
"epoch": 0.6280550098231827, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7517, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 0.6289980353634578, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.766, |
|
"step": 10005 |
|
}, |
|
{ |
|
"epoch": 0.6299410609037328, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7385, |
|
"step": 10020 |
|
}, |
|
{ |
|
"epoch": 0.6308840864440078, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7565, |
|
"step": 10035 |
|
}, |
|
{ |
|
"epoch": 0.6318271119842829, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7508, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 0.6327701375245579, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7519, |
|
"step": 10065 |
|
}, |
|
{ |
|
"epoch": 0.633713163064833, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.001, |
|
"loss": 0.76, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 0.6346561886051081, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7326, |
|
"step": 10095 |
|
}, |
|
{ |
|
"epoch": 0.6355992141453831, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7506, |
|
"step": 10110 |
|
}, |
|
{ |
|
"epoch": 0.6365422396856582, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7419, |
|
"step": 10125 |
|
}, |
|
{ |
|
"epoch": 0.6374852652259332, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7309, |
|
"step": 10140 |
|
}, |
|
{ |
|
"epoch": 0.6384282907662082, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7367, |
|
"step": 10155 |
|
}, |
|
{ |
|
"epoch": 0.6393713163064833, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7472, |
|
"step": 10170 |
|
}, |
|
{ |
|
"epoch": 0.6403143418467584, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7431, |
|
"step": 10185 |
|
}, |
|
{ |
|
"epoch": 0.6412573673870334, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7496, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.6422003929273085, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001, |
|
"loss": 0.741, |
|
"step": 10215 |
|
}, |
|
{ |
|
"epoch": 0.6431434184675835, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7548, |
|
"step": 10230 |
|
}, |
|
{ |
|
"epoch": 0.6440864440078585, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7615, |
|
"step": 10245 |
|
}, |
|
{ |
|
"epoch": 0.6450294695481336, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.001, |
|
"loss": 0.764, |
|
"step": 10260 |
|
}, |
|
{ |
|
"epoch": 0.6459724950884086, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7467, |
|
"step": 10275 |
|
}, |
|
{ |
|
"epoch": 0.6469155206286837, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.752, |
|
"step": 10290 |
|
}, |
|
{ |
|
"epoch": 0.6478585461689588, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7238, |
|
"step": 10305 |
|
}, |
|
{ |
|
"epoch": 0.6488015717092338, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7464, |
|
"step": 10320 |
|
}, |
|
{ |
|
"epoch": 0.6497445972495088, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7376, |
|
"step": 10335 |
|
}, |
|
{ |
|
"epoch": 0.6506876227897839, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7378, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 0.6516306483300589, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7536, |
|
"step": 10365 |
|
}, |
|
{ |
|
"epoch": 0.6525736738703339, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.001, |
|
"loss": 0.732, |
|
"step": 10380 |
|
}, |
|
{ |
|
"epoch": 0.6535166994106091, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7554, |
|
"step": 10395 |
|
}, |
|
{ |
|
"epoch": 0.6544597249508841, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7348, |
|
"step": 10410 |
|
}, |
|
{ |
|
"epoch": 0.6554027504911591, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7446, |
|
"step": 10425 |
|
}, |
|
{ |
|
"epoch": 0.6563457760314342, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7386, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 0.6572888015717092, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7456, |
|
"step": 10455 |
|
}, |
|
{ |
|
"epoch": 0.6582318271119842, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.001, |
|
"loss": 0.7447, |
|
"step": 10470 |
|
}, |
|
{ |
|
"epoch": 0.6591748526522593, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7466, |
|
"step": 10485 |
|
}, |
|
{ |
|
"epoch": 0.6601178781925344, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7638, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.6610609037328095, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7454, |
|
"step": 10515 |
|
}, |
|
{ |
|
"epoch": 0.6620039292730845, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.001, |
|
"loss": 0.738, |
|
"step": 10530 |
|
}, |
|
{ |
|
"epoch": 0.6629469548133595, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7443, |
|
"step": 10545 |
|
}, |
|
{ |
|
"epoch": 0.6638899803536346, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7433, |
|
"step": 10560 |
|
}, |
|
{ |
|
"epoch": 0.6648330058939096, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7328, |
|
"step": 10575 |
|
}, |
|
{ |
|
"epoch": 0.6657760314341846, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7419, |
|
"step": 10590 |
|
}, |
|
{ |
|
"epoch": 0.6667190569744598, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7387, |
|
"step": 10605 |
|
}, |
|
{ |
|
"epoch": 0.6676620825147348, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7325, |
|
"step": 10620 |
|
}, |
|
{ |
|
"epoch": 0.6686051080550098, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001, |
|
"loss": 0.737, |
|
"step": 10635 |
|
}, |
|
{ |
|
"epoch": 0.6695481335952849, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7447, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 0.6704911591355599, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7332, |
|
"step": 10665 |
|
}, |
|
{ |
|
"epoch": 0.6714341846758349, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7459, |
|
"step": 10680 |
|
}, |
|
{ |
|
"epoch": 0.67237721021611, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7389, |
|
"step": 10695 |
|
}, |
|
{ |
|
"epoch": 0.6733202357563851, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7362, |
|
"step": 10710 |
|
}, |
|
{ |
|
"epoch": 0.6742632612966601, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7297, |
|
"step": 10725 |
|
}, |
|
{ |
|
"epoch": 0.6752062868369352, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7506, |
|
"step": 10740 |
|
}, |
|
{ |
|
"epoch": 0.6761493123772102, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7279, |
|
"step": 10755 |
|
}, |
|
{ |
|
"epoch": 0.6770923379174852, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7329, |
|
"step": 10770 |
|
}, |
|
{ |
|
"epoch": 0.6780353634577603, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.736, |
|
"step": 10785 |
|
}, |
|
{ |
|
"epoch": 0.6789783889980353, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7168, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.6799214145383105, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7394, |
|
"step": 10815 |
|
}, |
|
{ |
|
"epoch": 0.6808644400785855, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7165, |
|
"step": 10830 |
|
}, |
|
{ |
|
"epoch": 0.6818074656188605, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7249, |
|
"step": 10845 |
|
}, |
|
{ |
|
"epoch": 0.6827504911591356, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.732, |
|
"step": 10860 |
|
}, |
|
{ |
|
"epoch": 0.6836935166994106, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.747, |
|
"step": 10875 |
|
}, |
|
{ |
|
"epoch": 0.6846365422396856, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7268, |
|
"step": 10890 |
|
}, |
|
{ |
|
"epoch": 0.6855795677799607, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7334, |
|
"step": 10905 |
|
}, |
|
{ |
|
"epoch": 0.6865225933202358, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7243, |
|
"step": 10920 |
|
}, |
|
{ |
|
"epoch": 0.6874656188605108, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7402, |
|
"step": 10935 |
|
}, |
|
{ |
|
"epoch": 0.6884086444007859, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001, |
|
"loss": 0.738, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 0.6893516699410609, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7309, |
|
"step": 10965 |
|
}, |
|
{ |
|
"epoch": 0.6902946954813359, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.001, |
|
"loss": 0.7551, |
|
"step": 10980 |
|
}, |
|
{ |
|
"epoch": 0.691237721021611, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7438, |
|
"step": 10995 |
|
}, |
|
{ |
|
"epoch": 0.692180746561886, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7353, |
|
"step": 11010 |
|
}, |
|
{ |
|
"epoch": 0.6931237721021611, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.001, |
|
"loss": 0.728, |
|
"step": 11025 |
|
}, |
|
{ |
|
"epoch": 0.6940667976424362, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7366, |
|
"step": 11040 |
|
}, |
|
{ |
|
"epoch": 0.6950098231827112, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7424, |
|
"step": 11055 |
|
}, |
|
{ |
|
"epoch": 0.6959528487229862, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7434, |
|
"step": 11070 |
|
}, |
|
{ |
|
"epoch": 0.6968958742632613, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7371, |
|
"step": 11085 |
|
}, |
|
{ |
|
"epoch": 0.6978388998035363, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7326, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.6987819253438114, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7272, |
|
"step": 11115 |
|
}, |
|
{ |
|
"epoch": 0.6997249508840865, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.738, |
|
"step": 11130 |
|
}, |
|
{ |
|
"epoch": 0.6997249508840865, |
|
"eval_loss": 0.8602269291877747, |
|
"eval_runtime": 9.6753, |
|
"eval_samples_per_second": 103.356, |
|
"eval_steps_per_second": 1.447, |
|
"step": 11130 |
|
}, |
|
{ |
|
"epoch": 0.7006679764243615, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7375, |
|
"step": 11145 |
|
}, |
|
{ |
|
"epoch": 0.7016110019646365, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.001, |
|
"loss": 0.7545, |
|
"step": 11160 |
|
}, |
|
{ |
|
"epoch": 0.7025540275049116, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7482, |
|
"step": 11175 |
|
}, |
|
{ |
|
"epoch": 0.7034970530451866, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7274, |
|
"step": 11190 |
|
}, |
|
{ |
|
"epoch": 0.7044400785854616, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7241, |
|
"step": 11205 |
|
}, |
|
{ |
|
"epoch": 0.7053831041257368, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7303, |
|
"step": 11220 |
|
}, |
|
{ |
|
"epoch": 0.7063261296660118, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7267, |
|
"step": 11235 |
|
}, |
|
{ |
|
"epoch": 0.7072691552062869, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7267, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 0.7082121807465619, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7309, |
|
"step": 11265 |
|
}, |
|
{ |
|
"epoch": 0.7091552062868369, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7377, |
|
"step": 11280 |
|
}, |
|
{ |
|
"epoch": 0.710098231827112, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7306, |
|
"step": 11295 |
|
}, |
|
{ |
|
"epoch": 0.711041257367387, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.001, |
|
"loss": 0.7341, |
|
"step": 11310 |
|
}, |
|
{ |
|
"epoch": 0.7119842829076621, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7349, |
|
"step": 11325 |
|
}, |
|
{ |
|
"epoch": 0.7129273084479372, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7407, |
|
"step": 11340 |
|
}, |
|
{ |
|
"epoch": 0.7138703339882122, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7358, |
|
"step": 11355 |
|
}, |
|
{ |
|
"epoch": 0.7148133595284872, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7254, |
|
"step": 11370 |
|
}, |
|
{ |
|
"epoch": 0.7157563850687623, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7328, |
|
"step": 11385 |
|
}, |
|
{ |
|
"epoch": 0.7166994106090373, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7304, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.7176424361493123, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7317, |
|
"step": 11415 |
|
}, |
|
{ |
|
"epoch": 0.7185854616895875, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.732, |
|
"step": 11430 |
|
}, |
|
{ |
|
"epoch": 0.7195284872298625, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7433, |
|
"step": 11445 |
|
}, |
|
{ |
|
"epoch": 0.7204715127701375, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7415, |
|
"step": 11460 |
|
}, |
|
{ |
|
"epoch": 0.7214145383104126, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7285, |
|
"step": 11475 |
|
}, |
|
{ |
|
"epoch": 0.7223575638506876, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7299, |
|
"step": 11490 |
|
}, |
|
{ |
|
"epoch": 0.7233005893909626, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7314, |
|
"step": 11505 |
|
}, |
|
{ |
|
"epoch": 0.7242436149312377, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7413, |
|
"step": 11520 |
|
}, |
|
{ |
|
"epoch": 0.7251866404715128, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7266, |
|
"step": 11535 |
|
}, |
|
{ |
|
"epoch": 0.7261296660117879, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7104, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 0.7270726915520629, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7252, |
|
"step": 11565 |
|
}, |
|
{ |
|
"epoch": 0.7280157170923379, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.726, |
|
"step": 11580 |
|
}, |
|
{ |
|
"epoch": 0.728958742632613, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7242, |
|
"step": 11595 |
|
}, |
|
{ |
|
"epoch": 0.729901768172888, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7313, |
|
"step": 11610 |
|
}, |
|
{ |
|
"epoch": 0.730844793713163, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7379, |
|
"step": 11625 |
|
}, |
|
{ |
|
"epoch": 0.7317878192534382, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7394, |
|
"step": 11640 |
|
}, |
|
{ |
|
"epoch": 0.7327308447937132, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7332, |
|
"step": 11655 |
|
}, |
|
{ |
|
"epoch": 0.7336738703339882, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7154, |
|
"step": 11670 |
|
}, |
|
{ |
|
"epoch": 0.7346168958742633, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7351, |
|
"step": 11685 |
|
}, |
|
{ |
|
"epoch": 0.7355599214145383, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7375, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.7365029469548133, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7363, |
|
"step": 11715 |
|
}, |
|
{ |
|
"epoch": 0.7374459724950884, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7301, |
|
"step": 11730 |
|
}, |
|
{ |
|
"epoch": 0.7383889980353635, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7287, |
|
"step": 11745 |
|
}, |
|
{ |
|
"epoch": 0.7393320235756385, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7237, |
|
"step": 11760 |
|
}, |
|
{ |
|
"epoch": 0.7402750491159136, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7242, |
|
"step": 11775 |
|
}, |
|
{ |
|
"epoch": 0.7412180746561886, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7242, |
|
"step": 11790 |
|
}, |
|
{ |
|
"epoch": 0.7421611001964636, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7171, |
|
"step": 11805 |
|
}, |
|
{ |
|
"epoch": 0.7431041257367387, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7191, |
|
"step": 11820 |
|
}, |
|
{ |
|
"epoch": 0.7440471512770137, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7323, |
|
"step": 11835 |
|
}, |
|
{ |
|
"epoch": 0.7449901768172889, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7139, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 0.7459332023575639, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7237, |
|
"step": 11865 |
|
}, |
|
{ |
|
"epoch": 0.7468762278978389, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7136, |
|
"step": 11880 |
|
}, |
|
{ |
|
"epoch": 0.747819253438114, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.001, |
|
"loss": 0.7375, |
|
"step": 11895 |
|
}, |
|
{ |
|
"epoch": 0.748762278978389, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7236, |
|
"step": 11910 |
|
}, |
|
{ |
|
"epoch": 0.749705304518664, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7416, |
|
"step": 11925 |
|
}, |
|
{ |
|
"epoch": 0.750648330058939, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7376, |
|
"step": 11940 |
|
}, |
|
{ |
|
"epoch": 0.7515913555992142, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7293, |
|
"step": 11955 |
|
}, |
|
{ |
|
"epoch": 0.7525343811394892, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001, |
|
"loss": 0.7274, |
|
"step": 11970 |
|
}, |
|
{ |
|
"epoch": 0.7534774066797643, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7251, |
|
"step": 11985 |
|
}, |
|
{ |
|
"epoch": 0.7544204322200393, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7221, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.7553634577603143, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7269, |
|
"step": 12015 |
|
}, |
|
{ |
|
"epoch": 0.7563064833005894, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7229, |
|
"step": 12030 |
|
}, |
|
{ |
|
"epoch": 0.7572495088408644, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7332, |
|
"step": 12045 |
|
}, |
|
{ |
|
"epoch": 0.7581925343811395, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7425, |
|
"step": 12060 |
|
}, |
|
{ |
|
"epoch": 0.7591355599214146, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7084, |
|
"step": 12075 |
|
}, |
|
{ |
|
"epoch": 0.7600785854616896, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7212, |
|
"step": 12090 |
|
}, |
|
{ |
|
"epoch": 0.7610216110019646, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7221, |
|
"step": 12105 |
|
}, |
|
{ |
|
"epoch": 0.7619646365422397, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7125, |
|
"step": 12120 |
|
}, |
|
{ |
|
"epoch": 0.7629076620825147, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7214, |
|
"step": 12135 |
|
}, |
|
{ |
|
"epoch": 0.7638506876227897, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7211, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 0.7647937131630649, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7126, |
|
"step": 12165 |
|
}, |
|
{ |
|
"epoch": 0.7657367387033399, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.726, |
|
"step": 12180 |
|
}, |
|
{ |
|
"epoch": 0.766679764243615, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7079, |
|
"step": 12195 |
|
}, |
|
{ |
|
"epoch": 0.76762278978389, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7282, |
|
"step": 12210 |
|
}, |
|
{ |
|
"epoch": 0.768565815324165, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7293, |
|
"step": 12225 |
|
}, |
|
{ |
|
"epoch": 0.76950884086444, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7245, |
|
"step": 12240 |
|
}, |
|
{ |
|
"epoch": 0.7704518664047151, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7263, |
|
"step": 12255 |
|
}, |
|
{ |
|
"epoch": 0.7713948919449902, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7483, |
|
"step": 12270 |
|
}, |
|
{ |
|
"epoch": 0.7723379174852653, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7243, |
|
"step": 12285 |
|
}, |
|
{ |
|
"epoch": 0.7732809430255403, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.72, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.7742239685658153, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7145, |
|
"step": 12315 |
|
}, |
|
{ |
|
"epoch": 0.7751669941060904, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7264, |
|
"step": 12330 |
|
}, |
|
{ |
|
"epoch": 0.7761100196463654, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7233, |
|
"step": 12345 |
|
}, |
|
{ |
|
"epoch": 0.7770530451866404, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7132, |
|
"step": 12360 |
|
}, |
|
{ |
|
"epoch": 0.7779960707269156, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7218, |
|
"step": 12375 |
|
}, |
|
{ |
|
"epoch": 0.7789390962671906, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7229, |
|
"step": 12390 |
|
}, |
|
{ |
|
"epoch": 0.7798821218074656, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7244, |
|
"step": 12405 |
|
}, |
|
{ |
|
"epoch": 0.7808251473477407, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7133, |
|
"step": 12420 |
|
}, |
|
{ |
|
"epoch": 0.7817681728880157, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7165, |
|
"step": 12435 |
|
}, |
|
{ |
|
"epoch": 0.7827111984282907, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7125, |
|
"step": 12450 |
|
}, |
|
{ |
|
"epoch": 0.7836542239685658, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7025, |
|
"step": 12465 |
|
}, |
|
{ |
|
"epoch": 0.7845972495088409, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7143, |
|
"step": 12480 |
|
}, |
|
{ |
|
"epoch": 0.7855402750491159, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7217, |
|
"step": 12495 |
|
}, |
|
{ |
|
"epoch": 0.786483300589391, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7194, |
|
"step": 12510 |
|
}, |
|
{ |
|
"epoch": 0.787426326129666, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7117, |
|
"step": 12525 |
|
}, |
|
{ |
|
"epoch": 0.788369351669941, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7125, |
|
"step": 12540 |
|
}, |
|
{ |
|
"epoch": 0.7893123772102161, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7107, |
|
"step": 12555 |
|
}, |
|
{ |
|
"epoch": 0.7902554027504911, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7392, |
|
"step": 12570 |
|
}, |
|
{ |
|
"epoch": 0.7911984282907663, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7211, |
|
"step": 12585 |
|
}, |
|
{ |
|
"epoch": 0.7921414538310413, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7139, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.7930844793713163, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.721, |
|
"step": 12615 |
|
}, |
|
{ |
|
"epoch": 0.7940275049115914, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7258, |
|
"step": 12630 |
|
}, |
|
{ |
|
"epoch": 0.7949705304518664, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7079, |
|
"step": 12645 |
|
}, |
|
{ |
|
"epoch": 0.7959135559921414, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.001, |
|
"loss": 0.712, |
|
"step": 12660 |
|
}, |
|
{ |
|
"epoch": 0.7968565815324165, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7296, |
|
"step": 12675 |
|
}, |
|
{ |
|
"epoch": 0.7977996070726916, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7146, |
|
"step": 12690 |
|
}, |
|
{ |
|
"epoch": 0.7987426326129666, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7202, |
|
"step": 12705 |
|
}, |
|
{ |
|
"epoch": 0.7996856581532417, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7257, |
|
"step": 12720 |
|
}, |
|
{ |
|
"epoch": 0.7996856581532417, |
|
"eval_loss": 0.8420960307121277, |
|
"eval_runtime": 9.6794, |
|
"eval_samples_per_second": 103.312, |
|
"eval_steps_per_second": 1.446, |
|
"step": 12720 |
|
}, |
|
{ |
|
"epoch": 0.8006286836935167, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7087, |
|
"step": 12735 |
|
}, |
|
{ |
|
"epoch": 0.8015717092337917, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7219, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 0.8025147347740668, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7241, |
|
"step": 12765 |
|
}, |
|
{ |
|
"epoch": 0.8034577603143418, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7211, |
|
"step": 12780 |
|
}, |
|
{ |
|
"epoch": 0.8044007858546169, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7234, |
|
"step": 12795 |
|
}, |
|
{ |
|
"epoch": 0.805343811394892, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7214, |
|
"step": 12810 |
|
}, |
|
{ |
|
"epoch": 0.806286836935167, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7201, |
|
"step": 12825 |
|
}, |
|
{ |
|
"epoch": 0.807229862475442, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7261, |
|
"step": 12840 |
|
}, |
|
{ |
|
"epoch": 0.8081728880157171, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7301, |
|
"step": 12855 |
|
}, |
|
{ |
|
"epoch": 0.8091159135559921, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7104, |
|
"step": 12870 |
|
}, |
|
{ |
|
"epoch": 0.8100589390962671, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7094, |
|
"step": 12885 |
|
}, |
|
{ |
|
"epoch": 0.8110019646365423, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7188, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.8119449901768173, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7296, |
|
"step": 12915 |
|
}, |
|
{ |
|
"epoch": 0.8128880157170923, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.001, |
|
"loss": 0.725, |
|
"step": 12930 |
|
}, |
|
{ |
|
"epoch": 0.8138310412573674, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.716, |
|
"step": 12945 |
|
}, |
|
{ |
|
"epoch": 0.8147740667976424, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7199, |
|
"step": 12960 |
|
}, |
|
{ |
|
"epoch": 0.8157170923379174, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7251, |
|
"step": 12975 |
|
}, |
|
{ |
|
"epoch": 0.8166601178781925, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7153, |
|
"step": 12990 |
|
}, |
|
{ |
|
"epoch": 0.8176031434184676, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7172, |
|
"step": 13005 |
|
}, |
|
{ |
|
"epoch": 0.8185461689587427, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7241, |
|
"step": 13020 |
|
}, |
|
{ |
|
"epoch": 0.8194891944990177, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7087, |
|
"step": 13035 |
|
}, |
|
{ |
|
"epoch": 0.8204322200392927, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7146, |
|
"step": 13050 |
|
}, |
|
{ |
|
"epoch": 0.8213752455795678, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7137, |
|
"step": 13065 |
|
}, |
|
{ |
|
"epoch": 0.8223182711198428, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7309, |
|
"step": 13080 |
|
}, |
|
{ |
|
"epoch": 0.8232612966601178, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7075, |
|
"step": 13095 |
|
}, |
|
{ |
|
"epoch": 0.824204322200393, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001, |
|
"loss": 0.7187, |
|
"step": 13110 |
|
}, |
|
{ |
|
"epoch": 0.825147347740668, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7133, |
|
"step": 13125 |
|
}, |
|
{ |
|
"epoch": 0.826090373280943, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7062, |
|
"step": 13140 |
|
}, |
|
{ |
|
"epoch": 0.8270333988212181, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7139, |
|
"step": 13155 |
|
}, |
|
{ |
|
"epoch": 0.8279764243614931, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7122, |
|
"step": 13170 |
|
}, |
|
{ |
|
"epoch": 0.8289194499017681, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7089, |
|
"step": 13185 |
|
}, |
|
{ |
|
"epoch": 0.8298624754420432, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7148, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.8308055009823183, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7165, |
|
"step": 13215 |
|
}, |
|
{ |
|
"epoch": 0.8317485265225933, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.716, |
|
"step": 13230 |
|
}, |
|
{ |
|
"epoch": 0.8326915520628684, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7324, |
|
"step": 13245 |
|
}, |
|
{ |
|
"epoch": 0.8336345776031434, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7028, |
|
"step": 13260 |
|
}, |
|
{ |
|
"epoch": 0.8345776031434184, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7162, |
|
"step": 13275 |
|
}, |
|
{ |
|
"epoch": 0.8355206286836935, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7115, |
|
"step": 13290 |
|
}, |
|
{ |
|
"epoch": 0.8364636542239685, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7323, |
|
"step": 13305 |
|
}, |
|
{ |
|
"epoch": 0.8374066797642437, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7154, |
|
"step": 13320 |
|
}, |
|
{ |
|
"epoch": 0.8383497053045187, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7231, |
|
"step": 13335 |
|
}, |
|
{ |
|
"epoch": 0.8392927308447937, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7308, |
|
"step": 13350 |
|
}, |
|
{ |
|
"epoch": 0.8402357563850688, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7169, |
|
"step": 13365 |
|
}, |
|
{ |
|
"epoch": 0.8411787819253438, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7209, |
|
"step": 13380 |
|
}, |
|
{ |
|
"epoch": 0.8421218074656188, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.706, |
|
"step": 13395 |
|
}, |
|
{ |
|
"epoch": 0.8430648330058939, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7079, |
|
"step": 13410 |
|
}, |
|
{ |
|
"epoch": 0.844007858546169, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7062, |
|
"step": 13425 |
|
}, |
|
{ |
|
"epoch": 0.844950884086444, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7047, |
|
"step": 13440 |
|
}, |
|
{ |
|
"epoch": 0.8458939096267191, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7179, |
|
"step": 13455 |
|
}, |
|
{ |
|
"epoch": 0.8468369351669941, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7159, |
|
"step": 13470 |
|
}, |
|
{ |
|
"epoch": 0.8477799607072691, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7152, |
|
"step": 13485 |
|
}, |
|
{ |
|
"epoch": 0.8487229862475442, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.709, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.8496660117878193, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7158, |
|
"step": 13515 |
|
}, |
|
{ |
|
"epoch": 0.8506090373280943, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7026, |
|
"step": 13530 |
|
}, |
|
{ |
|
"epoch": 0.8515520628683694, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7197, |
|
"step": 13545 |
|
}, |
|
{ |
|
"epoch": 0.8524950884086444, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7271, |
|
"step": 13560 |
|
}, |
|
{ |
|
"epoch": 0.8534381139489194, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7241, |
|
"step": 13575 |
|
}, |
|
{ |
|
"epoch": 0.8543811394891945, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7244, |
|
"step": 13590 |
|
}, |
|
{ |
|
"epoch": 0.8553241650294695, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7154, |
|
"step": 13605 |
|
}, |
|
{ |
|
"epoch": 0.8562671905697447, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7135, |
|
"step": 13620 |
|
}, |
|
{ |
|
"epoch": 0.8572102161100197, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7095, |
|
"step": 13635 |
|
}, |
|
{ |
|
"epoch": 0.8581532416502947, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7245, |
|
"step": 13650 |
|
}, |
|
{ |
|
"epoch": 0.8590962671905698, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7174, |
|
"step": 13665 |
|
}, |
|
{ |
|
"epoch": 0.8600392927308448, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7131, |
|
"step": 13680 |
|
}, |
|
{ |
|
"epoch": 0.8609823182711198, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7227, |
|
"step": 13695 |
|
}, |
|
{ |
|
"epoch": 0.8619253438113949, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7067, |
|
"step": 13710 |
|
}, |
|
{ |
|
"epoch": 0.86286836935167, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7013, |
|
"step": 13725 |
|
}, |
|
{ |
|
"epoch": 0.863811394891945, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7046, |
|
"step": 13740 |
|
}, |
|
{ |
|
"epoch": 0.8647544204322201, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7077, |
|
"step": 13755 |
|
}, |
|
{ |
|
"epoch": 0.8656974459724951, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7064, |
|
"step": 13770 |
|
}, |
|
{ |
|
"epoch": 0.8666404715127701, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7177, |
|
"step": 13785 |
|
}, |
|
{ |
|
"epoch": 0.8675834970530452, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7128, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 0.8685265225933202, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7131, |
|
"step": 13815 |
|
}, |
|
{ |
|
"epoch": 0.8694695481335953, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7048, |
|
"step": 13830 |
|
}, |
|
{ |
|
"epoch": 0.8704125736738704, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7183, |
|
"step": 13845 |
|
}, |
|
{ |
|
"epoch": 0.8713555992141454, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7087, |
|
"step": 13860 |
|
}, |
|
{ |
|
"epoch": 0.8722986247544204, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7117, |
|
"step": 13875 |
|
}, |
|
{ |
|
"epoch": 0.8732416502946955, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7216, |
|
"step": 13890 |
|
}, |
|
{ |
|
"epoch": 0.8741846758349705, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7159, |
|
"step": 13905 |
|
}, |
|
{ |
|
"epoch": 0.8751277013752455, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.001, |
|
"loss": 0.7096, |
|
"step": 13920 |
|
}, |
|
{ |
|
"epoch": 0.8760707269155207, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.702, |
|
"step": 13935 |
|
}, |
|
{ |
|
"epoch": 0.8770137524557957, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7101, |
|
"step": 13950 |
|
}, |
|
{ |
|
"epoch": 0.8779567779960707, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7212, |
|
"step": 13965 |
|
}, |
|
{ |
|
"epoch": 0.8788998035363458, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7126, |
|
"step": 13980 |
|
}, |
|
{ |
|
"epoch": 0.8798428290766208, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7036, |
|
"step": 13995 |
|
}, |
|
{ |
|
"epoch": 0.8807858546168958, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7071, |
|
"step": 14010 |
|
}, |
|
{ |
|
"epoch": 0.8817288801571709, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7051, |
|
"step": 14025 |
|
}, |
|
{ |
|
"epoch": 0.882671905697446, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7156, |
|
"step": 14040 |
|
}, |
|
{ |
|
"epoch": 0.8836149312377211, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.709, |
|
"step": 14055 |
|
}, |
|
{ |
|
"epoch": 0.8845579567779961, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7062, |
|
"step": 14070 |
|
}, |
|
{ |
|
"epoch": 0.8855009823182711, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7142, |
|
"step": 14085 |
|
}, |
|
{ |
|
"epoch": 0.8864440078585462, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7143, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 0.8873870333988212, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7093, |
|
"step": 14115 |
|
}, |
|
{ |
|
"epoch": 0.8883300589390962, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.712, |
|
"step": 14130 |
|
}, |
|
{ |
|
"epoch": 0.8892730844793714, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7085, |
|
"step": 14145 |
|
}, |
|
{ |
|
"epoch": 0.8902161100196464, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7197, |
|
"step": 14160 |
|
}, |
|
{ |
|
"epoch": 0.8911591355599214, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7022, |
|
"step": 14175 |
|
}, |
|
{ |
|
"epoch": 0.8921021611001965, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7019, |
|
"step": 14190 |
|
}, |
|
{ |
|
"epoch": 0.8930451866404715, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7171, |
|
"step": 14205 |
|
}, |
|
{ |
|
"epoch": 0.8939882121807465, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7052, |
|
"step": 14220 |
|
}, |
|
{ |
|
"epoch": 0.8949312377210216, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7029, |
|
"step": 14235 |
|
}, |
|
{ |
|
"epoch": 0.8958742632612967, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7067, |
|
"step": 14250 |
|
}, |
|
{ |
|
"epoch": 0.8968172888015717, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.6962, |
|
"step": 14265 |
|
}, |
|
{ |
|
"epoch": 0.8977603143418468, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.702, |
|
"step": 14280 |
|
}, |
|
{ |
|
"epoch": 0.8987033398821218, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7066, |
|
"step": 14295 |
|
}, |
|
{ |
|
"epoch": 0.8996463654223968, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7141, |
|
"step": 14310 |
|
}, |
|
{ |
|
"epoch": 0.8996463654223968, |
|
"eval_loss": 0.8242524266242981, |
|
"eval_runtime": 9.6736, |
|
"eval_samples_per_second": 103.374, |
|
"eval_steps_per_second": 1.447, |
|
"step": 14310 |
|
}, |
|
{ |
|
"epoch": 0.9005893909626719, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7051, |
|
"step": 14325 |
|
}, |
|
{ |
|
"epoch": 0.9015324165029469, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7161, |
|
"step": 14340 |
|
}, |
|
{ |
|
"epoch": 0.902475442043222, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.6994, |
|
"step": 14355 |
|
}, |
|
{ |
|
"epoch": 0.9034184675834971, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7121, |
|
"step": 14370 |
|
}, |
|
{ |
|
"epoch": 0.9043614931237721, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7232, |
|
"step": 14385 |
|
}, |
|
{ |
|
"epoch": 0.9053045186640472, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7122, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.9062475442043222, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7168, |
|
"step": 14415 |
|
}, |
|
{ |
|
"epoch": 0.9071905697445972, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.6997, |
|
"step": 14430 |
|
}, |
|
{ |
|
"epoch": 0.9081335952848723, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7124, |
|
"step": 14445 |
|
}, |
|
{ |
|
"epoch": 0.9090766208251474, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.6995, |
|
"step": 14460 |
|
}, |
|
{ |
|
"epoch": 0.9100196463654224, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7087, |
|
"step": 14475 |
|
}, |
|
{ |
|
"epoch": 0.9109626719056975, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.001, |
|
"loss": 0.6991, |
|
"step": 14490 |
|
}, |
|
{ |
|
"epoch": 0.9119056974459725, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7069, |
|
"step": 14505 |
|
}, |
|
{ |
|
"epoch": 0.9128487229862475, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.001, |
|
"loss": 0.701, |
|
"step": 14520 |
|
}, |
|
{ |
|
"epoch": 0.9137917485265226, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7111, |
|
"step": 14535 |
|
}, |
|
{ |
|
"epoch": 0.9147347740667976, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.6989, |
|
"step": 14550 |
|
}, |
|
{ |
|
"epoch": 0.9156777996070727, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7243, |
|
"step": 14565 |
|
}, |
|
{ |
|
"epoch": 0.9166208251473478, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7043, |
|
"step": 14580 |
|
}, |
|
{ |
|
"epoch": 0.9175638506876228, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.6925, |
|
"step": 14595 |
|
}, |
|
{ |
|
"epoch": 0.9185068762278978, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7129, |
|
"step": 14610 |
|
}, |
|
{ |
|
"epoch": 0.9194499017681729, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7064, |
|
"step": 14625 |
|
}, |
|
{ |
|
"epoch": 0.9203929273084479, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.6876, |
|
"step": 14640 |
|
}, |
|
{ |
|
"epoch": 0.9213359528487229, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.001, |
|
"loss": 0.6978, |
|
"step": 14655 |
|
}, |
|
{ |
|
"epoch": 0.9222789783889981, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7003, |
|
"step": 14670 |
|
}, |
|
{ |
|
"epoch": 0.9232220039292731, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7009, |
|
"step": 14685 |
|
}, |
|
{ |
|
"epoch": 0.9241650294695481, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7093, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 0.9251080550098232, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.6927, |
|
"step": 14715 |
|
}, |
|
{ |
|
"epoch": 0.9260510805500982, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.6995, |
|
"step": 14730 |
|
}, |
|
{ |
|
"epoch": 0.9269941060903732, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.711, |
|
"step": 14745 |
|
}, |
|
{ |
|
"epoch": 0.9279371316306483, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7156, |
|
"step": 14760 |
|
}, |
|
{ |
|
"epoch": 0.9288801571709234, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7173, |
|
"step": 14775 |
|
}, |
|
{ |
|
"epoch": 0.9298231827111985, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7132, |
|
"step": 14790 |
|
}, |
|
{ |
|
"epoch": 0.9307662082514735, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.6983, |
|
"step": 14805 |
|
}, |
|
{ |
|
"epoch": 0.9317092337917485, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7047, |
|
"step": 14820 |
|
}, |
|
{ |
|
"epoch": 0.9326522593320236, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7115, |
|
"step": 14835 |
|
}, |
|
{ |
|
"epoch": 0.9335952848722986, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7038, |
|
"step": 14850 |
|
}, |
|
{ |
|
"epoch": 0.9345383104125736, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7066, |
|
"step": 14865 |
|
}, |
|
{ |
|
"epoch": 0.9354813359528488, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7062, |
|
"step": 14880 |
|
}, |
|
{ |
|
"epoch": 0.9364243614931238, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.001, |
|
"loss": 0.6915, |
|
"step": 14895 |
|
}, |
|
{ |
|
"epoch": 0.9373673870333988, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7031, |
|
"step": 14910 |
|
}, |
|
{ |
|
"epoch": 0.9383104125736739, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7072, |
|
"step": 14925 |
|
}, |
|
{ |
|
"epoch": 0.9392534381139489, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7012, |
|
"step": 14940 |
|
}, |
|
{ |
|
"epoch": 0.9401964636542239, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7211, |
|
"step": 14955 |
|
}, |
|
{ |
|
"epoch": 0.941139489194499, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7048, |
|
"step": 14970 |
|
}, |
|
{ |
|
"epoch": 0.9420825147347741, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001, |
|
"loss": 0.7016, |
|
"step": 14985 |
|
}, |
|
{ |
|
"epoch": 0.9430255402750491, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7095, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.9439685658153242, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.001, |
|
"loss": 0.705, |
|
"step": 15015 |
|
}, |
|
{ |
|
"epoch": 0.9449115913555992, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.6986, |
|
"step": 15030 |
|
}, |
|
{ |
|
"epoch": 0.9458546168958742, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7026, |
|
"step": 15045 |
|
}, |
|
{ |
|
"epoch": 0.9467976424361493, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.709, |
|
"step": 15060 |
|
}, |
|
{ |
|
"epoch": 0.9477406679764243, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.001, |
|
"loss": 0.712, |
|
"step": 15075 |
|
}, |
|
{ |
|
"epoch": 0.9486836935166995, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7126, |
|
"step": 15090 |
|
}, |
|
{ |
|
"epoch": 0.9496267190569745, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.6879, |
|
"step": 15105 |
|
}, |
|
{ |
|
"epoch": 0.9505697445972495, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7031, |
|
"step": 15120 |
|
}, |
|
{ |
|
"epoch": 0.9515127701375246, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7146, |
|
"step": 15135 |
|
}, |
|
{ |
|
"epoch": 0.9524557956777996, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.001, |
|
"loss": 0.6882, |
|
"step": 15150 |
|
}, |
|
{ |
|
"epoch": 0.9533988212180746, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.6981, |
|
"step": 15165 |
|
}, |
|
{ |
|
"epoch": 0.9543418467583497, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7011, |
|
"step": 15180 |
|
}, |
|
{ |
|
"epoch": 0.9552848722986248, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.698, |
|
"step": 15195 |
|
}, |
|
{ |
|
"epoch": 0.9562278978388998, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.6932, |
|
"step": 15210 |
|
}, |
|
{ |
|
"epoch": 0.9571709233791749, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.69, |
|
"step": 15225 |
|
}, |
|
{ |
|
"epoch": 0.9581139489194499, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.695, |
|
"step": 15240 |
|
}, |
|
{ |
|
"epoch": 0.9590569744597249, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.001, |
|
"loss": 0.7002, |
|
"step": 15255 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.6943, |
|
"step": 15270 |
|
}, |
|
{ |
|
"epoch": 0.960943025540275, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7044, |
|
"step": 15285 |
|
}, |
|
{ |
|
"epoch": 0.9618860510805501, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7069, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 0.9628290766208252, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.6985, |
|
"step": 15315 |
|
}, |
|
{ |
|
"epoch": 0.9637721021611002, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7049, |
|
"step": 15330 |
|
}, |
|
{ |
|
"epoch": 0.9647151277013752, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7035, |
|
"step": 15345 |
|
}, |
|
{ |
|
"epoch": 0.9656581532416503, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7016, |
|
"step": 15360 |
|
}, |
|
{ |
|
"epoch": 0.9666011787819253, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.6954, |
|
"step": 15375 |
|
}, |
|
{ |
|
"epoch": 0.9675442043222003, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7014, |
|
"step": 15390 |
|
}, |
|
{ |
|
"epoch": 0.9684872298624755, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7129, |
|
"step": 15405 |
|
}, |
|
{ |
|
"epoch": 0.9694302554027505, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.6999, |
|
"step": 15420 |
|
}, |
|
{ |
|
"epoch": 0.9703732809430256, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7017, |
|
"step": 15435 |
|
}, |
|
{ |
|
"epoch": 0.9713163064833006, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.6893, |
|
"step": 15450 |
|
}, |
|
{ |
|
"epoch": 0.9722593320235756, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.001, |
|
"loss": 0.6993, |
|
"step": 15465 |
|
}, |
|
{ |
|
"epoch": 0.9732023575638507, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.6999, |
|
"step": 15480 |
|
}, |
|
{ |
|
"epoch": 0.9741453831041257, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.6864, |
|
"step": 15495 |
|
}, |
|
{ |
|
"epoch": 0.9750884086444008, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7057, |
|
"step": 15510 |
|
}, |
|
{ |
|
"epoch": 0.9760314341846759, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.001, |
|
"loss": 0.6957, |
|
"step": 15525 |
|
}, |
|
{ |
|
"epoch": 0.9769744597249509, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001, |
|
"loss": 0.709, |
|
"step": 15540 |
|
}, |
|
{ |
|
"epoch": 0.9779174852652259, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.6965, |
|
"step": 15555 |
|
}, |
|
{ |
|
"epoch": 0.978860510805501, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.6989, |
|
"step": 15570 |
|
}, |
|
{ |
|
"epoch": 0.979803536345776, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.6995, |
|
"step": 15585 |
|
}, |
|
{ |
|
"epoch": 0.980746561886051, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.6894, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.9816895874263262, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.001, |
|
"loss": 0.7084, |
|
"step": 15615 |
|
}, |
|
{ |
|
"epoch": 0.9826326129666012, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7021, |
|
"step": 15630 |
|
}, |
|
{ |
|
"epoch": 0.9835756385068762, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.001, |
|
"loss": 0.6892, |
|
"step": 15645 |
|
}, |
|
{ |
|
"epoch": 0.9845186640471513, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.001, |
|
"loss": 0.7147, |
|
"step": 15660 |
|
}, |
|
{ |
|
"epoch": 0.9854616895874263, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.001, |
|
"loss": 0.7007, |
|
"step": 15675 |
|
}, |
|
{ |
|
"epoch": 0.9864047151277013, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.699, |
|
"step": 15690 |
|
}, |
|
{ |
|
"epoch": 0.9873477406679764, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.001, |
|
"loss": 0.6943, |
|
"step": 15705 |
|
}, |
|
{ |
|
"epoch": 0.9882907662082515, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.001, |
|
"loss": 0.6943, |
|
"step": 15720 |
|
}, |
|
{ |
|
"epoch": 0.9892337917485265, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.001, |
|
"loss": 0.703, |
|
"step": 15735 |
|
}, |
|
{ |
|
"epoch": 0.9901768172888016, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.6953, |
|
"step": 15750 |
|
}, |
|
{ |
|
"epoch": 0.9911198428290766, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.001, |
|
"loss": 0.6884, |
|
"step": 15765 |
|
}, |
|
{ |
|
"epoch": 0.9920628683693516, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.001, |
|
"loss": 0.6972, |
|
"step": 15780 |
|
}, |
|
{ |
|
"epoch": 0.9930058939096267, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.001, |
|
"loss": 0.6929, |
|
"step": 15795 |
|
}, |
|
{ |
|
"epoch": 0.9939489194499017, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.001, |
|
"loss": 0.6849, |
|
"step": 15810 |
|
}, |
|
{ |
|
"epoch": 0.9948919449901769, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.6932, |
|
"step": 15825 |
|
}, |
|
{ |
|
"epoch": 0.9958349705304519, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7042, |
|
"step": 15840 |
|
}, |
|
{ |
|
"epoch": 0.9967779960707269, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.001, |
|
"loss": 0.6924, |
|
"step": 15855 |
|
}, |
|
{ |
|
"epoch": 0.997721021611002, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7009, |
|
"step": 15870 |
|
}, |
|
{ |
|
"epoch": 0.998664047151277, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7059, |
|
"step": 15885 |
|
}, |
|
{ |
|
"epoch": 0.999607072691552, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.001, |
|
"loss": 0.691, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 0.999607072691552, |
|
"eval_loss": 0.8118711709976196, |
|
"eval_runtime": 9.6839, |
|
"eval_samples_per_second": 103.264, |
|
"eval_steps_per_second": 1.446, |
|
"step": 15900 |
|
} |
|
], |
|
"logging_steps": 15, |
|
"max_steps": 15906, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1590, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.185992916964999e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|