diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100644--- "a/trainer_state.json" +++ /dev/null @@ -1,5692 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.0, - "eval_steps": 344, - "global_step": 5510, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0014519056261343012, - "grad_norm": 6.9375, - "learning_rate": 1.3333333333333333e-05, - "loss": 0.6156, - "num_input_tokens_seen": 499226, - "step": 8 - }, - { - "epoch": 0.0029038112522686023, - "grad_norm": 1.203125, - "learning_rate": 2.6666666666666667e-05, - "loss": 0.0994, - "num_input_tokens_seen": 1014244, - "step": 16 - }, - { - "epoch": 0.004355716878402904, - "grad_norm": 2.125, - "learning_rate": 4e-05, - "loss": 0.0849, - "num_input_tokens_seen": 1528464, - "step": 24 - }, - { - "epoch": 0.005807622504537205, - "grad_norm": 1.4453125, - "learning_rate": 3.999979012178918e-05, - "loss": 0.12, - "num_input_tokens_seen": 2041011, - "step": 32 - }, - { - "epoch": 0.007259528130671506, - "grad_norm": 2.609375, - "learning_rate": 3.9999160491561583e-05, - "loss": 0.1437, - "num_input_tokens_seen": 2530185, - "step": 40 - }, - { - "epoch": 0.008711433756805808, - "grad_norm": 1.4140625, - "learning_rate": 3.9998111122531796e-05, - "loss": 0.0898, - "num_input_tokens_seen": 3017273, - "step": 48 - }, - { - "epoch": 0.010163339382940109, - "grad_norm": 1.921875, - "learning_rate": 3.999664203672378e-05, - "loss": 0.1247, - "num_input_tokens_seen": 3507672, - "step": 56 - }, - { - "epoch": 0.01161524500907441, - "grad_norm": 1.0859375, - "learning_rate": 3.999475326497044e-05, - "loss": 0.0819, - "num_input_tokens_seen": 4018539, - "step": 64 - }, - { - "epoch": 0.013067150635208712, - "grad_norm": 1.6171875, - "learning_rate": 3.999244484691299e-05, - "loss": 0.1078, - "num_input_tokens_seen": 4525857, - "step": 72 - }, - { - "epoch": 0.014519056261343012, - "grad_norm": 1.3671875, - "learning_rate": 3.998971683100009e-05, - "loss": 0.099, - "num_input_tokens_seen": 5023032, - "step": 80 - }, - { - "epoch": 0.015970961887477313, - "grad_norm": 1.625, - "learning_rate": 3.9986569274486843e-05, - "loss": 0.0855, - "num_input_tokens_seen": 5524113, - "step": 88 - }, - { - "epoch": 0.017422867513611617, - "grad_norm": 1.734375, - "learning_rate": 3.9983002243433615e-05, - "loss": 0.1026, - "num_input_tokens_seen": 5999882, - "step": 96 - }, - { - "epoch": 0.018874773139745917, - "grad_norm": 3.5625, - "learning_rate": 3.9979015812704605e-05, - "loss": 0.0843, - "num_input_tokens_seen": 6471878, - "step": 104 - }, - { - "epoch": 0.020326678765880218, - "grad_norm": 1.0625, - "learning_rate": 3.997461006596631e-05, - "loss": 0.0841, - "num_input_tokens_seen": 6944973, - "step": 112 - }, - { - "epoch": 0.021778584392014518, - "grad_norm": 1.0625, - "learning_rate": 3.9969785095685765e-05, - "loss": 0.0982, - "num_input_tokens_seen": 7460215, - "step": 120 - }, - { - "epoch": 0.02323049001814882, - "grad_norm": 1.0859375, - "learning_rate": 3.996454100312857e-05, - "loss": 0.0971, - "num_input_tokens_seen": 7942417, - "step": 128 - }, - { - "epoch": 0.024682395644283123, - "grad_norm": 82.0, - "learning_rate": 3.9958877898356806e-05, - "loss": 0.2563, - "num_input_tokens_seen": 8454243, - "step": 136 - }, - { - "epoch": 0.026134301270417423, - "grad_norm": 2.21875, - "learning_rate": 3.99527959002267e-05, - "loss": 0.1566, - "num_input_tokens_seen": 8973734, - "step": 144 - }, - { - "epoch": 0.027586206896551724, - "grad_norm": 2.40625, - "learning_rate": 3.994629513638614e-05, - "loss": 0.1109, - "num_input_tokens_seen": 9497439, - "step": 152 - }, - { - "epoch": 0.029038112522686024, - "grad_norm": 3.65625, - "learning_rate": 3.993937574327201e-05, - "loss": 0.1353, - "num_input_tokens_seen": 9988636, - "step": 160 - }, - { - "epoch": 0.030490018148820328, - "grad_norm": 1.578125, - "learning_rate": 3.993203786610727e-05, - "loss": 0.1002, - "num_input_tokens_seen": 10460548, - "step": 168 - }, - { - "epoch": 0.031941923774954625, - "grad_norm": 1.1015625, - "learning_rate": 3.992428165889799e-05, - "loss": 0.0952, - "num_input_tokens_seen": 10983644, - "step": 176 - }, - { - "epoch": 0.033393829401088926, - "grad_norm": 2.515625, - "learning_rate": 3.991610728443006e-05, - "loss": 0.1082, - "num_input_tokens_seen": 11485663, - "step": 184 - }, - { - "epoch": 0.03484573502722323, - "grad_norm": 1.53125, - "learning_rate": 3.9907514914265776e-05, - "loss": 0.0907, - "num_input_tokens_seen": 11981340, - "step": 192 - }, - { - "epoch": 0.036297640653357534, - "grad_norm": 12.0625, - "learning_rate": 3.989850472874027e-05, - "loss": 0.0704, - "num_input_tokens_seen": 12482463, - "step": 200 - }, - { - "epoch": 0.037749546279491834, - "grad_norm": 1.078125, - "learning_rate": 3.988907691695771e-05, - "loss": 0.0847, - "num_input_tokens_seen": 12968571, - "step": 208 - }, - { - "epoch": 0.039201451905626135, - "grad_norm": 1.2578125, - "learning_rate": 3.987923167678732e-05, - "loss": 0.0968, - "num_input_tokens_seen": 13451536, - "step": 216 - }, - { - "epoch": 0.040653357531760435, - "grad_norm": 2.484375, - "learning_rate": 3.986896921485924e-05, - "loss": 0.1026, - "num_input_tokens_seen": 13949131, - "step": 224 - }, - { - "epoch": 0.042105263157894736, - "grad_norm": 2.453125, - "learning_rate": 3.9858289746560183e-05, - "loss": 0.1126, - "num_input_tokens_seen": 14447251, - "step": 232 - }, - { - "epoch": 0.043557168784029036, - "grad_norm": 1.2265625, - "learning_rate": 3.984719349602892e-05, - "loss": 0.0934, - "num_input_tokens_seen": 14937783, - "step": 240 - }, - { - "epoch": 0.04500907441016334, - "grad_norm": 1.75, - "learning_rate": 3.983568069615157e-05, - "loss": 0.0936, - "num_input_tokens_seen": 15429323, - "step": 248 - }, - { - "epoch": 0.04646098003629764, - "grad_norm": 1.2109375, - "learning_rate": 3.982375158855672e-05, - "loss": 0.0749, - "num_input_tokens_seen": 15920688, - "step": 256 - }, - { - "epoch": 0.047912885662431945, - "grad_norm": 1.2578125, - "learning_rate": 3.981140642361034e-05, - "loss": 0.0868, - "num_input_tokens_seen": 16393398, - "step": 264 - }, - { - "epoch": 0.049364791288566245, - "grad_norm": 1.171875, - "learning_rate": 3.9798645460410544e-05, - "loss": 0.0997, - "num_input_tokens_seen": 16894283, - "step": 272 - }, - { - "epoch": 0.050816696914700546, - "grad_norm": 0.99609375, - "learning_rate": 3.9785468966782155e-05, - "loss": 0.0849, - "num_input_tokens_seen": 17371830, - "step": 280 - }, - { - "epoch": 0.052268602540834846, - "grad_norm": 1.15625, - "learning_rate": 3.9771877219271055e-05, - "loss": 0.0925, - "num_input_tokens_seen": 17893827, - "step": 288 - }, - { - "epoch": 0.05372050816696915, - "grad_norm": 0.8125, - "learning_rate": 3.975787050313841e-05, - "loss": 0.0822, - "num_input_tokens_seen": 18380621, - "step": 296 - }, - { - "epoch": 0.05517241379310345, - "grad_norm": 1.6484375, - "learning_rate": 3.9743449112354676e-05, - "loss": 0.1172, - "num_input_tokens_seen": 18905348, - "step": 304 - }, - { - "epoch": 0.05662431941923775, - "grad_norm": 1.2734375, - "learning_rate": 3.9728613349593415e-05, - "loss": 0.1075, - "num_input_tokens_seen": 19399905, - "step": 312 - }, - { - "epoch": 0.05807622504537205, - "grad_norm": 18.25, - "learning_rate": 3.971336352622496e-05, - "loss": 0.1882, - "num_input_tokens_seen": 19921923, - "step": 320 - }, - { - "epoch": 0.05952813067150635, - "grad_norm": 1.8359375, - "learning_rate": 3.969769996230989e-05, - "loss": 0.1074, - "num_input_tokens_seen": 20436822, - "step": 328 - }, - { - "epoch": 0.060980036297640657, - "grad_norm": 1.3828125, - "learning_rate": 3.968162298659227e-05, - "loss": 0.1112, - "num_input_tokens_seen": 20943888, - "step": 336 - }, - { - "epoch": 0.06243194192377496, - "grad_norm": 1.3125, - "learning_rate": 3.9665132936492794e-05, - "loss": 0.1519, - "num_input_tokens_seen": 21418243, - "step": 344 - }, - { - "epoch": 0.06243194192377496, - "eval_loss": 0.11010845005512238, - "eval_runtime": 2622.9951, - "eval_samples_per_second": 1.188, - "eval_steps_per_second": 0.149, - "num_input_tokens_seen": 21418243, - "step": 344 - }, - { - "epoch": 0.06388384754990925, - "grad_norm": 3.640625, - "learning_rate": 3.9648230158101674e-05, - "loss": 0.123, - "num_input_tokens_seen": 21924518, - "step": 352 - }, - { - "epoch": 0.06533575317604355, - "grad_norm": 1.5625, - "learning_rate": 3.9630915006171416e-05, - "loss": 0.1086, - "num_input_tokens_seen": 22403227, - "step": 360 - }, - { - "epoch": 0.06678765880217785, - "grad_norm": 3.09375, - "learning_rate": 3.961318784410932e-05, - "loss": 0.1068, - "num_input_tokens_seen": 22901361, - "step": 368 - }, - { - "epoch": 0.06823956442831217, - "grad_norm": 0.9375, - "learning_rate": 3.95950490439699e-05, - "loss": 0.0931, - "num_input_tokens_seen": 23408098, - "step": 376 - }, - { - "epoch": 0.06969147005444647, - "grad_norm": 0.9296875, - "learning_rate": 3.9576498986447026e-05, - "loss": 0.0817, - "num_input_tokens_seen": 23890867, - "step": 384 - }, - { - "epoch": 0.07114337568058077, - "grad_norm": 1.2109375, - "learning_rate": 3.9557538060866005e-05, - "loss": 0.0917, - "num_input_tokens_seen": 24393313, - "step": 392 - }, - { - "epoch": 0.07259528130671507, - "grad_norm": 1.0078125, - "learning_rate": 3.9538166665175354e-05, - "loss": 0.0865, - "num_input_tokens_seen": 24894282, - "step": 400 - }, - { - "epoch": 0.07404718693284937, - "grad_norm": 1.640625, - "learning_rate": 3.9518385205938446e-05, - "loss": 0.1222, - "num_input_tokens_seen": 25397169, - "step": 408 - }, - { - "epoch": 0.07549909255898367, - "grad_norm": 1.5859375, - "learning_rate": 3.949819409832502e-05, - "loss": 0.0899, - "num_input_tokens_seen": 25894407, - "step": 416 - }, - { - "epoch": 0.07695099818511797, - "grad_norm": 1.1640625, - "learning_rate": 3.947759376610242e-05, - "loss": 0.0716, - "num_input_tokens_seen": 26375741, - "step": 424 - }, - { - "epoch": 0.07840290381125227, - "grad_norm": 2.15625, - "learning_rate": 3.945658464162674e-05, - "loss": 0.1094, - "num_input_tokens_seen": 26881148, - "step": 432 - }, - { - "epoch": 0.07985480943738657, - "grad_norm": 1.265625, - "learning_rate": 3.9435167165833724e-05, - "loss": 0.1517, - "num_input_tokens_seen": 27373108, - "step": 440 - }, - { - "epoch": 0.08130671506352087, - "grad_norm": 7.84375, - "learning_rate": 3.9413341788229524e-05, - "loss": 0.0959, - "num_input_tokens_seen": 27852888, - "step": 448 - }, - { - "epoch": 0.08275862068965517, - "grad_norm": 2.828125, - "learning_rate": 3.939110896688126e-05, - "loss": 0.0824, - "num_input_tokens_seen": 28338065, - "step": 456 - }, - { - "epoch": 0.08421052631578947, - "grad_norm": 5.5625, - "learning_rate": 3.93684691684074e-05, - "loss": 0.1234, - "num_input_tokens_seen": 28842856, - "step": 464 - }, - { - "epoch": 0.08566243194192377, - "grad_norm": 1.8515625, - "learning_rate": 3.9345422867967995e-05, - "loss": 0.1118, - "num_input_tokens_seen": 29349096, - "step": 472 - }, - { - "epoch": 0.08711433756805807, - "grad_norm": 1.421875, - "learning_rate": 3.9321970549254664e-05, - "loss": 0.1055, - "num_input_tokens_seen": 29826034, - "step": 480 - }, - { - "epoch": 0.08856624319419237, - "grad_norm": 18.75, - "learning_rate": 3.929811270448049e-05, - "loss": 0.1166, - "num_input_tokens_seen": 30321718, - "step": 488 - }, - { - "epoch": 0.09001814882032667, - "grad_norm": 3.46875, - "learning_rate": 3.927384983436964e-05, - "loss": 0.1134, - "num_input_tokens_seen": 30812607, - "step": 496 - }, - { - "epoch": 0.09147005444646097, - "grad_norm": 1.0390625, - "learning_rate": 3.924918244814689e-05, - "loss": 0.0805, - "num_input_tokens_seen": 31304931, - "step": 504 - }, - { - "epoch": 0.09292196007259527, - "grad_norm": 1.1015625, - "learning_rate": 3.922411106352694e-05, - "loss": 0.0849, - "num_input_tokens_seen": 31792831, - "step": 512 - }, - { - "epoch": 0.09437386569872959, - "grad_norm": 1.375, - "learning_rate": 3.9198636206703516e-05, - "loss": 0.0919, - "num_input_tokens_seen": 32286282, - "step": 520 - }, - { - "epoch": 0.09582577132486389, - "grad_norm": 1.40625, - "learning_rate": 3.9172758412338346e-05, - "loss": 0.0896, - "num_input_tokens_seen": 32770941, - "step": 528 - }, - { - "epoch": 0.09727767695099819, - "grad_norm": 4.8125, - "learning_rate": 3.9146478223549974e-05, - "loss": 0.0925, - "num_input_tokens_seen": 33253136, - "step": 536 - }, - { - "epoch": 0.09872958257713249, - "grad_norm": 1.1796875, - "learning_rate": 3.9119796191902274e-05, - "loss": 0.0656, - "num_input_tokens_seen": 33760146, - "step": 544 - }, - { - "epoch": 0.10018148820326679, - "grad_norm": 3.640625, - "learning_rate": 3.9092712877392965e-05, - "loss": 0.1162, - "num_input_tokens_seen": 34251987, - "step": 552 - }, - { - "epoch": 0.10163339382940109, - "grad_norm": 2.03125, - "learning_rate": 3.906522884844181e-05, - "loss": 0.1153, - "num_input_tokens_seen": 34730598, - "step": 560 - }, - { - "epoch": 0.10308529945553539, - "grad_norm": 1.390625, - "learning_rate": 3.903734468187868e-05, - "loss": 0.0731, - "num_input_tokens_seen": 35215481, - "step": 568 - }, - { - "epoch": 0.10453720508166969, - "grad_norm": 2.515625, - "learning_rate": 3.900906096293148e-05, - "loss": 0.0992, - "num_input_tokens_seen": 35691971, - "step": 576 - }, - { - "epoch": 0.105989110707804, - "grad_norm": 0.765625, - "learning_rate": 3.8980378285213846e-05, - "loss": 0.1058, - "num_input_tokens_seen": 36191442, - "step": 584 - }, - { - "epoch": 0.1074410163339383, - "grad_norm": 1.0078125, - "learning_rate": 3.895129725071268e-05, - "loss": 0.0841, - "num_input_tokens_seen": 36677760, - "step": 592 - }, - { - "epoch": 0.1088929219600726, - "grad_norm": 1.1015625, - "learning_rate": 3.892181846977553e-05, - "loss": 0.096, - "num_input_tokens_seen": 37169594, - "step": 600 - }, - { - "epoch": 0.1103448275862069, - "grad_norm": 1.0078125, - "learning_rate": 3.8891942561097787e-05, - "loss": 0.0865, - "num_input_tokens_seen": 37658243, - "step": 608 - }, - { - "epoch": 0.1117967332123412, - "grad_norm": 3.40625, - "learning_rate": 3.8861670151709664e-05, - "loss": 0.0926, - "num_input_tokens_seen": 38172841, - "step": 616 - }, - { - "epoch": 0.1132486388384755, - "grad_norm": 1.9296875, - "learning_rate": 3.883100187696308e-05, - "loss": 0.0844, - "num_input_tokens_seen": 38680418, - "step": 624 - }, - { - "epoch": 0.1147005444646098, - "grad_norm": 0.921875, - "learning_rate": 3.87999383805183e-05, - "loss": 0.0889, - "num_input_tokens_seen": 39168241, - "step": 632 - }, - { - "epoch": 0.1161524500907441, - "grad_norm": 0.9375, - "learning_rate": 3.876848031433042e-05, - "loss": 0.0931, - "num_input_tokens_seen": 39636702, - "step": 640 - }, - { - "epoch": 0.1176043557168784, - "grad_norm": 1.03125, - "learning_rate": 3.8736628338635716e-05, - "loss": 0.0638, - "num_input_tokens_seen": 40118232, - "step": 648 - }, - { - "epoch": 0.1190562613430127, - "grad_norm": 1.4140625, - "learning_rate": 3.870438312193774e-05, - "loss": 0.0775, - "num_input_tokens_seen": 40614511, - "step": 656 - }, - { - "epoch": 0.120508166969147, - "grad_norm": 1.2734375, - "learning_rate": 3.8671745340993354e-05, - "loss": 0.0902, - "num_input_tokens_seen": 41136221, - "step": 664 - }, - { - "epoch": 0.12196007259528131, - "grad_norm": 2.140625, - "learning_rate": 3.863871568079845e-05, - "loss": 0.1083, - "num_input_tokens_seen": 41626515, - "step": 672 - }, - { - "epoch": 0.12341197822141561, - "grad_norm": 1.265625, - "learning_rate": 3.860529483457362e-05, - "loss": 0.0914, - "num_input_tokens_seen": 42128107, - "step": 680 - }, - { - "epoch": 0.12486388384754991, - "grad_norm": 1.921875, - "learning_rate": 3.8571483503749625e-05, - "loss": 0.1172, - "num_input_tokens_seen": 42626752, - "step": 688 - }, - { - "epoch": 0.12486388384754991, - "eval_loss": 0.08887020498514175, - "eval_runtime": 2566.1938, - "eval_samples_per_second": 1.215, - "eval_steps_per_second": 0.152, - "num_input_tokens_seen": 42626752, - "step": 688 - }, - { - "epoch": 0.12631578947368421, - "grad_norm": 1.1875, - "learning_rate": 3.8537282397952604e-05, - "loss": 0.0873, - "num_input_tokens_seen": 43128274, - "step": 696 - }, - { - "epoch": 0.1277676950998185, - "grad_norm": 0.92578125, - "learning_rate": 3.8502692234989265e-05, - "loss": 0.0807, - "num_input_tokens_seen": 43630580, - "step": 704 - }, - { - "epoch": 0.12921960072595282, - "grad_norm": 0.59375, - "learning_rate": 3.846771374083175e-05, - "loss": 0.0792, - "num_input_tokens_seen": 44143904, - "step": 712 - }, - { - "epoch": 0.1306715063520871, - "grad_norm": 1.015625, - "learning_rate": 3.843234764960244e-05, - "loss": 0.0808, - "num_input_tokens_seen": 44635682, - "step": 720 - }, - { - "epoch": 0.13212341197822142, - "grad_norm": 0.84375, - "learning_rate": 3.839659470355853e-05, - "loss": 0.0902, - "num_input_tokens_seen": 45110870, - "step": 728 - }, - { - "epoch": 0.1335753176043557, - "grad_norm": 0.96875, - "learning_rate": 3.8360455653076446e-05, - "loss": 0.0872, - "num_input_tokens_seen": 45620246, - "step": 736 - }, - { - "epoch": 0.13502722323049002, - "grad_norm": 0.79296875, - "learning_rate": 3.832393125663613e-05, - "loss": 0.1095, - "num_input_tokens_seen": 46106634, - "step": 744 - }, - { - "epoch": 0.13647912885662433, - "grad_norm": 1.1875, - "learning_rate": 3.8287022280805064e-05, - "loss": 0.1008, - "num_input_tokens_seen": 46599497, - "step": 752 - }, - { - "epoch": 0.13793103448275862, - "grad_norm": 0.80859375, - "learning_rate": 3.824972950022224e-05, - "loss": 0.0761, - "num_input_tokens_seen": 47098121, - "step": 760 - }, - { - "epoch": 0.13938294010889293, - "grad_norm": 0.75390625, - "learning_rate": 3.8212053697581855e-05, - "loss": 0.0864, - "num_input_tokens_seen": 47599433, - "step": 768 - }, - { - "epoch": 0.14083484573502722, - "grad_norm": 0.77734375, - "learning_rate": 3.817399566361692e-05, - "loss": 0.0756, - "num_input_tokens_seen": 48099996, - "step": 776 - }, - { - "epoch": 0.14228675136116153, - "grad_norm": 0.8203125, - "learning_rate": 3.8135556197082647e-05, - "loss": 0.0991, - "num_input_tokens_seen": 48591151, - "step": 784 - }, - { - "epoch": 0.14373865698729582, - "grad_norm": 1.1875, - "learning_rate": 3.809673610473967e-05, - "loss": 0.0859, - "num_input_tokens_seen": 49119581, - "step": 792 - }, - { - "epoch": 0.14519056261343014, - "grad_norm": 0.99609375, - "learning_rate": 3.805753620133715e-05, - "loss": 0.0938, - "num_input_tokens_seen": 49589057, - "step": 800 - }, - { - "epoch": 0.14664246823956442, - "grad_norm": 1.8828125, - "learning_rate": 3.801795730959565e-05, - "loss": 0.0657, - "num_input_tokens_seen": 50091363, - "step": 808 - }, - { - "epoch": 0.14809437386569874, - "grad_norm": 1.5, - "learning_rate": 3.7978000260189854e-05, - "loss": 0.1124, - "num_input_tokens_seen": 50595440, - "step": 816 - }, - { - "epoch": 0.14954627949183302, - "grad_norm": 1.046875, - "learning_rate": 3.793766589173117e-05, - "loss": 0.0969, - "num_input_tokens_seen": 51097536, - "step": 824 - }, - { - "epoch": 0.15099818511796734, - "grad_norm": 1.2421875, - "learning_rate": 3.789695505075013e-05, - "loss": 0.0815, - "num_input_tokens_seen": 51592933, - "step": 832 - }, - { - "epoch": 0.15245009074410162, - "grad_norm": 0.640625, - "learning_rate": 3.785586859167855e-05, - "loss": 0.0806, - "num_input_tokens_seen": 52089163, - "step": 840 - }, - { - "epoch": 0.15390199637023594, - "grad_norm": 0.87109375, - "learning_rate": 3.78144073768317e-05, - "loss": 0.0628, - "num_input_tokens_seen": 52591035, - "step": 848 - }, - { - "epoch": 0.15535390199637023, - "grad_norm": 0.890625, - "learning_rate": 3.7772572276390125e-05, - "loss": 0.1, - "num_input_tokens_seen": 53108139, - "step": 856 - }, - { - "epoch": 0.15680580762250454, - "grad_norm": 1.3046875, - "learning_rate": 3.7730364168381444e-05, - "loss": 0.1083, - "num_input_tokens_seen": 53612734, - "step": 864 - }, - { - "epoch": 0.15825771324863883, - "grad_norm": 1.28125, - "learning_rate": 3.768778393866186e-05, - "loss": 0.0782, - "num_input_tokens_seen": 54104981, - "step": 872 - }, - { - "epoch": 0.15970961887477314, - "grad_norm": 1.1484375, - "learning_rate": 3.764483248089763e-05, - "loss": 0.1166, - "num_input_tokens_seen": 54591628, - "step": 880 - }, - { - "epoch": 0.16116152450090745, - "grad_norm": 0.89453125, - "learning_rate": 3.760151069654626e-05, - "loss": 0.0958, - "num_input_tokens_seen": 55092240, - "step": 888 - }, - { - "epoch": 0.16261343012704174, - "grad_norm": 1.0546875, - "learning_rate": 3.75578194948376e-05, - "loss": 0.0904, - "num_input_tokens_seen": 55596058, - "step": 896 - }, - { - "epoch": 0.16406533575317606, - "grad_norm": 0.76953125, - "learning_rate": 3.751375979275479e-05, - "loss": 0.0816, - "num_input_tokens_seen": 56065485, - "step": 904 - }, - { - "epoch": 0.16551724137931034, - "grad_norm": 1.0078125, - "learning_rate": 3.746933251501497e-05, - "loss": 0.0729, - "num_input_tokens_seen": 56559741, - "step": 912 - }, - { - "epoch": 0.16696914700544466, - "grad_norm": 0.875, - "learning_rate": 3.7424538594049886e-05, - "loss": 0.0626, - "num_input_tokens_seen": 57042468, - "step": 920 - }, - { - "epoch": 0.16842105263157894, - "grad_norm": 1.2109375, - "learning_rate": 3.737937896998634e-05, - "loss": 0.0872, - "num_input_tokens_seen": 57530081, - "step": 928 - }, - { - "epoch": 0.16987295825771326, - "grad_norm": 0.9296875, - "learning_rate": 3.733385459062645e-05, - "loss": 0.0863, - "num_input_tokens_seen": 58052036, - "step": 936 - }, - { - "epoch": 0.17132486388384754, - "grad_norm": 0.80078125, - "learning_rate": 3.728796641142775e-05, - "loss": 0.0747, - "num_input_tokens_seen": 58558654, - "step": 944 - }, - { - "epoch": 0.17277676950998186, - "grad_norm": 1.46875, - "learning_rate": 3.724171539548311e-05, - "loss": 0.0946, - "num_input_tokens_seen": 59069780, - "step": 952 - }, - { - "epoch": 0.17422867513611615, - "grad_norm": 0.94921875, - "learning_rate": 3.71951025135006e-05, - "loss": 0.0707, - "num_input_tokens_seen": 59546270, - "step": 960 - }, - { - "epoch": 0.17568058076225046, - "grad_norm": 1.1875, - "learning_rate": 3.714812874378305e-05, - "loss": 0.0796, - "num_input_tokens_seen": 60050879, - "step": 968 - }, - { - "epoch": 0.17713248638838475, - "grad_norm": 0.71484375, - "learning_rate": 3.710079507220751e-05, - "loss": 0.0908, - "num_input_tokens_seen": 60542881, - "step": 976 - }, - { - "epoch": 0.17858439201451906, - "grad_norm": 0.64453125, - "learning_rate": 3.705310249220463e-05, - "loss": 0.0799, - "num_input_tokens_seen": 61009270, - "step": 984 - }, - { - "epoch": 0.18003629764065335, - "grad_norm": 2.265625, - "learning_rate": 3.700505200473774e-05, - "loss": 0.0937, - "num_input_tokens_seen": 61499242, - "step": 992 - }, - { - "epoch": 0.18148820326678766, - "grad_norm": 2.78125, - "learning_rate": 3.695664461828187e-05, - "loss": 0.0913, - "num_input_tokens_seen": 61987954, - "step": 1000 - }, - { - "epoch": 0.18294010889292195, - "grad_norm": 0.78125, - "learning_rate": 3.69078813488026e-05, - "loss": 0.0546, - "num_input_tokens_seen": 62482644, - "step": 1008 - }, - { - "epoch": 0.18439201451905626, - "grad_norm": 1.609375, - "learning_rate": 3.68587632197347e-05, - "loss": 0.0788, - "num_input_tokens_seen": 62950426, - "step": 1016 - }, - { - "epoch": 0.18584392014519055, - "grad_norm": 0.90234375, - "learning_rate": 3.6809291261960655e-05, - "loss": 0.0865, - "num_input_tokens_seen": 63454867, - "step": 1024 - }, - { - "epoch": 0.18729582577132486, - "grad_norm": 1.390625, - "learning_rate": 3.675946651378909e-05, - "loss": 0.0832, - "num_input_tokens_seen": 63980224, - "step": 1032 - }, - { - "epoch": 0.18729582577132486, - "eval_loss": 0.07875645905733109, - "eval_runtime": 2702.6122, - "eval_samples_per_second": 1.153, - "eval_steps_per_second": 0.144, - "num_input_tokens_seen": 63980224, - "step": 1032 - }, - { - "epoch": 0.18874773139745918, - "grad_norm": 0.9296875, - "learning_rate": 3.67092900209329e-05, - "loss": 0.0831, - "num_input_tokens_seen": 64445080, - "step": 1040 - }, - { - "epoch": 0.19019963702359347, - "grad_norm": 1.25, - "learning_rate": 3.665876283648732e-05, - "loss": 0.0697, - "num_input_tokens_seen": 64941877, - "step": 1048 - }, - { - "epoch": 0.19165154264972778, - "grad_norm": 0.62890625, - "learning_rate": 3.660788602090788e-05, - "loss": 0.0845, - "num_input_tokens_seen": 65451057, - "step": 1056 - }, - { - "epoch": 0.19310344827586207, - "grad_norm": 1.2265625, - "learning_rate": 3.655666064198807e-05, - "loss": 0.0822, - "num_input_tokens_seen": 65944830, - "step": 1064 - }, - { - "epoch": 0.19455535390199638, - "grad_norm": 1.125, - "learning_rate": 3.6505087774836977e-05, - "loss": 0.0974, - "num_input_tokens_seen": 66458462, - "step": 1072 - }, - { - "epoch": 0.19600725952813067, - "grad_norm": 0.7578125, - "learning_rate": 3.645316850185672e-05, - "loss": 0.0907, - "num_input_tokens_seen": 66955532, - "step": 1080 - }, - { - "epoch": 0.19745916515426498, - "grad_norm": 1.390625, - "learning_rate": 3.6400903912719696e-05, - "loss": 0.0791, - "num_input_tokens_seen": 67453162, - "step": 1088 - }, - { - "epoch": 0.19891107078039927, - "grad_norm": 0.859375, - "learning_rate": 3.6348295104345764e-05, - "loss": 0.0593, - "num_input_tokens_seen": 67939256, - "step": 1096 - }, - { - "epoch": 0.20036297640653358, - "grad_norm": 1.0546875, - "learning_rate": 3.629534318087918e-05, - "loss": 0.1024, - "num_input_tokens_seen": 68457767, - "step": 1104 - }, - { - "epoch": 0.20181488203266787, - "grad_norm": 1.0234375, - "learning_rate": 3.624204925366543e-05, - "loss": 0.0621, - "num_input_tokens_seen": 68964063, - "step": 1112 - }, - { - "epoch": 0.20326678765880218, - "grad_norm": 0.72265625, - "learning_rate": 3.618841444122794e-05, - "loss": 0.0685, - "num_input_tokens_seen": 69443542, - "step": 1120 - }, - { - "epoch": 0.20471869328493647, - "grad_norm": 0.83203125, - "learning_rate": 3.613443986924455e-05, - "loss": 0.0866, - "num_input_tokens_seen": 69941074, - "step": 1128 - }, - { - "epoch": 0.20617059891107078, - "grad_norm": 0.890625, - "learning_rate": 3.60801266705239e-05, - "loss": 0.0873, - "num_input_tokens_seen": 70410725, - "step": 1136 - }, - { - "epoch": 0.20762250453720507, - "grad_norm": 0.68359375, - "learning_rate": 3.6025475984981716e-05, - "loss": 0.0767, - "num_input_tokens_seen": 70885703, - "step": 1144 - }, - { - "epoch": 0.20907441016333939, - "grad_norm": 0.8515625, - "learning_rate": 3.59704889596168e-05, - "loss": 0.08, - "num_input_tokens_seen": 71379385, - "step": 1152 - }, - { - "epoch": 0.21052631578947367, - "grad_norm": 3.34375, - "learning_rate": 3.5915166748486984e-05, - "loss": 0.0974, - "num_input_tokens_seen": 71863351, - "step": 1160 - }, - { - "epoch": 0.211978221415608, - "grad_norm": 0.82421875, - "learning_rate": 3.585951051268496e-05, - "loss": 0.0799, - "num_input_tokens_seen": 72351447, - "step": 1168 - }, - { - "epoch": 0.21343012704174227, - "grad_norm": 0.76953125, - "learning_rate": 3.5803521420313836e-05, - "loss": 0.0598, - "num_input_tokens_seen": 72853284, - "step": 1176 - }, - { - "epoch": 0.2148820326678766, - "grad_norm": 0.92578125, - "learning_rate": 3.574720064646267e-05, - "loss": 0.1021, - "num_input_tokens_seen": 73354953, - "step": 1184 - }, - { - "epoch": 0.2163339382940109, - "grad_norm": 0.87890625, - "learning_rate": 3.5690549373181785e-05, - "loss": 0.0749, - "num_input_tokens_seen": 73851645, - "step": 1192 - }, - { - "epoch": 0.2177858439201452, - "grad_norm": 1.1015625, - "learning_rate": 3.563356878945797e-05, - "loss": 0.0677, - "num_input_tokens_seen": 74351802, - "step": 1200 - }, - { - "epoch": 0.2192377495462795, - "grad_norm": 0.84765625, - "learning_rate": 3.557626009118951e-05, - "loss": 0.0632, - "num_input_tokens_seen": 74849173, - "step": 1208 - }, - { - "epoch": 0.2206896551724138, - "grad_norm": 0.67578125, - "learning_rate": 3.551862448116113e-05, - "loss": 0.1037, - "num_input_tokens_seen": 75333244, - "step": 1216 - }, - { - "epoch": 0.2221415607985481, - "grad_norm": 1.0234375, - "learning_rate": 3.546066316901869e-05, - "loss": 0.0675, - "num_input_tokens_seen": 75799822, - "step": 1224 - }, - { - "epoch": 0.2235934664246824, - "grad_norm": 0.89453125, - "learning_rate": 3.540237737124384e-05, - "loss": 0.0684, - "num_input_tokens_seen": 76300896, - "step": 1232 - }, - { - "epoch": 0.2250453720508167, - "grad_norm": 1.1328125, - "learning_rate": 3.534376831112848e-05, - "loss": 0.0757, - "num_input_tokens_seen": 76787655, - "step": 1240 - }, - { - "epoch": 0.226497277676951, - "grad_norm": 0.8203125, - "learning_rate": 3.528483721874907e-05, - "loss": 0.0651, - "num_input_tokens_seen": 77298718, - "step": 1248 - }, - { - "epoch": 0.2279491833030853, - "grad_norm": 1.4921875, - "learning_rate": 3.522558533094084e-05, - "loss": 0.0863, - "num_input_tokens_seen": 77797727, - "step": 1256 - }, - { - "epoch": 0.2294010889292196, - "grad_norm": 0.5859375, - "learning_rate": 3.51660138912718e-05, - "loss": 0.0885, - "num_input_tokens_seen": 78292669, - "step": 1264 - }, - { - "epoch": 0.2308529945553539, - "grad_norm": 0.58203125, - "learning_rate": 3.510612415001668e-05, - "loss": 0.0892, - "num_input_tokens_seen": 78800617, - "step": 1272 - }, - { - "epoch": 0.2323049001814882, - "grad_norm": 0.88671875, - "learning_rate": 3.5045917364130644e-05, - "loss": 0.0527, - "num_input_tokens_seen": 79317483, - "step": 1280 - }, - { - "epoch": 0.2337568058076225, - "grad_norm": 0.61328125, - "learning_rate": 3.4985394797222954e-05, - "loss": 0.0587, - "num_input_tokens_seen": 79807917, - "step": 1288 - }, - { - "epoch": 0.2352087114337568, - "grad_norm": 1.3515625, - "learning_rate": 3.49245577195304e-05, - "loss": 0.0546, - "num_input_tokens_seen": 80289419, - "step": 1296 - }, - { - "epoch": 0.2366606170598911, - "grad_norm": 2.1875, - "learning_rate": 3.4863407407890696e-05, - "loss": 0.0982, - "num_input_tokens_seen": 80784249, - "step": 1304 - }, - { - "epoch": 0.2381125226860254, - "grad_norm": 2.59375, - "learning_rate": 3.480194514571564e-05, - "loss": 0.0965, - "num_input_tokens_seen": 81278666, - "step": 1312 - }, - { - "epoch": 0.2395644283121597, - "grad_norm": 1.2109375, - "learning_rate": 3.474017222296419e-05, - "loss": 0.0984, - "num_input_tokens_seen": 81786558, - "step": 1320 - }, - { - "epoch": 0.241016333938294, - "grad_norm": 0.6328125, - "learning_rate": 3.4678089936115395e-05, - "loss": 0.1122, - "num_input_tokens_seen": 82281843, - "step": 1328 - }, - { - "epoch": 0.2424682395644283, - "grad_norm": 2.765625, - "learning_rate": 3.461569958814119e-05, - "loss": 0.0745, - "num_input_tokens_seen": 82776869, - "step": 1336 - }, - { - "epoch": 0.24392014519056263, - "grad_norm": 0.984375, - "learning_rate": 3.455300248847903e-05, - "loss": 0.1094, - "num_input_tokens_seen": 83275171, - "step": 1344 - }, - { - "epoch": 0.2453720508166969, - "grad_norm": 1.03125, - "learning_rate": 3.448999995300443e-05, - "loss": 0.0663, - "num_input_tokens_seen": 83755833, - "step": 1352 - }, - { - "epoch": 0.24682395644283123, - "grad_norm": 1.5078125, - "learning_rate": 3.4426693304003324e-05, - "loss": 0.0879, - "num_input_tokens_seen": 84237888, - "step": 1360 - }, - { - "epoch": 0.2482758620689655, - "grad_norm": 1.0859375, - "learning_rate": 3.4363083870144346e-05, - "loss": 0.0661, - "num_input_tokens_seen": 84739837, - "step": 1368 - }, - { - "epoch": 0.24972776769509983, - "grad_norm": 1.3046875, - "learning_rate": 3.4299172986450906e-05, - "loss": 0.0764, - "num_input_tokens_seen": 85221444, - "step": 1376 - }, - { - "epoch": 0.24972776769509983, - "eval_loss": 0.08076217025518417, - "eval_runtime": 2579.1691, - "eval_samples_per_second": 1.209, - "eval_steps_per_second": 0.151, - "num_input_tokens_seen": 85221444, - "step": 1376 - }, - { - "epoch": 0.25117967332123414, - "grad_norm": 1.0078125, - "learning_rate": 3.4234961994273206e-05, - "loss": 0.0714, - "num_input_tokens_seen": 85711647, - "step": 1384 - }, - { - "epoch": 0.25263157894736843, - "grad_norm": 0.62109375, - "learning_rate": 3.417045224126004e-05, - "loss": 0.0774, - "num_input_tokens_seen": 86223550, - "step": 1392 - }, - { - "epoch": 0.2540834845735027, - "grad_norm": 1.265625, - "learning_rate": 3.410564508133058e-05, - "loss": 0.0872, - "num_input_tokens_seen": 86721404, - "step": 1400 - }, - { - "epoch": 0.255535390199637, - "grad_norm": 1.3046875, - "learning_rate": 3.40405418746459e-05, - "loss": 0.0729, - "num_input_tokens_seen": 87180793, - "step": 1408 - }, - { - "epoch": 0.25698729582577134, - "grad_norm": 0.8984375, - "learning_rate": 3.397514398758046e-05, - "loss": 0.0732, - "num_input_tokens_seen": 87680677, - "step": 1416 - }, - { - "epoch": 0.25843920145190563, - "grad_norm": 0.5703125, - "learning_rate": 3.39094527926934e-05, - "loss": 0.0765, - "num_input_tokens_seen": 88187512, - "step": 1424 - }, - { - "epoch": 0.2598911070780399, - "grad_norm": 1.0546875, - "learning_rate": 3.384346966869976e-05, - "loss": 0.0684, - "num_input_tokens_seen": 88692751, - "step": 1432 - }, - { - "epoch": 0.2613430127041742, - "grad_norm": 2.34375, - "learning_rate": 3.377719600044156e-05, - "loss": 0.0878, - "num_input_tokens_seen": 89183444, - "step": 1440 - }, - { - "epoch": 0.26279491833030855, - "grad_norm": 0.5234375, - "learning_rate": 3.371063317885868e-05, - "loss": 0.0738, - "num_input_tokens_seen": 89681459, - "step": 1448 - }, - { - "epoch": 0.26424682395644283, - "grad_norm": 0.8046875, - "learning_rate": 3.364378260095972e-05, - "loss": 0.075, - "num_input_tokens_seen": 90168008, - "step": 1456 - }, - { - "epoch": 0.2656987295825771, - "grad_norm": 0.984375, - "learning_rate": 3.3576645669792634e-05, - "loss": 0.0606, - "num_input_tokens_seen": 90654438, - "step": 1464 - }, - { - "epoch": 0.2671506352087114, - "grad_norm": 1.1796875, - "learning_rate": 3.350922379441534e-05, - "loss": 0.0853, - "num_input_tokens_seen": 91167951, - "step": 1472 - }, - { - "epoch": 0.26860254083484575, - "grad_norm": 0.8828125, - "learning_rate": 3.3441518389866075e-05, - "loss": 0.0518, - "num_input_tokens_seen": 91650643, - "step": 1480 - }, - { - "epoch": 0.27005444646098004, - "grad_norm": 0.80859375, - "learning_rate": 3.3373530877133764e-05, - "loss": 0.0749, - "num_input_tokens_seen": 92155336, - "step": 1488 - }, - { - "epoch": 0.2715063520871143, - "grad_norm": 0.75390625, - "learning_rate": 3.330526268312817e-05, - "loss": 0.0583, - "num_input_tokens_seen": 92628298, - "step": 1496 - }, - { - "epoch": 0.27295825771324866, - "grad_norm": 0.8203125, - "learning_rate": 3.323671524064992e-05, - "loss": 0.0885, - "num_input_tokens_seen": 93154901, - "step": 1504 - }, - { - "epoch": 0.27441016333938295, - "grad_norm": 0.77734375, - "learning_rate": 3.316788998836048e-05, - "loss": 0.0583, - "num_input_tokens_seen": 93650095, - "step": 1512 - }, - { - "epoch": 0.27586206896551724, - "grad_norm": 4.5625, - "learning_rate": 3.309878837075193e-05, - "loss": 0.0764, - "num_input_tokens_seen": 94136210, - "step": 1520 - }, - { - "epoch": 0.2773139745916515, - "grad_norm": 0.80078125, - "learning_rate": 3.3029411838116654e-05, - "loss": 0.0638, - "num_input_tokens_seen": 94624523, - "step": 1528 - }, - { - "epoch": 0.27876588021778587, - "grad_norm": 1.078125, - "learning_rate": 3.295976184651691e-05, - "loss": 0.0685, - "num_input_tokens_seen": 95110498, - "step": 1536 - }, - { - "epoch": 0.28021778584392015, - "grad_norm": 0.76171875, - "learning_rate": 3.288983985775426e-05, - "loss": 0.0853, - "num_input_tokens_seen": 95620511, - "step": 1544 - }, - { - "epoch": 0.28166969147005444, - "grad_norm": 0.73046875, - "learning_rate": 3.281964733933889e-05, - "loss": 0.0779, - "num_input_tokens_seen": 96130692, - "step": 1552 - }, - { - "epoch": 0.2831215970961887, - "grad_norm": 0.80078125, - "learning_rate": 3.274918576445882e-05, - "loss": 0.0713, - "num_input_tokens_seen": 96638367, - "step": 1560 - }, - { - "epoch": 0.28457350272232307, - "grad_norm": 0.80859375, - "learning_rate": 3.267845661194898e-05, - "loss": 0.0653, - "num_input_tokens_seen": 97154890, - "step": 1568 - }, - { - "epoch": 0.28602540834845736, - "grad_norm": 0.87890625, - "learning_rate": 3.260746136626016e-05, - "loss": 0.0522, - "num_input_tokens_seen": 97650182, - "step": 1576 - }, - { - "epoch": 0.28747731397459164, - "grad_norm": 0.734375, - "learning_rate": 3.253620151742788e-05, - "loss": 0.0868, - "num_input_tokens_seen": 98121695, - "step": 1584 - }, - { - "epoch": 0.28892921960072593, - "grad_norm": 0.484375, - "learning_rate": 3.24646785610411e-05, - "loss": 0.0844, - "num_input_tokens_seen": 98595616, - "step": 1592 - }, - { - "epoch": 0.29038112522686027, - "grad_norm": 0.984375, - "learning_rate": 3.239289399821083e-05, - "loss": 0.0668, - "num_input_tokens_seen": 99105755, - "step": 1600 - }, - { - "epoch": 0.29183303085299456, - "grad_norm": 0.9765625, - "learning_rate": 3.2320849335538636e-05, - "loss": 0.0699, - "num_input_tokens_seen": 99595258, - "step": 1608 - }, - { - "epoch": 0.29328493647912884, - "grad_norm": 1.6328125, - "learning_rate": 3.2248546085084995e-05, - "loss": 0.0903, - "num_input_tokens_seen": 100106643, - "step": 1616 - }, - { - "epoch": 0.29473684210526313, - "grad_norm": 1.40625, - "learning_rate": 3.21759857643376e-05, - "loss": 0.0826, - "num_input_tokens_seen": 100593045, - "step": 1624 - }, - { - "epoch": 0.2961887477313975, - "grad_norm": 0.81640625, - "learning_rate": 3.2103169896179476e-05, - "loss": 0.084, - "num_input_tokens_seen": 101094273, - "step": 1632 - }, - { - "epoch": 0.29764065335753176, - "grad_norm": 1.046875, - "learning_rate": 3.203010000885704e-05, - "loss": 0.0742, - "num_input_tokens_seen": 101593296, - "step": 1640 - }, - { - "epoch": 0.29909255898366605, - "grad_norm": 0.75390625, - "learning_rate": 3.1956777635948016e-05, - "loss": 0.064, - "num_input_tokens_seen": 102074203, - "step": 1648 - }, - { - "epoch": 0.3005444646098004, - "grad_norm": 0.5703125, - "learning_rate": 3.188320431632924e-05, - "loss": 0.0569, - "num_input_tokens_seen": 102576481, - "step": 1656 - }, - { - "epoch": 0.3019963702359347, - "grad_norm": 0.61328125, - "learning_rate": 3.180938159414439e-05, - "loss": 0.0932, - "num_input_tokens_seen": 103070807, - "step": 1664 - }, - { - "epoch": 0.30344827586206896, - "grad_norm": 1.03125, - "learning_rate": 3.173531101877155e-05, - "loss": 0.0621, - "num_input_tokens_seen": 103568290, - "step": 1672 - }, - { - "epoch": 0.30490018148820325, - "grad_norm": 0.7734375, - "learning_rate": 3.166099414479069e-05, - "loss": 0.0579, - "num_input_tokens_seen": 104059494, - "step": 1680 - }, - { - "epoch": 0.3063520871143376, - "grad_norm": 1.1640625, - "learning_rate": 3.158643253195108e-05, - "loss": 0.0695, - "num_input_tokens_seen": 104556886, - "step": 1688 - }, - { - "epoch": 0.3078039927404719, - "grad_norm": 0.90625, - "learning_rate": 3.15116277451385e-05, - "loss": 0.0723, - "num_input_tokens_seen": 105058562, - "step": 1696 - }, - { - "epoch": 0.30925589836660616, - "grad_norm": 0.8203125, - "learning_rate": 3.143658135434244e-05, - "loss": 0.0652, - "num_input_tokens_seen": 105536081, - "step": 1704 - }, - { - "epoch": 0.31070780399274045, - "grad_norm": 0.80859375, - "learning_rate": 3.136129493462312e-05, - "loss": 0.0748, - "num_input_tokens_seen": 106037792, - "step": 1712 - }, - { - "epoch": 0.3121597096188748, - "grad_norm": 0.8203125, - "learning_rate": 3.1285770066078445e-05, - "loss": 0.072, - "num_input_tokens_seen": 106546503, - "step": 1720 - }, - { - "epoch": 0.3121597096188748, - "eval_loss": 0.06825637072324753, - "eval_runtime": 2711.2246, - "eval_samples_per_second": 1.15, - "eval_steps_per_second": 0.144, - "num_input_tokens_seen": 106546503, - "step": 1720 - }, - { - "epoch": 0.3136116152450091, - "grad_norm": 1.3984375, - "learning_rate": 3.121000833381084e-05, - "loss": 0.0737, - "num_input_tokens_seen": 107037952, - "step": 1728 - }, - { - "epoch": 0.31506352087114337, - "grad_norm": 0.828125, - "learning_rate": 3.113401132789399e-05, - "loss": 0.0712, - "num_input_tokens_seen": 107540349, - "step": 1736 - }, - { - "epoch": 0.31651542649727765, - "grad_norm": 0.8515625, - "learning_rate": 3.1057780643339465e-05, - "loss": 0.0685, - "num_input_tokens_seen": 108034983, - "step": 1744 - }, - { - "epoch": 0.317967332123412, - "grad_norm": 0.80859375, - "learning_rate": 3.098131788006322e-05, - "loss": 0.0718, - "num_input_tokens_seen": 108503192, - "step": 1752 - }, - { - "epoch": 0.3194192377495463, - "grad_norm": 0.4921875, - "learning_rate": 3.0904624642852065e-05, - "loss": 0.076, - "num_input_tokens_seen": 109019554, - "step": 1760 - }, - { - "epoch": 0.32087114337568057, - "grad_norm": 1.265625, - "learning_rate": 3.082770254132993e-05, - "loss": 0.0549, - "num_input_tokens_seen": 109504850, - "step": 1768 - }, - { - "epoch": 0.3223230490018149, - "grad_norm": 0.66796875, - "learning_rate": 3.075055318992412e-05, - "loss": 0.068, - "num_input_tokens_seen": 110008850, - "step": 1776 - }, - { - "epoch": 0.3237749546279492, - "grad_norm": 0.78125, - "learning_rate": 3.067317820783143e-05, - "loss": 0.0676, - "num_input_tokens_seen": 110528376, - "step": 1784 - }, - { - "epoch": 0.3252268602540835, - "grad_norm": 0.62890625, - "learning_rate": 3.0595579218984124e-05, - "loss": 0.0862, - "num_input_tokens_seen": 111026349, - "step": 1792 - }, - { - "epoch": 0.32667876588021777, - "grad_norm": 0.71484375, - "learning_rate": 3.05177578520159e-05, - "loss": 0.0561, - "num_input_tokens_seen": 111515922, - "step": 1800 - }, - { - "epoch": 0.3281306715063521, - "grad_norm": 0.76171875, - "learning_rate": 3.04397157402277e-05, - "loss": 0.0599, - "num_input_tokens_seen": 112007455, - "step": 1808 - }, - { - "epoch": 0.3295825771324864, - "grad_norm": 0.60546875, - "learning_rate": 3.0361454521553383e-05, - "loss": 0.0856, - "num_input_tokens_seen": 112491694, - "step": 1816 - }, - { - "epoch": 0.3310344827586207, - "grad_norm": 0.69140625, - "learning_rate": 3.028297583852541e-05, - "loss": 0.055, - "num_input_tokens_seen": 112968009, - "step": 1824 - }, - { - "epoch": 0.33248638838475497, - "grad_norm": 1.2265625, - "learning_rate": 3.020428133824035e-05, - "loss": 0.0495, - "num_input_tokens_seen": 113462356, - "step": 1832 - }, - { - "epoch": 0.3339382940108893, - "grad_norm": 0.9140625, - "learning_rate": 3.0125372672324285e-05, - "loss": 0.0765, - "num_input_tokens_seen": 113976443, - "step": 1840 - }, - { - "epoch": 0.3353901996370236, - "grad_norm": 0.60546875, - "learning_rate": 3.0046251496898177e-05, - "loss": 0.0521, - "num_input_tokens_seen": 114445408, - "step": 1848 - }, - { - "epoch": 0.3368421052631579, - "grad_norm": 1.0, - "learning_rate": 2.9966919472543098e-05, - "loss": 0.0659, - "num_input_tokens_seen": 114933077, - "step": 1856 - }, - { - "epoch": 0.3382940108892922, - "grad_norm": 0.8203125, - "learning_rate": 2.9887378264265387e-05, - "loss": 0.0853, - "num_input_tokens_seen": 115416098, - "step": 1864 - }, - { - "epoch": 0.3397459165154265, - "grad_norm": 0.640625, - "learning_rate": 2.9807629541461693e-05, - "loss": 0.0611, - "num_input_tokens_seen": 115937997, - "step": 1872 - }, - { - "epoch": 0.3411978221415608, - "grad_norm": 0.76953125, - "learning_rate": 2.972767497788393e-05, - "loss": 0.048, - "num_input_tokens_seen": 116441850, - "step": 1880 - }, - { - "epoch": 0.3426497277676951, - "grad_norm": 1.046875, - "learning_rate": 2.9647516251604192e-05, - "loss": 0.0777, - "num_input_tokens_seen": 116937086, - "step": 1888 - }, - { - "epoch": 0.3441016333938294, - "grad_norm": 0.81640625, - "learning_rate": 2.9567155044979466e-05, - "loss": 0.0598, - "num_input_tokens_seen": 117443956, - "step": 1896 - }, - { - "epoch": 0.3455535390199637, - "grad_norm": 1.40625, - "learning_rate": 2.9486593044616394e-05, - "loss": 0.0686, - "num_input_tokens_seen": 117937379, - "step": 1904 - }, - { - "epoch": 0.347005444646098, - "grad_norm": 0.72265625, - "learning_rate": 2.9405831941335816e-05, - "loss": 0.053, - "num_input_tokens_seen": 118423431, - "step": 1912 - }, - { - "epoch": 0.3484573502722323, - "grad_norm": 0.5625, - "learning_rate": 2.932487343013732e-05, - "loss": 0.0485, - "num_input_tokens_seen": 118938547, - "step": 1920 - }, - { - "epoch": 0.34990925589836663, - "grad_norm": 0.7265625, - "learning_rate": 2.9243719210163654e-05, - "loss": 0.076, - "num_input_tokens_seen": 119414827, - "step": 1928 - }, - { - "epoch": 0.3513611615245009, - "grad_norm": 0.62890625, - "learning_rate": 2.916237098466507e-05, - "loss": 0.037, - "num_input_tokens_seen": 119906010, - "step": 1936 - }, - { - "epoch": 0.3528130671506352, - "grad_norm": 0.66015625, - "learning_rate": 2.9080830460963563e-05, - "loss": 0.0561, - "num_input_tokens_seen": 120390508, - "step": 1944 - }, - { - "epoch": 0.3542649727767695, - "grad_norm": 0.87890625, - "learning_rate": 2.8999099350417065e-05, - "loss": 0.0846, - "num_input_tokens_seen": 120863309, - "step": 1952 - }, - { - "epoch": 0.35571687840290384, - "grad_norm": 0.73046875, - "learning_rate": 2.8917179368383493e-05, - "loss": 0.0403, - "num_input_tokens_seen": 121339176, - "step": 1960 - }, - { - "epoch": 0.3571687840290381, - "grad_norm": 0.453125, - "learning_rate": 2.883507223418478e-05, - "loss": 0.0645, - "num_input_tokens_seen": 121867501, - "step": 1968 - }, - { - "epoch": 0.3586206896551724, - "grad_norm": 1.21875, - "learning_rate": 2.875277967107076e-05, - "loss": 0.0911, - "num_input_tokens_seen": 122375421, - "step": 1976 - }, - { - "epoch": 0.3600725952813067, - "grad_norm": 0.90234375, - "learning_rate": 2.867030340618303e-05, - "loss": 0.0454, - "num_input_tokens_seen": 122856601, - "step": 1984 - }, - { - "epoch": 0.36152450090744104, - "grad_norm": 0.546875, - "learning_rate": 2.858764517051868e-05, - "loss": 0.0615, - "num_input_tokens_seen": 123347371, - "step": 1992 - }, - { - "epoch": 0.3629764065335753, - "grad_norm": 0.369140625, - "learning_rate": 2.850480669889397e-05, - "loss": 0.0536, - "num_input_tokens_seen": 123846779, - "step": 2000 - }, - { - "epoch": 0.3644283121597096, - "grad_norm": 1.875, - "learning_rate": 2.8421789729907928e-05, - "loss": 0.0499, - "num_input_tokens_seen": 124332390, - "step": 2008 - }, - { - "epoch": 0.3658802177858439, - "grad_norm": 0.53125, - "learning_rate": 2.833859600590583e-05, - "loss": 0.076, - "num_input_tokens_seen": 124806640, - "step": 2016 - }, - { - "epoch": 0.36733212341197824, - "grad_norm": 0.98828125, - "learning_rate": 2.825522727294268e-05, - "loss": 0.0347, - "num_input_tokens_seen": 125289556, - "step": 2024 - }, - { - "epoch": 0.3687840290381125, - "grad_norm": 0.765625, - "learning_rate": 2.817168528074654e-05, - "loss": 0.0854, - "num_input_tokens_seen": 125783042, - "step": 2032 - }, - { - "epoch": 0.3702359346642468, - "grad_norm": 0.7109375, - "learning_rate": 2.8087971782681774e-05, - "loss": 0.0731, - "num_input_tokens_seen": 126277662, - "step": 2040 - }, - { - "epoch": 0.3716878402903811, - "grad_norm": 0.7265625, - "learning_rate": 2.8004088535712315e-05, - "loss": 0.0833, - "num_input_tokens_seen": 126770182, - "step": 2048 - }, - { - "epoch": 0.37313974591651544, - "grad_norm": 0.84375, - "learning_rate": 2.7920037300364746e-05, - "loss": 0.0752, - "num_input_tokens_seen": 127265873, - "step": 2056 - }, - { - "epoch": 0.37459165154264973, - "grad_norm": 1.046875, - "learning_rate": 2.783581984069134e-05, - "loss": 0.0652, - "num_input_tokens_seen": 127767598, - "step": 2064 - }, - { - "epoch": 0.37459165154264973, - "eval_loss": 0.06295192986726761, - "eval_runtime": 2754.9055, - "eval_samples_per_second": 1.131, - "eval_steps_per_second": 0.142, - "num_input_tokens_seen": 127767598, - "step": 2064 - }, - { - "epoch": 0.376043557168784, - "grad_norm": 1.9609375, - "learning_rate": 2.7751437924233093e-05, - "loss": 0.06, - "num_input_tokens_seen": 128256289, - "step": 2072 - }, - { - "epoch": 0.37749546279491836, - "grad_norm": 1.421875, - "learning_rate": 2.7666893321982548e-05, - "loss": 0.0714, - "num_input_tokens_seen": 128789423, - "step": 2080 - }, - { - "epoch": 0.37894736842105264, - "grad_norm": 0.7265625, - "learning_rate": 2.758218780834671e-05, - "loss": 0.0608, - "num_input_tokens_seen": 129283910, - "step": 2088 - }, - { - "epoch": 0.38039927404718693, - "grad_norm": 0.87109375, - "learning_rate": 2.7497323161109734e-05, - "loss": 0.0567, - "num_input_tokens_seen": 129762227, - "step": 2096 - }, - { - "epoch": 0.3818511796733212, - "grad_norm": 0.71484375, - "learning_rate": 2.741230116139565e-05, - "loss": 0.0822, - "num_input_tokens_seen": 130260949, - "step": 2104 - }, - { - "epoch": 0.38330308529945556, - "grad_norm": 1.328125, - "learning_rate": 2.7327123593630984e-05, - "loss": 0.0744, - "num_input_tokens_seen": 130738461, - "step": 2112 - }, - { - "epoch": 0.38475499092558985, - "grad_norm": 0.70703125, - "learning_rate": 2.7241792245507284e-05, - "loss": 0.0428, - "num_input_tokens_seen": 131250070, - "step": 2120 - }, - { - "epoch": 0.38620689655172413, - "grad_norm": 1.0234375, - "learning_rate": 2.715630890794362e-05, - "loss": 0.0764, - "num_input_tokens_seen": 131731607, - "step": 2128 - }, - { - "epoch": 0.3876588021778584, - "grad_norm": 0.92578125, - "learning_rate": 2.7070675375048984e-05, - "loss": 0.0464, - "num_input_tokens_seen": 132241144, - "step": 2136 - }, - { - "epoch": 0.38911070780399276, - "grad_norm": 0.83984375, - "learning_rate": 2.698489344408464e-05, - "loss": 0.0598, - "num_input_tokens_seen": 132728134, - "step": 2144 - }, - { - "epoch": 0.39056261343012705, - "grad_norm": 1.1953125, - "learning_rate": 2.689896491542642e-05, - "loss": 0.0897, - "num_input_tokens_seen": 133209860, - "step": 2152 - }, - { - "epoch": 0.39201451905626133, - "grad_norm": 1.1015625, - "learning_rate": 2.681289159252689e-05, - "loss": 0.0525, - "num_input_tokens_seen": 133711627, - "step": 2160 - }, - { - "epoch": 0.3934664246823956, - "grad_norm": 0.65625, - "learning_rate": 2.6726675281877567e-05, - "loss": 0.0602, - "num_input_tokens_seen": 134198176, - "step": 2168 - }, - { - "epoch": 0.39491833030852996, - "grad_norm": 0.69921875, - "learning_rate": 2.6640317792970947e-05, - "loss": 0.0562, - "num_input_tokens_seen": 134689114, - "step": 2176 - }, - { - "epoch": 0.39637023593466425, - "grad_norm": 0.72265625, - "learning_rate": 2.6553820938262557e-05, - "loss": 0.0341, - "num_input_tokens_seen": 135179499, - "step": 2184 - }, - { - "epoch": 0.39782214156079854, - "grad_norm": 1.0234375, - "learning_rate": 2.6467186533132906e-05, - "loss": 0.0783, - "num_input_tokens_seen": 135700208, - "step": 2192 - }, - { - "epoch": 0.3992740471869328, - "grad_norm": 0.58984375, - "learning_rate": 2.638041639584939e-05, - "loss": 0.0604, - "num_input_tokens_seen": 136212202, - "step": 2200 - }, - { - "epoch": 0.40072595281306717, - "grad_norm": 0.55859375, - "learning_rate": 2.6293512347528122e-05, - "loss": 0.0591, - "num_input_tokens_seen": 136698380, - "step": 2208 - }, - { - "epoch": 0.40217785843920145, - "grad_norm": 0.66796875, - "learning_rate": 2.6206476212095734e-05, - "loss": 0.0743, - "num_input_tokens_seen": 137191271, - "step": 2216 - }, - { - "epoch": 0.40362976406533574, - "grad_norm": 0.5859375, - "learning_rate": 2.6119309816251042e-05, - "loss": 0.0437, - "num_input_tokens_seen": 137660173, - "step": 2224 - }, - { - "epoch": 0.4050816696914701, - "grad_norm": 0.8671875, - "learning_rate": 2.6032014989426784e-05, - "loss": 0.0597, - "num_input_tokens_seen": 138165909, - "step": 2232 - }, - { - "epoch": 0.40653357531760437, - "grad_norm": 0.7734375, - "learning_rate": 2.594459356375116e-05, - "loss": 0.0504, - "num_input_tokens_seen": 138631528, - "step": 2240 - }, - { - "epoch": 0.40798548094373865, - "grad_norm": 0.71484375, - "learning_rate": 2.585704737400941e-05, - "loss": 0.0611, - "num_input_tokens_seen": 139130348, - "step": 2248 - }, - { - "epoch": 0.40943738656987294, - "grad_norm": 0.6640625, - "learning_rate": 2.57693782576053e-05, - "loss": 0.0461, - "num_input_tokens_seen": 139617268, - "step": 2256 - }, - { - "epoch": 0.4108892921960073, - "grad_norm": 0.67578125, - "learning_rate": 2.568158805452256e-05, - "loss": 0.062, - "num_input_tokens_seen": 140121646, - "step": 2264 - }, - { - "epoch": 0.41234119782214157, - "grad_norm": 0.73828125, - "learning_rate": 2.559367860728627e-05, - "loss": 0.0506, - "num_input_tokens_seen": 140625443, - "step": 2272 - }, - { - "epoch": 0.41379310344827586, - "grad_norm": 0.703125, - "learning_rate": 2.5505651760924182e-05, - "loss": 0.0757, - "num_input_tokens_seen": 141135512, - "step": 2280 - }, - { - "epoch": 0.41524500907441014, - "grad_norm": 0.56640625, - "learning_rate": 2.5417509362927986e-05, - "loss": 0.078, - "num_input_tokens_seen": 141614186, - "step": 2288 - }, - { - "epoch": 0.4166969147005445, - "grad_norm": 0.98828125, - "learning_rate": 2.5329253263214573e-05, - "loss": 0.0549, - "num_input_tokens_seen": 142126285, - "step": 2296 - }, - { - "epoch": 0.41814882032667877, - "grad_norm": 0.49609375, - "learning_rate": 2.5240885314087162e-05, - "loss": 0.0592, - "num_input_tokens_seen": 142609607, - "step": 2304 - }, - { - "epoch": 0.41960072595281306, - "grad_norm": 0.890625, - "learning_rate": 2.5152407370196467e-05, - "loss": 0.0477, - "num_input_tokens_seen": 143090080, - "step": 2312 - }, - { - "epoch": 0.42105263157894735, - "grad_norm": 0.77734375, - "learning_rate": 2.5063821288501746e-05, - "loss": 0.0576, - "num_input_tokens_seen": 143576776, - "step": 2320 - }, - { - "epoch": 0.4225045372050817, - "grad_norm": 0.5859375, - "learning_rate": 2.4975128928231823e-05, - "loss": 0.0671, - "num_input_tokens_seen": 144070311, - "step": 2328 - }, - { - "epoch": 0.423956442831216, - "grad_norm": 0.97265625, - "learning_rate": 2.4886332150846092e-05, - "loss": 0.0637, - "num_input_tokens_seen": 144581612, - "step": 2336 - }, - { - "epoch": 0.42540834845735026, - "grad_norm": 0.55078125, - "learning_rate": 2.4797432819995427e-05, - "loss": 0.0496, - "num_input_tokens_seen": 145085129, - "step": 2344 - }, - { - "epoch": 0.42686025408348455, - "grad_norm": 0.8046875, - "learning_rate": 2.4708432801483086e-05, - "loss": 0.0662, - "num_input_tokens_seen": 145568633, - "step": 2352 - }, - { - "epoch": 0.4283121597096189, - "grad_norm": 0.84375, - "learning_rate": 2.4619333963225525e-05, - "loss": 0.059, - "num_input_tokens_seen": 146076350, - "step": 2360 - }, - { - "epoch": 0.4297640653357532, - "grad_norm": 1.1015625, - "learning_rate": 2.4530138175213222e-05, - "loss": 0.1076, - "num_input_tokens_seen": 146577893, - "step": 2368 - }, - { - "epoch": 0.43121597096188746, - "grad_norm": 0.89453125, - "learning_rate": 2.4440847309471422e-05, - "loss": 0.0794, - "num_input_tokens_seen": 147074725, - "step": 2376 - }, - { - "epoch": 0.4326678765880218, - "grad_norm": 0.9375, - "learning_rate": 2.435146324002083e-05, - "loss": 0.0537, - "num_input_tokens_seen": 147559139, - "step": 2384 - }, - { - "epoch": 0.4341197822141561, - "grad_norm": 0.59765625, - "learning_rate": 2.426198784283831e-05, - "loss": 0.0429, - "num_input_tokens_seen": 148055859, - "step": 2392 - }, - { - "epoch": 0.4355716878402904, - "grad_norm": 0.369140625, - "learning_rate": 2.4172422995817496e-05, - "loss": 0.0583, - "num_input_tokens_seen": 148559803, - "step": 2400 - }, - { - "epoch": 0.43702359346642466, - "grad_norm": 1.515625, - "learning_rate": 2.408277057872936e-05, - "loss": 0.0693, - "num_input_tokens_seen": 149047633, - "step": 2408 - }, - { - "epoch": 0.43702359346642466, - "eval_loss": 0.05809076130390167, - "eval_runtime": 2813.328, - "eval_samples_per_second": 1.108, - "eval_steps_per_second": 0.139, - "num_input_tokens_seen": 149047633, - "step": 2408 - }, - { - "epoch": 0.438475499092559, - "grad_norm": 0.7265625, - "learning_rate": 2.3993032473182796e-05, - "loss": 0.0627, - "num_input_tokens_seen": 149553600, - "step": 2416 - }, - { - "epoch": 0.4399274047186933, - "grad_norm": 0.70703125, - "learning_rate": 2.390321056258511e-05, - "loss": 0.0518, - "num_input_tokens_seen": 150031007, - "step": 2424 - }, - { - "epoch": 0.4413793103448276, - "grad_norm": 0.6640625, - "learning_rate": 2.3813306732102483e-05, - "loss": 0.0564, - "num_input_tokens_seen": 150506503, - "step": 2432 - }, - { - "epoch": 0.44283121597096187, - "grad_norm": 0.75390625, - "learning_rate": 2.3723322868620436e-05, - "loss": 0.0728, - "num_input_tokens_seen": 151018070, - "step": 2440 - }, - { - "epoch": 0.4442831215970962, - "grad_norm": 0.453125, - "learning_rate": 2.3633260860704188e-05, - "loss": 0.0428, - "num_input_tokens_seen": 151507916, - "step": 2448 - }, - { - "epoch": 0.4457350272232305, - "grad_norm": 0.93359375, - "learning_rate": 2.3543122598559053e-05, - "loss": 0.0458, - "num_input_tokens_seen": 151999967, - "step": 2456 - }, - { - "epoch": 0.4471869328493648, - "grad_norm": 1.609375, - "learning_rate": 2.345290997399074e-05, - "loss": 0.051, - "num_input_tokens_seen": 152499025, - "step": 2464 - }, - { - "epoch": 0.44863883847549907, - "grad_norm": 1.3984375, - "learning_rate": 2.3362624880365677e-05, - "loss": 0.0713, - "num_input_tokens_seen": 152984867, - "step": 2472 - }, - { - "epoch": 0.4500907441016334, - "grad_norm": 0.91796875, - "learning_rate": 2.3272269212571262e-05, - "loss": 0.0627, - "num_input_tokens_seen": 153473082, - "step": 2480 - }, - { - "epoch": 0.4515426497277677, - "grad_norm": 0.55859375, - "learning_rate": 2.3181844866976076e-05, - "loss": 0.048, - "num_input_tokens_seen": 153951602, - "step": 2488 - }, - { - "epoch": 0.452994555353902, - "grad_norm": 0.46875, - "learning_rate": 2.3091353741390116e-05, - "loss": 0.0476, - "num_input_tokens_seen": 154432971, - "step": 2496 - }, - { - "epoch": 0.45444646098003627, - "grad_norm": 0.97265625, - "learning_rate": 2.3000797735024922e-05, - "loss": 0.049, - "num_input_tokens_seen": 154912331, - "step": 2504 - }, - { - "epoch": 0.4558983666061706, - "grad_norm": 0.94140625, - "learning_rate": 2.2910178748453765e-05, - "loss": 0.0544, - "num_input_tokens_seen": 155385055, - "step": 2512 - }, - { - "epoch": 0.4573502722323049, - "grad_norm": 0.76953125, - "learning_rate": 2.2819498683571718e-05, - "loss": 0.0494, - "num_input_tokens_seen": 155892191, - "step": 2520 - }, - { - "epoch": 0.4588021778584392, - "grad_norm": 0.625, - "learning_rate": 2.272875944355575e-05, - "loss": 0.066, - "num_input_tokens_seen": 156405102, - "step": 2528 - }, - { - "epoch": 0.46025408348457353, - "grad_norm": 0.7734375, - "learning_rate": 2.2637962932824803e-05, - "loss": 0.0605, - "num_input_tokens_seen": 156909466, - "step": 2536 - }, - { - "epoch": 0.4617059891107078, - "grad_norm": 0.6640625, - "learning_rate": 2.2547111056999808e-05, - "loss": 0.0394, - "num_input_tokens_seen": 157391122, - "step": 2544 - }, - { - "epoch": 0.4631578947368421, - "grad_norm": 0.361328125, - "learning_rate": 2.245620572286366e-05, - "loss": 0.0525, - "num_input_tokens_seen": 157880121, - "step": 2552 - }, - { - "epoch": 0.4646098003629764, - "grad_norm": 0.494140625, - "learning_rate": 2.2365248838321273e-05, - "loss": 0.0491, - "num_input_tokens_seen": 158360167, - "step": 2560 - }, - { - "epoch": 0.46606170598911073, - "grad_norm": 0.52734375, - "learning_rate": 2.2274242312359445e-05, - "loss": 0.0528, - "num_input_tokens_seen": 158867422, - "step": 2568 - }, - { - "epoch": 0.467513611615245, - "grad_norm": 0.671875, - "learning_rate": 2.2183188055006867e-05, - "loss": 0.0679, - "num_input_tokens_seen": 159364296, - "step": 2576 - }, - { - "epoch": 0.4689655172413793, - "grad_norm": 0.59375, - "learning_rate": 2.2092087977294e-05, - "loss": 0.0744, - "num_input_tokens_seen": 159890619, - "step": 2584 - }, - { - "epoch": 0.4704174228675136, - "grad_norm": 0.68359375, - "learning_rate": 2.2000943991212977e-05, - "loss": 0.0419, - "num_input_tokens_seen": 160398651, - "step": 2592 - }, - { - "epoch": 0.47186932849364793, - "grad_norm": 0.73828125, - "learning_rate": 2.190975800967747e-05, - "loss": 0.0616, - "num_input_tokens_seen": 160922909, - "step": 2600 - }, - { - "epoch": 0.4733212341197822, - "grad_norm": 0.5390625, - "learning_rate": 2.1818531946482543e-05, - "loss": 0.0442, - "num_input_tokens_seen": 161419902, - "step": 2608 - }, - { - "epoch": 0.4747731397459165, - "grad_norm": 0.625, - "learning_rate": 2.172726771626449e-05, - "loss": 0.0469, - "num_input_tokens_seen": 161929180, - "step": 2616 - }, - { - "epoch": 0.4762250453720508, - "grad_norm": 0.63671875, - "learning_rate": 2.163596723446065e-05, - "loss": 0.0573, - "num_input_tokens_seen": 162437709, - "step": 2624 - }, - { - "epoch": 0.47767695099818513, - "grad_norm": 1.046875, - "learning_rate": 2.1544632417269194e-05, - "loss": 0.052, - "num_input_tokens_seen": 162950151, - "step": 2632 - }, - { - "epoch": 0.4791288566243194, - "grad_norm": 0.90234375, - "learning_rate": 2.145326518160893e-05, - "loss": 0.0576, - "num_input_tokens_seen": 163429462, - "step": 2640 - }, - { - "epoch": 0.4805807622504537, - "grad_norm": 0.578125, - "learning_rate": 2.136186744507904e-05, - "loss": 0.0577, - "num_input_tokens_seen": 163939160, - "step": 2648 - }, - { - "epoch": 0.482032667876588, - "grad_norm": 0.4921875, - "learning_rate": 2.1270441125918882e-05, - "loss": 0.051, - "num_input_tokens_seen": 164446079, - "step": 2656 - }, - { - "epoch": 0.48348457350272234, - "grad_norm": 0.58984375, - "learning_rate": 2.1178988142967678e-05, - "loss": 0.0489, - "num_input_tokens_seen": 164936233, - "step": 2664 - }, - { - "epoch": 0.4849364791288566, - "grad_norm": 0.91015625, - "learning_rate": 2.108751041562427e-05, - "loss": 0.0622, - "num_input_tokens_seen": 165409965, - "step": 2672 - }, - { - "epoch": 0.4863883847549909, - "grad_norm": 0.5234375, - "learning_rate": 2.0996009863806834e-05, - "loss": 0.0578, - "num_input_tokens_seen": 165901841, - "step": 2680 - }, - { - "epoch": 0.48784029038112525, - "grad_norm": 0.88671875, - "learning_rate": 2.0904488407912575e-05, - "loss": 0.0389, - "num_input_tokens_seen": 166384603, - "step": 2688 - }, - { - "epoch": 0.48929219600725954, - "grad_norm": 0.34375, - "learning_rate": 2.0812947968777437e-05, - "loss": 0.0432, - "num_input_tokens_seen": 166889709, - "step": 2696 - }, - { - "epoch": 0.4907441016333938, - "grad_norm": 0.9296875, - "learning_rate": 2.0721390467635788e-05, - "loss": 0.0453, - "num_input_tokens_seen": 167372121, - "step": 2704 - }, - { - "epoch": 0.4921960072595281, - "grad_norm": 0.4609375, - "learning_rate": 2.0629817826080073e-05, - "loss": 0.0447, - "num_input_tokens_seen": 167871991, - "step": 2712 - }, - { - "epoch": 0.49364791288566245, - "grad_norm": 0.953125, - "learning_rate": 2.053823196602051e-05, - "loss": 0.0543, - "num_input_tokens_seen": 168369985, - "step": 2720 - }, - { - "epoch": 0.49509981851179674, - "grad_norm": 0.58203125, - "learning_rate": 2.044663480964474e-05, - "loss": 0.0416, - "num_input_tokens_seen": 168846412, - "step": 2728 - }, - { - "epoch": 0.496551724137931, - "grad_norm": 0.6171875, - "learning_rate": 2.0355028279377498e-05, - "loss": 0.0467, - "num_input_tokens_seen": 169335334, - "step": 2736 - }, - { - "epoch": 0.4980036297640653, - "grad_norm": 0.67578125, - "learning_rate": 2.026341429784025e-05, - "loss": 0.0724, - "num_input_tokens_seen": 169830612, - "step": 2744 - }, - { - "epoch": 0.49945553539019966, - "grad_norm": 0.53125, - "learning_rate": 2.0171794787810842e-05, - "loss": 0.0723, - "num_input_tokens_seen": 170349739, - "step": 2752 - }, - { - "epoch": 0.49945553539019966, - "eval_loss": 0.054387591779232025, - "eval_runtime": 2838.6975, - "eval_samples_per_second": 1.098, - "eval_steps_per_second": 0.137, - "num_input_tokens_seen": 170349739, - "step": 2752 - }, - { - "epoch": 0.5009074410163339, - "grad_norm": 0.5390625, - "learning_rate": 2.008017167218317e-05, - "loss": 0.0365, - "num_input_tokens_seen": 170843316, - "step": 2760 - }, - { - "epoch": 0.5023593466424683, - "grad_norm": 0.6640625, - "learning_rate": 1.9988546873926788e-05, - "loss": 0.0456, - "num_input_tokens_seen": 171324496, - "step": 2768 - }, - { - "epoch": 0.5038112522686026, - "grad_norm": 0.71875, - "learning_rate": 1.9896922316046562e-05, - "loss": 0.0416, - "num_input_tokens_seen": 171829665, - "step": 2776 - }, - { - "epoch": 0.5052631578947369, - "grad_norm": 0.5625, - "learning_rate": 1.980529992154233e-05, - "loss": 0.0395, - "num_input_tokens_seen": 172325874, - "step": 2784 - }, - { - "epoch": 0.5067150635208711, - "grad_norm": 0.490234375, - "learning_rate": 1.9713681613368506e-05, - "loss": 0.0536, - "num_input_tokens_seen": 172832464, - "step": 2792 - }, - { - "epoch": 0.5081669691470054, - "grad_norm": 0.78125, - "learning_rate": 1.9622069314393753e-05, - "loss": 0.0505, - "num_input_tokens_seen": 173320567, - "step": 2800 - }, - { - "epoch": 0.5096188747731397, - "grad_norm": 0.75390625, - "learning_rate": 1.9530464947360615e-05, - "loss": 0.0528, - "num_input_tokens_seen": 173816293, - "step": 2808 - }, - { - "epoch": 0.511070780399274, - "grad_norm": 0.74609375, - "learning_rate": 1.943887043484515e-05, - "loss": 0.0766, - "num_input_tokens_seen": 174302982, - "step": 2816 - }, - { - "epoch": 0.5125226860254084, - "grad_norm": 0.80859375, - "learning_rate": 1.9347287699216602e-05, - "loss": 0.0574, - "num_input_tokens_seen": 174807598, - "step": 2824 - }, - { - "epoch": 0.5139745916515427, - "grad_norm": 1.1875, - "learning_rate": 1.9255718662597044e-05, - "loss": 0.0667, - "num_input_tokens_seen": 175302323, - "step": 2832 - }, - { - "epoch": 0.515426497277677, - "grad_norm": 0.59765625, - "learning_rate": 1.9164165246821026e-05, - "loss": 0.0434, - "num_input_tokens_seen": 175782712, - "step": 2840 - }, - { - "epoch": 0.5168784029038113, - "grad_norm": 0.6328125, - "learning_rate": 1.9072629373395268e-05, - "loss": 0.0573, - "num_input_tokens_seen": 176252965, - "step": 2848 - }, - { - "epoch": 0.5183303085299455, - "grad_norm": 0.7109375, - "learning_rate": 1.8981112963458293e-05, - "loss": 0.0541, - "num_input_tokens_seen": 176746353, - "step": 2856 - }, - { - "epoch": 0.5197822141560798, - "grad_norm": 1.0859375, - "learning_rate": 1.8889617937740146e-05, - "loss": 0.0457, - "num_input_tokens_seen": 177252614, - "step": 2864 - }, - { - "epoch": 0.5212341197822141, - "grad_norm": 0.73828125, - "learning_rate": 1.879814621652206e-05, - "loss": 0.0588, - "num_input_tokens_seen": 177752505, - "step": 2872 - }, - { - "epoch": 0.5226860254083484, - "grad_norm": 0.83984375, - "learning_rate": 1.8706699719596138e-05, - "loss": 0.0717, - "num_input_tokens_seen": 178248588, - "step": 2880 - }, - { - "epoch": 0.5241379310344828, - "grad_norm": 0.95703125, - "learning_rate": 1.8615280366225113e-05, - "loss": 0.0634, - "num_input_tokens_seen": 178746624, - "step": 2888 - }, - { - "epoch": 0.5255898366606171, - "grad_norm": 0.703125, - "learning_rate": 1.852389007510201e-05, - "loss": 0.0573, - "num_input_tokens_seen": 179239200, - "step": 2896 - }, - { - "epoch": 0.5270417422867514, - "grad_norm": 0.96484375, - "learning_rate": 1.8432530764309916e-05, - "loss": 0.0574, - "num_input_tokens_seen": 179731398, - "step": 2904 - }, - { - "epoch": 0.5284936479128857, - "grad_norm": 0.58203125, - "learning_rate": 1.8341204351281684e-05, - "loss": 0.0786, - "num_input_tokens_seen": 180216141, - "step": 2912 - }, - { - "epoch": 0.52994555353902, - "grad_norm": 0.4765625, - "learning_rate": 1.8249912752759748e-05, - "loss": 0.0481, - "num_input_tokens_seen": 180719896, - "step": 2920 - }, - { - "epoch": 0.5313974591651542, - "grad_norm": 0.64453125, - "learning_rate": 1.8158657884755832e-05, - "loss": 0.0595, - "num_input_tokens_seen": 181215874, - "step": 2928 - }, - { - "epoch": 0.5328493647912885, - "grad_norm": 0.6953125, - "learning_rate": 1.8067441662510782e-05, - "loss": 0.0495, - "num_input_tokens_seen": 181715660, - "step": 2936 - }, - { - "epoch": 0.5343012704174228, - "grad_norm": 0.53515625, - "learning_rate": 1.797626600045435e-05, - "loss": 0.0507, - "num_input_tokens_seen": 182189644, - "step": 2944 - }, - { - "epoch": 0.5357531760435572, - "grad_norm": 0.88671875, - "learning_rate": 1.7885132812165022e-05, - "loss": 0.0457, - "num_input_tokens_seen": 182692258, - "step": 2952 - }, - { - "epoch": 0.5372050816696915, - "grad_norm": 0.48828125, - "learning_rate": 1.7794044010329844e-05, - "loss": 0.0454, - "num_input_tokens_seen": 183173683, - "step": 2960 - }, - { - "epoch": 0.5386569872958258, - "grad_norm": 1.015625, - "learning_rate": 1.7703001506704297e-05, - "loss": 0.0612, - "num_input_tokens_seen": 183670207, - "step": 2968 - }, - { - "epoch": 0.5401088929219601, - "grad_norm": 0.6796875, - "learning_rate": 1.761200721207215e-05, - "loss": 0.0559, - "num_input_tokens_seen": 184191448, - "step": 2976 - }, - { - "epoch": 0.5415607985480944, - "grad_norm": 0.65625, - "learning_rate": 1.7521063036205383e-05, - "loss": 0.032, - "num_input_tokens_seen": 184672691, - "step": 2984 - }, - { - "epoch": 0.5430127041742286, - "grad_norm": 0.625, - "learning_rate": 1.7430170887824088e-05, - "loss": 0.0597, - "num_input_tokens_seen": 185179876, - "step": 2992 - }, - { - "epoch": 0.5444646098003629, - "grad_norm": 0.734375, - "learning_rate": 1.7339332674556408e-05, - "loss": 0.0566, - "num_input_tokens_seen": 185659670, - "step": 3000 - }, - { - "epoch": 0.5459165154264973, - "grad_norm": 0.279296875, - "learning_rate": 1.724855030289852e-05, - "loss": 0.028, - "num_input_tokens_seen": 186148613, - "step": 3008 - }, - { - "epoch": 0.5473684210526316, - "grad_norm": 0.87109375, - "learning_rate": 1.715782567817459e-05, - "loss": 0.0567, - "num_input_tokens_seen": 186651171, - "step": 3016 - }, - { - "epoch": 0.5488203266787659, - "grad_norm": 0.71484375, - "learning_rate": 1.7067160704496817e-05, - "loss": 0.0584, - "num_input_tokens_seen": 187155654, - "step": 3024 - }, - { - "epoch": 0.5502722323049002, - "grad_norm": 1.078125, - "learning_rate": 1.6976557284725434e-05, - "loss": 0.0554, - "num_input_tokens_seen": 187631290, - "step": 3032 - }, - { - "epoch": 0.5517241379310345, - "grad_norm": 0.5390625, - "learning_rate": 1.6886017320428817e-05, - "loss": 0.0654, - "num_input_tokens_seen": 188114682, - "step": 3040 - }, - { - "epoch": 0.5531760435571688, - "grad_norm": 0.7734375, - "learning_rate": 1.6795542711843535e-05, - "loss": 0.0489, - "num_input_tokens_seen": 188586657, - "step": 3048 - }, - { - "epoch": 0.554627949183303, - "grad_norm": 0.8515625, - "learning_rate": 1.670513535783448e-05, - "loss": 0.0432, - "num_input_tokens_seen": 189073577, - "step": 3056 - }, - { - "epoch": 0.5560798548094373, - "grad_norm": 0.95703125, - "learning_rate": 1.661479715585503e-05, - "loss": 0.0559, - "num_input_tokens_seen": 189536844, - "step": 3064 - }, - { - "epoch": 0.5575317604355717, - "grad_norm": 0.90234375, - "learning_rate": 1.6524530001907196e-05, - "loss": 0.0552, - "num_input_tokens_seen": 190005564, - "step": 3072 - }, - { - "epoch": 0.558983666061706, - "grad_norm": 0.7265625, - "learning_rate": 1.643433579050186e-05, - "loss": 0.0479, - "num_input_tokens_seen": 190494115, - "step": 3080 - }, - { - "epoch": 0.5604355716878403, - "grad_norm": 0.7265625, - "learning_rate": 1.6344216414618998e-05, - "loss": 0.0558, - "num_input_tokens_seen": 190997100, - "step": 3088 - }, - { - "epoch": 0.5618874773139746, - "grad_norm": 0.6875, - "learning_rate": 1.625417376566794e-05, - "loss": 0.0854, - "num_input_tokens_seen": 191513399, - "step": 3096 - }, - { - "epoch": 0.5618874773139746, - "eval_loss": 0.0525849312543869, - "eval_runtime": 2614.8433, - "eval_samples_per_second": 1.192, - "eval_steps_per_second": 0.149, - "num_input_tokens_seen": 191513399, - "step": 3096 - }, - { - "epoch": 0.5633393829401089, - "grad_norm": 0.435546875, - "learning_rate": 1.616420973344769e-05, - "loss": 0.0467, - "num_input_tokens_seen": 191995923, - "step": 3104 - }, - { - "epoch": 0.5647912885662432, - "grad_norm": 0.67578125, - "learning_rate": 1.607432620610727e-05, - "loss": 0.0564, - "num_input_tokens_seen": 192465595, - "step": 3112 - }, - { - "epoch": 0.5662431941923775, - "grad_norm": 0.88671875, - "learning_rate": 1.5984525070106065e-05, - "loss": 0.0507, - "num_input_tokens_seen": 192958871, - "step": 3120 - }, - { - "epoch": 0.5676950998185119, - "grad_norm": 0.53515625, - "learning_rate": 1.5894808210174252e-05, - "loss": 0.0574, - "num_input_tokens_seen": 193430762, - "step": 3128 - }, - { - "epoch": 0.5691470054446461, - "grad_norm": 0.50390625, - "learning_rate": 1.5805177509273226e-05, - "loss": 0.0545, - "num_input_tokens_seen": 193908960, - "step": 3136 - }, - { - "epoch": 0.5705989110707804, - "grad_norm": 0.78515625, - "learning_rate": 1.571563484855611e-05, - "loss": 0.0532, - "num_input_tokens_seen": 194435990, - "step": 3144 - }, - { - "epoch": 0.5720508166969147, - "grad_norm": 0.60546875, - "learning_rate": 1.5626182107328253e-05, - "loss": 0.0402, - "num_input_tokens_seen": 194945870, - "step": 3152 - }, - { - "epoch": 0.573502722323049, - "grad_norm": 1.1640625, - "learning_rate": 1.5536821163007768e-05, - "loss": 0.0728, - "num_input_tokens_seen": 195449492, - "step": 3160 - }, - { - "epoch": 0.5749546279491833, - "grad_norm": 0.5703125, - "learning_rate": 1.5447553891086178e-05, - "loss": 0.0457, - "num_input_tokens_seen": 195943237, - "step": 3168 - }, - { - "epoch": 0.5764065335753176, - "grad_norm": 0.79296875, - "learning_rate": 1.5358382165089008e-05, - "loss": 0.0612, - "num_input_tokens_seen": 196442834, - "step": 3176 - }, - { - "epoch": 0.5778584392014519, - "grad_norm": 0.81640625, - "learning_rate": 1.5269307856536486e-05, - "loss": 0.0533, - "num_input_tokens_seen": 196964754, - "step": 3184 - }, - { - "epoch": 0.5793103448275863, - "grad_norm": 0.625, - "learning_rate": 1.5180332834904276e-05, - "loss": 0.0331, - "num_input_tokens_seen": 197500093, - "step": 3192 - }, - { - "epoch": 0.5807622504537205, - "grad_norm": 0.73046875, - "learning_rate": 1.5091458967584199e-05, - "loss": 0.0689, - "num_input_tokens_seen": 197994930, - "step": 3200 - }, - { - "epoch": 0.5822141560798548, - "grad_norm": 4.5, - "learning_rate": 1.5002688119845086e-05, - "loss": 0.0541, - "num_input_tokens_seen": 198501247, - "step": 3208 - }, - { - "epoch": 0.5836660617059891, - "grad_norm": 0.67578125, - "learning_rate": 1.4914022154793613e-05, - "loss": 0.0435, - "num_input_tokens_seen": 199000501, - "step": 3216 - }, - { - "epoch": 0.5851179673321234, - "grad_norm": 0.8984375, - "learning_rate": 1.482546293333518e-05, - "loss": 0.0557, - "num_input_tokens_seen": 199479084, - "step": 3224 - }, - { - "epoch": 0.5865698729582577, - "grad_norm": 0.62109375, - "learning_rate": 1.473701231413489e-05, - "loss": 0.0382, - "num_input_tokens_seen": 200003062, - "step": 3232 - }, - { - "epoch": 0.588021778584392, - "grad_norm": 0.5078125, - "learning_rate": 1.464867215357851e-05, - "loss": 0.0529, - "num_input_tokens_seen": 200510961, - "step": 3240 - }, - { - "epoch": 0.5894736842105263, - "grad_norm": 0.7421875, - "learning_rate": 1.4560444305733521e-05, - "loss": 0.0628, - "num_input_tokens_seen": 201013169, - "step": 3248 - }, - { - "epoch": 0.5909255898366607, - "grad_norm": 0.72265625, - "learning_rate": 1.447233062231022e-05, - "loss": 0.0322, - "num_input_tokens_seen": 201480209, - "step": 3256 - }, - { - "epoch": 0.592377495462795, - "grad_norm": 0.57421875, - "learning_rate": 1.4384332952622815e-05, - "loss": 0.0567, - "num_input_tokens_seen": 201973667, - "step": 3264 - }, - { - "epoch": 0.5938294010889292, - "grad_norm": 2.15625, - "learning_rate": 1.4296453143550664e-05, - "loss": 0.0463, - "num_input_tokens_seen": 202453986, - "step": 3272 - }, - { - "epoch": 0.5952813067150635, - "grad_norm": 0.56640625, - "learning_rate": 1.4208693039499468e-05, - "loss": 0.0425, - "num_input_tokens_seen": 202952414, - "step": 3280 - }, - { - "epoch": 0.5967332123411978, - "grad_norm": 1.125, - "learning_rate": 1.4121054482362592e-05, - "loss": 0.048, - "num_input_tokens_seen": 203470869, - "step": 3288 - }, - { - "epoch": 0.5981851179673321, - "grad_norm": 0.671875, - "learning_rate": 1.4033539311482403e-05, - "loss": 0.0449, - "num_input_tokens_seen": 203946575, - "step": 3296 - }, - { - "epoch": 0.5996370235934664, - "grad_norm": 1.0, - "learning_rate": 1.3946149363611631e-05, - "loss": 0.0579, - "num_input_tokens_seen": 204443918, - "step": 3304 - }, - { - "epoch": 0.6010889292196008, - "grad_norm": 0.5703125, - "learning_rate": 1.3858886472874881e-05, - "loss": 0.1074, - "num_input_tokens_seen": 204950872, - "step": 3312 - }, - { - "epoch": 0.6025408348457351, - "grad_norm": 0.5390625, - "learning_rate": 1.3771752470730078e-05, - "loss": 0.0591, - "num_input_tokens_seen": 205454235, - "step": 3320 - }, - { - "epoch": 0.6039927404718693, - "grad_norm": 0.73046875, - "learning_rate": 1.3684749185930088e-05, - "loss": 0.055, - "num_input_tokens_seen": 205939041, - "step": 3328 - }, - { - "epoch": 0.6054446460980036, - "grad_norm": 0.77734375, - "learning_rate": 1.3597878444484272e-05, - "loss": 0.0483, - "num_input_tokens_seen": 206431197, - "step": 3336 - }, - { - "epoch": 0.6068965517241379, - "grad_norm": 0.416015625, - "learning_rate": 1.351114206962021e-05, - "loss": 0.0568, - "num_input_tokens_seen": 206925320, - "step": 3344 - }, - { - "epoch": 0.6083484573502722, - "grad_norm": 0.490234375, - "learning_rate": 1.3424541881745425e-05, - "loss": 0.0553, - "num_input_tokens_seen": 207406668, - "step": 3352 - }, - { - "epoch": 0.6098003629764065, - "grad_norm": 0.5859375, - "learning_rate": 1.333807969840916e-05, - "loss": 0.0517, - "num_input_tokens_seen": 207877782, - "step": 3360 - }, - { - "epoch": 0.6112522686025408, - "grad_norm": 0.546875, - "learning_rate": 1.3251757334264253e-05, - "loss": 0.04, - "num_input_tokens_seen": 208344318, - "step": 3368 - }, - { - "epoch": 0.6127041742286752, - "grad_norm": 1.109375, - "learning_rate": 1.316557660102903e-05, - "loss": 0.0488, - "num_input_tokens_seen": 208814858, - "step": 3376 - }, - { - "epoch": 0.6141560798548095, - "grad_norm": 0.5, - "learning_rate": 1.3079539307449311e-05, - "loss": 0.044, - "num_input_tokens_seen": 209297102, - "step": 3384 - }, - { - "epoch": 0.6156079854809438, - "grad_norm": 0.5390625, - "learning_rate": 1.2993647259260418e-05, - "loss": 0.0469, - "num_input_tokens_seen": 209774677, - "step": 3392 - }, - { - "epoch": 0.617059891107078, - "grad_norm": 1.3359375, - "learning_rate": 1.2907902259149287e-05, - "loss": 0.0694, - "num_input_tokens_seen": 210275870, - "step": 3400 - }, - { - "epoch": 0.6185117967332123, - "grad_norm": 0.5625, - "learning_rate": 1.2822306106716645e-05, - "loss": 0.0595, - "num_input_tokens_seen": 210797636, - "step": 3408 - }, - { - "epoch": 0.6199637023593466, - "grad_norm": 0.578125, - "learning_rate": 1.2736860598439215e-05, - "loss": 0.0665, - "num_input_tokens_seen": 211287706, - "step": 3416 - }, - { - "epoch": 0.6214156079854809, - "grad_norm": 0.83203125, - "learning_rate": 1.2651567527632045e-05, - "loss": 0.0698, - "num_input_tokens_seen": 211773156, - "step": 3424 - }, - { - "epoch": 0.6228675136116153, - "grad_norm": 0.5390625, - "learning_rate": 1.2566428684410843e-05, - "loss": 0.0348, - "num_input_tokens_seen": 212277142, - "step": 3432 - }, - { - "epoch": 0.6243194192377496, - "grad_norm": 0.5625, - "learning_rate": 1.2481445855654415e-05, - "loss": 0.0474, - "num_input_tokens_seen": 212767513, - "step": 3440 - }, - { - "epoch": 0.6243194192377496, - "eval_loss": 0.05037084221839905, - "eval_runtime": 2739.6179, - "eval_samples_per_second": 1.138, - "eval_steps_per_second": 0.142, - "num_input_tokens_seen": 212767513, - "step": 3440 - }, - { - "epoch": 0.6257713248638839, - "grad_norm": 0.71875, - "learning_rate": 1.2396620824967169e-05, - "loss": 0.1043, - "num_input_tokens_seen": 213273298, - "step": 3448 - }, - { - "epoch": 0.6272232304900182, - "grad_norm": 0.53125, - "learning_rate": 1.2311955372641674e-05, - "loss": 0.0779, - "num_input_tokens_seen": 213743600, - "step": 3456 - }, - { - "epoch": 0.6286751361161524, - "grad_norm": 0.96875, - "learning_rate": 1.222745127562129e-05, - "loss": 0.0474, - "num_input_tokens_seen": 214249105, - "step": 3464 - }, - { - "epoch": 0.6301270417422867, - "grad_norm": 0.6171875, - "learning_rate": 1.2143110307462892e-05, - "loss": 0.0914, - "num_input_tokens_seen": 214743732, - "step": 3472 - }, - { - "epoch": 0.631578947368421, - "grad_norm": 0.58203125, - "learning_rate": 1.2058934238299625e-05, - "loss": 0.0333, - "num_input_tokens_seen": 215240214, - "step": 3480 - }, - { - "epoch": 0.6330308529945553, - "grad_norm": 0.94140625, - "learning_rate": 1.1974924834803765e-05, - "loss": 0.0477, - "num_input_tokens_seen": 215752215, - "step": 3488 - }, - { - "epoch": 0.6344827586206897, - "grad_norm": 0.859375, - "learning_rate": 1.1891083860149653e-05, - "loss": 0.0456, - "num_input_tokens_seen": 216218681, - "step": 3496 - }, - { - "epoch": 0.635934664246824, - "grad_norm": 0.9921875, - "learning_rate": 1.1807413073976655e-05, - "loss": 0.0537, - "num_input_tokens_seen": 216717186, - "step": 3504 - }, - { - "epoch": 0.6373865698729583, - "grad_norm": 0.5078125, - "learning_rate": 1.1723914232352265e-05, - "loss": 0.0543, - "num_input_tokens_seen": 217224763, - "step": 3512 - }, - { - "epoch": 0.6388384754990926, - "grad_norm": 0.80078125, - "learning_rate": 1.1640589087735222e-05, - "loss": 0.053, - "num_input_tokens_seen": 217712978, - "step": 3520 - }, - { - "epoch": 0.6402903811252268, - "grad_norm": 0.5234375, - "learning_rate": 1.1557439388938772e-05, - "loss": 0.0464, - "num_input_tokens_seen": 218177197, - "step": 3528 - }, - { - "epoch": 0.6417422867513611, - "grad_norm": 0.6796875, - "learning_rate": 1.1474466881093904e-05, - "loss": 0.0679, - "num_input_tokens_seen": 218664950, - "step": 3536 - }, - { - "epoch": 0.6431941923774954, - "grad_norm": 0.640625, - "learning_rate": 1.139167330561277e-05, - "loss": 0.0551, - "num_input_tokens_seen": 219190307, - "step": 3544 - }, - { - "epoch": 0.6446460980036298, - "grad_norm": 0.58984375, - "learning_rate": 1.130906040015211e-05, - "loss": 0.045, - "num_input_tokens_seen": 219656276, - "step": 3552 - }, - { - "epoch": 0.6460980036297641, - "grad_norm": 0.51953125, - "learning_rate": 1.1226629898576818e-05, - "loss": 0.0516, - "num_input_tokens_seen": 220153311, - "step": 3560 - }, - { - "epoch": 0.6475499092558984, - "grad_norm": 1.125, - "learning_rate": 1.1144383530923505e-05, - "loss": 0.04, - "num_input_tokens_seen": 220641855, - "step": 3568 - }, - { - "epoch": 0.6490018148820327, - "grad_norm": 1.1328125, - "learning_rate": 1.1062323023364217e-05, - "loss": 0.0566, - "num_input_tokens_seen": 221165742, - "step": 3576 - }, - { - "epoch": 0.650453720508167, - "grad_norm": 1.3515625, - "learning_rate": 1.0980450098170211e-05, - "loss": 0.0598, - "num_input_tokens_seen": 221645634, - "step": 3584 - }, - { - "epoch": 0.6519056261343013, - "grad_norm": 0.5390625, - "learning_rate": 1.0898766473675795e-05, - "loss": 0.0582, - "num_input_tokens_seen": 222128368, - "step": 3592 - }, - { - "epoch": 0.6533575317604355, - "grad_norm": 0.78125, - "learning_rate": 1.081727386424225e-05, - "loss": 0.0637, - "num_input_tokens_seen": 222630366, - "step": 3600 - }, - { - "epoch": 0.6548094373865698, - "grad_norm": 1.15625, - "learning_rate": 1.0735973980221898e-05, - "loss": 0.0319, - "num_input_tokens_seen": 223132889, - "step": 3608 - }, - { - "epoch": 0.6562613430127042, - "grad_norm": 0.72265625, - "learning_rate": 1.0654868527922157e-05, - "loss": 0.0605, - "num_input_tokens_seen": 223620866, - "step": 3616 - }, - { - "epoch": 0.6577132486388385, - "grad_norm": 0.9296875, - "learning_rate": 1.0573959209569736e-05, - "loss": 0.0563, - "num_input_tokens_seen": 224112161, - "step": 3624 - }, - { - "epoch": 0.6591651542649728, - "grad_norm": 0.5625, - "learning_rate": 1.0493247723274949e-05, - "loss": 0.0637, - "num_input_tokens_seen": 224615692, - "step": 3632 - }, - { - "epoch": 0.6606170598911071, - "grad_norm": 0.59375, - "learning_rate": 1.0412735762996022e-05, - "loss": 0.0525, - "num_input_tokens_seen": 225123661, - "step": 3640 - }, - { - "epoch": 0.6620689655172414, - "grad_norm": 0.423828125, - "learning_rate": 1.0332425018503573e-05, - "loss": 0.0448, - "num_input_tokens_seen": 225606843, - "step": 3648 - }, - { - "epoch": 0.6635208711433757, - "grad_norm": 0.78515625, - "learning_rate": 1.025231717534513e-05, - "loss": 0.0511, - "num_input_tokens_seen": 226083858, - "step": 3656 - }, - { - "epoch": 0.6649727767695099, - "grad_norm": 0.625, - "learning_rate": 1.0172413914809791e-05, - "loss": 0.0297, - "num_input_tokens_seen": 226586157, - "step": 3664 - }, - { - "epoch": 0.6664246823956442, - "grad_norm": 0.6484375, - "learning_rate": 1.0092716913892878e-05, - "loss": 0.0542, - "num_input_tokens_seen": 227090262, - "step": 3672 - }, - { - "epoch": 0.6678765880217786, - "grad_norm": 0.66796875, - "learning_rate": 1.0013227845260785e-05, - "loss": 0.0496, - "num_input_tokens_seen": 227568348, - "step": 3680 - }, - { - "epoch": 0.6693284936479129, - "grad_norm": 0.431640625, - "learning_rate": 9.933948377215873e-06, - "loss": 0.0474, - "num_input_tokens_seen": 228069156, - "step": 3688 - }, - { - "epoch": 0.6707803992740472, - "grad_norm": 0.5078125, - "learning_rate": 9.85488017366143e-06, - "loss": 0.0276, - "num_input_tokens_seen": 228546696, - "step": 3696 - }, - { - "epoch": 0.6722323049001815, - "grad_norm": 0.60546875, - "learning_rate": 9.776024894066755e-06, - "loss": 0.0413, - "num_input_tokens_seen": 229039860, - "step": 3704 - }, - { - "epoch": 0.6736842105263158, - "grad_norm": 0.69921875, - "learning_rate": 9.697384193432365e-06, - "loss": 0.0398, - "num_input_tokens_seen": 229524911, - "step": 3712 - }, - { - "epoch": 0.6751361161524501, - "grad_norm": 1.46875, - "learning_rate": 9.618959722255204e-06, - "loss": 0.0448, - "num_input_tokens_seen": 230032334, - "step": 3720 - }, - { - "epoch": 0.6765880217785843, - "grad_norm": 0.765625, - "learning_rate": 9.540753126494035e-06, - "loss": 0.0746, - "num_input_tokens_seen": 230518610, - "step": 3728 - }, - { - "epoch": 0.6780399274047187, - "grad_norm": 0.98828125, - "learning_rate": 9.462766047534915e-06, - "loss": 0.0463, - "num_input_tokens_seen": 231010962, - "step": 3736 - }, - { - "epoch": 0.679491833030853, - "grad_norm": 0.67578125, - "learning_rate": 9.385000122156695e-06, - "loss": 0.0675, - "num_input_tokens_seen": 231515592, - "step": 3744 - }, - { - "epoch": 0.6809437386569873, - "grad_norm": 0.63671875, - "learning_rate": 9.3074569824967e-06, - "loss": 0.0627, - "num_input_tokens_seen": 232031254, - "step": 3752 - }, - { - "epoch": 0.6823956442831216, - "grad_norm": 0.578125, - "learning_rate": 9.230138256016461e-06, - "loss": 0.0601, - "num_input_tokens_seen": 232525195, - "step": 3760 - }, - { - "epoch": 0.6838475499092559, - "grad_norm": 0.7734375, - "learning_rate": 9.153045565467605e-06, - "loss": 0.0587, - "num_input_tokens_seen": 232999291, - "step": 3768 - }, - { - "epoch": 0.6852994555353902, - "grad_norm": 2.046875, - "learning_rate": 9.076180528857709e-06, - "loss": 0.0536, - "num_input_tokens_seen": 233490579, - "step": 3776 - }, - { - "epoch": 0.6867513611615245, - "grad_norm": 0.60546875, - "learning_rate": 8.999544759416413e-06, - "loss": 0.0346, - "num_input_tokens_seen": 234000641, - "step": 3784 - }, - { - "epoch": 0.6867513611615245, - "eval_loss": 0.04955988749861717, - "eval_runtime": 2842.036, - "eval_samples_per_second": 1.097, - "eval_steps_per_second": 0.137, - "num_input_tokens_seen": 234000641, - "step": 3784 - }, - { - "epoch": 0.6882032667876588, - "grad_norm": 0.369140625, - "learning_rate": 8.923139865561525e-06, - "loss": 0.0568, - "num_input_tokens_seen": 234523989, - "step": 3792 - }, - { - "epoch": 0.6896551724137931, - "grad_norm": 0.5703125, - "learning_rate": 8.846967450865302e-06, - "loss": 0.0471, - "num_input_tokens_seen": 234995824, - "step": 3800 - }, - { - "epoch": 0.6911070780399274, - "grad_norm": 0.58203125, - "learning_rate": 8.77102911402075e-06, - "loss": 0.0396, - "num_input_tokens_seen": 235480070, - "step": 3808 - }, - { - "epoch": 0.6925589836660617, - "grad_norm": 1.1875, - "learning_rate": 8.695326448808089e-06, - "loss": 0.0427, - "num_input_tokens_seen": 235969468, - "step": 3816 - }, - { - "epoch": 0.694010889292196, - "grad_norm": 0.55078125, - "learning_rate": 8.61986104406132e-06, - "loss": 0.0468, - "num_input_tokens_seen": 236457438, - "step": 3824 - }, - { - "epoch": 0.6954627949183303, - "grad_norm": 0.72265625, - "learning_rate": 8.544634483634855e-06, - "loss": 0.07, - "num_input_tokens_seen": 236964483, - "step": 3832 - }, - { - "epoch": 0.6969147005444646, - "grad_norm": 0.734375, - "learning_rate": 8.469648346370275e-06, - "loss": 0.0681, - "num_input_tokens_seen": 237478465, - "step": 3840 - }, - { - "epoch": 0.6983666061705989, - "grad_norm": 0.69921875, - "learning_rate": 8.39490420606323e-06, - "loss": 0.0486, - "num_input_tokens_seen": 237972518, - "step": 3848 - }, - { - "epoch": 0.6998185117967333, - "grad_norm": 0.70703125, - "learning_rate": 8.320403631430352e-06, - "loss": 0.0398, - "num_input_tokens_seen": 238453985, - "step": 3856 - }, - { - "epoch": 0.7012704174228676, - "grad_norm": 0.66796875, - "learning_rate": 8.246148186076367e-06, - "loss": 0.0565, - "num_input_tokens_seen": 238956557, - "step": 3864 - }, - { - "epoch": 0.7027223230490018, - "grad_norm": 1.125, - "learning_rate": 8.172139428461292e-06, - "loss": 0.0699, - "num_input_tokens_seen": 239428560, - "step": 3872 - }, - { - "epoch": 0.7041742286751361, - "grad_norm": 0.98046875, - "learning_rate": 8.098378911867682e-06, - "loss": 0.0595, - "num_input_tokens_seen": 239904462, - "step": 3880 - }, - { - "epoch": 0.7056261343012704, - "grad_norm": 2.03125, - "learning_rate": 8.02486818436806e-06, - "loss": 0.0696, - "num_input_tokens_seen": 240404479, - "step": 3888 - }, - { - "epoch": 0.7070780399274047, - "grad_norm": 1.0234375, - "learning_rate": 7.95160878879242e-06, - "loss": 0.0534, - "num_input_tokens_seen": 240926945, - "step": 3896 - }, - { - "epoch": 0.708529945553539, - "grad_norm": 0.9140625, - "learning_rate": 7.87860226269586e-06, - "loss": 0.0596, - "num_input_tokens_seen": 241440836, - "step": 3904 - }, - { - "epoch": 0.7099818511796733, - "grad_norm": 0.8984375, - "learning_rate": 7.805850138326282e-06, - "loss": 0.035, - "num_input_tokens_seen": 241942169, - "step": 3912 - }, - { - "epoch": 0.7114337568058077, - "grad_norm": 0.87109375, - "learning_rate": 7.733353942592246e-06, - "loss": 0.0501, - "num_input_tokens_seen": 242419037, - "step": 3920 - }, - { - "epoch": 0.712885662431942, - "grad_norm": 0.69140625, - "learning_rate": 7.661115197030954e-06, - "loss": 0.0576, - "num_input_tokens_seen": 242917759, - "step": 3928 - }, - { - "epoch": 0.7143375680580762, - "grad_norm": 0.5859375, - "learning_rate": 7.589135417776266e-06, - "loss": 0.0394, - "num_input_tokens_seen": 243411063, - "step": 3936 - }, - { - "epoch": 0.7157894736842105, - "grad_norm": 0.76171875, - "learning_rate": 7.517416115526901e-06, - "loss": 0.0485, - "num_input_tokens_seen": 243885516, - "step": 3944 - }, - { - "epoch": 0.7172413793103448, - "grad_norm": 0.5390625, - "learning_rate": 7.445958795514761e-06, - "loss": 0.0642, - "num_input_tokens_seen": 244397104, - "step": 3952 - }, - { - "epoch": 0.7186932849364791, - "grad_norm": 0.80859375, - "learning_rate": 7.374764957473281e-06, - "loss": 0.0486, - "num_input_tokens_seen": 244892690, - "step": 3960 - }, - { - "epoch": 0.7201451905626134, - "grad_norm": 1.0703125, - "learning_rate": 7.303836095605994e-06, - "loss": 0.0532, - "num_input_tokens_seen": 245418852, - "step": 3968 - }, - { - "epoch": 0.7215970961887477, - "grad_norm": 0.455078125, - "learning_rate": 7.233173698555174e-06, - "loss": 0.0389, - "num_input_tokens_seen": 245925757, - "step": 3976 - }, - { - "epoch": 0.7230490018148821, - "grad_norm": 0.73046875, - "learning_rate": 7.16277924937056e-06, - "loss": 0.0514, - "num_input_tokens_seen": 246421511, - "step": 3984 - }, - { - "epoch": 0.7245009074410164, - "grad_norm": 0.5625, - "learning_rate": 7.092654225478257e-06, - "loss": 0.041, - "num_input_tokens_seen": 246952363, - "step": 3992 - }, - { - "epoch": 0.7259528130671506, - "grad_norm": 0.5625, - "learning_rate": 7.022800098649716e-06, - "loss": 0.0446, - "num_input_tokens_seen": 247450049, - "step": 4000 - }, - { - "epoch": 0.7274047186932849, - "grad_norm": 0.921875, - "learning_rate": 6.953218334970861e-06, - "loss": 0.0379, - "num_input_tokens_seen": 247943269, - "step": 4008 - }, - { - "epoch": 0.7288566243194192, - "grad_norm": 0.69921875, - "learning_rate": 6.8839103948113e-06, - "loss": 0.0394, - "num_input_tokens_seen": 248447780, - "step": 4016 - }, - { - "epoch": 0.7303085299455535, - "grad_norm": 0.27734375, - "learning_rate": 6.814877732793663e-06, - "loss": 0.0401, - "num_input_tokens_seen": 248921260, - "step": 4024 - }, - { - "epoch": 0.7317604355716878, - "grad_norm": 0.61328125, - "learning_rate": 6.7461217977631325e-06, - "loss": 0.0447, - "num_input_tokens_seen": 249435130, - "step": 4032 - }, - { - "epoch": 0.7332123411978222, - "grad_norm": 0.76953125, - "learning_rate": 6.67764403275696e-06, - "loss": 0.0457, - "num_input_tokens_seen": 249913307, - "step": 4040 - }, - { - "epoch": 0.7346642468239565, - "grad_norm": 0.94921875, - "learning_rate": 6.609445874974218e-06, - "loss": 0.066, - "num_input_tokens_seen": 250435878, - "step": 4048 - }, - { - "epoch": 0.7361161524500908, - "grad_norm": 0.40625, - "learning_rate": 6.5415287557456585e-06, - "loss": 0.0509, - "num_input_tokens_seen": 250946234, - "step": 4056 - }, - { - "epoch": 0.737568058076225, - "grad_norm": 0.48046875, - "learning_rate": 6.473894100503615e-06, - "loss": 0.0553, - "num_input_tokens_seen": 251435205, - "step": 4064 - }, - { - "epoch": 0.7390199637023593, - "grad_norm": 0.8671875, - "learning_rate": 6.4065433287521306e-06, - "loss": 0.0445, - "num_input_tokens_seen": 251949775, - "step": 4072 - }, - { - "epoch": 0.7404718693284936, - "grad_norm": 0.94921875, - "learning_rate": 6.33947785403716e-06, - "loss": 0.0626, - "num_input_tokens_seen": 252447111, - "step": 4080 - }, - { - "epoch": 0.7419237749546279, - "grad_norm": 0.58984375, - "learning_rate": 6.272699083916885e-06, - "loss": 0.0685, - "num_input_tokens_seen": 252958790, - "step": 4088 - }, - { - "epoch": 0.7433756805807622, - "grad_norm": 0.61328125, - "learning_rate": 6.20620841993218e-06, - "loss": 0.0705, - "num_input_tokens_seen": 253436330, - "step": 4096 - }, - { - "epoch": 0.7448275862068966, - "grad_norm": 0.9921875, - "learning_rate": 6.1400072575772056e-06, - "loss": 0.0599, - "num_input_tokens_seen": 253927128, - "step": 4104 - }, - { - "epoch": 0.7462794918330309, - "grad_norm": 0.5859375, - "learning_rate": 6.0740969862701195e-06, - "loss": 0.0407, - "num_input_tokens_seen": 254426830, - "step": 4112 - }, - { - "epoch": 0.7477313974591652, - "grad_norm": 0.87890625, - "learning_rate": 6.008478989323898e-06, - "loss": 0.0566, - "num_input_tokens_seen": 254922990, - "step": 4120 - }, - { - "epoch": 0.7491833030852995, - "grad_norm": 0.9375, - "learning_rate": 5.943154643917315e-06, - "loss": 0.0498, - "num_input_tokens_seen": 255423630, - "step": 4128 - }, - { - "epoch": 0.7491833030852995, - "eval_loss": 0.049039360135793686, - "eval_runtime": 2629.7216, - "eval_samples_per_second": 1.185, - "eval_steps_per_second": 0.148, - "num_input_tokens_seen": 255423630, - "step": 4128 - }, - { - "epoch": 0.7506352087114337, - "grad_norm": 0.4921875, - "learning_rate": 5.87812532106606e-06, - "loss": 0.0614, - "num_input_tokens_seen": 255929632, - "step": 4136 - }, - { - "epoch": 0.752087114337568, - "grad_norm": 0.7109375, - "learning_rate": 5.813392385593915e-06, - "loss": 0.0651, - "num_input_tokens_seen": 256430965, - "step": 4144 - }, - { - "epoch": 0.7535390199637023, - "grad_norm": 0.5625, - "learning_rate": 5.7489571961041415e-06, - "loss": 0.0618, - "num_input_tokens_seen": 256934909, - "step": 4152 - }, - { - "epoch": 0.7549909255898367, - "grad_norm": 0.84375, - "learning_rate": 5.684821104950984e-06, - "loss": 0.0604, - "num_input_tokens_seen": 257421654, - "step": 4160 - }, - { - "epoch": 0.756442831215971, - "grad_norm": 0.60546875, - "learning_rate": 5.620985458211241e-06, - "loss": 0.0516, - "num_input_tokens_seen": 257913684, - "step": 4168 - }, - { - "epoch": 0.7578947368421053, - "grad_norm": 0.9453125, - "learning_rate": 5.55745159565604e-06, - "loss": 0.0418, - "num_input_tokens_seen": 258400849, - "step": 4176 - }, - { - "epoch": 0.7593466424682396, - "grad_norm": 0.734375, - "learning_rate": 5.494220850722729e-06, - "loss": 0.062, - "num_input_tokens_seen": 258878333, - "step": 4184 - }, - { - "epoch": 0.7607985480943739, - "grad_norm": 1.203125, - "learning_rate": 5.431294550486869e-06, - "loss": 0.0615, - "num_input_tokens_seen": 259369068, - "step": 4192 - }, - { - "epoch": 0.7622504537205081, - "grad_norm": 0.71875, - "learning_rate": 5.3686740156343805e-06, - "loss": 0.0584, - "num_input_tokens_seen": 259870513, - "step": 4200 - }, - { - "epoch": 0.7637023593466424, - "grad_norm": 0.66796875, - "learning_rate": 5.306360560433854e-06, - "loss": 0.0419, - "num_input_tokens_seen": 260370376, - "step": 4208 - }, - { - "epoch": 0.7651542649727767, - "grad_norm": 1.015625, - "learning_rate": 5.244355492708941e-06, - "loss": 0.0582, - "num_input_tokens_seen": 260881761, - "step": 4216 - }, - { - "epoch": 0.7666061705989111, - "grad_norm": 0.70703125, - "learning_rate": 5.182660113810907e-06, - "loss": 0.0468, - "num_input_tokens_seen": 261402673, - "step": 4224 - }, - { - "epoch": 0.7680580762250454, - "grad_norm": 1.3671875, - "learning_rate": 5.121275718591321e-06, - "loss": 0.0686, - "num_input_tokens_seen": 261898525, - "step": 4232 - }, - { - "epoch": 0.7695099818511797, - "grad_norm": 0.51953125, - "learning_rate": 5.0602035953748865e-06, - "loss": 0.0624, - "num_input_tokens_seen": 262392396, - "step": 4240 - }, - { - "epoch": 0.770961887477314, - "grad_norm": 0.56640625, - "learning_rate": 4.999445025932408e-06, - "loss": 0.0429, - "num_input_tokens_seen": 262882816, - "step": 4248 - }, - { - "epoch": 0.7724137931034483, - "grad_norm": 1.2734375, - "learning_rate": 4.939001285453864e-06, - "loss": 0.0372, - "num_input_tokens_seen": 263383267, - "step": 4256 - }, - { - "epoch": 0.7738656987295826, - "grad_norm": 0.54296875, - "learning_rate": 4.8788736425216595e-06, - "loss": 0.0343, - "num_input_tokens_seen": 263858756, - "step": 4264 - }, - { - "epoch": 0.7753176043557168, - "grad_norm": 0.5859375, - "learning_rate": 4.81906335908402e-06, - "loss": 0.048, - "num_input_tokens_seen": 264345998, - "step": 4272 - }, - { - "epoch": 0.7767695099818511, - "grad_norm": 2.4375, - "learning_rate": 4.759571690428464e-06, - "loss": 0.0595, - "num_input_tokens_seen": 264834486, - "step": 4280 - }, - { - "epoch": 0.7782214156079855, - "grad_norm": 0.482421875, - "learning_rate": 4.700399885155487e-06, - "loss": 0.0456, - "num_input_tokens_seen": 265331269, - "step": 4288 - }, - { - "epoch": 0.7796733212341198, - "grad_norm": 0.90625, - "learning_rate": 4.641549185152359e-06, - "loss": 0.0374, - "num_input_tokens_seen": 265836347, - "step": 4296 - }, - { - "epoch": 0.7811252268602541, - "grad_norm": 0.828125, - "learning_rate": 4.583020825567039e-06, - "loss": 0.0359, - "num_input_tokens_seen": 266324737, - "step": 4304 - }, - { - "epoch": 0.7825771324863884, - "grad_norm": 0.451171875, - "learning_rate": 4.524816034782263e-06, - "loss": 0.0575, - "num_input_tokens_seen": 266808164, - "step": 4312 - }, - { - "epoch": 0.7840290381125227, - "grad_norm": 1.34375, - "learning_rate": 4.46693603438977e-06, - "loss": 0.0502, - "num_input_tokens_seen": 267324813, - "step": 4320 - }, - { - "epoch": 0.785480943738657, - "grad_norm": 0.60546875, - "learning_rate": 4.409382039164653e-06, - "loss": 0.063, - "num_input_tokens_seen": 267822646, - "step": 4328 - }, - { - "epoch": 0.7869328493647912, - "grad_norm": 0.6484375, - "learning_rate": 4.352155257039865e-06, - "loss": 0.0736, - "num_input_tokens_seen": 268320339, - "step": 4336 - }, - { - "epoch": 0.7883847549909256, - "grad_norm": 0.8828125, - "learning_rate": 4.295256889080865e-06, - "loss": 0.0568, - "num_input_tokens_seen": 268805229, - "step": 4344 - }, - { - "epoch": 0.7898366606170599, - "grad_norm": 0.84375, - "learning_rate": 4.238688129460431e-06, - "loss": 0.0398, - "num_input_tokens_seen": 269290686, - "step": 4352 - }, - { - "epoch": 0.7912885662431942, - "grad_norm": 0.78515625, - "learning_rate": 4.18245016543356e-06, - "loss": 0.0468, - "num_input_tokens_seen": 269771817, - "step": 4360 - }, - { - "epoch": 0.7927404718693285, - "grad_norm": 0.53515625, - "learning_rate": 4.126544177312577e-06, - "loss": 0.0497, - "num_input_tokens_seen": 270261530, - "step": 4368 - }, - { - "epoch": 0.7941923774954628, - "grad_norm": 1.4375, - "learning_rate": 4.0709713384423685e-06, - "loss": 0.0356, - "num_input_tokens_seen": 270769688, - "step": 4376 - }, - { - "epoch": 0.7956442831215971, - "grad_norm": 2.0, - "learning_rate": 4.015732815175728e-06, - "loss": 0.0573, - "num_input_tokens_seen": 271284923, - "step": 4384 - }, - { - "epoch": 0.7970961887477314, - "grad_norm": 0.52734375, - "learning_rate": 3.960829766848893e-06, - "loss": 0.056, - "num_input_tokens_seen": 271756884, - "step": 4392 - }, - { - "epoch": 0.7985480943738656, - "grad_norm": 0.5234375, - "learning_rate": 3.906263345757231e-06, - "loss": 0.0309, - "num_input_tokens_seen": 272248473, - "step": 4400 - }, - { - "epoch": 0.8, - "grad_norm": 0.828125, - "learning_rate": 3.852034697131015e-06, - "loss": 0.0447, - "num_input_tokens_seen": 272755455, - "step": 4408 - }, - { - "epoch": 0.8014519056261343, - "grad_norm": 0.7421875, - "learning_rate": 3.7981449591114207e-06, - "loss": 0.0459, - "num_input_tokens_seen": 273244979, - "step": 4416 - }, - { - "epoch": 0.8029038112522686, - "grad_norm": 0.73046875, - "learning_rate": 3.7445952627266336e-06, - "loss": 0.0642, - "num_input_tokens_seen": 273749266, - "step": 4424 - }, - { - "epoch": 0.8043557168784029, - "grad_norm": 0.80078125, - "learning_rate": 3.6913867318680984e-06, - "loss": 0.0455, - "num_input_tokens_seen": 274271081, - "step": 4432 - }, - { - "epoch": 0.8058076225045372, - "grad_norm": 0.859375, - "learning_rate": 3.6385204832669385e-06, - "loss": 0.0414, - "num_input_tokens_seen": 274770517, - "step": 4440 - }, - { - "epoch": 0.8072595281306715, - "grad_norm": 0.703125, - "learning_rate": 3.585997626470519e-06, - "loss": 0.0426, - "num_input_tokens_seen": 275248505, - "step": 4448 - }, - { - "epoch": 0.8087114337568058, - "grad_norm": 0.63671875, - "learning_rate": 3.533819263819167e-06, - "loss": 0.0498, - "num_input_tokens_seen": 275748095, - "step": 4456 - }, - { - "epoch": 0.8101633393829402, - "grad_norm": 0.46484375, - "learning_rate": 3.4819864904230195e-06, - "loss": 0.0508, - "num_input_tokens_seen": 276242421, - "step": 4464 - }, - { - "epoch": 0.8116152450090744, - "grad_norm": 0.7734375, - "learning_rate": 3.4305003941390468e-06, - "loss": 0.0605, - "num_input_tokens_seen": 276731693, - "step": 4472 - }, - { - "epoch": 0.8116152450090744, - "eval_loss": 0.04871319234371185, - "eval_runtime": 2768.9798, - "eval_samples_per_second": 1.126, - "eval_steps_per_second": 0.141, - "num_input_tokens_seen": 276731693, - "step": 4472 - }, - { - "epoch": 0.8130671506352087, - "grad_norm": 0.69921875, - "learning_rate": 3.3793620555482322e-06, - "loss": 0.053, - "num_input_tokens_seen": 277218277, - "step": 4480 - }, - { - "epoch": 0.814519056261343, - "grad_norm": 0.6484375, - "learning_rate": 3.3285725479328757e-06, - "loss": 0.0582, - "num_input_tokens_seen": 277705169, - "step": 4488 - }, - { - "epoch": 0.8159709618874773, - "grad_norm": 0.74609375, - "learning_rate": 3.2781329372540683e-06, - "loss": 0.0618, - "num_input_tokens_seen": 278213285, - "step": 4496 - }, - { - "epoch": 0.8174228675136116, - "grad_norm": 0.42578125, - "learning_rate": 3.2280442821293455e-06, - "loss": 0.0556, - "num_input_tokens_seen": 278697097, - "step": 4504 - }, - { - "epoch": 0.8188747731397459, - "grad_norm": 0.53515625, - "learning_rate": 3.178307633810436e-06, - "loss": 0.0526, - "num_input_tokens_seen": 279193929, - "step": 4512 - }, - { - "epoch": 0.8203266787658802, - "grad_norm": 0.890625, - "learning_rate": 3.128924036161207e-06, - "loss": 0.0411, - "num_input_tokens_seen": 279698041, - "step": 4520 - }, - { - "epoch": 0.8217785843920146, - "grad_norm": 0.453125, - "learning_rate": 3.079894525635783e-06, - "loss": 0.0505, - "num_input_tokens_seen": 280182805, - "step": 4528 - }, - { - "epoch": 0.8232304900181489, - "grad_norm": 0.431640625, - "learning_rate": 3.0312201312567536e-06, - "loss": 0.04, - "num_input_tokens_seen": 280651028, - "step": 4536 - }, - { - "epoch": 0.8246823956442831, - "grad_norm": 0.94140625, - "learning_rate": 2.982901874593598e-06, - "loss": 0.0696, - "num_input_tokens_seen": 281162798, - "step": 4544 - }, - { - "epoch": 0.8261343012704174, - "grad_norm": 0.458984375, - "learning_rate": 2.934940769741239e-06, - "loss": 0.0356, - "num_input_tokens_seen": 281658265, - "step": 4552 - }, - { - "epoch": 0.8275862068965517, - "grad_norm": 3.09375, - "learning_rate": 2.8873378232987726e-06, - "loss": 0.0503, - "num_input_tokens_seen": 282170245, - "step": 4560 - }, - { - "epoch": 0.829038112522686, - "grad_norm": 0.99609375, - "learning_rate": 2.840094034348315e-06, - "loss": 0.0471, - "num_input_tokens_seen": 282655198, - "step": 4568 - }, - { - "epoch": 0.8304900181488203, - "grad_norm": 0.46875, - "learning_rate": 2.793210394434056e-06, - "loss": 0.0615, - "num_input_tokens_seen": 283132416, - "step": 4576 - }, - { - "epoch": 0.8319419237749546, - "grad_norm": 1.1875, - "learning_rate": 2.746687887541448e-06, - "loss": 0.0537, - "num_input_tokens_seen": 283628667, - "step": 4584 - }, - { - "epoch": 0.833393829401089, - "grad_norm": 0.6328125, - "learning_rate": 2.700527490076539e-06, - "loss": 0.0375, - "num_input_tokens_seen": 284146751, - "step": 4592 - }, - { - "epoch": 0.8348457350272233, - "grad_norm": 1.1796875, - "learning_rate": 2.6547301708454877e-06, - "loss": 0.041, - "num_input_tokens_seen": 284643128, - "step": 4600 - }, - { - "epoch": 0.8362976406533575, - "grad_norm": 0.578125, - "learning_rate": 2.609296891034241e-06, - "loss": 0.0473, - "num_input_tokens_seen": 285145371, - "step": 4608 - }, - { - "epoch": 0.8377495462794918, - "grad_norm": 0.78125, - "learning_rate": 2.5642286041883458e-06, - "loss": 0.0472, - "num_input_tokens_seen": 285639963, - "step": 4616 - }, - { - "epoch": 0.8392014519056261, - "grad_norm": 0.5078125, - "learning_rate": 2.519526256192939e-06, - "loss": 0.0493, - "num_input_tokens_seen": 286128983, - "step": 4624 - }, - { - "epoch": 0.8406533575317604, - "grad_norm": 0.45703125, - "learning_rate": 2.47519078525291e-06, - "loss": 0.0726, - "num_input_tokens_seen": 286625920, - "step": 4632 - }, - { - "epoch": 0.8421052631578947, - "grad_norm": 0.7578125, - "learning_rate": 2.431223121873183e-06, - "loss": 0.0465, - "num_input_tokens_seen": 287119525, - "step": 4640 - }, - { - "epoch": 0.8435571687840291, - "grad_norm": 0.5390625, - "learning_rate": 2.3876241888392173e-06, - "loss": 0.0553, - "num_input_tokens_seen": 287610722, - "step": 4648 - }, - { - "epoch": 0.8450090744101634, - "grad_norm": 0.53125, - "learning_rate": 2.3443949011976107e-06, - "loss": 0.0428, - "num_input_tokens_seen": 288097243, - "step": 4656 - }, - { - "epoch": 0.8464609800362977, - "grad_norm": 1.359375, - "learning_rate": 2.301536166236926e-06, - "loss": 0.048, - "num_input_tokens_seen": 288598177, - "step": 4664 - }, - { - "epoch": 0.847912885662432, - "grad_norm": 0.53515625, - "learning_rate": 2.259048883468622e-06, - "loss": 0.0436, - "num_input_tokens_seen": 289095940, - "step": 4672 - }, - { - "epoch": 0.8493647912885662, - "grad_norm": 0.9453125, - "learning_rate": 2.216933944608184e-06, - "loss": 0.0525, - "num_input_tokens_seen": 289579822, - "step": 4680 - }, - { - "epoch": 0.8508166969147005, - "grad_norm": 1.1875, - "learning_rate": 2.1751922335564134e-06, - "loss": 0.0752, - "num_input_tokens_seen": 290090500, - "step": 4688 - }, - { - "epoch": 0.8522686025408348, - "grad_norm": 0.7421875, - "learning_rate": 2.13382462638088e-06, - "loss": 0.0348, - "num_input_tokens_seen": 290583181, - "step": 4696 - }, - { - "epoch": 0.8537205081669691, - "grad_norm": 0.80078125, - "learning_rate": 2.0928319912975193e-06, - "loss": 0.063, - "num_input_tokens_seen": 291086649, - "step": 4704 - }, - { - "epoch": 0.8551724137931035, - "grad_norm": 0.53125, - "learning_rate": 2.0522151886524153e-06, - "loss": 0.0492, - "num_input_tokens_seen": 291577384, - "step": 4712 - }, - { - "epoch": 0.8566243194192378, - "grad_norm": 0.68359375, - "learning_rate": 2.0119750709037646e-06, - "loss": 0.0428, - "num_input_tokens_seen": 292058725, - "step": 4720 - }, - { - "epoch": 0.8580762250453721, - "grad_norm": 0.640625, - "learning_rate": 1.972112482603954e-06, - "loss": 0.074, - "num_input_tokens_seen": 292542677, - "step": 4728 - }, - { - "epoch": 0.8595281306715064, - "grad_norm": 0.40234375, - "learning_rate": 1.9326282603818526e-06, - "loss": 0.0493, - "num_input_tokens_seen": 293025201, - "step": 4736 - }, - { - "epoch": 0.8609800362976406, - "grad_norm": 0.84375, - "learning_rate": 1.8935232329252585e-06, - "loss": 0.0431, - "num_input_tokens_seen": 293508845, - "step": 4744 - }, - { - "epoch": 0.8624319419237749, - "grad_norm": 0.88671875, - "learning_rate": 1.854798220963485e-06, - "loss": 0.0356, - "num_input_tokens_seen": 293995884, - "step": 4752 - }, - { - "epoch": 0.8638838475499092, - "grad_norm": 0.6953125, - "learning_rate": 1.816454037250155e-06, - "loss": 0.0548, - "num_input_tokens_seen": 294512519, - "step": 4760 - }, - { - "epoch": 0.8653357531760436, - "grad_norm": 0.8203125, - "learning_rate": 1.778491486546141e-06, - "loss": 0.0409, - "num_input_tokens_seen": 295012760, - "step": 4768 - }, - { - "epoch": 0.8667876588021779, - "grad_norm": 0.57421875, - "learning_rate": 1.7409113656026643e-06, - "loss": 0.0336, - "num_input_tokens_seen": 295509942, - "step": 4776 - }, - { - "epoch": 0.8682395644283122, - "grad_norm": 0.74609375, - "learning_rate": 1.7037144631445745e-06, - "loss": 0.0413, - "num_input_tokens_seen": 296013081, - "step": 4784 - }, - { - "epoch": 0.8696914700544465, - "grad_norm": 0.54296875, - "learning_rate": 1.666901559853804e-06, - "loss": 0.0387, - "num_input_tokens_seen": 296492427, - "step": 4792 - }, - { - "epoch": 0.8711433756805808, - "grad_norm": 0.50390625, - "learning_rate": 1.63047342835299e-06, - "loss": 0.0468, - "num_input_tokens_seen": 297011120, - "step": 4800 - }, - { - "epoch": 0.872595281306715, - "grad_norm": 1.0625, - "learning_rate": 1.594430833189231e-06, - "loss": 0.0518, - "num_input_tokens_seen": 297502338, - "step": 4808 - }, - { - "epoch": 0.8740471869328493, - "grad_norm": 0.51953125, - "learning_rate": 1.5587745308180656e-06, - "loss": 0.055, - "num_input_tokens_seen": 298011343, - "step": 4816 - }, - { - "epoch": 0.8740471869328493, - "eval_loss": 0.04861417040228844, - "eval_runtime": 2715.815, - "eval_samples_per_second": 1.148, - "eval_steps_per_second": 0.144, - "num_input_tokens_seen": 298011343, - "step": 4816 - }, - { - "epoch": 0.8754990925589836, - "grad_norm": 0.5390625, - "learning_rate": 1.523505269587595e-06, - "loss": 0.0366, - "num_input_tokens_seen": 298524933, - "step": 4824 - }, - { - "epoch": 0.876950998185118, - "grad_norm": 0.76953125, - "learning_rate": 1.4886237897227584e-06, - "loss": 0.0466, - "num_input_tokens_seen": 299031985, - "step": 4832 - }, - { - "epoch": 0.8784029038112523, - "grad_norm": 0.58203125, - "learning_rate": 1.4541308233098117e-06, - "loss": 0.0472, - "num_input_tokens_seen": 299512381, - "step": 4840 - }, - { - "epoch": 0.8798548094373866, - "grad_norm": 0.345703125, - "learning_rate": 1.420027094280969e-06, - "loss": 0.0585, - "num_input_tokens_seen": 300023962, - "step": 4848 - }, - { - "epoch": 0.8813067150635209, - "grad_norm": 0.76953125, - "learning_rate": 1.3863133183991905e-06, - "loss": 0.0455, - "num_input_tokens_seen": 300499402, - "step": 4856 - }, - { - "epoch": 0.8827586206896552, - "grad_norm": 0.8828125, - "learning_rate": 1.3529902032431698e-06, - "loss": 0.0572, - "num_input_tokens_seen": 301015365, - "step": 4864 - }, - { - "epoch": 0.8842105263157894, - "grad_norm": 0.515625, - "learning_rate": 1.3200584481924915e-06, - "loss": 0.054, - "num_input_tokens_seen": 301509565, - "step": 4872 - }, - { - "epoch": 0.8856624319419237, - "grad_norm": 0.8125, - "learning_rate": 1.2875187444129366e-06, - "loss": 0.0505, - "num_input_tokens_seen": 302023484, - "step": 4880 - }, - { - "epoch": 0.8871143375680581, - "grad_norm": 1.0234375, - "learning_rate": 1.2553717748419846e-06, - "loss": 0.0426, - "num_input_tokens_seen": 302520603, - "step": 4888 - }, - { - "epoch": 0.8885662431941924, - "grad_norm": 0.5703125, - "learning_rate": 1.2236182141744757e-06, - "loss": 0.0495, - "num_input_tokens_seen": 303012766, - "step": 4896 - }, - { - "epoch": 0.8900181488203267, - "grad_norm": 0.400390625, - "learning_rate": 1.192258728848472e-06, - "loss": 0.0561, - "num_input_tokens_seen": 303502416, - "step": 4904 - }, - { - "epoch": 0.891470054446461, - "grad_norm": 0.5078125, - "learning_rate": 1.1612939770312325e-06, - "loss": 0.0365, - "num_input_tokens_seen": 304003546, - "step": 4912 - }, - { - "epoch": 0.8929219600725953, - "grad_norm": 0.609375, - "learning_rate": 1.130724608605427e-06, - "loss": 0.05, - "num_input_tokens_seen": 304494827, - "step": 4920 - }, - { - "epoch": 0.8943738656987296, - "grad_norm": 0.60546875, - "learning_rate": 1.1005512651554983e-06, - "loss": 0.0365, - "num_input_tokens_seen": 304962434, - "step": 4928 - }, - { - "epoch": 0.8958257713248639, - "grad_norm": 0.3984375, - "learning_rate": 1.0707745799541748e-06, - "loss": 0.0505, - "num_input_tokens_seen": 305453792, - "step": 4936 - }, - { - "epoch": 0.8972776769509981, - "grad_norm": 0.703125, - "learning_rate": 1.041395177949196e-06, - "loss": 0.0371, - "num_input_tokens_seen": 305940285, - "step": 4944 - }, - { - "epoch": 0.8987295825771325, - "grad_norm": 0.5703125, - "learning_rate": 1.0124136757502012e-06, - "loss": 0.0523, - "num_input_tokens_seen": 306438405, - "step": 4952 - }, - { - "epoch": 0.9001814882032668, - "grad_norm": 0.7734375, - "learning_rate": 9.838306816157695e-07, - "loss": 0.0405, - "num_input_tokens_seen": 306937715, - "step": 4960 - }, - { - "epoch": 0.9016333938294011, - "grad_norm": 0.60546875, - "learning_rate": 9.556467954406634e-07, - "loss": 0.0742, - "num_input_tokens_seen": 307458431, - "step": 4968 - }, - { - "epoch": 0.9030852994555354, - "grad_norm": 0.69921875, - "learning_rate": 9.278626087432529e-07, - "loss": 0.049, - "num_input_tokens_seen": 307956789, - "step": 4976 - }, - { - "epoch": 0.9045372050816697, - "grad_norm": 0.48828125, - "learning_rate": 9.004787046530694e-07, - "loss": 0.0432, - "num_input_tokens_seen": 308463995, - "step": 4984 - }, - { - "epoch": 0.905989110707804, - "grad_norm": 0.546875, - "learning_rate": 8.734956578985976e-07, - "loss": 0.057, - "num_input_tokens_seen": 308971509, - "step": 4992 - }, - { - "epoch": 0.9074410163339383, - "grad_norm": 0.953125, - "learning_rate": 8.469140347951898e-07, - "loss": 0.0461, - "num_input_tokens_seen": 309453074, - "step": 5000 - }, - { - "epoch": 0.9088929219600725, - "grad_norm": 0.703125, - "learning_rate": 8.207343932332023e-07, - "loss": 0.042, - "num_input_tokens_seen": 309930257, - "step": 5008 - }, - { - "epoch": 0.9103448275862069, - "grad_norm": 1.234375, - "learning_rate": 7.949572826662622e-07, - "loss": 0.077, - "num_input_tokens_seen": 310432591, - "step": 5016 - }, - { - "epoch": 0.9117967332123412, - "grad_norm": 0.54296875, - "learning_rate": 7.695832440997563e-07, - "loss": 0.0504, - "num_input_tokens_seen": 310899484, - "step": 5024 - }, - { - "epoch": 0.9132486388384755, - "grad_norm": 0.89453125, - "learning_rate": 7.44612810079468e-07, - "loss": 0.0577, - "num_input_tokens_seen": 311385620, - "step": 5032 - }, - { - "epoch": 0.9147005444646098, - "grad_norm": 1.4375, - "learning_rate": 7.200465046803984e-07, - "loss": 0.065, - "num_input_tokens_seen": 311886953, - "step": 5040 - }, - { - "epoch": 0.9161524500907441, - "grad_norm": 1.09375, - "learning_rate": 6.958848434957643e-07, - "loss": 0.0473, - "num_input_tokens_seen": 312387145, - "step": 5048 - }, - { - "epoch": 0.9176043557168784, - "grad_norm": 1.015625, - "learning_rate": 6.721283336261964e-07, - "loss": 0.0464, - "num_input_tokens_seen": 312865084, - "step": 5056 - }, - { - "epoch": 0.9190562613430127, - "grad_norm": 0.7421875, - "learning_rate": 6.487774736690688e-07, - "loss": 0.0462, - "num_input_tokens_seen": 313342169, - "step": 5064 - }, - { - "epoch": 0.9205081669691471, - "grad_norm": 0.671875, - "learning_rate": 6.258327537080488e-07, - "loss": 0.0407, - "num_input_tokens_seen": 313820850, - "step": 5072 - }, - { - "epoch": 0.9219600725952813, - "grad_norm": 0.703125, - "learning_rate": 6.032946553028196e-07, - "loss": 0.048, - "num_input_tokens_seen": 314294169, - "step": 5080 - }, - { - "epoch": 0.9234119782214156, - "grad_norm": 0.89453125, - "learning_rate": 5.811636514789598e-07, - "loss": 0.0393, - "num_input_tokens_seen": 314789090, - "step": 5088 - }, - { - "epoch": 0.9248638838475499, - "grad_norm": 0.5, - "learning_rate": 5.594402067180116e-07, - "loss": 0.0466, - "num_input_tokens_seen": 315317576, - "step": 5096 - }, - { - "epoch": 0.9263157894736842, - "grad_norm": 0.51171875, - "learning_rate": 5.381247769477504e-07, - "loss": 0.0336, - "num_input_tokens_seen": 315804951, - "step": 5104 - }, - { - "epoch": 0.9277676950998185, - "grad_norm": 0.63671875, - "learning_rate": 5.172178095326019e-07, - "loss": 0.0515, - "num_input_tokens_seen": 316286642, - "step": 5112 - }, - { - "epoch": 0.9292196007259528, - "grad_norm": 0.8515625, - "learning_rate": 4.967197432642579e-07, - "loss": 0.079, - "num_input_tokens_seen": 316792651, - "step": 5120 - }, - { - "epoch": 0.9306715063520871, - "grad_norm": 0.8671875, - "learning_rate": 4.7663100835246614e-07, - "loss": 0.0423, - "num_input_tokens_seen": 317277912, - "step": 5128 - }, - { - "epoch": 0.9321234119782215, - "grad_norm": 0.359375, - "learning_rate": 4.569520264159977e-07, - "loss": 0.0307, - "num_input_tokens_seen": 317761276, - "step": 5136 - }, - { - "epoch": 0.9335753176043557, - "grad_norm": 0.65625, - "learning_rate": 4.3768321047380936e-07, - "loss": 0.0443, - "num_input_tokens_seen": 318275629, - "step": 5144 - }, - { - "epoch": 0.93502722323049, - "grad_norm": 0.66796875, - "learning_rate": 4.188249649363596e-07, - "loss": 0.037, - "num_input_tokens_seen": 318764138, - "step": 5152 - }, - { - "epoch": 0.9364791288566243, - "grad_norm": 0.5078125, - "learning_rate": 4.0037768559712864e-07, - "loss": 0.0398, - "num_input_tokens_seen": 319237492, - "step": 5160 - }, - { - "epoch": 0.9364791288566243, - "eval_loss": 0.04859951138496399, - "eval_runtime": 2495.2416, - "eval_samples_per_second": 1.249, - "eval_steps_per_second": 0.156, - "num_input_tokens_seen": 319237492, - "step": 5160 - }, - { - "epoch": 0.9379310344827586, - "grad_norm": 0.50390625, - "learning_rate": 3.8234175962432284e-07, - "loss": 0.0643, - "num_input_tokens_seen": 319726771, - "step": 5168 - }, - { - "epoch": 0.9393829401088929, - "grad_norm": 0.46484375, - "learning_rate": 3.647175655527235e-07, - "loss": 0.0545, - "num_input_tokens_seen": 320207370, - "step": 5176 - }, - { - "epoch": 0.9408348457350272, - "grad_norm": 1.09375, - "learning_rate": 3.4750547327576434e-07, - "loss": 0.0645, - "num_input_tokens_seen": 320689649, - "step": 5184 - }, - { - "epoch": 0.9422867513611616, - "grad_norm": 0.5234375, - "learning_rate": 3.3070584403775754e-07, - "loss": 0.0368, - "num_input_tokens_seen": 321189372, - "step": 5192 - }, - { - "epoch": 0.9437386569872959, - "grad_norm": 0.578125, - "learning_rate": 3.143190304263177e-07, - "loss": 0.0461, - "num_input_tokens_seen": 321681717, - "step": 5200 - }, - { - "epoch": 0.9451905626134302, - "grad_norm": 0.6640625, - "learning_rate": 2.9834537636495466e-07, - "loss": 0.0348, - "num_input_tokens_seen": 322172599, - "step": 5208 - }, - { - "epoch": 0.9466424682395644, - "grad_norm": 0.578125, - "learning_rate": 2.8278521710586315e-07, - "loss": 0.0484, - "num_input_tokens_seen": 322668094, - "step": 5216 - }, - { - "epoch": 0.9480943738656987, - "grad_norm": 0.8125, - "learning_rate": 2.6763887922288236e-07, - "loss": 0.0589, - "num_input_tokens_seen": 323137080, - "step": 5224 - }, - { - "epoch": 0.949546279491833, - "grad_norm": 0.451171875, - "learning_rate": 2.5290668060464095e-07, - "loss": 0.0323, - "num_input_tokens_seen": 323645462, - "step": 5232 - }, - { - "epoch": 0.9509981851179673, - "grad_norm": 0.6171875, - "learning_rate": 2.385889304478872e-07, - "loss": 0.05, - "num_input_tokens_seen": 324137149, - "step": 5240 - }, - { - "epoch": 0.9524500907441016, - "grad_norm": 0.421875, - "learning_rate": 2.2468592925100062e-07, - "loss": 0.0392, - "num_input_tokens_seen": 324621626, - "step": 5248 - }, - { - "epoch": 0.953901996370236, - "grad_norm": 0.51171875, - "learning_rate": 2.1119796880768374e-07, - "loss": 0.0468, - "num_input_tokens_seen": 325115784, - "step": 5256 - }, - { - "epoch": 0.9553539019963703, - "grad_norm": 0.51953125, - "learning_rate": 1.9812533220083362e-07, - "loss": 0.0679, - "num_input_tokens_seen": 325614737, - "step": 5264 - }, - { - "epoch": 0.9568058076225046, - "grad_norm": 0.5078125, - "learning_rate": 1.8546829379661125e-07, - "loss": 0.07, - "num_input_tokens_seen": 326095021, - "step": 5272 - }, - { - "epoch": 0.9582577132486388, - "grad_norm": 0.423828125, - "learning_rate": 1.7322711923867475e-07, - "loss": 0.0609, - "num_input_tokens_seen": 326613882, - "step": 5280 - }, - { - "epoch": 0.9597096188747731, - "grad_norm": 0.8125, - "learning_rate": 1.6140206544260407e-07, - "loss": 0.0323, - "num_input_tokens_seen": 327087152, - "step": 5288 - }, - { - "epoch": 0.9611615245009074, - "grad_norm": 0.44921875, - "learning_rate": 1.4999338059051184e-07, - "loss": 0.0431, - "num_input_tokens_seen": 327601813, - "step": 5296 - }, - { - "epoch": 0.9626134301270417, - "grad_norm": 0.7578125, - "learning_rate": 1.3900130412583646e-07, - "loss": 0.0378, - "num_input_tokens_seen": 328093647, - "step": 5304 - }, - { - "epoch": 0.964065335753176, - "grad_norm": 1.2890625, - "learning_rate": 1.2842606674831058e-07, - "loss": 0.0777, - "num_input_tokens_seen": 328588015, - "step": 5312 - }, - { - "epoch": 0.9655172413793104, - "grad_norm": 0.71484375, - "learning_rate": 1.1826789040912723e-07, - "loss": 0.0603, - "num_input_tokens_seen": 329080878, - "step": 5320 - }, - { - "epoch": 0.9669691470054447, - "grad_norm": 0.6171875, - "learning_rate": 1.0852698830627007e-07, - "loss": 0.0433, - "num_input_tokens_seen": 329543543, - "step": 5328 - }, - { - "epoch": 0.968421052631579, - "grad_norm": 0.640625, - "learning_rate": 9.920356488005045e-08, - "loss": 0.0625, - "num_input_tokens_seen": 330031499, - "step": 5336 - }, - { - "epoch": 0.9698729582577132, - "grad_norm": 0.359375, - "learning_rate": 9.029781580881081e-08, - "loss": 0.0408, - "num_input_tokens_seen": 330508472, - "step": 5344 - }, - { - "epoch": 0.9713248638838475, - "grad_norm": 0.60546875, - "learning_rate": 8.180992800482124e-08, - "loss": 0.0362, - "num_input_tokens_seen": 330999368, - "step": 5352 - }, - { - "epoch": 0.9727767695099818, - "grad_norm": 0.80859375, - "learning_rate": 7.374007961035157e-08, - "loss": 0.0372, - "num_input_tokens_seen": 331494527, - "step": 5360 - }, - { - "epoch": 0.9742286751361161, - "grad_norm": 0.90625, - "learning_rate": 6.608843999393655e-08, - "loss": 0.0544, - "num_input_tokens_seen": 331992801, - "step": 5368 - }, - { - "epoch": 0.9756805807622505, - "grad_norm": 0.486328125, - "learning_rate": 5.885516974681871e-08, - "loss": 0.0434, - "num_input_tokens_seen": 332484019, - "step": 5376 - }, - { - "epoch": 0.9771324863883848, - "grad_norm": 0.58984375, - "learning_rate": 5.2040420679577706e-08, - "loss": 0.0463, - "num_input_tokens_seen": 332971275, - "step": 5384 - }, - { - "epoch": 0.9785843920145191, - "grad_norm": 0.365234375, - "learning_rate": 4.564433581895067e-08, - "loss": 0.0291, - "num_input_tokens_seen": 333465979, - "step": 5392 - }, - { - "epoch": 0.9800362976406534, - "grad_norm": 0.5546875, - "learning_rate": 3.966704940482347e-08, - "loss": 0.0428, - "num_input_tokens_seen": 333965786, - "step": 5400 - }, - { - "epoch": 0.9814882032667877, - "grad_norm": 0.478515625, - "learning_rate": 3.4108686887408537e-08, - "loss": 0.0382, - "num_input_tokens_seen": 334462422, - "step": 5408 - }, - { - "epoch": 0.9829401088929219, - "grad_norm": 0.58203125, - "learning_rate": 2.8969364924629205e-08, - "loss": 0.0335, - "num_input_tokens_seen": 334957763, - "step": 5416 - }, - { - "epoch": 0.9843920145190562, - "grad_norm": 0.49609375, - "learning_rate": 2.424919137965276e-08, - "loss": 0.0386, - "num_input_tokens_seen": 335453503, - "step": 5424 - }, - { - "epoch": 0.9858439201451905, - "grad_norm": 0.6640625, - "learning_rate": 1.9948265318638915e-08, - "loss": 0.0471, - "num_input_tokens_seen": 335956152, - "step": 5432 - }, - { - "epoch": 0.9872958257713249, - "grad_norm": 0.5078125, - "learning_rate": 1.606667700865261e-08, - "loss": 0.0428, - "num_input_tokens_seen": 336428666, - "step": 5440 - }, - { - "epoch": 0.9887477313974592, - "grad_norm": 0.7421875, - "learning_rate": 1.2604507915774389e-08, - "loss": 0.0409, - "num_input_tokens_seen": 336955164, - "step": 5448 - }, - { - "epoch": 0.9901996370235935, - "grad_norm": 0.80859375, - "learning_rate": 9.561830703390673e-09, - "loss": 0.0468, - "num_input_tokens_seen": 337481648, - "step": 5456 - }, - { - "epoch": 0.9916515426497278, - "grad_norm": 1.6953125, - "learning_rate": 6.938709230666085e-09, - "loss": 0.0517, - "num_input_tokens_seen": 337980342, - "step": 5464 - }, - { - "epoch": 0.993103448275862, - "grad_norm": 0.60546875, - "learning_rate": 4.7351985512067435e-09, - "loss": 0.0586, - "num_input_tokens_seen": 338476887, - "step": 5472 - }, - { - "epoch": 0.9945553539019963, - "grad_norm": 0.63671875, - "learning_rate": 2.9513449118967475e-09, - "loss": 0.0758, - "num_input_tokens_seen": 338954735, - "step": 5480 - }, - { - "epoch": 0.9960072595281306, - "grad_norm": 0.486328125, - "learning_rate": 1.5871857519411671e-09, - "loss": 0.0453, - "num_input_tokens_seen": 339472532, - "step": 5488 - }, - { - "epoch": 0.997459165154265, - "grad_norm": 0.52734375, - "learning_rate": 6.427497020644602e-10, - "loss": 0.0365, - "num_input_tokens_seen": 339948028, - "step": 5496 - }, - { - "epoch": 0.9989110707803993, - "grad_norm": 0.4609375, - "learning_rate": 1.1805658392427533e-10, - "loss": 0.0511, - "num_input_tokens_seen": 340437678, - "step": 5504 - }, - { - "epoch": 0.9989110707803993, - "eval_loss": 0.04862402379512787, - "eval_runtime": 2527.5451, - "eval_samples_per_second": 1.233, - "eval_steps_per_second": 0.154, - "num_input_tokens_seen": 340437678, - "step": 5504 - }, - { - "epoch": 1.0, - "num_input_tokens_seen": 340779614, - "step": 5510, - "total_flos": 1.7763887818171482e+19, - "train_loss": 0.06540190598601221, - "train_runtime": 392745.8674, - "train_samples_per_second": 0.786, - "train_steps_per_second": 0.014, - "train_tokens_per_second": 108.825 - } - ], - "logging_steps": 8, - "max_steps": 5510, - "num_input_tokens_seen": 340779614, - "num_train_epochs": 1, - "save_steps": 688, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 1.7763887818171482e+19, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -}