{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.2422360248447206, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012422360248447204, "grad_norm": 0.00018831038323696703, "learning_rate": 6.060606060606061e-06, "loss": 2.4276, "step": 2 }, { "epoch": 0.024844720496894408, "grad_norm": 0.00010692311479942873, "learning_rate": 1.8181818181818182e-05, "loss": 2.2565, "step": 4 }, { "epoch": 0.037267080745341616, "grad_norm": 6.173110887175426e-05, "learning_rate": 3.0303030303030306e-05, "loss": 1.9191, "step": 6 }, { "epoch": 0.049689440993788817, "grad_norm": 4.326488488004543e-05, "learning_rate": 4.242424242424243e-05, "loss": 1.7997, "step": 8 }, { "epoch": 0.062111801242236024, "grad_norm": 3.188664049957879e-05, "learning_rate": 5.4545454545454546e-05, "loss": 1.6296, "step": 10 }, { "epoch": 0.07453416149068323, "grad_norm": 2.6054423869936727e-05, "learning_rate": 6.666666666666667e-05, "loss": 1.5767, "step": 12 }, { "epoch": 0.08695652173913043, "grad_norm": 2.7649846742860973e-05, "learning_rate": 7.878787878787879e-05, "loss": 1.5131, "step": 14 }, { "epoch": 0.09937888198757763, "grad_norm": 1.6101763321785256e-05, "learning_rate": 9.090909090909092e-05, "loss": 1.4497, "step": 16 }, { "epoch": 0.11180124223602485, "grad_norm": 2.3354801669484004e-05, "learning_rate": 0.00010303030303030303, "loss": 1.4353, "step": 18 }, { "epoch": 0.12422360248447205, "grad_norm": 1.531304769741837e-05, "learning_rate": 0.00011515151515151516, "loss": 1.3655, "step": 20 }, { "epoch": 0.13664596273291926, "grad_norm": 9.777257218956947e-06, "learning_rate": 0.00012727272727272728, "loss": 1.3691, "step": 22 }, { "epoch": 0.14906832298136646, "grad_norm": 1.5149210412346292e-05, "learning_rate": 0.0001393939393939394, "loss": 1.3916, "step": 24 }, { "epoch": 0.16149068322981366, "grad_norm": 1.1748486940632574e-05, "learning_rate": 0.00015151515151515152, "loss": 1.341, "step": 26 }, { "epoch": 0.17391304347826086, "grad_norm": 3.532869595801458e-05, "learning_rate": 0.00016363636363636366, "loss": 1.3409, "step": 28 }, { "epoch": 0.18633540372670807, "grad_norm": 7.877199095673859e-06, "learning_rate": 0.00017575757575757578, "loss": 1.3728, "step": 30 }, { "epoch": 0.19875776397515527, "grad_norm": 1.1650959095277358e-05, "learning_rate": 0.0001878787878787879, "loss": 1.3579, "step": 32 }, { "epoch": 0.2111801242236025, "grad_norm": 3.298332740087062e-05, "learning_rate": 0.0002, "loss": 1.3835, "step": 34 }, { "epoch": 0.2236024844720497, "grad_norm": 9.103293450607453e-06, "learning_rate": 0.00019999471258384815, "loss": 1.3517, "step": 36 }, { "epoch": 0.2360248447204969, "grad_norm": 1.055269422067795e-05, "learning_rate": 0.000199978850894528, "loss": 1.3748, "step": 38 }, { "epoch": 0.2484472049689441, "grad_norm": 1.8264010577695444e-05, "learning_rate": 0.0001999524166093866, "loss": 1.383, "step": 40 }, { "epoch": 0.2608695652173913, "grad_norm": 9.094917004404124e-06, "learning_rate": 0.00019991541252380526, "loss": 1.4102, "step": 42 }, { "epoch": 0.2732919254658385, "grad_norm": 1.286895803787047e-05, "learning_rate": 0.00019986784255090397, "loss": 1.3811, "step": 44 }, { "epoch": 0.2857142857142857, "grad_norm": 1.386065014230553e-05, "learning_rate": 0.0001998097117211276, "loss": 1.3794, "step": 46 }, { "epoch": 0.2981366459627329, "grad_norm": 8.020670065889135e-06, "learning_rate": 0.00019974102618171394, "loss": 1.3584, "step": 48 }, { "epoch": 0.3105590062111801, "grad_norm": 8.217157301260158e-06, "learning_rate": 0.00019966179319604355, "loss": 1.3592, "step": 50 }, { "epoch": 0.32298136645962733, "grad_norm": 1.1515216101543047e-05, "learning_rate": 0.00019957202114287187, "loss": 1.3679, "step": 52 }, { "epoch": 0.33540372670807456, "grad_norm": 8.87443184183212e-06, "learning_rate": 0.0001994717195154429, "loss": 1.3779, "step": 54 }, { "epoch": 0.34782608695652173, "grad_norm": 7.773131073918194e-06, "learning_rate": 0.00019936089892048556, "loss": 1.3634, "step": 56 }, { "epoch": 0.36024844720496896, "grad_norm": 8.845816410030238e-06, "learning_rate": 0.00019923957107709195, "loss": 1.3312, "step": 58 }, { "epoch": 0.37267080745341613, "grad_norm": 8.347928087459877e-06, "learning_rate": 0.000199107748815478, "loss": 1.3626, "step": 60 }, { "epoch": 0.38509316770186336, "grad_norm": 9.047604180523194e-06, "learning_rate": 0.0001989654460756269, "loss": 1.3653, "step": 62 }, { "epoch": 0.39751552795031053, "grad_norm": 8.44144778966438e-06, "learning_rate": 0.00019881267790581466, "loss": 1.3712, "step": 64 }, { "epoch": 0.40993788819875776, "grad_norm": 8.453951522824354e-06, "learning_rate": 0.0001986494604610191, "loss": 1.3027, "step": 66 }, { "epoch": 0.422360248447205, "grad_norm": 7.831426046323031e-06, "learning_rate": 0.00019847581100121127, "loss": 1.3737, "step": 68 }, { "epoch": 0.43478260869565216, "grad_norm": 8.356658327102195e-06, "learning_rate": 0.00019829174788953038, "loss": 1.3522, "step": 70 }, { "epoch": 0.4472049689440994, "grad_norm": 8.671122486703098e-06, "learning_rate": 0.0001980972905903418, "loss": 1.3808, "step": 72 }, { "epoch": 0.45962732919254656, "grad_norm": 9.771009899850469e-06, "learning_rate": 0.00019789245966717883, "loss": 1.3695, "step": 74 }, { "epoch": 0.4720496894409938, "grad_norm": 6.7302303250471596e-06, "learning_rate": 0.00019767727678056805, "loss": 1.3754, "step": 76 }, { "epoch": 0.484472049689441, "grad_norm": 9.788966963242274e-06, "learning_rate": 0.00019745176468573893, "loss": 1.379, "step": 78 }, { "epoch": 0.4968944099378882, "grad_norm": 9.375480658491142e-06, "learning_rate": 0.00019721594723021732, "loss": 1.3484, "step": 80 }, { "epoch": 0.5093167701863354, "grad_norm": 3.550610927050002e-05, "learning_rate": 0.00019696984935130364, "loss": 1.377, "step": 82 }, { "epoch": 0.5217391304347826, "grad_norm": 9.707620847621001e-06, "learning_rate": 0.00019671349707343593, "loss": 1.3587, "step": 84 }, { "epoch": 0.5341614906832298, "grad_norm": 1.0621732144500129e-05, "learning_rate": 0.00019644691750543767, "loss": 1.393, "step": 86 }, { "epoch": 0.546583850931677, "grad_norm": 9.491611308476422e-06, "learning_rate": 0.0001961701388376511, "loss": 1.3715, "step": 88 }, { "epoch": 0.5590062111801242, "grad_norm": 8.982805411505979e-06, "learning_rate": 0.00019588319033895623, "loss": 1.3829, "step": 90 }, { "epoch": 0.5714285714285714, "grad_norm": 1.0132759598491248e-05, "learning_rate": 0.0001955861023536756, "loss": 1.3395, "step": 92 }, { "epoch": 0.5838509316770186, "grad_norm": 1.3477648280968424e-05, "learning_rate": 0.0001952789062983654, "loss": 1.3908, "step": 94 }, { "epoch": 0.5962732919254659, "grad_norm": 1.1435187843744643e-05, "learning_rate": 0.0001949616346584934, "loss": 1.3799, "step": 96 }, { "epoch": 0.6086956521739131, "grad_norm": 1.273245652555488e-05, "learning_rate": 0.00019463432098500337, "loss": 1.3973, "step": 98 }, { "epoch": 0.6211180124223602, "grad_norm": 1.1162846021761652e-05, "learning_rate": 0.00019429699989076746, "loss": 1.3564, "step": 100 }, { "epoch": 0.6335403726708074, "grad_norm": 1.1645292943285313e-05, "learning_rate": 0.00019394970704692566, "loss": 1.3535, "step": 102 }, { "epoch": 0.6459627329192547, "grad_norm": 3.541897240211256e-05, "learning_rate": 0.00019359247917911384, "loss": 1.3813, "step": 104 }, { "epoch": 0.6583850931677019, "grad_norm": 9.2498794401763e-06, "learning_rate": 0.00019322535406358, "loss": 1.3617, "step": 106 }, { "epoch": 0.6708074534161491, "grad_norm": 8.978020559879951e-06, "learning_rate": 0.00019284837052318933, "loss": 1.3813, "step": 108 }, { "epoch": 0.6832298136645962, "grad_norm": 1.3010416296310723e-05, "learning_rate": 0.00019246156842331918, "loss": 1.3726, "step": 110 }, { "epoch": 0.6956521739130435, "grad_norm": 1.2821891687053721e-05, "learning_rate": 0.00019206498866764288, "loss": 1.4042, "step": 112 }, { "epoch": 0.7080745341614907, "grad_norm": 9.70847668213537e-06, "learning_rate": 0.00019165867319380456, "loss": 1.4061, "step": 114 }, { "epoch": 0.7204968944099379, "grad_norm": 1.6388959920732304e-05, "learning_rate": 0.0001912426649689842, "loss": 1.3668, "step": 116 }, { "epoch": 0.7329192546583851, "grad_norm": 1.66231893672375e-05, "learning_rate": 0.00019081700798535397, "loss": 1.3712, "step": 118 }, { "epoch": 0.7453416149068323, "grad_norm": 1.864772457338404e-05, "learning_rate": 0.00019038174725542604, "loss": 1.3849, "step": 120 }, { "epoch": 0.7577639751552795, "grad_norm": 1.4855336303298827e-05, "learning_rate": 0.0001899369288072927, "loss": 1.4006, "step": 122 }, { "epoch": 0.7701863354037267, "grad_norm": 1.8388282114756294e-05, "learning_rate": 0.00018948259967975888, "loss": 1.4008, "step": 124 }, { "epoch": 0.782608695652174, "grad_norm": 2.689829489099793e-05, "learning_rate": 0.00018901880791736793, "loss": 1.3808, "step": 126 }, { "epoch": 0.7950310559006211, "grad_norm": 2.5062678105314262e-05, "learning_rate": 0.000188545602565321, "loss": 1.3948, "step": 128 }, { "epoch": 0.8074534161490683, "grad_norm": 2.1250931240501814e-05, "learning_rate": 0.0001880630336642905, "loss": 1.402, "step": 130 }, { "epoch": 0.8198757763975155, "grad_norm": 2.403008875262458e-05, "learning_rate": 0.0001875711522451284, "loss": 1.384, "step": 132 }, { "epoch": 0.8322981366459627, "grad_norm": 2.597655839053914e-05, "learning_rate": 0.00018707001032347, "loss": 1.3541, "step": 134 }, { "epoch": 0.84472049689441, "grad_norm": 2.3845692339818925e-05, "learning_rate": 0.0001865596608942331, "loss": 1.396, "step": 136 }, { "epoch": 0.8571428571428571, "grad_norm": 1.8721084416029043e-05, "learning_rate": 0.00018604015792601396, "loss": 1.3427, "step": 138 }, { "epoch": 0.8695652173913043, "grad_norm": 3.2439467759104446e-05, "learning_rate": 0.0001855115563553803, "loss": 1.376, "step": 140 }, { "epoch": 0.8819875776397516, "grad_norm": 2.3974960640771315e-05, "learning_rate": 0.0001849739120810618, "loss": 1.4019, "step": 142 }, { "epoch": 0.8944099378881988, "grad_norm": 3.250224108342081e-05, "learning_rate": 0.00018442728195803881, "loss": 1.3915, "step": 144 }, { "epoch": 0.906832298136646, "grad_norm": 2.820833833538927e-05, "learning_rate": 0.0001838717237915302, "loss": 1.3943, "step": 146 }, { "epoch": 0.9192546583850931, "grad_norm": 4.0885188354877755e-05, "learning_rate": 0.00018330729633088045, "loss": 1.3818, "step": 148 }, { "epoch": 0.9316770186335404, "grad_norm": 5.243903797236271e-05, "learning_rate": 0.00018273405926334696, "loss": 1.4112, "step": 150 }, { "epoch": 0.9440993788819876, "grad_norm": 4.928431008011103e-05, "learning_rate": 0.0001821520732077883, "loss": 4.4804, "step": 152 }, { "epoch": 0.9565217391304348, "grad_norm": 7.073425513226539e-05, "learning_rate": 0.00018156139970825391, "loss": 1.4202, "step": 154 }, { "epoch": 0.968944099378882, "grad_norm": 4.9798843974713236e-05, "learning_rate": 0.00018096210122747584, "loss": 1.4218, "step": 156 }, { "epoch": 0.9813664596273292, "grad_norm": 6.847319309599698e-05, "learning_rate": 0.0001803542411402634, "loss": 1.401, "step": 158 }, { "epoch": 0.9937888198757764, "grad_norm": 5.58948922844138e-05, "learning_rate": 0.0001797378837268015, "loss": 1.3425, "step": 160 }, { "epoch": 1.0062111801242235, "grad_norm": 4.716762123280205e-05, "learning_rate": 0.00017911309416585296, "loss": 1.3779, "step": 162 }, { "epoch": 1.0186335403726707, "grad_norm": 5.352822699933313e-05, "learning_rate": 0.0001784799385278661, "loss": 1.341, "step": 164 }, { "epoch": 1.031055900621118, "grad_norm": 5.088459511171095e-05, "learning_rate": 0.0001778384837679879, "loss": 1.4021, "step": 166 }, { "epoch": 1.0434782608695652, "grad_norm": 5.318582770996727e-05, "learning_rate": 0.00017718879771898348, "loss": 1.3597, "step": 168 }, { "epoch": 1.0559006211180124, "grad_norm": 4.925034227198921e-05, "learning_rate": 0.00017653094908406301, "loss": 1.3425, "step": 170 }, { "epoch": 1.0683229813664596, "grad_norm": 5.734206206398085e-05, "learning_rate": 0.00017586500742961653, "loss": 1.389, "step": 172 }, { "epoch": 1.0807453416149069, "grad_norm": 5.8051424275618047e-05, "learning_rate": 0.00017519104317785717, "loss": 1.3776, "step": 174 }, { "epoch": 1.093167701863354, "grad_norm": 4.752572931465693e-05, "learning_rate": 0.00017450912759937434, "loss": 1.4045, "step": 176 }, { "epoch": 1.1055900621118013, "grad_norm": 4.9184389354195446e-05, "learning_rate": 0.00017381933280559693, "loss": 2.4013, "step": 178 }, { "epoch": 1.1180124223602483, "grad_norm": 6.995958392508328e-05, "learning_rate": 0.00017312173174116762, "loss": 1.6484, "step": 180 }, { "epoch": 1.1304347826086956, "grad_norm": 4.960764272254892e-05, "learning_rate": 0.0001724163981762291, "loss": 1.3301, "step": 182 }, { "epoch": 1.1428571428571428, "grad_norm": 4.448357503861189e-05, "learning_rate": 0.0001717034066986231, "loss": 1.4058, "step": 184 }, { "epoch": 1.15527950310559, "grad_norm": 4.6940349420765415e-05, "learning_rate": 0.00017098283270600267, "loss": 1.3731, "step": 186 }, { "epoch": 1.1677018633540373, "grad_norm": 4.82694485981483e-05, "learning_rate": 0.00017025475239785919, "loss": 1.3586, "step": 188 }, { "epoch": 1.1801242236024845, "grad_norm": 4.615723810275085e-05, "learning_rate": 0.00016951924276746425, "loss": 1.3777, "step": 190 }, { "epoch": 1.1925465838509317, "grad_norm": 4.163803532719612e-05, "learning_rate": 0.00016877638159372782, "loss": 1.3974, "step": 192 }, { "epoch": 1.204968944099379, "grad_norm": 4.0616945625515655e-05, "learning_rate": 0.00016802624743297333, "loss": 1.41, "step": 194 }, { "epoch": 1.2173913043478262, "grad_norm": 4.708253254648298e-05, "learning_rate": 0.00016726891961063028, "loss": 1.3499, "step": 196 }, { "epoch": 1.2298136645962732, "grad_norm": 4.392105984152295e-05, "learning_rate": 0.00016650447821284594, "loss": 1.3619, "step": 198 }, { "epoch": 1.2422360248447206, "grad_norm": 4.838509266846813e-05, "learning_rate": 0.00016573300407801616, "loss": 1.3634, "step": 200 } ], "logging_steps": 2, "max_steps": 644, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.8994607076972954e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }