{ "best_metric": null, "best_model_checkpoint": null, "epoch": 14.629948364888124, "eval_steps": 5000, "global_step": 8500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1721170395869191, "grad_norm": 134.16664123535156, "learning_rate": 2.224770642201835e-06, "loss": 10.8697, "step": 100 }, { "epoch": 0.3442340791738382, "grad_norm": 69.6041259765625, "learning_rate": 4.5183486238532115e-06, "loss": 9.1125, "step": 200 }, { "epoch": 0.5163511187607573, "grad_norm": 103.4394760131836, "learning_rate": 6.8119266055045875e-06, "loss": 6.8873, "step": 300 }, { "epoch": 0.6884681583476764, "grad_norm": 32.237342834472656, "learning_rate": 9.08256880733945e-06, "loss": 3.1124, "step": 400 }, { "epoch": 0.8605851979345955, "grad_norm": 13.523033142089844, "learning_rate": 1.1376146788990828e-05, "loss": 1.0882, "step": 500 }, { "epoch": 1.0327022375215147, "grad_norm": 56.13002395629883, "learning_rate": 1.3669724770642203e-05, "loss": 0.869, "step": 600 }, { "epoch": 1.2048192771084336, "grad_norm": 4.41276741027832, "learning_rate": 1.5963302752293578e-05, "loss": 0.6952, "step": 700 }, { "epoch": 1.3769363166953528, "grad_norm": 1.8771318197250366, "learning_rate": 1.8256880733944955e-05, "loss": 0.5522, "step": 800 }, { "epoch": 1.549053356282272, "grad_norm": 5.131401062011719, "learning_rate": 1.9938798928981258e-05, "loss": 0.5184, "step": 900 }, { "epoch": 1.721170395869191, "grad_norm": 3.2520999908447266, "learning_rate": 1.9683794466403164e-05, "loss": 0.3996, "step": 1000 }, { "epoch": 1.8932874354561102, "grad_norm": 1.1916402578353882, "learning_rate": 1.9428790003825067e-05, "loss": 0.6316, "step": 1100 }, { "epoch": 2.0654044750430294, "grad_norm": 3.3565962314605713, "learning_rate": 1.9173785541246974e-05, "loss": 0.5352, "step": 1200 }, { "epoch": 2.2375215146299485, "grad_norm": 2.182133436203003, "learning_rate": 1.8918781078668877e-05, "loss": 0.3731, "step": 1300 }, { "epoch": 2.4096385542168672, "grad_norm": 3.537748098373413, "learning_rate": 1.8663776616090783e-05, "loss": 0.3376, "step": 1400 }, { "epoch": 2.581755593803787, "grad_norm": 2.066549777984619, "learning_rate": 1.840877215351269e-05, "loss": 0.597, "step": 1500 }, { "epoch": 2.7538726333907055, "grad_norm": 2.867453098297119, "learning_rate": 1.8153767690934592e-05, "loss": 0.5737, "step": 1600 }, { "epoch": 2.9259896729776247, "grad_norm": 0.8096536993980408, "learning_rate": 1.7898763228356495e-05, "loss": 0.7107, "step": 1700 }, { "epoch": 3.098106712564544, "grad_norm": 5.293230056762695, "learning_rate": 1.7643758765778402e-05, "loss": 0.4356, "step": 1800 }, { "epoch": 3.270223752151463, "grad_norm": 2.1939845085144043, "learning_rate": 1.7388754303200308e-05, "loss": 0.5581, "step": 1900 }, { "epoch": 3.442340791738382, "grad_norm": 2.1973116397857666, "learning_rate": 1.713374984062221e-05, "loss": 0.2012, "step": 2000 }, { "epoch": 3.6144578313253013, "grad_norm": 1.3364547491073608, "learning_rate": 1.6878745378044118e-05, "loss": 0.3906, "step": 2100 }, { "epoch": 3.7865748709122204, "grad_norm": 3.2359094619750977, "learning_rate": 1.662374091546602e-05, "loss": 0.5386, "step": 2200 }, { "epoch": 3.958691910499139, "grad_norm": 31.699663162231445, "learning_rate": 1.6368736452887927e-05, "loss": 0.2624, "step": 2300 }, { "epoch": 4.130808950086059, "grad_norm": 92.98713684082031, "learning_rate": 1.611373199030983e-05, "loss": 0.3573, "step": 2400 }, { "epoch": 4.3029259896729775, "grad_norm": 2.056157350540161, "learning_rate": 1.5858727527731736e-05, "loss": 0.4798, "step": 2500 }, { "epoch": 4.475043029259897, "grad_norm": 7.822810649871826, "learning_rate": 1.5606273109779423e-05, "loss": 0.2465, "step": 2600 }, { "epoch": 4.647160068846816, "grad_norm": 1.6002038717269897, "learning_rate": 1.5351268647201326e-05, "loss": 0.3482, "step": 2700 }, { "epoch": 4.8192771084337345, "grad_norm": 2.061086416244507, "learning_rate": 1.5096264184623233e-05, "loss": 0.1915, "step": 2800 }, { "epoch": 4.991394148020654, "grad_norm": 1.1744683980941772, "learning_rate": 1.4841259722045136e-05, "loss": 0.4617, "step": 2900 }, { "epoch": 5.163511187607573, "grad_norm": 2.5757875442504883, "learning_rate": 1.4586255259467042e-05, "loss": 0.2874, "step": 3000 }, { "epoch": 5.335628227194492, "grad_norm": 8.106232643127441, "learning_rate": 1.4331250796888947e-05, "loss": 0.4636, "step": 3100 }, { "epoch": 5.507745266781411, "grad_norm": 2.139594316482544, "learning_rate": 1.4076246334310853e-05, "loss": 0.1344, "step": 3200 }, { "epoch": 5.679862306368331, "grad_norm": 8.198427200317383, "learning_rate": 1.3821241871732756e-05, "loss": 0.3615, "step": 3300 }, { "epoch": 5.851979345955249, "grad_norm": 0.706113338470459, "learning_rate": 1.3566237409154661e-05, "loss": 0.309, "step": 3400 }, { "epoch": 6.024096385542169, "grad_norm": 1.0154913663864136, "learning_rate": 1.3311232946576567e-05, "loss": 0.1883, "step": 3500 }, { "epoch": 6.196213425129088, "grad_norm": 27.715837478637695, "learning_rate": 1.3056228483998472e-05, "loss": 0.4029, "step": 3600 }, { "epoch": 6.368330464716007, "grad_norm": 3.0514609813690186, "learning_rate": 1.2801224021420375e-05, "loss": 0.2082, "step": 3700 }, { "epoch": 6.540447504302926, "grad_norm": 3.8193249702453613, "learning_rate": 1.2546219558842281e-05, "loss": 0.1333, "step": 3800 }, { "epoch": 6.712564543889846, "grad_norm": 1.4768047332763672, "learning_rate": 1.2291215096264186e-05, "loss": 0.1509, "step": 3900 }, { "epoch": 6.884681583476764, "grad_norm": 1.5106594562530518, "learning_rate": 1.2036210633686089e-05, "loss": 0.6264, "step": 4000 }, { "epoch": 7.056798623063683, "grad_norm": 1.1024622917175293, "learning_rate": 1.1781206171107995e-05, "loss": 0.2177, "step": 4100 }, { "epoch": 7.228915662650603, "grad_norm": 0.900026798248291, "learning_rate": 1.15262017085299e-05, "loss": 0.1957, "step": 4200 }, { "epoch": 7.401032702237521, "grad_norm": 144.5244140625, "learning_rate": 1.1271197245951807e-05, "loss": 0.2887, "step": 4300 }, { "epoch": 7.573149741824441, "grad_norm": 4.466265678405762, "learning_rate": 1.101619278337371e-05, "loss": 0.2271, "step": 4400 }, { "epoch": 7.74526678141136, "grad_norm": 2.862029790878296, "learning_rate": 1.0761188320795614e-05, "loss": 0.3486, "step": 4500 }, { "epoch": 7.917383820998279, "grad_norm": 1.178603172302246, "learning_rate": 1.050618385821752e-05, "loss": 0.4429, "step": 4600 }, { "epoch": 8.089500860585199, "grad_norm": 6.430075645446777, "learning_rate": 1.0251179395639424e-05, "loss": 0.4398, "step": 4700 }, { "epoch": 8.261617900172118, "grad_norm": 6.42482852935791, "learning_rate": 9.996174933061328e-06, "loss": 0.31, "step": 4800 }, { "epoch": 8.433734939759036, "grad_norm": 6.2779622077941895, "learning_rate": 9.743720515109015e-06, "loss": 0.2045, "step": 4900 }, { "epoch": 8.605851979345955, "grad_norm": 4.175030708312988, "learning_rate": 9.48871605253092e-06, "loss": 0.2583, "step": 5000 }, { "epoch": 8.605851979345955, "eval_loss": 0.23712533712387085, "eval_runtime": 13.9021, "eval_samples_per_second": 1335.195, "eval_steps_per_second": 10.502, "step": 5000 }, { "epoch": 8.777969018932874, "grad_norm": 1.0522035360336304, "learning_rate": 9.233711589952825e-06, "loss": 0.2774, "step": 5100 }, { "epoch": 8.950086058519794, "grad_norm": 0.9467515349388123, "learning_rate": 8.98125717200051e-06, "loss": 0.1902, "step": 5200 }, { "epoch": 9.122203098106713, "grad_norm": 9.148195266723633, "learning_rate": 8.726252709422416e-06, "loss": 0.3058, "step": 5300 }, { "epoch": 9.294320137693632, "grad_norm": 9.301542282104492, "learning_rate": 8.471248246844321e-06, "loss": 0.3742, "step": 5400 }, { "epoch": 9.46643717728055, "grad_norm": 17.278079986572266, "learning_rate": 8.216243784266226e-06, "loss": 0.2972, "step": 5500 }, { "epoch": 9.638554216867469, "grad_norm": 4.043286323547363, "learning_rate": 7.96123932168813e-06, "loss": 0.3084, "step": 5600 }, { "epoch": 9.81067125645439, "grad_norm": 2.227259874343872, "learning_rate": 7.706234859110035e-06, "loss": 0.1215, "step": 5700 }, { "epoch": 9.982788296041308, "grad_norm": 0.9134290218353271, "learning_rate": 7.45123039653194e-06, "loss": 0.1876, "step": 5800 }, { "epoch": 10.154905335628227, "grad_norm": 1.4163002967834473, "learning_rate": 7.196225933953844e-06, "loss": 0.1702, "step": 5900 }, { "epoch": 10.327022375215146, "grad_norm": 1.564228892326355, "learning_rate": 6.94122147137575e-06, "loss": 0.2506, "step": 6000 }, { "epoch": 10.499139414802066, "grad_norm": 5.47558069229126, "learning_rate": 6.686217008797654e-06, "loss": 0.2852, "step": 6100 }, { "epoch": 10.671256454388985, "grad_norm": 10.801889419555664, "learning_rate": 6.431212546219559e-06, "loss": 0.2354, "step": 6200 }, { "epoch": 10.843373493975903, "grad_norm": 1.8754569292068481, "learning_rate": 6.176208083641464e-06, "loss": 0.214, "step": 6300 }, { "epoch": 11.015490533562822, "grad_norm": 2.237508773803711, "learning_rate": 5.9212036210633696e-06, "loss": 0.3815, "step": 6400 }, { "epoch": 11.187607573149743, "grad_norm": 13.412964820861816, "learning_rate": 5.666199158485273e-06, "loss": 0.0803, "step": 6500 }, { "epoch": 11.359724612736661, "grad_norm": 0.996343195438385, "learning_rate": 5.411194695907179e-06, "loss": 0.1941, "step": 6600 }, { "epoch": 11.53184165232358, "grad_norm": 64.30641174316406, "learning_rate": 5.156190233329084e-06, "loss": 0.1576, "step": 6700 }, { "epoch": 11.703958691910499, "grad_norm": 0.9045078158378601, "learning_rate": 4.901185770750988e-06, "loss": 0.2911, "step": 6800 }, { "epoch": 11.876075731497417, "grad_norm": 1.798627495765686, "learning_rate": 4.646181308172894e-06, "loss": 0.4913, "step": 6900 }, { "epoch": 12.048192771084338, "grad_norm": 6.165831565856934, "learning_rate": 4.3911768455947986e-06, "loss": 0.2759, "step": 7000 }, { "epoch": 12.220309810671257, "grad_norm": 3.460507392883301, "learning_rate": 4.136172383016703e-06, "loss": 0.2928, "step": 7100 }, { "epoch": 12.392426850258175, "grad_norm": 5.5960187911987305, "learning_rate": 3.881167920438608e-06, "loss": 0.2181, "step": 7200 }, { "epoch": 12.564543889845094, "grad_norm": 1.473883032798767, "learning_rate": 3.6261634578605126e-06, "loss": 0.1286, "step": 7300 }, { "epoch": 12.736660929432015, "grad_norm": 10.179828643798828, "learning_rate": 3.3711589952824173e-06, "loss": 0.3342, "step": 7400 }, { "epoch": 12.908777969018933, "grad_norm": 3.2593960762023926, "learning_rate": 3.1187045773301034e-06, "loss": 0.1577, "step": 7500 }, { "epoch": 13.080895008605852, "grad_norm": 1.169028878211975, "learning_rate": 2.863700114752008e-06, "loss": 0.2578, "step": 7600 }, { "epoch": 13.25301204819277, "grad_norm": 1.2204866409301758, "learning_rate": 2.6086956521739132e-06, "loss": 0.2844, "step": 7700 }, { "epoch": 13.42512908777969, "grad_norm": 1.1637088060379028, "learning_rate": 2.353691189595818e-06, "loss": 0.0917, "step": 7800 }, { "epoch": 13.59724612736661, "grad_norm": 0.6639829277992249, "learning_rate": 2.098686727017723e-06, "loss": 0.2617, "step": 7900 }, { "epoch": 13.769363166953529, "grad_norm": 4.155405044555664, "learning_rate": 1.843682264439628e-06, "loss": 0.3021, "step": 8000 }, { "epoch": 13.941480206540447, "grad_norm": 1.9663244485855103, "learning_rate": 1.5886778018615326e-06, "loss": 0.1036, "step": 8100 }, { "epoch": 14.113597246127366, "grad_norm": 32.85494613647461, "learning_rate": 1.3336733392834375e-06, "loss": 0.5471, "step": 8200 }, { "epoch": 14.285714285714286, "grad_norm": 0.6067169904708862, "learning_rate": 1.0786688767053424e-06, "loss": 0.2395, "step": 8300 }, { "epoch": 14.457831325301205, "grad_norm": 0.9747382998466492, "learning_rate": 8.236644141272474e-07, "loss": 0.2664, "step": 8400 }, { "epoch": 14.629948364888124, "grad_norm": 21.624757766723633, "learning_rate": 5.686599515491522e-07, "loss": 0.2697, "step": 8500 } ], "logging_steps": 100, "max_steps": 8715, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 128, "trial_name": null, "trial_params": null }