diff --git "a/checkpoint-2375/trainer_state.json" "b/checkpoint-2375/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2375/trainer_state.json" @@ -0,0 +1,4041 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 2375, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002105263157894737, + "grad_norm": NaN, + "learning_rate": 4.999991251325301e-05, + "loss": 8.8259, + "num_input_tokens_seen": 592, + "step": 5 + }, + { + "epoch": 0.004210526315789474, + "grad_norm": 3.9077489376068115, + "learning_rate": 4.9999212622950984e-05, + "loss": 6.5033, + "num_input_tokens_seen": 1296, + "step": 10 + }, + { + "epoch": 0.00631578947368421, + "grad_norm": 17.610599517822266, + "learning_rate": 4.9997353571051935e-05, + "loss": 5.5742, + "num_input_tokens_seen": 1920, + "step": 15 + }, + { + "epoch": 0.008421052631578947, + "grad_norm": 4.236301422119141, + "learning_rate": 4.999440105392749e-05, + "loss": 4.7275, + "num_input_tokens_seen": 2432, + "step": 20 + }, + { + "epoch": 0.010526315789473684, + "grad_norm": 4.381397247314453, + "learning_rate": 4.999035520073032e-05, + "loss": 3.5477, + "num_input_tokens_seen": 3168, + "step": 25 + }, + { + "epoch": 0.01263157894736842, + "grad_norm": 3.554992198944092, + "learning_rate": 4.998521618843914e-05, + "loss": 3.6426, + "num_input_tokens_seen": 3920, + "step": 30 + }, + { + "epoch": 0.014736842105263158, + "grad_norm": 4.44541072845459, + "learning_rate": 4.9978984241851013e-05, + "loss": 3.8361, + "num_input_tokens_seen": 4432, + "step": 35 + }, + { + "epoch": 0.016842105263157894, + "grad_norm": 2.611109495162964, + "learning_rate": 4.997165963357145e-05, + "loss": 3.6522, + "num_input_tokens_seen": 5008, + "step": 40 + }, + { + "epoch": 0.018947368421052633, + "grad_norm": 2.0907833576202393, + "learning_rate": 4.996324268400256e-05, + "loss": 3.1776, + "num_input_tokens_seen": 5936, + "step": 45 + }, + { + "epoch": 0.021052631578947368, + "grad_norm": 3.9562370777130127, + "learning_rate": 4.995373376132898e-05, + "loss": 3.9684, + "num_input_tokens_seen": 6512, + "step": 50 + }, + { + "epoch": 0.023157894736842106, + "grad_norm": 4.664333820343018, + "learning_rate": 4.9943133281501795e-05, + "loss": 3.1654, + "num_input_tokens_seen": 7104, + "step": 55 + }, + { + "epoch": 0.02526315789473684, + "grad_norm": 2.5006332397460938, + "learning_rate": 4.993144170822032e-05, + "loss": 3.374, + "num_input_tokens_seen": 7712, + "step": 60 + }, + { + "epoch": 0.02736842105263158, + "grad_norm": 2.9427053928375244, + "learning_rate": 4.9918659552911864e-05, + "loss": 3.2508, + "num_input_tokens_seen": 8320, + "step": 65 + }, + { + "epoch": 0.029473684210526315, + "grad_norm": 2.8679909706115723, + "learning_rate": 4.9904787374709305e-05, + "loss": 3.6347, + "num_input_tokens_seen": 8848, + "step": 70 + }, + { + "epoch": 0.031578947368421054, + "grad_norm": 2.2228591442108154, + "learning_rate": 4.988982578042665e-05, + "loss": 3.6148, + "num_input_tokens_seen": 9472, + "step": 75 + }, + { + "epoch": 0.03368421052631579, + "grad_norm": 3.230189561843872, + "learning_rate": 4.987377542453251e-05, + "loss": 3.3735, + "num_input_tokens_seen": 10064, + "step": 80 + }, + { + "epoch": 0.035789473684210524, + "grad_norm": 4.476248741149902, + "learning_rate": 4.9856637009121434e-05, + "loss": 3.5765, + "num_input_tokens_seen": 10656, + "step": 85 + }, + { + "epoch": 0.037894736842105266, + "grad_norm": 3.675215244293213, + "learning_rate": 4.9838411283883245e-05, + "loss": 3.1841, + "num_input_tokens_seen": 11168, + "step": 90 + }, + { + "epoch": 0.04, + "grad_norm": 4.318700313568115, + "learning_rate": 4.9819099046070206e-05, + "loss": 3.3461, + "num_input_tokens_seen": 11808, + "step": 95 + }, + { + "epoch": 0.042105263157894736, + "grad_norm": 3.9966771602630615, + "learning_rate": 4.979870114046217e-05, + "loss": 3.3828, + "num_input_tokens_seen": 12336, + "step": 100 + }, + { + "epoch": 0.042105263157894736, + "eval_loss": 3.3995444774627686, + "eval_runtime": 21.4256, + "eval_samples_per_second": 23.337, + "eval_steps_per_second": 11.668, + "num_input_tokens_seen": 12336, + "step": 100 + }, + { + "epoch": 0.04421052631578947, + "grad_norm": 4.8785295486450195, + "learning_rate": 4.977721845932959e-05, + "loss": 3.7097, + "num_input_tokens_seen": 12992, + "step": 105 + }, + { + "epoch": 0.04631578947368421, + "grad_norm": 3.408017635345459, + "learning_rate": 4.975465194239454e-05, + "loss": 2.9564, + "num_input_tokens_seen": 13744, + "step": 110 + }, + { + "epoch": 0.04842105263157895, + "grad_norm": 3.588205099105835, + "learning_rate": 4.973100257678958e-05, + "loss": 3.4605, + "num_input_tokens_seen": 14304, + "step": 115 + }, + { + "epoch": 0.05052631578947368, + "grad_norm": 3.157773494720459, + "learning_rate": 4.970627139701458e-05, + "loss": 4.0948, + "num_input_tokens_seen": 14832, + "step": 120 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 1.8902894258499146, + "learning_rate": 4.9680459484891445e-05, + "loss": 3.0968, + "num_input_tokens_seen": 15584, + "step": 125 + }, + { + "epoch": 0.05473684210526316, + "grad_norm": 1.1550252437591553, + "learning_rate": 4.9653567969516844e-05, + "loss": 3.2226, + "num_input_tokens_seen": 16336, + "step": 130 + }, + { + "epoch": 0.056842105263157895, + "grad_norm": 3.8178346157073975, + "learning_rate": 4.962559802721277e-05, + "loss": 3.5534, + "num_input_tokens_seen": 16880, + "step": 135 + }, + { + "epoch": 0.05894736842105263, + "grad_norm": 1.2998478412628174, + "learning_rate": 4.959655088147511e-05, + "loss": 2.8113, + "num_input_tokens_seen": 17728, + "step": 140 + }, + { + "epoch": 0.061052631578947365, + "grad_norm": 4.0586066246032715, + "learning_rate": 4.956642780292012e-05, + "loss": 3.3134, + "num_input_tokens_seen": 18336, + "step": 145 + }, + { + "epoch": 0.06315789473684211, + "grad_norm": 5.839606285095215, + "learning_rate": 4.9535230109228844e-05, + "loss": 3.4131, + "num_input_tokens_seen": 19008, + "step": 150 + }, + { + "epoch": 0.06526315789473684, + "grad_norm": 4.800465106964111, + "learning_rate": 4.950295916508947e-05, + "loss": 3.4343, + "num_input_tokens_seen": 19520, + "step": 155 + }, + { + "epoch": 0.06736842105263158, + "grad_norm": 2.6969552040100098, + "learning_rate": 4.9469616382137635e-05, + "loss": 3.0208, + "num_input_tokens_seen": 20192, + "step": 160 + }, + { + "epoch": 0.06947368421052631, + "grad_norm": 6.1990203857421875, + "learning_rate": 4.943520321889468e-05, + "loss": 3.652, + "num_input_tokens_seen": 20848, + "step": 165 + }, + { + "epoch": 0.07157894736842105, + "grad_norm": 2.14052152633667, + "learning_rate": 4.939972118070384e-05, + "loss": 3.0198, + "num_input_tokens_seen": 21520, + "step": 170 + }, + { + "epoch": 0.07368421052631578, + "grad_norm": 3.615708351135254, + "learning_rate": 4.9363171819664434e-05, + "loss": 2.7798, + "num_input_tokens_seen": 22240, + "step": 175 + }, + { + "epoch": 0.07578947368421053, + "grad_norm": 5.227450370788574, + "learning_rate": 4.932555673456389e-05, + "loss": 3.482, + "num_input_tokens_seen": 22752, + "step": 180 + }, + { + "epoch": 0.07789473684210527, + "grad_norm": 4.943465709686279, + "learning_rate": 4.9286877570807915e-05, + "loss": 2.8097, + "num_input_tokens_seen": 23408, + "step": 185 + }, + { + "epoch": 0.08, + "grad_norm": 3.956165075302124, + "learning_rate": 4.924713602034842e-05, + "loss": 3.7867, + "num_input_tokens_seen": 24048, + "step": 190 + }, + { + "epoch": 0.08210526315789474, + "grad_norm": 4.213566303253174, + "learning_rate": 4.920633382160955e-05, + "loss": 3.4609, + "num_input_tokens_seen": 24624, + "step": 195 + }, + { + "epoch": 0.08421052631578947, + "grad_norm": 4.879347324371338, + "learning_rate": 4.9164472759411695e-05, + "loss": 3.7301, + "num_input_tokens_seen": 25168, + "step": 200 + }, + { + "epoch": 0.08421052631578947, + "eval_loss": 3.3119306564331055, + "eval_runtime": 22.2022, + "eval_samples_per_second": 22.52, + "eval_steps_per_second": 11.26, + "num_input_tokens_seen": 25168, + "step": 200 + }, + { + "epoch": 0.0863157894736842, + "grad_norm": 2.6285791397094727, + "learning_rate": 4.91215546648933e-05, + "loss": 3.1267, + "num_input_tokens_seen": 25936, + "step": 205 + }, + { + "epoch": 0.08842105263157894, + "grad_norm": 6.523679733276367, + "learning_rate": 4.907758141543086e-05, + "loss": 3.3567, + "num_input_tokens_seen": 26480, + "step": 210 + }, + { + "epoch": 0.09052631578947369, + "grad_norm": 3.7712221145629883, + "learning_rate": 4.903255493455676e-05, + "loss": 3.0875, + "num_input_tokens_seen": 27072, + "step": 215 + }, + { + "epoch": 0.09263157894736843, + "grad_norm": 2.6647517681121826, + "learning_rate": 4.898647719187515e-05, + "loss": 2.9579, + "num_input_tokens_seen": 27760, + "step": 220 + }, + { + "epoch": 0.09473684210526316, + "grad_norm": 5.09914493560791, + "learning_rate": 4.8939350202975756e-05, + "loss": 3.0374, + "num_input_tokens_seen": 28496, + "step": 225 + }, + { + "epoch": 0.0968421052631579, + "grad_norm": 3.115642786026001, + "learning_rate": 4.889117602934574e-05, + "loss": 2.9435, + "num_input_tokens_seen": 29232, + "step": 230 + }, + { + "epoch": 0.09894736842105263, + "grad_norm": 4.625009536743164, + "learning_rate": 4.884195677827952e-05, + "loss": 3.8317, + "num_input_tokens_seen": 29792, + "step": 235 + }, + { + "epoch": 0.10105263157894737, + "grad_norm": 4.469867706298828, + "learning_rate": 4.879169460278659e-05, + "loss": 3.4246, + "num_input_tokens_seen": 30320, + "step": 240 + }, + { + "epoch": 0.1031578947368421, + "grad_norm": 4.335468292236328, + "learning_rate": 4.874039170149733e-05, + "loss": 3.2556, + "num_input_tokens_seen": 30800, + "step": 245 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 4.776815414428711, + "learning_rate": 4.868805031856686e-05, + "loss": 3.6451, + "num_input_tokens_seen": 31440, + "step": 250 + }, + { + "epoch": 0.10736842105263159, + "grad_norm": 7.101145267486572, + "learning_rate": 4.863467274357679e-05, + "loss": 2.991, + "num_input_tokens_seen": 32032, + "step": 255 + }, + { + "epoch": 0.10947368421052632, + "grad_norm": 5.720299243927002, + "learning_rate": 4.858026131143522e-05, + "loss": 3.5287, + "num_input_tokens_seen": 32544, + "step": 260 + }, + { + "epoch": 0.11157894736842106, + "grad_norm": 1.5953224897384644, + "learning_rate": 4.8524818402274415e-05, + "loss": 3.1499, + "num_input_tokens_seen": 33216, + "step": 265 + }, + { + "epoch": 0.11368421052631579, + "grad_norm": 3.63209867477417, + "learning_rate": 4.846834644134686e-05, + "loss": 2.6614, + "num_input_tokens_seen": 33824, + "step": 270 + }, + { + "epoch": 0.11578947368421053, + "grad_norm": 5.393692493438721, + "learning_rate": 4.841084789891905e-05, + "loss": 3.2591, + "num_input_tokens_seen": 34464, + "step": 275 + }, + { + "epoch": 0.11789473684210526, + "grad_norm": 3.1078500747680664, + "learning_rate": 4.8352325290163526e-05, + "loss": 2.8512, + "num_input_tokens_seen": 35216, + "step": 280 + }, + { + "epoch": 0.12, + "grad_norm": 5.469326972961426, + "learning_rate": 4.829278117504876e-05, + "loss": 3.0881, + "num_input_tokens_seen": 35808, + "step": 285 + }, + { + "epoch": 0.12210526315789473, + "grad_norm": 6.58186674118042, + "learning_rate": 4.823221815822725e-05, + "loss": 3.1116, + "num_input_tokens_seen": 36480, + "step": 290 + }, + { + "epoch": 0.12421052631578948, + "grad_norm": 3.288723945617676, + "learning_rate": 4.817063888892155e-05, + "loss": 3.0578, + "num_input_tokens_seen": 37040, + "step": 295 + }, + { + "epoch": 0.12631578947368421, + "grad_norm": 4.704117298126221, + "learning_rate": 4.810804606080839e-05, + "loss": 2.7913, + "num_input_tokens_seen": 38256, + "step": 300 + }, + { + "epoch": 0.12631578947368421, + "eval_loss": 3.294227123260498, + "eval_runtime": 21.4039, + "eval_samples_per_second": 23.36, + "eval_steps_per_second": 11.68, + "num_input_tokens_seen": 38256, + "step": 300 + }, + { + "epoch": 0.12842105263157894, + "grad_norm": 5.600541591644287, + "learning_rate": 4.804444241190084e-05, + "loss": 3.3033, + "num_input_tokens_seen": 38832, + "step": 305 + }, + { + "epoch": 0.13052631578947368, + "grad_norm": 5.58181619644165, + "learning_rate": 4.797983072442855e-05, + "loss": 2.9879, + "num_input_tokens_seen": 39632, + "step": 310 + }, + { + "epoch": 0.13263157894736843, + "grad_norm": 4.971944332122803, + "learning_rate": 4.791421382471605e-05, + "loss": 3.3603, + "num_input_tokens_seen": 40320, + "step": 315 + }, + { + "epoch": 0.13473684210526315, + "grad_norm": 4.8193230628967285, + "learning_rate": 4.78475945830591e-05, + "loss": 3.2463, + "num_input_tokens_seen": 40976, + "step": 320 + }, + { + "epoch": 0.1368421052631579, + "grad_norm": 1.9304226636886597, + "learning_rate": 4.777997591359914e-05, + "loss": 3.3801, + "num_input_tokens_seen": 41712, + "step": 325 + }, + { + "epoch": 0.13894736842105262, + "grad_norm": 1.8638708591461182, + "learning_rate": 4.7711360774195835e-05, + "loss": 3.3647, + "num_input_tokens_seen": 42352, + "step": 330 + }, + { + "epoch": 0.14105263157894737, + "grad_norm": 6.234751224517822, + "learning_rate": 4.764175216629766e-05, + "loss": 3.4926, + "num_input_tokens_seen": 43040, + "step": 335 + }, + { + "epoch": 0.1431578947368421, + "grad_norm": 5.5220136642456055, + "learning_rate": 4.7571153134810634e-05, + "loss": 2.8625, + "num_input_tokens_seen": 43680, + "step": 340 + }, + { + "epoch": 0.14526315789473684, + "grad_norm": 3.765749216079712, + "learning_rate": 4.749956676796507e-05, + "loss": 3.7596, + "num_input_tokens_seen": 44256, + "step": 345 + }, + { + "epoch": 0.14736842105263157, + "grad_norm": 4.542320251464844, + "learning_rate": 4.742699619718061e-05, + "loss": 2.8841, + "num_input_tokens_seen": 45104, + "step": 350 + }, + { + "epoch": 0.14947368421052631, + "grad_norm": 6.3029632568359375, + "learning_rate": 4.735344459692909e-05, + "loss": 3.1136, + "num_input_tokens_seen": 45696, + "step": 355 + }, + { + "epoch": 0.15157894736842106, + "grad_norm": 7.161231517791748, + "learning_rate": 4.7278915184595774e-05, + "loss": 3.5087, + "num_input_tokens_seen": 46240, + "step": 360 + }, + { + "epoch": 0.15368421052631578, + "grad_norm": 6.039397239685059, + "learning_rate": 4.720341122033862e-05, + "loss": 3.0394, + "num_input_tokens_seen": 46928, + "step": 365 + }, + { + "epoch": 0.15578947368421053, + "grad_norm": 3.1546213626861572, + "learning_rate": 4.71269360069456e-05, + "loss": 3.3537, + "num_input_tokens_seen": 47552, + "step": 370 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 2.607238531112671, + "learning_rate": 4.704949288969031e-05, + "loss": 3.3383, + "num_input_tokens_seen": 48176, + "step": 375 + }, + { + "epoch": 0.16, + "grad_norm": 3.171997308731079, + "learning_rate": 4.697108525618556e-05, + "loss": 2.6449, + "num_input_tokens_seen": 48928, + "step": 380 + }, + { + "epoch": 0.16210526315789472, + "grad_norm": 5.732158660888672, + "learning_rate": 4.6891716536235275e-05, + "loss": 2.9166, + "num_input_tokens_seen": 49552, + "step": 385 + }, + { + "epoch": 0.16421052631578947, + "grad_norm": 6.06363582611084, + "learning_rate": 4.681139020168436e-05, + "loss": 3.689, + "num_input_tokens_seen": 50064, + "step": 390 + }, + { + "epoch": 0.16631578947368422, + "grad_norm": 6.7919487953186035, + "learning_rate": 4.673010976626692e-05, + "loss": 3.0367, + "num_input_tokens_seen": 50800, + "step": 395 + }, + { + "epoch": 0.16842105263157894, + "grad_norm": 6.127151012420654, + "learning_rate": 4.664787878545252e-05, + "loss": 2.9739, + "num_input_tokens_seen": 51344, + "step": 400 + }, + { + "epoch": 0.16842105263157894, + "eval_loss": 3.2370829582214355, + "eval_runtime": 20.9481, + "eval_samples_per_second": 23.869, + "eval_steps_per_second": 11.934, + "num_input_tokens_seen": 51344, + "step": 400 + }, + { + "epoch": 0.1705263157894737, + "grad_norm": 4.9031853675842285, + "learning_rate": 4.656470085629062e-05, + "loss": 2.9691, + "num_input_tokens_seen": 52208, + "step": 405 + }, + { + "epoch": 0.1726315789473684, + "grad_norm": 2.7085535526275635, + "learning_rate": 4.648057961725334e-05, + "loss": 3.5849, + "num_input_tokens_seen": 52784, + "step": 410 + }, + { + "epoch": 0.17473684210526316, + "grad_norm": 2.7885546684265137, + "learning_rate": 4.639551874807617e-05, + "loss": 3.0472, + "num_input_tokens_seen": 53488, + "step": 415 + }, + { + "epoch": 0.17684210526315788, + "grad_norm": 4.653199672698975, + "learning_rate": 4.630952196959709e-05, + "loss": 3.3394, + "num_input_tokens_seen": 54096, + "step": 420 + }, + { + "epoch": 0.17894736842105263, + "grad_norm": 6.162243366241455, + "learning_rate": 4.622259304359378e-05, + "loss": 3.1293, + "num_input_tokens_seen": 54832, + "step": 425 + }, + { + "epoch": 0.18105263157894738, + "grad_norm": 6.41090202331543, + "learning_rate": 4.613473577261908e-05, + "loss": 3.3758, + "num_input_tokens_seen": 55328, + "step": 430 + }, + { + "epoch": 0.1831578947368421, + "grad_norm": 2.4367990493774414, + "learning_rate": 4.604595399983463e-05, + "loss": 2.7601, + "num_input_tokens_seen": 56064, + "step": 435 + }, + { + "epoch": 0.18526315789473685, + "grad_norm": 3.071305751800537, + "learning_rate": 4.59562516088428e-05, + "loss": 3.4554, + "num_input_tokens_seen": 56624, + "step": 440 + }, + { + "epoch": 0.18736842105263157, + "grad_norm": 5.66983699798584, + "learning_rate": 4.5865632523516754e-05, + "loss": 3.1669, + "num_input_tokens_seen": 57216, + "step": 445 + }, + { + "epoch": 0.18947368421052632, + "grad_norm": 4.650711536407471, + "learning_rate": 4.577410070782885e-05, + "loss": 3.0806, + "num_input_tokens_seen": 57936, + "step": 450 + }, + { + "epoch": 0.19157894736842104, + "grad_norm": 5.459670066833496, + "learning_rate": 4.5681660165677236e-05, + "loss": 3.4798, + "num_input_tokens_seen": 58528, + "step": 455 + }, + { + "epoch": 0.1936842105263158, + "grad_norm": 6.794463157653809, + "learning_rate": 4.558831494071069e-05, + "loss": 3.1879, + "num_input_tokens_seen": 59152, + "step": 460 + }, + { + "epoch": 0.1957894736842105, + "grad_norm": 5.128958702087402, + "learning_rate": 4.549406911615174e-05, + "loss": 2.96, + "num_input_tokens_seen": 59904, + "step": 465 + }, + { + "epoch": 0.19789473684210526, + "grad_norm": 5.253583908081055, + "learning_rate": 4.539892681461808e-05, + "loss": 3.4593, + "num_input_tokens_seen": 60432, + "step": 470 + }, + { + "epoch": 0.2, + "grad_norm": 9.341363906860352, + "learning_rate": 4.530289219794218e-05, + "loss": 3.1928, + "num_input_tokens_seen": 61104, + "step": 475 + }, + { + "epoch": 0.20210526315789473, + "grad_norm": 2.279937267303467, + "learning_rate": 4.5205969466989304e-05, + "loss": 2.6037, + "num_input_tokens_seen": 61968, + "step": 480 + }, + { + "epoch": 0.20421052631578948, + "grad_norm": 5.586785793304443, + "learning_rate": 4.5108162861473665e-05, + "loss": 2.8718, + "num_input_tokens_seen": 62832, + "step": 485 + }, + { + "epoch": 0.2063157894736842, + "grad_norm": 6.161190509796143, + "learning_rate": 4.500947665977306e-05, + "loss": 2.8371, + "num_input_tokens_seen": 63360, + "step": 490 + }, + { + "epoch": 0.20842105263157895, + "grad_norm": 5.741548538208008, + "learning_rate": 4.490991517874165e-05, + "loss": 3.188, + "num_input_tokens_seen": 63968, + "step": 495 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 5.338064670562744, + "learning_rate": 4.480948277352113e-05, + "loss": 2.4265, + "num_input_tokens_seen": 64896, + "step": 500 + }, + { + "epoch": 0.21052631578947367, + "eval_loss": 3.229081153869629, + "eval_runtime": 22.1222, + "eval_samples_per_second": 22.602, + "eval_steps_per_second": 11.301, + "num_input_tokens_seen": 64896, + "step": 500 + }, + { + "epoch": 0.21263157894736842, + "grad_norm": 6.277206897735596, + "learning_rate": 4.470818383735027e-05, + "loss": 3.0783, + "num_input_tokens_seen": 65488, + "step": 505 + }, + { + "epoch": 0.21473684210526317, + "grad_norm": 4.257750034332275, + "learning_rate": 4.460602280137271e-05, + "loss": 3.1631, + "num_input_tokens_seen": 66144, + "step": 510 + }, + { + "epoch": 0.2168421052631579, + "grad_norm": 3.1334800720214844, + "learning_rate": 4.45030041344431e-05, + "loss": 3.0374, + "num_input_tokens_seen": 66688, + "step": 515 + }, + { + "epoch": 0.21894736842105264, + "grad_norm": 5.618471145629883, + "learning_rate": 4.4399132342931684e-05, + "loss": 2.7939, + "num_input_tokens_seen": 67296, + "step": 520 + }, + { + "epoch": 0.22105263157894736, + "grad_norm": 5.193708896636963, + "learning_rate": 4.4294411970527116e-05, + "loss": 3.0484, + "num_input_tokens_seen": 67984, + "step": 525 + }, + { + "epoch": 0.2231578947368421, + "grad_norm": 6.637373447418213, + "learning_rate": 4.418884759803773e-05, + "loss": 2.873, + "num_input_tokens_seen": 68560, + "step": 530 + }, + { + "epoch": 0.22526315789473683, + "grad_norm": 2.414778709411621, + "learning_rate": 4.408244384319116e-05, + "loss": 2.8347, + "num_input_tokens_seen": 69360, + "step": 535 + }, + { + "epoch": 0.22736842105263158, + "grad_norm": 5.371325969696045, + "learning_rate": 4.397520536043234e-05, + "loss": 3.0505, + "num_input_tokens_seen": 69888, + "step": 540 + }, + { + "epoch": 0.2294736842105263, + "grad_norm": 8.659576416015625, + "learning_rate": 4.386713684071992e-05, + "loss": 3.4455, + "num_input_tokens_seen": 70608, + "step": 545 + }, + { + "epoch": 0.23157894736842105, + "grad_norm": 5.507298469543457, + "learning_rate": 4.375824301132103e-05, + "loss": 3.1131, + "num_input_tokens_seen": 71184, + "step": 550 + }, + { + "epoch": 0.2336842105263158, + "grad_norm": 7.195556163787842, + "learning_rate": 4.3648528635604556e-05, + "loss": 2.922, + "num_input_tokens_seen": 71728, + "step": 555 + }, + { + "epoch": 0.23578947368421052, + "grad_norm": 2.615894079208374, + "learning_rate": 4.35379985128327e-05, + "loss": 3.0054, + "num_input_tokens_seen": 72400, + "step": 560 + }, + { + "epoch": 0.23789473684210527, + "grad_norm": 4.415563583374023, + "learning_rate": 4.3426657477951105e-05, + "loss": 2.7888, + "num_input_tokens_seen": 73024, + "step": 565 + }, + { + "epoch": 0.24, + "grad_norm": 3.531663417816162, + "learning_rate": 4.331451040137734e-05, + "loss": 2.649, + "num_input_tokens_seen": 73792, + "step": 570 + }, + { + "epoch": 0.24210526315789474, + "grad_norm": 5.618805408477783, + "learning_rate": 4.320156218878783e-05, + "loss": 3.2557, + "num_input_tokens_seen": 74432, + "step": 575 + }, + { + "epoch": 0.24421052631578946, + "grad_norm": 2.6014719009399414, + "learning_rate": 4.308781778090329e-05, + "loss": 3.3816, + "num_input_tokens_seen": 75088, + "step": 580 + }, + { + "epoch": 0.2463157894736842, + "grad_norm": 7.288875102996826, + "learning_rate": 4.297328215327261e-05, + "loss": 3.619, + "num_input_tokens_seen": 75760, + "step": 585 + }, + { + "epoch": 0.24842105263157896, + "grad_norm": 7.931623458862305, + "learning_rate": 4.285796031605519e-05, + "loss": 3.4897, + "num_input_tokens_seen": 76288, + "step": 590 + }, + { + "epoch": 0.2505263157894737, + "grad_norm": 2.327460527420044, + "learning_rate": 4.274185731380178e-05, + "loss": 3.3372, + "num_input_tokens_seen": 76912, + "step": 595 + }, + { + "epoch": 0.25263157894736843, + "grad_norm": 6.344222545623779, + "learning_rate": 4.262497822523381e-05, + "loss": 3.2421, + "num_input_tokens_seen": 77584, + "step": 600 + }, + { + "epoch": 0.25263157894736843, + "eval_loss": 3.197930097579956, + "eval_runtime": 22.9116, + "eval_samples_per_second": 21.823, + "eval_steps_per_second": 10.912, + "num_input_tokens_seen": 77584, + "step": 600 + }, + { + "epoch": 0.25473684210526315, + "grad_norm": 7.213596820831299, + "learning_rate": 4.2507328163021264e-05, + "loss": 3.5072, + "num_input_tokens_seen": 78144, + "step": 605 + }, + { + "epoch": 0.25684210526315787, + "grad_norm": 4.491611480712891, + "learning_rate": 4.241265646947705e-05, + "loss": 3.2118, + "num_input_tokens_seen": 78672, + "step": 610 + }, + { + "epoch": 0.25894736842105265, + "grad_norm": 5.271435260772705, + "learning_rate": 4.229363164613873e-05, + "loss": 3.6353, + "num_input_tokens_seen": 79232, + "step": 615 + }, + { + "epoch": 0.26105263157894737, + "grad_norm": 2.8192007541656494, + "learning_rate": 4.217385034332861e-05, + "loss": 3.0386, + "num_input_tokens_seen": 79872, + "step": 620 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 6.350308418273926, + "learning_rate": 4.205331780066892e-05, + "loss": 3.7191, + "num_input_tokens_seen": 80368, + "step": 625 + }, + { + "epoch": 0.26526315789473687, + "grad_norm": 4.168824672698975, + "learning_rate": 4.193203929064353e-05, + "loss": 2.6762, + "num_input_tokens_seen": 81024, + "step": 630 + }, + { + "epoch": 0.2673684210526316, + "grad_norm": 2.9309401512145996, + "learning_rate": 4.181002011836737e-05, + "loss": 3.0327, + "num_input_tokens_seen": 81792, + "step": 635 + }, + { + "epoch": 0.2694736842105263, + "grad_norm": 5.757573127746582, + "learning_rate": 4.1687265621354314e-05, + "loss": 3.6899, + "num_input_tokens_seen": 82336, + "step": 640 + }, + { + "epoch": 0.27157894736842103, + "grad_norm": 6.026342868804932, + "learning_rate": 4.156378116928375e-05, + "loss": 2.6871, + "num_input_tokens_seen": 83040, + "step": 645 + }, + { + "epoch": 0.2736842105263158, + "grad_norm": 6.627229690551758, + "learning_rate": 4.143957216376561e-05, + "loss": 3.0722, + "num_input_tokens_seen": 83776, + "step": 650 + }, + { + "epoch": 0.27578947368421053, + "grad_norm": 3.4329299926757812, + "learning_rate": 4.131464403810422e-05, + "loss": 2.9647, + "num_input_tokens_seen": 84384, + "step": 655 + }, + { + "epoch": 0.27789473684210525, + "grad_norm": 7.016841888427734, + "learning_rate": 4.118900225706047e-05, + "loss": 3.4247, + "num_input_tokens_seen": 84992, + "step": 660 + }, + { + "epoch": 0.28, + "grad_norm": 4.475071907043457, + "learning_rate": 4.106265231661292e-05, + "loss": 3.0979, + "num_input_tokens_seen": 85600, + "step": 665 + }, + { + "epoch": 0.28210526315789475, + "grad_norm": 3.230617046356201, + "learning_rate": 4.093559974371725e-05, + "loss": 3.0953, + "num_input_tokens_seen": 86208, + "step": 670 + }, + { + "epoch": 0.28421052631578947, + "grad_norm": 6.390127658843994, + "learning_rate": 4.0807850096064605e-05, + "loss": 3.0413, + "num_input_tokens_seen": 86784, + "step": 675 + }, + { + "epoch": 0.2863157894736842, + "grad_norm": 7.769253730773926, + "learning_rate": 4.067940896183843e-05, + "loss": 3.3108, + "num_input_tokens_seen": 87360, + "step": 680 + }, + { + "epoch": 0.28842105263157897, + "grad_norm": 3.7311196327209473, + "learning_rate": 4.0550281959470023e-05, + "loss": 3.2254, + "num_input_tokens_seen": 88048, + "step": 685 + }, + { + "epoch": 0.2905263157894737, + "grad_norm": 6.714389324188232, + "learning_rate": 4.042047473739278e-05, + "loss": 2.8943, + "num_input_tokens_seen": 88688, + "step": 690 + }, + { + "epoch": 0.2926315789473684, + "grad_norm": 6.58662223815918, + "learning_rate": 4.028999297379511e-05, + "loss": 2.8755, + "num_input_tokens_seen": 89408, + "step": 695 + }, + { + "epoch": 0.29473684210526313, + "grad_norm": 7.6045403480529785, + "learning_rate": 4.0158842376372064e-05, + "loss": 3.1406, + "num_input_tokens_seen": 89952, + "step": 700 + }, + { + "epoch": 0.29473684210526313, + "eval_loss": 3.176912784576416, + "eval_runtime": 21.4574, + "eval_samples_per_second": 23.302, + "eval_steps_per_second": 11.651, + "num_input_tokens_seen": 89952, + "step": 700 + }, + { + "epoch": 0.2968421052631579, + "grad_norm": 7.232762813568115, + "learning_rate": 4.002702868207563e-05, + "loss": 2.9851, + "num_input_tokens_seen": 90656, + "step": 705 + }, + { + "epoch": 0.29894736842105263, + "grad_norm": 1.8625717163085938, + "learning_rate": 3.9894557656863823e-05, + "loss": 3.2503, + "num_input_tokens_seen": 91616, + "step": 710 + }, + { + "epoch": 0.30105263157894735, + "grad_norm": 5.469936370849609, + "learning_rate": 3.976143509544843e-05, + "loss": 2.9929, + "num_input_tokens_seen": 92128, + "step": 715 + }, + { + "epoch": 0.3031578947368421, + "grad_norm": 6.398062229156494, + "learning_rate": 3.9627666821041545e-05, + "loss": 3.2811, + "num_input_tokens_seen": 92688, + "step": 720 + }, + { + "epoch": 0.30526315789473685, + "grad_norm": 7.9203715324401855, + "learning_rate": 3.949325868510083e-05, + "loss": 3.6912, + "num_input_tokens_seen": 93312, + "step": 725 + }, + { + "epoch": 0.30736842105263157, + "grad_norm": 7.217921733856201, + "learning_rate": 3.935821656707359e-05, + "loss": 2.7706, + "num_input_tokens_seen": 93872, + "step": 730 + }, + { + "epoch": 0.3094736842105263, + "grad_norm": 3.361421823501587, + "learning_rate": 3.9222546374139533e-05, + "loss": 3.7275, + "num_input_tokens_seen": 94480, + "step": 735 + }, + { + "epoch": 0.31157894736842107, + "grad_norm": 5.755743980407715, + "learning_rate": 3.9086254040952416e-05, + "loss": 3.4091, + "num_input_tokens_seen": 95040, + "step": 740 + }, + { + "epoch": 0.3136842105263158, + "grad_norm": 2.512439012527466, + "learning_rate": 3.894934552938041e-05, + "loss": 3.2579, + "num_input_tokens_seen": 95696, + "step": 745 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 6.869725227355957, + "learning_rate": 3.8811826828245334e-05, + "loss": 2.8736, + "num_input_tokens_seen": 96352, + "step": 750 + }, + { + "epoch": 0.3178947368421053, + "grad_norm": 4.974020957946777, + "learning_rate": 3.867370395306067e-05, + "loss": 2.5466, + "num_input_tokens_seen": 97184, + "step": 755 + }, + { + "epoch": 0.32, + "grad_norm": 8.7909574508667, + "learning_rate": 3.853498294576845e-05, + "loss": 3.1885, + "num_input_tokens_seen": 97744, + "step": 760 + }, + { + "epoch": 0.32210526315789473, + "grad_norm": 8.156033515930176, + "learning_rate": 3.8395669874474915e-05, + "loss": 3.6878, + "num_input_tokens_seen": 98272, + "step": 765 + }, + { + "epoch": 0.32421052631578945, + "grad_norm": 6.218159198760986, + "learning_rate": 3.825577083318512e-05, + "loss": 3.2699, + "num_input_tokens_seen": 98992, + "step": 770 + }, + { + "epoch": 0.3263157894736842, + "grad_norm": 7.318730354309082, + "learning_rate": 3.8115291941536345e-05, + "loss": 3.1887, + "num_input_tokens_seen": 99632, + "step": 775 + }, + { + "epoch": 0.32842105263157895, + "grad_norm": 7.037641525268555, + "learning_rate": 3.797423934453038e-05, + "loss": 3.2489, + "num_input_tokens_seen": 100192, + "step": 780 + }, + { + "epoch": 0.33052631578947367, + "grad_norm": 5.422025680541992, + "learning_rate": 3.783261921226479e-05, + "loss": 3.6458, + "num_input_tokens_seen": 100704, + "step": 785 + }, + { + "epoch": 0.33263157894736844, + "grad_norm": 7.227110385894775, + "learning_rate": 3.7690437739662924e-05, + "loss": 2.6975, + "num_input_tokens_seen": 101504, + "step": 790 + }, + { + "epoch": 0.33473684210526317, + "grad_norm": 3.2489235401153564, + "learning_rate": 3.7547701146203005e-05, + "loss": 2.5729, + "num_input_tokens_seen": 102336, + "step": 795 + }, + { + "epoch": 0.3368421052631579, + "grad_norm": 5.875162124633789, + "learning_rate": 3.7404415675646054e-05, + "loss": 3.1426, + "num_input_tokens_seen": 102976, + "step": 800 + }, + { + "epoch": 0.3368421052631579, + "eval_loss": 3.157048225402832, + "eval_runtime": 21.3317, + "eval_samples_per_second": 23.439, + "eval_steps_per_second": 11.72, + "num_input_tokens_seen": 102976, + "step": 800 + }, + { + "epoch": 0.3389473684210526, + "grad_norm": 5.515801906585693, + "learning_rate": 3.726058759576271e-05, + "loss": 2.5607, + "num_input_tokens_seen": 103760, + "step": 805 + }, + { + "epoch": 0.3410526315789474, + "grad_norm": 6.290920257568359, + "learning_rate": 3.711622319805913e-05, + "loss": 3.128, + "num_input_tokens_seen": 104272, + "step": 810 + }, + { + "epoch": 0.3431578947368421, + "grad_norm": 7.0145087242126465, + "learning_rate": 3.697132879750174e-05, + "loss": 2.8597, + "num_input_tokens_seen": 104880, + "step": 815 + }, + { + "epoch": 0.3452631578947368, + "grad_norm": 6.190591812133789, + "learning_rate": 3.6825910732241026e-05, + "loss": 3.5617, + "num_input_tokens_seen": 105440, + "step": 820 + }, + { + "epoch": 0.3473684210526316, + "grad_norm": 8.005406379699707, + "learning_rate": 3.667997536333424e-05, + "loss": 3.2266, + "num_input_tokens_seen": 105984, + "step": 825 + }, + { + "epoch": 0.3494736842105263, + "grad_norm": 7.040084362030029, + "learning_rate": 3.65335290744672e-05, + "loss": 3.0301, + "num_input_tokens_seen": 106720, + "step": 830 + }, + { + "epoch": 0.35157894736842105, + "grad_norm": 5.619962215423584, + "learning_rate": 3.6386578271674984e-05, + "loss": 2.9117, + "num_input_tokens_seen": 107392, + "step": 835 + }, + { + "epoch": 0.35368421052631577, + "grad_norm": 8.019217491149902, + "learning_rate": 3.623912938306176e-05, + "loss": 3.4049, + "num_input_tokens_seen": 107920, + "step": 840 + }, + { + "epoch": 0.35578947368421054, + "grad_norm": 2.9099061489105225, + "learning_rate": 3.6091188858519607e-05, + "loss": 3.2627, + "num_input_tokens_seen": 108528, + "step": 845 + }, + { + "epoch": 0.35789473684210527, + "grad_norm": 7.037422180175781, + "learning_rate": 3.5942763169446295e-05, + "loss": 2.5905, + "num_input_tokens_seen": 109376, + "step": 850 + }, + { + "epoch": 0.36, + "grad_norm": 6.472799301147461, + "learning_rate": 3.579385880846232e-05, + "loss": 3.5297, + "num_input_tokens_seen": 110048, + "step": 855 + }, + { + "epoch": 0.36210526315789476, + "grad_norm": 7.292938232421875, + "learning_rate": 3.564448228912682e-05, + "loss": 2.3537, + "num_input_tokens_seen": 110720, + "step": 860 + }, + { + "epoch": 0.3642105263157895, + "grad_norm": 6.758236885070801, + "learning_rate": 3.549464014565265e-05, + "loss": 2.8941, + "num_input_tokens_seen": 111408, + "step": 865 + }, + { + "epoch": 0.3663157894736842, + "grad_norm": 4.073180198669434, + "learning_rate": 3.534433893262058e-05, + "loss": 3.6315, + "num_input_tokens_seen": 112032, + "step": 870 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 9.296513557434082, + "learning_rate": 3.519358522469259e-05, + "loss": 3.3729, + "num_input_tokens_seen": 112592, + "step": 875 + }, + { + "epoch": 0.3705263157894737, + "grad_norm": 6.642693519592285, + "learning_rate": 3.504238561632424e-05, + "loss": 3.4379, + "num_input_tokens_seen": 113136, + "step": 880 + }, + { + "epoch": 0.3726315789473684, + "grad_norm": 5.044366359710693, + "learning_rate": 3.489074672147621e-05, + "loss": 3.3523, + "num_input_tokens_seen": 113728, + "step": 885 + }, + { + "epoch": 0.37473684210526315, + "grad_norm": 5.917368412017822, + "learning_rate": 3.473867517332501e-05, + "loss": 3.5194, + "num_input_tokens_seen": 114416, + "step": 890 + }, + { + "epoch": 0.37684210526315787, + "grad_norm": 10.968324661254883, + "learning_rate": 3.458617762397279e-05, + "loss": 3.3524, + "num_input_tokens_seen": 114944, + "step": 895 + }, + { + "epoch": 0.37894736842105264, + "grad_norm": 6.620034694671631, + "learning_rate": 3.4433260744156396e-05, + "loss": 3.101, + "num_input_tokens_seen": 115536, + "step": 900 + }, + { + "epoch": 0.37894736842105264, + "eval_loss": 3.1375341415405273, + "eval_runtime": 20.7823, + "eval_samples_per_second": 24.059, + "eval_steps_per_second": 12.029, + "num_input_tokens_seen": 115536, + "step": 900 + }, + { + "epoch": 0.38105263157894737, + "grad_norm": 5.726701259613037, + "learning_rate": 3.427993122295552e-05, + "loss": 3.3473, + "num_input_tokens_seen": 116160, + "step": 905 + }, + { + "epoch": 0.3831578947368421, + "grad_norm": 6.464386463165283, + "learning_rate": 3.412619576750014e-05, + "loss": 3.0013, + "num_input_tokens_seen": 116720, + "step": 910 + }, + { + "epoch": 0.38526315789473686, + "grad_norm": 6.10352087020874, + "learning_rate": 3.397206110267713e-05, + "loss": 3.0683, + "num_input_tokens_seen": 117360, + "step": 915 + }, + { + "epoch": 0.3873684210526316, + "grad_norm": 6.360663414001465, + "learning_rate": 3.381753397083604e-05, + "loss": 3.2696, + "num_input_tokens_seen": 118112, + "step": 920 + }, + { + "epoch": 0.3894736842105263, + "grad_norm": 6.36335563659668, + "learning_rate": 3.3662621131494204e-05, + "loss": 3.1551, + "num_input_tokens_seen": 118784, + "step": 925 + }, + { + "epoch": 0.391578947368421, + "grad_norm": 7.843230247497559, + "learning_rate": 3.350732936104108e-05, + "loss": 2.4573, + "num_input_tokens_seen": 119616, + "step": 930 + }, + { + "epoch": 0.3936842105263158, + "grad_norm": 4.782888412475586, + "learning_rate": 3.335166545244178e-05, + "loss": 3.0543, + "num_input_tokens_seen": 120448, + "step": 935 + }, + { + "epoch": 0.3957894736842105, + "grad_norm": 5.410915851593018, + "learning_rate": 3.319563621493994e-05, + "loss": 3.6002, + "num_input_tokens_seen": 121008, + "step": 940 + }, + { + "epoch": 0.39789473684210525, + "grad_norm": 7.571856498718262, + "learning_rate": 3.3039248473759885e-05, + "loss": 3.206, + "num_input_tokens_seen": 121552, + "step": 945 + }, + { + "epoch": 0.4, + "grad_norm": 8.490530967712402, + "learning_rate": 3.2882509069808044e-05, + "loss": 2.6314, + "num_input_tokens_seen": 122256, + "step": 950 + }, + { + "epoch": 0.40210526315789474, + "grad_norm": 6.643606662750244, + "learning_rate": 3.272542485937369e-05, + "loss": 2.9055, + "num_input_tokens_seen": 122928, + "step": 955 + }, + { + "epoch": 0.40421052631578946, + "grad_norm": 4.136000156402588, + "learning_rate": 3.2568002713829084e-05, + "loss": 2.8356, + "num_input_tokens_seen": 123664, + "step": 960 + }, + { + "epoch": 0.4063157894736842, + "grad_norm": 3.604309558868408, + "learning_rate": 3.241024951932885e-05, + "loss": 2.9538, + "num_input_tokens_seen": 124256, + "step": 965 + }, + { + "epoch": 0.40842105263157896, + "grad_norm": 7.433534145355225, + "learning_rate": 3.225217217650876e-05, + "loss": 3.2552, + "num_input_tokens_seen": 124864, + "step": 970 + }, + { + "epoch": 0.4105263157894737, + "grad_norm": 6.31335973739624, + "learning_rate": 3.2093777600183875e-05, + "loss": 3.399, + "num_input_tokens_seen": 125616, + "step": 975 + }, + { + "epoch": 0.4126315789473684, + "grad_norm": 5.291494846343994, + "learning_rate": 3.1935072719046115e-05, + "loss": 3.2976, + "num_input_tokens_seen": 126160, + "step": 980 + }, + { + "epoch": 0.4147368421052632, + "grad_norm": 6.659648418426514, + "learning_rate": 3.1776064475361114e-05, + "loss": 2.6787, + "num_input_tokens_seen": 126928, + "step": 985 + }, + { + "epoch": 0.4168421052631579, + "grad_norm": 6.5285258293151855, + "learning_rate": 3.161675982466454e-05, + "loss": 3.0148, + "num_input_tokens_seen": 127824, + "step": 990 + }, + { + "epoch": 0.4189473684210526, + "grad_norm": 3.0640006065368652, + "learning_rate": 3.145716573545792e-05, + "loss": 2.9179, + "num_input_tokens_seen": 128496, + "step": 995 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 3.517613649368286, + "learning_rate": 3.129728918890371e-05, + "loss": 3.3436, + "num_input_tokens_seen": 129104, + "step": 1000 + }, + { + "epoch": 0.42105263157894735, + "eval_loss": 3.1259236335754395, + "eval_runtime": 20.5644, + "eval_samples_per_second": 24.314, + "eval_steps_per_second": 12.157, + "num_input_tokens_seen": 129104, + "step": 1000 + }, + { + "epoch": 0.4231578947368421, + "grad_norm": 3.706281900405884, + "learning_rate": 3.1137137178519985e-05, + "loss": 3.0361, + "num_input_tokens_seen": 129728, + "step": 1005 + }, + { + "epoch": 0.42526315789473684, + "grad_norm": 6.263809680938721, + "learning_rate": 3.0976716709874496e-05, + "loss": 4.0744, + "num_input_tokens_seen": 130256, + "step": 1010 + }, + { + "epoch": 0.42736842105263156, + "grad_norm": 7.12708044052124, + "learning_rate": 3.081603480027826e-05, + "loss": 3.0169, + "num_input_tokens_seen": 130880, + "step": 1015 + }, + { + "epoch": 0.42947368421052634, + "grad_norm": 3.2109436988830566, + "learning_rate": 3.065509847847851e-05, + "loss": 2.7457, + "num_input_tokens_seen": 131568, + "step": 1020 + }, + { + "epoch": 0.43157894736842106, + "grad_norm": 4.197145462036133, + "learning_rate": 3.0493914784351328e-05, + "loss": 2.7009, + "num_input_tokens_seen": 132128, + "step": 1025 + }, + { + "epoch": 0.4336842105263158, + "grad_norm": 7.7999091148376465, + "learning_rate": 3.0332490768593675e-05, + "loss": 3.2792, + "num_input_tokens_seen": 132832, + "step": 1030 + }, + { + "epoch": 0.4357894736842105, + "grad_norm": 7.479982852935791, + "learning_rate": 3.017083349241492e-05, + "loss": 2.6386, + "num_input_tokens_seen": 133488, + "step": 1035 + }, + { + "epoch": 0.4378947368421053, + "grad_norm": 2.8392317295074463, + "learning_rate": 3.0008950027228033e-05, + "loss": 2.6743, + "num_input_tokens_seen": 134288, + "step": 1040 + }, + { + "epoch": 0.44, + "grad_norm": 5.863373756408691, + "learning_rate": 2.984684745434021e-05, + "loss": 2.6163, + "num_input_tokens_seen": 135088, + "step": 1045 + }, + { + "epoch": 0.4421052631578947, + "grad_norm": 3.552429437637329, + "learning_rate": 2.9684532864643122e-05, + "loss": 3.0771, + "num_input_tokens_seen": 135808, + "step": 1050 + }, + { + "epoch": 0.4442105263157895, + "grad_norm": 3.51658296585083, + "learning_rate": 2.952201335830275e-05, + "loss": 3.0567, + "num_input_tokens_seen": 136464, + "step": 1055 + }, + { + "epoch": 0.4463157894736842, + "grad_norm": 6.143110752105713, + "learning_rate": 2.9359296044448794e-05, + "loss": 2.6468, + "num_input_tokens_seen": 137120, + "step": 1060 + }, + { + "epoch": 0.44842105263157894, + "grad_norm": 9.046653747558594, + "learning_rate": 2.9196388040863693e-05, + "loss": 3.7056, + "num_input_tokens_seen": 137744, + "step": 1065 + }, + { + "epoch": 0.45052631578947366, + "grad_norm": 9.399657249450684, + "learning_rate": 2.9033296473671278e-05, + "loss": 2.609, + "num_input_tokens_seen": 138384, + "step": 1070 + }, + { + "epoch": 0.45263157894736844, + "grad_norm": 6.991403102874756, + "learning_rate": 2.8870028477025042e-05, + "loss": 2.9277, + "num_input_tokens_seen": 139072, + "step": 1075 + }, + { + "epoch": 0.45473684210526316, + "grad_norm": 6.533243656158447, + "learning_rate": 2.870659119279605e-05, + "loss": 3.3649, + "num_input_tokens_seen": 139680, + "step": 1080 + }, + { + "epoch": 0.4568421052631579, + "grad_norm": 7.684907913208008, + "learning_rate": 2.8542991770260608e-05, + "loss": 2.743, + "num_input_tokens_seen": 140384, + "step": 1085 + }, + { + "epoch": 0.4589473684210526, + "grad_norm": 6.984562873840332, + "learning_rate": 2.8379237365787426e-05, + "loss": 2.8093, + "num_input_tokens_seen": 141024, + "step": 1090 + }, + { + "epoch": 0.4610526315789474, + "grad_norm": 7.385660171508789, + "learning_rate": 2.8215335142524657e-05, + "loss": 3.2659, + "num_input_tokens_seen": 141520, + "step": 1095 + }, + { + "epoch": 0.4631578947368421, + "grad_norm": 8.597911834716797, + "learning_rate": 2.8051292270086503e-05, + "loss": 3.2059, + "num_input_tokens_seen": 142176, + "step": 1100 + }, + { + "epoch": 0.4631578947368421, + "eval_loss": 3.088874101638794, + "eval_runtime": 21.713, + "eval_samples_per_second": 23.028, + "eval_steps_per_second": 11.514, + "num_input_tokens_seen": 142176, + "step": 1100 + }, + { + "epoch": 0.4652631578947368, + "grad_norm": 3.734241008758545, + "learning_rate": 2.788711592423966e-05, + "loss": 2.5484, + "num_input_tokens_seen": 143008, + "step": 1105 + }, + { + "epoch": 0.4673684210526316, + "grad_norm": 9.137406349182129, + "learning_rate": 2.7722813286589316e-05, + "loss": 3.5024, + "num_input_tokens_seen": 143568, + "step": 1110 + }, + { + "epoch": 0.4694736842105263, + "grad_norm": 6.121912479400635, + "learning_rate": 2.755839154426513e-05, + "loss": 2.9845, + "num_input_tokens_seen": 144272, + "step": 1115 + }, + { + "epoch": 0.47157894736842104, + "grad_norm": 7.885390281677246, + "learning_rate": 2.7393857889606756e-05, + "loss": 2.6784, + "num_input_tokens_seen": 144944, + "step": 1120 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 8.315886497497559, + "learning_rate": 2.722921951984927e-05, + "loss": 3.1619, + "num_input_tokens_seen": 145632, + "step": 1125 + }, + { + "epoch": 0.47578947368421054, + "grad_norm": 9.628824234008789, + "learning_rate": 2.7064483636808313e-05, + "loss": 3.1394, + "num_input_tokens_seen": 146256, + "step": 1130 + }, + { + "epoch": 0.47789473684210526, + "grad_norm": 7.490114212036133, + "learning_rate": 2.689965744656508e-05, + "loss": 3.4272, + "num_input_tokens_seen": 146848, + "step": 1135 + }, + { + "epoch": 0.48, + "grad_norm": 4.576737880706787, + "learning_rate": 2.6734748159151102e-05, + "loss": 3.1131, + "num_input_tokens_seen": 147456, + "step": 1140 + }, + { + "epoch": 0.48210526315789476, + "grad_norm": 6.783941745758057, + "learning_rate": 2.656976298823284e-05, + "loss": 2.4894, + "num_input_tokens_seen": 148240, + "step": 1145 + }, + { + "epoch": 0.4842105263157895, + "grad_norm": 5.665360450744629, + "learning_rate": 2.6404709150796137e-05, + "loss": 2.961, + "num_input_tokens_seen": 148864, + "step": 1150 + }, + { + "epoch": 0.4863157894736842, + "grad_norm": 7.185101509094238, + "learning_rate": 2.623959386683056e-05, + "loss": 3.0204, + "num_input_tokens_seen": 149568, + "step": 1155 + }, + { + "epoch": 0.4884210526315789, + "grad_norm": 3.023557186126709, + "learning_rate": 2.6074424359013517e-05, + "loss": 3.104, + "num_input_tokens_seen": 150320, + "step": 1160 + }, + { + "epoch": 0.4905263157894737, + "grad_norm": 2.9198670387268066, + "learning_rate": 2.5909207852394363e-05, + "loss": 2.9611, + "num_input_tokens_seen": 151040, + "step": 1165 + }, + { + "epoch": 0.4926315789473684, + "grad_norm": 6.565184593200684, + "learning_rate": 2.5743951574078314e-05, + "loss": 2.8824, + "num_input_tokens_seen": 151808, + "step": 1170 + }, + { + "epoch": 0.49473684210526314, + "grad_norm": 7.136038780212402, + "learning_rate": 2.5578662752910347e-05, + "loss": 3.1135, + "num_input_tokens_seen": 152352, + "step": 1175 + }, + { + "epoch": 0.4968421052631579, + "grad_norm": 7.393047332763672, + "learning_rate": 2.5413348619158967e-05, + "loss": 2.9705, + "num_input_tokens_seen": 152992, + "step": 1180 + }, + { + "epoch": 0.49894736842105264, + "grad_norm": 7.747554779052734, + "learning_rate": 2.5248016404199908e-05, + "loss": 2.6599, + "num_input_tokens_seen": 153648, + "step": 1185 + }, + { + "epoch": 0.5010526315789474, + "grad_norm": 6.3294219970703125, + "learning_rate": 2.508267334019988e-05, + "loss": 3.4948, + "num_input_tokens_seen": 154192, + "step": 1190 + }, + { + "epoch": 0.5031578947368421, + "grad_norm": 3.7944674491882324, + "learning_rate": 2.4917326659800123e-05, + "loss": 2.7937, + "num_input_tokens_seen": 154768, + "step": 1195 + }, + { + "epoch": 0.5052631578947369, + "grad_norm": 8.302847862243652, + "learning_rate": 2.475198359580009e-05, + "loss": 3.2607, + "num_input_tokens_seen": 155360, + "step": 1200 + }, + { + "epoch": 0.5052631578947369, + "eval_loss": 3.0811386108398438, + "eval_runtime": 19.3295, + "eval_samples_per_second": 25.867, + "eval_steps_per_second": 12.934, + "num_input_tokens_seen": 155360, + "step": 1200 + }, + { + "epoch": 0.5073684210526316, + "grad_norm": 4.6022186279296875, + "learning_rate": 2.458665138084104e-05, + "loss": 2.8833, + "num_input_tokens_seen": 156176, + "step": 1205 + }, + { + "epoch": 0.5094736842105263, + "grad_norm": 9.753704071044922, + "learning_rate": 2.4421337247089655e-05, + "loss": 3.0525, + "num_input_tokens_seen": 156784, + "step": 1210 + }, + { + "epoch": 0.511578947368421, + "grad_norm": 6.436235427856445, + "learning_rate": 2.425604842592169e-05, + "loss": 3.5911, + "num_input_tokens_seen": 157424, + "step": 1215 + }, + { + "epoch": 0.5136842105263157, + "grad_norm": 7.303015232086182, + "learning_rate": 2.4090792147605647e-05, + "loss": 2.8173, + "num_input_tokens_seen": 158128, + "step": 1220 + }, + { + "epoch": 0.5157894736842106, + "grad_norm": 8.383377075195312, + "learning_rate": 2.392557564098649e-05, + "loss": 2.9395, + "num_input_tokens_seen": 158784, + "step": 1225 + }, + { + "epoch": 0.5178947368421053, + "grad_norm": 8.447127342224121, + "learning_rate": 2.3760406133169443e-05, + "loss": 3.643, + "num_input_tokens_seen": 159280, + "step": 1230 + }, + { + "epoch": 0.52, + "grad_norm": 6.229928493499756, + "learning_rate": 2.3595290849203862e-05, + "loss": 3.1131, + "num_input_tokens_seen": 159840, + "step": 1235 + }, + { + "epoch": 0.5221052631578947, + "grad_norm": 7.979696750640869, + "learning_rate": 2.3430237011767167e-05, + "loss": 2.9351, + "num_input_tokens_seen": 160416, + "step": 1240 + }, + { + "epoch": 0.5242105263157895, + "grad_norm": 9.784839630126953, + "learning_rate": 2.32652518408489e-05, + "loss": 2.2889, + "num_input_tokens_seen": 161120, + "step": 1245 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 7.285416126251221, + "learning_rate": 2.3100342553434924e-05, + "loss": 3.7082, + "num_input_tokens_seen": 161728, + "step": 1250 + }, + { + "epoch": 0.5284210526315789, + "grad_norm": 8.140905380249023, + "learning_rate": 2.2935516363191693e-05, + "loss": 3.1545, + "num_input_tokens_seen": 162496, + "step": 1255 + }, + { + "epoch": 0.5305263157894737, + "grad_norm": 7.0224480628967285, + "learning_rate": 2.2770780480150744e-05, + "loss": 3.3456, + "num_input_tokens_seen": 162976, + "step": 1260 + }, + { + "epoch": 0.5326315789473685, + "grad_norm": 3.493330717086792, + "learning_rate": 2.2606142110393247e-05, + "loss": 3.2902, + "num_input_tokens_seen": 163600, + "step": 1265 + }, + { + "epoch": 0.5347368421052632, + "grad_norm": 8.467061996459961, + "learning_rate": 2.2441608455734873e-05, + "loss": 2.9682, + "num_input_tokens_seen": 164128, + "step": 1270 + }, + { + "epoch": 0.5368421052631579, + "grad_norm": 8.28370475769043, + "learning_rate": 2.2277186713410687e-05, + "loss": 3.363, + "num_input_tokens_seen": 164848, + "step": 1275 + }, + { + "epoch": 0.5389473684210526, + "grad_norm": 6.318331241607666, + "learning_rate": 2.2112884075760347e-05, + "loss": 2.636, + "num_input_tokens_seen": 165552, + "step": 1280 + }, + { + "epoch": 0.5410526315789473, + "grad_norm": 6.917469501495361, + "learning_rate": 2.19487077299135e-05, + "loss": 3.3949, + "num_input_tokens_seen": 166176, + "step": 1285 + }, + { + "epoch": 0.5431578947368421, + "grad_norm": 6.556621551513672, + "learning_rate": 2.1784664857475352e-05, + "loss": 3.1511, + "num_input_tokens_seen": 166672, + "step": 1290 + }, + { + "epoch": 0.5452631578947369, + "grad_norm": 3.6133346557617188, + "learning_rate": 2.1620762634212586e-05, + "loss": 2.8032, + "num_input_tokens_seen": 167392, + "step": 1295 + }, + { + "epoch": 0.5473684210526316, + "grad_norm": 6.281291484832764, + "learning_rate": 2.1457008229739394e-05, + "loss": 2.8431, + "num_input_tokens_seen": 168080, + "step": 1300 + }, + { + "epoch": 0.5473684210526316, + "eval_loss": 3.060394763946533, + "eval_runtime": 18.9707, + "eval_samples_per_second": 26.356, + "eval_steps_per_second": 13.178, + "num_input_tokens_seen": 168080, + "step": 1300 + }, + { + "epoch": 0.5494736842105263, + "grad_norm": 7.926747798919678, + "learning_rate": 2.1293408807203947e-05, + "loss": 2.9994, + "num_input_tokens_seen": 168816, + "step": 1305 + }, + { + "epoch": 0.5515789473684211, + "grad_norm": 7.976357460021973, + "learning_rate": 2.1129971522974967e-05, + "loss": 3.1117, + "num_input_tokens_seen": 169456, + "step": 1310 + }, + { + "epoch": 0.5536842105263158, + "grad_norm": 14.778353691101074, + "learning_rate": 2.0966703526328728e-05, + "loss": 3.0262, + "num_input_tokens_seen": 170032, + "step": 1315 + }, + { + "epoch": 0.5557894736842105, + "grad_norm": 6.6729230880737305, + "learning_rate": 2.080361195913631e-05, + "loss": 2.9819, + "num_input_tokens_seen": 170560, + "step": 1320 + }, + { + "epoch": 0.5578947368421052, + "grad_norm": 9.366944313049316, + "learning_rate": 2.0640703955551212e-05, + "loss": 3.2148, + "num_input_tokens_seen": 171168, + "step": 1325 + }, + { + "epoch": 0.56, + "grad_norm": 6.804953575134277, + "learning_rate": 2.047798664169726e-05, + "loss": 3.6074, + "num_input_tokens_seen": 171744, + "step": 1330 + }, + { + "epoch": 0.5621052631578948, + "grad_norm": 9.633087158203125, + "learning_rate": 2.031546713535688e-05, + "loss": 2.5756, + "num_input_tokens_seen": 172560, + "step": 1335 + }, + { + "epoch": 0.5642105263157895, + "grad_norm": 9.197052955627441, + "learning_rate": 2.0153152545659798e-05, + "loss": 3.2645, + "num_input_tokens_seen": 173200, + "step": 1340 + }, + { + "epoch": 0.5663157894736842, + "grad_norm": 7.425899028778076, + "learning_rate": 1.9991049972771972e-05, + "loss": 2.6577, + "num_input_tokens_seen": 174080, + "step": 1345 + }, + { + "epoch": 0.5684210526315789, + "grad_norm": 11.483154296875, + "learning_rate": 1.9829166507585083e-05, + "loss": 3.6653, + "num_input_tokens_seen": 174560, + "step": 1350 + }, + { + "epoch": 0.5705263157894737, + "grad_norm": 4.616602897644043, + "learning_rate": 1.9667509231406334e-05, + "loss": 3.3071, + "num_input_tokens_seen": 175184, + "step": 1355 + }, + { + "epoch": 0.5726315789473684, + "grad_norm": 6.981142044067383, + "learning_rate": 1.9506085215648675e-05, + "loss": 3.0256, + "num_input_tokens_seen": 175872, + "step": 1360 + }, + { + "epoch": 0.5747368421052632, + "grad_norm": 10.552703857421875, + "learning_rate": 1.93449015215215e-05, + "loss": 3.2127, + "num_input_tokens_seen": 176432, + "step": 1365 + }, + { + "epoch": 0.5768421052631579, + "grad_norm": 7.310719966888428, + "learning_rate": 1.9183965199721745e-05, + "loss": 2.8919, + "num_input_tokens_seen": 177072, + "step": 1370 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 4.100451469421387, + "learning_rate": 1.90232832901255e-05, + "loss": 2.2461, + "num_input_tokens_seen": 177792, + "step": 1375 + }, + { + "epoch": 0.5810526315789474, + "grad_norm": 9.246238708496094, + "learning_rate": 1.8862862821480025e-05, + "loss": 2.5727, + "num_input_tokens_seen": 179168, + "step": 1380 + }, + { + "epoch": 0.5831578947368421, + "grad_norm": 6.5480875968933105, + "learning_rate": 1.87027108110963e-05, + "loss": 3.5131, + "num_input_tokens_seen": 179712, + "step": 1385 + }, + { + "epoch": 0.5852631578947368, + "grad_norm": 7.038618564605713, + "learning_rate": 1.8542834264542092e-05, + "loss": 3.0084, + "num_input_tokens_seen": 180256, + "step": 1390 + }, + { + "epoch": 0.5873684210526315, + "grad_norm": 8.34747314453125, + "learning_rate": 1.8383240175335464e-05, + "loss": 3.0598, + "num_input_tokens_seen": 180864, + "step": 1395 + }, + { + "epoch": 0.5894736842105263, + "grad_norm": 7.6554155349731445, + "learning_rate": 1.8223935524638898e-05, + "loss": 3.3622, + "num_input_tokens_seen": 181424, + "step": 1400 + }, + { + "epoch": 0.5894736842105263, + "eval_loss": 3.044952630996704, + "eval_runtime": 20.7836, + "eval_samples_per_second": 24.057, + "eval_steps_per_second": 12.029, + "num_input_tokens_seen": 181424, + "step": 1400 + }, + { + "epoch": 0.5915789473684211, + "grad_norm": 7.480663776397705, + "learning_rate": 1.806492728095389e-05, + "loss": 3.0682, + "num_input_tokens_seen": 181984, + "step": 1405 + }, + { + "epoch": 0.5936842105263158, + "grad_norm": 9.622962951660156, + "learning_rate": 1.7906222399816124e-05, + "loss": 3.5743, + "num_input_tokens_seen": 182496, + "step": 1410 + }, + { + "epoch": 0.5957894736842105, + "grad_norm": 6.27373743057251, + "learning_rate": 1.7747827823491252e-05, + "loss": 2.965, + "num_input_tokens_seen": 183072, + "step": 1415 + }, + { + "epoch": 0.5978947368421053, + "grad_norm": 2.8175039291381836, + "learning_rate": 1.758975048067116e-05, + "loss": 2.9435, + "num_input_tokens_seen": 183664, + "step": 1420 + }, + { + "epoch": 0.6, + "grad_norm": 7.884466648101807, + "learning_rate": 1.7431997286170922e-05, + "loss": 3.6214, + "num_input_tokens_seen": 184192, + "step": 1425 + }, + { + "epoch": 0.6021052631578947, + "grad_norm": 5.639321804046631, + "learning_rate": 1.7274575140626318e-05, + "loss": 2.9336, + "num_input_tokens_seen": 184896, + "step": 1430 + }, + { + "epoch": 0.6042105263157894, + "grad_norm": 10.029356002807617, + "learning_rate": 1.7117490930191965e-05, + "loss": 3.0541, + "num_input_tokens_seen": 185392, + "step": 1435 + }, + { + "epoch": 0.6063157894736843, + "grad_norm": 5.941510200500488, + "learning_rate": 1.696075152624012e-05, + "loss": 3.0067, + "num_input_tokens_seen": 185952, + "step": 1440 + }, + { + "epoch": 0.608421052631579, + "grad_norm": 7.775386810302734, + "learning_rate": 1.6804363785060056e-05, + "loss": 3.6286, + "num_input_tokens_seen": 186512, + "step": 1445 + }, + { + "epoch": 0.6105263157894737, + "grad_norm": 7.570580005645752, + "learning_rate": 1.6648334547558226e-05, + "loss": 3.5453, + "num_input_tokens_seen": 187072, + "step": 1450 + }, + { + "epoch": 0.6126315789473684, + "grad_norm": 4.141610145568848, + "learning_rate": 1.6492670638958924e-05, + "loss": 2.7635, + "num_input_tokens_seen": 187712, + "step": 1455 + }, + { + "epoch": 0.6147368421052631, + "grad_norm": 6.558705806732178, + "learning_rate": 1.6337378868505805e-05, + "loss": 3.2797, + "num_input_tokens_seen": 188384, + "step": 1460 + }, + { + "epoch": 0.6168421052631579, + "grad_norm": 7.0916748046875, + "learning_rate": 1.6182466029163975e-05, + "loss": 3.1194, + "num_input_tokens_seen": 188960, + "step": 1465 + }, + { + "epoch": 0.6189473684210526, + "grad_norm": 7.959896564483643, + "learning_rate": 1.602793889732288e-05, + "loss": 2.7758, + "num_input_tokens_seen": 189616, + "step": 1470 + }, + { + "epoch": 0.6210526315789474, + "grad_norm": 6.758187770843506, + "learning_rate": 1.5873804232499863e-05, + "loss": 3.2369, + "num_input_tokens_seen": 190160, + "step": 1475 + }, + { + "epoch": 0.6231578947368421, + "grad_norm": 7.454896450042725, + "learning_rate": 1.5720068777044476e-05, + "loss": 2.9654, + "num_input_tokens_seen": 190816, + "step": 1480 + }, + { + "epoch": 0.6252631578947369, + "grad_norm": 4.9395341873168945, + "learning_rate": 1.5566739255843606e-05, + "loss": 3.2288, + "num_input_tokens_seen": 191520, + "step": 1485 + }, + { + "epoch": 0.6273684210526316, + "grad_norm": 6.551896095275879, + "learning_rate": 1.541382237602721e-05, + "loss": 2.9903, + "num_input_tokens_seen": 192256, + "step": 1490 + }, + { + "epoch": 0.6294736842105263, + "grad_norm": 3.614539861679077, + "learning_rate": 1.5261324826675e-05, + "loss": 2.4733, + "num_input_tokens_seen": 193008, + "step": 1495 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 9.163614273071289, + "learning_rate": 1.5109253278523799e-05, + "loss": 2.2921, + "num_input_tokens_seen": 193696, + "step": 1500 + }, + { + "epoch": 0.631578947368421, + "eval_loss": 3.0401594638824463, + "eval_runtime": 19.007, + "eval_samples_per_second": 26.306, + "eval_steps_per_second": 13.153, + "num_input_tokens_seen": 193696, + "step": 1500 + }, + { + "epoch": 0.6336842105263157, + "grad_norm": 8.729575157165527, + "learning_rate": 1.495761438367577e-05, + "loss": 3.5125, + "num_input_tokens_seen": 194272, + "step": 1505 + }, + { + "epoch": 0.6357894736842106, + "grad_norm": 8.781404495239258, + "learning_rate": 1.4806414775307418e-05, + "loss": 2.9632, + "num_input_tokens_seen": 194976, + "step": 1510 + }, + { + "epoch": 0.6378947368421053, + "grad_norm": 7.9863739013671875, + "learning_rate": 1.465566106737942e-05, + "loss": 2.9676, + "num_input_tokens_seen": 195568, + "step": 1515 + }, + { + "epoch": 0.64, + "grad_norm": 6.997982501983643, + "learning_rate": 1.4505359854347361e-05, + "loss": 2.4359, + "num_input_tokens_seen": 196608, + "step": 1520 + }, + { + "epoch": 0.6421052631578947, + "grad_norm": 8.560687065124512, + "learning_rate": 1.4355517710873184e-05, + "loss": 3.0732, + "num_input_tokens_seen": 197280, + "step": 1525 + }, + { + "epoch": 0.6442105263157895, + "grad_norm": 3.5838141441345215, + "learning_rate": 1.4206141191537682e-05, + "loss": 2.4828, + "num_input_tokens_seen": 198096, + "step": 1530 + }, + { + "epoch": 0.6463157894736842, + "grad_norm": 6.076948642730713, + "learning_rate": 1.4057236830553704e-05, + "loss": 2.8096, + "num_input_tokens_seen": 198816, + "step": 1535 + }, + { + "epoch": 0.6484210526315789, + "grad_norm": 2.3415188789367676, + "learning_rate": 1.3908811141480408e-05, + "loss": 2.2974, + "num_input_tokens_seen": 199504, + "step": 1540 + }, + { + "epoch": 0.6505263157894737, + "grad_norm": 9.881010055541992, + "learning_rate": 1.3760870616938248e-05, + "loss": 3.609, + "num_input_tokens_seen": 200080, + "step": 1545 + }, + { + "epoch": 0.6526315789473685, + "grad_norm": 3.0654964447021484, + "learning_rate": 1.3613421728325018e-05, + "loss": 2.8781, + "num_input_tokens_seen": 200928, + "step": 1550 + }, + { + "epoch": 0.6547368421052632, + "grad_norm": 8.141066551208496, + "learning_rate": 1.346647092553281e-05, + "loss": 3.2405, + "num_input_tokens_seen": 201472, + "step": 1555 + }, + { + "epoch": 0.6568421052631579, + "grad_norm": 6.4536943435668945, + "learning_rate": 1.3320024636665757e-05, + "loss": 3.5873, + "num_input_tokens_seen": 202048, + "step": 1560 + }, + { + "epoch": 0.6589473684210526, + "grad_norm": 3.8144352436065674, + "learning_rate": 1.3174089267758983e-05, + "loss": 3.0432, + "num_input_tokens_seen": 202656, + "step": 1565 + }, + { + "epoch": 0.6610526315789473, + "grad_norm": 6.641602993011475, + "learning_rate": 1.3028671202498261e-05, + "loss": 2.9072, + "num_input_tokens_seen": 203312, + "step": 1570 + }, + { + "epoch": 0.6631578947368421, + "grad_norm": 8.284623146057129, + "learning_rate": 1.2883776801940884e-05, + "loss": 3.0814, + "num_input_tokens_seen": 203872, + "step": 1575 + }, + { + "epoch": 0.6652631578947369, + "grad_norm": 3.8963799476623535, + "learning_rate": 1.2739412404237306e-05, + "loss": 2.6138, + "num_input_tokens_seen": 204512, + "step": 1580 + }, + { + "epoch": 0.6673684210526316, + "grad_norm": 2.5593197345733643, + "learning_rate": 1.2595584324353943e-05, + "loss": 2.8958, + "num_input_tokens_seen": 205120, + "step": 1585 + }, + { + "epoch": 0.6694736842105263, + "grad_norm": 9.582676887512207, + "learning_rate": 1.245229885379699e-05, + "loss": 3.1137, + "num_input_tokens_seen": 205712, + "step": 1590 + }, + { + "epoch": 0.671578947368421, + "grad_norm": 7.700749397277832, + "learning_rate": 1.2309562260337073e-05, + "loss": 3.2855, + "num_input_tokens_seen": 206256, + "step": 1595 + }, + { + "epoch": 0.6736842105263158, + "grad_norm": 6.43610954284668, + "learning_rate": 1.216738078773522e-05, + "loss": 3.1937, + "num_input_tokens_seen": 206800, + "step": 1600 + }, + { + "epoch": 0.6736842105263158, + "eval_loss": 3.031867742538452, + "eval_runtime": 20.3668, + "eval_samples_per_second": 24.55, + "eval_steps_per_second": 12.275, + "num_input_tokens_seen": 206800, + "step": 1600 + }, + { + "epoch": 0.6757894736842105, + "grad_norm": 4.286380767822266, + "learning_rate": 1.202576065546963e-05, + "loss": 3.0851, + "num_input_tokens_seen": 207488, + "step": 1605 + }, + { + "epoch": 0.6778947368421052, + "grad_norm": 7.998393535614014, + "learning_rate": 1.1884708058463668e-05, + "loss": 3.2747, + "num_input_tokens_seen": 208128, + "step": 1610 + }, + { + "epoch": 0.68, + "grad_norm": 4.960210800170898, + "learning_rate": 1.1744229166814888e-05, + "loss": 3.3051, + "num_input_tokens_seen": 208688, + "step": 1615 + }, + { + "epoch": 0.6821052631578948, + "grad_norm": 7.372851371765137, + "learning_rate": 1.1604330125525079e-05, + "loss": 2.5032, + "num_input_tokens_seen": 209808, + "step": 1620 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 7.572737693786621, + "learning_rate": 1.146501705423155e-05, + "loss": 2.986, + "num_input_tokens_seen": 210496, + "step": 1625 + }, + { + "epoch": 0.6863157894736842, + "grad_norm": 8.84771728515625, + "learning_rate": 1.1326296046939333e-05, + "loss": 3.1372, + "num_input_tokens_seen": 211088, + "step": 1630 + }, + { + "epoch": 0.6884210526315789, + "grad_norm": 7.353373050689697, + "learning_rate": 1.1188173171754673e-05, + "loss": 3.011, + "num_input_tokens_seen": 211680, + "step": 1635 + }, + { + "epoch": 0.6905263157894737, + "grad_norm": 3.834928035736084, + "learning_rate": 1.1050654470619601e-05, + "loss": 3.3625, + "num_input_tokens_seen": 212336, + "step": 1640 + }, + { + "epoch": 0.6926315789473684, + "grad_norm": 7.206390857696533, + "learning_rate": 1.091374595904759e-05, + "loss": 3.3342, + "num_input_tokens_seen": 212864, + "step": 1645 + }, + { + "epoch": 0.6947368421052632, + "grad_norm": 7.2623820304870605, + "learning_rate": 1.0777453625860472e-05, + "loss": 3.0298, + "num_input_tokens_seen": 213456, + "step": 1650 + }, + { + "epoch": 0.6968421052631579, + "grad_norm": 7.93752908706665, + "learning_rate": 1.064178343292641e-05, + "loss": 3.4735, + "num_input_tokens_seen": 214016, + "step": 1655 + }, + { + "epoch": 0.6989473684210527, + "grad_norm": 3.989210605621338, + "learning_rate": 1.0506741314899166e-05, + "loss": 2.811, + "num_input_tokens_seen": 214736, + "step": 1660 + }, + { + "epoch": 0.7010526315789474, + "grad_norm": 6.527041435241699, + "learning_rate": 1.0372333178958462e-05, + "loss": 2.6078, + "num_input_tokens_seen": 215376, + "step": 1665 + }, + { + "epoch": 0.7031578947368421, + "grad_norm": 7.541158676147461, + "learning_rate": 1.0238564904551574e-05, + "loss": 3.1118, + "num_input_tokens_seen": 216000, + "step": 1670 + }, + { + "epoch": 0.7052631578947368, + "grad_norm": 8.171647071838379, + "learning_rate": 1.0105442343136184e-05, + "loss": 2.7288, + "num_input_tokens_seen": 216672, + "step": 1675 + }, + { + "epoch": 0.7073684210526315, + "grad_norm": 7.328914642333984, + "learning_rate": 9.972971317924374e-06, + "loss": 2.736, + "num_input_tokens_seen": 217344, + "step": 1680 + }, + { + "epoch": 0.7094736842105264, + "grad_norm": 10.017664909362793, + "learning_rate": 9.841157623627947e-06, + "loss": 2.043, + "num_input_tokens_seen": 218048, + "step": 1685 + }, + { + "epoch": 0.7115789473684211, + "grad_norm": 8.35176944732666, + "learning_rate": 9.710007026204895e-06, + "loss": 3.133, + "num_input_tokens_seen": 218672, + "step": 1690 + }, + { + "epoch": 0.7136842105263158, + "grad_norm": 4.202718734741211, + "learning_rate": 9.579525262607226e-06, + "loss": 2.7477, + "num_input_tokens_seen": 219408, + "step": 1695 + }, + { + "epoch": 0.7157894736842105, + "grad_norm": 8.910264015197754, + "learning_rate": 9.449718040529987e-06, + "loss": 3.2635, + "num_input_tokens_seen": 219920, + "step": 1700 + }, + { + "epoch": 0.7157894736842105, + "eval_loss": 3.0285239219665527, + "eval_runtime": 21.5591, + "eval_samples_per_second": 23.192, + "eval_steps_per_second": 11.596, + "num_input_tokens_seen": 219920, + "step": 1700 + }, + { + "epoch": 0.7178947368421053, + "grad_norm": 7.725574016571045, + "learning_rate": 9.320591038161574e-06, + "loss": 3.1978, + "num_input_tokens_seen": 220512, + "step": 1705 + }, + { + "epoch": 0.72, + "grad_norm": 8.27135181427002, + "learning_rate": 9.192149903935405e-06, + "loss": 3.2038, + "num_input_tokens_seen": 221040, + "step": 1710 + }, + { + "epoch": 0.7221052631578947, + "grad_norm": 7.715824127197266, + "learning_rate": 9.064400256282757e-06, + "loss": 3.3379, + "num_input_tokens_seen": 221696, + "step": 1715 + }, + { + "epoch": 0.7242105263157895, + "grad_norm": 6.266040325164795, + "learning_rate": 8.937347683387095e-06, + "loss": 3.035, + "num_input_tokens_seen": 222272, + "step": 1720 + }, + { + "epoch": 0.7263157894736842, + "grad_norm": 9.326272964477539, + "learning_rate": 8.810997742939531e-06, + "loss": 3.2204, + "num_input_tokens_seen": 222784, + "step": 1725 + }, + { + "epoch": 0.728421052631579, + "grad_norm": 7.447837829589844, + "learning_rate": 8.685355961895784e-06, + "loss": 3.0992, + "num_input_tokens_seen": 223344, + "step": 1730 + }, + { + "epoch": 0.7305263157894737, + "grad_norm": 7.900444030761719, + "learning_rate": 8.56042783623439e-06, + "loss": 2.8659, + "num_input_tokens_seen": 223840, + "step": 1735 + }, + { + "epoch": 0.7326315789473684, + "grad_norm": 7.262746334075928, + "learning_rate": 8.436218830716258e-06, + "loss": 2.8752, + "num_input_tokens_seen": 224624, + "step": 1740 + }, + { + "epoch": 0.7347368421052631, + "grad_norm": 8.595057487487793, + "learning_rate": 8.31273437864569e-06, + "loss": 2.854, + "num_input_tokens_seen": 225216, + "step": 1745 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 11.285130500793457, + "learning_rate": 8.189979881632634e-06, + "loss": 2.9706, + "num_input_tokens_seen": 225776, + "step": 1750 + }, + { + "epoch": 0.7389473684210527, + "grad_norm": 8.038933753967285, + "learning_rate": 8.067960709356478e-06, + "loss": 2.9734, + "num_input_tokens_seen": 226480, + "step": 1755 + }, + { + "epoch": 0.7410526315789474, + "grad_norm": 10.75576400756836, + "learning_rate": 7.946682199331088e-06, + "loss": 3.0971, + "num_input_tokens_seen": 227120, + "step": 1760 + }, + { + "epoch": 0.7431578947368421, + "grad_norm": 11.199416160583496, + "learning_rate": 7.826149656671386e-06, + "loss": 3.1954, + "num_input_tokens_seen": 227744, + "step": 1765 + }, + { + "epoch": 0.7452631578947368, + "grad_norm": 5.3302836418151855, + "learning_rate": 7.706368353861269e-06, + "loss": 2.899, + "num_input_tokens_seen": 228368, + "step": 1770 + }, + { + "epoch": 0.7473684210526316, + "grad_norm": 3.570517063140869, + "learning_rate": 7.587343530522945e-06, + "loss": 2.5961, + "num_input_tokens_seen": 229088, + "step": 1775 + }, + { + "epoch": 0.7494736842105263, + "grad_norm": 8.551164627075195, + "learning_rate": 7.469080393187786e-06, + "loss": 3.042, + "num_input_tokens_seen": 229728, + "step": 1780 + }, + { + "epoch": 0.751578947368421, + "grad_norm": 8.114623069763184, + "learning_rate": 7.351584115068535e-06, + "loss": 3.7636, + "num_input_tokens_seen": 230240, + "step": 1785 + }, + { + "epoch": 0.7536842105263157, + "grad_norm": 10.003973007202148, + "learning_rate": 7.234859835833021e-06, + "loss": 3.3422, + "num_input_tokens_seen": 230880, + "step": 1790 + }, + { + "epoch": 0.7557894736842106, + "grad_norm": 6.54116153717041, + "learning_rate": 7.118912661379368e-06, + "loss": 2.8134, + "num_input_tokens_seen": 231568, + "step": 1795 + }, + { + "epoch": 0.7578947368421053, + "grad_norm": 7.931966781616211, + "learning_rate": 7.003747663612581e-06, + "loss": 2.9374, + "num_input_tokens_seen": 232224, + "step": 1800 + }, + { + "epoch": 0.7578947368421053, + "eval_loss": 3.024580955505371, + "eval_runtime": 19.415, + "eval_samples_per_second": 25.753, + "eval_steps_per_second": 12.877, + "num_input_tokens_seen": 232224, + "step": 1800 + }, + { + "epoch": 0.76, + "grad_norm": 8.41895866394043, + "learning_rate": 6.889369880222776e-06, + "loss": 3.1164, + "num_input_tokens_seen": 232880, + "step": 1805 + }, + { + "epoch": 0.7621052631578947, + "grad_norm": 9.935205459594727, + "learning_rate": 6.775784314464717e-06, + "loss": 3.4611, + "num_input_tokens_seen": 233456, + "step": 1810 + }, + { + "epoch": 0.7642105263157895, + "grad_norm": 7.748201370239258, + "learning_rate": 6.662995934939007e-06, + "loss": 2.6114, + "num_input_tokens_seen": 234080, + "step": 1815 + }, + { + "epoch": 0.7663157894736842, + "grad_norm": 4.554218292236328, + "learning_rate": 6.551009675374764e-06, + "loss": 2.9599, + "num_input_tokens_seen": 234688, + "step": 1820 + }, + { + "epoch": 0.7684210526315789, + "grad_norm": 8.75670051574707, + "learning_rate": 6.439830434413754e-06, + "loss": 3.4321, + "num_input_tokens_seen": 235248, + "step": 1825 + }, + { + "epoch": 0.7705263157894737, + "grad_norm": 8.14771556854248, + "learning_rate": 6.329463075396161e-06, + "loss": 3.1403, + "num_input_tokens_seen": 235824, + "step": 1830 + }, + { + "epoch": 0.7726315789473684, + "grad_norm": 8.070094108581543, + "learning_rate": 6.219912426147795e-06, + "loss": 3.1174, + "num_input_tokens_seen": 236528, + "step": 1835 + }, + { + "epoch": 0.7747368421052632, + "grad_norm": 12.188468933105469, + "learning_rate": 6.111183278768956e-06, + "loss": 2.9941, + "num_input_tokens_seen": 237200, + "step": 1840 + }, + { + "epoch": 0.7768421052631579, + "grad_norm": 8.94471263885498, + "learning_rate": 6.003280389424789e-06, + "loss": 2.8323, + "num_input_tokens_seen": 237840, + "step": 1845 + }, + { + "epoch": 0.7789473684210526, + "grad_norm": 8.164437294006348, + "learning_rate": 5.896208478137222e-06, + "loss": 2.867, + "num_input_tokens_seen": 238560, + "step": 1850 + }, + { + "epoch": 0.7810526315789473, + "grad_norm": 3.1200382709503174, + "learning_rate": 5.78997222857853e-06, + "loss": 2.7932, + "num_input_tokens_seen": 239248, + "step": 1855 + }, + { + "epoch": 0.783157894736842, + "grad_norm": 9.779284477233887, + "learning_rate": 5.684576287866411e-06, + "loss": 2.9749, + "num_input_tokens_seen": 239808, + "step": 1860 + }, + { + "epoch": 0.7852631578947369, + "grad_norm": 8.78986644744873, + "learning_rate": 5.5800252663607665e-06, + "loss": 3.3494, + "num_input_tokens_seen": 240320, + "step": 1865 + }, + { + "epoch": 0.7873684210526316, + "grad_norm": 7.762701988220215, + "learning_rate": 5.476323737461955e-06, + "loss": 3.056, + "num_input_tokens_seen": 241040, + "step": 1870 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 7.796496868133545, + "learning_rate": 5.373476237410807e-06, + "loss": 3.0191, + "num_input_tokens_seen": 241616, + "step": 1875 + }, + { + "epoch": 0.791578947368421, + "grad_norm": 8.419198989868164, + "learning_rate": 5.271487265090163e-06, + "loss": 3.2889, + "num_input_tokens_seen": 242144, + "step": 1880 + }, + { + "epoch": 0.7936842105263158, + "grad_norm": 7.434835910797119, + "learning_rate": 5.170361281828054e-06, + "loss": 2.777, + "num_input_tokens_seen": 242800, + "step": 1885 + }, + { + "epoch": 0.7957894736842105, + "grad_norm": 8.441527366638184, + "learning_rate": 5.070102711202607e-06, + "loss": 3.3625, + "num_input_tokens_seen": 243360, + "step": 1890 + }, + { + "epoch": 0.7978947368421052, + "grad_norm": 3.879971742630005, + "learning_rate": 4.970715938848478e-06, + "loss": 3.1517, + "num_input_tokens_seen": 244320, + "step": 1895 + }, + { + "epoch": 0.8, + "grad_norm": 3.3906750679016113, + "learning_rate": 4.872205312265074e-06, + "loss": 3.3592, + "num_input_tokens_seen": 244976, + "step": 1900 + }, + { + "epoch": 0.8, + "eval_loss": 3.0196211338043213, + "eval_runtime": 19.5552, + "eval_samples_per_second": 25.569, + "eval_steps_per_second": 12.784, + "num_input_tokens_seen": 244976, + "step": 1900 + }, + { + "epoch": 0.8021052631578948, + "grad_norm": 8.258299827575684, + "learning_rate": 4.7745751406263165e-06, + "loss": 2.8077, + "num_input_tokens_seen": 245520, + "step": 1905 + }, + { + "epoch": 0.8042105263157895, + "grad_norm": 7.004968166351318, + "learning_rate": 4.677829694592198e-06, + "loss": 2.9133, + "num_input_tokens_seen": 246192, + "step": 1910 + }, + { + "epoch": 0.8063157894736842, + "grad_norm": 9.79128360748291, + "learning_rate": 4.581973206121948e-06, + "loss": 2.9371, + "num_input_tokens_seen": 246688, + "step": 1915 + }, + { + "epoch": 0.8084210526315789, + "grad_norm": 4.271524429321289, + "learning_rate": 4.487009868288888e-06, + "loss": 2.6443, + "num_input_tokens_seen": 247312, + "step": 1920 + }, + { + "epoch": 0.8105263157894737, + "grad_norm": 6.933757781982422, + "learning_rate": 4.392943835097069e-06, + "loss": 2.6296, + "num_input_tokens_seen": 248080, + "step": 1925 + }, + { + "epoch": 0.8126315789473684, + "grad_norm": 7.335968017578125, + "learning_rate": 4.299779221299499e-06, + "loss": 3.0604, + "num_input_tokens_seen": 248608, + "step": 1930 + }, + { + "epoch": 0.8147368421052632, + "grad_norm": 2.3223562240600586, + "learning_rate": 4.207520102218213e-06, + "loss": 2.9399, + "num_input_tokens_seen": 249216, + "step": 1935 + }, + { + "epoch": 0.8168421052631579, + "grad_norm": 8.753087043762207, + "learning_rate": 4.116170513565942e-06, + "loss": 3.5264, + "num_input_tokens_seen": 249792, + "step": 1940 + }, + { + "epoch": 0.8189473684210526, + "grad_norm": 8.94509506225586, + "learning_rate": 4.025734451269636e-06, + "loss": 3.2601, + "num_input_tokens_seen": 250336, + "step": 1945 + }, + { + "epoch": 0.8210526315789474, + "grad_norm": 6.4770917892456055, + "learning_rate": 3.936215871295634e-06, + "loss": 3.1531, + "num_input_tokens_seen": 250896, + "step": 1950 + }, + { + "epoch": 0.8231578947368421, + "grad_norm": 8.961740493774414, + "learning_rate": 3.847618689476612e-06, + "loss": 2.8972, + "num_input_tokens_seen": 251456, + "step": 1955 + }, + { + "epoch": 0.8252631578947368, + "grad_norm": 7.780544281005859, + "learning_rate": 3.7599467813403344e-06, + "loss": 3.6235, + "num_input_tokens_seen": 252048, + "step": 1960 + }, + { + "epoch": 0.8273684210526315, + "grad_norm": 7.865116596221924, + "learning_rate": 3.6732039819400683e-06, + "loss": 2.6362, + "num_input_tokens_seen": 252672, + "step": 1965 + }, + { + "epoch": 0.8294736842105264, + "grad_norm": 10.95207691192627, + "learning_rate": 3.5873940856868656e-06, + "loss": 3.9149, + "num_input_tokens_seen": 253232, + "step": 1970 + }, + { + "epoch": 0.8315789473684211, + "grad_norm": 12.879585266113281, + "learning_rate": 3.502520846183577e-06, + "loss": 3.1847, + "num_input_tokens_seen": 253968, + "step": 1975 + }, + { + "epoch": 0.8336842105263158, + "grad_norm": 9.403814315795898, + "learning_rate": 3.418587976060653e-06, + "loss": 3.2093, + "num_input_tokens_seen": 254576, + "step": 1980 + }, + { + "epoch": 0.8357894736842105, + "grad_norm": 6.636196136474609, + "learning_rate": 3.3355991468137394e-06, + "loss": 3.2709, + "num_input_tokens_seen": 255120, + "step": 1985 + }, + { + "epoch": 0.8378947368421052, + "grad_norm": 6.675850868225098, + "learning_rate": 3.2535579886430718e-06, + "loss": 3.1014, + "num_input_tokens_seen": 255728, + "step": 1990 + }, + { + "epoch": 0.84, + "grad_norm": 11.94994831085205, + "learning_rate": 3.1724680902946753e-06, + "loss": 3.3848, + "num_input_tokens_seen": 256336, + "step": 1995 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 9.998152732849121, + "learning_rate": 3.0923329989034132e-06, + "loss": 3.1163, + "num_input_tokens_seen": 256912, + "step": 2000 + }, + { + "epoch": 0.8421052631578947, + "eval_loss": 3.0172553062438965, + "eval_runtime": 20.2205, + "eval_samples_per_second": 24.727, + "eval_steps_per_second": 12.364, + "num_input_tokens_seen": 256912, + "step": 2000 + }, + { + "epoch": 0.8442105263157895, + "grad_norm": 9.193185806274414, + "learning_rate": 3.013156219837776e-06, + "loss": 3.1348, + "num_input_tokens_seen": 257552, + "step": 2005 + }, + { + "epoch": 0.8463157894736842, + "grad_norm": 0.8085441589355469, + "learning_rate": 2.9349412165465773e-06, + "loss": 2.287, + "num_input_tokens_seen": 259152, + "step": 2010 + }, + { + "epoch": 0.848421052631579, + "grad_norm": 8.64601993560791, + "learning_rate": 2.8576914104074425e-06, + "loss": 2.9002, + "num_input_tokens_seen": 259792, + "step": 2015 + }, + { + "epoch": 0.8505263157894737, + "grad_norm": 6.8055572509765625, + "learning_rate": 2.781410180577157e-06, + "loss": 3.3239, + "num_input_tokens_seen": 260448, + "step": 2020 + }, + { + "epoch": 0.8526315789473684, + "grad_norm": 6.6297993659973145, + "learning_rate": 2.706100863843822e-06, + "loss": 2.6339, + "num_input_tokens_seen": 261168, + "step": 2025 + }, + { + "epoch": 0.8547368421052631, + "grad_norm": 7.018527030944824, + "learning_rate": 2.6317667544809134e-06, + "loss": 3.0074, + "num_input_tokens_seen": 261872, + "step": 2030 + }, + { + "epoch": 0.8568421052631578, + "grad_norm": 10.741174697875977, + "learning_rate": 2.558411104103198e-06, + "loss": 3.0096, + "num_input_tokens_seen": 262416, + "step": 2035 + }, + { + "epoch": 0.8589473684210527, + "grad_norm": 4.673122406005859, + "learning_rate": 2.4860371215244484e-06, + "loss": 3.1151, + "num_input_tokens_seen": 263040, + "step": 2040 + }, + { + "epoch": 0.8610526315789474, + "grad_norm": 11.751906394958496, + "learning_rate": 2.414647972617129e-06, + "loss": 2.9073, + "num_input_tokens_seen": 263584, + "step": 2045 + }, + { + "epoch": 0.8631578947368421, + "grad_norm": 7.23284912109375, + "learning_rate": 2.3442467801738863e-06, + "loss": 2.8287, + "num_input_tokens_seen": 264176, + "step": 2050 + }, + { + "epoch": 0.8652631578947368, + "grad_norm": 5.624141693115234, + "learning_rate": 2.2748366237709374e-06, + "loss": 3.2398, + "num_input_tokens_seen": 264800, + "step": 2055 + }, + { + "epoch": 0.8673684210526316, + "grad_norm": 6.689244270324707, + "learning_rate": 2.2064205396333886e-06, + "loss": 3.0842, + "num_input_tokens_seen": 265424, + "step": 2060 + }, + { + "epoch": 0.8694736842105263, + "grad_norm": 4.412909984588623, + "learning_rate": 2.13900152050239e-06, + "loss": 2.7342, + "num_input_tokens_seen": 266048, + "step": 2065 + }, + { + "epoch": 0.871578947368421, + "grad_norm": 7.338773727416992, + "learning_rate": 2.072582515504254e-06, + "loss": 3.8273, + "num_input_tokens_seen": 266608, + "step": 2070 + }, + { + "epoch": 0.8736842105263158, + "grad_norm": 9.580443382263184, + "learning_rate": 2.007166430021415e-06, + "loss": 3.3556, + "num_input_tokens_seen": 267184, + "step": 2075 + }, + { + "epoch": 0.8757894736842106, + "grad_norm": 7.67100715637207, + "learning_rate": 1.9427561255653816e-06, + "loss": 3.136, + "num_input_tokens_seen": 267840, + "step": 2080 + }, + { + "epoch": 0.8778947368421053, + "grad_norm": 9.706231117248535, + "learning_rate": 1.87935441965153e-06, + "loss": 3.18, + "num_input_tokens_seen": 268336, + "step": 2085 + }, + { + "epoch": 0.88, + "grad_norm": 10.20802116394043, + "learning_rate": 1.8169640856758651e-06, + "loss": 2.7568, + "num_input_tokens_seen": 268864, + "step": 2090 + }, + { + "epoch": 0.8821052631578947, + "grad_norm": 7.580144882202148, + "learning_rate": 1.7555878527937164e-06, + "loss": 3.4589, + "num_input_tokens_seen": 269408, + "step": 2095 + }, + { + "epoch": 0.8842105263157894, + "grad_norm": 4.091537952423096, + "learning_rate": 1.6952284058003366e-06, + "loss": 2.8533, + "num_input_tokens_seen": 270112, + "step": 2100 + }, + { + "epoch": 0.8842105263157894, + "eval_loss": 3.016836404800415, + "eval_runtime": 20.1028, + "eval_samples_per_second": 24.872, + "eval_steps_per_second": 12.436, + "num_input_tokens_seen": 270112, + "step": 2100 + }, + { + "epoch": 0.8863157894736842, + "grad_norm": 7.614329814910889, + "learning_rate": 1.6358883850134816e-06, + "loss": 2.816, + "num_input_tokens_seen": 270848, + "step": 2105 + }, + { + "epoch": 0.888421052631579, + "grad_norm": 4.826380729675293, + "learning_rate": 1.5775703861578866e-06, + "loss": 3.0902, + "num_input_tokens_seen": 271472, + "step": 2110 + }, + { + "epoch": 0.8905263157894737, + "grad_norm": 6.646672248840332, + "learning_rate": 1.5202769602517515e-06, + "loss": 2.4841, + "num_input_tokens_seen": 272416, + "step": 2115 + }, + { + "epoch": 0.8926315789473684, + "grad_norm": 4.4659528732299805, + "learning_rate": 1.4640106134951316e-06, + "loss": 2.9784, + "num_input_tokens_seen": 273056, + "step": 2120 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 10.427908897399902, + "learning_rate": 1.4087738071603075e-06, + "loss": 3.1002, + "num_input_tokens_seen": 273680, + "step": 2125 + }, + { + "epoch": 0.8968421052631579, + "grad_norm": 9.523606300354004, + "learning_rate": 1.3545689574841342e-06, + "loss": 3.1575, + "num_input_tokens_seen": 274256, + "step": 2130 + }, + { + "epoch": 0.8989473684210526, + "grad_norm": 10.486383438110352, + "learning_rate": 1.3013984355623315e-06, + "loss": 3.2515, + "num_input_tokens_seen": 274864, + "step": 2135 + }, + { + "epoch": 0.9010526315789473, + "grad_norm": 7.348026752471924, + "learning_rate": 1.2492645672457837e-06, + "loss": 2.6269, + "num_input_tokens_seen": 275488, + "step": 2140 + }, + { + "epoch": 0.9031578947368422, + "grad_norm": 4.190880298614502, + "learning_rate": 1.1981696330387787e-06, + "loss": 2.8124, + "num_input_tokens_seen": 276112, + "step": 2145 + }, + { + "epoch": 0.9052631578947369, + "grad_norm": 6.005728721618652, + "learning_rate": 1.1481158679992555e-06, + "loss": 2.5941, + "num_input_tokens_seen": 276768, + "step": 2150 + }, + { + "epoch": 0.9073684210526316, + "grad_norm": 8.46441650390625, + "learning_rate": 1.0991054616410589e-06, + "loss": 2.898, + "num_input_tokens_seen": 277344, + "step": 2155 + }, + { + "epoch": 0.9094736842105263, + "grad_norm": 7.817598819732666, + "learning_rate": 1.051140557838129e-06, + "loss": 2.9216, + "num_input_tokens_seen": 277936, + "step": 2160 + }, + { + "epoch": 0.911578947368421, + "grad_norm": 4.694936752319336, + "learning_rate": 1.004223254730749e-06, + "loss": 2.9048, + "num_input_tokens_seen": 278480, + "step": 2165 + }, + { + "epoch": 0.9136842105263158, + "grad_norm": 11.393693923950195, + "learning_rate": 9.5835560463374e-07, + "loss": 3.0062, + "num_input_tokens_seen": 279104, + "step": 2170 + }, + { + "epoch": 0.9157894736842105, + "grad_norm": 6.442198753356934, + "learning_rate": 9.135396139467151e-07, + "loss": 3.1778, + "num_input_tokens_seen": 279680, + "step": 2175 + }, + { + "epoch": 0.9178947368421052, + "grad_norm": 7.988010406494141, + "learning_rate": 8.697772430662859e-07, + "loss": 3.4734, + "num_input_tokens_seen": 280320, + "step": 2180 + }, + { + "epoch": 0.92, + "grad_norm": 6.969496250152588, + "learning_rate": 8.270704063003232e-07, + "loss": 3.112, + "num_input_tokens_seen": 280944, + "step": 2185 + }, + { + "epoch": 0.9221052631578948, + "grad_norm": 9.206937789916992, + "learning_rate": 7.854209717842231e-07, + "loss": 3.3519, + "num_input_tokens_seen": 281456, + "step": 2190 + }, + { + "epoch": 0.9242105263157895, + "grad_norm": 12.280099868774414, + "learning_rate": 7.448307613991734e-07, + "loss": 3.118, + "num_input_tokens_seen": 282032, + "step": 2195 + }, + { + "epoch": 0.9263157894736842, + "grad_norm": 8.12384033203125, + "learning_rate": 7.053015506924748e-07, + "loss": 3.6021, + "num_input_tokens_seen": 282512, + "step": 2200 + }, + { + "epoch": 0.9263157894736842, + "eval_loss": 3.0150835514068604, + "eval_runtime": 20.7343, + "eval_samples_per_second": 24.115, + "eval_steps_per_second": 12.057, + "num_input_tokens_seen": 282512, + "step": 2200 + }, + { + "epoch": 0.9284210526315789, + "grad_norm": 6.287796497344971, + "learning_rate": 6.668350687998565e-07, + "loss": 3.0778, + "num_input_tokens_seen": 283152, + "step": 2205 + }, + { + "epoch": 0.9305263157894736, + "grad_norm": 3.7126476764678955, + "learning_rate": 6.2943299836985e-07, + "loss": 2.407, + "num_input_tokens_seen": 283888, + "step": 2210 + }, + { + "epoch": 0.9326315789473684, + "grad_norm": 8.16634750366211, + "learning_rate": 5.930969754901843e-07, + "loss": 2.6046, + "num_input_tokens_seen": 284544, + "step": 2215 + }, + { + "epoch": 0.9347368421052632, + "grad_norm": 9.50013542175293, + "learning_rate": 5.578285896162106e-07, + "loss": 2.7371, + "num_input_tokens_seen": 285232, + "step": 2220 + }, + { + "epoch": 0.9368421052631579, + "grad_norm": 10.298531532287598, + "learning_rate": 5.236293835013839e-07, + "loss": 2.9548, + "num_input_tokens_seen": 285808, + "step": 2225 + }, + { + "epoch": 0.9389473684210526, + "grad_norm": 4.417308807373047, + "learning_rate": 4.905008531297661e-07, + "loss": 2.6968, + "num_input_tokens_seen": 286448, + "step": 2230 + }, + { + "epoch": 0.9410526315789474, + "grad_norm": 9.6083402633667, + "learning_rate": 4.5844444765059945e-07, + "loss": 3.5563, + "num_input_tokens_seen": 286960, + "step": 2235 + }, + { + "epoch": 0.9431578947368421, + "grad_norm": 2.603585958480835, + "learning_rate": 4.2746156931490754e-07, + "loss": 2.1315, + "num_input_tokens_seen": 287728, + "step": 2240 + }, + { + "epoch": 0.9452631578947368, + "grad_norm": 7.709352016448975, + "learning_rate": 3.9755357341415835e-07, + "loss": 2.559, + "num_input_tokens_seen": 288496, + "step": 2245 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 12.336277961730957, + "learning_rate": 3.687217682209837e-07, + "loss": 3.3418, + "num_input_tokens_seen": 289072, + "step": 2250 + }, + { + "epoch": 0.9494736842105264, + "grad_norm": 6.97348165512085, + "learning_rate": 3.4096741493194197e-07, + "loss": 2.5656, + "num_input_tokens_seen": 290000, + "step": 2255 + }, + { + "epoch": 0.9515789473684211, + "grad_norm": 4.049922466278076, + "learning_rate": 3.142917276123564e-07, + "loss": 2.9834, + "num_input_tokens_seen": 290672, + "step": 2260 + }, + { + "epoch": 0.9536842105263158, + "grad_norm": 7.198146820068359, + "learning_rate": 2.886958731432132e-07, + "loss": 2.6608, + "num_input_tokens_seen": 291520, + "step": 2265 + }, + { + "epoch": 0.9557894736842105, + "grad_norm": 10.9363374710083, + "learning_rate": 2.641809711700999e-07, + "loss": 2.6429, + "num_input_tokens_seen": 292192, + "step": 2270 + }, + { + "epoch": 0.9578947368421052, + "grad_norm": 7.181972026824951, + "learning_rate": 2.4074809405425225e-07, + "loss": 2.8782, + "num_input_tokens_seen": 292880, + "step": 2275 + }, + { + "epoch": 0.96, + "grad_norm": 8.074187278747559, + "learning_rate": 2.1839826682562015e-07, + "loss": 3.6088, + "num_input_tokens_seen": 293488, + "step": 2280 + }, + { + "epoch": 0.9621052631578947, + "grad_norm": 9.010050773620605, + "learning_rate": 1.9713246713805588e-07, + "loss": 3.6666, + "num_input_tokens_seen": 294048, + "step": 2285 + }, + { + "epoch": 0.9642105263157895, + "grad_norm": 7.33037805557251, + "learning_rate": 1.7695162522652353e-07, + "loss": 3.5345, + "num_input_tokens_seen": 294624, + "step": 2290 + }, + { + "epoch": 0.9663157894736842, + "grad_norm": 11.428391456604004, + "learning_rate": 1.578566238664314e-07, + "loss": 3.5267, + "num_input_tokens_seen": 295136, + "step": 2295 + }, + { + "epoch": 0.968421052631579, + "grad_norm": 8.888762474060059, + "learning_rate": 1.3984829833499636e-07, + "loss": 3.2839, + "num_input_tokens_seen": 295680, + "step": 2300 + }, + { + "epoch": 0.968421052631579, + "eval_loss": 3.0145645141601562, + "eval_runtime": 19.0351, + "eval_samples_per_second": 26.267, + "eval_steps_per_second": 13.134, + "num_input_tokens_seen": 295680, + "step": 2300 + }, + { + "epoch": 0.9705263157894737, + "grad_norm": 8.029891014099121, + "learning_rate": 1.229274363747146e-07, + "loss": 2.6669, + "num_input_tokens_seen": 296320, + "step": 2305 + }, + { + "epoch": 0.9726315789473684, + "grad_norm": 7.007322788238525, + "learning_rate": 1.0709477815890601e-07, + "loss": 3.453, + "num_input_tokens_seen": 296864, + "step": 2310 + }, + { + "epoch": 0.9747368421052631, + "grad_norm": 8.719565391540527, + "learning_rate": 9.235101625932885e-08, + "loss": 3.5526, + "num_input_tokens_seen": 297456, + "step": 2315 + }, + { + "epoch": 0.9768421052631578, + "grad_norm": 10.41721248626709, + "learning_rate": 7.869679561589293e-08, + "loss": 3.2058, + "num_input_tokens_seen": 298000, + "step": 2320 + }, + { + "epoch": 0.9789473684210527, + "grad_norm": 8.100107192993164, + "learning_rate": 6.613271350844608e-08, + "loss": 3.3422, + "num_input_tokens_seen": 298576, + "step": 2325 + }, + { + "epoch": 0.9810526315789474, + "grad_norm": 5.350499629974365, + "learning_rate": 5.4659319530636633e-08, + "loss": 1.9689, + "num_input_tokens_seen": 299408, + "step": 2330 + }, + { + "epoch": 0.9831578947368421, + "grad_norm": 7.671536445617676, + "learning_rate": 4.427711556588832e-08, + "loss": 3.3022, + "num_input_tokens_seen": 299936, + "step": 2335 + }, + { + "epoch": 0.9852631578947368, + "grad_norm": 8.892446517944336, + "learning_rate": 3.4986555765434413e-08, + "loss": 3.0765, + "num_input_tokens_seen": 300432, + "step": 2340 + }, + { + "epoch": 0.9873684210526316, + "grad_norm": 7.4744415283203125, + "learning_rate": 2.6788046528461453e-08, + "loss": 3.8128, + "num_input_tokens_seen": 301072, + "step": 2345 + }, + { + "epoch": 0.9894736842105263, + "grad_norm": 7.746840476989746, + "learning_rate": 1.9681946484320644e-08, + "loss": 3.1598, + "num_input_tokens_seen": 301664, + "step": 2350 + }, + { + "epoch": 0.991578947368421, + "grad_norm": 7.51800012588501, + "learning_rate": 1.3668566476848777e-08, + "loss": 3.4258, + "num_input_tokens_seen": 302192, + "step": 2355 + }, + { + "epoch": 0.9936842105263158, + "grad_norm": 14.097600936889648, + "learning_rate": 8.74816955076796e-09, + "loss": 3.2759, + "num_input_tokens_seen": 302720, + "step": 2360 + }, + { + "epoch": 0.9957894736842106, + "grad_norm": 7.83034610748291, + "learning_rate": 4.920970940180958e-09, + "loss": 2.9216, + "num_input_tokens_seen": 303328, + "step": 2365 + }, + { + "epoch": 0.9978947368421053, + "grad_norm": 3.084580421447754, + "learning_rate": 2.1871380591509392e-09, + "loss": 2.5628, + "num_input_tokens_seen": 303936, + "step": 2370 + }, + { + "epoch": 1.0, + "grad_norm": 10.095595359802246, + "learning_rate": 5.467904943851077e-10, + "loss": 2.9564, + "num_input_tokens_seen": 304528, + "step": 2375 + } + ], + "logging_steps": 5, + "max_steps": 2375, + "num_input_tokens_seen": 304528, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5097324650299392.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}