{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0713171383998217, "eval_steps": 100, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.036036036036036036, "grad_norm": 6.189637660980225, "learning_rate": 3.3333333333333333e-06, "loss": 3.5287, "num_input_tokens_seen": 3200, "step": 5 }, { "epoch": 0.07207207207207207, "grad_norm": 5.866844177246094, "learning_rate": 6.666666666666667e-06, "loss": 3.4855, "num_input_tokens_seen": 6560, "step": 10 }, { "epoch": 0.10810810810810811, "grad_norm": 5.849005222320557, "learning_rate": 1e-05, "loss": 3.6108, "num_input_tokens_seen": 9856, "step": 15 }, { "epoch": 0.14414414414414414, "grad_norm": 7.444654941558838, "learning_rate": 1.3333333333333333e-05, "loss": 3.418, "num_input_tokens_seen": 12752, "step": 20 }, { "epoch": 0.18018018018018017, "grad_norm": 5.2968339920043945, "learning_rate": 1.6666666666666667e-05, "loss": 3.5055, "num_input_tokens_seen": 15824, "step": 25 }, { "epoch": 0.21621621621621623, "grad_norm": 6.087192058563232, "learning_rate": 2e-05, "loss": 3.4734, "num_input_tokens_seen": 18672, "step": 30 }, { "epoch": 0.25225225225225223, "grad_norm": 5.243087291717529, "learning_rate": 2.3333333333333336e-05, "loss": 3.0335, "num_input_tokens_seen": 26016, "step": 35 }, { "epoch": 0.2882882882882883, "grad_norm": 5.207272052764893, "learning_rate": 2.6666666666666667e-05, "loss": 3.5633, "num_input_tokens_seen": 30032, "step": 40 }, { "epoch": 0.32432432432432434, "grad_norm": 5.279658317565918, "learning_rate": 3e-05, "loss": 3.7508, "num_input_tokens_seen": 33504, "step": 45 }, { "epoch": 0.36036036036036034, "grad_norm": 4.909541606903076, "learning_rate": 3.3333333333333335e-05, "loss": 3.4502, "num_input_tokens_seen": 37472, "step": 50 }, { "epoch": 0.3963963963963964, "grad_norm": 5.139044284820557, "learning_rate": 3.6666666666666666e-05, "loss": 2.9229, "num_input_tokens_seen": 45008, "step": 55 }, { "epoch": 0.43243243243243246, "grad_norm": 5.718777179718018, "learning_rate": 4e-05, "loss": 3.668, "num_input_tokens_seen": 48832, "step": 60 }, { "epoch": 0.46846846846846846, "grad_norm": 5.036716938018799, "learning_rate": 4.3333333333333334e-05, "loss": 3.2389, "num_input_tokens_seen": 51840, "step": 65 }, { "epoch": 0.5045045045045045, "grad_norm": 5.894410133361816, "learning_rate": 4.666666666666667e-05, "loss": 3.589, "num_input_tokens_seen": 54976, "step": 70 }, { "epoch": 0.5405405405405406, "grad_norm": 6.030157089233398, "learning_rate": 5e-05, "loss": 3.7204, "num_input_tokens_seen": 58192, "step": 75 }, { "epoch": 0.5765765765765766, "grad_norm": 5.202926158905029, "learning_rate": 5.333333333333333e-05, "loss": 2.9049, "num_input_tokens_seen": 65872, "step": 80 }, { "epoch": 0.6126126126126126, "grad_norm": 6.379465103149414, "learning_rate": 5.666666666666667e-05, "loss": 3.6342, "num_input_tokens_seen": 69360, "step": 85 }, { "epoch": 0.6486486486486487, "grad_norm": 4.922969818115234, "learning_rate": 6e-05, "loss": 3.4525, "num_input_tokens_seen": 72656, "step": 90 }, { "epoch": 0.6846846846846847, "grad_norm": 6.9363298416137695, "learning_rate": 6.333333333333333e-05, "loss": 3.4722, "num_input_tokens_seen": 76496, "step": 95 }, { "epoch": 0.7207207207207207, "grad_norm": 5.329838275909424, "learning_rate": 6.666666666666667e-05, "loss": 3.2838, "num_input_tokens_seen": 79760, "step": 100 }, { "epoch": 0.7207207207207207, "eval_loss": 2.9567394256591797, "eval_runtime": 0.3463, "eval_samples_per_second": 14.439, "eval_steps_per_second": 8.663, "num_input_tokens_seen": 79760, "step": 100 }, { "epoch": 0.7567567567567568, "grad_norm": 7.31512451171875, "learning_rate": 7e-05, "loss": 3.0849, "num_input_tokens_seen": 82880, "step": 105 }, { "epoch": 0.7927927927927928, "grad_norm": 5.24735164642334, "learning_rate": 7.333333333333333e-05, "loss": 3.0536, "num_input_tokens_seen": 86176, "step": 110 }, { "epoch": 0.8288288288288288, "grad_norm": 8.290057182312012, "learning_rate": 7.666666666666667e-05, "loss": 2.6354, "num_input_tokens_seen": 93136, "step": 115 }, { "epoch": 0.8648648648648649, "grad_norm": 6.639683246612549, "learning_rate": 8e-05, "loss": 2.8682, "num_input_tokens_seen": 96800, "step": 120 }, { "epoch": 0.9009009009009009, "grad_norm": 6.359157562255859, "learning_rate": 8.333333333333334e-05, "loss": 3.0521, "num_input_tokens_seen": 100000, "step": 125 }, { "epoch": 0.9369369369369369, "grad_norm": 5.462643623352051, "learning_rate": 8.666666666666667e-05, "loss": 3.2956, "num_input_tokens_seen": 103504, "step": 130 }, { "epoch": 0.972972972972973, "grad_norm": 5.68543815612793, "learning_rate": 9e-05, "loss": 3.2966, "num_input_tokens_seen": 106720, "step": 135 }, { "epoch": 1.0144144144144145, "grad_norm": 6.407595634460449, "learning_rate": 9.333333333333334e-05, "loss": 3.6629, "num_input_tokens_seen": 110112, "step": 140 }, { "epoch": 1.0504504504504504, "grad_norm": 8.155672073364258, "learning_rate": 9.666666666666667e-05, "loss": 2.9811, "num_input_tokens_seen": 113184, "step": 145 }, { "epoch": 1.0864864864864865, "grad_norm": 10.235915184020996, "learning_rate": 0.0001, "loss": 2.698, "num_input_tokens_seen": 116976, "step": 150 }, { "epoch": 1.1225225225225226, "grad_norm": 7.20429801940918, "learning_rate": 0.00010333333333333334, "loss": 2.5234, "num_input_tokens_seen": 120672, "step": 155 }, { "epoch": 1.1585585585585585, "grad_norm": 7.340835094451904, "learning_rate": 0.00010666666666666667, "loss": 2.7242, "num_input_tokens_seen": 123680, "step": 160 }, { "epoch": 1.1945945945945946, "grad_norm": 8.580013275146484, "learning_rate": 0.00011000000000000002, "loss": 2.8218, "num_input_tokens_seen": 126624, "step": 165 }, { "epoch": 1.2306306306306307, "grad_norm": 0.891119658946991, "learning_rate": 0.00011333333333333334, "loss": 2.4674, "num_input_tokens_seen": 133728, "step": 170 }, { "epoch": 1.2666666666666666, "grad_norm": 0.8570746779441833, "learning_rate": 0.00011666666666666668, "loss": 2.2086, "num_input_tokens_seen": 140880, "step": 175 }, { "epoch": 1.3027027027027027, "grad_norm": 5.530377388000488, "learning_rate": 0.00012, "loss": 2.4582, "num_input_tokens_seen": 148288, "step": 180 }, { "epoch": 1.3387387387387388, "grad_norm": 5.526784896850586, "learning_rate": 0.00012333333333333334, "loss": 2.9289, "num_input_tokens_seen": 151824, "step": 185 }, { "epoch": 1.3747747747747747, "grad_norm": 9.305526733398438, "learning_rate": 0.00012666666666666666, "loss": 2.6308, "num_input_tokens_seen": 154896, "step": 190 }, { "epoch": 1.4108108108108108, "grad_norm": 8.043120384216309, "learning_rate": 0.00013000000000000002, "loss": 2.8798, "num_input_tokens_seen": 158080, "step": 195 }, { "epoch": 1.4468468468468467, "grad_norm": 7.124425411224365, "learning_rate": 0.00013333333333333334, "loss": 2.9072, "num_input_tokens_seen": 161216, "step": 200 }, { "epoch": 1.4468468468468467, "eval_loss": 3.2079105377197266, "eval_runtime": 0.3, "eval_samples_per_second": 16.667, "eval_steps_per_second": 10.0, "num_input_tokens_seen": 161216, "step": 200 }, { "epoch": 1.4828828828828828, "grad_norm": 7.224998950958252, "learning_rate": 0.00013666666666666666, "loss": 2.8577, "num_input_tokens_seen": 164352, "step": 205 }, { "epoch": 1.518918918918919, "grad_norm": 6.092015743255615, "learning_rate": 0.00014, "loss": 3.4773, "num_input_tokens_seen": 168416, "step": 210 }, { "epoch": 1.554954954954955, "grad_norm": 0.6125243306159973, "learning_rate": 0.00014333333333333334, "loss": 2.2967, "num_input_tokens_seen": 176656, "step": 215 }, { "epoch": 1.590990990990991, "grad_norm": 10.359404563903809, "learning_rate": 0.00014666666666666666, "loss": 2.9172, "num_input_tokens_seen": 179744, "step": 220 }, { "epoch": 1.627027027027027, "grad_norm": 8.786321640014648, "learning_rate": 0.00015000000000000001, "loss": 2.795, "num_input_tokens_seen": 182864, "step": 225 }, { "epoch": 1.663063063063063, "grad_norm": 7.069016933441162, "learning_rate": 0.00015333333333333334, "loss": 2.8839, "num_input_tokens_seen": 185952, "step": 230 }, { "epoch": 1.699099099099099, "grad_norm": 8.044764518737793, "learning_rate": 0.00015666666666666666, "loss": 2.9845, "num_input_tokens_seen": 189120, "step": 235 }, { "epoch": 1.7351351351351352, "grad_norm": 4.376863956451416, "learning_rate": 0.00016, "loss": 2.5886, "num_input_tokens_seen": 193440, "step": 240 }, { "epoch": 1.7711711711711713, "grad_norm": 7.085455417633057, "learning_rate": 0.00016333333333333334, "loss": 3.0422, "num_input_tokens_seen": 197248, "step": 245 }, { "epoch": 1.8072072072072072, "grad_norm": 8.856867790222168, "learning_rate": 0.0001666666666666667, "loss": 2.9034, "num_input_tokens_seen": 200096, "step": 250 }, { "epoch": 1.8432432432432433, "grad_norm": 6.714722633361816, "learning_rate": 0.00017, "loss": 2.5464, "num_input_tokens_seen": 203792, "step": 255 }, { "epoch": 1.8792792792792792, "grad_norm": 8.368040084838867, "learning_rate": 0.00017333333333333334, "loss": 3.1094, "num_input_tokens_seen": 206784, "step": 260 }, { "epoch": 1.9153153153153153, "grad_norm": 7.750094413757324, "learning_rate": 0.00017666666666666666, "loss": 2.901, "num_input_tokens_seen": 210064, "step": 265 }, { "epoch": 1.9513513513513514, "grad_norm": 7.384438991546631, "learning_rate": 0.00018, "loss": 2.8172, "num_input_tokens_seen": 213520, "step": 270 }, { "epoch": 1.9873873873873875, "grad_norm": 5.683988571166992, "learning_rate": 0.00018333333333333334, "loss": 2.8409, "num_input_tokens_seen": 216752, "step": 275 }, { "epoch": 2.028828828828829, "grad_norm": 6.135600566864014, "learning_rate": 0.0001866666666666667, "loss": 2.6342, "num_input_tokens_seen": 220320, "step": 280 }, { "epoch": 2.064864864864865, "grad_norm": 14.494550704956055, "learning_rate": 0.00019, "loss": 1.7648, "num_input_tokens_seen": 223808, "step": 285 }, { "epoch": 2.1009009009009008, "grad_norm": 7.6210551261901855, "learning_rate": 0.00019333333333333333, "loss": 2.0011, "num_input_tokens_seen": 227280, "step": 290 }, { "epoch": 2.136936936936937, "grad_norm": 6.052605628967285, "learning_rate": 0.00019666666666666666, "loss": 2.0263, "num_input_tokens_seen": 230656, "step": 295 }, { "epoch": 2.172972972972973, "grad_norm": 9.172389030456543, "learning_rate": 0.0002, "loss": 1.6316, "num_input_tokens_seen": 233776, "step": 300 }, { "epoch": 2.172972972972973, "eval_loss": 3.5101757049560547, "eval_runtime": 0.2957, "eval_samples_per_second": 16.907, "eval_steps_per_second": 10.144, "num_input_tokens_seen": 233776, "step": 300 }, { "epoch": 2.209009009009009, "grad_norm": 7.576519966125488, "learning_rate": 0.00019991889981715698, "loss": 2.0076, "num_input_tokens_seen": 237312, "step": 305 }, { "epoch": 2.245045045045045, "grad_norm": 6.638524532318115, "learning_rate": 0.00019967573081342103, "loss": 1.9773, "num_input_tokens_seen": 240736, "step": 310 }, { "epoch": 2.281081081081081, "grad_norm": 4.5822906494140625, "learning_rate": 0.0001992708874098054, "loss": 1.6004, "num_input_tokens_seen": 248352, "step": 315 }, { "epoch": 2.317117117117117, "grad_norm": 11.44710636138916, "learning_rate": 0.00019870502626379127, "loss": 2.0804, "num_input_tokens_seen": 252048, "step": 320 }, { "epoch": 2.353153153153153, "grad_norm": 11.596573829650879, "learning_rate": 0.00019797906520422677, "loss": 1.8889, "num_input_tokens_seen": 255056, "step": 325 }, { "epoch": 2.389189189189189, "grad_norm": 7.399219036102295, "learning_rate": 0.0001970941817426052, "loss": 1.6757, "num_input_tokens_seen": 262768, "step": 330 }, { "epoch": 2.4252252252252253, "grad_norm": 9.732112884521484, "learning_rate": 0.00019605181116313724, "loss": 1.8391, "num_input_tokens_seen": 265616, "step": 335 }, { "epoch": 2.4612612612612614, "grad_norm": 11.036967277526855, "learning_rate": 0.00019485364419471454, "loss": 2.1077, "num_input_tokens_seen": 269040, "step": 340 }, { "epoch": 2.4972972972972975, "grad_norm": 8.808915138244629, "learning_rate": 0.0001935016242685415, "loss": 1.7143, "num_input_tokens_seen": 272256, "step": 345 }, { "epoch": 2.533333333333333, "grad_norm": 8.189955711364746, "learning_rate": 0.00019199794436588243, "loss": 1.7009, "num_input_tokens_seen": 275312, "step": 350 }, { "epoch": 2.5693693693693693, "grad_norm": 7.398507118225098, "learning_rate": 0.00019034504346103823, "loss": 2.174, "num_input_tokens_seen": 278928, "step": 355 }, { "epoch": 2.6054054054054054, "grad_norm": 10.141551971435547, "learning_rate": 0.000188545602565321, "loss": 2.0016, "num_input_tokens_seen": 282256, "step": 360 }, { "epoch": 2.6414414414414416, "grad_norm": 7.210516929626465, "learning_rate": 0.00018660254037844388, "loss": 1.9766, "num_input_tokens_seen": 285616, "step": 365 }, { "epoch": 2.6774774774774777, "grad_norm": 9.60509967803955, "learning_rate": 0.0001845190085543795, "loss": 2.034, "num_input_tokens_seen": 292624, "step": 370 }, { "epoch": 2.7135135135135133, "grad_norm": 7.727905750274658, "learning_rate": 0.00018229838658936564, "loss": 1.9747, "num_input_tokens_seen": 295760, "step": 375 }, { "epoch": 2.7495495495495494, "grad_norm": 7.084625244140625, "learning_rate": 0.00017994427634035015, "loss": 2.2312, "num_input_tokens_seen": 298880, "step": 380 }, { "epoch": 2.7855855855855856, "grad_norm": 8.565605163574219, "learning_rate": 0.00017746049618276545, "loss": 1.7228, "num_input_tokens_seen": 305968, "step": 385 }, { "epoch": 2.8216216216216217, "grad_norm": 7.990584850311279, "learning_rate": 0.00017485107481711012, "loss": 2.0828, "num_input_tokens_seen": 309680, "step": 390 }, { "epoch": 2.857657657657658, "grad_norm": 8.152718544006348, "learning_rate": 0.00017212024473438147, "loss": 2.2638, "num_input_tokens_seen": 312752, "step": 395 }, { "epoch": 2.8936936936936934, "grad_norm": 8.080291748046875, "learning_rate": 0.00016927243535095997, "loss": 2.1471, "num_input_tokens_seen": 316096, "step": 400 }, { "epoch": 2.8936936936936934, "eval_loss": 3.6192080974578857, "eval_runtime": 0.2961, "eval_samples_per_second": 16.888, "eval_steps_per_second": 10.133, "num_input_tokens_seen": 316096, "step": 400 }, { "epoch": 0.02888344105192779, "grad_norm": 2.729156255722046, "learning_rate": 0.00019997110279082795, "loss": 2.2111, "num_input_tokens_seen": 328800, "step": 405 }, { "epoch": 0.0292400267439269, "grad_norm": 1.5953912734985352, "learning_rate": 0.00019996828529806075, "loss": 1.7657, "num_input_tokens_seen": 342272, "step": 410 }, { "epoch": 0.02959661243592601, "grad_norm": 1.2057961225509644, "learning_rate": 0.0001999653367874579, "loss": 1.9321, "num_input_tokens_seen": 354880, "step": 415 }, { "epoch": 0.02995319812792512, "grad_norm": 1.0265964269638062, "learning_rate": 0.00019996225726288367, "loss": 1.7011, "num_input_tokens_seen": 369008, "step": 420 }, { "epoch": 0.030309783819924226, "grad_norm": 1.1587436199188232, "learning_rate": 0.00019995904672837406, "loss": 1.8462, "num_input_tokens_seen": 382416, "step": 425 }, { "epoch": 0.030666369511923332, "grad_norm": 1.5749984979629517, "learning_rate": 0.00019995570518813679, "loss": 1.6671, "num_input_tokens_seen": 394608, "step": 430 }, { "epoch": 0.031022955203922443, "grad_norm": 0.9112032055854797, "learning_rate": 0.00019995223264655126, "loss": 1.6305, "num_input_tokens_seen": 408576, "step": 435 }, { "epoch": 0.03137954089592155, "grad_norm": 1.300126075744629, "learning_rate": 0.00019994862910816856, "loss": 1.5954, "num_input_tokens_seen": 422848, "step": 440 }, { "epoch": 0.03173612658792066, "grad_norm": 0.9108946323394775, "learning_rate": 0.00019994489457771146, "loss": 1.6211, "num_input_tokens_seen": 437920, "step": 445 }, { "epoch": 0.03209271227991977, "grad_norm": 1.2991251945495605, "learning_rate": 0.00019994102906007447, "loss": 1.8802, "num_input_tokens_seen": 449264, "step": 450 }, { "epoch": 0.032449297971918874, "grad_norm": 1.0694671869277954, "learning_rate": 0.0001999370325603236, "loss": 1.828, "num_input_tokens_seen": 461680, "step": 455 }, { "epoch": 0.032805883663917984, "grad_norm": 0.929099440574646, "learning_rate": 0.00019993290508369673, "loss": 1.5744, "num_input_tokens_seen": 475520, "step": 460 }, { "epoch": 0.033162469355917094, "grad_norm": 1.2470440864562988, "learning_rate": 0.00019992864663560329, "loss": 1.7386, "num_input_tokens_seen": 487968, "step": 465 }, { "epoch": 0.033519055047916205, "grad_norm": 0.9236185550689697, "learning_rate": 0.00019992425722162435, "loss": 1.5627, "num_input_tokens_seen": 501696, "step": 470 }, { "epoch": 0.03387564073991531, "grad_norm": 1.1745721101760864, "learning_rate": 0.00019991973684751267, "loss": 1.5464, "num_input_tokens_seen": 514864, "step": 475 }, { "epoch": 0.03423222643191442, "grad_norm": 1.0948406457901, "learning_rate": 0.0001999150855191926, "loss": 1.5185, "num_input_tokens_seen": 526304, "step": 480 }, { "epoch": 0.03458881212391353, "grad_norm": 0.8970179557800293, "learning_rate": 0.00019991030324276022, "loss": 1.7223, "num_input_tokens_seen": 537648, "step": 485 }, { "epoch": 0.03494539781591264, "grad_norm": 1.3912155628204346, "learning_rate": 0.00019990539002448307, "loss": 1.5333, "num_input_tokens_seen": 550144, "step": 490 }, { "epoch": 0.03530198350791174, "grad_norm": 1.0233279466629028, "learning_rate": 0.0001999003458708004, "loss": 1.695, "num_input_tokens_seen": 564336, "step": 495 }, { "epoch": 0.03565856919991085, "grad_norm": 1.1323829889297485, "learning_rate": 0.00019989517078832308, "loss": 1.864, "num_input_tokens_seen": 578144, "step": 500 }, { "epoch": 0.03565856919991085, "eval_loss": 1.5353425741195679, "eval_runtime": 25.9763, "eval_samples_per_second": 17.323, "eval_steps_per_second": 8.662, "num_input_tokens_seen": 578144, "step": 500 }, { "epoch": 0.03601515489190996, "grad_norm": 1.0973526239395142, "learning_rate": 0.0001998898647838335, "loss": 1.5819, "num_input_tokens_seen": 589952, "step": 505 }, { "epoch": 0.036371740583909073, "grad_norm": 1.1353367567062378, "learning_rate": 0.0001998844278642857, "loss": 1.6855, "num_input_tokens_seen": 604288, "step": 510 }, { "epoch": 0.03672832627590818, "grad_norm": 0.9509443640708923, "learning_rate": 0.0001998788600368053, "loss": 1.6287, "num_input_tokens_seen": 617344, "step": 515 }, { "epoch": 0.03708491196790729, "grad_norm": 1.1190470457077026, "learning_rate": 0.0001998731613086894, "loss": 1.5082, "num_input_tokens_seen": 630464, "step": 520 }, { "epoch": 0.0374414976599064, "grad_norm": 0.9811726808547974, "learning_rate": 0.00019986733168740676, "loss": 1.6331, "num_input_tokens_seen": 640992, "step": 525 }, { "epoch": 0.03779808335190551, "grad_norm": 0.9463362693786621, "learning_rate": 0.00019986137118059767, "loss": 1.7424, "num_input_tokens_seen": 652944, "step": 530 }, { "epoch": 0.03815466904390461, "grad_norm": 0.8193320631980896, "learning_rate": 0.00019985527979607385, "loss": 1.6713, "num_input_tokens_seen": 667408, "step": 535 }, { "epoch": 0.03851125473590372, "grad_norm": 0.7664808630943298, "learning_rate": 0.00019984905754181874, "loss": 1.5241, "num_input_tokens_seen": 681968, "step": 540 }, { "epoch": 0.03886784042790283, "grad_norm": 0.7736773490905762, "learning_rate": 0.00019984270442598712, "loss": 1.6653, "num_input_tokens_seen": 695968, "step": 545 }, { "epoch": 0.03922442611990194, "grad_norm": 0.8348143696784973, "learning_rate": 0.00019983622045690536, "loss": 1.6706, "num_input_tokens_seen": 709040, "step": 550 }, { "epoch": 0.039581011811901046, "grad_norm": 0.938188374042511, "learning_rate": 0.00019982960564307129, "loss": 1.4911, "num_input_tokens_seen": 721712, "step": 555 }, { "epoch": 0.039937597503900156, "grad_norm": 1.1161514520645142, "learning_rate": 0.0001998228599931543, "loss": 1.6092, "num_input_tokens_seen": 734896, "step": 560 }, { "epoch": 0.040294183195899266, "grad_norm": 1.0454977750778198, "learning_rate": 0.00019981598351599515, "loss": 1.5464, "num_input_tokens_seen": 749472, "step": 565 }, { "epoch": 0.04065076888789837, "grad_norm": 1.1589754819869995, "learning_rate": 0.00019980897622060614, "loss": 1.6342, "num_input_tokens_seen": 761968, "step": 570 }, { "epoch": 0.04100735457989748, "grad_norm": 1.0694705247879028, "learning_rate": 0.00019980183811617094, "loss": 1.6302, "num_input_tokens_seen": 774112, "step": 575 }, { "epoch": 0.04136394027189659, "grad_norm": 1.0234978199005127, "learning_rate": 0.00019979456921204479, "loss": 1.7415, "num_input_tokens_seen": 786016, "step": 580 }, { "epoch": 0.0417205259638957, "grad_norm": 0.8734306693077087, "learning_rate": 0.00019978716951775418, "loss": 1.4202, "num_input_tokens_seen": 797520, "step": 585 }, { "epoch": 0.042077111655894804, "grad_norm": 1.0587542057037354, "learning_rate": 0.0001997796390429972, "loss": 1.4963, "num_input_tokens_seen": 810448, "step": 590 }, { "epoch": 0.042433697347893914, "grad_norm": 1.0301488637924194, "learning_rate": 0.00019977197779764313, "loss": 1.7623, "num_input_tokens_seen": 824176, "step": 595 }, { "epoch": 0.042790283039893025, "grad_norm": 0.8244880437850952, "learning_rate": 0.00019976418579173286, "loss": 1.5259, "num_input_tokens_seen": 837264, "step": 600 }, { "epoch": 0.042790283039893025, "eval_loss": 1.4980205297470093, "eval_runtime": 25.9775, "eval_samples_per_second": 17.323, "eval_steps_per_second": 8.661, "num_input_tokens_seen": 837264, "step": 600 }, { "epoch": 0.043146868731892135, "grad_norm": 0.9113627076148987, "learning_rate": 0.0001997562630354785, "loss": 1.7852, "num_input_tokens_seen": 849456, "step": 605 }, { "epoch": 0.04350345442389124, "grad_norm": 0.8031953573226929, "learning_rate": 0.0001997482095392636, "loss": 1.4472, "num_input_tokens_seen": 861488, "step": 610 }, { "epoch": 0.04386004011589035, "grad_norm": 1.3095686435699463, "learning_rate": 0.00019974002531364293, "loss": 1.6599, "num_input_tokens_seen": 872416, "step": 615 }, { "epoch": 0.04421662580788946, "grad_norm": 1.1288928985595703, "learning_rate": 0.0001997317103693428, "loss": 1.5452, "num_input_tokens_seen": 884592, "step": 620 }, { "epoch": 0.04457321149988857, "grad_norm": 0.909373939037323, "learning_rate": 0.00019972326471726063, "loss": 1.5212, "num_input_tokens_seen": 897120, "step": 625 }, { "epoch": 0.04492979719188767, "grad_norm": 1.1868454217910767, "learning_rate": 0.00019971468836846532, "loss": 1.5916, "num_input_tokens_seen": 907232, "step": 630 }, { "epoch": 0.04528638288388678, "grad_norm": 0.9705728888511658, "learning_rate": 0.00019970598133419695, "loss": 1.5263, "num_input_tokens_seen": 920480, "step": 635 }, { "epoch": 0.045642968575885894, "grad_norm": 1.021158218383789, "learning_rate": 0.0001996971436258669, "loss": 1.5248, "num_input_tokens_seen": 932960, "step": 640 }, { "epoch": 0.045999554267885004, "grad_norm": 0.9590160846710205, "learning_rate": 0.0001996881752550578, "loss": 1.4684, "num_input_tokens_seen": 946832, "step": 645 }, { "epoch": 0.04635613995988411, "grad_norm": 0.7721974849700928, "learning_rate": 0.00019967907623352361, "loss": 1.663, "num_input_tokens_seen": 960880, "step": 650 }, { "epoch": 0.04671272565188322, "grad_norm": 0.913921058177948, "learning_rate": 0.0001996698465731894, "loss": 1.7776, "num_input_tokens_seen": 973312, "step": 655 }, { "epoch": 0.04706931134388233, "grad_norm": 0.7938061356544495, "learning_rate": 0.00019966048628615148, "loss": 1.5013, "num_input_tokens_seen": 986944, "step": 660 }, { "epoch": 0.04742589703588144, "grad_norm": 1.2008130550384521, "learning_rate": 0.00019965099538467746, "loss": 1.5296, "num_input_tokens_seen": 1001280, "step": 665 }, { "epoch": 0.04778248272788054, "grad_norm": 0.9730687737464905, "learning_rate": 0.00019964137388120602, "loss": 1.7029, "num_input_tokens_seen": 1012784, "step": 670 }, { "epoch": 0.04813906841987965, "grad_norm": 1.1330794095993042, "learning_rate": 0.00019963162178834705, "loss": 1.4998, "num_input_tokens_seen": 1024640, "step": 675 }, { "epoch": 0.04849565411187876, "grad_norm": 0.9690409302711487, "learning_rate": 0.00019962173911888158, "loss": 1.4893, "num_input_tokens_seen": 1037392, "step": 680 }, { "epoch": 0.04885223980387787, "grad_norm": 1.1701139211654663, "learning_rate": 0.00019961172588576175, "loss": 1.5194, "num_input_tokens_seen": 1049264, "step": 685 }, { "epoch": 0.049208825495876976, "grad_norm": 0.9897874593734741, "learning_rate": 0.0001996015821021109, "loss": 1.4361, "num_input_tokens_seen": 1062144, "step": 690 }, { "epoch": 0.049565411187876086, "grad_norm": 0.9895176887512207, "learning_rate": 0.00019959130778122338, "loss": 1.6835, "num_input_tokens_seen": 1074480, "step": 695 }, { "epoch": 0.0499219968798752, "grad_norm": 0.7624967098236084, "learning_rate": 0.00019958090293656464, "loss": 1.6569, "num_input_tokens_seen": 1088624, "step": 700 }, { "epoch": 0.0499219968798752, "eval_loss": 1.4771517515182495, "eval_runtime": 25.9347, "eval_samples_per_second": 17.351, "eval_steps_per_second": 8.676, "num_input_tokens_seen": 1088624, "step": 700 }, { "epoch": 0.0502785825718743, "grad_norm": 1.0006903409957886, "learning_rate": 0.00019957036758177124, "loss": 1.763, "num_input_tokens_seen": 1099952, "step": 705 }, { "epoch": 0.05063516826387341, "grad_norm": 0.8108828067779541, "learning_rate": 0.00019955970173065075, "loss": 1.4925, "num_input_tokens_seen": 1113952, "step": 710 }, { "epoch": 0.05099175395587252, "grad_norm": 1.0443133115768433, "learning_rate": 0.00019954890539718173, "loss": 1.5073, "num_input_tokens_seen": 1129488, "step": 715 }, { "epoch": 0.05134833964787163, "grad_norm": 0.6349768042564392, "learning_rate": 0.00019953797859551385, "loss": 1.825, "num_input_tokens_seen": 1145296, "step": 720 }, { "epoch": 0.051704925339870735, "grad_norm": 1.1168112754821777, "learning_rate": 0.00019952692133996767, "loss": 1.5229, "num_input_tokens_seen": 1157936, "step": 725 }, { "epoch": 0.052061511031869845, "grad_norm": 0.8870391249656677, "learning_rate": 0.00019951573364503474, "loss": 1.5875, "num_input_tokens_seen": 1171040, "step": 730 }, { "epoch": 0.052418096723868955, "grad_norm": 1.017572283744812, "learning_rate": 0.00019950441552537768, "loss": 1.6827, "num_input_tokens_seen": 1185616, "step": 735 }, { "epoch": 0.052774682415868066, "grad_norm": 0.7321210503578186, "learning_rate": 0.00019949296699582983, "loss": 1.633, "num_input_tokens_seen": 1197824, "step": 740 }, { "epoch": 0.05313126810786717, "grad_norm": 0.8157454133033752, "learning_rate": 0.00019948138807139563, "loss": 1.6336, "num_input_tokens_seen": 1209120, "step": 745 }, { "epoch": 0.05348785379986628, "grad_norm": 1.0458636283874512, "learning_rate": 0.00019946967876725036, "loss": 1.6223, "num_input_tokens_seen": 1220112, "step": 750 }, { "epoch": 0.05384443949186539, "grad_norm": 1.1940375566482544, "learning_rate": 0.00019945783909874014, "loss": 1.7688, "num_input_tokens_seen": 1231328, "step": 755 }, { "epoch": 0.0542010251838645, "grad_norm": 0.870665431022644, "learning_rate": 0.00019944586908138197, "loss": 1.7899, "num_input_tokens_seen": 1245488, "step": 760 }, { "epoch": 0.0545576108758636, "grad_norm": 1.010379433631897, "learning_rate": 0.0001994337687308637, "loss": 1.5405, "num_input_tokens_seen": 1257792, "step": 765 }, { "epoch": 0.054914196567862714, "grad_norm": 0.7964643239974976, "learning_rate": 0.00019942153806304394, "loss": 1.5962, "num_input_tokens_seen": 1271088, "step": 770 }, { "epoch": 0.055270782259861824, "grad_norm": 0.8049175143241882, "learning_rate": 0.00019940917709395216, "loss": 1.4369, "num_input_tokens_seen": 1282912, "step": 775 }, { "epoch": 0.055627367951860934, "grad_norm": 1.1336143016815186, "learning_rate": 0.00019939668583978857, "loss": 1.5273, "num_input_tokens_seen": 1295904, "step": 780 }, { "epoch": 0.05598395364386004, "grad_norm": 1.0602260828018188, "learning_rate": 0.00019938406431692412, "loss": 1.6961, "num_input_tokens_seen": 1308816, "step": 785 }, { "epoch": 0.05634053933585915, "grad_norm": 0.991144597530365, "learning_rate": 0.0001993713125419005, "loss": 1.6196, "num_input_tokens_seen": 1320768, "step": 790 }, { "epoch": 0.05669712502785826, "grad_norm": 1.162597417831421, "learning_rate": 0.00019935843053143013, "loss": 1.5991, "num_input_tokens_seen": 1333520, "step": 795 }, { "epoch": 0.05705371071985737, "grad_norm": 1.01413893699646, "learning_rate": 0.0001993454183023961, "loss": 1.6086, "num_input_tokens_seen": 1345120, "step": 800 }, { "epoch": 0.05705371071985737, "eval_loss": 1.4568166732788086, "eval_runtime": 25.9764, "eval_samples_per_second": 17.323, "eval_steps_per_second": 8.662, "num_input_tokens_seen": 1345120, "step": 800 }, { "epoch": 0.05741029641185647, "grad_norm": 1.002255916595459, "learning_rate": 0.00019933227587185214, "loss": 1.5361, "num_input_tokens_seen": 1357760, "step": 805 }, { "epoch": 0.05776688210385558, "grad_norm": 0.9191799759864807, "learning_rate": 0.00019931900325702263, "loss": 1.4695, "num_input_tokens_seen": 1369728, "step": 810 }, { "epoch": 0.05812346779585469, "grad_norm": 1.1915909051895142, "learning_rate": 0.00019930560047530264, "loss": 1.536, "num_input_tokens_seen": 1381664, "step": 815 }, { "epoch": 0.0584800534878538, "grad_norm": 1.1932401657104492, "learning_rate": 0.00019929206754425768, "loss": 1.6324, "num_input_tokens_seen": 1397072, "step": 820 }, { "epoch": 0.058836639179852906, "grad_norm": 0.7846195697784424, "learning_rate": 0.00019927840448162397, "loss": 1.5848, "num_input_tokens_seen": 1411456, "step": 825 }, { "epoch": 0.05919322487185202, "grad_norm": 0.9281952381134033, "learning_rate": 0.0001992646113053083, "loss": 1.5532, "num_input_tokens_seen": 1423040, "step": 830 }, { "epoch": 0.05954981056385113, "grad_norm": 0.8611142039299011, "learning_rate": 0.00019925068803338785, "loss": 1.4567, "num_input_tokens_seen": 1434832, "step": 835 }, { "epoch": 0.05990639625585024, "grad_norm": 0.900896430015564, "learning_rate": 0.00019923663468411038, "loss": 1.423, "num_input_tokens_seen": 1449904, "step": 840 }, { "epoch": 0.06026298194784934, "grad_norm": 0.7716097831726074, "learning_rate": 0.00019922245127589413, "loss": 1.6289, "num_input_tokens_seen": 1462224, "step": 845 }, { "epoch": 0.06061956763984845, "grad_norm": 1.0148396492004395, "learning_rate": 0.00019920813782732785, "loss": 1.5684, "num_input_tokens_seen": 1473344, "step": 850 }, { "epoch": 0.06097615333184756, "grad_norm": 1.283855676651001, "learning_rate": 0.0001991936943571706, "loss": 1.3161, "num_input_tokens_seen": 1487152, "step": 855 }, { "epoch": 0.061332739023846665, "grad_norm": 0.8233208656311035, "learning_rate": 0.0001991791208843519, "loss": 1.526, "num_input_tokens_seen": 1500496, "step": 860 }, { "epoch": 0.061689324715845775, "grad_norm": 1.0969665050506592, "learning_rate": 0.00019916441742797168, "loss": 1.5244, "num_input_tokens_seen": 1514480, "step": 865 }, { "epoch": 0.062045910407844886, "grad_norm": 0.8461065292358398, "learning_rate": 0.0001991495840073002, "loss": 1.446, "num_input_tokens_seen": 1527920, "step": 870 }, { "epoch": 0.062402496099843996, "grad_norm": 0.8559203147888184, "learning_rate": 0.00019913462064177805, "loss": 1.4903, "num_input_tokens_seen": 1540000, "step": 875 }, { "epoch": 0.0627590817918431, "grad_norm": 0.9753026366233826, "learning_rate": 0.00019911952735101614, "loss": 1.5945, "num_input_tokens_seen": 1550800, "step": 880 }, { "epoch": 0.06311566748384222, "grad_norm": 1.038980484008789, "learning_rate": 0.0001991043041547956, "loss": 1.7116, "num_input_tokens_seen": 1562272, "step": 885 }, { "epoch": 0.06347225317584132, "grad_norm": 0.8071208000183105, "learning_rate": 0.00019908895107306792, "loss": 1.6224, "num_input_tokens_seen": 1574672, "step": 890 }, { "epoch": 0.06382883886784042, "grad_norm": 0.9644535183906555, "learning_rate": 0.00019907346812595474, "loss": 1.4743, "num_input_tokens_seen": 1587840, "step": 895 }, { "epoch": 0.06418542455983954, "grad_norm": 0.9939907193183899, "learning_rate": 0.00019905785533374788, "loss": 1.6472, "num_input_tokens_seen": 1600928, "step": 900 }, { "epoch": 0.06418542455983954, "eval_loss": 1.4476536512374878, "eval_runtime": 25.9327, "eval_samples_per_second": 17.353, "eval_steps_per_second": 8.676, "num_input_tokens_seen": 1600928, "step": 900 }, { "epoch": 0.06454201025183864, "grad_norm": 1.1221510171890259, "learning_rate": 0.0001990421127169094, "loss": 1.58, "num_input_tokens_seen": 1614560, "step": 905 }, { "epoch": 0.06489859594383775, "grad_norm": 0.6738218665122986, "learning_rate": 0.0001990262402960715, "loss": 1.5166, "num_input_tokens_seen": 1628448, "step": 910 }, { "epoch": 0.06525518163583686, "grad_norm": 1.1361334323883057, "learning_rate": 0.00019901023809203644, "loss": 1.3501, "num_input_tokens_seen": 1639888, "step": 915 }, { "epoch": 0.06561176732783597, "grad_norm": 0.86082923412323, "learning_rate": 0.00019899410612577662, "loss": 1.6196, "num_input_tokens_seen": 1653776, "step": 920 }, { "epoch": 0.06596835301983509, "grad_norm": 1.001594066619873, "learning_rate": 0.0001989778444184345, "loss": 1.7417, "num_input_tokens_seen": 1667536, "step": 925 }, { "epoch": 0.06632493871183419, "grad_norm": 0.9237222671508789, "learning_rate": 0.00019896145299132262, "loss": 1.5764, "num_input_tokens_seen": 1678976, "step": 930 }, { "epoch": 0.06668152440383329, "grad_norm": 1.0402158498764038, "learning_rate": 0.00019894493186592345, "loss": 1.5937, "num_input_tokens_seen": 1691424, "step": 935 }, { "epoch": 0.06703811009583241, "grad_norm": 0.9966005086898804, "learning_rate": 0.00019892828106388946, "loss": 1.376, "num_input_tokens_seen": 1704144, "step": 940 }, { "epoch": 0.06739469578783151, "grad_norm": 0.9723909497261047, "learning_rate": 0.00019891150060704312, "loss": 1.5858, "num_input_tokens_seen": 1716832, "step": 945 }, { "epoch": 0.06775128147983062, "grad_norm": 0.849105715751648, "learning_rate": 0.00019889459051737677, "loss": 1.5743, "num_input_tokens_seen": 1729408, "step": 950 }, { "epoch": 0.06810786717182973, "grad_norm": 0.718659520149231, "learning_rate": 0.0001988775508170527, "loss": 1.5805, "num_input_tokens_seen": 1741456, "step": 955 }, { "epoch": 0.06846445286382884, "grad_norm": 1.210124135017395, "learning_rate": 0.00019886038152840306, "loss": 1.2253, "num_input_tokens_seen": 1752928, "step": 960 }, { "epoch": 0.06882103855582795, "grad_norm": 0.8950662612915039, "learning_rate": 0.00019884308267392972, "loss": 1.5651, "num_input_tokens_seen": 1766736, "step": 965 }, { "epoch": 0.06917762424782706, "grad_norm": 0.7714616060256958, "learning_rate": 0.00019882565427630454, "loss": 1.5761, "num_input_tokens_seen": 1778432, "step": 970 }, { "epoch": 0.06953420993982616, "grad_norm": 0.9974557757377625, "learning_rate": 0.0001988080963583691, "loss": 1.5444, "num_input_tokens_seen": 1790368, "step": 975 }, { "epoch": 0.06989079563182528, "grad_norm": 1.1878458261489868, "learning_rate": 0.0001987904089431346, "loss": 1.7706, "num_input_tokens_seen": 1802512, "step": 980 }, { "epoch": 0.07024738132382438, "grad_norm": 0.9763291478157043, "learning_rate": 0.00019877259205378214, "loss": 1.4229, "num_input_tokens_seen": 1813488, "step": 985 }, { "epoch": 0.07060396701582348, "grad_norm": 1.0481489896774292, "learning_rate": 0.00019875464571366243, "loss": 1.3622, "num_input_tokens_seen": 1826064, "step": 990 }, { "epoch": 0.0709605527078226, "grad_norm": 0.9758256077766418, "learning_rate": 0.00019873656994629575, "loss": 1.3448, "num_input_tokens_seen": 1837856, "step": 995 }, { "epoch": 0.0713171383998217, "grad_norm": 0.7545611262321472, "learning_rate": 0.0001987183647753722, "loss": 1.5894, "num_input_tokens_seen": 1850160, "step": 1000 }, { "epoch": 0.0713171383998217, "eval_loss": 1.4465608596801758, "eval_runtime": 25.9554, "eval_samples_per_second": 17.337, "eval_steps_per_second": 8.669, "num_input_tokens_seen": 1850160, "step": 1000 } ], "logging_steps": 5, "max_steps": 14021, "num_input_tokens_seen": 1850160, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.354467259154432e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }