{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 268, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0037313432835820895, "grad_norm": 11.718372519156443, "learning_rate": 2e-07, "loss": 1.5946, "step": 1 }, { "epoch": 0.007462686567164179, "grad_norm": 12.194885050328885, "learning_rate": 4e-07, "loss": 1.6052, "step": 2 }, { "epoch": 0.011194029850746268, "grad_norm": 13.01588928537949, "learning_rate": 6e-07, "loss": 1.7294, "step": 3 }, { "epoch": 0.014925373134328358, "grad_norm": 12.313961908592717, "learning_rate": 8e-07, "loss": 1.5217, "step": 4 }, { "epoch": 0.018656716417910446, "grad_norm": 12.37356816651012, "learning_rate": 1e-06, "loss": 1.3742, "step": 5 }, { "epoch": 0.022388059701492536, "grad_norm": 11.427627877521694, "learning_rate": 1.2e-06, "loss": 1.4112, "step": 6 }, { "epoch": 0.026119402985074626, "grad_norm": 10.518944089588336, "learning_rate": 1.4e-06, "loss": 1.6147, "step": 7 }, { "epoch": 0.029850746268656716, "grad_norm": 8.76984656973648, "learning_rate": 1.6e-06, "loss": 1.4108, "step": 8 }, { "epoch": 0.033582089552238806, "grad_norm": 8.084328772350803, "learning_rate": 1.8e-06, "loss": 1.2484, "step": 9 }, { "epoch": 0.03731343283582089, "grad_norm": 9.697158992831765, "learning_rate": 2e-06, "loss": 1.23, "step": 10 }, { "epoch": 0.041044776119402986, "grad_norm": 18.09050975452378, "learning_rate": 1.9999821640202585e-06, "loss": 1.2535, "step": 11 }, { "epoch": 0.04477611940298507, "grad_norm": 18.87482868101649, "learning_rate": 1.9999286567172775e-06, "loss": 1.7845, "step": 12 }, { "epoch": 0.048507462686567165, "grad_norm": 16.46811558730055, "learning_rate": 1.999839479999768e-06, "loss": 1.5637, "step": 13 }, { "epoch": 0.05223880597014925, "grad_norm": 17.76805068894177, "learning_rate": 1.999714637048838e-06, "loss": 1.3015, "step": 14 }, { "epoch": 0.055970149253731345, "grad_norm": 16.79626847324504, "learning_rate": 1.9995541323178804e-06, "loss": 1.6793, "step": 15 }, { "epoch": 0.05970149253731343, "grad_norm": 11.89468375626604, "learning_rate": 1.9993579715324135e-06, "loss": 1.3764, "step": 16 }, { "epoch": 0.06343283582089553, "grad_norm": 10.93288879427306, "learning_rate": 1.9991261616898766e-06, "loss": 1.3707, "step": 17 }, { "epoch": 0.06716417910447761, "grad_norm": 8.361062086748422, "learning_rate": 1.9988587110593807e-06, "loss": 1.6238, "step": 18 }, { "epoch": 0.0708955223880597, "grad_norm": 6.736084108094181, "learning_rate": 1.9985556291814147e-06, "loss": 1.2496, "step": 19 }, { "epoch": 0.07462686567164178, "grad_norm": 7.272816093776056, "learning_rate": 1.9982169268675023e-06, "loss": 1.5627, "step": 20 }, { "epoch": 0.07835820895522388, "grad_norm": 6.329740952982906, "learning_rate": 1.997842616199819e-06, "loss": 1.3453, "step": 21 }, { "epoch": 0.08208955223880597, "grad_norm": 4.520010885801565, "learning_rate": 1.99743271053076e-06, "loss": 1.5461, "step": 22 }, { "epoch": 0.08582089552238806, "grad_norm": 3.6042605956174354, "learning_rate": 1.9969872244824635e-06, "loss": 1.5243, "step": 23 }, { "epoch": 0.08955223880597014, "grad_norm": 6.07998924732389, "learning_rate": 1.99650617394629e-06, "loss": 1.1925, "step": 24 }, { "epoch": 0.09328358208955224, "grad_norm": 5.379400172850125, "learning_rate": 1.9959895760822544e-06, "loss": 1.3644, "step": 25 }, { "epoch": 0.09701492537313433, "grad_norm": 5.724766090503273, "learning_rate": 1.995437449318415e-06, "loss": 1.282, "step": 26 }, { "epoch": 0.10074626865671642, "grad_norm": 7.952440800639504, "learning_rate": 1.994849813350215e-06, "loss": 1.2719, "step": 27 }, { "epoch": 0.1044776119402985, "grad_norm": 5.5605547177462915, "learning_rate": 1.9942266891397812e-06, "loss": 1.4344, "step": 28 }, { "epoch": 0.10820895522388059, "grad_norm": 4.20729688038585, "learning_rate": 1.9935680989151754e-06, "loss": 1.4261, "step": 29 }, { "epoch": 0.11194029850746269, "grad_norm": 4.179661464504617, "learning_rate": 1.9928740661696007e-06, "loss": 1.7263, "step": 30 }, { "epoch": 0.11567164179104478, "grad_norm": 4.444811669926014, "learning_rate": 1.992144615660566e-06, "loss": 1.4733, "step": 31 }, { "epoch": 0.11940298507462686, "grad_norm": 4.028798590389647, "learning_rate": 1.9913797734089995e-06, "loss": 1.2016, "step": 32 }, { "epoch": 0.12313432835820895, "grad_norm": 3.667451061021437, "learning_rate": 1.990579566698323e-06, "loss": 1.3823, "step": 33 }, { "epoch": 0.12686567164179105, "grad_norm": 3.204493378117863, "learning_rate": 1.9897440240734786e-06, "loss": 1.1922, "step": 34 }, { "epoch": 0.13059701492537312, "grad_norm": 2.5157465668345225, "learning_rate": 1.9888731753399087e-06, "loss": 1.3169, "step": 35 }, { "epoch": 0.13432835820895522, "grad_norm": 4.013532823842619, "learning_rate": 1.9879670515624933e-06, "loss": 1.5473, "step": 36 }, { "epoch": 0.13805970149253732, "grad_norm": 4.443422336518452, "learning_rate": 1.9870256850644436e-06, "loss": 1.3413, "step": 37 }, { "epoch": 0.1417910447761194, "grad_norm": 3.8185223864854443, "learning_rate": 1.9860491094261476e-06, "loss": 1.3775, "step": 38 }, { "epoch": 0.1455223880597015, "grad_norm": 3.2374300805591356, "learning_rate": 1.9850373594839715e-06, "loss": 1.4237, "step": 39 }, { "epoch": 0.14925373134328357, "grad_norm": 3.4010570222238945, "learning_rate": 1.9839904713290183e-06, "loss": 1.3512, "step": 40 }, { "epoch": 0.15298507462686567, "grad_norm": 3.7069360522451036, "learning_rate": 1.9829084823058396e-06, "loss": 1.3539, "step": 41 }, { "epoch": 0.15671641791044777, "grad_norm": 4.189014809102456, "learning_rate": 1.9817914310111044e-06, "loss": 1.184, "step": 42 }, { "epoch": 0.16044776119402984, "grad_norm": 2.6617535657025764, "learning_rate": 1.980639357292221e-06, "loss": 0.9118, "step": 43 }, { "epoch": 0.16417910447761194, "grad_norm": 5.853386102660827, "learning_rate": 1.9794523022459164e-06, "loss": 1.2803, "step": 44 }, { "epoch": 0.16791044776119404, "grad_norm": 6.062445920092195, "learning_rate": 1.9782303082167703e-06, "loss": 1.1335, "step": 45 }, { "epoch": 0.17164179104477612, "grad_norm": 3.931954985578852, "learning_rate": 1.976973418795704e-06, "loss": 1.3316, "step": 46 }, { "epoch": 0.17537313432835822, "grad_norm": 4.463556977447332, "learning_rate": 1.9756816788184255e-06, "loss": 1.0166, "step": 47 }, { "epoch": 0.1791044776119403, "grad_norm": 4.622340592834071, "learning_rate": 1.974355134363832e-06, "loss": 1.3222, "step": 48 }, { "epoch": 0.1828358208955224, "grad_norm": 6.981137384760035, "learning_rate": 1.972993832752363e-06, "loss": 1.2864, "step": 49 }, { "epoch": 0.1865671641791045, "grad_norm": 7.538142483116736, "learning_rate": 1.9715978225443146e-06, "loss": 1.3298, "step": 50 }, { "epoch": 0.19029850746268656, "grad_norm": 5.378377013811346, "learning_rate": 1.970167153538106e-06, "loss": 1.6898, "step": 51 }, { "epoch": 0.19402985074626866, "grad_norm": 4.107675209526753, "learning_rate": 1.9687018767685044e-06, "loss": 1.2467, "step": 52 }, { "epoch": 0.19776119402985073, "grad_norm": 3.462570336890335, "learning_rate": 1.9672020445048035e-06, "loss": 1.2439, "step": 53 }, { "epoch": 0.20149253731343283, "grad_norm": 5.1132284991358565, "learning_rate": 1.9656677102489587e-06, "loss": 1.2553, "step": 54 }, { "epoch": 0.20522388059701493, "grad_norm": 6.340348167811004, "learning_rate": 1.964098928733679e-06, "loss": 1.1768, "step": 55 }, { "epoch": 0.208955223880597, "grad_norm": 3.9131654258908415, "learning_rate": 1.962495755920476e-06, "loss": 1.2787, "step": 56 }, { "epoch": 0.2126865671641791, "grad_norm": 4.822496966837534, "learning_rate": 1.9608582489976645e-06, "loss": 0.9751, "step": 57 }, { "epoch": 0.21641791044776118, "grad_norm": 6.830738771850188, "learning_rate": 1.959186466378326e-06, "loss": 1.1173, "step": 58 }, { "epoch": 0.22014925373134328, "grad_norm": 3.7173186685103716, "learning_rate": 1.9574804676982214e-06, "loss": 1.3968, "step": 59 }, { "epoch": 0.22388059701492538, "grad_norm": 2.826816507380959, "learning_rate": 1.955740313813667e-06, "loss": 1.2454, "step": 60 }, { "epoch": 0.22761194029850745, "grad_norm": 5.060694147656036, "learning_rate": 1.9539660667993617e-06, "loss": 1.2803, "step": 61 }, { "epoch": 0.23134328358208955, "grad_norm": 4.866896848059289, "learning_rate": 1.952157789946173e-06, "loss": 1.3675, "step": 62 }, { "epoch": 0.23507462686567165, "grad_norm": 3.6902361398869843, "learning_rate": 1.9503155477588792e-06, "loss": 1.3265, "step": 63 }, { "epoch": 0.23880597014925373, "grad_norm": 3.844263841899348, "learning_rate": 1.9484394059538696e-06, "loss": 1.1316, "step": 64 }, { "epoch": 0.24253731343283583, "grad_norm": 2.7586273990747094, "learning_rate": 1.9465294314567986e-06, "loss": 1.1227, "step": 65 }, { "epoch": 0.2462686567164179, "grad_norm": 4.486151113202917, "learning_rate": 1.9445856924001987e-06, "loss": 1.2286, "step": 66 }, { "epoch": 0.25, "grad_norm": 3.4213181164419453, "learning_rate": 1.9426082581210507e-06, "loss": 1.1066, "step": 67 }, { "epoch": 0.2537313432835821, "grad_norm": 3.2763985271498646, "learning_rate": 1.9405971991583107e-06, "loss": 1.1251, "step": 68 }, { "epoch": 0.2574626865671642, "grad_norm": 3.2345913669334014, "learning_rate": 1.9385525872503914e-06, "loss": 1.1556, "step": 69 }, { "epoch": 0.26119402985074625, "grad_norm": 3.106534312573296, "learning_rate": 1.9364744953326073e-06, "loss": 1.0577, "step": 70 }, { "epoch": 0.26492537313432835, "grad_norm": 2.9408023039448272, "learning_rate": 1.9343629975345684e-06, "loss": 0.9973, "step": 71 }, { "epoch": 0.26865671641791045, "grad_norm": 4.559495537184346, "learning_rate": 1.9322181691775386e-06, "loss": 1.2465, "step": 72 }, { "epoch": 0.27238805970149255, "grad_norm": 4.24529782366461, "learning_rate": 1.9300400867717483e-06, "loss": 1.0913, "step": 73 }, { "epoch": 0.27611940298507465, "grad_norm": 5.1030664391900595, "learning_rate": 1.9278288280136647e-06, "loss": 1.1773, "step": 74 }, { "epoch": 0.2798507462686567, "grad_norm": 2.266554727723586, "learning_rate": 1.9255844717832204e-06, "loss": 1.4612, "step": 75 }, { "epoch": 0.2835820895522388, "grad_norm": 3.4335407395713795, "learning_rate": 1.9233070981410005e-06, "loss": 0.9848, "step": 76 }, { "epoch": 0.2873134328358209, "grad_norm": 3.0840869155577066, "learning_rate": 1.9209967883253844e-06, "loss": 1.1614, "step": 77 }, { "epoch": 0.291044776119403, "grad_norm": 3.9174952308981648, "learning_rate": 1.9186536247496515e-06, "loss": 0.993, "step": 78 }, { "epoch": 0.2947761194029851, "grad_norm": 6.675359765928192, "learning_rate": 1.916277690999037e-06, "loss": 1.1993, "step": 79 }, { "epoch": 0.29850746268656714, "grad_norm": 3.3312539449544336, "learning_rate": 1.9138690718277538e-06, "loss": 1.1122, "step": 80 }, { "epoch": 0.30223880597014924, "grad_norm": 3.848892934397973, "learning_rate": 1.9114278531559673e-06, "loss": 1.2558, "step": 81 }, { "epoch": 0.30597014925373134, "grad_norm": 4.176454718168841, "learning_rate": 1.908954122066731e-06, "loss": 1.1981, "step": 82 }, { "epoch": 0.30970149253731344, "grad_norm": 2.73071312325315, "learning_rate": 1.9064479668028799e-06, "loss": 1.2421, "step": 83 }, { "epoch": 0.31343283582089554, "grad_norm": 3.195988999340381, "learning_rate": 1.903909476763883e-06, "loss": 1.1304, "step": 84 }, { "epoch": 0.31716417910447764, "grad_norm": 4.218603688955521, "learning_rate": 1.9013387425026548e-06, "loss": 1.1864, "step": 85 }, { "epoch": 0.3208955223880597, "grad_norm": 3.6984773161373514, "learning_rate": 1.8987358557223229e-06, "loss": 1.1586, "step": 86 }, { "epoch": 0.3246268656716418, "grad_norm": 3.92008728241735, "learning_rate": 1.8961009092729597e-06, "loss": 1.4377, "step": 87 }, { "epoch": 0.3283582089552239, "grad_norm": 3.891396254447255, "learning_rate": 1.8934339971482673e-06, "loss": 0.8258, "step": 88 }, { "epoch": 0.332089552238806, "grad_norm": 3.0403561652586593, "learning_rate": 1.8907352144822281e-06, "loss": 1.1502, "step": 89 }, { "epoch": 0.3358208955223881, "grad_norm": 4.226515814729617, "learning_rate": 1.8880046575457071e-06, "loss": 1.3202, "step": 90 }, { "epoch": 0.33955223880597013, "grad_norm": 3.1371524388050562, "learning_rate": 1.8852424237430213e-06, "loss": 1.0916, "step": 91 }, { "epoch": 0.34328358208955223, "grad_norm": 5.798409840680744, "learning_rate": 1.882448611608463e-06, "loss": 1.0295, "step": 92 }, { "epoch": 0.34701492537313433, "grad_norm": 3.20282503021626, "learning_rate": 1.8796233208027847e-06, "loss": 1.0562, "step": 93 }, { "epoch": 0.35074626865671643, "grad_norm": 4.8622829880049165, "learning_rate": 1.8767666521096466e-06, "loss": 1.3517, "step": 94 }, { "epoch": 0.35447761194029853, "grad_norm": 3.236061343689611, "learning_rate": 1.8738787074320176e-06, "loss": 1.3072, "step": 95 }, { "epoch": 0.3582089552238806, "grad_norm": 2.8290205291903834, "learning_rate": 1.8709595897885436e-06, "loss": 1.0689, "step": 96 }, { "epoch": 0.3619402985074627, "grad_norm": 9.248683748830997, "learning_rate": 1.8680094033098714e-06, "loss": 1.1408, "step": 97 }, { "epoch": 0.3656716417910448, "grad_norm": 3.351494222719439, "learning_rate": 1.865028253234933e-06, "loss": 1.0601, "step": 98 }, { "epoch": 0.3694029850746269, "grad_norm": 2.9558835746922982, "learning_rate": 1.8620162459071933e-06, "loss": 1.469, "step": 99 }, { "epoch": 0.373134328358209, "grad_norm": 4.188173168279777, "learning_rate": 1.8589734887708555e-06, "loss": 1.0811, "step": 100 }, { "epoch": 0.376865671641791, "grad_norm": 4.149900932370019, "learning_rate": 1.855900090367029e-06, "loss": 1.2524, "step": 101 }, { "epoch": 0.3805970149253731, "grad_norm": 3.4354588132333954, "learning_rate": 1.852796160329857e-06, "loss": 1.2036, "step": 102 }, { "epoch": 0.3843283582089552, "grad_norm": 7.008352254151579, "learning_rate": 1.8496618093826062e-06, "loss": 1.3326, "step": 103 }, { "epoch": 0.3880597014925373, "grad_norm": 4.57793539598067, "learning_rate": 1.8464971493337165e-06, "loss": 1.1313, "step": 104 }, { "epoch": 0.3917910447761194, "grad_norm": 5.023484484297041, "learning_rate": 1.843302293072813e-06, "loss": 1.1537, "step": 105 }, { "epoch": 0.39552238805970147, "grad_norm": 5.417491975498189, "learning_rate": 1.8400773545666786e-06, "loss": 1.1948, "step": 106 }, { "epoch": 0.39925373134328357, "grad_norm": 6.758289030774797, "learning_rate": 1.8368224488551895e-06, "loss": 1.4521, "step": 107 }, { "epoch": 0.40298507462686567, "grad_norm": 3.113115656591011, "learning_rate": 1.8335376920472096e-06, "loss": 1.3848, "step": 108 }, { "epoch": 0.40671641791044777, "grad_norm": 6.762254585306641, "learning_rate": 1.8302232013164516e-06, "loss": 1.157, "step": 109 }, { "epoch": 0.41044776119402987, "grad_norm": 4.658736688116275, "learning_rate": 1.8268790948972938e-06, "loss": 1.0968, "step": 110 }, { "epoch": 0.4141791044776119, "grad_norm": 4.3724969896587, "learning_rate": 1.8235054920805651e-06, "loss": 1.3121, "step": 111 }, { "epoch": 0.417910447761194, "grad_norm": 1.9047640764173237, "learning_rate": 1.8201025132092886e-06, "loss": 0.966, "step": 112 }, { "epoch": 0.4216417910447761, "grad_norm": 2.6109096330803645, "learning_rate": 1.8166702796743888e-06, "loss": 0.9965, "step": 113 }, { "epoch": 0.4253731343283582, "grad_norm": 2.236001574200741, "learning_rate": 1.813208913910361e-06, "loss": 1.2068, "step": 114 }, { "epoch": 0.4291044776119403, "grad_norm": 3.5252250250259904, "learning_rate": 1.8097185393909047e-06, "loss": 0.9945, "step": 115 }, { "epoch": 0.43283582089552236, "grad_norm": 2.72651289344666, "learning_rate": 1.8061992806245183e-06, "loss": 1.1221, "step": 116 }, { "epoch": 0.43656716417910446, "grad_norm": 2.7580533850806663, "learning_rate": 1.802651263150058e-06, "loss": 1.1106, "step": 117 }, { "epoch": 0.44029850746268656, "grad_norm": 3.36730794119517, "learning_rate": 1.7990746135322592e-06, "loss": 1.3169, "step": 118 }, { "epoch": 0.44402985074626866, "grad_norm": 3.602491306889906, "learning_rate": 1.7954694593572225e-06, "loss": 1.2271, "step": 119 }, { "epoch": 0.44776119402985076, "grad_norm": 2.995624556867228, "learning_rate": 1.7918359292278611e-06, "loss": 1.4585, "step": 120 }, { "epoch": 0.45149253731343286, "grad_norm": 2.4713376740401394, "learning_rate": 1.7881741527593148e-06, "loss": 1.0635, "step": 121 }, { "epoch": 0.4552238805970149, "grad_norm": 2.9859360943624558, "learning_rate": 1.7844842605743255e-06, "loss": 1.1158, "step": 122 }, { "epoch": 0.458955223880597, "grad_norm": 2.236880197413377, "learning_rate": 1.7807663842985776e-06, "loss": 1.0568, "step": 123 }, { "epoch": 0.4626865671641791, "grad_norm": 2.812929367683673, "learning_rate": 1.777020656556003e-06, "loss": 0.9711, "step": 124 }, { "epoch": 0.4664179104477612, "grad_norm": 2.615520237147118, "learning_rate": 1.77324721096405e-06, "loss": 1.2155, "step": 125 }, { "epoch": 0.4701492537313433, "grad_norm": 2.56945954741141, "learning_rate": 1.7694461821289171e-06, "loss": 1.2214, "step": 126 }, { "epoch": 0.47388059701492535, "grad_norm": 2.6524624585109255, "learning_rate": 1.7656177056407504e-06, "loss": 1.0783, "step": 127 }, { "epoch": 0.47761194029850745, "grad_norm": 3.900106255085836, "learning_rate": 1.7617619180688084e-06, "loss": 1.1345, "step": 128 }, { "epoch": 0.48134328358208955, "grad_norm": 3.6445674759996973, "learning_rate": 1.7578789569565889e-06, "loss": 1.1407, "step": 129 }, { "epoch": 0.48507462686567165, "grad_norm": 3.2321962413724834, "learning_rate": 1.7539689608169236e-06, "loss": 1.2281, "step": 130 }, { "epoch": 0.48880597014925375, "grad_norm": 4.609891513693221, "learning_rate": 1.7500320691270363e-06, "loss": 1.2394, "step": 131 }, { "epoch": 0.4925373134328358, "grad_norm": 6.929277355854441, "learning_rate": 1.7460684223235678e-06, "loss": 1.233, "step": 132 }, { "epoch": 0.4962686567164179, "grad_norm": 3.231892093569866, "learning_rate": 1.7420781617975663e-06, "loss": 0.9962, "step": 133 }, { "epoch": 0.5, "grad_norm": 2.5090780090395115, "learning_rate": 1.738061429889444e-06, "loss": 0.9036, "step": 134 }, { "epoch": 0.503731343283582, "grad_norm": 4.328856707412604, "learning_rate": 1.734018369883898e-06, "loss": 1.1895, "step": 135 }, { "epoch": 0.5074626865671642, "grad_norm": 2.5691655391373875, "learning_rate": 1.7299491260048019e-06, "loss": 1.326, "step": 136 }, { "epoch": 0.5111940298507462, "grad_norm": 1.732025147621955, "learning_rate": 1.7258538434100576e-06, "loss": 1.2479, "step": 137 }, { "epoch": 0.5149253731343284, "grad_norm": 2.20121934912806, "learning_rate": 1.7217326681864206e-06, "loss": 1.0356, "step": 138 }, { "epoch": 0.5186567164179104, "grad_norm": 1.9985655432331606, "learning_rate": 1.717585747344286e-06, "loss": 1.1547, "step": 139 }, { "epoch": 0.5223880597014925, "grad_norm": 1.8929729658584291, "learning_rate": 1.7134132288124464e-06, "loss": 1.1972, "step": 140 }, { "epoch": 0.5261194029850746, "grad_norm": 2.5705526162651284, "learning_rate": 1.7092152614328136e-06, "loss": 0.9647, "step": 141 }, { "epoch": 0.5298507462686567, "grad_norm": 2.3300073188215134, "learning_rate": 1.7049919949551099e-06, "loss": 1.4177, "step": 142 }, { "epoch": 0.5335820895522388, "grad_norm": 2.919579845785974, "learning_rate": 1.7007435800315261e-06, "loss": 1.0245, "step": 143 }, { "epoch": 0.5373134328358209, "grad_norm": 2.58628020939173, "learning_rate": 1.6964701682113474e-06, "loss": 1.1438, "step": 144 }, { "epoch": 0.5410447761194029, "grad_norm": 2.1810582155175906, "learning_rate": 1.6921719119355466e-06, "loss": 1.1709, "step": 145 }, { "epoch": 0.5447761194029851, "grad_norm": 2.0256539029853036, "learning_rate": 1.687848964531348e-06, "loss": 1.2567, "step": 146 }, { "epoch": 0.5485074626865671, "grad_norm": 2.6789651782329003, "learning_rate": 1.6835014802067556e-06, "loss": 1.2105, "step": 147 }, { "epoch": 0.5522388059701493, "grad_norm": 2.2475712729751813, "learning_rate": 1.6791296140450543e-06, "loss": 1.0036, "step": 148 }, { "epoch": 0.5559701492537313, "grad_norm": 3.081758388528468, "learning_rate": 1.6747335219992774e-06, "loss": 1.229, "step": 149 }, { "epoch": 0.5597014925373134, "grad_norm": 3.4435580918281903, "learning_rate": 1.6703133608866414e-06, "loss": 1.2375, "step": 150 }, { "epoch": 0.5634328358208955, "grad_norm": 3.6488645320162263, "learning_rate": 1.6658692883829546e-06, "loss": 1.2528, "step": 151 }, { "epoch": 0.5671641791044776, "grad_norm": 2.6147378121358535, "learning_rate": 1.6614014630169915e-06, "loss": 1.0683, "step": 152 }, { "epoch": 0.5708955223880597, "grad_norm": 3.4412924263138502, "learning_rate": 1.6569100441648372e-06, "loss": 1.2073, "step": 153 }, { "epoch": 0.5746268656716418, "grad_norm": 3.8345977623754117, "learning_rate": 1.6523951920442032e-06, "loss": 1.1582, "step": 154 }, { "epoch": 0.5783582089552238, "grad_norm": 3.049354065878489, "learning_rate": 1.6478570677087116e-06, "loss": 1.26, "step": 155 }, { "epoch": 0.582089552238806, "grad_norm": 2.667157342873667, "learning_rate": 1.6432958330421497e-06, "loss": 1.1972, "step": 156 }, { "epoch": 0.585820895522388, "grad_norm": 2.3988481838806517, "learning_rate": 1.6387116507526955e-06, "loss": 1.0296, "step": 157 }, { "epoch": 0.5895522388059702, "grad_norm": 3.245331214881116, "learning_rate": 1.6341046843671142e-06, "loss": 1.0837, "step": 158 }, { "epoch": 0.5932835820895522, "grad_norm": 2.740295402410237, "learning_rate": 1.629475098224924e-06, "loss": 1.0756, "step": 159 }, { "epoch": 0.5970149253731343, "grad_norm": 3.8088940588573625, "learning_rate": 1.6248230574725338e-06, "loss": 1.2506, "step": 160 }, { "epoch": 0.6007462686567164, "grad_norm": 5.166361637828825, "learning_rate": 1.6201487280573533e-06, "loss": 0.9793, "step": 161 }, { "epoch": 0.6044776119402985, "grad_norm": 3.2415888531812485, "learning_rate": 1.6154522767218723e-06, "loss": 1.3401, "step": 162 }, { "epoch": 0.6082089552238806, "grad_norm": 3.305335197143126, "learning_rate": 1.6107338709977118e-06, "loss": 1.4258, "step": 163 }, { "epoch": 0.6119402985074627, "grad_norm": 3.074545865145304, "learning_rate": 1.6059936791996497e-06, "loss": 1.192, "step": 164 }, { "epoch": 0.6156716417910447, "grad_norm": 2.40059366672424, "learning_rate": 1.601231870419616e-06, "loss": 0.984, "step": 165 }, { "epoch": 0.6194029850746269, "grad_norm": 2.9844452713407197, "learning_rate": 1.596448614520661e-06, "loss": 1.1051, "step": 166 }, { "epoch": 0.6231343283582089, "grad_norm": 3.241493731745323, "learning_rate": 1.5916440821308947e-06, "loss": 1.1032, "step": 167 }, { "epoch": 0.6268656716417911, "grad_norm": 4.002012008083462, "learning_rate": 1.586818444637402e-06, "loss": 1.1281, "step": 168 }, { "epoch": 0.6305970149253731, "grad_norm": 2.891081597812952, "learning_rate": 1.5819718741801282e-06, "loss": 1.0984, "step": 169 }, { "epoch": 0.6343283582089553, "grad_norm": 2.510097661559307, "learning_rate": 1.577104543645738e-06, "loss": 0.9818, "step": 170 }, { "epoch": 0.6380597014925373, "grad_norm": 3.9519151526817784, "learning_rate": 1.5722166266614494e-06, "loss": 1.403, "step": 171 }, { "epoch": 0.6417910447761194, "grad_norm": 2.889629899144798, "learning_rate": 1.5673082975888386e-06, "loss": 1.4251, "step": 172 }, { "epoch": 0.6455223880597015, "grad_norm": 3.2843979337315203, "learning_rate": 1.5623797315176217e-06, "loss": 1.2102, "step": 173 }, { "epoch": 0.6492537313432836, "grad_norm": 3.851544142794571, "learning_rate": 1.5574311042594077e-06, "loss": 1.3174, "step": 174 }, { "epoch": 0.6529850746268657, "grad_norm": 3.0632504419966224, "learning_rate": 1.552462592341428e-06, "loss": 1.2578, "step": 175 }, { "epoch": 0.6567164179104478, "grad_norm": 2.9143363462552414, "learning_rate": 1.547474373000238e-06, "loss": 1.1117, "step": 176 }, { "epoch": 0.6604477611940298, "grad_norm": 3.33708665616015, "learning_rate": 1.5424666241753963e-06, "loss": 1.3296, "step": 177 }, { "epoch": 0.664179104477612, "grad_norm": 2.5174595420642767, "learning_rate": 1.5374395245031157e-06, "loss": 1.2501, "step": 178 }, { "epoch": 0.667910447761194, "grad_norm": 4.722876645619478, "learning_rate": 1.5323932533098924e-06, "loss": 0.8606, "step": 179 }, { "epoch": 0.6716417910447762, "grad_norm": 3.23675727446907, "learning_rate": 1.527327990606108e-06, "loss": 1.1848, "step": 180 }, { "epoch": 0.6753731343283582, "grad_norm": 3.2255476770575906, "learning_rate": 1.522243917079608e-06, "loss": 1.1501, "step": 181 }, { "epoch": 0.6791044776119403, "grad_norm": 2.660659180388112, "learning_rate": 1.5171412140892574e-06, "loss": 1.1792, "step": 182 }, { "epoch": 0.6828358208955224, "grad_norm": 2.5742735359754656, "learning_rate": 1.512020063658471e-06, "loss": 1.0524, "step": 183 }, { "epoch": 0.6865671641791045, "grad_norm": 2.7222921596819805, "learning_rate": 1.5068806484687188e-06, "loss": 0.9408, "step": 184 }, { "epoch": 0.6902985074626866, "grad_norm": 2.854241224431344, "learning_rate": 1.5017231518530115e-06, "loss": 1.1946, "step": 185 }, { "epoch": 0.6940298507462687, "grad_norm": 2.829758100829405, "learning_rate": 1.4965477577893596e-06, "loss": 1.0996, "step": 186 }, { "epoch": 0.6977611940298507, "grad_norm": 2.7341811827310907, "learning_rate": 1.4913546508942104e-06, "loss": 1.3112, "step": 187 }, { "epoch": 0.7014925373134329, "grad_norm": 2.727595416423421, "learning_rate": 1.486144016415862e-06, "loss": 0.8641, "step": 188 }, { "epoch": 0.7052238805970149, "grad_norm": 2.6779321037785957, "learning_rate": 1.4809160402278572e-06, "loss": 1.0673, "step": 189 }, { "epoch": 0.7089552238805971, "grad_norm": 2.105636468467112, "learning_rate": 1.4756709088223507e-06, "loss": 1.0804, "step": 190 }, { "epoch": 0.7126865671641791, "grad_norm": 2.308917007984876, "learning_rate": 1.470408809303457e-06, "loss": 1.0657, "step": 191 }, { "epoch": 0.7164179104477612, "grad_norm": 2.272233759263439, "learning_rate": 1.4651299293805772e-06, "loss": 0.97, "step": 192 }, { "epoch": 0.7201492537313433, "grad_norm": 2.38194076941112, "learning_rate": 1.459834457361702e-06, "loss": 1.1996, "step": 193 }, { "epoch": 0.7238805970149254, "grad_norm": 2.609236963602244, "learning_rate": 1.4545225821466949e-06, "loss": 1.4137, "step": 194 }, { "epoch": 0.7276119402985075, "grad_norm": 2.1583872582681303, "learning_rate": 1.449194493220553e-06, "loss": 1.21, "step": 195 }, { "epoch": 0.7313432835820896, "grad_norm": 2.0168668065761004, "learning_rate": 1.443850380646649e-06, "loss": 1.2648, "step": 196 }, { "epoch": 0.7350746268656716, "grad_norm": 2.8244668704260434, "learning_rate": 1.4384904350599496e-06, "loss": 1.158, "step": 197 }, { "epoch": 0.7388059701492538, "grad_norm": 2.154427501128158, "learning_rate": 1.433114847660217e-06, "loss": 1.1111, "step": 198 }, { "epoch": 0.7425373134328358, "grad_norm": 1.905058417754889, "learning_rate": 1.427723810205187e-06, "loss": 0.969, "step": 199 }, { "epoch": 0.746268656716418, "grad_norm": 2.739735762190122, "learning_rate": 1.4223175150037295e-06, "loss": 1.2142, "step": 200 }, { "epoch": 0.75, "grad_norm": 3.4650290962226777, "learning_rate": 1.4168961549089872e-06, "loss": 1.1373, "step": 201 }, { "epoch": 0.753731343283582, "grad_norm": 2.5869478423809786, "learning_rate": 1.4114599233114986e-06, "loss": 1.3506, "step": 202 }, { "epoch": 0.7574626865671642, "grad_norm": 3.1980963820842483, "learning_rate": 1.4060090141322966e-06, "loss": 1.0384, "step": 203 }, { "epoch": 0.7611940298507462, "grad_norm": 2.5362305958432443, "learning_rate": 1.4005436218159925e-06, "loss": 1.1983, "step": 204 }, { "epoch": 0.7649253731343284, "grad_norm": 1.7669955812420282, "learning_rate": 1.3950639413238393e-06, "loss": 1.1922, "step": 205 }, { "epoch": 0.7686567164179104, "grad_norm": 3.236818206550707, "learning_rate": 1.3895701681267782e-06, "loss": 1.1532, "step": 206 }, { "epoch": 0.7723880597014925, "grad_norm": 3.1410703998345917, "learning_rate": 1.384062498198464e-06, "loss": 1.2707, "step": 207 }, { "epoch": 0.7761194029850746, "grad_norm": 2.947726795909021, "learning_rate": 1.3785411280082746e-06, "loss": 1.1552, "step": 208 }, { "epoch": 0.7798507462686567, "grad_norm": 4.158405889593859, "learning_rate": 1.373006254514304e-06, "loss": 1.1323, "step": 209 }, { "epoch": 0.7835820895522388, "grad_norm": 3.6596410080845483, "learning_rate": 1.3674580751563357e-06, "loss": 1.1021, "step": 210 }, { "epoch": 0.7873134328358209, "grad_norm": 3.4837568397902063, "learning_rate": 1.361896787848798e-06, "loss": 1.1507, "step": 211 }, { "epoch": 0.7910447761194029, "grad_norm": 5.190434900700764, "learning_rate": 1.3563225909737074e-06, "loss": 1.1307, "step": 212 }, { "epoch": 0.7947761194029851, "grad_norm": 3.193649918972427, "learning_rate": 1.3507356833735885e-06, "loss": 1.1674, "step": 213 }, { "epoch": 0.7985074626865671, "grad_norm": 3.64309739990448, "learning_rate": 1.3451362643443831e-06, "loss": 1.1026, "step": 214 }, { "epoch": 0.8022388059701493, "grad_norm": 4.480821519285648, "learning_rate": 1.3395245336283396e-06, "loss": 1.1305, "step": 215 }, { "epoch": 0.8059701492537313, "grad_norm": 2.485764813025922, "learning_rate": 1.333900691406889e-06, "loss": 1.0909, "step": 216 }, { "epoch": 0.8097014925373134, "grad_norm": 2.8276534151044417, "learning_rate": 1.3282649382935028e-06, "loss": 1.2906, "step": 217 }, { "epoch": 0.8134328358208955, "grad_norm": 2.661022282944918, "learning_rate": 1.322617475326538e-06, "loss": 1.0923, "step": 218 }, { "epoch": 0.8171641791044776, "grad_norm": 2.6551254805947053, "learning_rate": 1.316958503962065e-06, "loss": 1.1648, "step": 219 }, { "epoch": 0.8208955223880597, "grad_norm": 2.3353396983390486, "learning_rate": 1.3112882260666805e-06, "loss": 1.2479, "step": 220 }, { "epoch": 0.8246268656716418, "grad_norm": 1.8853847357875915, "learning_rate": 1.3056068439103082e-06, "loss": 0.9367, "step": 221 }, { "epoch": 0.8283582089552238, "grad_norm": 1.7789270126386558, "learning_rate": 1.299914560158982e-06, "loss": 0.9866, "step": 222 }, { "epoch": 0.832089552238806, "grad_norm": 4.437767240695352, "learning_rate": 1.2942115778676175e-06, "loss": 1.0143, "step": 223 }, { "epoch": 0.835820895522388, "grad_norm": 2.643730633752304, "learning_rate": 1.2884981004727675e-06, "loss": 1.1737, "step": 224 }, { "epoch": 0.8395522388059702, "grad_norm": 4.113252275049106, "learning_rate": 1.2827743317853666e-06, "loss": 1.278, "step": 225 }, { "epoch": 0.8432835820895522, "grad_norm": 4.473452632975873, "learning_rate": 1.2770404759834592e-06, "loss": 1.2337, "step": 226 }, { "epoch": 0.8470149253731343, "grad_norm": 3.3121627389468227, "learning_rate": 1.2712967376049176e-06, "loss": 0.9808, "step": 227 }, { "epoch": 0.8507462686567164, "grad_norm": 2.765455947896225, "learning_rate": 1.2655433215401437e-06, "loss": 0.809, "step": 228 }, { "epoch": 0.8544776119402985, "grad_norm": 5.806520585625066, "learning_rate": 1.2597804330247629e-06, "loss": 1.3475, "step": 229 }, { "epoch": 0.8582089552238806, "grad_norm": 4.3730223037366365, "learning_rate": 1.2540082776323006e-06, "loss": 1.0836, "step": 230 }, { "epoch": 0.8619402985074627, "grad_norm": 2.5075803170353987, "learning_rate": 1.2482270612668507e-06, "loss": 1.1071, "step": 231 }, { "epoch": 0.8656716417910447, "grad_norm": 3.845367887472252, "learning_rate": 1.242436990155728e-06, "loss": 1.249, "step": 232 }, { "epoch": 0.8694029850746269, "grad_norm": 3.2664015113912237, "learning_rate": 1.2366382708421154e-06, "loss": 1.1988, "step": 233 }, { "epoch": 0.8731343283582089, "grad_norm": 3.9034686201589586, "learning_rate": 1.2308311101776932e-06, "loss": 1.2718, "step": 234 }, { "epoch": 0.8768656716417911, "grad_norm": 2.1523180105846844, "learning_rate": 1.2250157153152609e-06, "loss": 1.1845, "step": 235 }, { "epoch": 0.8805970149253731, "grad_norm": 2.8559179892421116, "learning_rate": 1.2191922937013488e-06, "loss": 1.2277, "step": 236 }, { "epoch": 0.8843283582089553, "grad_norm": 2.4964069518984697, "learning_rate": 1.2133610530688167e-06, "loss": 1.1304, "step": 237 }, { "epoch": 0.8880597014925373, "grad_norm": 1.6510558048415136, "learning_rate": 1.2075222014294447e-06, "loss": 1.0716, "step": 238 }, { "epoch": 0.8917910447761194, "grad_norm": 4.138846396996276, "learning_rate": 1.2016759470665109e-06, "loss": 1.1715, "step": 239 }, { "epoch": 0.8955223880597015, "grad_norm": 2.591000009602829, "learning_rate": 1.1958224985273645e-06, "loss": 1.2082, "step": 240 }, { "epoch": 0.8992537313432836, "grad_norm": 1.6656445235525534, "learning_rate": 1.1899620646159853e-06, "loss": 1.057, "step": 241 }, { "epoch": 0.9029850746268657, "grad_norm": 3.4344302308874486, "learning_rate": 1.1840948543855334e-06, "loss": 0.9381, "step": 242 }, { "epoch": 0.9067164179104478, "grad_norm": 2.5448689987449384, "learning_rate": 1.1782210771308947e-06, "loss": 1.1778, "step": 243 }, { "epoch": 0.9104477611940298, "grad_norm": 2.2000538592782433, "learning_rate": 1.1723409423812134e-06, "loss": 1.1269, "step": 244 }, { "epoch": 0.914179104477612, "grad_norm": 1.6886830906655088, "learning_rate": 1.1664546598924184e-06, "loss": 1.1615, "step": 245 }, { "epoch": 0.917910447761194, "grad_norm": 2.2494221352588886, "learning_rate": 1.1605624396397398e-06, "loss": 1.4029, "step": 246 }, { "epoch": 0.9216417910447762, "grad_norm": 2.012712883705275, "learning_rate": 1.1546644918102196e-06, "loss": 1.1799, "step": 247 }, { "epoch": 0.9253731343283582, "grad_norm": 2.3518817671490586, "learning_rate": 1.1487610267952142e-06, "loss": 1.1566, "step": 248 }, { "epoch": 0.9291044776119403, "grad_norm": 2.0646756101710593, "learning_rate": 1.1428522551828882e-06, "loss": 1.2883, "step": 249 }, { "epoch": 0.9328358208955224, "grad_norm": 1.812081401132651, "learning_rate": 1.1369383877507034e-06, "loss": 1.2653, "step": 250 }, { "epoch": 0.9365671641791045, "grad_norm": 2.242364078092567, "learning_rate": 1.131019635457899e-06, "loss": 1.1829, "step": 251 }, { "epoch": 0.9402985074626866, "grad_norm": 2.0182289267611258, "learning_rate": 1.1250962094379668e-06, "loss": 0.9778, "step": 252 }, { "epoch": 0.9440298507462687, "grad_norm": 2.4797725291235593, "learning_rate": 1.1191683209911201e-06, "loss": 1.0714, "step": 253 }, { "epoch": 0.9477611940298507, "grad_norm": 2.519766746322205, "learning_rate": 1.1132361815767552e-06, "loss": 1.2406, "step": 254 }, { "epoch": 0.9514925373134329, "grad_norm": 2.063185346641498, "learning_rate": 1.1073000028059095e-06, "loss": 0.987, "step": 255 }, { "epoch": 0.9552238805970149, "grad_norm": 1.5333052526002764, "learning_rate": 1.1013599964337106e-06, "loss": 0.8951, "step": 256 }, { "epoch": 0.9589552238805971, "grad_norm": 4.771307269529906, "learning_rate": 1.095416374351826e-06, "loss": 1.2666, "step": 257 }, { "epoch": 0.9626865671641791, "grad_norm": 2.717541175438304, "learning_rate": 1.0894693485809014e-06, "loss": 1.1109, "step": 258 }, { "epoch": 0.9664179104477612, "grad_norm": 2.756698383274168, "learning_rate": 1.0835191312629992e-06, "loss": 1.129, "step": 259 }, { "epoch": 0.9701492537313433, "grad_norm": 1.7033771070854546, "learning_rate": 1.0775659346540303e-06, "loss": 0.9603, "step": 260 }, { "epoch": 0.9738805970149254, "grad_norm": 2.5967915673434034, "learning_rate": 1.0716099711161832e-06, "loss": 1.1943, "step": 261 }, { "epoch": 0.9776119402985075, "grad_norm": 2.145466598370863, "learning_rate": 1.0656514531103483e-06, "loss": 0.8841, "step": 262 }, { "epoch": 0.9813432835820896, "grad_norm": 2.1800126434020477, "learning_rate": 1.0596905931885373e-06, "loss": 0.9661, "step": 263 }, { "epoch": 0.9850746268656716, "grad_norm": 2.8261681835210544, "learning_rate": 1.0537276039863047e-06, "loss": 1.1867, "step": 264 }, { "epoch": 0.9888059701492538, "grad_norm": 3.2978247537112586, "learning_rate": 1.04776269821516e-06, "loss": 1.2103, "step": 265 }, { "epoch": 0.9925373134328358, "grad_norm": 2.8052176780437064, "learning_rate": 1.0417960886549798e-06, "loss": 1.3141, "step": 266 }, { "epoch": 0.996268656716418, "grad_norm": 2.5933472539580635, "learning_rate": 1.035827988146418e-06, "loss": 1.078, "step": 267 }, { "epoch": 1.0, "grad_norm": 2.8520830177001395, "learning_rate": 1.0298586095833151e-06, "loss": 1.3273, "step": 268 } ], "logging_steps": 1, "max_steps": 536, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 268, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 50952048476160.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }