{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.98159509202454, "eval_steps": 500, "global_step": 366, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0081799591002045, "grad_norm": null, "learning_rate": 0.0, "loss": 1.978, "step": 1 }, { "epoch": 0.016359918200409, "grad_norm": null, "learning_rate": 0.0, "loss": 2.0417, "step": 2 }, { "epoch": 0.024539877300613498, "grad_norm": 3.960280656814575, "learning_rate": 0.0, "loss": 2.4526, "step": 3 }, { "epoch": 0.032719836400818, "grad_norm": 4.871670722961426, "learning_rate": 2.702702702702703e-06, "loss": 2.3708, "step": 4 }, { "epoch": 0.0408997955010225, "grad_norm": null, "learning_rate": 5.405405405405406e-06, "loss": 2.7402, "step": 5 }, { "epoch": 0.049079754601226995, "grad_norm": 2.961730718612671, "learning_rate": 5.405405405405406e-06, "loss": 2.6255, "step": 6 }, { "epoch": 0.05725971370143149, "grad_norm": 2.4267706871032715, "learning_rate": 8.108108108108109e-06, "loss": 2.1672, "step": 7 }, { "epoch": 0.065439672801636, "grad_norm": 4.815489768981934, "learning_rate": 1.0810810810810812e-05, "loss": 2.6853, "step": 8 }, { "epoch": 0.0736196319018405, "grad_norm": 5.672784805297852, "learning_rate": 1.3513513513513515e-05, "loss": 2.5256, "step": 9 }, { "epoch": 0.081799591002045, "grad_norm": 2.97552490234375, "learning_rate": 1.6216216216216218e-05, "loss": 2.1108, "step": 10 }, { "epoch": 0.08997955010224949, "grad_norm": 2.305542469024658, "learning_rate": 1.891891891891892e-05, "loss": 2.2621, "step": 11 }, { "epoch": 0.09815950920245399, "grad_norm": 2.330063581466675, "learning_rate": 2.1621621621621624e-05, "loss": 2.2973, "step": 12 }, { "epoch": 0.10633946830265849, "grad_norm": 2.395848274230957, "learning_rate": 2.4324324324324327e-05, "loss": 2.2853, "step": 13 }, { "epoch": 0.11451942740286299, "grad_norm": 3.5902745723724365, "learning_rate": 2.702702702702703e-05, 
"loss": 2.3628, "step": 14 }, { "epoch": 0.12269938650306748, "grad_norm": 3.785466194152832, "learning_rate": 2.9729729729729733e-05, "loss": 1.6333, "step": 15 }, { "epoch": 0.130879345603272, "grad_norm": 2.845073699951172, "learning_rate": 3.2432432432432436e-05, "loss": 1.6592, "step": 16 }, { "epoch": 0.1390593047034765, "grad_norm": 2.9714362621307373, "learning_rate": 3.513513513513514e-05, "loss": 2.4303, "step": 17 }, { "epoch": 0.147239263803681, "grad_norm": 3.2374515533447266, "learning_rate": 3.783783783783784e-05, "loss": 1.6276, "step": 18 }, { "epoch": 0.1554192229038855, "grad_norm": 2.4501020908355713, "learning_rate": 4.0540540540540545e-05, "loss": 2.1786, "step": 19 }, { "epoch": 0.16359918200409, "grad_norm": 1.689795970916748, "learning_rate": 4.324324324324325e-05, "loss": 1.7335, "step": 20 }, { "epoch": 0.17177914110429449, "grad_norm": 1.5767645835876465, "learning_rate": 4.594594594594595e-05, "loss": 1.9214, "step": 21 }, { "epoch": 0.17995910020449898, "grad_norm": 2.6853578090667725, "learning_rate": 4.8648648648648654e-05, "loss": 1.9089, "step": 22 }, { "epoch": 0.18813905930470348, "grad_norm": 3.5681397914886475, "learning_rate": 5.135135135135135e-05, "loss": 2.092, "step": 23 }, { "epoch": 0.19631901840490798, "grad_norm": 3.208242416381836, "learning_rate": 5.405405405405406e-05, "loss": 1.9184, "step": 24 }, { "epoch": 0.20449897750511248, "grad_norm": 3.677274227142334, "learning_rate": 5.6756756756756757e-05, "loss": 1.6762, "step": 25 }, { "epoch": 0.21267893660531698, "grad_norm": 1.9500216245651245, "learning_rate": 5.9459459459459466e-05, "loss": 1.8447, "step": 26 }, { "epoch": 0.22085889570552147, "grad_norm": 2.72743821144104, "learning_rate": 6.216216216216216e-05, "loss": 1.659, "step": 27 }, { "epoch": 0.22903885480572597, "grad_norm": 1.4266787767410278, "learning_rate": 6.486486486486487e-05, "loss": 1.5518, "step": 28 }, { "epoch": 0.23721881390593047, "grad_norm": 1.8338373899459839, "learning_rate": 
6.756756756756757e-05, "loss": 1.7725, "step": 29 }, { "epoch": 0.24539877300613497, "grad_norm": 2.702836751937866, "learning_rate": 7.027027027027028e-05, "loss": 1.3214, "step": 30 }, { "epoch": 0.25357873210633947, "grad_norm": 3.3664143085479736, "learning_rate": 7.297297297297297e-05, "loss": 1.5599, "step": 31 }, { "epoch": 0.261758691206544, "grad_norm": 1.7983371019363403, "learning_rate": 7.567567567567568e-05, "loss": 1.4007, "step": 32 }, { "epoch": 0.26993865030674846, "grad_norm": 1.4321403503417969, "learning_rate": 7.837837837837838e-05, "loss": 1.4209, "step": 33 }, { "epoch": 0.278118609406953, "grad_norm": 1.7886905670166016, "learning_rate": 8.108108108108109e-05, "loss": 1.1963, "step": 34 }, { "epoch": 0.28629856850715746, "grad_norm": 2.0502827167510986, "learning_rate": 8.378378378378379e-05, "loss": 1.6071, "step": 35 }, { "epoch": 0.294478527607362, "grad_norm": 2.351100206375122, "learning_rate": 8.64864864864865e-05, "loss": 1.1902, "step": 36 }, { "epoch": 0.30265848670756645, "grad_norm": 3.3446481227874756, "learning_rate": 8.918918918918919e-05, "loss": 1.1838, "step": 37 }, { "epoch": 0.310838445807771, "grad_norm": 1.3906322717666626, "learning_rate": 9.18918918918919e-05, "loss": 0.9948, "step": 38 }, { "epoch": 0.31901840490797545, "grad_norm": 1.9603602886199951, "learning_rate": 9.45945945945946e-05, "loss": 1.0997, "step": 39 }, { "epoch": 0.32719836400818, "grad_norm": 1.3630380630493164, "learning_rate": 9.729729729729731e-05, "loss": 0.6781, "step": 40 }, { "epoch": 0.33537832310838445, "grad_norm": 2.0062906742095947, "learning_rate": 0.0001, "loss": 1.4761, "step": 41 }, { "epoch": 0.34355828220858897, "grad_norm": 1.2718311548233032, "learning_rate": 9.99977204734326e-05, "loss": 1.0233, "step": 42 }, { "epoch": 0.35173824130879344, "grad_norm": 1.5002365112304688, "learning_rate": 9.999088210158001e-05, "loss": 0.8199, "step": 43 }, { "epoch": 0.35991820040899797, "grad_norm": 1.430582880973816, "learning_rate": 
9.997948550797227e-05, "loss": 1.1087, "step": 44 }, { "epoch": 0.36809815950920244, "grad_norm": 2.0703279972076416, "learning_rate": 9.996353173176289e-05, "loss": 0.8149, "step": 45 }, { "epoch": 0.37627811860940696, "grad_norm": 1.5086464881896973, "learning_rate": 9.994302222763414e-05, "loss": 0.9549, "step": 46 }, { "epoch": 0.38445807770961143, "grad_norm": 1.5020016431808472, "learning_rate": 9.991795886566441e-05, "loss": 0.9542, "step": 47 }, { "epoch": 0.39263803680981596, "grad_norm": 1.6791132688522339, "learning_rate": 9.988834393115767e-05, "loss": 0.8864, "step": 48 }, { "epoch": 0.40081799591002043, "grad_norm": 2.7517454624176025, "learning_rate": 9.98541801244351e-05, "loss": 1.0826, "step": 49 }, { "epoch": 0.40899795501022496, "grad_norm": 1.6971582174301147, "learning_rate": 9.981547056058893e-05, "loss": 0.9114, "step": 50 }, { "epoch": 0.4171779141104294, "grad_norm": 1.4928799867630005, "learning_rate": 9.977221876919833e-05, "loss": 0.9755, "step": 51 }, { "epoch": 0.42535787321063395, "grad_norm": 1.1075421571731567, "learning_rate": 9.972442869400759e-05, "loss": 0.741, "step": 52 }, { "epoch": 0.4335378323108384, "grad_norm": 1.3488271236419678, "learning_rate": 9.967210469256656e-05, "loss": 1.0441, "step": 53 }, { "epoch": 0.44171779141104295, "grad_norm": 3.3010079860687256, "learning_rate": 9.961525153583327e-05, "loss": 1.2885, "step": 54 }, { "epoch": 0.4498977505112474, "grad_norm": 1.24274742603302, "learning_rate": 9.9553874407739e-05, "loss": 0.8107, "step": 55 }, { "epoch": 0.45807770961145194, "grad_norm": 1.1659082174301147, "learning_rate": 9.948797890471551e-05, "loss": 0.6129, "step": 56 }, { "epoch": 0.4662576687116564, "grad_norm": 1.5525306463241577, "learning_rate": 9.941757103518478e-05, "loss": 0.9262, "step": 57 }, { "epoch": 0.47443762781186094, "grad_norm": 1.3923064470291138, "learning_rate": 9.93426572190112e-05, "loss": 0.7552, "step": 58 }, { "epoch": 0.48261758691206547, "grad_norm": 1.166669487953186, 
"learning_rate": 9.926324428691611e-05, "loss": 0.6346, "step": 59 }, { "epoch": 0.49079754601226994, "grad_norm": 2.103994131088257, "learning_rate": 9.917933947985507e-05, "loss": 0.8199, "step": 60 }, { "epoch": 0.49897750511247446, "grad_norm": 1.633812665939331, "learning_rate": 9.909095044835754e-05, "loss": 0.7485, "step": 61 }, { "epoch": 0.5071574642126789, "grad_norm": 1.4533647298812866, "learning_rate": 9.899808525182935e-05, "loss": 1.1649, "step": 62 }, { "epoch": 0.5153374233128835, "grad_norm": 1.6267237663269043, "learning_rate": 9.890075235781779e-05, "loss": 1.1159, "step": 63 }, { "epoch": 0.523517382413088, "grad_norm": 1.2796165943145752, "learning_rate": 9.879896064123961e-05, "loss": 0.9613, "step": 64 }, { "epoch": 0.5316973415132924, "grad_norm": 1.3240866661071777, "learning_rate": 9.869271938357167e-05, "loss": 1.047, "step": 65 }, { "epoch": 0.5398773006134969, "grad_norm": 1.190612554550171, "learning_rate": 9.858203827200476e-05, "loss": 1.1846, "step": 66 }, { "epoch": 0.5480572597137015, "grad_norm": 1.1655223369598389, "learning_rate": 9.846692739856024e-05, "loss": 0.9566, "step": 67 }, { "epoch": 0.556237218813906, "grad_norm": 1.2617253065109253, "learning_rate": 9.834739725916988e-05, "loss": 1.1108, "step": 68 }, { "epoch": 0.5644171779141104, "grad_norm": 1.3576512336730957, "learning_rate": 9.822345875271883e-05, "loss": 1.1265, "step": 69 }, { "epoch": 0.5725971370143149, "grad_norm": 1.4342156648635864, "learning_rate": 9.809512318005181e-05, "loss": 0.7757, "step": 70 }, { "epoch": 0.5807770961145194, "grad_norm": 1.0733706951141357, "learning_rate": 9.796240224294271e-05, "loss": 0.9006, "step": 71 }, { "epoch": 0.588957055214724, "grad_norm": 1.323440432548523, "learning_rate": 9.782530804302763e-05, "loss": 0.9322, "step": 72 }, { "epoch": 0.5971370143149284, "grad_norm": 1.2899342775344849, "learning_rate": 9.768385308070138e-05, "loss": 0.8403, "step": 73 }, { "epoch": 0.6053169734151329, "grad_norm": 
1.2755167484283447, "learning_rate": 9.753805025397779e-05, "loss": 0.8397, "step": 74 }, { "epoch": 0.6134969325153374, "grad_norm": 1.265972375869751, "learning_rate": 9.738791285731352e-05, "loss": 0.8143, "step": 75 }, { "epoch": 0.621676891615542, "grad_norm": 1.1493557691574097, "learning_rate": 9.723345458039594e-05, "loss": 1.059, "step": 76 }, { "epoch": 0.6298568507157464, "grad_norm": 1.1361910104751587, "learning_rate": 9.707468950689491e-05, "loss": 0.9112, "step": 77 }, { "epoch": 0.6380368098159509, "grad_norm": 1.4090393781661987, "learning_rate": 9.691163211317853e-05, "loss": 0.7847, "step": 78 }, { "epoch": 0.6462167689161554, "grad_norm": 1.300688624382019, "learning_rate": 9.674429726699323e-05, "loss": 0.9806, "step": 79 }, { "epoch": 0.65439672801636, "grad_norm": 1.0143762826919556, "learning_rate": 9.657270022610813e-05, "loss": 0.6648, "step": 80 }, { "epoch": 0.6625766871165644, "grad_norm": 1.2058460712432861, "learning_rate": 9.63968566369238e-05, "loss": 0.9702, "step": 81 }, { "epoch": 0.6707566462167689, "grad_norm": 1.0876555442810059, "learning_rate": 9.62167825330455e-05, "loss": 0.7861, "step": 82 }, { "epoch": 0.6789366053169734, "grad_norm": 1.502429723739624, "learning_rate": 9.603249433382144e-05, "loss": 1.1252, "step": 83 }, { "epoch": 0.6871165644171779, "grad_norm": 4.1136860847473145, "learning_rate": 9.584400884284545e-05, "loss": 0.7109, "step": 84 }, { "epoch": 0.6952965235173824, "grad_norm": 0.9980061650276184, "learning_rate": 9.56513432464249e-05, "loss": 0.6421, "step": 85 }, { "epoch": 0.7034764826175869, "grad_norm": 1.1087136268615723, "learning_rate": 9.545451511201364e-05, "loss": 0.6337, "step": 86 }, { "epoch": 0.7116564417177914, "grad_norm": 1.4353466033935547, "learning_rate": 9.525354238661009e-05, "loss": 1.0757, "step": 87 }, { "epoch": 0.7198364008179959, "grad_norm": 1.1413462162017822, "learning_rate": 9.504844339512095e-05, "loss": 0.7056, "step": 88 }, { "epoch": 0.7280163599182005, "grad_norm": 
1.5157971382141113, "learning_rate": 9.483923683869024e-05, "loss": 0.8767, "step": 89 }, { "epoch": 0.7361963190184049, "grad_norm": 0.999251663684845, "learning_rate": 9.462594179299406e-05, "loss": 0.922, "step": 90 }, { "epoch": 0.7443762781186094, "grad_norm": 1.2393922805786133, "learning_rate": 9.440857770650138e-05, "loss": 0.8301, "step": 91 }, { "epoch": 0.7525562372188139, "grad_norm": 1.1513807773590088, "learning_rate": 9.418716439870057e-05, "loss": 0.5308, "step": 92 }, { "epoch": 0.7607361963190185, "grad_norm": 1.4229981899261475, "learning_rate": 9.396172205829234e-05, "loss": 1.1116, "step": 93 }, { "epoch": 0.7689161554192229, "grad_norm": 1.4916852712631226, "learning_rate": 9.373227124134888e-05, "loss": 0.8806, "step": 94 }, { "epoch": 0.7770961145194274, "grad_norm": 1.693434715270996, "learning_rate": 9.34988328694395e-05, "loss": 0.7924, "step": 95 }, { "epoch": 0.7852760736196319, "grad_norm": 1.1455564498901367, "learning_rate": 9.326142822772302e-05, "loss": 0.7091, "step": 96 }, { "epoch": 0.7934560327198364, "grad_norm": 1.031115174293518, "learning_rate": 9.302007896300698e-05, "loss": 0.8828, "step": 97 }, { "epoch": 0.8016359918200409, "grad_norm": 1.5678527355194092, "learning_rate": 9.27748070817738e-05, "loss": 1.0531, "step": 98 }, { "epoch": 0.8098159509202454, "grad_norm": 1.2558964490890503, "learning_rate": 9.252563494817425e-05, "loss": 0.9222, "step": 99 }, { "epoch": 0.8179959100204499, "grad_norm": 1.2573111057281494, "learning_rate": 9.227258528198831e-05, "loss": 0.8988, "step": 100 }, { "epoch": 0.8261758691206544, "grad_norm": 1.3229575157165527, "learning_rate": 9.201568115655342e-05, "loss": 1.0139, "step": 101 }, { "epoch": 0.8343558282208589, "grad_norm": 0.913388729095459, "learning_rate": 9.175494599666077e-05, "loss": 0.6802, "step": 102 }, { "epoch": 0.8425357873210634, "grad_norm": 3.0073134899139404, "learning_rate": 9.149040357641929e-05, "loss": 0.8834, "step": 103 }, { "epoch": 0.8507157464212679, 
"grad_norm": 0.9436314105987549, "learning_rate": 9.122207801708802e-05, "loss": 0.6559, "step": 104 }, { "epoch": 0.8588957055214724, "grad_norm": 1.1913410425186157, "learning_rate": 9.094999378487659e-05, "loss": 0.7427, "step": 105 }, { "epoch": 0.8670756646216768, "grad_norm": 1.03123140335083, "learning_rate": 9.067417568871445e-05, "loss": 0.6253, "step": 106 }, { "epoch": 0.8752556237218814, "grad_norm": 0.9424878358840942, "learning_rate": 9.03946488779887e-05, "loss": 0.7601, "step": 107 }, { "epoch": 0.8834355828220859, "grad_norm": 1.2202138900756836, "learning_rate": 9.011143884025101e-05, "loss": 1.024, "step": 108 }, { "epoch": 0.8916155419222904, "grad_norm": 1.3849170207977295, "learning_rate": 8.982457139889357e-05, "loss": 0.8283, "step": 109 }, { "epoch": 0.8997955010224948, "grad_norm": 1.2288554906845093, "learning_rate": 8.953407271079455e-05, "loss": 0.8297, "step": 110 }, { "epoch": 0.9079754601226994, "grad_norm": 0.9206739664077759, "learning_rate": 8.923996926393305e-05, "loss": 0.5966, "step": 111 }, { "epoch": 0.9161554192229039, "grad_norm": 0.9282044172286987, "learning_rate": 8.894228787497389e-05, "loss": 0.7775, "step": 112 }, { "epoch": 0.9243353783231084, "grad_norm": 1.2421815395355225, "learning_rate": 8.864105568682244e-05, "loss": 0.7838, "step": 113 }, { "epoch": 0.9325153374233128, "grad_norm": 1.1172124147415161, "learning_rate": 8.833630016614976e-05, "loss": 0.5921, "step": 114 }, { "epoch": 0.9406952965235174, "grad_norm": 1.310473918914795, "learning_rate": 8.802804910088809e-05, "loss": 1.0578, "step": 115 }, { "epoch": 0.9488752556237219, "grad_norm": 1.1656848192214966, "learning_rate": 8.771633059769711e-05, "loss": 0.8205, "step": 116 }, { "epoch": 0.9570552147239264, "grad_norm": 1.0159672498703003, "learning_rate": 8.740117307940123e-05, "loss": 0.8237, "step": 117 }, { "epoch": 0.9652351738241309, "grad_norm": 1.2568827867507935, "learning_rate": 8.708260528239788e-05, "loss": 1.0473, "step": 118 }, { "epoch": 
0.9734151329243353, "grad_norm": 1.2711869478225708, "learning_rate": 8.676065625403733e-05, "loss": 1.0789, "step": 119 }, { "epoch": 0.9815950920245399, "grad_norm": 1.2562803030014038, "learning_rate": 8.64353553499741e-05, "loss": 0.5315, "step": 120 }, { "epoch": 0.9897750511247444, "grad_norm": 1.4107680320739746, "learning_rate": 8.610673223149034e-05, "loss": 0.9738, "step": 121 }, { "epoch": 0.9979550102249489, "grad_norm": 1.0771377086639404, "learning_rate": 8.577481686279123e-05, "loss": 0.6114, "step": 122 }, { "epoch": 1.0, "grad_norm": 0.6168379187583923, "learning_rate": 8.543963950827279e-05, "loss": 0.1857, "step": 123 }, { "epoch": 1.0081799591002045, "grad_norm": 1.0099263191223145, "learning_rate": 8.510123072976239e-05, "loss": 0.77, "step": 124 }, { "epoch": 1.016359918200409, "grad_norm": 1.0608845949172974, "learning_rate": 8.475962138373213e-05, "loss": 0.588, "step": 125 }, { "epoch": 1.0245398773006136, "grad_norm": 1.2696729898452759, "learning_rate": 8.441484261848514e-05, "loss": 0.8879, "step": 126 }, { "epoch": 1.032719836400818, "grad_norm": 1.556289792060852, "learning_rate": 8.406692587131568e-05, "loss": 1.1292, "step": 127 }, { "epoch": 1.0408997955010224, "grad_norm": 1.027779459953308, "learning_rate": 8.371590286564247e-05, "loss": 0.6589, "step": 128 }, { "epoch": 1.049079754601227, "grad_norm": 1.1132937669754028, "learning_rate": 8.336180560811619e-05, "loss": 0.6267, "step": 129 }, { "epoch": 1.0572597137014315, "grad_norm": 1.7684708833694458, "learning_rate": 8.30046663857011e-05, "loss": 0.7399, "step": 130 }, { "epoch": 1.065439672801636, "grad_norm": 1.372917890548706, "learning_rate": 8.264451776273104e-05, "loss": 0.8849, "step": 131 }, { "epoch": 1.0736196319018405, "grad_norm": 1.1678389310836792, "learning_rate": 8.228139257794012e-05, "loss": 0.901, "step": 132 }, { "epoch": 1.081799591002045, "grad_norm": 1.0002321004867554, "learning_rate": 8.191532394146865e-05, "loss": 0.3923, "step": 133 }, { "epoch": 
1.0899795501022496, "grad_norm": 1.0493892431259155, "learning_rate": 8.154634523184388e-05, "loss": 0.77, "step": 134 }, { "epoch": 1.098159509202454, "grad_norm": 1.4020309448242188, "learning_rate": 8.117449009293668e-05, "loss": 0.7849, "step": 135 }, { "epoch": 1.1063394683026584, "grad_norm": 1.3133962154388428, "learning_rate": 8.07997924308938e-05, "loss": 0.7912, "step": 136 }, { "epoch": 1.114519427402863, "grad_norm": 1.3940467834472656, "learning_rate": 8.042228641104622e-05, "loss": 0.7142, "step": 137 }, { "epoch": 1.1226993865030674, "grad_norm": 1.3877304792404175, "learning_rate": 8.004200645479403e-05, "loss": 0.5454, "step": 138 }, { "epoch": 1.130879345603272, "grad_norm": 1.1132571697235107, "learning_rate": 7.965898723646776e-05, "loss": 0.7387, "step": 139 }, { "epoch": 1.1390593047034765, "grad_norm": 1.206382155418396, "learning_rate": 7.927326368016677e-05, "loss": 0.7271, "step": 140 }, { "epoch": 1.147239263803681, "grad_norm": 1.2813421487808228, "learning_rate": 7.888487095657484e-05, "loss": 0.8452, "step": 141 }, { "epoch": 1.1554192229038855, "grad_norm": 1.1103274822235107, "learning_rate": 7.849384447975321e-05, "loss": 0.5735, "step": 142 }, { "epoch": 1.16359918200409, "grad_norm": 1.1904572248458862, "learning_rate": 7.810021990391164e-05, "loss": 0.486, "step": 143 }, { "epoch": 1.1717791411042944, "grad_norm": 1.361222743988037, "learning_rate": 7.770403312015721e-05, "loss": 0.9265, "step": 144 }, { "epoch": 1.179959100204499, "grad_norm": 1.1453652381896973, "learning_rate": 7.73053202532219e-05, "loss": 0.6186, "step": 145 }, { "epoch": 1.1881390593047034, "grad_norm": 1.2070741653442383, "learning_rate": 7.690411765816864e-05, "loss": 0.7012, "step": 146 }, { "epoch": 1.196319018404908, "grad_norm": 1.4246371984481812, "learning_rate": 7.650046191707641e-05, "loss": 0.7644, "step": 147 }, { "epoch": 1.2044989775051125, "grad_norm": 1.2275187969207764, "learning_rate": 7.60943898357046e-05, "loss": 0.614, "step": 148 }, { 
"epoch": 1.212678936605317, "grad_norm": 1.292330265045166, "learning_rate": 7.568593844013718e-05, "loss": 0.6722, "step": 149 }, { "epoch": 1.2208588957055215, "grad_norm": 1.54197359085083, "learning_rate": 7.527514497340642e-05, "loss": 0.6981, "step": 150 }, { "epoch": 1.229038854805726, "grad_norm": 1.605914831161499, "learning_rate": 7.48620468920972e-05, "loss": 0.7524, "step": 151 }, { "epoch": 1.2372188139059306, "grad_norm": 1.2442176342010498, "learning_rate": 7.444668186293153e-05, "loss": 0.6238, "step": 152 }, { "epoch": 1.2453987730061349, "grad_norm": 1.4838721752166748, "learning_rate": 7.402908775933419e-05, "loss": 0.7599, "step": 153 }, { "epoch": 1.2535787321063394, "grad_norm": 1.8454984426498413, "learning_rate": 7.360930265797935e-05, "loss": 1.1331, "step": 154 }, { "epoch": 1.261758691206544, "grad_norm": 1.3571646213531494, "learning_rate": 7.31873648353186e-05, "loss": 0.6468, "step": 155 }, { "epoch": 1.2699386503067485, "grad_norm": 1.3795866966247559, "learning_rate": 7.276331276409106e-05, "loss": 0.7253, "step": 156 }, { "epoch": 1.278118609406953, "grad_norm": 1.4821308851242065, "learning_rate": 7.23371851098152e-05, "loss": 0.842, "step": 157 }, { "epoch": 1.2862985685071575, "grad_norm": 1.0921138525009155, "learning_rate": 7.190902072726335e-05, "loss": 0.5379, "step": 158 }, { "epoch": 1.294478527607362, "grad_norm": 1.5662935972213745, "learning_rate": 7.147885865691899e-05, "loss": 0.918, "step": 159 }, { "epoch": 1.3026584867075663, "grad_norm": 1.3333555459976196, "learning_rate": 7.104673812141675e-05, "loss": 0.6727, "step": 160 }, { "epoch": 1.310838445807771, "grad_norm": 1.1487497091293335, "learning_rate": 7.061269852196632e-05, "loss": 0.4279, "step": 161 }, { "epoch": 1.3190184049079754, "grad_norm": 1.1033565998077393, "learning_rate": 7.017677943475961e-05, "loss": 0.6372, "step": 162 }, { "epoch": 1.32719836400818, "grad_norm": 1.2100588083267212, "learning_rate": 6.973902060736226e-05, "loss": 0.7071, "step": 
163 }, { "epoch": 1.3353783231083844, "grad_norm": 1.421066403388977, "learning_rate": 6.929946195508932e-05, "loss": 0.767, "step": 164 }, { "epoch": 1.343558282208589, "grad_norm": 1.2306902408599854, "learning_rate": 6.885814355736586e-05, "loss": 0.5587, "step": 165 }, { "epoch": 1.3517382413087935, "grad_norm": 1.5315287113189697, "learning_rate": 6.841510565407235e-05, "loss": 0.7519, "step": 166 }, { "epoch": 1.359918200408998, "grad_norm": 1.2497670650482178, "learning_rate": 6.797038864187564e-05, "loss": 0.5612, "step": 167 }, { "epoch": 1.3680981595092025, "grad_norm": 1.6106078624725342, "learning_rate": 6.752403307054549e-05, "loss": 0.7194, "step": 168 }, { "epoch": 1.3762781186094069, "grad_norm": 1.2407530546188354, "learning_rate": 6.707607963925724e-05, "loss": 0.531, "step": 169 }, { "epoch": 1.3844580777096114, "grad_norm": 1.663898229598999, "learning_rate": 6.66265691928808e-05, "loss": 0.7906, "step": 170 }, { "epoch": 1.392638036809816, "grad_norm": 1.3650121688842773, "learning_rate": 6.617554271825636e-05, "loss": 0.7207, "step": 171 }, { "epoch": 1.4008179959100204, "grad_norm": 1.1001300811767578, "learning_rate": 6.572304134045717e-05, "loss": 0.5145, "step": 172 }, { "epoch": 1.408997955010225, "grad_norm": 1.0687707662582397, "learning_rate": 6.526910631903973e-05, "loss": 0.3521, "step": 173 }, { "epoch": 1.4171779141104295, "grad_norm": 1.2442213296890259, "learning_rate": 6.481377904428171e-05, "loss": 0.7026, "step": 174 }, { "epoch": 1.425357873210634, "grad_norm": 1.31452214717865, "learning_rate": 6.435710103340786e-05, "loss": 0.7313, "step": 175 }, { "epoch": 1.4335378323108383, "grad_norm": 1.5573769807815552, "learning_rate": 6.389911392680456e-05, "loss": 0.7659, "step": 176 }, { "epoch": 1.441717791411043, "grad_norm": 1.2089431285858154, "learning_rate": 6.343985948422287e-05, "loss": 0.6916, "step": 177 }, { "epoch": 1.4498977505112474, "grad_norm": 1.5785194635391235, "learning_rate": 6.297937958097094e-05, "loss": 
0.8101, "step": 178 }, { "epoch": 1.4580777096114519, "grad_norm": 1.4134269952774048, "learning_rate": 6.251771620409563e-05, "loss": 0.7504, "step": 179 }, { "epoch": 1.4662576687116564, "grad_norm": 1.4751485586166382, "learning_rate": 6.205491144855432e-05, "loss": 0.5948, "step": 180 }, { "epoch": 1.474437627811861, "grad_norm": 1.31548273563385, "learning_rate": 6.159100751337642e-05, "loss": 0.7057, "step": 181 }, { "epoch": 1.4826175869120655, "grad_norm": 1.8151648044586182, "learning_rate": 6.112604669781572e-05, "loss": 0.9037, "step": 182 }, { "epoch": 1.49079754601227, "grad_norm": 1.3681972026824951, "learning_rate": 6.0660071397493514e-05, "loss": 0.7223, "step": 183 }, { "epoch": 1.4989775051124745, "grad_norm": 1.6292760372161865, "learning_rate": 6.019312410053286e-05, "loss": 0.6083, "step": 184 }, { "epoch": 1.5071574642126788, "grad_norm": 1.8144514560699463, "learning_rate": 5.972524738368452e-05, "loss": 0.7662, "step": 185 }, { "epoch": 1.5153374233128836, "grad_norm": 1.650654911994934, "learning_rate": 5.925648390844476e-05, "loss": 0.902, "step": 186 }, { "epoch": 1.5235173824130879, "grad_norm": 1.4780257940292358, "learning_rate": 5.878687641716538e-05, "loss": 0.6566, "step": 187 }, { "epoch": 1.5316973415132924, "grad_norm": 1.1706862449645996, "learning_rate": 5.831646772915651e-05, "loss": 0.4189, "step": 188 }, { "epoch": 1.539877300613497, "grad_norm": 1.287718653678894, "learning_rate": 5.7845300736782204e-05, "loss": 0.5549, "step": 189 }, { "epoch": 1.5480572597137015, "grad_norm": 1.3776918649673462, "learning_rate": 5.737341840154956e-05, "loss": 0.5456, "step": 190 }, { "epoch": 1.556237218813906, "grad_norm": 1.2569301128387451, "learning_rate": 5.6900863750191347e-05, "loss": 0.6808, "step": 191 }, { "epoch": 1.5644171779141103, "grad_norm": 1.7013508081436157, "learning_rate": 5.642767987074288e-05, "loss": 0.7974, "step": 192 }, { "epoch": 1.572597137014315, "grad_norm": 1.5190353393554688, "learning_rate": 
5.5953909908613114e-05, "loss": 0.5416, "step": 193 }, { "epoch": 1.5807770961145193, "grad_norm": 1.4736334085464478, "learning_rate": 5.547959706265068e-05, "loss": 0.6788, "step": 194 }, { "epoch": 1.588957055214724, "grad_norm": 2.006303548812866, "learning_rate": 5.5004784581204927e-05, "loss": 0.9634, "step": 195 }, { "epoch": 1.5971370143149284, "grad_norm": 1.3578423261642456, "learning_rate": 5.4529515758182506e-05, "loss": 0.6563, "step": 196 }, { "epoch": 1.605316973415133, "grad_norm": 1.4116990566253662, "learning_rate": 5.405383392909973e-05, "loss": 0.6062, "step": 197 }, { "epoch": 1.6134969325153374, "grad_norm": 1.2362536191940308, "learning_rate": 5.357778246713131e-05, "loss": 0.4829, "step": 198 }, { "epoch": 1.621676891615542, "grad_norm": 1.1780372858047485, "learning_rate": 5.310140477915544e-05, "loss": 0.465, "step": 199 }, { "epoch": 1.6298568507157465, "grad_norm": 1.3919291496276855, "learning_rate": 5.262474430179597e-05, "loss": 0.6967, "step": 200 }, { "epoch": 1.6380368098159508, "grad_norm": 1.7452629804611206, "learning_rate": 5.214784449746174e-05, "loss": 0.9096, "step": 201 }, { "epoch": 1.6462167689161555, "grad_norm": 1.4730846881866455, "learning_rate": 5.167074885038373e-05, "loss": 0.7473, "step": 202 }, { "epoch": 1.6543967280163598, "grad_norm": 1.5404870510101318, "learning_rate": 5.119350086265004e-05, "loss": 0.6233, "step": 203 }, { "epoch": 1.6625766871165644, "grad_norm": 1.4780898094177246, "learning_rate": 5.0716144050239375e-05, "loss": 0.7599, "step": 204 }, { "epoch": 1.670756646216769, "grad_norm": 1.194542407989502, "learning_rate": 5.023872193905316e-05, "loss": 0.5638, "step": 205 }, { "epoch": 1.6789366053169734, "grad_norm": 1.3504347801208496, "learning_rate": 4.976127806094684e-05, "loss": 0.4701, "step": 206 }, { "epoch": 1.687116564417178, "grad_norm": 2.1446099281311035, "learning_rate": 4.928385594976063e-05, "loss": 0.9383, "step": 207 }, { "epoch": 1.6952965235173822, "grad_norm": 
1.4745142459869385, "learning_rate": 4.880649913734996e-05, "loss": 0.5817, "step": 208 }, { "epoch": 1.703476482617587, "grad_norm": 1.3029444217681885, "learning_rate": 4.832925114961629e-05, "loss": 0.3481, "step": 209 }, { "epoch": 1.7116564417177913, "grad_norm": 1.1414580345153809, "learning_rate": 4.785215550253826e-05, "loss": 0.4348, "step": 210 }, { "epoch": 1.719836400817996, "grad_norm": 1.4996669292449951, "learning_rate": 4.7375255698204045e-05, "loss": 0.653, "step": 211 }, { "epoch": 1.7280163599182004, "grad_norm": 1.66719388961792, "learning_rate": 4.6898595220844574e-05, "loss": 0.7181, "step": 212 }, { "epoch": 1.7361963190184049, "grad_norm": 1.476560354232788, "learning_rate": 4.64222175328687e-05, "loss": 0.6183, "step": 213 }, { "epoch": 1.7443762781186094, "grad_norm": 1.7405219078063965, "learning_rate": 4.594616607090028e-05, "loss": 0.689, "step": 214 }, { "epoch": 1.752556237218814, "grad_norm": 1.1866732835769653, "learning_rate": 4.547048424181751e-05, "loss": 0.4616, "step": 215 }, { "epoch": 1.7607361963190185, "grad_norm": 1.7068077325820923, "learning_rate": 4.4995215418795085e-05, "loss": 0.6318, "step": 216 }, { "epoch": 1.7689161554192228, "grad_norm": 1.4736443758010864, "learning_rate": 4.452040293734934e-05, "loss": 0.4611, "step": 217 }, { "epoch": 1.7770961145194275, "grad_norm": 0.8084559440612793, "learning_rate": 4.404609009138689e-05, "loss": 0.1962, "step": 218 }, { "epoch": 1.7852760736196318, "grad_norm": 1.1126220226287842, "learning_rate": 4.357232012925714e-05, "loss": 0.3804, "step": 219 }, { "epoch": 1.7934560327198366, "grad_norm": 1.4977810382843018, "learning_rate": 4.3099136249808665e-05, "loss": 0.5431, "step": 220 }, { "epoch": 1.8016359918200409, "grad_norm": 1.47788405418396, "learning_rate": 4.262658159845046e-05, "loss": 0.6498, "step": 221 }, { "epoch": 1.8098159509202454, "grad_norm": 1.2339309453964233, "learning_rate": 4.215469926321779e-05, "loss": 0.4812, "step": 222 }, { "epoch": 
1.81799591002045, "grad_norm": 1.4342414140701294, "learning_rate": 4.1683532270843504e-05, "loss": 0.5703, "step": 223 }, { "epoch": 1.8261758691206544, "grad_norm": 1.795954942703247, "learning_rate": 4.121312358283463e-05, "loss": 0.992, "step": 224 }, { "epoch": 1.834355828220859, "grad_norm": 1.3253310918807983, "learning_rate": 4.074351609155527e-05, "loss": 0.5907, "step": 225 }, { "epoch": 1.8425357873210633, "grad_norm": 1.4935518503189087, "learning_rate": 4.027475261631548e-05, "loss": 0.7448, "step": 226 }, { "epoch": 1.850715746421268, "grad_norm": 1.8565064668655396, "learning_rate": 3.980687589946715e-05, "loss": 0.8506, "step": 227 }, { "epoch": 1.8588957055214723, "grad_norm": 2.0860605239868164, "learning_rate": 3.9339928602506505e-05, "loss": 0.4935, "step": 228 }, { "epoch": 1.8670756646216768, "grad_norm": 1.6324408054351807, "learning_rate": 3.887395330218429e-05, "loss": 0.586, "step": 229 }, { "epoch": 1.8752556237218814, "grad_norm": 1.8466233015060425, "learning_rate": 3.840899248662358e-05, "loss": 0.7596, "step": 230 }, { "epoch": 1.883435582822086, "grad_norm": 1.867876648902893, "learning_rate": 3.7945088551445693e-05, "loss": 0.7946, "step": 231 }, { "epoch": 1.8916155419222904, "grad_norm": 1.3713316917419434, "learning_rate": 3.748228379590438e-05, "loss": 0.5414, "step": 232 }, { "epoch": 1.8997955010224947, "grad_norm": 1.6689939498901367, "learning_rate": 3.7020620419029094e-05, "loss": 0.6574, "step": 233 }, { "epoch": 1.9079754601226995, "grad_norm": 1.4076114892959595, "learning_rate": 3.656014051577713e-05, "loss": 0.5052, "step": 234 }, { "epoch": 1.9161554192229038, "grad_norm": 1.6957688331604004, "learning_rate": 3.610088607319544e-05, "loss": 0.4209, "step": 235 }, { "epoch": 1.9243353783231085, "grad_norm": 1.465134620666504, "learning_rate": 3.564289896659214e-05, "loss": 0.562, "step": 236 }, { "epoch": 1.9325153374233128, "grad_norm": 1.626769781112671, "learning_rate": 3.5186220955718306e-05, "loss": 0.6494, "step": 
237 }, { "epoch": 1.9406952965235174, "grad_norm": 1.2987111806869507, "learning_rate": 3.473089368096026e-05, "loss": 0.5365, "step": 238 }, { "epoch": 1.9488752556237219, "grad_norm": 1.7133764028549194, "learning_rate": 3.427695865954284e-05, "loss": 0.7972, "step": 239 }, { "epoch": 1.9570552147239264, "grad_norm": 1.067958116531372, "learning_rate": 3.3824457281743646e-05, "loss": 0.2413, "step": 240 }, { "epoch": 1.965235173824131, "grad_norm": 1.5035715103149414, "learning_rate": 3.337343080711921e-05, "loss": 0.655, "step": 241 }, { "epoch": 1.9734151329243352, "grad_norm": 1.9790688753128052, "learning_rate": 3.2923920360742774e-05, "loss": 0.7517, "step": 242 }, { "epoch": 1.98159509202454, "grad_norm": 1.79633367061615, "learning_rate": 3.2475966929454504e-05, "loss": 0.527, "step": 243 }, { "epoch": 1.9897750511247443, "grad_norm": 1.59013032913208, "learning_rate": 3.202961135812437e-05, "loss": 0.5922, "step": 244 }, { "epoch": 1.997955010224949, "grad_norm": 1.6466726064682007, "learning_rate": 3.158489434592766e-05, "loss": 0.6738, "step": 245 }, { "epoch": 2.0, "grad_norm": 0.8072463274002075, "learning_rate": 3.114185644263415e-05, "loss": 0.1228, "step": 246 }, { "epoch": 2.0081799591002043, "grad_norm": 1.412455439567566, "learning_rate": 3.070053804491068e-05, "loss": 0.5372, "step": 247 }, { "epoch": 2.016359918200409, "grad_norm": 1.15187406539917, "learning_rate": 3.026097939263775e-05, "loss": 0.3056, "step": 248 }, { "epoch": 2.0245398773006134, "grad_norm": 1.3967753648757935, "learning_rate": 2.9823220565240394e-05, "loss": 0.5469, "step": 249 }, { "epoch": 2.032719836400818, "grad_norm": 1.5053709745407104, "learning_rate": 2.938730147803369e-05, "loss": 0.5333, "step": 250 }, { "epoch": 2.0408997955010224, "grad_norm": 1.2719367742538452, "learning_rate": 2.895326187858326e-05, "loss": 0.4873, "step": 251 }, { "epoch": 2.049079754601227, "grad_norm": 1.3638321161270142, "learning_rate": 2.852114134308104e-05, "loss": 0.4676, "step": 
252 }, { "epoch": 2.0572597137014315, "grad_norm": 1.4079426527023315, "learning_rate": 2.8090979272736662e-05, "loss": 0.5474, "step": 253 }, { "epoch": 2.065439672801636, "grad_norm": 1.3648539781570435, "learning_rate": 2.7662814890184818e-05, "loss": 0.3774, "step": 254 }, { "epoch": 2.0736196319018405, "grad_norm": 1.411365032196045, "learning_rate": 2.7236687235908953e-05, "loss": 0.5021, "step": 255 }, { "epoch": 2.081799591002045, "grad_norm": 0.8350080251693726, "learning_rate": 2.6812635164681386e-05, "loss": 0.295, "step": 256 }, { "epoch": 2.0899795501022496, "grad_norm": 1.4837121963500977, "learning_rate": 2.6390697342020665e-05, "loss": 0.4359, "step": 257 }, { "epoch": 2.098159509202454, "grad_norm": 1.447041392326355, "learning_rate": 2.5970912240665813e-05, "loss": 0.4699, "step": 258 }, { "epoch": 2.1063394683026586, "grad_norm": 1.5087660551071167, "learning_rate": 2.555331813706847e-05, "loss": 0.5016, "step": 259 }, { "epoch": 2.114519427402863, "grad_norm": 1.4970585107803345, "learning_rate": 2.5137953107902813e-05, "loss": 0.4827, "step": 260 }, { "epoch": 2.1226993865030677, "grad_norm": 1.5823018550872803, "learning_rate": 2.472485502659358e-05, "loss": 0.3951, "step": 261 }, { "epoch": 2.130879345603272, "grad_norm": 1.208630919456482, "learning_rate": 2.4314061559862833e-05, "loss": 0.3384, "step": 262 }, { "epoch": 2.1390593047034763, "grad_norm": 1.6956486701965332, "learning_rate": 2.3905610164295394e-05, "loss": 0.4982, "step": 263 }, { "epoch": 2.147239263803681, "grad_norm": 1.4397342205047607, "learning_rate": 2.3499538082923606e-05, "loss": 0.4574, "step": 264 }, { "epoch": 2.1554192229038853, "grad_norm": 1.3102678060531616, "learning_rate": 2.3095882341831372e-05, "loss": 0.3559, "step": 265 }, { "epoch": 2.16359918200409, "grad_norm": 1.2937331199645996, "learning_rate": 2.2694679746778115e-05, "loss": 0.3721, "step": 266 }, { "epoch": 2.1717791411042944, "grad_norm": 1.5506526231765747, "learning_rate": 2.22959668798428e-05, 
"loss": 0.4909, "step": 267 }, { "epoch": 2.179959100204499, "grad_norm": 1.8627556562423706, "learning_rate": 2.1899780096088375e-05, "loss": 0.7858, "step": 268 }, { "epoch": 2.1881390593047034, "grad_norm": 1.7848111391067505, "learning_rate": 2.1506155520246797e-05, "loss": 0.6337, "step": 269 }, { "epoch": 2.196319018404908, "grad_norm": 1.2659337520599365, "learning_rate": 2.1115129043425187e-05, "loss": 0.2693, "step": 270 }, { "epoch": 2.2044989775051125, "grad_norm": 1.6412032842636108, "learning_rate": 2.0726736319833228e-05, "loss": 0.5306, "step": 271 }, { "epoch": 2.212678936605317, "grad_norm": 1.611624002456665, "learning_rate": 2.0341012763532243e-05, "loss": 0.3388, "step": 272 }, { "epoch": 2.2208588957055215, "grad_norm": 1.1925326585769653, "learning_rate": 1.995799354520598e-05, "loss": 0.3615, "step": 273 }, { "epoch": 2.229038854805726, "grad_norm": 1.7512476444244385, "learning_rate": 1.9577713588953795e-05, "loss": 0.5129, "step": 274 }, { "epoch": 2.2372188139059306, "grad_norm": 1.5006930828094482, "learning_rate": 1.9200207569106216e-05, "loss": 0.4129, "step": 275 }, { "epoch": 2.245398773006135, "grad_norm": 1.8680585622787476, "learning_rate": 1.8825509907063327e-05, "loss": 0.5374, "step": 276 }, { "epoch": 2.2535787321063396, "grad_norm": 1.8856024742126465, "learning_rate": 1.8453654768156138e-05, "loss": 0.562, "step": 277 }, { "epoch": 2.261758691206544, "grad_norm": 1.9243358373641968, "learning_rate": 1.8084676058531373e-05, "loss": 0.6637, "step": 278 }, { "epoch": 2.2699386503067487, "grad_norm": 2.3150854110717773, "learning_rate": 1.771860742205988e-05, "loss": 0.5932, "step": 279 }, { "epoch": 2.278118609406953, "grad_norm": 1.2950345277786255, "learning_rate": 1.7355482237268983e-05, "loss": 0.341, "step": 280 }, { "epoch": 2.2862985685071573, "grad_norm": 1.5685244798660278, "learning_rate": 1.699533361429891e-05, "loss": 0.4248, "step": 281 }, { "epoch": 2.294478527607362, "grad_norm": 1.7234948873519897, 
"learning_rate": 1.663819439188382e-05, "loss": 0.7139, "step": 282 }, { "epoch": 2.3026584867075663, "grad_norm": 1.5493229627609253, "learning_rate": 1.6284097134357536e-05, "loss": 0.4609, "step": 283 }, { "epoch": 2.310838445807771, "grad_norm": 1.262978196144104, "learning_rate": 1.5933074128684332e-05, "loss": 0.3572, "step": 284 }, { "epoch": 2.3190184049079754, "grad_norm": 1.7874940633773804, "learning_rate": 1.5585157381514875e-05, "loss": 0.5078, "step": 285 }, { "epoch": 2.32719836400818, "grad_norm": 1.7057137489318848, "learning_rate": 1.5240378616267886e-05, "loss": 0.5262, "step": 286 }, { "epoch": 2.3353783231083844, "grad_norm": 1.5174486637115479, "learning_rate": 1.489876927023761e-05, "loss": 0.4075, "step": 287 }, { "epoch": 2.3435582822085887, "grad_norm": 1.473712682723999, "learning_rate": 1.4560360491727231e-05, "loss": 0.4237, "step": 288 }, { "epoch": 2.3517382413087935, "grad_norm": 2.0275111198425293, "learning_rate": 1.4225183137208776e-05, "loss": 0.7344, "step": 289 }, { "epoch": 2.359918200408998, "grad_norm": 1.5504990816116333, "learning_rate": 1.389326776850966e-05, "loss": 0.5226, "step": 290 }, { "epoch": 2.3680981595092025, "grad_norm": 0.9763877987861633, "learning_rate": 1.3564644650025893e-05, "loss": 0.2004, "step": 291 }, { "epoch": 2.376278118609407, "grad_norm": 1.6431723833084106, "learning_rate": 1.3239343745962679e-05, "loss": 0.5426, "step": 292 }, { "epoch": 2.3844580777096116, "grad_norm": 1.4661204814910889, "learning_rate": 1.2917394717602121e-05, "loss": 0.3689, "step": 293 }, { "epoch": 2.392638036809816, "grad_norm": 1.3995070457458496, "learning_rate": 1.2598826920598772e-05, "loss": 0.3994, "step": 294 }, { "epoch": 2.40081799591002, "grad_norm": 1.6375926733016968, "learning_rate": 1.2283669402302878e-05, "loss": 0.4635, "step": 295 }, { "epoch": 2.408997955010225, "grad_norm": 1.6579980850219727, "learning_rate": 1.197195089911191e-05, "loss": 0.44, "step": 296 }, { "epoch": 2.4171779141104293, 
"grad_norm": 2.057859420776367, "learning_rate": 1.1663699833850238e-05, "loss": 0.809, "step": 297 }, { "epoch": 2.425357873210634, "grad_norm": 1.9846243858337402, "learning_rate": 1.1358944313177567e-05, "loss": 0.526, "step": 298 }, { "epoch": 2.4335378323108383, "grad_norm": 1.8454967737197876, "learning_rate": 1.1057712125026116e-05, "loss": 0.4943, "step": 299 }, { "epoch": 2.441717791411043, "grad_norm": 1.3751471042633057, "learning_rate": 1.0760030736066951e-05, "loss": 0.2973, "step": 300 }, { "epoch": 2.4498977505112474, "grad_norm": 1.7352081537246704, "learning_rate": 1.0465927289205452e-05, "loss": 0.4647, "step": 301 }, { "epoch": 2.458077709611452, "grad_norm": 1.6583192348480225, "learning_rate": 1.017542860110644e-05, "loss": 0.5614, "step": 302 }, { "epoch": 2.4662576687116564, "grad_norm": 1.1086567640304565, "learning_rate": 9.888561159748993e-06, "loss": 0.2343, "step": 303 }, { "epoch": 2.474437627811861, "grad_norm": 1.2182183265686035, "learning_rate": 9.605351122011309e-06, "loss": 0.5084, "step": 304 }, { "epoch": 2.4826175869120655, "grad_norm": 1.5897687673568726, "learning_rate": 9.325824311285564e-06, "loss": 0.4916, "step": 305 }, { "epoch": 2.4907975460122698, "grad_norm": 1.7576637268066406, "learning_rate": 9.050006215123419e-06, "loss": 0.5896, "step": 306 }, { "epoch": 2.4989775051124745, "grad_norm": 1.3375118970870972, "learning_rate": 8.777921982911996e-06, "loss": 0.3472, "step": 307 }, { "epoch": 2.507157464212679, "grad_norm": 1.643762230873108, "learning_rate": 8.509596423580712e-06, "loss": 0.6561, "step": 308 }, { "epoch": 2.5153374233128836, "grad_norm": 1.8207759857177734, "learning_rate": 8.245054003339247e-06, "loss": 0.446, "step": 309 }, { "epoch": 2.523517382413088, "grad_norm": 1.7931218147277832, "learning_rate": 7.984318843446593e-06, "loss": 0.6626, "step": 310 }, { "epoch": 2.5316973415132926, "grad_norm": 1.5871256589889526, "learning_rate": 7.727414718011704e-06, "loss": 0.6779, "step": 311 }, { "epoch": 
2.539877300613497, "grad_norm": 1.6045511960983276, "learning_rate": 7.474365051825749e-06, "loss": 0.4369, "step": 312 }, { "epoch": 2.5480572597137012, "grad_norm": 1.9614039659500122, "learning_rate": 7.225192918226214e-06, "loss": 0.5339, "step": 313 }, { "epoch": 2.556237218813906, "grad_norm": 1.6761356592178345, "learning_rate": 6.979921036993042e-06, "loss": 0.4714, "step": 314 }, { "epoch": 2.5644171779141103, "grad_norm": 1.268598198890686, "learning_rate": 6.738571772276997e-06, "loss": 0.3589, "step": 315 }, { "epoch": 2.572597137014315, "grad_norm": 1.9515974521636963, "learning_rate": 6.501167130560515e-06, "loss": 0.7677, "step": 316 }, { "epoch": 2.5807770961145193, "grad_norm": 1.752503514289856, "learning_rate": 6.267728758651132e-06, "loss": 0.6019, "step": 317 }, { "epoch": 2.588957055214724, "grad_norm": 1.6404023170471191, "learning_rate": 6.03827794170767e-06, "loss": 0.3813, "step": 318 }, { "epoch": 2.5971370143149284, "grad_norm": 1.6431866884231567, "learning_rate": 5.8128356012994375e-06, "loss": 0.5397, "step": 319 }, { "epoch": 2.6053169734151327, "grad_norm": 1.604200005531311, "learning_rate": 5.591422293498633e-06, "loss": 0.5326, "step": 320 }, { "epoch": 2.6134969325153374, "grad_norm": 1.955712080001831, "learning_rate": 5.374058207005944e-06, "loss": 0.6279, "step": 321 }, { "epoch": 2.621676891615542, "grad_norm": 1.9583613872528076, "learning_rate": 5.160763161309767e-06, "loss": 0.7064, "step": 322 }, { "epoch": 2.6298568507157465, "grad_norm": 1.465756893157959, "learning_rate": 4.951556604879048e-06, "loss": 0.3176, "step": 323 }, { "epoch": 2.638036809815951, "grad_norm": 1.0220084190368652, "learning_rate": 4.746457613389904e-06, "loss": 0.1989, "step": 324 }, { "epoch": 2.6462167689161555, "grad_norm": 1.9900139570236206, "learning_rate": 4.545484887986368e-06, "loss": 0.4488, "step": 325 }, { "epoch": 2.65439672801636, "grad_norm": 1.8389681577682495, "learning_rate": 4.348656753575092e-06, "loss": 0.8159, "step": 326 
}, { "epoch": 2.662576687116564, "grad_norm": 1.8046656847000122, "learning_rate": 4.155991157154554e-06, "loss": 0.5941, "step": 327 }, { "epoch": 2.670756646216769, "grad_norm": 1.5946298837661743, "learning_rate": 3.967505666178556e-06, "loss": 0.6167, "step": 328 }, { "epoch": 2.6789366053169736, "grad_norm": 1.6215424537658691, "learning_rate": 3.783217466954503e-06, "loss": 0.5432, "step": 329 }, { "epoch": 2.687116564417178, "grad_norm": 1.5136370658874512, "learning_rate": 3.603143363076217e-06, "loss": 0.2688, "step": 330 }, { "epoch": 2.6952965235173822, "grad_norm": 2.0225648880004883, "learning_rate": 3.427299773891868e-06, "loss": 0.3968, "step": 331 }, { "epoch": 2.703476482617587, "grad_norm": 1.170069694519043, "learning_rate": 3.2557027330067658e-06, "loss": 0.3143, "step": 332 }, { "epoch": 2.7116564417177913, "grad_norm": 1.2336766719818115, "learning_rate": 3.0883678868214806e-06, "loss": 0.4023, "step": 333 }, { "epoch": 2.719836400817996, "grad_norm": 1.8785996437072754, "learning_rate": 2.925310493105099e-06, "loss": 0.6501, "step": 334 }, { "epoch": 2.7280163599182004, "grad_norm": 1.7136589288711548, "learning_rate": 2.7665454196040664e-06, "loss": 0.3418, "step": 335 }, { "epoch": 2.736196319018405, "grad_norm": 1.5453672409057617, "learning_rate": 2.612087142686487e-06, "loss": 0.4047, "step": 336 }, { "epoch": 2.7443762781186094, "grad_norm": 1.5091831684112549, "learning_rate": 2.4619497460222184e-06, "loss": 0.3707, "step": 337 }, { "epoch": 2.7525562372188137, "grad_norm": 1.996533751487732, "learning_rate": 2.316146919298623e-06, "loss": 0.7776, "step": 338 }, { "epoch": 2.7607361963190185, "grad_norm": 2.2293291091918945, "learning_rate": 2.1746919569723855e-06, "loss": 0.7055, "step": 339 }, { "epoch": 2.7689161554192228, "grad_norm": 1.906553864479065, "learning_rate": 2.0375977570572967e-06, "loss": 0.6423, "step": 340 }, { "epoch": 2.7770961145194275, "grad_norm": 1.684910535812378, "learning_rate": 1.9048768199481982e-06, 
"loss": 0.5679, "step": 341 }, { "epoch": 2.785276073619632, "grad_norm": 1.3118062019348145, "learning_rate": 1.7765412472811771e-06, "loss": 0.3036, "step": 342 }, { "epoch": 2.7934560327198366, "grad_norm": 1.9178974628448486, "learning_rate": 1.6526027408301226e-06, "loss": 0.5829, "step": 343 }, { "epoch": 2.801635991820041, "grad_norm": 1.860939860343933, "learning_rate": 1.5330726014397668e-06, "loss": 0.4617, "step": 344 }, { "epoch": 2.809815950920245, "grad_norm": 1.7959818840026855, "learning_rate": 1.417961727995254e-06, "loss": 0.4604, "step": 345 }, { "epoch": 2.81799591002045, "grad_norm": 1.414788842201233, "learning_rate": 1.3072806164283358e-06, "loss": 0.3398, "step": 346 }, { "epoch": 2.8261758691206547, "grad_norm": 1.316179633140564, "learning_rate": 1.2010393587603974e-06, "loss": 0.3707, "step": 347 }, { "epoch": 2.834355828220859, "grad_norm": 2.140214443206787, "learning_rate": 1.099247642182205e-06, "loss": 0.6991, "step": 348 }, { "epoch": 2.8425357873210633, "grad_norm": 1.6871670484542847, "learning_rate": 1.0019147481706625e-06, "loss": 0.6069, "step": 349 }, { "epoch": 2.850715746421268, "grad_norm": 1.7937754392623901, "learning_rate": 9.090495516424713e-07, "loss": 0.3841, "step": 350 }, { "epoch": 2.8588957055214723, "grad_norm": 1.3567599058151245, "learning_rate": 8.206605201449447e-07, "loss": 0.3186, "step": 351 }, { "epoch": 2.8670756646216766, "grad_norm": 1.0344539880752563, "learning_rate": 7.36755713083892e-07, "loss": 0.1676, "step": 352 }, { "epoch": 2.8752556237218814, "grad_norm": 1.908477783203125, "learning_rate": 6.573427809888067e-07, "loss": 0.7295, "step": 353 }, { "epoch": 2.883435582822086, "grad_norm": 2.14554500579834, "learning_rate": 5.824289648152126e-07, "loss": 0.8187, "step": 354 }, { "epoch": 2.8916155419222904, "grad_norm": 1.6814268827438354, "learning_rate": 5.120210952844872e-07, "loss": 0.5205, "step": 355 }, { "epoch": 2.8997955010224947, "grad_norm": 1.6498082876205444, "learning_rate": 
4.461255922609986e-07, "loss": 0.4557, "step": 356 }, { "epoch": 2.9079754601226995, "grad_norm": 1.4337708950042725, "learning_rate": 3.8474846416672874e-07, "loss": 0.3251, "step": 357 }, { "epoch": 2.9161554192229038, "grad_norm": 1.875313401222229, "learning_rate": 3.278953074334512e-07, "loss": 0.5001, "step": 358 }, { "epoch": 2.9243353783231085, "grad_norm": 1.350846529006958, "learning_rate": 2.75571305992417e-07, "loss": 0.2414, "step": 359 }, { "epoch": 2.932515337423313, "grad_norm": 1.5978336334228516, "learning_rate": 2.2778123080167135e-07, "loss": 0.4585, "step": 360 }, { "epoch": 2.9406952965235176, "grad_norm": 1.672541856765747, "learning_rate": 1.8452943941106859e-07, "loss": 0.5382, "step": 361 }, { "epoch": 2.948875255623722, "grad_norm": 1.3987324237823486, "learning_rate": 1.4581987556490095e-07, "loss": 0.3326, "step": 362 }, { "epoch": 2.957055214723926, "grad_norm": 1.4565430879592896, "learning_rate": 1.1165606884234181e-07, "loss": 0.4901, "step": 363 }, { "epoch": 2.965235173824131, "grad_norm": 1.3861486911773682, "learning_rate": 8.204113433559201e-08, "loss": 0.2756, "step": 364 }, { "epoch": 2.9734151329243352, "grad_norm": 1.4839295148849487, "learning_rate": 5.697777236585711e-08, "loss": 0.3303, "step": 365 }, { "epoch": 2.98159509202454, "grad_norm": 1.6138904094696045, "learning_rate": 3.6468268237105366e-08, "loss": 0.558, "step": 366 }, { "epoch": 2.98159509202454, "step": 366, "total_flos": 3.142935032247091e+16, "train_loss": 0.7803121163341843, "train_runtime": 701.3998, "train_samples_per_second": 4.183, "train_steps_per_second": 0.522 } ], "logging_steps": 1, "max_steps": 366, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.142935032247091e+16, "train_batch_size": 2, "trial_name": 
null, "trial_params": null }