{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.98159509202454, |
|
"eval_steps": 500, |
|
"global_step": 366, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0081799591002045, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0, |
|
"loss": 1.978, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.016359918200409, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0, |
|
"loss": 2.0417, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.024539877300613498, |
|
"grad_norm": 3.960280656814575, |
|
"learning_rate": 0.0, |
|
"loss": 2.4526, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.032719836400818, |
|
"grad_norm": 4.871670722961426, |
|
"learning_rate": 2.702702702702703e-06, |
|
"loss": 2.3708, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0408997955010225, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.405405405405406e-06, |
|
"loss": 2.7402, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.049079754601226995, |
|
"grad_norm": 2.961730718612671, |
|
"learning_rate": 5.405405405405406e-06, |
|
"loss": 2.6255, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.05725971370143149, |
|
"grad_norm": 2.4267706871032715, |
|
"learning_rate": 8.108108108108109e-06, |
|
"loss": 2.1672, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.065439672801636, |
|
"grad_norm": 4.815489768981934, |
|
"learning_rate": 1.0810810810810812e-05, |
|
"loss": 2.6853, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0736196319018405, |
|
"grad_norm": 5.672784805297852, |
|
"learning_rate": 1.3513513513513515e-05, |
|
"loss": 2.5256, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.081799591002045, |
|
"grad_norm": 2.97552490234375, |
|
"learning_rate": 1.6216216216216218e-05, |
|
"loss": 2.1108, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08997955010224949, |
|
"grad_norm": 2.305542469024658, |
|
"learning_rate": 1.891891891891892e-05, |
|
"loss": 2.2621, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.09815950920245399, |
|
"grad_norm": 2.330063581466675, |
|
"learning_rate": 2.1621621621621624e-05, |
|
"loss": 2.2973, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.10633946830265849, |
|
"grad_norm": 2.395848274230957, |
|
"learning_rate": 2.4324324324324327e-05, |
|
"loss": 2.2853, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.11451942740286299, |
|
"grad_norm": 3.5902745723724365, |
|
"learning_rate": 2.702702702702703e-05, |
|
"loss": 2.3628, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.12269938650306748, |
|
"grad_norm": 3.785466194152832, |
|
"learning_rate": 2.9729729729729733e-05, |
|
"loss": 1.6333, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.130879345603272, |
|
"grad_norm": 2.845073699951172, |
|
"learning_rate": 3.2432432432432436e-05, |
|
"loss": 1.6592, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.1390593047034765, |
|
"grad_norm": 2.9714362621307373, |
|
"learning_rate": 3.513513513513514e-05, |
|
"loss": 2.4303, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.147239263803681, |
|
"grad_norm": 3.2374515533447266, |
|
"learning_rate": 3.783783783783784e-05, |
|
"loss": 1.6276, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.1554192229038855, |
|
"grad_norm": 2.4501020908355713, |
|
"learning_rate": 4.0540540540540545e-05, |
|
"loss": 2.1786, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.16359918200409, |
|
"grad_norm": 1.689795970916748, |
|
"learning_rate": 4.324324324324325e-05, |
|
"loss": 1.7335, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.17177914110429449, |
|
"grad_norm": 1.5767645835876465, |
|
"learning_rate": 4.594594594594595e-05, |
|
"loss": 1.9214, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.17995910020449898, |
|
"grad_norm": 2.6853578090667725, |
|
"learning_rate": 4.8648648648648654e-05, |
|
"loss": 1.9089, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.18813905930470348, |
|
"grad_norm": 3.5681397914886475, |
|
"learning_rate": 5.135135135135135e-05, |
|
"loss": 2.092, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.19631901840490798, |
|
"grad_norm": 3.208242416381836, |
|
"learning_rate": 5.405405405405406e-05, |
|
"loss": 1.9184, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.20449897750511248, |
|
"grad_norm": 3.677274227142334, |
|
"learning_rate": 5.6756756756756757e-05, |
|
"loss": 1.6762, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.21267893660531698, |
|
"grad_norm": 1.9500216245651245, |
|
"learning_rate": 5.9459459459459466e-05, |
|
"loss": 1.8447, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.22085889570552147, |
|
"grad_norm": 2.72743821144104, |
|
"learning_rate": 6.216216216216216e-05, |
|
"loss": 1.659, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.22903885480572597, |
|
"grad_norm": 1.4266787767410278, |
|
"learning_rate": 6.486486486486487e-05, |
|
"loss": 1.5518, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.23721881390593047, |
|
"grad_norm": 1.8338373899459839, |
|
"learning_rate": 6.756756756756757e-05, |
|
"loss": 1.7725, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.24539877300613497, |
|
"grad_norm": 2.702836751937866, |
|
"learning_rate": 7.027027027027028e-05, |
|
"loss": 1.3214, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.25357873210633947, |
|
"grad_norm": 3.3664143085479736, |
|
"learning_rate": 7.297297297297297e-05, |
|
"loss": 1.5599, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.261758691206544, |
|
"grad_norm": 1.7983371019363403, |
|
"learning_rate": 7.567567567567568e-05, |
|
"loss": 1.4007, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.26993865030674846, |
|
"grad_norm": 1.4321403503417969, |
|
"learning_rate": 7.837837837837838e-05, |
|
"loss": 1.4209, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.278118609406953, |
|
"grad_norm": 1.7886905670166016, |
|
"learning_rate": 8.108108108108109e-05, |
|
"loss": 1.1963, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.28629856850715746, |
|
"grad_norm": 2.0502827167510986, |
|
"learning_rate": 8.378378378378379e-05, |
|
"loss": 1.6071, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.294478527607362, |
|
"grad_norm": 2.351100206375122, |
|
"learning_rate": 8.64864864864865e-05, |
|
"loss": 1.1902, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.30265848670756645, |
|
"grad_norm": 3.3446481227874756, |
|
"learning_rate": 8.918918918918919e-05, |
|
"loss": 1.1838, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.310838445807771, |
|
"grad_norm": 1.3906322717666626, |
|
"learning_rate": 9.18918918918919e-05, |
|
"loss": 0.9948, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.31901840490797545, |
|
"grad_norm": 1.9603602886199951, |
|
"learning_rate": 9.45945945945946e-05, |
|
"loss": 1.0997, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.32719836400818, |
|
"grad_norm": 1.3630380630493164, |
|
"learning_rate": 9.729729729729731e-05, |
|
"loss": 0.6781, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.33537832310838445, |
|
"grad_norm": 2.0062906742095947, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4761, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.34355828220858897, |
|
"grad_norm": 1.2718311548233032, |
|
"learning_rate": 9.99977204734326e-05, |
|
"loss": 1.0233, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.35173824130879344, |
|
"grad_norm": 1.5002365112304688, |
|
"learning_rate": 9.999088210158001e-05, |
|
"loss": 0.8199, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.35991820040899797, |
|
"grad_norm": 1.430582880973816, |
|
"learning_rate": 9.997948550797227e-05, |
|
"loss": 1.1087, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.36809815950920244, |
|
"grad_norm": 2.0703279972076416, |
|
"learning_rate": 9.996353173176289e-05, |
|
"loss": 0.8149, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.37627811860940696, |
|
"grad_norm": 1.5086464881896973, |
|
"learning_rate": 9.994302222763414e-05, |
|
"loss": 0.9549, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.38445807770961143, |
|
"grad_norm": 1.5020016431808472, |
|
"learning_rate": 9.991795886566441e-05, |
|
"loss": 0.9542, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.39263803680981596, |
|
"grad_norm": 1.6791132688522339, |
|
"learning_rate": 9.988834393115767e-05, |
|
"loss": 0.8864, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.40081799591002043, |
|
"grad_norm": 2.7517454624176025, |
|
"learning_rate": 9.98541801244351e-05, |
|
"loss": 1.0826, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.40899795501022496, |
|
"grad_norm": 1.6971582174301147, |
|
"learning_rate": 9.981547056058893e-05, |
|
"loss": 0.9114, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.4171779141104294, |
|
"grad_norm": 1.4928799867630005, |
|
"learning_rate": 9.977221876919833e-05, |
|
"loss": 0.9755, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.42535787321063395, |
|
"grad_norm": 1.1075421571731567, |
|
"learning_rate": 9.972442869400759e-05, |
|
"loss": 0.741, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.4335378323108384, |
|
"grad_norm": 1.3488271236419678, |
|
"learning_rate": 9.967210469256656e-05, |
|
"loss": 1.0441, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.44171779141104295, |
|
"grad_norm": 3.3010079860687256, |
|
"learning_rate": 9.961525153583327e-05, |
|
"loss": 1.2885, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.4498977505112474, |
|
"grad_norm": 1.24274742603302, |
|
"learning_rate": 9.9553874407739e-05, |
|
"loss": 0.8107, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.45807770961145194, |
|
"grad_norm": 1.1659082174301147, |
|
"learning_rate": 9.948797890471551e-05, |
|
"loss": 0.6129, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.4662576687116564, |
|
"grad_norm": 1.5525306463241577, |
|
"learning_rate": 9.941757103518478e-05, |
|
"loss": 0.9262, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.47443762781186094, |
|
"grad_norm": 1.3923064470291138, |
|
"learning_rate": 9.93426572190112e-05, |
|
"loss": 0.7552, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.48261758691206547, |
|
"grad_norm": 1.166669487953186, |
|
"learning_rate": 9.926324428691611e-05, |
|
"loss": 0.6346, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.49079754601226994, |
|
"grad_norm": 2.103994131088257, |
|
"learning_rate": 9.917933947985507e-05, |
|
"loss": 0.8199, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.49897750511247446, |
|
"grad_norm": 1.633812665939331, |
|
"learning_rate": 9.909095044835754e-05, |
|
"loss": 0.7485, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.5071574642126789, |
|
"grad_norm": 1.4533647298812866, |
|
"learning_rate": 9.899808525182935e-05, |
|
"loss": 1.1649, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.5153374233128835, |
|
"grad_norm": 1.6267237663269043, |
|
"learning_rate": 9.890075235781779e-05, |
|
"loss": 1.1159, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.523517382413088, |
|
"grad_norm": 1.2796165943145752, |
|
"learning_rate": 9.879896064123961e-05, |
|
"loss": 0.9613, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.5316973415132924, |
|
"grad_norm": 1.3240866661071777, |
|
"learning_rate": 9.869271938357167e-05, |
|
"loss": 1.047, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.5398773006134969, |
|
"grad_norm": 1.190612554550171, |
|
"learning_rate": 9.858203827200476e-05, |
|
"loss": 1.1846, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.5480572597137015, |
|
"grad_norm": 1.1655223369598389, |
|
"learning_rate": 9.846692739856024e-05, |
|
"loss": 0.9566, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.556237218813906, |
|
"grad_norm": 1.2617253065109253, |
|
"learning_rate": 9.834739725916988e-05, |
|
"loss": 1.1108, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.5644171779141104, |
|
"grad_norm": 1.3576512336730957, |
|
"learning_rate": 9.822345875271883e-05, |
|
"loss": 1.1265, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.5725971370143149, |
|
"grad_norm": 1.4342156648635864, |
|
"learning_rate": 9.809512318005181e-05, |
|
"loss": 0.7757, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5807770961145194, |
|
"grad_norm": 1.0733706951141357, |
|
"learning_rate": 9.796240224294271e-05, |
|
"loss": 0.9006, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.588957055214724, |
|
"grad_norm": 1.323440432548523, |
|
"learning_rate": 9.782530804302763e-05, |
|
"loss": 0.9322, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.5971370143149284, |
|
"grad_norm": 1.2899342775344849, |
|
"learning_rate": 9.768385308070138e-05, |
|
"loss": 0.8403, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.6053169734151329, |
|
"grad_norm": 1.2755167484283447, |
|
"learning_rate": 9.753805025397779e-05, |
|
"loss": 0.8397, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.6134969325153374, |
|
"grad_norm": 1.265972375869751, |
|
"learning_rate": 9.738791285731352e-05, |
|
"loss": 0.8143, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.621676891615542, |
|
"grad_norm": 1.1493557691574097, |
|
"learning_rate": 9.723345458039594e-05, |
|
"loss": 1.059, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.6298568507157464, |
|
"grad_norm": 1.1361910104751587, |
|
"learning_rate": 9.707468950689491e-05, |
|
"loss": 0.9112, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.6380368098159509, |
|
"grad_norm": 1.4090393781661987, |
|
"learning_rate": 9.691163211317853e-05, |
|
"loss": 0.7847, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.6462167689161554, |
|
"grad_norm": 1.300688624382019, |
|
"learning_rate": 9.674429726699323e-05, |
|
"loss": 0.9806, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.65439672801636, |
|
"grad_norm": 1.0143762826919556, |
|
"learning_rate": 9.657270022610813e-05, |
|
"loss": 0.6648, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6625766871165644, |
|
"grad_norm": 1.2058460712432861, |
|
"learning_rate": 9.63968566369238e-05, |
|
"loss": 0.9702, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.6707566462167689, |
|
"grad_norm": 1.0876555442810059, |
|
"learning_rate": 9.62167825330455e-05, |
|
"loss": 0.7861, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.6789366053169734, |
|
"grad_norm": 1.502429723739624, |
|
"learning_rate": 9.603249433382144e-05, |
|
"loss": 1.1252, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.6871165644171779, |
|
"grad_norm": 4.1136860847473145, |
|
"learning_rate": 9.584400884284545e-05, |
|
"loss": 0.7109, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.6952965235173824, |
|
"grad_norm": 0.9980061650276184, |
|
"learning_rate": 9.56513432464249e-05, |
|
"loss": 0.6421, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.7034764826175869, |
|
"grad_norm": 1.1087136268615723, |
|
"learning_rate": 9.545451511201364e-05, |
|
"loss": 0.6337, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.7116564417177914, |
|
"grad_norm": 1.4353466033935547, |
|
"learning_rate": 9.525354238661009e-05, |
|
"loss": 1.0757, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.7198364008179959, |
|
"grad_norm": 1.1413462162017822, |
|
"learning_rate": 9.504844339512095e-05, |
|
"loss": 0.7056, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.7280163599182005, |
|
"grad_norm": 1.5157971382141113, |
|
"learning_rate": 9.483923683869024e-05, |
|
"loss": 0.8767, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.7361963190184049, |
|
"grad_norm": 0.999251663684845, |
|
"learning_rate": 9.462594179299406e-05, |
|
"loss": 0.922, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.7443762781186094, |
|
"grad_norm": 1.2393922805786133, |
|
"learning_rate": 9.440857770650138e-05, |
|
"loss": 0.8301, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.7525562372188139, |
|
"grad_norm": 1.1513807773590088, |
|
"learning_rate": 9.418716439870057e-05, |
|
"loss": 0.5308, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.7607361963190185, |
|
"grad_norm": 1.4229981899261475, |
|
"learning_rate": 9.396172205829234e-05, |
|
"loss": 1.1116, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.7689161554192229, |
|
"grad_norm": 1.4916852712631226, |
|
"learning_rate": 9.373227124134888e-05, |
|
"loss": 0.8806, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.7770961145194274, |
|
"grad_norm": 1.693434715270996, |
|
"learning_rate": 9.34988328694395e-05, |
|
"loss": 0.7924, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.7852760736196319, |
|
"grad_norm": 1.1455564498901367, |
|
"learning_rate": 9.326142822772302e-05, |
|
"loss": 0.7091, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.7934560327198364, |
|
"grad_norm": 1.031115174293518, |
|
"learning_rate": 9.302007896300698e-05, |
|
"loss": 0.8828, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.8016359918200409, |
|
"grad_norm": 1.5678527355194092, |
|
"learning_rate": 9.27748070817738e-05, |
|
"loss": 1.0531, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.8098159509202454, |
|
"grad_norm": 1.2558964490890503, |
|
"learning_rate": 9.252563494817425e-05, |
|
"loss": 0.9222, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.8179959100204499, |
|
"grad_norm": 1.2573111057281494, |
|
"learning_rate": 9.227258528198831e-05, |
|
"loss": 0.8988, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8261758691206544, |
|
"grad_norm": 1.3229575157165527, |
|
"learning_rate": 9.201568115655342e-05, |
|
"loss": 1.0139, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.8343558282208589, |
|
"grad_norm": 0.913388729095459, |
|
"learning_rate": 9.175494599666077e-05, |
|
"loss": 0.6802, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.8425357873210634, |
|
"grad_norm": 3.0073134899139404, |
|
"learning_rate": 9.149040357641929e-05, |
|
"loss": 0.8834, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.8507157464212679, |
|
"grad_norm": 0.9436314105987549, |
|
"learning_rate": 9.122207801708802e-05, |
|
"loss": 0.6559, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.8588957055214724, |
|
"grad_norm": 1.1913410425186157, |
|
"learning_rate": 9.094999378487659e-05, |
|
"loss": 0.7427, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.8670756646216768, |
|
"grad_norm": 1.03123140335083, |
|
"learning_rate": 9.067417568871445e-05, |
|
"loss": 0.6253, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.8752556237218814, |
|
"grad_norm": 0.9424878358840942, |
|
"learning_rate": 9.03946488779887e-05, |
|
"loss": 0.7601, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.8834355828220859, |
|
"grad_norm": 1.2202138900756836, |
|
"learning_rate": 9.011143884025101e-05, |
|
"loss": 1.024, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.8916155419222904, |
|
"grad_norm": 1.3849170207977295, |
|
"learning_rate": 8.982457139889357e-05, |
|
"loss": 0.8283, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.8997955010224948, |
|
"grad_norm": 1.2288554906845093, |
|
"learning_rate": 8.953407271079455e-05, |
|
"loss": 0.8297, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.9079754601226994, |
|
"grad_norm": 0.9206739664077759, |
|
"learning_rate": 8.923996926393305e-05, |
|
"loss": 0.5966, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.9161554192229039, |
|
"grad_norm": 0.9282044172286987, |
|
"learning_rate": 8.894228787497389e-05, |
|
"loss": 0.7775, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.9243353783231084, |
|
"grad_norm": 1.2421815395355225, |
|
"learning_rate": 8.864105568682244e-05, |
|
"loss": 0.7838, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.9325153374233128, |
|
"grad_norm": 1.1172124147415161, |
|
"learning_rate": 8.833630016614976e-05, |
|
"loss": 0.5921, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.9406952965235174, |
|
"grad_norm": 1.310473918914795, |
|
"learning_rate": 8.802804910088809e-05, |
|
"loss": 1.0578, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.9488752556237219, |
|
"grad_norm": 1.1656848192214966, |
|
"learning_rate": 8.771633059769711e-05, |
|
"loss": 0.8205, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.9570552147239264, |
|
"grad_norm": 1.0159672498703003, |
|
"learning_rate": 8.740117307940123e-05, |
|
"loss": 0.8237, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.9652351738241309, |
|
"grad_norm": 1.2568827867507935, |
|
"learning_rate": 8.708260528239788e-05, |
|
"loss": 1.0473, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.9734151329243353, |
|
"grad_norm": 1.2711869478225708, |
|
"learning_rate": 8.676065625403733e-05, |
|
"loss": 1.0789, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.9815950920245399, |
|
"grad_norm": 1.2562803030014038, |
|
"learning_rate": 8.64353553499741e-05, |
|
"loss": 0.5315, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.9897750511247444, |
|
"grad_norm": 1.4107680320739746, |
|
"learning_rate": 8.610673223149034e-05, |
|
"loss": 0.9738, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.9979550102249489, |
|
"grad_norm": 1.0771377086639404, |
|
"learning_rate": 8.577481686279123e-05, |
|
"loss": 0.6114, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.6168379187583923, |
|
"learning_rate": 8.543963950827279e-05, |
|
"loss": 0.1857, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.0081799591002045, |
|
"grad_norm": 1.0099263191223145, |
|
"learning_rate": 8.510123072976239e-05, |
|
"loss": 0.77, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.016359918200409, |
|
"grad_norm": 1.0608845949172974, |
|
"learning_rate": 8.475962138373213e-05, |
|
"loss": 0.588, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.0245398773006136, |
|
"grad_norm": 1.2696729898452759, |
|
"learning_rate": 8.441484261848514e-05, |
|
"loss": 0.8879, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.032719836400818, |
|
"grad_norm": 1.556289792060852, |
|
"learning_rate": 8.406692587131568e-05, |
|
"loss": 1.1292, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.0408997955010224, |
|
"grad_norm": 1.027779459953308, |
|
"learning_rate": 8.371590286564247e-05, |
|
"loss": 0.6589, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.049079754601227, |
|
"grad_norm": 1.1132937669754028, |
|
"learning_rate": 8.336180560811619e-05, |
|
"loss": 0.6267, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.0572597137014315, |
|
"grad_norm": 1.7684708833694458, |
|
"learning_rate": 8.30046663857011e-05, |
|
"loss": 0.7399, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.065439672801636, |
|
"grad_norm": 1.372917890548706, |
|
"learning_rate": 8.264451776273104e-05, |
|
"loss": 0.8849, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.0736196319018405, |
|
"grad_norm": 1.1678389310836792, |
|
"learning_rate": 8.228139257794012e-05, |
|
"loss": 0.901, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.081799591002045, |
|
"grad_norm": 1.0002321004867554, |
|
"learning_rate": 8.191532394146865e-05, |
|
"loss": 0.3923, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.0899795501022496, |
|
"grad_norm": 1.0493892431259155, |
|
"learning_rate": 8.154634523184388e-05, |
|
"loss": 0.77, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.098159509202454, |
|
"grad_norm": 1.4020309448242188, |
|
"learning_rate": 8.117449009293668e-05, |
|
"loss": 0.7849, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.1063394683026584, |
|
"grad_norm": 1.3133962154388428, |
|
"learning_rate": 8.07997924308938e-05, |
|
"loss": 0.7912, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.114519427402863, |
|
"grad_norm": 1.3940467834472656, |
|
"learning_rate": 8.042228641104622e-05, |
|
"loss": 0.7142, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.1226993865030674, |
|
"grad_norm": 1.3877304792404175, |
|
"learning_rate": 8.004200645479403e-05, |
|
"loss": 0.5454, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.130879345603272, |
|
"grad_norm": 1.1132571697235107, |
|
"learning_rate": 7.965898723646776e-05, |
|
"loss": 0.7387, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.1390593047034765, |
|
"grad_norm": 1.206382155418396, |
|
"learning_rate": 7.927326368016677e-05, |
|
"loss": 0.7271, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.147239263803681, |
|
"grad_norm": 1.2813421487808228, |
|
"learning_rate": 7.888487095657484e-05, |
|
"loss": 0.8452, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.1554192229038855, |
|
"grad_norm": 1.1103274822235107, |
|
"learning_rate": 7.849384447975321e-05, |
|
"loss": 0.5735, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.16359918200409, |
|
"grad_norm": 1.1904572248458862, |
|
"learning_rate": 7.810021990391164e-05, |
|
"loss": 0.486, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.1717791411042944, |
|
"grad_norm": 1.361222743988037, |
|
"learning_rate": 7.770403312015721e-05, |
|
"loss": 0.9265, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.179959100204499, |
|
"grad_norm": 1.1453652381896973, |
|
"learning_rate": 7.73053202532219e-05, |
|
"loss": 0.6186, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.1881390593047034, |
|
"grad_norm": 1.2070741653442383, |
|
"learning_rate": 7.690411765816864e-05, |
|
"loss": 0.7012, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.196319018404908, |
|
"grad_norm": 1.4246371984481812, |
|
"learning_rate": 7.650046191707641e-05, |
|
"loss": 0.7644, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.2044989775051125, |
|
"grad_norm": 1.2275187969207764, |
|
"learning_rate": 7.60943898357046e-05, |
|
"loss": 0.614, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.212678936605317, |
|
"grad_norm": 1.292330265045166, |
|
"learning_rate": 7.568593844013718e-05, |
|
"loss": 0.6722, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.2208588957055215, |
|
"grad_norm": 1.54197359085083, |
|
"learning_rate": 7.527514497340642e-05, |
|
"loss": 0.6981, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.229038854805726, |
|
"grad_norm": 1.605914831161499, |
|
"learning_rate": 7.48620468920972e-05, |
|
"loss": 0.7524, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.2372188139059306, |
|
"grad_norm": 1.2442176342010498, |
|
"learning_rate": 7.444668186293153e-05, |
|
"loss": 0.6238, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.2453987730061349, |
|
"grad_norm": 1.4838721752166748, |
|
"learning_rate": 7.402908775933419e-05, |
|
"loss": 0.7599, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.2535787321063394, |
|
"grad_norm": 1.8454984426498413, |
|
"learning_rate": 7.360930265797935e-05, |
|
"loss": 1.1331, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.261758691206544, |
|
"grad_norm": 1.3571646213531494, |
|
"learning_rate": 7.31873648353186e-05, |
|
"loss": 0.6468, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.2699386503067485, |
|
"grad_norm": 1.3795866966247559, |
|
"learning_rate": 7.276331276409106e-05, |
|
"loss": 0.7253, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.278118609406953, |
|
"grad_norm": 1.4821308851242065, |
|
"learning_rate": 7.23371851098152e-05, |
|
"loss": 0.842, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.2862985685071575, |
|
"grad_norm": 1.0921138525009155, |
|
"learning_rate": 7.190902072726335e-05, |
|
"loss": 0.5379, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.294478527607362, |
|
"grad_norm": 1.5662935972213745, |
|
"learning_rate": 7.147885865691899e-05, |
|
"loss": 0.918, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.3026584867075663, |
|
"grad_norm": 1.3333555459976196, |
|
"learning_rate": 7.104673812141675e-05, |
|
"loss": 0.6727, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.310838445807771, |
|
"grad_norm": 1.1487497091293335, |
|
"learning_rate": 7.061269852196632e-05, |
|
"loss": 0.4279, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.3190184049079754, |
|
"grad_norm": 1.1033565998077393, |
|
"learning_rate": 7.017677943475961e-05, |
|
"loss": 0.6372, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.32719836400818, |
|
"grad_norm": 1.2100588083267212, |
|
"learning_rate": 6.973902060736226e-05, |
|
"loss": 0.7071, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.3353783231083844, |
|
"grad_norm": 1.421066403388977, |
|
"learning_rate": 6.929946195508932e-05, |
|
"loss": 0.767, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.343558282208589, |
|
"grad_norm": 1.2306902408599854, |
|
"learning_rate": 6.885814355736586e-05, |
|
"loss": 0.5587, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.3517382413087935, |
|
"grad_norm": 1.5315287113189697, |
|
"learning_rate": 6.841510565407235e-05, |
|
"loss": 0.7519, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.359918200408998, |
|
"grad_norm": 1.2497670650482178, |
|
"learning_rate": 6.797038864187564e-05, |
|
"loss": 0.5612, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.3680981595092025, |
|
"grad_norm": 1.6106078624725342, |
|
"learning_rate": 6.752403307054549e-05, |
|
"loss": 0.7194, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.3762781186094069, |
|
"grad_norm": 1.2407530546188354, |
|
"learning_rate": 6.707607963925724e-05, |
|
"loss": 0.531, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.3844580777096114, |
|
"grad_norm": 1.663898229598999, |
|
"learning_rate": 6.66265691928808e-05, |
|
"loss": 0.7906, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.392638036809816, |
|
"grad_norm": 1.3650121688842773, |
|
"learning_rate": 6.617554271825636e-05, |
|
"loss": 0.7207, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.4008179959100204, |
|
"grad_norm": 1.1001300811767578, |
|
"learning_rate": 6.572304134045717e-05, |
|
"loss": 0.5145, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.408997955010225, |
|
"grad_norm": 1.0687707662582397, |
|
"learning_rate": 6.526910631903973e-05, |
|
"loss": 0.3521, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.4171779141104295, |
|
"grad_norm": 1.2442213296890259, |
|
"learning_rate": 6.481377904428171e-05, |
|
"loss": 0.7026, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.425357873210634, |
|
"grad_norm": 1.31452214717865, |
|
"learning_rate": 6.435710103340786e-05, |
|
"loss": 0.7313, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.4335378323108383, |
|
"grad_norm": 1.5573769807815552, |
|
"learning_rate": 6.389911392680456e-05, |
|
"loss": 0.7659, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.441717791411043, |
|
"grad_norm": 1.2089431285858154, |
|
"learning_rate": 6.343985948422287e-05, |
|
"loss": 0.6916, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.4498977505112474, |
|
"grad_norm": 1.5785194635391235, |
|
"learning_rate": 6.297937958097094e-05, |
|
"loss": 0.8101, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.4580777096114519, |
|
"grad_norm": 1.4134269952774048, |
|
"learning_rate": 6.251771620409563e-05, |
|
"loss": 0.7504, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.4662576687116564, |
|
"grad_norm": 1.4751485586166382, |
|
"learning_rate": 6.205491144855432e-05, |
|
"loss": 0.5948, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.474437627811861, |
|
"grad_norm": 1.31548273563385, |
|
"learning_rate": 6.159100751337642e-05, |
|
"loss": 0.7057, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.4826175869120655, |
|
"grad_norm": 1.8151648044586182, |
|
"learning_rate": 6.112604669781572e-05, |
|
"loss": 0.9037, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.49079754601227, |
|
"grad_norm": 1.3681972026824951, |
|
"learning_rate": 6.0660071397493514e-05, |
|
"loss": 0.7223, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.4989775051124745, |
|
"grad_norm": 1.6292760372161865, |
|
"learning_rate": 6.019312410053286e-05, |
|
"loss": 0.6083, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.5071574642126788, |
|
"grad_norm": 1.8144514560699463, |
|
"learning_rate": 5.972524738368452e-05, |
|
"loss": 0.7662, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.5153374233128836, |
|
"grad_norm": 1.650654911994934, |
|
"learning_rate": 5.925648390844476e-05, |
|
"loss": 0.902, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.5235173824130879, |
|
"grad_norm": 1.4780257940292358, |
|
"learning_rate": 5.878687641716538e-05, |
|
"loss": 0.6566, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.5316973415132924, |
|
"grad_norm": 1.1706862449645996, |
|
"learning_rate": 5.831646772915651e-05, |
|
"loss": 0.4189, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.539877300613497, |
|
"grad_norm": 1.287718653678894, |
|
"learning_rate": 5.7845300736782204e-05, |
|
"loss": 0.5549, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.5480572597137015, |
|
"grad_norm": 1.3776918649673462, |
|
"learning_rate": 5.737341840154956e-05, |
|
"loss": 0.5456, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.556237218813906, |
|
"grad_norm": 1.2569301128387451, |
|
"learning_rate": 5.6900863750191347e-05, |
|
"loss": 0.6808, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.5644171779141103, |
|
"grad_norm": 1.7013508081436157, |
|
"learning_rate": 5.642767987074288e-05, |
|
"loss": 0.7974, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.572597137014315, |
|
"grad_norm": 1.5190353393554688, |
|
"learning_rate": 5.5953909908613114e-05, |
|
"loss": 0.5416, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.5807770961145193, |
|
"grad_norm": 1.4736334085464478, |
|
"learning_rate": 5.547959706265068e-05, |
|
"loss": 0.6788, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.588957055214724, |
|
"grad_norm": 2.006303548812866, |
|
"learning_rate": 5.5004784581204927e-05, |
|
"loss": 0.9634, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.5971370143149284, |
|
"grad_norm": 1.3578423261642456, |
|
"learning_rate": 5.4529515758182506e-05, |
|
"loss": 0.6563, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.605316973415133, |
|
"grad_norm": 1.4116990566253662, |
|
"learning_rate": 5.405383392909973e-05, |
|
"loss": 0.6062, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.6134969325153374, |
|
"grad_norm": 1.2362536191940308, |
|
"learning_rate": 5.357778246713131e-05, |
|
"loss": 0.4829, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.621676891615542, |
|
"grad_norm": 1.1780372858047485, |
|
"learning_rate": 5.310140477915544e-05, |
|
"loss": 0.465, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.6298568507157465, |
|
"grad_norm": 1.3919291496276855, |
|
"learning_rate": 5.262474430179597e-05, |
|
"loss": 0.6967, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.6380368098159508, |
|
"grad_norm": 1.7452629804611206, |
|
"learning_rate": 5.214784449746174e-05, |
|
"loss": 0.9096, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.6462167689161555, |
|
"grad_norm": 1.4730846881866455, |
|
"learning_rate": 5.167074885038373e-05, |
|
"loss": 0.7473, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.6543967280163598, |
|
"grad_norm": 1.5404870510101318, |
|
"learning_rate": 5.119350086265004e-05, |
|
"loss": 0.6233, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.6625766871165644, |
|
"grad_norm": 1.4780898094177246, |
|
"learning_rate": 5.0716144050239375e-05, |
|
"loss": 0.7599, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.670756646216769, |
|
"grad_norm": 1.194542407989502, |
|
"learning_rate": 5.023872193905316e-05, |
|
"loss": 0.5638, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.6789366053169734, |
|
"grad_norm": 1.3504347801208496, |
|
"learning_rate": 4.976127806094684e-05, |
|
"loss": 0.4701, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.687116564417178, |
|
"grad_norm": 2.1446099281311035, |
|
"learning_rate": 4.928385594976063e-05, |
|
"loss": 0.9383, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.6952965235173822, |
|
"grad_norm": 1.4745142459869385, |
|
"learning_rate": 4.880649913734996e-05, |
|
"loss": 0.5817, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.703476482617587, |
|
"grad_norm": 1.3029444217681885, |
|
"learning_rate": 4.832925114961629e-05, |
|
"loss": 0.3481, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.7116564417177913, |
|
"grad_norm": 1.1414580345153809, |
|
"learning_rate": 4.785215550253826e-05, |
|
"loss": 0.4348, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.719836400817996, |
|
"grad_norm": 1.4996669292449951, |
|
"learning_rate": 4.7375255698204045e-05, |
|
"loss": 0.653, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.7280163599182004, |
|
"grad_norm": 1.66719388961792, |
|
"learning_rate": 4.6898595220844574e-05, |
|
"loss": 0.7181, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.7361963190184049, |
|
"grad_norm": 1.476560354232788, |
|
"learning_rate": 4.64222175328687e-05, |
|
"loss": 0.6183, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.7443762781186094, |
|
"grad_norm": 1.7405219078063965, |
|
"learning_rate": 4.594616607090028e-05, |
|
"loss": 0.689, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.752556237218814, |
|
"grad_norm": 1.1866732835769653, |
|
"learning_rate": 4.547048424181751e-05, |
|
"loss": 0.4616, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.7607361963190185, |
|
"grad_norm": 1.7068077325820923, |
|
"learning_rate": 4.4995215418795085e-05, |
|
"loss": 0.6318, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.7689161554192228, |
|
"grad_norm": 1.4736443758010864, |
|
"learning_rate": 4.452040293734934e-05, |
|
"loss": 0.4611, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.7770961145194275, |
|
"grad_norm": 0.8084559440612793, |
|
"learning_rate": 4.404609009138689e-05, |
|
"loss": 0.1962, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.7852760736196318, |
|
"grad_norm": 1.1126220226287842, |
|
"learning_rate": 4.357232012925714e-05, |
|
"loss": 0.3804, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.7934560327198366, |
|
"grad_norm": 1.4977810382843018, |
|
"learning_rate": 4.3099136249808665e-05, |
|
"loss": 0.5431, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.8016359918200409, |
|
"grad_norm": 1.47788405418396, |
|
"learning_rate": 4.262658159845046e-05, |
|
"loss": 0.6498, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.8098159509202454, |
|
"grad_norm": 1.2339309453964233, |
|
"learning_rate": 4.215469926321779e-05, |
|
"loss": 0.4812, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.81799591002045, |
|
"grad_norm": 1.4342414140701294, |
|
"learning_rate": 4.1683532270843504e-05, |
|
"loss": 0.5703, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.8261758691206544, |
|
"grad_norm": 1.795954942703247, |
|
"learning_rate": 4.121312358283463e-05, |
|
"loss": 0.992, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.834355828220859, |
|
"grad_norm": 1.3253310918807983, |
|
"learning_rate": 4.074351609155527e-05, |
|
"loss": 0.5907, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.8425357873210633, |
|
"grad_norm": 1.4935518503189087, |
|
"learning_rate": 4.027475261631548e-05, |
|
"loss": 0.7448, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.850715746421268, |
|
"grad_norm": 1.8565064668655396, |
|
"learning_rate": 3.980687589946715e-05, |
|
"loss": 0.8506, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.8588957055214723, |
|
"grad_norm": 2.0860605239868164, |
|
"learning_rate": 3.9339928602506505e-05, |
|
"loss": 0.4935, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.8670756646216768, |
|
"grad_norm": 1.6324408054351807, |
|
"learning_rate": 3.887395330218429e-05, |
|
"loss": 0.586, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.8752556237218814, |
|
"grad_norm": 1.8466233015060425, |
|
"learning_rate": 3.840899248662358e-05, |
|
"loss": 0.7596, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.883435582822086, |
|
"grad_norm": 1.867876648902893, |
|
"learning_rate": 3.7945088551445693e-05, |
|
"loss": 0.7946, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.8916155419222904, |
|
"grad_norm": 1.3713316917419434, |
|
"learning_rate": 3.748228379590438e-05, |
|
"loss": 0.5414, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.8997955010224947, |
|
"grad_norm": 1.6689939498901367, |
|
"learning_rate": 3.7020620419029094e-05, |
|
"loss": 0.6574, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.9079754601226995, |
|
"grad_norm": 1.4076114892959595, |
|
"learning_rate": 3.656014051577713e-05, |
|
"loss": 0.5052, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.9161554192229038, |
|
"grad_norm": 1.6957688331604004, |
|
"learning_rate": 3.610088607319544e-05, |
|
"loss": 0.4209, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.9243353783231085, |
|
"grad_norm": 1.465134620666504, |
|
"learning_rate": 3.564289896659214e-05, |
|
"loss": 0.562, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.9325153374233128, |
|
"grad_norm": 1.626769781112671, |
|
"learning_rate": 3.5186220955718306e-05, |
|
"loss": 0.6494, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.9406952965235174, |
|
"grad_norm": 1.2987111806869507, |
|
"learning_rate": 3.473089368096026e-05, |
|
"loss": 0.5365, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.9488752556237219, |
|
"grad_norm": 1.7133764028549194, |
|
"learning_rate": 3.427695865954284e-05, |
|
"loss": 0.7972, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.9570552147239264, |
|
"grad_norm": 1.067958116531372, |
|
"learning_rate": 3.3824457281743646e-05, |
|
"loss": 0.2413, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.965235173824131, |
|
"grad_norm": 1.5035715103149414, |
|
"learning_rate": 3.337343080711921e-05, |
|
"loss": 0.655, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.9734151329243352, |
|
"grad_norm": 1.9790688753128052, |
|
"learning_rate": 3.2923920360742774e-05, |
|
"loss": 0.7517, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.98159509202454, |
|
"grad_norm": 1.79633367061615, |
|
"learning_rate": 3.2475966929454504e-05, |
|
"loss": 0.527, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.9897750511247443, |
|
"grad_norm": 1.59013032913208, |
|
"learning_rate": 3.202961135812437e-05, |
|
"loss": 0.5922, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.997955010224949, |
|
"grad_norm": 1.6466726064682007, |
|
"learning_rate": 3.158489434592766e-05, |
|
"loss": 0.6738, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.8072463274002075, |
|
"learning_rate": 3.114185644263415e-05, |
|
"loss": 0.1228, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.0081799591002043, |
|
"grad_norm": 1.412455439567566, |
|
"learning_rate": 3.070053804491068e-05, |
|
"loss": 0.5372, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 2.016359918200409, |
|
"grad_norm": 1.15187406539917, |
|
"learning_rate": 3.026097939263775e-05, |
|
"loss": 0.3056, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.0245398773006134, |
|
"grad_norm": 1.3967753648757935, |
|
"learning_rate": 2.9823220565240394e-05, |
|
"loss": 0.5469, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.032719836400818, |
|
"grad_norm": 1.5053709745407104, |
|
"learning_rate": 2.938730147803369e-05, |
|
"loss": 0.5333, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.0408997955010224, |
|
"grad_norm": 1.2719367742538452, |
|
"learning_rate": 2.895326187858326e-05, |
|
"loss": 0.4873, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 2.049079754601227, |
|
"grad_norm": 1.3638321161270142, |
|
"learning_rate": 2.852114134308104e-05, |
|
"loss": 0.4676, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.0572597137014315, |
|
"grad_norm": 1.4079426527023315, |
|
"learning_rate": 2.8090979272736662e-05, |
|
"loss": 0.5474, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 2.065439672801636, |
|
"grad_norm": 1.3648539781570435, |
|
"learning_rate": 2.7662814890184818e-05, |
|
"loss": 0.3774, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.0736196319018405, |
|
"grad_norm": 1.411365032196045, |
|
"learning_rate": 2.7236687235908953e-05, |
|
"loss": 0.5021, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.081799591002045, |
|
"grad_norm": 0.8350080251693726, |
|
"learning_rate": 2.6812635164681386e-05, |
|
"loss": 0.295, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.0899795501022496, |
|
"grad_norm": 1.4837121963500977, |
|
"learning_rate": 2.6390697342020665e-05, |
|
"loss": 0.4359, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 2.098159509202454, |
|
"grad_norm": 1.447041392326355, |
|
"learning_rate": 2.5970912240665813e-05, |
|
"loss": 0.4699, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.1063394683026586, |
|
"grad_norm": 1.5087660551071167, |
|
"learning_rate": 2.555331813706847e-05, |
|
"loss": 0.5016, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 2.114519427402863, |
|
"grad_norm": 1.4970585107803345, |
|
"learning_rate": 2.5137953107902813e-05, |
|
"loss": 0.4827, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.1226993865030677, |
|
"grad_norm": 1.5823018550872803, |
|
"learning_rate": 2.472485502659358e-05, |
|
"loss": 0.3951, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.130879345603272, |
|
"grad_norm": 1.208630919456482, |
|
"learning_rate": 2.4314061559862833e-05, |
|
"loss": 0.3384, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.1390593047034763, |
|
"grad_norm": 1.6956486701965332, |
|
"learning_rate": 2.3905610164295394e-05, |
|
"loss": 0.4982, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 2.147239263803681, |
|
"grad_norm": 1.4397342205047607, |
|
"learning_rate": 2.3499538082923606e-05, |
|
"loss": 0.4574, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.1554192229038853, |
|
"grad_norm": 1.3102678060531616, |
|
"learning_rate": 2.3095882341831372e-05, |
|
"loss": 0.3559, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.16359918200409, |
|
"grad_norm": 1.2937331199645996, |
|
"learning_rate": 2.2694679746778115e-05, |
|
"loss": 0.3721, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.1717791411042944, |
|
"grad_norm": 1.5506526231765747, |
|
"learning_rate": 2.22959668798428e-05, |
|
"loss": 0.4909, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.179959100204499, |
|
"grad_norm": 1.8627556562423706, |
|
"learning_rate": 2.1899780096088375e-05, |
|
"loss": 0.7858, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.1881390593047034, |
|
"grad_norm": 1.7848111391067505, |
|
"learning_rate": 2.1506155520246797e-05, |
|
"loss": 0.6337, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 2.196319018404908, |
|
"grad_norm": 1.2659337520599365, |
|
"learning_rate": 2.1115129043425187e-05, |
|
"loss": 0.2693, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.2044989775051125, |
|
"grad_norm": 1.6412032842636108, |
|
"learning_rate": 2.0726736319833228e-05, |
|
"loss": 0.5306, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 2.212678936605317, |
|
"grad_norm": 1.611624002456665, |
|
"learning_rate": 2.0341012763532243e-05, |
|
"loss": 0.3388, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.2208588957055215, |
|
"grad_norm": 1.1925326585769653, |
|
"learning_rate": 1.995799354520598e-05, |
|
"loss": 0.3615, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.229038854805726, |
|
"grad_norm": 1.7512476444244385, |
|
"learning_rate": 1.9577713588953795e-05, |
|
"loss": 0.5129, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.2372188139059306, |
|
"grad_norm": 1.5006930828094482, |
|
"learning_rate": 1.9200207569106216e-05, |
|
"loss": 0.4129, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.245398773006135, |
|
"grad_norm": 1.8680585622787476, |
|
"learning_rate": 1.8825509907063327e-05, |
|
"loss": 0.5374, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.2535787321063396, |
|
"grad_norm": 1.8856024742126465, |
|
"learning_rate": 1.8453654768156138e-05, |
|
"loss": 0.562, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.261758691206544, |
|
"grad_norm": 1.9243358373641968, |
|
"learning_rate": 1.8084676058531373e-05, |
|
"loss": 0.6637, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.2699386503067487, |
|
"grad_norm": 2.3150854110717773, |
|
"learning_rate": 1.771860742205988e-05, |
|
"loss": 0.5932, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.278118609406953, |
|
"grad_norm": 1.2950345277786255, |
|
"learning_rate": 1.7355482237268983e-05, |
|
"loss": 0.341, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.2862985685071573, |
|
"grad_norm": 1.5685244798660278, |
|
"learning_rate": 1.699533361429891e-05, |
|
"loss": 0.4248, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.294478527607362, |
|
"grad_norm": 1.7234948873519897, |
|
"learning_rate": 1.663819439188382e-05, |
|
"loss": 0.7139, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.3026584867075663, |
|
"grad_norm": 1.5493229627609253, |
|
"learning_rate": 1.6284097134357536e-05, |
|
"loss": 0.4609, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 2.310838445807771, |
|
"grad_norm": 1.262978196144104, |
|
"learning_rate": 1.5933074128684332e-05, |
|
"loss": 0.3572, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.3190184049079754, |
|
"grad_norm": 1.7874940633773804, |
|
"learning_rate": 1.5585157381514875e-05, |
|
"loss": 0.5078, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.32719836400818, |
|
"grad_norm": 1.7057137489318848, |
|
"learning_rate": 1.5240378616267886e-05, |
|
"loss": 0.5262, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.3353783231083844, |
|
"grad_norm": 1.5174486637115479, |
|
"learning_rate": 1.489876927023761e-05, |
|
"loss": 0.4075, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.3435582822085887, |
|
"grad_norm": 1.473712682723999, |
|
"learning_rate": 1.4560360491727231e-05, |
|
"loss": 0.4237, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.3517382413087935, |
|
"grad_norm": 2.0275111198425293, |
|
"learning_rate": 1.4225183137208776e-05, |
|
"loss": 0.7344, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 2.359918200408998, |
|
"grad_norm": 1.5504990816116333, |
|
"learning_rate": 1.389326776850966e-05, |
|
"loss": 0.5226, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.3680981595092025, |
|
"grad_norm": 0.9763877987861633, |
|
"learning_rate": 1.3564644650025893e-05, |
|
"loss": 0.2004, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 2.376278118609407, |
|
"grad_norm": 1.6431723833084106, |
|
"learning_rate": 1.3239343745962679e-05, |
|
"loss": 0.5426, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.3844580777096116, |
|
"grad_norm": 1.4661204814910889, |
|
"learning_rate": 1.2917394717602121e-05, |
|
"loss": 0.3689, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 2.392638036809816, |
|
"grad_norm": 1.3995070457458496, |
|
"learning_rate": 1.2598826920598772e-05, |
|
"loss": 0.3994, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 2.40081799591002, |
|
"grad_norm": 1.6375926733016968, |
|
"learning_rate": 1.2283669402302878e-05, |
|
"loss": 0.4635, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.408997955010225, |
|
"grad_norm": 1.6579980850219727, |
|
"learning_rate": 1.197195089911191e-05, |
|
"loss": 0.44, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.4171779141104293, |
|
"grad_norm": 2.057859420776367, |
|
"learning_rate": 1.1663699833850238e-05, |
|
"loss": 0.809, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 2.425357873210634, |
|
"grad_norm": 1.9846243858337402, |
|
"learning_rate": 1.1358944313177567e-05, |
|
"loss": 0.526, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 2.4335378323108383, |
|
"grad_norm": 1.8454967737197876, |
|
"learning_rate": 1.1057712125026116e-05, |
|
"loss": 0.4943, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 2.441717791411043, |
|
"grad_norm": 1.3751471042633057, |
|
"learning_rate": 1.0760030736066951e-05, |
|
"loss": 0.2973, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.4498977505112474, |
|
"grad_norm": 1.7352081537246704, |
|
"learning_rate": 1.0465927289205452e-05, |
|
"loss": 0.4647, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 2.458077709611452, |
|
"grad_norm": 1.6583192348480225, |
|
"learning_rate": 1.017542860110644e-05, |
|
"loss": 0.5614, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 2.4662576687116564, |
|
"grad_norm": 1.1086567640304565, |
|
"learning_rate": 9.888561159748993e-06, |
|
"loss": 0.2343, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 2.474437627811861, |
|
"grad_norm": 1.2182183265686035, |
|
"learning_rate": 9.605351122011309e-06, |
|
"loss": 0.5084, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 2.4826175869120655, |
|
"grad_norm": 1.5897687673568726, |
|
"learning_rate": 9.325824311285564e-06, |
|
"loss": 0.4916, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.4907975460122698, |
|
"grad_norm": 1.7576637268066406, |
|
"learning_rate": 9.050006215123419e-06, |
|
"loss": 0.5896, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 2.4989775051124745, |
|
"grad_norm": 1.3375118970870972, |
|
"learning_rate": 8.777921982911996e-06, |
|
"loss": 0.3472, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 2.507157464212679, |
|
"grad_norm": 1.643762230873108, |
|
"learning_rate": 8.509596423580712e-06, |
|
"loss": 0.6561, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 2.5153374233128836, |
|
"grad_norm": 1.8207759857177734, |
|
"learning_rate": 8.245054003339247e-06, |
|
"loss": 0.446, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.523517382413088, |
|
"grad_norm": 1.7931218147277832, |
|
"learning_rate": 7.984318843446593e-06, |
|
"loss": 0.6626, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.5316973415132926, |
|
"grad_norm": 1.5871256589889526, |
|
"learning_rate": 7.727414718011704e-06, |
|
"loss": 0.6779, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.539877300613497, |
|
"grad_norm": 1.6045511960983276, |
|
"learning_rate": 7.474365051825749e-06, |
|
"loss": 0.4369, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.5480572597137012, |
|
"grad_norm": 1.9614039659500122, |
|
"learning_rate": 7.225192918226214e-06, |
|
"loss": 0.5339, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.556237218813906, |
|
"grad_norm": 1.6761356592178345, |
|
"learning_rate": 6.979921036993042e-06, |
|
"loss": 0.4714, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.5644171779141103, |
|
"grad_norm": 1.268598198890686, |
|
"learning_rate": 6.738571772276997e-06, |
|
"loss": 0.3589, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.572597137014315, |
|
"grad_norm": 1.9515974521636963, |
|
"learning_rate": 6.501167130560515e-06, |
|
"loss": 0.7677, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.5807770961145193, |
|
"grad_norm": 1.752503514289856, |
|
"learning_rate": 6.267728758651132e-06, |
|
"loss": 0.6019, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.588957055214724, |
|
"grad_norm": 1.6404023170471191, |
|
"learning_rate": 6.03827794170767e-06, |
|
"loss": 0.3813, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.5971370143149284, |
|
"grad_norm": 1.6431866884231567, |
|
"learning_rate": 5.8128356012994375e-06, |
|
"loss": 0.5397, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.6053169734151327, |
|
"grad_norm": 1.604200005531311, |
|
"learning_rate": 5.591422293498633e-06, |
|
"loss": 0.5326, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.6134969325153374, |
|
"grad_norm": 1.955712080001831, |
|
"learning_rate": 5.374058207005944e-06, |
|
"loss": 0.6279, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.621676891615542, |
|
"grad_norm": 1.9583613872528076, |
|
"learning_rate": 5.160763161309767e-06, |
|
"loss": 0.7064, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.6298568507157465, |
|
"grad_norm": 1.465756893157959, |
|
"learning_rate": 4.951556604879048e-06, |
|
"loss": 0.3176, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.638036809815951, |
|
"grad_norm": 1.0220084190368652, |
|
"learning_rate": 4.746457613389904e-06, |
|
"loss": 0.1989, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.6462167689161555, |
|
"grad_norm": 1.9900139570236206, |
|
"learning_rate": 4.545484887986368e-06, |
|
"loss": 0.4488, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.65439672801636, |
|
"grad_norm": 1.8389681577682495, |
|
"learning_rate": 4.348656753575092e-06, |
|
"loss": 0.8159, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.662576687116564, |
|
"grad_norm": 1.8046656847000122, |
|
"learning_rate": 4.155991157154554e-06, |
|
"loss": 0.5941, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.670756646216769, |
|
"grad_norm": 1.5946298837661743, |
|
"learning_rate": 3.967505666178556e-06, |
|
"loss": 0.6167, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.6789366053169736, |
|
"grad_norm": 1.6215424537658691, |
|
"learning_rate": 3.783217466954503e-06, |
|
"loss": 0.5432, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.687116564417178, |
|
"grad_norm": 1.5136370658874512, |
|
"learning_rate": 3.603143363076217e-06, |
|
"loss": 0.2688, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.6952965235173822, |
|
"grad_norm": 2.0225648880004883, |
|
"learning_rate": 3.427299773891868e-06, |
|
"loss": 0.3968, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.703476482617587, |
|
"grad_norm": 1.170069694519043, |
|
"learning_rate": 3.2557027330067658e-06, |
|
"loss": 0.3143, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.7116564417177913, |
|
"grad_norm": 1.2336766719818115, |
|
"learning_rate": 3.0883678868214806e-06, |
|
"loss": 0.4023, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 2.719836400817996, |
|
"grad_norm": 1.8785996437072754, |
|
"learning_rate": 2.925310493105099e-06, |
|
"loss": 0.6501, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.7280163599182004, |
|
"grad_norm": 1.7136589288711548, |
|
"learning_rate": 2.7665454196040664e-06, |
|
"loss": 0.3418, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.736196319018405, |
|
"grad_norm": 1.5453672409057617, |
|
"learning_rate": 2.612087142686487e-06, |
|
"loss": 0.4047, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.7443762781186094, |
|
"grad_norm": 1.5091831684112549, |
|
"learning_rate": 2.4619497460222184e-06, |
|
"loss": 0.3707, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 2.7525562372188137, |
|
"grad_norm": 1.996533751487732, |
|
"learning_rate": 2.316146919298623e-06, |
|
"loss": 0.7776, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.7607361963190185, |
|
"grad_norm": 2.2293291091918945, |
|
"learning_rate": 2.1746919569723855e-06, |
|
"loss": 0.7055, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 2.7689161554192228, |
|
"grad_norm": 1.906553864479065, |
|
"learning_rate": 2.0375977570572967e-06, |
|
"loss": 0.6423, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.7770961145194275, |
|
"grad_norm": 1.684910535812378, |
|
"learning_rate": 1.9048768199481982e-06, |
|
"loss": 0.5679, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 2.785276073619632, |
|
"grad_norm": 1.3118062019348145, |
|
"learning_rate": 1.7765412472811771e-06, |
|
"loss": 0.3036, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.7934560327198366, |
|
"grad_norm": 1.9178974628448486, |
|
"learning_rate": 1.6526027408301226e-06, |
|
"loss": 0.5829, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 2.801635991820041, |
|
"grad_norm": 1.860939860343933, |
|
"learning_rate": 1.5330726014397668e-06, |
|
"loss": 0.4617, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.809815950920245, |
|
"grad_norm": 1.7959818840026855, |
|
"learning_rate": 1.417961727995254e-06, |
|
"loss": 0.4604, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.81799591002045, |
|
"grad_norm": 1.414788842201233, |
|
"learning_rate": 1.3072806164283358e-06, |
|
"loss": 0.3398, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.8261758691206547, |
|
"grad_norm": 1.316179633140564, |
|
"learning_rate": 1.2010393587603974e-06, |
|
"loss": 0.3707, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.834355828220859, |
|
"grad_norm": 2.140214443206787, |
|
"learning_rate": 1.099247642182205e-06, |
|
"loss": 0.6991, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.8425357873210633, |
|
"grad_norm": 1.6871670484542847, |
|
"learning_rate": 1.0019147481706625e-06, |
|
"loss": 0.6069, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.850715746421268, |
|
"grad_norm": 1.7937754392623901, |
|
"learning_rate": 9.090495516424713e-07, |
|
"loss": 0.3841, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.8588957055214723, |
|
"grad_norm": 1.3567599058151245, |
|
"learning_rate": 8.206605201449447e-07, |
|
"loss": 0.3186, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.8670756646216766, |
|
"grad_norm": 1.0344539880752563, |
|
"learning_rate": 7.36755713083892e-07, |
|
"loss": 0.1676, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.8752556237218814, |
|
"grad_norm": 1.908477783203125, |
|
"learning_rate": 6.573427809888067e-07, |
|
"loss": 0.7295, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.883435582822086, |
|
"grad_norm": 2.14554500579834, |
|
"learning_rate": 5.824289648152126e-07, |
|
"loss": 0.8187, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.8916155419222904, |
|
"grad_norm": 1.6814268827438354, |
|
"learning_rate": 5.120210952844872e-07, |
|
"loss": 0.5205, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.8997955010224947, |
|
"grad_norm": 1.6498082876205444, |
|
"learning_rate": 4.461255922609986e-07, |
|
"loss": 0.4557, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.9079754601226995, |
|
"grad_norm": 1.4337708950042725, |
|
"learning_rate": 3.8474846416672874e-07, |
|
"loss": 0.3251, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.9161554192229038, |
|
"grad_norm": 1.875313401222229, |
|
"learning_rate": 3.278953074334512e-07, |
|
"loss": 0.5001, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.9243353783231085, |
|
"grad_norm": 1.350846529006958, |
|
"learning_rate": 2.75571305992417e-07, |
|
"loss": 0.2414, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 2.932515337423313, |
|
"grad_norm": 1.5978336334228516, |
|
"learning_rate": 2.2778123080167135e-07, |
|
"loss": 0.4585, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.9406952965235176, |
|
"grad_norm": 1.672541856765747, |
|
"learning_rate": 1.8452943941106859e-07, |
|
"loss": 0.5382, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 2.948875255623722, |
|
"grad_norm": 1.3987324237823486, |
|
"learning_rate": 1.4581987556490095e-07, |
|
"loss": 0.3326, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.957055214723926, |
|
"grad_norm": 1.4565430879592896, |
|
"learning_rate": 1.1165606884234181e-07, |
|
"loss": 0.4901, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.965235173824131, |
|
"grad_norm": 1.3861486911773682, |
|
"learning_rate": 8.204113433559201e-08, |
|
"loss": 0.2756, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 2.9734151329243352, |
|
"grad_norm": 1.4839295148849487, |
|
"learning_rate": 5.697777236585711e-08, |
|
"loss": 0.3303, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.98159509202454, |
|
"grad_norm": 1.6138904094696045, |
|
"learning_rate": 3.6468268237105366e-08, |
|
"loss": 0.558, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.98159509202454, |
|
"step": 366, |
|
"total_flos": 3.142935032247091e+16, |
|
"train_loss": 0.7803121163341843, |
|
"train_runtime": 701.3998, |
|
"train_samples_per_second": 4.183, |
|
"train_steps_per_second": 0.522 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 366, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.142935032247091e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |