|
{ |
|
"best_metric": 0.6048758625984192, |
|
"best_model_checkpoint": "/shared/data/meta-llama/Llama-3.1-8B/2_5M/8b_v1/checkpoint-1800", |
|
"epoch": 2.9989730950913946, |
|
"eval_steps": 100, |
|
"global_step": 1824, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.001643047853768741, |
|
"eval_loss": 3.5505740642547607, |
|
"eval_runtime": 50.9394, |
|
"eval_samples_per_second": 494.294, |
|
"eval_steps_per_second": 0.982, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.014787430683918669, |
|
"grad_norm": 105.67444610595703, |
|
"learning_rate": 9.000000000000001e-07, |
|
"loss": 3.1636, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.029574861367837338, |
|
"grad_norm": 46.17672348022461, |
|
"learning_rate": 1.8000000000000001e-06, |
|
"loss": 2.2268, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.04436229205175601, |
|
"grad_norm": 23.457378387451172, |
|
"learning_rate": 2.7000000000000004e-06, |
|
"loss": 1.3465, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.059149722735674676, |
|
"grad_norm": 1.2907930612564087, |
|
"learning_rate": 3.6000000000000003e-06, |
|
"loss": 1.0364, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.07393715341959335, |
|
"grad_norm": 1.3701411485671997, |
|
"learning_rate": 4.5e-06, |
|
"loss": 0.8822, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.08872458410351201, |
|
"grad_norm": 1.0450048446655273, |
|
"learning_rate": 5.400000000000001e-06, |
|
"loss": 0.8243, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.10351201478743069, |
|
"grad_norm": 1.1930880546569824, |
|
"learning_rate": 6.300000000000001e-06, |
|
"loss": 0.7871, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.11829944547134935, |
|
"grad_norm": 0.8426429629325867, |
|
"learning_rate": 7.2000000000000005e-06, |
|
"loss": 0.7663, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.133086876155268, |
|
"grad_norm": 1.421916127204895, |
|
"learning_rate": 8.1e-06, |
|
"loss": 0.7494, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.1478743068391867, |
|
"grad_norm": 1.322658896446228, |
|
"learning_rate": 9e-06, |
|
"loss": 0.735, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.16266173752310537, |
|
"grad_norm": 1.6398061513900757, |
|
"learning_rate": 9.9e-06, |
|
"loss": 0.7285, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.1643047853768741, |
|
"eval_loss": 0.747532844543457, |
|
"eval_runtime": 50.8409, |
|
"eval_samples_per_second": 495.251, |
|
"eval_steps_per_second": 0.983, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.17744916820702403, |
|
"grad_norm": 1.1485223770141602, |
|
"learning_rate": 9.999468702816552e-06, |
|
"loss": 0.725, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.1922365988909427, |
|
"grad_norm": 0.9843918085098267, |
|
"learning_rate": 9.99760101052916e-06, |
|
"loss": 0.7137, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.20702402957486138, |
|
"grad_norm": 0.7764966487884521, |
|
"learning_rate": 9.994389123823944e-06, |
|
"loss": 0.7065, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.22181146025878004, |
|
"grad_norm": 1.0616685152053833, |
|
"learning_rate": 9.989833906595432e-06, |
|
"loss": 0.7016, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.2365988909426987, |
|
"grad_norm": 0.8764758706092834, |
|
"learning_rate": 9.983936584050992e-06, |
|
"loss": 0.6967, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.2513863216266174, |
|
"grad_norm": 0.8168648481369019, |
|
"learning_rate": 9.976698742381285e-06, |
|
"loss": 0.6889, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.266173752310536, |
|
"grad_norm": 0.8583444952964783, |
|
"learning_rate": 9.968122328333627e-06, |
|
"loss": 0.6904, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.2809611829944547, |
|
"grad_norm": 1.1491072177886963, |
|
"learning_rate": 9.958209648688385e-06, |
|
"loss": 0.6968, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.2957486136783734, |
|
"grad_norm": 0.9233031868934631, |
|
"learning_rate": 9.946963369638524e-06, |
|
"loss": 0.6845, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.31053604436229204, |
|
"grad_norm": 1.02516770362854, |
|
"learning_rate": 9.934386516072483e-06, |
|
"loss": 0.6814, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.32532347504621073, |
|
"grad_norm": 0.7226729393005371, |
|
"learning_rate": 9.920482470760577e-06, |
|
"loss": 0.676, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.3286095707537482, |
|
"eval_loss": 0.6874070763587952, |
|
"eval_runtime": 50.9242, |
|
"eval_samples_per_second": 494.441, |
|
"eval_steps_per_second": 0.982, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.34011090573012936, |
|
"grad_norm": 0.7855489253997803, |
|
"learning_rate": 9.905254973445144e-06, |
|
"loss": 0.6722, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.35489833641404805, |
|
"grad_norm": 1.2335143089294434, |
|
"learning_rate": 9.88870811983468e-06, |
|
"loss": 0.6695, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.36968576709796674, |
|
"grad_norm": 1.9940311908721924, |
|
"learning_rate": 9.870846360502206e-06, |
|
"loss": 0.6711, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.3844731977818854, |
|
"grad_norm": 0.9790201783180237, |
|
"learning_rate": 9.851674499688227e-06, |
|
"loss": 0.6683, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.39926062846580407, |
|
"grad_norm": 0.5254144668579102, |
|
"learning_rate": 9.831197694008529e-06, |
|
"loss": 0.6637, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.41404805914972276, |
|
"grad_norm": 0.41433659195899963, |
|
"learning_rate": 9.809421451067234e-06, |
|
"loss": 0.6609, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.4288354898336414, |
|
"grad_norm": 0.6482775807380676, |
|
"learning_rate": 9.786351627975408e-06, |
|
"loss": 0.6572, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.4436229205175601, |
|
"grad_norm": 0.9043383002281189, |
|
"learning_rate": 9.7619944297757e-06, |
|
"loss": 0.6575, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.4584103512014787, |
|
"grad_norm": 0.4247804880142212, |
|
"learning_rate": 9.736356407773386e-06, |
|
"loss": 0.6541, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.4731977818853974, |
|
"grad_norm": 0.5127539038658142, |
|
"learning_rate": 9.709444457774272e-06, |
|
"loss": 0.6541, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.4879852125693161, |
|
"grad_norm": 0.6547645926475525, |
|
"learning_rate": 9.681265818229938e-06, |
|
"loss": 0.6524, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.4929143561306223, |
|
"eval_loss": 0.6639789342880249, |
|
"eval_runtime": 50.7468, |
|
"eval_samples_per_second": 496.17, |
|
"eval_steps_per_second": 0.985, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5027726432532348, |
|
"grad_norm": 0.5972119569778442, |
|
"learning_rate": 9.651828068290847e-06, |
|
"loss": 0.651, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.5175600739371534, |
|
"grad_norm": 0.424817830324173, |
|
"learning_rate": 9.621139125767774e-06, |
|
"loss": 0.6484, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.532347504621072, |
|
"grad_norm": 0.49317240715026855, |
|
"learning_rate": 9.589207245002178e-06, |
|
"loss": 0.6443, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.5471349353049908, |
|
"grad_norm": 1.2876074314117432, |
|
"learning_rate": 9.556041014646054e-06, |
|
"loss": 0.6474, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.5619223659889094, |
|
"grad_norm": 0.8977052569389343, |
|
"learning_rate": 9.52164935535185e-06, |
|
"loss": 0.648, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.5767097966728281, |
|
"grad_norm": 1.3593140840530396, |
|
"learning_rate": 9.486041517373112e-06, |
|
"loss": 0.6446, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.5914972273567468, |
|
"grad_norm": 0.9276908040046692, |
|
"learning_rate": 9.449227078076444e-06, |
|
"loss": 0.6438, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6062846580406654, |
|
"grad_norm": 0.7807438373565674, |
|
"learning_rate": 9.411215939365522e-06, |
|
"loss": 0.6418, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.6210720887245841, |
|
"grad_norm": 0.7642928957939148, |
|
"learning_rate": 9.372018325017782e-06, |
|
"loss": 0.641, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.6358595194085028, |
|
"grad_norm": 1.0264766216278076, |
|
"learning_rate": 9.33164477793457e-06, |
|
"loss": 0.6396, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.6506469500924215, |
|
"grad_norm": 1.3351635932922363, |
|
"learning_rate": 9.290106157305424e-06, |
|
"loss": 0.6417, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.6572191415074964, |
|
"eval_loss": 0.6507741212844849, |
|
"eval_runtime": 50.7648, |
|
"eval_samples_per_second": 495.993, |
|
"eval_steps_per_second": 0.985, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6654343807763401, |
|
"grad_norm": 0.5435072183609009, |
|
"learning_rate": 9.247413635687308e-06, |
|
"loss": 0.6374, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.6802218114602587, |
|
"grad_norm": 5113.79248046875, |
|
"learning_rate": 9.20357869599955e-06, |
|
"loss": 0.6345, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.6950092421441775, |
|
"grad_norm": 0.5009027123451233, |
|
"learning_rate": 9.158613128435309e-06, |
|
"loss": 0.6352, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.7097966728280961, |
|
"grad_norm": 0.5329261422157288, |
|
"learning_rate": 9.112529027290382e-06, |
|
"loss": 0.6343, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.7245841035120147, |
|
"grad_norm": 0.3809993863105774, |
|
"learning_rate": 9.065338787710241e-06, |
|
"loss": 0.6304, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.7393715341959335, |
|
"grad_norm": 0.30022993683815, |
|
"learning_rate": 9.017055102356116e-06, |
|
"loss": 0.6311, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.7541589648798521, |
|
"grad_norm": 1.3200950622558594, |
|
"learning_rate": 8.967690957991097e-06, |
|
"loss": 0.6314, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.7689463955637708, |
|
"grad_norm": 2452.027587890625, |
|
"learning_rate": 8.917259631987099e-06, |
|
"loss": 0.6305, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.7837338262476895, |
|
"grad_norm": 1.8559657335281372, |
|
"learning_rate": 8.865774688753673e-06, |
|
"loss": 0.634, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.7985212569316081, |
|
"grad_norm": 0.7670221328735352, |
|
"learning_rate": 8.813249976089628e-06, |
|
"loss": 0.6334, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.8133086876155268, |
|
"grad_norm": 0.5721145272254944, |
|
"learning_rate": 8.7596996214584e-06, |
|
"loss": 0.6272, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.8215239268843705, |
|
"eval_loss": 0.6392470002174377, |
|
"eval_runtime": 50.8387, |
|
"eval_samples_per_second": 495.272, |
|
"eval_steps_per_second": 0.984, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.8280961182994455, |
|
"grad_norm": 0.5032740235328674, |
|
"learning_rate": 8.705138028188228e-06, |
|
"loss": 0.6265, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.8428835489833642, |
|
"grad_norm": 0.4043797254562378, |
|
"learning_rate": 8.649579871598124e-06, |
|
"loss": 0.6239, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.8576709796672828, |
|
"grad_norm": 0.29705339670181274, |
|
"learning_rate": 8.593040095050668e-06, |
|
"loss": 0.6248, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.8724584103512015, |
|
"grad_norm": 0.34723684191703796, |
|
"learning_rate": 8.535533905932739e-06, |
|
"loss": 0.6232, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.8872458410351202, |
|
"grad_norm": 0.4493858814239502, |
|
"learning_rate": 8.477076771565203e-06, |
|
"loss": 0.621, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.9020332717190388, |
|
"grad_norm": 0.2866009473800659, |
|
"learning_rate": 8.417684415042712e-06, |
|
"loss": 0.6212, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.9168207024029574, |
|
"grad_norm": 1.456310749053955, |
|
"learning_rate": 8.357372811004678e-06, |
|
"loss": 0.6226, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.9316081330868762, |
|
"grad_norm": 0.6134287118911743, |
|
"learning_rate": 8.29615818133863e-06, |
|
"loss": 0.6208, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.9463955637707948, |
|
"grad_norm": 0.5051162838935852, |
|
"learning_rate": 8.234056990817025e-06, |
|
"loss": 0.6203, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.9611829944547134, |
|
"grad_norm": 0.37639549374580383, |
|
"learning_rate": 8.171085942668765e-06, |
|
"loss": 0.6192, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.9759704251386322, |
|
"grad_norm": 0.31022700667381287, |
|
"learning_rate": 8.107261974086562e-06, |
|
"loss": 0.6192, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.9858287122612446, |
|
"eval_loss": 0.630632758140564, |
|
"eval_runtime": 50.7072, |
|
"eval_samples_per_second": 496.557, |
|
"eval_steps_per_second": 0.986, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9907578558225508, |
|
"grad_norm": 0.6675688624382019, |
|
"learning_rate": 8.042602251671372e-06, |
|
"loss": 0.6164, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 1.006572191415075, |
|
"grad_norm": 0.48963022232055664, |
|
"learning_rate": 7.977124166815134e-06, |
|
"loss": 0.6806, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 1.0213596220989936, |
|
"grad_norm": 0.404670774936676, |
|
"learning_rate": 7.910845331023043e-06, |
|
"loss": 0.6073, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 1.0361470527829124, |
|
"grad_norm": 0.7819475531578064, |
|
"learning_rate": 7.843783571176617e-06, |
|
"loss": 0.607, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.050934483466831, |
|
"grad_norm": 0.7247259020805359, |
|
"learning_rate": 7.77595692473884e-06, |
|
"loss": 0.6047, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 1.0657219141507497, |
|
"grad_norm": 0.3864266574382782, |
|
"learning_rate": 7.707383634902658e-06, |
|
"loss": 0.604, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 1.0805093448346683, |
|
"grad_norm": 0.32667380571365356, |
|
"learning_rate": 7.638082145684161e-06, |
|
"loss": 0.6016, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 1.095296775518587, |
|
"grad_norm": 0.3104453682899475, |
|
"learning_rate": 7.568071096961707e-06, |
|
"loss": 0.6049, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 1.1100842062025056, |
|
"grad_norm": 0.23003344237804413, |
|
"learning_rate": 7.497369319462418e-06, |
|
"loss": 0.6011, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.1248716368864242, |
|
"grad_norm": 0.2505716383457184, |
|
"learning_rate": 7.425995829697304e-06, |
|
"loss": 0.6024, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 1.139659067570343, |
|
"grad_norm": 0.25305768847465515, |
|
"learning_rate": 7.353969824846438e-06, |
|
"loss": 0.6031, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 1.1511604025467241, |
|
"eval_loss": 0.6250694990158081, |
|
"eval_runtime": 50.9678, |
|
"eval_samples_per_second": 494.018, |
|
"eval_steps_per_second": 0.981, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.1544464982542617, |
|
"grad_norm": 0.21149393916130066, |
|
"learning_rate": 7.281310677595526e-06, |
|
"loss": 0.6011, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 1.1692339289381803, |
|
"grad_norm": 0.6672160029411316, |
|
"learning_rate": 7.208037930925272e-06, |
|
"loss": 0.6018, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 1.184021359622099, |
|
"grad_norm": 1.4521163702011108, |
|
"learning_rate": 7.134171292854957e-06, |
|
"loss": 0.6138, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.1988087903060176, |
|
"grad_norm": 0.7164549231529236, |
|
"learning_rate": 7.0597306311415995e-06, |
|
"loss": 0.6061, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 1.2135962209899362, |
|
"grad_norm": 0.464008629322052, |
|
"learning_rate": 6.984735967936173e-06, |
|
"loss": 0.6026, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 1.2283836516738549, |
|
"grad_norm": 0.3417239785194397, |
|
"learning_rate": 6.909207474398283e-06, |
|
"loss": 0.6015, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 1.2431710823577737, |
|
"grad_norm": 0.3259502947330475, |
|
"learning_rate": 6.833165465270786e-06, |
|
"loss": 0.5995, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 1.2579585130416924, |
|
"grad_norm": 0.3659493029117584, |
|
"learning_rate": 6.756630393415755e-06, |
|
"loss": 0.5955, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 1.272745943725611, |
|
"grad_norm": 0.30788564682006836, |
|
"learning_rate": 6.679622844313335e-06, |
|
"loss": 0.5996, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 1.2875333744095296, |
|
"grad_norm": 0.2533837556838989, |
|
"learning_rate": 6.602163530524894e-06, |
|
"loss": 0.5986, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 1.3023208050934483, |
|
"grad_norm": 0.27919843792915344, |
|
"learning_rate": 6.524273286122018e-06, |
|
"loss": 0.5982, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 1.3154651879235983, |
|
"eval_loss": 0.6208261251449585, |
|
"eval_runtime": 50.9478, |
|
"eval_samples_per_second": 494.211, |
|
"eval_steps_per_second": 0.981, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.3171082357773671, |
|
"grad_norm": 0.2664376199245453, |
|
"learning_rate": 6.445973061082805e-06, |
|
"loss": 0.5971, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 1.3318956664612858, |
|
"grad_norm": 0.2566661238670349, |
|
"learning_rate": 6.3672839156570056e-06, |
|
"loss": 0.5962, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.3466830971452044, |
|
"grad_norm": 0.2494824081659317, |
|
"learning_rate": 6.288227014701473e-06, |
|
"loss": 0.5972, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 1.361470527829123, |
|
"grad_norm": 0.22812236845493317, |
|
"learning_rate": 6.208823621987516e-06, |
|
"loss": 0.5957, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 1.3762579585130417, |
|
"grad_norm": 0.25955238938331604, |
|
"learning_rate": 6.1290950944816065e-06, |
|
"loss": 0.5955, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 1.3910453891969603, |
|
"grad_norm": 0.28508901596069336, |
|
"learning_rate": 6.049062876601057e-06, |
|
"loss": 0.5947, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 1.405832819880879, |
|
"grad_norm": 0.21612407267093658, |
|
"learning_rate": 5.968748494446147e-06, |
|
"loss": 0.5927, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 1.4206202505647978, |
|
"grad_norm": 0.2369757890701294, |
|
"learning_rate": 5.888173550010301e-06, |
|
"loss": 0.592, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 1.4354076812487164, |
|
"grad_norm": 0.22978542745113373, |
|
"learning_rate": 5.807359715369843e-06, |
|
"loss": 0.5944, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 1.450195111932635, |
|
"grad_norm": 0.20011670887470245, |
|
"learning_rate": 5.726328726854896e-06, |
|
"loss": 0.5915, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 1.4649825426165537, |
|
"grad_norm": 0.2324371486902237, |
|
"learning_rate": 5.645102379203018e-06, |
|
"loss": 0.5956, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 1.4797699733004723, |
|
"grad_norm": 0.243639275431633, |
|
"learning_rate": 5.563702519697108e-06, |
|
"loss": 0.5934, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.4797699733004723, |
|
"eval_loss": 0.6167550086975098, |
|
"eval_runtime": 50.557, |
|
"eval_samples_per_second": 498.031, |
|
"eval_steps_per_second": 0.989, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.494557403984391, |
|
"grad_norm": 0.23125344514846802, |
|
"learning_rate": 5.48215104228919e-06, |
|
"loss": 0.5938, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 1.5093448346683096, |
|
"grad_norm": 0.21182002127170563, |
|
"learning_rate": 5.40046988171164e-06, |
|
"loss": 0.5935, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 1.5241322653522285, |
|
"grad_norm": 0.22478973865509033, |
|
"learning_rate": 5.318681007577455e-06, |
|
"loss": 0.5923, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 1.538919696036147, |
|
"grad_norm": 0.20420202612876892, |
|
"learning_rate": 5.2368064184711136e-06, |
|
"loss": 0.5917, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 1.5537071267200657, |
|
"grad_norm": 0.21112091839313507, |
|
"learning_rate": 5.1548681360316824e-06, |
|
"loss": 0.5922, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 1.5684945574039844, |
|
"grad_norm": 0.21270057559013367, |
|
"learning_rate": 5.0728881990296904e-06, |
|
"loss": 0.5919, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 1.583281988087903, |
|
"grad_norm": 0.21418283879756927, |
|
"learning_rate": 4.990888657439405e-06, |
|
"loss": 0.5907, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 1.5980694187718218, |
|
"grad_norm": 0.2000851035118103, |
|
"learning_rate": 4.9088915665081035e-06, |
|
"loss": 0.5917, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 1.6128568494557403, |
|
"grad_norm": 0.20808270573616028, |
|
"learning_rate": 4.826918980823911e-06, |
|
"loss": 0.5917, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 1.6276442801396591, |
|
"grad_norm": 0.2385932058095932, |
|
"learning_rate": 4.744992948383827e-06, |
|
"loss": 0.5896, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.6424317108235778, |
|
"grad_norm": 0.2191840559244156, |
|
"learning_rate": 4.663135504663525e-06, |
|
"loss": 0.5884, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 1.6440747586773465, |
|
"eval_loss": 0.613418698310852, |
|
"eval_runtime": 51.0013, |
|
"eval_samples_per_second": 493.694, |
|
"eval_steps_per_second": 0.98, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.6572191415074964, |
|
"grad_norm": 0.21977262198925018, |
|
"learning_rate": 4.58136866669051e-06, |
|
"loss": 0.5897, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 1.6720065721914152, |
|
"grad_norm": 0.24079816043376923, |
|
"learning_rate": 4.499714427122242e-06, |
|
"loss": 0.5899, |
|
"step": 1017 |
|
}, |
|
{ |
|
"epoch": 1.6867940028753337, |
|
"grad_norm": 0.20791475474834442, |
|
"learning_rate": 4.418194748330831e-06, |
|
"loss": 0.5901, |
|
"step": 1026 |
|
}, |
|
{ |
|
"epoch": 1.7015814335592525, |
|
"grad_norm": 0.23088951408863068, |
|
"learning_rate": 4.3368315564958415e-06, |
|
"loss": 0.5907, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 1.716368864243171, |
|
"grad_norm": 0.18772049248218536, |
|
"learning_rate": 4.2556467357068695e-06, |
|
"loss": 0.5897, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 1.7311562949270898, |
|
"grad_norm": 0.20672886073589325, |
|
"learning_rate": 4.174662122077424e-06, |
|
"loss": 0.5893, |
|
"step": 1053 |
|
}, |
|
{ |
|
"epoch": 1.7459437256110084, |
|
"grad_norm": 0.2176688015460968, |
|
"learning_rate": 4.093899497871701e-06, |
|
"loss": 0.5899, |
|
"step": 1062 |
|
}, |
|
{ |
|
"epoch": 1.760731156294927, |
|
"grad_norm": 0.20923234522342682, |
|
"learning_rate": 4.0133805856458615e-06, |
|
"loss": 0.5887, |
|
"step": 1071 |
|
}, |
|
{ |
|
"epoch": 1.775518586978846, |
|
"grad_norm": 0.23178276419639587, |
|
"learning_rate": 3.933127042405362e-06, |
|
"loss": 0.5878, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.7903060176627643, |
|
"grad_norm": 0.21155081689357758, |
|
"learning_rate": 3.8531604537799075e-06, |
|
"loss": 0.5876, |
|
"step": 1089 |
|
}, |
|
{ |
|
"epoch": 1.8050934483466832, |
|
"grad_norm": 0.20616762340068817, |
|
"learning_rate": 3.7735023282176146e-06, |
|
"loss": 0.587, |
|
"step": 1098 |
|
}, |
|
{ |
|
"epoch": 1.8083795440542207, |
|
"eval_loss": 0.6106312870979309, |
|
"eval_runtime": 51.2953, |
|
"eval_samples_per_second": 490.864, |
|
"eval_steps_per_second": 0.975, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.8198808790306018, |
|
"grad_norm": 0.2053360491991043, |
|
"learning_rate": 3.6941740911999293e-06, |
|
"loss": 0.5901, |
|
"step": 1107 |
|
}, |
|
{ |
|
"epoch": 1.8346683097145204, |
|
"grad_norm": 0.1950492113828659, |
|
"learning_rate": 3.6151970794788525e-06, |
|
"loss": 0.5879, |
|
"step": 1116 |
|
}, |
|
{ |
|
"epoch": 1.849455740398439, |
|
"grad_norm": 0.18794511258602142, |
|
"learning_rate": 3.536592535338046e-06, |
|
"loss": 0.5885, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 1.8642431710823577, |
|
"grad_norm": 0.21698912978172302, |
|
"learning_rate": 3.4583816008793375e-06, |
|
"loss": 0.587, |
|
"step": 1134 |
|
}, |
|
{ |
|
"epoch": 1.8790306017662766, |
|
"grad_norm": 0.2182021141052246, |
|
"learning_rate": 3.3805853123361687e-06, |
|
"loss": 0.5849, |
|
"step": 1143 |
|
}, |
|
{ |
|
"epoch": 1.893818032450195, |
|
"grad_norm": 0.23845024406909943, |
|
"learning_rate": 3.303224594415528e-06, |
|
"loss": 0.5881, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 1.9086054631341138, |
|
"grad_norm": 0.19117599725723267, |
|
"learning_rate": 3.226320254669873e-06, |
|
"loss": 0.5871, |
|
"step": 1161 |
|
}, |
|
{ |
|
"epoch": 1.9233928938180325, |
|
"grad_norm": 0.21598902344703674, |
|
"learning_rate": 3.1498929779005637e-06, |
|
"loss": 0.5865, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.938180324501951, |
|
"grad_norm": 0.22297415137290955, |
|
"learning_rate": 3.0739633205943237e-06, |
|
"loss": 0.5841, |
|
"step": 1179 |
|
}, |
|
{ |
|
"epoch": 1.9529677551858697, |
|
"grad_norm": 0.18670949339866638, |
|
"learning_rate": 2.9985517053941926e-06, |
|
"loss": 0.5865, |
|
"step": 1188 |
|
}, |
|
{ |
|
"epoch": 1.9677551858697884, |
|
"grad_norm": 0.20467469096183777, |
|
"learning_rate": 2.9236784156064936e-06, |
|
"loss": 0.5868, |
|
"step": 1197 |
|
}, |
|
{ |
|
"epoch": 1.9726843294310947, |
|
"eval_loss": 0.6083265542984009, |
|
"eval_runtime": 50.99, |
|
"eval_samples_per_second": 493.802, |
|
"eval_steps_per_second": 0.981, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.9825426165537072, |
|
"grad_norm": 0.1806914359331131, |
|
"learning_rate": 2.8493635897452824e-06, |
|
"loss": 0.5875, |
|
"step": 1206 |
|
}, |
|
{ |
|
"epoch": 1.9973300472376256, |
|
"grad_norm": 0.2014399617910385, |
|
"learning_rate": 2.77562721611572e-06, |
|
"loss": 0.5829, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 2.01314438283015, |
|
"grad_norm": 0.21652302145957947, |
|
"learning_rate": 2.7024891274378695e-06, |
|
"loss": 0.6427, |
|
"step": 1224 |
|
}, |
|
{ |
|
"epoch": 2.0279318135140687, |
|
"grad_norm": 0.2011858969926834, |
|
"learning_rate": 2.629968995512327e-06, |
|
"loss": 0.5777, |
|
"step": 1233 |
|
}, |
|
{ |
|
"epoch": 2.042719244197987, |
|
"grad_norm": 0.19227717816829681, |
|
"learning_rate": 2.5580863259291333e-06, |
|
"loss": 0.5765, |
|
"step": 1242 |
|
}, |
|
{ |
|
"epoch": 2.057506674881906, |
|
"grad_norm": 0.17868341505527496, |
|
"learning_rate": 2.4868604528214042e-06, |
|
"loss": 0.5749, |
|
"step": 1251 |
|
}, |
|
{ |
|
"epoch": 2.072294105565825, |
|
"grad_norm": 0.18664774298667908, |
|
"learning_rate": 2.4163105336650645e-06, |
|
"loss": 0.5742, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.0870815362497432, |
|
"grad_norm": 0.17733176052570343, |
|
"learning_rate": 2.3464555441261016e-06, |
|
"loss": 0.5747, |
|
"step": 1269 |
|
}, |
|
{ |
|
"epoch": 2.101868966933662, |
|
"grad_norm": 0.1523798704147339, |
|
"learning_rate": 2.277314272956715e-06, |
|
"loss": 0.5755, |
|
"step": 1278 |
|
}, |
|
{ |
|
"epoch": 2.1166563976175805, |
|
"grad_norm": 1108.92724609375, |
|
"learning_rate": 2.208905316941754e-06, |
|
"loss": 0.5732, |
|
"step": 1287 |
|
}, |
|
{ |
|
"epoch": 2.1314438283014994, |
|
"grad_norm": 0.1998017281293869, |
|
"learning_rate": 2.1412470758967742e-06, |
|
"loss": 0.5744, |
|
"step": 1296 |
|
}, |
|
{ |
|
"epoch": 2.138016019716574, |
|
"eval_loss": 0.6074865460395813, |
|
"eval_runtime": 51.2662, |
|
"eval_samples_per_second": 491.142, |
|
"eval_steps_per_second": 0.975, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.146231258985418, |
|
"grad_norm": 0.17162367701530457, |
|
"learning_rate": 2.0743577477190714e-06, |
|
"loss": 0.5762, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 2.1610186896693366, |
|
"grad_norm": 0.16444073617458344, |
|
"learning_rate": 2.0082553234930407e-06, |
|
"loss": 0.5774, |
|
"step": 1314 |
|
}, |
|
{ |
|
"epoch": 2.1758061203532555, |
|
"grad_norm": 0.1596338301897049, |
|
"learning_rate": 1.9429575826511493e-06, |
|
"loss": 0.5764, |
|
"step": 1323 |
|
}, |
|
{ |
|
"epoch": 2.190593551037174, |
|
"grad_norm": 0.16474801301956177, |
|
"learning_rate": 1.8784820881918275e-06, |
|
"loss": 0.5754, |
|
"step": 1332 |
|
}, |
|
{ |
|
"epoch": 2.2053809817210928, |
|
"grad_norm": 0.17859473824501038, |
|
"learning_rate": 1.8148461819556095e-06, |
|
"loss": 0.5748, |
|
"step": 1341 |
|
}, |
|
{ |
|
"epoch": 2.220168412405011, |
|
"grad_norm": 0.16459307074546814, |
|
"learning_rate": 1.752066979960707e-06, |
|
"loss": 0.574, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.23495584308893, |
|
"grad_norm": 0.16431599855422974, |
|
"learning_rate": 1.6901613677993677e-06, |
|
"loss": 0.5739, |
|
"step": 1359 |
|
}, |
|
{ |
|
"epoch": 2.2497432737728484, |
|
"grad_norm": 0.16578958928585052, |
|
"learning_rate": 1.6291459960961886e-06, |
|
"loss": 0.5736, |
|
"step": 1368 |
|
}, |
|
{ |
|
"epoch": 2.2645307044567673, |
|
"grad_norm": 0.17526379227638245, |
|
"learning_rate": 1.5690372760296235e-06, |
|
"loss": 0.5764, |
|
"step": 1377 |
|
}, |
|
{ |
|
"epoch": 2.279318135140686, |
|
"grad_norm": 0.15786723792552948, |
|
"learning_rate": 1.5098513749179156e-06, |
|
"loss": 0.5741, |
|
"step": 1386 |
|
}, |
|
{ |
|
"epoch": 2.2941055658246046, |
|
"grad_norm": 0.1611924022436142, |
|
"learning_rate": 1.451604211870597e-06, |
|
"loss": 0.5725, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 2.3023208050934483, |
|
"eval_loss": 0.6064820885658264, |
|
"eval_runtime": 51.337, |
|
"eval_samples_per_second": 490.465, |
|
"eval_steps_per_second": 0.974, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.3088929965085234, |
|
"grad_norm": 0.1685233861207962, |
|
"learning_rate": 1.3943114535067632e-06, |
|
"loss": 0.5738, |
|
"step": 1404 |
|
}, |
|
{ |
|
"epoch": 2.323680427192442, |
|
"grad_norm": 0.15319103002548218, |
|
"learning_rate": 1.337988509741255e-06, |
|
"loss": 0.5737, |
|
"step": 1413 |
|
}, |
|
{ |
|
"epoch": 2.3384678578763607, |
|
"grad_norm": 0.15287065505981445, |
|
"learning_rate": 1.2826505296398805e-06, |
|
"loss": 0.5735, |
|
"step": 1422 |
|
}, |
|
{ |
|
"epoch": 2.3532552885602795, |
|
"grad_norm": 0.1517048478126526, |
|
"learning_rate": 1.2283123973448107e-06, |
|
"loss": 0.5734, |
|
"step": 1431 |
|
}, |
|
{ |
|
"epoch": 2.368042719244198, |
|
"grad_norm": 0.14937447011470795, |
|
"learning_rate": 1.1749887280712164e-06, |
|
"loss": 0.5745, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.382830149928117, |
|
"grad_norm": 0.1517946720123291, |
|
"learning_rate": 1.1226938641762464e-06, |
|
"loss": 0.5742, |
|
"step": 1449 |
|
}, |
|
{ |
|
"epoch": 2.3976175806120352, |
|
"grad_norm": 0.1503513902425766, |
|
"learning_rate": 1.0714418713013885e-06, |
|
"loss": 0.5731, |
|
"step": 1458 |
|
}, |
|
{ |
|
"epoch": 2.412405011295954, |
|
"grad_norm": 0.15668976306915283, |
|
"learning_rate": 1.021246534589272e-06, |
|
"loss": 0.5739, |
|
"step": 1467 |
|
}, |
|
{ |
|
"epoch": 2.4271924419798725, |
|
"grad_norm": 0.15115846693515778, |
|
"learning_rate": 9.721213549759011e-07, |
|
"loss": 0.5723, |
|
"step": 1476 |
|
}, |
|
{ |
|
"epoch": 2.4419798726637914, |
|
"grad_norm": 0.1494954228401184, |
|
"learning_rate": 9.24079545559331e-07, |
|
"loss": 0.574, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 2.4567673033477098, |
|
"grad_norm": 0.14341481029987335, |
|
"learning_rate": 8.771340280457791e-07, |
|
"loss": 0.5742, |
|
"step": 1494 |
|
}, |
|
{ |
|
"epoch": 2.4666255904703225, |
|
"eval_loss": 0.6056612730026245, |
|
"eval_runtime": 51.0039, |
|
"eval_samples_per_second": 493.668, |
|
"eval_steps_per_second": 0.98, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.4715547340316286, |
|
"grad_norm": 0.14807379245758057, |
|
"learning_rate": 8.312974292740938e-07, |
|
"loss": 0.5768, |
|
"step": 1503 |
|
}, |
|
{ |
|
"epoch": 2.4863421647155475, |
|
"grad_norm": 0.14617814123630524, |
|
"learning_rate": 7.865820778195366e-07, |
|
"loss": 0.5742, |
|
"step": 1512 |
|
}, |
|
{ |
|
"epoch": 2.501129595399466, |
|
"grad_norm": 0.14887727797031403, |
|
"learning_rate": 7.430000006778021e-07, |
|
"loss": 0.5732, |
|
"step": 1521 |
|
}, |
|
{ |
|
"epoch": 2.5159170260833847, |
|
"grad_norm": 0.1536111980676651, |
|
"learning_rate": 7.005629200301267e-07, |
|
"loss": 0.5719, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.530704456767303, |
|
"grad_norm": 0.1470237374305725, |
|
"learning_rate": 6.592822500904111e-07, |
|
"loss": 0.5729, |
|
"step": 1539 |
|
}, |
|
{ |
|
"epoch": 2.545491887451222, |
|
"grad_norm": 0.15134099125862122, |
|
"learning_rate": 6.191690940351569e-07, |
|
"loss": 0.5728, |
|
"step": 1548 |
|
}, |
|
{ |
|
"epoch": 2.560279318135141, |
|
"grad_norm": 0.15583239495754242, |
|
"learning_rate": 5.802342410170636e-07, |
|
"loss": 0.5727, |
|
"step": 1557 |
|
}, |
|
{ |
|
"epoch": 2.5750667488190593, |
|
"grad_norm": 0.14715588092803955, |
|
"learning_rate": 5.424881632631023e-07, |
|
"loss": 0.5726, |
|
"step": 1566 |
|
}, |
|
{ |
|
"epoch": 2.589854179502978, |
|
"grad_norm": 0.13980697095394135, |
|
"learning_rate": 5.059410132578163e-07, |
|
"loss": 0.5723, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 2.6046416101868965, |
|
"grad_norm": 0.14238569140434265, |
|
"learning_rate": 4.7060262101263024e-07, |
|
"loss": 0.5728, |
|
"step": 1584 |
|
}, |
|
{ |
|
"epoch": 2.6194290408708154, |
|
"grad_norm": 0.14448300004005432, |
|
"learning_rate": 4.3648249142188846e-07, |
|
"loss": 0.5744, |
|
"step": 1593 |
|
}, |
|
{ |
|
"epoch": 2.6309303758471967, |
|
"eval_loss": 0.6051778197288513, |
|
"eval_runtime": 51.0525, |
|
"eval_samples_per_second": 493.198, |
|
"eval_steps_per_second": 0.979, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.6342164715547343, |
|
"grad_norm": 0.13831211626529694, |
|
"learning_rate": 4.0358980170634945e-07, |
|
"loss": 0.5713, |
|
"step": 1602 |
|
}, |
|
{ |
|
"epoch": 2.6490039022386527, |
|
"grad_norm": 0.1441497951745987, |
|
"learning_rate": 3.7193339894480486e-07, |
|
"loss": 0.5735, |
|
"step": 1611 |
|
}, |
|
{ |
|
"epoch": 2.6637913329225715, |
|
"grad_norm": 0.14047347009181976, |
|
"learning_rate": 3.41521797694494e-07, |
|
"loss": 0.5722, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.67857876360649, |
|
"grad_norm": 0.14037346839904785, |
|
"learning_rate": 3.1236317770097335e-07, |
|
"loss": 0.5728, |
|
"step": 1629 |
|
}, |
|
{ |
|
"epoch": 2.693366194290409, |
|
"grad_norm": 0.14050990343093872, |
|
"learning_rate": 2.844653816980125e-07, |
|
"loss": 0.5758, |
|
"step": 1638 |
|
}, |
|
{ |
|
"epoch": 2.708153624974327, |
|
"grad_norm": 0.14191824197769165, |
|
"learning_rate": 2.578359132981606e-07, |
|
"loss": 0.5739, |
|
"step": 1647 |
|
}, |
|
{ |
|
"epoch": 2.722941055658246, |
|
"grad_norm": 0.13892725110054016, |
|
"learning_rate": 2.3248193497451331e-07, |
|
"loss": 0.5721, |
|
"step": 1656 |
|
}, |
|
{ |
|
"epoch": 2.7377284863421645, |
|
"grad_norm": 0.14035071432590485, |
|
"learning_rate": 2.0841026613423297e-07, |
|
"loss": 0.5755, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 2.7525159170260833, |
|
"grad_norm": 0.1366022676229477, |
|
"learning_rate": 1.8562738128435066e-07, |
|
"loss": 0.5744, |
|
"step": 1674 |
|
}, |
|
{ |
|
"epoch": 2.767303347710002, |
|
"grad_norm": 0.14230774343013763, |
|
"learning_rate": 1.6413940829033193e-07, |
|
"loss": 0.5707, |
|
"step": 1683 |
|
}, |
|
{ |
|
"epoch": 2.7820907783939206, |
|
"grad_norm": 0.14180122315883636, |
|
"learning_rate": 1.4395212672787373e-07, |
|
"loss": 0.573, |
|
"step": 1692 |
|
}, |
|
{ |
|
"epoch": 2.7952351612240705, |
|
"eval_loss": 0.6049104928970337, |
|
"eval_runtime": 51.0939, |
|
"eval_samples_per_second": 492.799, |
|
"eval_steps_per_second": 0.979, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.7968782090778395, |
|
"grad_norm": 0.13679684698581696, |
|
"learning_rate": 1.2507096632838833e-07, |
|
"loss": 0.5715, |
|
"step": 1701 |
|
}, |
|
{ |
|
"epoch": 2.811665639761758, |
|
"grad_norm": 0.1371249109506607, |
|
"learning_rate": 1.0750100551857546e-07, |
|
"loss": 0.5718, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.8264530704456767, |
|
"grad_norm": 0.1351563036441803, |
|
"learning_rate": 9.124697005449157e-08, |
|
"loss": 0.5728, |
|
"step": 1719 |
|
}, |
|
{ |
|
"epoch": 2.8412405011295956, |
|
"grad_norm": 0.1376628279685974, |
|
"learning_rate": 7.631323175047168e-08, |
|
"loss": 0.5731, |
|
"step": 1728 |
|
}, |
|
{ |
|
"epoch": 2.856027931813514, |
|
"grad_norm": 0.14077013731002808, |
|
"learning_rate": 6.270380730325154e-08, |
|
"loss": 0.5752, |
|
"step": 1737 |
|
}, |
|
{ |
|
"epoch": 2.870815362497433, |
|
"grad_norm": 0.1379421055316925, |
|
"learning_rate": 5.042235721160471e-08, |
|
"loss": 0.5747, |
|
"step": 1746 |
|
}, |
|
{ |
|
"epoch": 2.8856027931813513, |
|
"grad_norm": 0.1414673924446106, |
|
"learning_rate": 3.9472184791786716e-08, |
|
"loss": 0.5714, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 2.90039022386527, |
|
"grad_norm": 0.13783302903175354, |
|
"learning_rate": 2.985623528904913e-08, |
|
"loss": 0.5706, |
|
"step": 1764 |
|
}, |
|
{ |
|
"epoch": 2.915177654549189, |
|
"grad_norm": 0.13526910543441772, |
|
"learning_rate": 2.1577095085460465e-08, |
|
"loss": 0.5711, |
|
"step": 1773 |
|
}, |
|
{ |
|
"epoch": 2.9299650852331074, |
|
"grad_norm": 0.13618573546409607, |
|
"learning_rate": 1.4636991004254864e-08, |
|
"loss": 0.5695, |
|
"step": 1782 |
|
}, |
|
{ |
|
"epoch": 2.9447525159170262, |
|
"grad_norm": 0.13651920855045319, |
|
"learning_rate": 9.037789710887868e-09, |
|
"loss": 0.5721, |
|
"step": 1791 |
|
}, |
|
{ |
|
"epoch": 2.9595399466009447, |
|
"grad_norm": 0.1356479376554489, |
|
"learning_rate": 4.780997210962479e-09, |
|
"loss": 0.5733, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.9595399466009447, |
|
"eval_loss": 0.6048758625984192, |
|
"eval_runtime": 50.6428, |
|
"eval_samples_per_second": 497.188, |
|
"eval_steps_per_second": 0.987, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.9743273772848635, |
|
"grad_norm": 0.1394314020872116, |
|
"learning_rate": 1.867758445161516e-09, |
|
"loss": 0.5713, |
|
"step": 1809 |
|
}, |
|
{ |
|
"epoch": 2.989114807968782, |
|
"grad_norm": 0.14245158433914185, |
|
"learning_rate": 2.988569812972797e-10, |
|
"loss": 0.5716, |
|
"step": 1818 |
|
} |
|
], |
|
"logging_steps": 9, |
|
"max_steps": 1824, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.756360977818969e+21, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|