{
"best_metric": 0.6048758625984192,
"best_model_checkpoint": "/shared/data/meta-llama/Llama-3.1-8B/2_5M/8b_v1/checkpoint-1800",
"epoch": 2.9989730950913946,
"eval_steps": 100,
"global_step": 1824,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001643047853768741,
"eval_loss": 3.5505740642547607,
"eval_runtime": 50.9394,
"eval_samples_per_second": 494.294,
"eval_steps_per_second": 0.982,
"step": 1
},
{
"epoch": 0.014787430683918669,
"grad_norm": 105.67444610595703,
"learning_rate": 9.000000000000001e-07,
"loss": 3.1636,
"step": 9
},
{
"epoch": 0.029574861367837338,
"grad_norm": 46.17672348022461,
"learning_rate": 1.8000000000000001e-06,
"loss": 2.2268,
"step": 18
},
{
"epoch": 0.04436229205175601,
"grad_norm": 23.457378387451172,
"learning_rate": 2.7000000000000004e-06,
"loss": 1.3465,
"step": 27
},
{
"epoch": 0.059149722735674676,
"grad_norm": 1.2907930612564087,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.0364,
"step": 36
},
{
"epoch": 0.07393715341959335,
"grad_norm": 1.3701411485671997,
"learning_rate": 4.5e-06,
"loss": 0.8822,
"step": 45
},
{
"epoch": 0.08872458410351201,
"grad_norm": 1.0450048446655273,
"learning_rate": 5.400000000000001e-06,
"loss": 0.8243,
"step": 54
},
{
"epoch": 0.10351201478743069,
"grad_norm": 1.1930880546569824,
"learning_rate": 6.300000000000001e-06,
"loss": 0.7871,
"step": 63
},
{
"epoch": 0.11829944547134935,
"grad_norm": 0.8426429629325867,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.7663,
"step": 72
},
{
"epoch": 0.133086876155268,
"grad_norm": 1.421916127204895,
"learning_rate": 8.1e-06,
"loss": 0.7494,
"step": 81
},
{
"epoch": 0.1478743068391867,
"grad_norm": 1.322658896446228,
"learning_rate": 9e-06,
"loss": 0.735,
"step": 90
},
{
"epoch": 0.16266173752310537,
"grad_norm": 1.6398061513900757,
"learning_rate": 9.9e-06,
"loss": 0.7285,
"step": 99
},
{
"epoch": 0.1643047853768741,
"eval_loss": 0.747532844543457,
"eval_runtime": 50.8409,
"eval_samples_per_second": 495.251,
"eval_steps_per_second": 0.983,
"step": 100
},
{
"epoch": 0.17744916820702403,
"grad_norm": 1.1485223770141602,
"learning_rate": 9.999468702816552e-06,
"loss": 0.725,
"step": 108
},
{
"epoch": 0.1922365988909427,
"grad_norm": 0.9843918085098267,
"learning_rate": 9.99760101052916e-06,
"loss": 0.7137,
"step": 117
},
{
"epoch": 0.20702402957486138,
"grad_norm": 0.7764966487884521,
"learning_rate": 9.994389123823944e-06,
"loss": 0.7065,
"step": 126
},
{
"epoch": 0.22181146025878004,
"grad_norm": 1.0616685152053833,
"learning_rate": 9.989833906595432e-06,
"loss": 0.7016,
"step": 135
},
{
"epoch": 0.2365988909426987,
"grad_norm": 0.8764758706092834,
"learning_rate": 9.983936584050992e-06,
"loss": 0.6967,
"step": 144
},
{
"epoch": 0.2513863216266174,
"grad_norm": 0.8168648481369019,
"learning_rate": 9.976698742381285e-06,
"loss": 0.6889,
"step": 153
},
{
"epoch": 0.266173752310536,
"grad_norm": 0.8583444952964783,
"learning_rate": 9.968122328333627e-06,
"loss": 0.6904,
"step": 162
},
{
"epoch": 0.2809611829944547,
"grad_norm": 1.1491072177886963,
"learning_rate": 9.958209648688385e-06,
"loss": 0.6968,
"step": 171
},
{
"epoch": 0.2957486136783734,
"grad_norm": 0.9233031868934631,
"learning_rate": 9.946963369638524e-06,
"loss": 0.6845,
"step": 180
},
{
"epoch": 0.31053604436229204,
"grad_norm": 1.02516770362854,
"learning_rate": 9.934386516072483e-06,
"loss": 0.6814,
"step": 189
},
{
"epoch": 0.32532347504621073,
"grad_norm": 0.7226729393005371,
"learning_rate": 9.920482470760577e-06,
"loss": 0.676,
"step": 198
},
{
"epoch": 0.3286095707537482,
"eval_loss": 0.6874070763587952,
"eval_runtime": 50.9242,
"eval_samples_per_second": 494.441,
"eval_steps_per_second": 0.982,
"step": 200
},
{
"epoch": 0.34011090573012936,
"grad_norm": 0.7855489253997803,
"learning_rate": 9.905254973445144e-06,
"loss": 0.6722,
"step": 207
},
{
"epoch": 0.35489833641404805,
"grad_norm": 1.2335143089294434,
"learning_rate": 9.88870811983468e-06,
"loss": 0.6695,
"step": 216
},
{
"epoch": 0.36968576709796674,
"grad_norm": 1.9940311908721924,
"learning_rate": 9.870846360502206e-06,
"loss": 0.6711,
"step": 225
},
{
"epoch": 0.3844731977818854,
"grad_norm": 0.9790201783180237,
"learning_rate": 9.851674499688227e-06,
"loss": 0.6683,
"step": 234
},
{
"epoch": 0.39926062846580407,
"grad_norm": 0.5254144668579102,
"learning_rate": 9.831197694008529e-06,
"loss": 0.6637,
"step": 243
},
{
"epoch": 0.41404805914972276,
"grad_norm": 0.41433659195899963,
"learning_rate": 9.809421451067234e-06,
"loss": 0.6609,
"step": 252
},
{
"epoch": 0.4288354898336414,
"grad_norm": 0.6482775807380676,
"learning_rate": 9.786351627975408e-06,
"loss": 0.6572,
"step": 261
},
{
"epoch": 0.4436229205175601,
"grad_norm": 0.9043383002281189,
"learning_rate": 9.7619944297757e-06,
"loss": 0.6575,
"step": 270
},
{
"epoch": 0.4584103512014787,
"grad_norm": 0.4247804880142212,
"learning_rate": 9.736356407773386e-06,
"loss": 0.6541,
"step": 279
},
{
"epoch": 0.4731977818853974,
"grad_norm": 0.5127539038658142,
"learning_rate": 9.709444457774272e-06,
"loss": 0.6541,
"step": 288
},
{
"epoch": 0.4879852125693161,
"grad_norm": 0.6547645926475525,
"learning_rate": 9.681265818229938e-06,
"loss": 0.6524,
"step": 297
},
{
"epoch": 0.4929143561306223,
"eval_loss": 0.6639789342880249,
"eval_runtime": 50.7468,
"eval_samples_per_second": 496.17,
"eval_steps_per_second": 0.985,
"step": 300
},
{
"epoch": 0.5027726432532348,
"grad_norm": 0.5972119569778442,
"learning_rate": 9.651828068290847e-06,
"loss": 0.651,
"step": 306
},
{
"epoch": 0.5175600739371534,
"grad_norm": 0.424817830324173,
"learning_rate": 9.621139125767774e-06,
"loss": 0.6484,
"step": 315
},
{
"epoch": 0.532347504621072,
"grad_norm": 0.49317240715026855,
"learning_rate": 9.589207245002178e-06,
"loss": 0.6443,
"step": 324
},
{
"epoch": 0.5471349353049908,
"grad_norm": 1.2876074314117432,
"learning_rate": 9.556041014646054e-06,
"loss": 0.6474,
"step": 333
},
{
"epoch": 0.5619223659889094,
"grad_norm": 0.8977052569389343,
"learning_rate": 9.52164935535185e-06,
"loss": 0.648,
"step": 342
},
{
"epoch": 0.5767097966728281,
"grad_norm": 1.3593140840530396,
"learning_rate": 9.486041517373112e-06,
"loss": 0.6446,
"step": 351
},
{
"epoch": 0.5914972273567468,
"grad_norm": 0.9276908040046692,
"learning_rate": 9.449227078076444e-06,
"loss": 0.6438,
"step": 360
},
{
"epoch": 0.6062846580406654,
"grad_norm": 0.7807438373565674,
"learning_rate": 9.411215939365522e-06,
"loss": 0.6418,
"step": 369
},
{
"epoch": 0.6210720887245841,
"grad_norm": 0.7642928957939148,
"learning_rate": 9.372018325017782e-06,
"loss": 0.641,
"step": 378
},
{
"epoch": 0.6358595194085028,
"grad_norm": 1.0264766216278076,
"learning_rate": 9.33164477793457e-06,
"loss": 0.6396,
"step": 387
},
{
"epoch": 0.6506469500924215,
"grad_norm": 1.3351635932922363,
"learning_rate": 9.290106157305424e-06,
"loss": 0.6417,
"step": 396
},
{
"epoch": 0.6572191415074964,
"eval_loss": 0.6507741212844849,
"eval_runtime": 50.7648,
"eval_samples_per_second": 495.993,
"eval_steps_per_second": 0.985,
"step": 400
},
{
"epoch": 0.6654343807763401,
"grad_norm": 0.5435072183609009,
"learning_rate": 9.247413635687308e-06,
"loss": 0.6374,
"step": 405
},
{
"epoch": 0.6802218114602587,
"grad_norm": 5113.79248046875,
"learning_rate": 9.20357869599955e-06,
"loss": 0.6345,
"step": 414
},
{
"epoch": 0.6950092421441775,
"grad_norm": 0.5009027123451233,
"learning_rate": 9.158613128435309e-06,
"loss": 0.6352,
"step": 423
},
{
"epoch": 0.7097966728280961,
"grad_norm": 0.5329261422157288,
"learning_rate": 9.112529027290382e-06,
"loss": 0.6343,
"step": 432
},
{
"epoch": 0.7245841035120147,
"grad_norm": 0.3809993863105774,
"learning_rate": 9.065338787710241e-06,
"loss": 0.6304,
"step": 441
},
{
"epoch": 0.7393715341959335,
"grad_norm": 0.30022993683815,
"learning_rate": 9.017055102356116e-06,
"loss": 0.6311,
"step": 450
},
{
"epoch": 0.7541589648798521,
"grad_norm": 1.3200950622558594,
"learning_rate": 8.967690957991097e-06,
"loss": 0.6314,
"step": 459
},
{
"epoch": 0.7689463955637708,
"grad_norm": 2452.027587890625,
"learning_rate": 8.917259631987099e-06,
"loss": 0.6305,
"step": 468
},
{
"epoch": 0.7837338262476895,
"grad_norm": 1.8559657335281372,
"learning_rate": 8.865774688753673e-06,
"loss": 0.634,
"step": 477
},
{
"epoch": 0.7985212569316081,
"grad_norm": 0.7670221328735352,
"learning_rate": 8.813249976089628e-06,
"loss": 0.6334,
"step": 486
},
{
"epoch": 0.8133086876155268,
"grad_norm": 0.5721145272254944,
"learning_rate": 8.7596996214584e-06,
"loss": 0.6272,
"step": 495
},
{
"epoch": 0.8215239268843705,
"eval_loss": 0.6392470002174377,
"eval_runtime": 50.8387,
"eval_samples_per_second": 495.272,
"eval_steps_per_second": 0.984,
"step": 500
},
{
"epoch": 0.8280961182994455,
"grad_norm": 0.5032740235328674,
"learning_rate": 8.705138028188228e-06,
"loss": 0.6265,
"step": 504
},
{
"epoch": 0.8428835489833642,
"grad_norm": 0.4043797254562378,
"learning_rate": 8.649579871598124e-06,
"loss": 0.6239,
"step": 513
},
{
"epoch": 0.8576709796672828,
"grad_norm": 0.29705339670181274,
"learning_rate": 8.593040095050668e-06,
"loss": 0.6248,
"step": 522
},
{
"epoch": 0.8724584103512015,
"grad_norm": 0.34723684191703796,
"learning_rate": 8.535533905932739e-06,
"loss": 0.6232,
"step": 531
},
{
"epoch": 0.8872458410351202,
"grad_norm": 0.4493858814239502,
"learning_rate": 8.477076771565203e-06,
"loss": 0.621,
"step": 540
},
{
"epoch": 0.9020332717190388,
"grad_norm": 0.2866009473800659,
"learning_rate": 8.417684415042712e-06,
"loss": 0.6212,
"step": 549
},
{
"epoch": 0.9168207024029574,
"grad_norm": 1.456310749053955,
"learning_rate": 8.357372811004678e-06,
"loss": 0.6226,
"step": 558
},
{
"epoch": 0.9316081330868762,
"grad_norm": 0.6134287118911743,
"learning_rate": 8.29615818133863e-06,
"loss": 0.6208,
"step": 567
},
{
"epoch": 0.9463955637707948,
"grad_norm": 0.5051162838935852,
"learning_rate": 8.234056990817025e-06,
"loss": 0.6203,
"step": 576
},
{
"epoch": 0.9611829944547134,
"grad_norm": 0.37639549374580383,
"learning_rate": 8.171085942668765e-06,
"loss": 0.6192,
"step": 585
},
{
"epoch": 0.9759704251386322,
"grad_norm": 0.31022700667381287,
"learning_rate": 8.107261974086562e-06,
"loss": 0.6192,
"step": 594
},
{
"epoch": 0.9858287122612446,
"eval_loss": 0.630632758140564,
"eval_runtime": 50.7072,
"eval_samples_per_second": 496.557,
"eval_steps_per_second": 0.986,
"step": 600
},
{
"epoch": 0.9907578558225508,
"grad_norm": 0.6675688624382019,
"learning_rate": 8.042602251671372e-06,
"loss": 0.6164,
"step": 603
},
{
"epoch": 1.006572191415075,
"grad_norm": 0.48963022232055664,
"learning_rate": 7.977124166815134e-06,
"loss": 0.6806,
"step": 612
},
{
"epoch": 1.0213596220989936,
"grad_norm": 0.404670774936676,
"learning_rate": 7.910845331023043e-06,
"loss": 0.6073,
"step": 621
},
{
"epoch": 1.0361470527829124,
"grad_norm": 0.7819475531578064,
"learning_rate": 7.843783571176617e-06,
"loss": 0.607,
"step": 630
},
{
"epoch": 1.050934483466831,
"grad_norm": 0.7247259020805359,
"learning_rate": 7.77595692473884e-06,
"loss": 0.6047,
"step": 639
},
{
"epoch": 1.0657219141507497,
"grad_norm": 0.3864266574382782,
"learning_rate": 7.707383634902658e-06,
"loss": 0.604,
"step": 648
},
{
"epoch": 1.0805093448346683,
"grad_norm": 0.32667380571365356,
"learning_rate": 7.638082145684161e-06,
"loss": 0.6016,
"step": 657
},
{
"epoch": 1.095296775518587,
"grad_norm": 0.3104453682899475,
"learning_rate": 7.568071096961707e-06,
"loss": 0.6049,
"step": 666
},
{
"epoch": 1.1100842062025056,
"grad_norm": 0.23003344237804413,
"learning_rate": 7.497369319462418e-06,
"loss": 0.6011,
"step": 675
},
{
"epoch": 1.1248716368864242,
"grad_norm": 0.2505716383457184,
"learning_rate": 7.425995829697304e-06,
"loss": 0.6024,
"step": 684
},
{
"epoch": 1.139659067570343,
"grad_norm": 0.25305768847465515,
"learning_rate": 7.353969824846438e-06,
"loss": 0.6031,
"step": 693
},
{
"epoch": 1.1511604025467241,
"eval_loss": 0.6250694990158081,
"eval_runtime": 50.9678,
"eval_samples_per_second": 494.018,
"eval_steps_per_second": 0.981,
"step": 700
},
{
"epoch": 1.1544464982542617,
"grad_norm": 0.21149393916130066,
"learning_rate": 7.281310677595526e-06,
"loss": 0.6011,
"step": 702
},
{
"epoch": 1.1692339289381803,
"grad_norm": 0.6672160029411316,
"learning_rate": 7.208037930925272e-06,
"loss": 0.6018,
"step": 711
},
{
"epoch": 1.184021359622099,
"grad_norm": 1.4521163702011108,
"learning_rate": 7.134171292854957e-06,
"loss": 0.6138,
"step": 720
},
{
"epoch": 1.1988087903060176,
"grad_norm": 0.7164549231529236,
"learning_rate": 7.0597306311415995e-06,
"loss": 0.6061,
"step": 729
},
{
"epoch": 1.2135962209899362,
"grad_norm": 0.464008629322052,
"learning_rate": 6.984735967936173e-06,
"loss": 0.6026,
"step": 738
},
{
"epoch": 1.2283836516738549,
"grad_norm": 0.3417239785194397,
"learning_rate": 6.909207474398283e-06,
"loss": 0.6015,
"step": 747
},
{
"epoch": 1.2431710823577737,
"grad_norm": 0.3259502947330475,
"learning_rate": 6.833165465270786e-06,
"loss": 0.5995,
"step": 756
},
{
"epoch": 1.2579585130416924,
"grad_norm": 0.3659493029117584,
"learning_rate": 6.756630393415755e-06,
"loss": 0.5955,
"step": 765
},
{
"epoch": 1.272745943725611,
"grad_norm": 0.30788564682006836,
"learning_rate": 6.679622844313335e-06,
"loss": 0.5996,
"step": 774
},
{
"epoch": 1.2875333744095296,
"grad_norm": 0.2533837556838989,
"learning_rate": 6.602163530524894e-06,
"loss": 0.5986,
"step": 783
},
{
"epoch": 1.3023208050934483,
"grad_norm": 0.27919843792915344,
"learning_rate": 6.524273286122018e-06,
"loss": 0.5982,
"step": 792
},
{
"epoch": 1.3154651879235983,
"eval_loss": 0.6208261251449585,
"eval_runtime": 50.9478,
"eval_samples_per_second": 494.211,
"eval_steps_per_second": 0.981,
"step": 800
},
{
"epoch": 1.3171082357773671,
"grad_norm": 0.2664376199245453,
"learning_rate": 6.445973061082805e-06,
"loss": 0.5971,
"step": 801
},
{
"epoch": 1.3318956664612858,
"grad_norm": 0.2566661238670349,
"learning_rate": 6.3672839156570056e-06,
"loss": 0.5962,
"step": 810
},
{
"epoch": 1.3466830971452044,
"grad_norm": 0.2494824081659317,
"learning_rate": 6.288227014701473e-06,
"loss": 0.5972,
"step": 819
},
{
"epoch": 1.361470527829123,
"grad_norm": 0.22812236845493317,
"learning_rate": 6.208823621987516e-06,
"loss": 0.5957,
"step": 828
},
{
"epoch": 1.3762579585130417,
"grad_norm": 0.25955238938331604,
"learning_rate": 6.1290950944816065e-06,
"loss": 0.5955,
"step": 837
},
{
"epoch": 1.3910453891969603,
"grad_norm": 0.28508901596069336,
"learning_rate": 6.049062876601057e-06,
"loss": 0.5947,
"step": 846
},
{
"epoch": 1.405832819880879,
"grad_norm": 0.21612407267093658,
"learning_rate": 5.968748494446147e-06,
"loss": 0.5927,
"step": 855
},
{
"epoch": 1.4206202505647978,
"grad_norm": 0.2369757890701294,
"learning_rate": 5.888173550010301e-06,
"loss": 0.592,
"step": 864
},
{
"epoch": 1.4354076812487164,
"grad_norm": 0.22978542745113373,
"learning_rate": 5.807359715369843e-06,
"loss": 0.5944,
"step": 873
},
{
"epoch": 1.450195111932635,
"grad_norm": 0.20011670887470245,
"learning_rate": 5.726328726854896e-06,
"loss": 0.5915,
"step": 882
},
{
"epoch": 1.4649825426165537,
"grad_norm": 0.2324371486902237,
"learning_rate": 5.645102379203018e-06,
"loss": 0.5956,
"step": 891
},
{
"epoch": 1.4797699733004723,
"grad_norm": 0.243639275431633,
"learning_rate": 5.563702519697108e-06,
"loss": 0.5934,
"step": 900
},
{
"epoch": 1.4797699733004723,
"eval_loss": 0.6167550086975098,
"eval_runtime": 50.557,
"eval_samples_per_second": 498.031,
"eval_steps_per_second": 0.989,
"step": 900
},
{
"epoch": 1.494557403984391,
"grad_norm": 0.23125344514846802,
"learning_rate": 5.48215104228919e-06,
"loss": 0.5938,
"step": 909
},
{
"epoch": 1.5093448346683096,
"grad_norm": 0.21182002127170563,
"learning_rate": 5.40046988171164e-06,
"loss": 0.5935,
"step": 918
},
{
"epoch": 1.5241322653522285,
"grad_norm": 0.22478973865509033,
"learning_rate": 5.318681007577455e-06,
"loss": 0.5923,
"step": 927
},
{
"epoch": 1.538919696036147,
"grad_norm": 0.20420202612876892,
"learning_rate": 5.2368064184711136e-06,
"loss": 0.5917,
"step": 936
},
{
"epoch": 1.5537071267200657,
"grad_norm": 0.21112091839313507,
"learning_rate": 5.1548681360316824e-06,
"loss": 0.5922,
"step": 945
},
{
"epoch": 1.5684945574039844,
"grad_norm": 0.21270057559013367,
"learning_rate": 5.0728881990296904e-06,
"loss": 0.5919,
"step": 954
},
{
"epoch": 1.583281988087903,
"grad_norm": 0.21418283879756927,
"learning_rate": 4.990888657439405e-06,
"loss": 0.5907,
"step": 963
},
{
"epoch": 1.5980694187718218,
"grad_norm": 0.2000851035118103,
"learning_rate": 4.9088915665081035e-06,
"loss": 0.5917,
"step": 972
},
{
"epoch": 1.6128568494557403,
"grad_norm": 0.20808270573616028,
"learning_rate": 4.826918980823911e-06,
"loss": 0.5917,
"step": 981
},
{
"epoch": 1.6276442801396591,
"grad_norm": 0.2385932058095932,
"learning_rate": 4.744992948383827e-06,
"loss": 0.5896,
"step": 990
},
{
"epoch": 1.6424317108235778,
"grad_norm": 0.2191840559244156,
"learning_rate": 4.663135504663525e-06,
"loss": 0.5884,
"step": 999
},
{
"epoch": 1.6440747586773465,
"eval_loss": 0.613418698310852,
"eval_runtime": 51.0013,
"eval_samples_per_second": 493.694,
"eval_steps_per_second": 0.98,
"step": 1000
},
{
"epoch": 1.6572191415074964,
"grad_norm": 0.21977262198925018,
"learning_rate": 4.58136866669051e-06,
"loss": 0.5897,
"step": 1008
},
{
"epoch": 1.6720065721914152,
"grad_norm": 0.24079816043376923,
"learning_rate": 4.499714427122242e-06,
"loss": 0.5899,
"step": 1017
},
{
"epoch": 1.6867940028753337,
"grad_norm": 0.20791475474834442,
"learning_rate": 4.418194748330831e-06,
"loss": 0.5901,
"step": 1026
},
{
"epoch": 1.7015814335592525,
"grad_norm": 0.23088951408863068,
"learning_rate": 4.3368315564958415e-06,
"loss": 0.5907,
"step": 1035
},
{
"epoch": 1.716368864243171,
"grad_norm": 0.18772049248218536,
"learning_rate": 4.2556467357068695e-06,
"loss": 0.5897,
"step": 1044
},
{
"epoch": 1.7311562949270898,
"grad_norm": 0.20672886073589325,
"learning_rate": 4.174662122077424e-06,
"loss": 0.5893,
"step": 1053
},
{
"epoch": 1.7459437256110084,
"grad_norm": 0.2176688015460968,
"learning_rate": 4.093899497871701e-06,
"loss": 0.5899,
"step": 1062
},
{
"epoch": 1.760731156294927,
"grad_norm": 0.20923234522342682,
"learning_rate": 4.0133805856458615e-06,
"loss": 0.5887,
"step": 1071
},
{
"epoch": 1.775518586978846,
"grad_norm": 0.23178276419639587,
"learning_rate": 3.933127042405362e-06,
"loss": 0.5878,
"step": 1080
},
{
"epoch": 1.7903060176627643,
"grad_norm": 0.21155081689357758,
"learning_rate": 3.8531604537799075e-06,
"loss": 0.5876,
"step": 1089
},
{
"epoch": 1.8050934483466832,
"grad_norm": 0.20616762340068817,
"learning_rate": 3.7735023282176146e-06,
"loss": 0.587,
"step": 1098
},
{
"epoch": 1.8083795440542207,
"eval_loss": 0.6106312870979309,
"eval_runtime": 51.2953,
"eval_samples_per_second": 490.864,
"eval_steps_per_second": 0.975,
"step": 1100
},
{
"epoch": 1.8198808790306018,
"grad_norm": 0.2053360491991043,
"learning_rate": 3.6941740911999293e-06,
"loss": 0.5901,
"step": 1107
},
{
"epoch": 1.8346683097145204,
"grad_norm": 0.1950492113828659,
"learning_rate": 3.6151970794788525e-06,
"loss": 0.5879,
"step": 1116
},
{
"epoch": 1.849455740398439,
"grad_norm": 0.18794511258602142,
"learning_rate": 3.536592535338046e-06,
"loss": 0.5885,
"step": 1125
},
{
"epoch": 1.8642431710823577,
"grad_norm": 0.21698912978172302,
"learning_rate": 3.4583816008793375e-06,
"loss": 0.587,
"step": 1134
},
{
"epoch": 1.8790306017662766,
"grad_norm": 0.2182021141052246,
"learning_rate": 3.3805853123361687e-06,
"loss": 0.5849,
"step": 1143
},
{
"epoch": 1.893818032450195,
"grad_norm": 0.23845024406909943,
"learning_rate": 3.303224594415528e-06,
"loss": 0.5881,
"step": 1152
},
{
"epoch": 1.9086054631341138,
"grad_norm": 0.19117599725723267,
"learning_rate": 3.226320254669873e-06,
"loss": 0.5871,
"step": 1161
},
{
"epoch": 1.9233928938180325,
"grad_norm": 0.21598902344703674,
"learning_rate": 3.1498929779005637e-06,
"loss": 0.5865,
"step": 1170
},
{
"epoch": 1.938180324501951,
"grad_norm": 0.22297415137290955,
"learning_rate": 3.0739633205943237e-06,
"loss": 0.5841,
"step": 1179
},
{
"epoch": 1.9529677551858697,
"grad_norm": 0.18670949339866638,
"learning_rate": 2.9985517053941926e-06,
"loss": 0.5865,
"step": 1188
},
{
"epoch": 1.9677551858697884,
"grad_norm": 0.20467469096183777,
"learning_rate": 2.9236784156064936e-06,
"loss": 0.5868,
"step": 1197
},
{
"epoch": 1.9726843294310947,
"eval_loss": 0.6083265542984009,
"eval_runtime": 50.99,
"eval_samples_per_second": 493.802,
"eval_steps_per_second": 0.981,
"step": 1200
},
{
"epoch": 1.9825426165537072,
"grad_norm": 0.1806914359331131,
"learning_rate": 2.8493635897452824e-06,
"loss": 0.5875,
"step": 1206
},
{
"epoch": 1.9973300472376256,
"grad_norm": 0.2014399617910385,
"learning_rate": 2.77562721611572e-06,
"loss": 0.5829,
"step": 1215
},
{
"epoch": 2.01314438283015,
"grad_norm": 0.21652302145957947,
"learning_rate": 2.7024891274378695e-06,
"loss": 0.6427,
"step": 1224
},
{
"epoch": 2.0279318135140687,
"grad_norm": 0.2011858969926834,
"learning_rate": 2.629968995512327e-06,
"loss": 0.5777,
"step": 1233
},
{
"epoch": 2.042719244197987,
"grad_norm": 0.19227717816829681,
"learning_rate": 2.5580863259291333e-06,
"loss": 0.5765,
"step": 1242
},
{
"epoch": 2.057506674881906,
"grad_norm": 0.17868341505527496,
"learning_rate": 2.4868604528214042e-06,
"loss": 0.5749,
"step": 1251
},
{
"epoch": 2.072294105565825,
"grad_norm": 0.18664774298667908,
"learning_rate": 2.4163105336650645e-06,
"loss": 0.5742,
"step": 1260
},
{
"epoch": 2.0870815362497432,
"grad_norm": 0.17733176052570343,
"learning_rate": 2.3464555441261016e-06,
"loss": 0.5747,
"step": 1269
},
{
"epoch": 2.101868966933662,
"grad_norm": 0.1523798704147339,
"learning_rate": 2.277314272956715e-06,
"loss": 0.5755,
"step": 1278
},
{
"epoch": 2.1166563976175805,
"grad_norm": 1108.92724609375,
"learning_rate": 2.208905316941754e-06,
"loss": 0.5732,
"step": 1287
},
{
"epoch": 2.1314438283014994,
"grad_norm": 0.1998017281293869,
"learning_rate": 2.1412470758967742e-06,
"loss": 0.5744,
"step": 1296
},
{
"epoch": 2.138016019716574,
"eval_loss": 0.6074865460395813,
"eval_runtime": 51.2662,
"eval_samples_per_second": 491.142,
"eval_steps_per_second": 0.975,
"step": 1300
},
{
"epoch": 2.146231258985418,
"grad_norm": 0.17162367701530457,
"learning_rate": 2.0743577477190714e-06,
"loss": 0.5762,
"step": 1305
},
{
"epoch": 2.1610186896693366,
"grad_norm": 0.16444073617458344,
"learning_rate": 2.0082553234930407e-06,
"loss": 0.5774,
"step": 1314
},
{
"epoch": 2.1758061203532555,
"grad_norm": 0.1596338301897049,
"learning_rate": 1.9429575826511493e-06,
"loss": 0.5764,
"step": 1323
},
{
"epoch": 2.190593551037174,
"grad_norm": 0.16474801301956177,
"learning_rate": 1.8784820881918275e-06,
"loss": 0.5754,
"step": 1332
},
{
"epoch": 2.2053809817210928,
"grad_norm": 0.17859473824501038,
"learning_rate": 1.8148461819556095e-06,
"loss": 0.5748,
"step": 1341
},
{
"epoch": 2.220168412405011,
"grad_norm": 0.16459307074546814,
"learning_rate": 1.752066979960707e-06,
"loss": 0.574,
"step": 1350
},
{
"epoch": 2.23495584308893,
"grad_norm": 0.16431599855422974,
"learning_rate": 1.6901613677993677e-06,
"loss": 0.5739,
"step": 1359
},
{
"epoch": 2.2497432737728484,
"grad_norm": 0.16578958928585052,
"learning_rate": 1.6291459960961886e-06,
"loss": 0.5736,
"step": 1368
},
{
"epoch": 2.2645307044567673,
"grad_norm": 0.17526379227638245,
"learning_rate": 1.5690372760296235e-06,
"loss": 0.5764,
"step": 1377
},
{
"epoch": 2.279318135140686,
"grad_norm": 0.15786723792552948,
"learning_rate": 1.5098513749179156e-06,
"loss": 0.5741,
"step": 1386
},
{
"epoch": 2.2941055658246046,
"grad_norm": 0.1611924022436142,
"learning_rate": 1.451604211870597e-06,
"loss": 0.5725,
"step": 1395
},
{
"epoch": 2.3023208050934483,
"eval_loss": 0.6064820885658264,
"eval_runtime": 51.337,
"eval_samples_per_second": 490.465,
"eval_steps_per_second": 0.974,
"step": 1400
},
{
"epoch": 2.3088929965085234,
"grad_norm": 0.1685233861207962,
"learning_rate": 1.3943114535067632e-06,
"loss": 0.5738,
"step": 1404
},
{
"epoch": 2.323680427192442,
"grad_norm": 0.15319103002548218,
"learning_rate": 1.337988509741255e-06,
"loss": 0.5737,
"step": 1413
},
{
"epoch": 2.3384678578763607,
"grad_norm": 0.15287065505981445,
"learning_rate": 1.2826505296398805e-06,
"loss": 0.5735,
"step": 1422
},
{
"epoch": 2.3532552885602795,
"grad_norm": 0.1517048478126526,
"learning_rate": 1.2283123973448107e-06,
"loss": 0.5734,
"step": 1431
},
{
"epoch": 2.368042719244198,
"grad_norm": 0.14937447011470795,
"learning_rate": 1.1749887280712164e-06,
"loss": 0.5745,
"step": 1440
},
{
"epoch": 2.382830149928117,
"grad_norm": 0.1517946720123291,
"learning_rate": 1.1226938641762464e-06,
"loss": 0.5742,
"step": 1449
},
{
"epoch": 2.3976175806120352,
"grad_norm": 0.1503513902425766,
"learning_rate": 1.0714418713013885e-06,
"loss": 0.5731,
"step": 1458
},
{
"epoch": 2.412405011295954,
"grad_norm": 0.15668976306915283,
"learning_rate": 1.021246534589272e-06,
"loss": 0.5739,
"step": 1467
},
{
"epoch": 2.4271924419798725,
"grad_norm": 0.15115846693515778,
"learning_rate": 9.721213549759011e-07,
"loss": 0.5723,
"step": 1476
},
{
"epoch": 2.4419798726637914,
"grad_norm": 0.1494954228401184,
"learning_rate": 9.24079545559331e-07,
"loss": 0.574,
"step": 1485
},
{
"epoch": 2.4567673033477098,
"grad_norm": 0.14341481029987335,
"learning_rate": 8.771340280457791e-07,
"loss": 0.5742,
"step": 1494
},
{
"epoch": 2.4666255904703225,
"eval_loss": 0.6056612730026245,
"eval_runtime": 51.0039,
"eval_samples_per_second": 493.668,
"eval_steps_per_second": 0.98,
"step": 1500
},
{
"epoch": 2.4715547340316286,
"grad_norm": 0.14807379245758057,
"learning_rate": 8.312974292740938e-07,
"loss": 0.5768,
"step": 1503
},
{
"epoch": 2.4863421647155475,
"grad_norm": 0.14617814123630524,
"learning_rate": 7.865820778195366e-07,
"loss": 0.5742,
"step": 1512
},
{
"epoch": 2.501129595399466,
"grad_norm": 0.14887727797031403,
"learning_rate": 7.430000006778021e-07,
"loss": 0.5732,
"step": 1521
},
{
"epoch": 2.5159170260833847,
"grad_norm": 0.1536111980676651,
"learning_rate": 7.005629200301267e-07,
"loss": 0.5719,
"step": 1530
},
{
"epoch": 2.530704456767303,
"grad_norm": 0.1470237374305725,
"learning_rate": 6.592822500904111e-07,
"loss": 0.5729,
"step": 1539
},
{
"epoch": 2.545491887451222,
"grad_norm": 0.15134099125862122,
"learning_rate": 6.191690940351569e-07,
"loss": 0.5728,
"step": 1548
},
{
"epoch": 2.560279318135141,
"grad_norm": 0.15583239495754242,
"learning_rate": 5.802342410170636e-07,
"loss": 0.5727,
"step": 1557
},
{
"epoch": 2.5750667488190593,
"grad_norm": 0.14715588092803955,
"learning_rate": 5.424881632631023e-07,
"loss": 0.5726,
"step": 1566
},
{
"epoch": 2.589854179502978,
"grad_norm": 0.13980697095394135,
"learning_rate": 5.059410132578163e-07,
"loss": 0.5723,
"step": 1575
},
{
"epoch": 2.6046416101868965,
"grad_norm": 0.14238569140434265,
"learning_rate": 4.7060262101263024e-07,
"loss": 0.5728,
"step": 1584
},
{
"epoch": 2.6194290408708154,
"grad_norm": 0.14448300004005432,
"learning_rate": 4.3648249142188846e-07,
"loss": 0.5744,
"step": 1593
},
{
"epoch": 2.6309303758471967,
"eval_loss": 0.6051778197288513,
"eval_runtime": 51.0525,
"eval_samples_per_second": 493.198,
"eval_steps_per_second": 0.979,
"step": 1600
},
{
"epoch": 2.6342164715547343,
"grad_norm": 0.13831211626529694,
"learning_rate": 4.0358980170634945e-07,
"loss": 0.5713,
"step": 1602
},
{
"epoch": 2.6490039022386527,
"grad_norm": 0.1441497951745987,
"learning_rate": 3.7193339894480486e-07,
"loss": 0.5735,
"step": 1611
},
{
"epoch": 2.6637913329225715,
"grad_norm": 0.14047347009181976,
"learning_rate": 3.41521797694494e-07,
"loss": 0.5722,
"step": 1620
},
{
"epoch": 2.67857876360649,
"grad_norm": 0.14037346839904785,
"learning_rate": 3.1236317770097335e-07,
"loss": 0.5728,
"step": 1629
},
{
"epoch": 2.693366194290409,
"grad_norm": 0.14050990343093872,
"learning_rate": 2.844653816980125e-07,
"loss": 0.5758,
"step": 1638
},
{
"epoch": 2.708153624974327,
"grad_norm": 0.14191824197769165,
"learning_rate": 2.578359132981606e-07,
"loss": 0.5739,
"step": 1647
},
{
"epoch": 2.722941055658246,
"grad_norm": 0.13892725110054016,
"learning_rate": 2.3248193497451331e-07,
"loss": 0.5721,
"step": 1656
},
{
"epoch": 2.7377284863421645,
"grad_norm": 0.14035071432590485,
"learning_rate": 2.0841026613423297e-07,
"loss": 0.5755,
"step": 1665
},
{
"epoch": 2.7525159170260833,
"grad_norm": 0.1366022676229477,
"learning_rate": 1.8562738128435066e-07,
"loss": 0.5744,
"step": 1674
},
{
"epoch": 2.767303347710002,
"grad_norm": 0.14230774343013763,
"learning_rate": 1.6413940829033193e-07,
"loss": 0.5707,
"step": 1683
},
{
"epoch": 2.7820907783939206,
"grad_norm": 0.14180122315883636,
"learning_rate": 1.4395212672787373e-07,
"loss": 0.573,
"step": 1692
},
{
"epoch": 2.7952351612240705,
"eval_loss": 0.6049104928970337,
"eval_runtime": 51.0939,
"eval_samples_per_second": 492.799,
"eval_steps_per_second": 0.979,
"step": 1700
},
{
"epoch": 2.7968782090778395,
"grad_norm": 0.13679684698581696,
"learning_rate": 1.2507096632838833e-07,
"loss": 0.5715,
"step": 1701
},
{
"epoch": 2.811665639761758,
"grad_norm": 0.1371249109506607,
"learning_rate": 1.0750100551857546e-07,
"loss": 0.5718,
"step": 1710
},
{
"epoch": 2.8264530704456767,
"grad_norm": 0.1351563036441803,
"learning_rate": 9.124697005449157e-08,
"loss": 0.5728,
"step": 1719
},
{
"epoch": 2.8412405011295956,
"grad_norm": 0.1376628279685974,
"learning_rate": 7.631323175047168e-08,
"loss": 0.5731,
"step": 1728
},
{
"epoch": 2.856027931813514,
"grad_norm": 0.14077013731002808,
"learning_rate": 6.270380730325154e-08,
"loss": 0.5752,
"step": 1737
},
{
"epoch": 2.870815362497433,
"grad_norm": 0.1379421055316925,
"learning_rate": 5.042235721160471e-08,
"loss": 0.5747,
"step": 1746
},
{
"epoch": 2.8856027931813513,
"grad_norm": 0.1414673924446106,
"learning_rate": 3.9472184791786716e-08,
"loss": 0.5714,
"step": 1755
},
{
"epoch": 2.90039022386527,
"grad_norm": 0.13783302903175354,
"learning_rate": 2.985623528904913e-08,
"loss": 0.5706,
"step": 1764
},
{
"epoch": 2.915177654549189,
"grad_norm": 0.13526910543441772,
"learning_rate": 2.1577095085460465e-08,
"loss": 0.5711,
"step": 1773
},
{
"epoch": 2.9299650852331074,
"grad_norm": 0.13618573546409607,
"learning_rate": 1.4636991004254864e-08,
"loss": 0.5695,
"step": 1782
},
{
"epoch": 2.9447525159170262,
"grad_norm": 0.13651920855045319,
"learning_rate": 9.037789710887868e-09,
"loss": 0.5721,
"step": 1791
},
{
"epoch": 2.9595399466009447,
"grad_norm": 0.1356479376554489,
"learning_rate": 4.780997210962479e-09,
"loss": 0.5733,
"step": 1800
},
{
"epoch": 2.9595399466009447,
"eval_loss": 0.6048758625984192,
"eval_runtime": 50.6428,
"eval_samples_per_second": 497.188,
"eval_steps_per_second": 0.987,
"step": 1800
},
{
"epoch": 2.9743273772848635,
"grad_norm": 0.1394314020872116,
"learning_rate": 1.867758445161516e-09,
"loss": 0.5713,
"step": 1809
},
{
"epoch": 2.989114807968782,
"grad_norm": 0.14245158433914185,
"learning_rate": 2.988569812972797e-10,
"loss": 0.5716,
"step": 1818
}
],
"logging_steps": 9,
"max_steps": 1824,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.756360977818969e+21,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}