{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 888, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016891891891891893, "grad_norm": 6.231681321080043, "learning_rate": 5.555555555555556e-06, "loss": 1.2707, "step": 5 }, { "epoch": 0.033783783783783786, "grad_norm": 4.950872417286887, "learning_rate": 1.1111111111111112e-05, "loss": 1.1132, "step": 10 }, { "epoch": 0.05067567567567568, "grad_norm": 3.967491399410514, "learning_rate": 1.6666666666666667e-05, "loss": 0.9841, "step": 15 }, { "epoch": 0.06756756756756757, "grad_norm": 3.3129251745959687, "learning_rate": 2.2222222222222223e-05, "loss": 0.9505, "step": 20 }, { "epoch": 0.08445945945945946, "grad_norm": 3.2010737153707294, "learning_rate": 2.777777777777778e-05, "loss": 0.9554, "step": 25 }, { "epoch": 0.10135135135135136, "grad_norm": 2.9508900364586625, "learning_rate": 3.3333333333333335e-05, "loss": 0.9444, "step": 30 }, { "epoch": 0.11824324324324324, "grad_norm": 2.8841214572197207, "learning_rate": 3.888888888888889e-05, "loss": 0.9727, "step": 35 }, { "epoch": 0.13513513513513514, "grad_norm": 2.6403601369490213, "learning_rate": 4.4444444444444447e-05, "loss": 0.9681, "step": 40 }, { "epoch": 0.15202702702702703, "grad_norm": 2.71201222699466, "learning_rate": 5e-05, "loss": 1.0178, "step": 45 }, { "epoch": 0.16891891891891891, "grad_norm": 2.4481643042636048, "learning_rate": 4.999609406904966e-05, "loss": 1.0114, "step": 50 }, { "epoch": 0.1858108108108108, "grad_norm": 2.2579319180534676, "learning_rate": 4.99843776323139e-05, "loss": 0.9717, "step": 55 }, { "epoch": 0.20270270270270271, "grad_norm": 2.6055124227126965, "learning_rate": 4.9964854757667645e-05, "loss": 0.9995, "step": 60 }, { "epoch": 0.2195945945945946, "grad_norm": 2.2286636459152467, "learning_rate": 4.993753222333314e-05, "loss": 0.9817, "step": 65 }, { "epoch": 0.23648648648648649, "grad_norm": 2.6802583991959867, "learning_rate": 4.9902419515526604e-05, "loss": 0.9936, "step": 70 }, { "epoch": 0.2533783783783784, "grad_norm": 2.2015258103078423, "learning_rate": 4.985952882516468e-05, "loss": 0.9776, "step": 75 }, { "epoch": 0.2702702702702703, "grad_norm": 1.9745512455187957, "learning_rate": 4.980887504363181e-05, "loss": 0.9709, "step": 80 }, { "epoch": 0.28716216216216217, "grad_norm": 2.691281356514746, "learning_rate": 4.975047575761004e-05, "loss": 0.9705, "step": 85 }, { "epoch": 0.30405405405405406, "grad_norm": 1.8744903969564979, "learning_rate": 4.9684351242973035e-05, "loss": 0.9381, "step": 90 }, { "epoch": 0.32094594594594594, "grad_norm": 2.499819434542776, "learning_rate": 4.961052445774642e-05, "loss": 0.9408, "step": 95 }, { "epoch": 0.33783783783783783, "grad_norm": 2.0213491561872856, "learning_rate": 4.952902103413691e-05, "loss": 0.9576, "step": 100 }, { "epoch": 0.3547297297297297, "grad_norm": 2.007777582220033, "learning_rate": 4.943986926963291e-05, "loss": 0.9538, "step": 105 }, { "epoch": 0.3716216216216216, "grad_norm": 1.8376742795926042, "learning_rate": 4.934310011717988e-05, "loss": 0.9588, "step": 110 }, { "epoch": 0.3885135135135135, "grad_norm": 2.1090401102371423, "learning_rate": 4.923874717443361e-05, "loss": 0.9777, "step": 115 }, { "epoch": 0.40540540540540543, "grad_norm": 1.9167602630034615, "learning_rate": 4.9126846672095325e-05, "loss": 0.9162, "step": 120 }, { "epoch": 0.4222972972972973, "grad_norm": 2.0456990567414644, "learning_rate": 4.900743746133262e-05, "loss": 0.9386, "step": 125 }, { "epoch": 0.4391891891891892, "grad_norm": 1.9096406870985225, "learning_rate": 4.888056100029056e-05, "loss": 0.9624, "step": 130 }, { "epoch": 0.4560810810810811, "grad_norm": 1.9809440749049774, "learning_rate": 4.87462613396977e-05, "loss": 0.9257, "step": 135 }, { "epoch": 0.47297297297297297, "grad_norm": 1.7258775836343794, "learning_rate": 4.860458510757189e-05, "loss": 0.9254, "step": 140 }, { "epoch": 0.48986486486486486, "grad_norm": 1.9425639247796285, "learning_rate": 4.845558149303135e-05, "loss": 0.9476, "step": 145 }, { "epoch": 0.5067567567567568, "grad_norm": 1.7602777895956392, "learning_rate": 4.8299302229216506e-05, "loss": 0.9615, "step": 150 }, { "epoch": 0.5236486486486487, "grad_norm": 1.6992288952271115, "learning_rate": 4.8135801575328546e-05, "loss": 0.8942, "step": 155 }, { "epoch": 0.5405405405405406, "grad_norm": 1.698637849222003, "learning_rate": 4.796513629779098e-05, "loss": 0.8944, "step": 160 }, { "epoch": 0.5574324324324325, "grad_norm": 1.5467487223203435, "learning_rate": 4.7787365650540663e-05, "loss": 0.9153, "step": 165 }, { "epoch": 0.5743243243243243, "grad_norm": 1.6334496828072615, "learning_rate": 4.760255135445521e-05, "loss": 0.9248, "step": 170 }, { "epoch": 0.5912162162162162, "grad_norm": 1.6259508720656166, "learning_rate": 4.741075757592389e-05, "loss": 0.8847, "step": 175 }, { "epoch": 0.6081081081081081, "grad_norm": 1.790410553077543, "learning_rate": 4.721205090456941e-05, "loss": 0.9289, "step": 180 }, { "epoch": 0.625, "grad_norm": 1.8532634400660235, "learning_rate": 4.7006500330128473e-05, "loss": 0.9015, "step": 185 }, { "epoch": 0.6418918918918919, "grad_norm": 1.7875929509109747, "learning_rate": 4.679417721849889e-05, "loss": 0.8905, "step": 190 }, { "epoch": 0.6587837837837838, "grad_norm": 1.5944547606006951, "learning_rate": 4.657515528696183e-05, "loss": 0.8929, "step": 195 }, { "epoch": 0.6756756756756757, "grad_norm": 1.5660749975109045, "learning_rate": 4.6349510578587635e-05, "loss": 0.8757, "step": 200 }, { "epoch": 0.6925675675675675, "grad_norm": 1.6423933991915096, "learning_rate": 4.61173214358341e-05, "loss": 0.8934, "step": 205 }, { "epoch": 0.7094594594594594, "grad_norm": 1.559933642024552, "learning_rate": 4.5878668473346464e-05, "loss": 0.9102, "step": 210 }, { "epoch": 0.7263513513513513, "grad_norm": 1.5955677484161037, "learning_rate": 4.5633634549968514e-05, "loss": 0.8886, "step": 215 }, { "epoch": 0.7432432432432432, "grad_norm": 1.5548188228815374, "learning_rate": 4.538230473997449e-05, "loss": 0.8737, "step": 220 }, { "epoch": 0.7601351351351351, "grad_norm": 1.6430486936390023, "learning_rate": 4.512476630353183e-05, "loss": 0.8651, "step": 225 }, { "epoch": 0.777027027027027, "grad_norm": 1.497167470194, "learning_rate": 4.4861108656404955e-05, "loss": 0.8724, "step": 230 }, { "epoch": 0.793918918918919, "grad_norm": 1.4259126734193541, "learning_rate": 4.459142333891067e-05, "loss": 0.8905, "step": 235 }, { "epoch": 0.8108108108108109, "grad_norm": 1.4831105867234027, "learning_rate": 4.4315803984135965e-05, "loss": 0.8619, "step": 240 }, { "epoch": 0.8277027027027027, "grad_norm": 1.5825985585703166, "learning_rate": 4.403434628542914e-05, "loss": 0.8619, "step": 245 }, { "epoch": 0.8445945945945946, "grad_norm": 1.5675917133492445, "learning_rate": 4.374714796317566e-05, "loss": 0.9152, "step": 250 }, { "epoch": 0.8614864864864865, "grad_norm": 1.5361276988078942, "learning_rate": 4.345430873087026e-05, "loss": 0.8688, "step": 255 }, { "epoch": 0.8783783783783784, "grad_norm": 1.369553191214301, "learning_rate": 4.315593026049703e-05, "loss": 0.851, "step": 260 }, { "epoch": 0.8952702702702703, "grad_norm": 1.6638541563918563, "learning_rate": 4.28521161472295e-05, "loss": 0.8734, "step": 265 }, { "epoch": 0.9121621621621622, "grad_norm": 1.441713025001977, "learning_rate": 4.2542971873463075e-05, "loss": 0.8433, "step": 270 }, { "epoch": 0.9290540540540541, "grad_norm": 1.4960913750522002, "learning_rate": 4.222860477219215e-05, "loss": 0.8442, "step": 275 }, { "epoch": 0.9459459459459459, "grad_norm": 1.462307887191494, "learning_rate": 4.190912398974478e-05, "loss": 0.8515, "step": 280 }, { "epoch": 0.9628378378378378, "grad_norm": 1.3528940388287598, "learning_rate": 4.158464044788774e-05, "loss": 0.8438, "step": 285 }, { "epoch": 0.9797297297297297, "grad_norm": 1.4502619959473744, "learning_rate": 4.125526680531517e-05, "loss": 0.8417, "step": 290 }, { "epoch": 0.9966216216216216, "grad_norm": 1.3664408170375748, "learning_rate": 4.092111741853414e-05, "loss": 0.8355, "step": 295 }, { "epoch": 1.0135135135135136, "grad_norm": 1.48215782409135, "learning_rate": 4.058230830216084e-05, "loss": 0.605, "step": 300 }, { "epoch": 1.0304054054054055, "grad_norm": 1.325024180793296, "learning_rate": 4.0238957088640935e-05, "loss": 0.5206, "step": 305 }, { "epoch": 1.0472972972972974, "grad_norm": 1.5992825466313396, "learning_rate": 3.989118298740837e-05, "loss": 0.5089, "step": 310 }, { "epoch": 1.0641891891891893, "grad_norm": 1.4273213416805155, "learning_rate": 3.953910674349652e-05, "loss": 0.4936, "step": 315 }, { "epoch": 1.0810810810810811, "grad_norm": 1.3287056923860654, "learning_rate": 3.9182850595616334e-05, "loss": 0.4983, "step": 320 }, { "epoch": 1.097972972972973, "grad_norm": 1.4134836690592798, "learning_rate": 3.882253823371574e-05, "loss": 0.4897, "step": 325 }, { "epoch": 1.114864864864865, "grad_norm": 1.302428194474921, "learning_rate": 3.8458294756035284e-05, "loss": 0.511, "step": 330 }, { "epoch": 1.1317567567567568, "grad_norm": 1.4644457082188613, "learning_rate": 3.809024662567478e-05, "loss": 0.5208, "step": 335 }, { "epoch": 1.1486486486486487, "grad_norm": 1.7300512446405285, "learning_rate": 3.771852162668611e-05, "loss": 0.5015, "step": 340 }, { "epoch": 1.1655405405405406, "grad_norm": 1.467044194490883, "learning_rate": 3.734324881970736e-05, "loss": 0.5272, "step": 345 }, { "epoch": 1.1824324324324325, "grad_norm": 1.2979658306396822, "learning_rate": 3.69645584971538e-05, "loss": 0.5032, "step": 350 }, { "epoch": 1.1993243243243243, "grad_norm": 1.3933445860127016, "learning_rate": 3.658258213798108e-05, "loss": 0.4999, "step": 355 }, { "epoch": 1.2162162162162162, "grad_norm": 1.3589175020707676, "learning_rate": 3.619745236203666e-05, "loss": 0.5183, "step": 360 }, { "epoch": 1.2331081081081081, "grad_norm": 1.4074778914667276, "learning_rate": 3.580930288401491e-05, "loss": 0.5073, "step": 365 }, { "epoch": 1.25, "grad_norm": 1.3086984542811302, "learning_rate": 3.541826846703224e-05, "loss": 0.4942, "step": 370 }, { "epoch": 1.2668918918918919, "grad_norm": 1.4381479516212257, "learning_rate": 3.5024484875838145e-05, "loss": 0.5071, "step": 375 }, { "epoch": 1.2837837837837838, "grad_norm": 1.2976783376198013, "learning_rate": 3.462808882967856e-05, "loss": 0.5278, "step": 380 }, { "epoch": 1.3006756756756757, "grad_norm": 1.3793720906248872, "learning_rate": 3.4229217954827716e-05, "loss": 0.5166, "step": 385 }, { "epoch": 1.3175675675675675, "grad_norm": 1.3862580674531486, "learning_rate": 3.3828010736805185e-05, "loss": 0.5077, "step": 390 }, { "epoch": 1.3344594594594594, "grad_norm": 1.3562667437179257, "learning_rate": 3.342460647229459e-05, "loss": 0.5048, "step": 395 }, { "epoch": 1.3513513513513513, "grad_norm": 1.374243227751759, "learning_rate": 3.301914522078055e-05, "loss": 0.4957, "step": 400 }, { "epoch": 1.3682432432432432, "grad_norm": 1.3910923941785058, "learning_rate": 3.261176775592097e-05, "loss": 0.4864, "step": 405 }, { "epoch": 1.385135135135135, "grad_norm": 1.4190426820456954, "learning_rate": 3.220261551667128e-05, "loss": 0.4908, "step": 410 }, { "epoch": 1.402027027027027, "grad_norm": 1.3067413904671548, "learning_rate": 3.179183055817767e-05, "loss": 0.4997, "step": 415 }, { "epoch": 1.4189189189189189, "grad_norm": 1.4464186039630254, "learning_rate": 3.137955550245643e-05, "loss": 0.4888, "step": 420 }, { "epoch": 1.4358108108108107, "grad_norm": 1.3944491264799062, "learning_rate": 3.096593348887647e-05, "loss": 0.5026, "step": 425 }, { "epoch": 1.4527027027027026, "grad_norm": 1.2998412690334573, "learning_rate": 3.055110812446221e-05, "loss": 0.4755, "step": 430 }, { "epoch": 1.4695945945945945, "grad_norm": 1.277085043268365, "learning_rate": 3.0135223434034053e-05, "loss": 0.4703, "step": 435 }, { "epoch": 1.4864864864864864, "grad_norm": 1.2464902265161508, "learning_rate": 2.9718423810203944e-05, "loss": 0.5012, "step": 440 }, { "epoch": 1.5033783783783785, "grad_norm": 1.289453338271534, "learning_rate": 2.9300853963243056e-05, "loss": 0.5004, "step": 445 }, { "epoch": 1.5202702702702702, "grad_norm": 1.2772690540474154, "learning_rate": 2.8882658870839326e-05, "loss": 0.493, "step": 450 }, { "epoch": 1.5371621621621623, "grad_norm": 1.286041286434996, "learning_rate": 2.8463983727762046e-05, "loss": 0.5165, "step": 455 }, { "epoch": 1.554054054054054, "grad_norm": 1.3335486353524793, "learning_rate": 2.8044973895451166e-05, "loss": 0.4804, "step": 460 }, { "epoch": 1.570945945945946, "grad_norm": 1.3917515334345343, "learning_rate": 2.7625774851548708e-05, "loss": 0.4764, "step": 465 }, { "epoch": 1.5878378378378377, "grad_norm": 1.3172805254924786, "learning_rate": 2.7206532139389767e-05, "loss": 0.4754, "step": 470 }, { "epoch": 1.6047297297297298, "grad_norm": 1.3246558687025178, "learning_rate": 2.67873913174708e-05, "loss": 0.4858, "step": 475 }, { "epoch": 1.6216216216216215, "grad_norm": 1.3328679629033688, "learning_rate": 2.6368497908912592e-05, "loss": 0.5026, "step": 480 }, { "epoch": 1.6385135135135136, "grad_norm": 1.343169827036629, "learning_rate": 2.5949997350935534e-05, "loss": 0.487, "step": 485 }, { "epoch": 1.6554054054054053, "grad_norm": 1.2317833751218523, "learning_rate": 2.5532034944364712e-05, "loss": 0.4796, "step": 490 }, { "epoch": 1.6722972972972974, "grad_norm": 1.25207138650032, "learning_rate": 2.511475580318233e-05, "loss": 0.4685, "step": 495 }, { "epoch": 1.689189189189189, "grad_norm": 1.2916401389153942, "learning_rate": 2.4698304804145038e-05, "loss": 0.4732, "step": 500 }, { "epoch": 1.7060810810810811, "grad_norm": 1.309320360265987, "learning_rate": 2.428282653648367e-05, "loss": 0.4758, "step": 505 }, { "epoch": 1.722972972972973, "grad_norm": 1.2227129579363303, "learning_rate": 2.386846525170263e-05, "loss": 0.4752, "step": 510 }, { "epoch": 1.739864864864865, "grad_norm": 1.3338028513859421, "learning_rate": 2.3455364813496755e-05, "loss": 0.4705, "step": 515 }, { "epoch": 1.7567567567567568, "grad_norm": 1.2501373969082323, "learning_rate": 2.3043668647802687e-05, "loss": 0.4633, "step": 520 }, { "epoch": 1.7736486486486487, "grad_norm": 1.2830950543814998, "learning_rate": 2.2633519693002247e-05, "loss": 0.4889, "step": 525 }, { "epoch": 1.7905405405405406, "grad_norm": 1.2838795731857355, "learning_rate": 2.2225060350295184e-05, "loss": 0.4634, "step": 530 }, { "epoch": 1.8074324324324325, "grad_norm": 1.2938941133398314, "learning_rate": 2.181843243425824e-05, "loss": 0.4639, "step": 535 }, { "epoch": 1.8243243243243243, "grad_norm": 1.3650137810330223, "learning_rate": 2.1413777123608103e-05, "loss": 0.4643, "step": 540 }, { "epoch": 1.8412162162162162, "grad_norm": 1.3447974317742337, "learning_rate": 2.1011234912184942e-05, "loss": 0.4533, "step": 545 }, { "epoch": 1.8581081081081081, "grad_norm": 1.31773148023275, "learning_rate": 2.0610945560173865e-05, "loss": 0.4624, "step": 550 }, { "epoch": 1.875, "grad_norm": 1.2818032468001688, "learning_rate": 2.0213048045581014e-05, "loss": 0.4455, "step": 555 }, { "epoch": 1.8918918918918919, "grad_norm": 1.3353447408158947, "learning_rate": 1.9817680515981274e-05, "loss": 0.4638, "step": 560 }, { "epoch": 1.9087837837837838, "grad_norm": 1.3297365957452907, "learning_rate": 1.9424980240554236e-05, "loss": 0.4716, "step": 565 }, { "epoch": 1.9256756756756757, "grad_norm": 1.1934548733440697, "learning_rate": 1.903508356242525e-05, "loss": 0.441, "step": 570 }, { "epoch": 1.9425675675675675, "grad_norm": 1.2911193562484828, "learning_rate": 1.864812585132787e-05, "loss": 0.4341, "step": 575 }, { "epoch": 1.9594594594594594, "grad_norm": 1.284850775623703, "learning_rate": 1.8264241456604324e-05, "loss": 0.448, "step": 580 }, { "epoch": 1.9763513513513513, "grad_norm": 1.2349841173785179, "learning_rate": 1.78835636605603e-05, "loss": 0.4423, "step": 585 }, { "epoch": 1.9932432432432432, "grad_norm": 1.2146186214149826, "learning_rate": 1.7506224632190065e-05, "loss": 0.4409, "step": 590 }, { "epoch": 2.010135135135135, "grad_norm": 1.0086827563539977, "learning_rate": 1.7132355381288255e-05, "loss": 0.2777, "step": 595 }, { "epoch": 2.027027027027027, "grad_norm": 1.342303752922279, "learning_rate": 1.676208571296408e-05, "loss": 0.1598, "step": 600 }, { "epoch": 2.043918918918919, "grad_norm": 1.113941499846872, "learning_rate": 1.6395544182573753e-05, "loss": 0.1611, "step": 605 }, { "epoch": 2.060810810810811, "grad_norm": 1.019799238243946, "learning_rate": 1.603285805108689e-05, "loss": 0.1604, "step": 610 }, { "epoch": 2.0777027027027026, "grad_norm": 1.1412888682882492, "learning_rate": 1.5674153240902257e-05, "loss": 0.1554, "step": 615 }, { "epoch": 2.0945945945945947, "grad_norm": 1.1509735977910904, "learning_rate": 1.5319554292128323e-05, "loss": 0.148, "step": 620 }, { "epoch": 2.1114864864864864, "grad_norm": 1.0040685111146865, "learning_rate": 1.4969184319343654e-05, "loss": 0.1515, "step": 625 }, { "epoch": 2.1283783783783785, "grad_norm": 1.1136156062268279, "learning_rate": 1.4623164968852327e-05, "loss": 0.1552, "step": 630 }, { "epoch": 2.14527027027027, "grad_norm": 1.0777999627533803, "learning_rate": 1.4281616376449048e-05, "loss": 0.1562, "step": 635 }, { "epoch": 2.1621621621621623, "grad_norm": 1.0504060905612465, "learning_rate": 1.394465712570876e-05, "loss": 0.1522, "step": 640 }, { "epoch": 2.179054054054054, "grad_norm": 1.0905056621961537, "learning_rate": 1.3612404206815144e-05, "loss": 0.1473, "step": 645 }, { "epoch": 2.195945945945946, "grad_norm": 1.19009398992958, "learning_rate": 1.3284972975942367e-05, "loss": 0.1516, "step": 650 }, { "epoch": 2.2128378378378377, "grad_norm": 1.1365455436625145, "learning_rate": 1.2962477115204094e-05, "loss": 0.1497, "step": 655 }, { "epoch": 2.22972972972973, "grad_norm": 1.1587962328316135, "learning_rate": 1.2645028593183763e-05, "loss": 0.1422, "step": 660 }, { "epoch": 2.2466216216216215, "grad_norm": 1.0337185281275136, "learning_rate": 1.233273762605982e-05, "loss": 0.1473, "step": 665 }, { "epoch": 2.2635135135135136, "grad_norm": 1.1195019763510503, "learning_rate": 1.2025712639339268e-05, "loss": 0.1529, "step": 670 }, { "epoch": 2.2804054054054053, "grad_norm": 1.0396864482228654, "learning_rate": 1.1724060230213075e-05, "loss": 0.1502, "step": 675 }, { "epoch": 2.2972972972972974, "grad_norm": 1.077156584604291, "learning_rate": 1.1427885130546207e-05, "loss": 0.1491, "step": 680 }, { "epoch": 2.314189189189189, "grad_norm": 1.0424824768401733, "learning_rate": 1.1137290170515429e-05, "loss": 0.1442, "step": 685 }, { "epoch": 2.331081081081081, "grad_norm": 1.007784936402244, "learning_rate": 1.085237624290726e-05, "loss": 0.1476, "step": 690 }, { "epoch": 2.347972972972973, "grad_norm": 1.0916964057550667, "learning_rate": 1.0573242268088629e-05, "loss": 0.1394, "step": 695 }, { "epoch": 2.364864864864865, "grad_norm": 1.0392881227066229, "learning_rate": 1.0299985159662348e-05, "loss": 0.1418, "step": 700 }, { "epoch": 2.3817567567567566, "grad_norm": 1.0610168893707066, "learning_rate": 1.0032699790819288e-05, "loss": 0.1486, "step": 705 }, { "epoch": 2.3986486486486487, "grad_norm": 1.016875434548862, "learning_rate": 9.77147896139897e-06, "loss": 0.1358, "step": 710 }, { "epoch": 2.4155405405405403, "grad_norm": 1.111072857598883, "learning_rate": 9.516413365670063e-06, "loss": 0.1527, "step": 715 }, { "epoch": 2.4324324324324325, "grad_norm": 1.0166047165710437, "learning_rate": 9.267591560841876e-06, "loss": 0.1432, "step": 720 }, { "epoch": 2.4493243243243246, "grad_norm": 1.0288128507258207, "learning_rate": 9.02509993631784e-06, "loss": 0.1381, "step": 725 }, { "epoch": 2.4662162162162162, "grad_norm": 1.067559654789781, "learning_rate": 8.789022683701629e-06, "loss": 0.144, "step": 730 }, { "epoch": 2.483108108108108, "grad_norm": 1.0329686983765223, "learning_rate": 8.559441767566378e-06, "loss": 0.143, "step": 735 }, { "epoch": 2.5, "grad_norm": 1.002825285922701, "learning_rate": 8.336436896997063e-06, "loss": 0.1297, "step": 740 }, { "epoch": 2.516891891891892, "grad_norm": 1.0924011480602662, "learning_rate": 8.120085497915995e-06, "loss": 0.139, "step": 745 }, { "epoch": 2.5337837837837838, "grad_norm": 1.09246056663418, "learning_rate": 7.91046268620102e-06, "loss": 0.1411, "step": 750 }, { "epoch": 2.5506756756756754, "grad_norm": 1.0967904913726172, "learning_rate": 7.70764124160576e-06, "loss": 0.1375, "step": 755 }, { "epoch": 2.5675675675675675, "grad_norm": 0.9863979550244457, "learning_rate": 7.5116915824908985e-06, "loss": 0.1337, "step": 760 }, { "epoch": 2.5844594594594597, "grad_norm": 1.0802259488346817, "learning_rate": 7.322681741375405e-06, "loss": 0.1353, "step": 765 }, { "epoch": 2.6013513513513513, "grad_norm": 1.1488305384571615, "learning_rate": 7.140677341316013e-06, "loss": 0.1392, "step": 770 }, { "epoch": 2.618243243243243, "grad_norm": 1.0654092076468062, "learning_rate": 6.96574157312333e-06, "loss": 0.1289, "step": 775 }, { "epoch": 2.635135135135135, "grad_norm": 1.0616589088316162, "learning_rate": 6.797935173422361e-06, "loss": 0.1296, "step": 780 }, { "epoch": 2.652027027027027, "grad_norm": 1.0808128072586296, "learning_rate": 6.6373164035651246e-06, "loss": 0.134, "step": 785 }, { "epoch": 2.668918918918919, "grad_norm": 1.0229579739214567, "learning_rate": 6.483941029402675e-06, "loss": 0.1297, "step": 790 }, { "epoch": 2.685810810810811, "grad_norm": 1.0341349547813405, "learning_rate": 6.337862301923552e-06, "loss": 0.129, "step": 795 }, { "epoch": 2.7027027027027026, "grad_norm": 0.9953533379030133, "learning_rate": 6.199130938765337e-06, "loss": 0.1292, "step": 800 }, { "epoch": 2.7195945945945947, "grad_norm": 0.9916711500948964, "learning_rate": 6.067795106605817e-06, "loss": 0.123, "step": 805 }, { "epoch": 2.7364864864864864, "grad_norm": 1.1347113646519535, "learning_rate": 5.943900404439816e-06, "loss": 0.1245, "step": 810 }, { "epoch": 2.7533783783783785, "grad_norm": 1.0854345457051158, "learning_rate": 5.827489847747492e-06, "loss": 0.1236, "step": 815 }, { "epoch": 2.77027027027027, "grad_norm": 1.0669578176856562, "learning_rate": 5.718603853559626e-06, "loss": 0.1289, "step": 820 }, { "epoch": 2.7871621621621623, "grad_norm": 1.0718894223631414, "learning_rate": 5.617280226425088e-06, "loss": 0.124, "step": 825 }, { "epoch": 2.804054054054054, "grad_norm": 1.0280712526002755, "learning_rate": 5.5235541452853204e-06, "loss": 0.1245, "step": 830 }, { "epoch": 2.820945945945946, "grad_norm": 1.0735185449716749, "learning_rate": 5.437458151260425e-06, "loss": 0.1258, "step": 835 }, { "epoch": 2.8378378378378377, "grad_norm": 0.9948777016385901, "learning_rate": 5.3590221363510965e-06, "loss": 0.1203, "step": 840 }, { "epoch": 2.85472972972973, "grad_norm": 1.0444692285037418, "learning_rate": 5.288273333060287e-06, "loss": 0.1279, "step": 845 }, { "epoch": 2.8716216216216215, "grad_norm": 1.0593788199176337, "learning_rate": 5.2252363049382626e-06, "loss": 0.1203, "step": 850 }, { "epoch": 2.8885135135135136, "grad_norm": 1.0224848591397384, "learning_rate": 5.169932938054281e-06, "loss": 0.1242, "step": 855 }, { "epoch": 2.9054054054054053, "grad_norm": 1.0109605507100674, "learning_rate": 5.122382433397887e-06, "loss": 0.1189, "step": 860 }, { "epoch": 2.9222972972972974, "grad_norm": 1.024940692386505, "learning_rate": 5.082601300212445e-06, "loss": 0.122, "step": 865 }, { "epoch": 2.939189189189189, "grad_norm": 1.1348408537056638, "learning_rate": 5.050603350263229e-06, "loss": 0.1272, "step": 870 }, { "epoch": 2.956081081081081, "grad_norm": 1.0587237780098981, "learning_rate": 5.026399693042064e-06, "loss": 0.122, "step": 875 }, { "epoch": 2.972972972972973, "grad_norm": 1.0526311563986077, "learning_rate": 5.009998731910186e-06, "loss": 0.1239, "step": 880 }, { "epoch": 2.989864864864865, "grad_norm": 1.079605600798441, "learning_rate": 5.001406161180633e-06, "loss": 0.1251, "step": 885 }, { "epoch": 3.0, "step": 888, "total_flos": 412579921461248.0, "train_loss": 0.5179592104175607, "train_runtime": 13894.0345, "train_samples_per_second": 2.043, "train_steps_per_second": 0.064 } ], "logging_steps": 5, "max_steps": 888, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 412579921461248.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }