{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995764506565015, "eval_steps": 500, "global_step": 590, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00847098686997035, "grad_norm": 13.040608406066895, "learning_rate": 1.6949152542372882e-06, "loss": 1.8317, "step": 5 }, { "epoch": 0.0169419737399407, "grad_norm": 7.28167200088501, "learning_rate": 3.3898305084745763e-06, "loss": 1.7467, "step": 10 }, { "epoch": 0.025412960609911054, "grad_norm": 3.6362063884735107, "learning_rate": 5.084745762711865e-06, "loss": 1.4691, "step": 15 }, { "epoch": 0.0338839474798814, "grad_norm": 2.443824291229248, "learning_rate": 6.779661016949153e-06, "loss": 1.1816, "step": 20 }, { "epoch": 0.042354934349851756, "grad_norm": 1.9287770986557007, "learning_rate": 8.47457627118644e-06, "loss": 1.0028, "step": 25 }, { "epoch": 0.05082592121982211, "grad_norm": 2.287205696105957, "learning_rate": 1.016949152542373e-05, "loss": 0.8979, "step": 30 }, { "epoch": 0.05929690808979246, "grad_norm": 1.8406411409378052, "learning_rate": 1.1864406779661018e-05, "loss": 0.8245, "step": 35 }, { "epoch": 0.0677678949597628, "grad_norm": 1.597153663635254, "learning_rate": 1.3559322033898305e-05, "loss": 0.801, "step": 40 }, { "epoch": 0.07623888182973317, "grad_norm": 1.4034295082092285, "learning_rate": 1.5254237288135594e-05, "loss": 0.7545, "step": 45 }, { "epoch": 0.08470986869970351, "grad_norm": 3.763864517211914, "learning_rate": 1.694915254237288e-05, "loss": 0.7336, "step": 50 }, { "epoch": 0.09318085556967387, "grad_norm": 1.451323390007019, "learning_rate": 1.864406779661017e-05, "loss": 0.7121, "step": 55 }, { "epoch": 0.10165184243964422, "grad_norm": 1.5066395998001099, "learning_rate": 1.9999824983320176e-05, "loss": 0.6881, "step": 60 }, { "epoch": 0.11012282930961458, "grad_norm": 1.4491875171661377, "learning_rate": 1.9993700042749937e-05, "loss": 0.673, "step": 65 }, { "epoch": 0.11859381617958492, "grad_norm": 1.27461838722229, "learning_rate": 1.9978830393392338e-05, "loss": 0.6507, "step": 70 }, { "epoch": 0.12706480304955528, "grad_norm": 1.191027045249939, "learning_rate": 1.995522904651977e-05, "loss": 0.6384, "step": 75 }, { "epoch": 0.1355357899195256, "grad_norm": 1.237403392791748, "learning_rate": 1.992291665383325e-05, "loss": 0.6637, "step": 80 }, { "epoch": 0.14400677678949597, "grad_norm": 1.1272355318069458, "learning_rate": 1.9881921489391738e-05, "loss": 0.635, "step": 85 }, { "epoch": 0.15247776365946633, "grad_norm": 1.168715476989746, "learning_rate": 1.983227942487172e-05, "loss": 0.6445, "step": 90 }, { "epoch": 0.1609487505294367, "grad_norm": 1.050166130065918, "learning_rate": 1.9774033898178668e-05, "loss": 0.6412, "step": 95 }, { "epoch": 0.16941973739940702, "grad_norm": 1.268314003944397, "learning_rate": 1.9707235875437932e-05, "loss": 0.6233, "step": 100 }, { "epoch": 0.17789072426937738, "grad_norm": 1.3509944677352905, "learning_rate": 1.963194380639825e-05, "loss": 0.6138, "step": 105 }, { "epoch": 0.18636171113934774, "grad_norm": 1.1367347240447998, "learning_rate": 1.954822357328692e-05, "loss": 0.6045, "step": 110 }, { "epoch": 0.19483269800931807, "grad_norm": 1.1019567251205444, "learning_rate": 1.9456148433161387e-05, "loss": 0.6176, "step": 115 }, { "epoch": 0.20330368487928843, "grad_norm": 1.0358731746673584, "learning_rate": 1.9355798953807715e-05, "loss": 0.5925, "step": 120 }, { "epoch": 0.2117746717492588, "grad_norm": 1.0830167531967163, "learning_rate": 1.924726294324196e-05, "loss": 0.6107, "step": 125 }, { "epoch": 0.22024565861922915, "grad_norm": 1.0958232879638672, "learning_rate": 1.9130635372876245e-05, "loss": 0.5953, "step": 130 }, { "epoch": 0.22871664548919948, "grad_norm": 1.0959957838058472, "learning_rate": 1.9006018294416648e-05, "loss": 0.6107, "step": 135 }, { "epoch": 0.23718763235916984, "grad_norm": 1.1108900308609009, "learning_rate": 1.8873520750565716e-05, "loss": 0.5817, "step": 140 }, { "epoch": 0.2456586192291402, "grad_norm": 1.057532787322998, "learning_rate": 1.8733258679607674e-05, "loss": 0.6108, "step": 145 }, { "epoch": 0.25412960609911056, "grad_norm": 1.2086259126663208, "learning_rate": 1.858535481395986e-05, "loss": 0.5875, "step": 150 }, { "epoch": 0.2626005929690809, "grad_norm": 1.0949163436889648, "learning_rate": 1.8429938572779154e-05, "loss": 0.5951, "step": 155 }, { "epoch": 0.2710715798390512, "grad_norm": 0.9488250017166138, "learning_rate": 1.8267145948717338e-05, "loss": 0.5868, "step": 160 }, { "epoch": 0.2795425667090216, "grad_norm": 1.0208171606063843, "learning_rate": 1.8097119388924524e-05, "loss": 0.5835, "step": 165 }, { "epoch": 0.28801355357899194, "grad_norm": 1.1816986799240112, "learning_rate": 1.7920007670404738e-05, "loss": 0.5703, "step": 170 }, { "epoch": 0.29648454044896233, "grad_norm": 1.1194366216659546, "learning_rate": 1.7735965769832754e-05, "loss": 0.5752, "step": 175 }, { "epoch": 0.30495552731893266, "grad_norm": 1.1583960056304932, "learning_rate": 1.7545154727946065e-05, "loss": 0.5967, "step": 180 }, { "epoch": 0.313426514188903, "grad_norm": 0.9982605576515198, "learning_rate": 1.7347741508630673e-05, "loss": 0.5798, "step": 185 }, { "epoch": 0.3218975010588734, "grad_norm": 1.17666494846344, "learning_rate": 1.7143898852824005e-05, "loss": 0.5647, "step": 190 }, { "epoch": 0.3303684879288437, "grad_norm": 0.9470816254615784, "learning_rate": 1.6933805127362744e-05, "loss": 0.546, "step": 195 }, { "epoch": 0.33883947479881404, "grad_norm": 0.9323899745941162, "learning_rate": 1.671764416890793e-05, "loss": 0.5567, "step": 200 }, { "epoch": 0.34731046166878443, "grad_norm": 1.0073851346969604, "learning_rate": 1.649560512308378e-05, "loss": 0.5649, "step": 205 }, { "epoch": 0.35578144853875476, "grad_norm": 1.2009499073028564, "learning_rate": 1.6267882278971102e-05, "loss": 0.5563, "step": 210 }, { "epoch": 0.3642524354087251, "grad_norm": 1.1687507629394531, "learning_rate": 1.603467489910004e-05, "loss": 0.5704, "step": 215 }, { "epoch": 0.3727234222786955, "grad_norm": 1.0257385969161987, "learning_rate": 1.5796187045090943e-05, "loss": 0.5473, "step": 220 }, { "epoch": 0.3811944091486658, "grad_norm": 1.0044339895248413, "learning_rate": 1.5552627399095943e-05, "loss": 0.5557, "step": 225 }, { "epoch": 0.38966539601863615, "grad_norm": 1.0678868293762207, "learning_rate": 1.5304209081197425e-05, "loss": 0.5487, "step": 230 }, { "epoch": 0.39813638288860653, "grad_norm": 0.9731626510620117, "learning_rate": 1.5051149462923285e-05, "loss": 0.5632, "step": 235 }, { "epoch": 0.40660736975857686, "grad_norm": 0.9739297032356262, "learning_rate": 1.4793669977041978e-05, "loss": 0.5604, "step": 240 }, { "epoch": 0.41507835662854725, "grad_norm": 1.1298881769180298, "learning_rate": 1.4531995923803974e-05, "loss": 0.5748, "step": 245 }, { "epoch": 0.4235493434985176, "grad_norm": 0.946855902671814, "learning_rate": 1.4266356273799044e-05, "loss": 0.5386, "step": 250 }, { "epoch": 0.4320203303684879, "grad_norm": 0.8729770183563232, "learning_rate": 1.3996983467601921e-05, "loss": 0.5672, "step": 255 }, { "epoch": 0.4404913172384583, "grad_norm": 0.9312366843223572, "learning_rate": 1.372411321238166e-05, "loss": 0.5441, "step": 260 }, { "epoch": 0.44896230410842863, "grad_norm": 0.9651340246200562, "learning_rate": 1.3447984275652638e-05, "loss": 0.5487, "step": 265 }, { "epoch": 0.45743329097839897, "grad_norm": 0.998904287815094, "learning_rate": 1.3168838276347691e-05, "loss": 0.5397, "step": 270 }, { "epoch": 0.46590427784836935, "grad_norm": 1.2853810787200928, "learning_rate": 1.2886919473396212e-05, "loss": 0.5386, "step": 275 }, { "epoch": 0.4743752647183397, "grad_norm": 1.006282925605774, "learning_rate": 1.2602474551992165e-05, "loss": 0.5501, "step": 280 }, { "epoch": 0.48284625158831, "grad_norm": 0.9217785596847534, "learning_rate": 1.2315752407739093e-05, "loss": 0.529, "step": 285 }, { "epoch": 0.4913172384582804, "grad_norm": 0.9664483070373535, "learning_rate": 1.2027003928860936e-05, "loss": 0.5295, "step": 290 }, { "epoch": 0.49978822532825073, "grad_norm": 1.0580369234085083, "learning_rate": 1.1736481776669307e-05, "loss": 0.5504, "step": 295 }, { "epoch": 0.5082592121982211, "grad_norm": 0.938615620136261, "learning_rate": 1.1444440164479215e-05, "loss": 0.5307, "step": 300 }, { "epoch": 0.5167301990681914, "grad_norm": 0.91016685962677, "learning_rate": 1.115113463516683e-05, "loss": 0.5214, "step": 305 }, { "epoch": 0.5252011859381618, "grad_norm": 0.9125617146492004, "learning_rate": 1.085682183756377e-05, "loss": 0.5407, "step": 310 }, { "epoch": 0.5336721728081322, "grad_norm": 0.9857435822486877, "learning_rate": 1.0561759301883714e-05, "loss": 0.5508, "step": 315 }, { "epoch": 0.5421431596781024, "grad_norm": 0.948715090751648, "learning_rate": 1.026620521437775e-05, "loss": 0.5275, "step": 320 }, { "epoch": 0.5506141465480728, "grad_norm": 0.9650415778160095, "learning_rate": 9.970418191415703e-06, "loss": 0.5143, "step": 325 }, { "epoch": 0.5590851334180432, "grad_norm": 0.9797161221504211, "learning_rate": 9.674657053191079e-06, "loss": 0.529, "step": 330 }, { "epoch": 0.5675561202880135, "grad_norm": 0.9044686555862427, "learning_rate": 9.379180597247661e-06, "loss": 0.5446, "step": 335 }, { "epoch": 0.5760271071579839, "grad_norm": 0.9637364149093628, "learning_rate": 9.084247372025938e-06, "loss": 0.5207, "step": 340 }, { "epoch": 0.5844980940279543, "grad_norm": 0.883882462978363, "learning_rate": 8.790115450627486e-06, "loss": 0.5177, "step": 345 }, { "epoch": 0.5929690808979247, "grad_norm": 0.9830591678619385, "learning_rate": 8.497042204995299e-06, "loss": 0.5386, "step": 350 }, { "epoch": 0.6014400677678949, "grad_norm": 0.9278344511985779, "learning_rate": 8.205284080707634e-06, "loss": 0.5258, "step": 355 }, { "epoch": 0.6099110546378653, "grad_norm": 0.8791617155075073, "learning_rate": 7.915096372582467e-06, "loss": 0.5407, "step": 360 }, { "epoch": 0.6183820415078357, "grad_norm": 0.9185119271278381, "learning_rate": 7.626733001288852e-06, "loss": 0.527, "step": 365 }, { "epoch": 0.626853028377806, "grad_norm": 0.8270648717880249, "learning_rate": 7.3404462911607325e-06, "loss": 0.5312, "step": 370 }, { "epoch": 0.6353240152477764, "grad_norm": 0.919711709022522, "learning_rate": 7.056486749407552e-06, "loss": 0.5254, "step": 375 }, { "epoch": 0.6437950021177468, "grad_norm": 0.8501454591751099, "learning_rate": 6.775102846914912e-06, "loss": 0.5159, "step": 380 }, { "epoch": 0.652265988987717, "grad_norm": 0.9181873202323914, "learning_rate": 6.4965408008270355e-06, "loss": 0.5175, "step": 385 }, { "epoch": 0.6607369758576874, "grad_norm": 0.8412730693817139, "learning_rate": 6.221044359101317e-06, "loss": 0.5249, "step": 390 }, { "epoch": 0.6692079627276578, "grad_norm": 0.8827643394470215, "learning_rate": 5.948854587223465e-06, "loss": 0.5222, "step": 395 }, { "epoch": 0.6776789495976281, "grad_norm": 0.7885822653770447, "learning_rate": 5.680209657269871e-06, "loss": 0.5122, "step": 400 }, { "epoch": 0.6861499364675985, "grad_norm": 0.8819693922996521, "learning_rate": 5.415344639501754e-06, "loss": 0.5287, "step": 405 }, { "epoch": 0.6946209233375689, "grad_norm": 0.8051272034645081, "learning_rate": 5.1544912966735e-06, "loss": 0.5132, "step": 410 }, { "epoch": 0.7030919102075391, "grad_norm": 0.831628680229187, "learning_rate": 4.897877881235091e-06, "loss": 0.5088, "step": 415 }, { "epoch": 0.7115628970775095, "grad_norm": 0.8426679968833923, "learning_rate": 4.645728935606194e-06, "loss": 0.5163, "step": 420 }, { "epoch": 0.7200338839474799, "grad_norm": 0.8241559267044067, "learning_rate": 4.398265095696539e-06, "loss": 0.5174, "step": 425 }, { "epoch": 0.7285048708174502, "grad_norm": 0.8578051924705505, "learning_rate": 4.1557028978446415e-06, "loss": 0.5129, "step": 430 }, { "epoch": 0.7369758576874206, "grad_norm": 0.8619440197944641, "learning_rate": 3.918254589343683e-06, "loss": 0.5102, "step": 435 }, { "epoch": 0.745446844557391, "grad_norm": 0.8140995502471924, "learning_rate": 3.6861279427204634e-06, "loss": 0.5052, "step": 440 }, { "epoch": 0.7539178314273612, "grad_norm": 0.7656389474868774, "learning_rate": 3.4595260739298174e-06, "loss": 0.5247, "step": 445 }, { "epoch": 0.7623888182973316, "grad_norm": 0.780764102935791, "learning_rate": 3.2386472646236565e-06, "loss": 0.5139, "step": 450 }, { "epoch": 0.770859805167302, "grad_norm": 0.7912269234657288, "learning_rate": 3.023684788650154e-06, "loss": 0.5184, "step": 455 }, { "epoch": 0.7793307920372723, "grad_norm": 0.7656291127204895, "learning_rate": 2.814826742934823e-06, "loss": 0.5168, "step": 460 }, { "epoch": 0.7878017789072427, "grad_norm": 0.7654049396514893, "learning_rate": 2.6122558828915647e-06, "loss": 0.5127, "step": 465 }, { "epoch": 0.7962727657772131, "grad_norm": 0.8781611919403076, "learning_rate": 2.4161494625076164e-06, "loss": 0.5068, "step": 470 }, { "epoch": 0.8047437526471835, "grad_norm": 0.7922006249427795, "learning_rate": 2.2266790792424096e-06, "loss": 0.5243, "step": 475 }, { "epoch": 0.8132147395171537, "grad_norm": 0.7805562019348145, "learning_rate": 2.044010523875969e-06, "loss": 0.5114, "step": 480 }, { "epoch": 0.8216857263871241, "grad_norm": 0.8500534296035767, "learning_rate": 1.868303635438332e-06, "loss": 0.4978, "step": 485 }, { "epoch": 0.8301567132570945, "grad_norm": 0.7626408934593201, "learning_rate": 1.699712161346846e-06, "loss": 0.5108, "step": 490 }, { "epoch": 0.8386277001270648, "grad_norm": 0.7929341197013855, "learning_rate": 1.5383836228737815e-06, "loss": 0.5126, "step": 495 }, { "epoch": 0.8470986869970352, "grad_norm": 0.7848495244979858, "learning_rate": 1.3844591860619382e-06, "loss": 0.5037, "step": 500 }, { "epoch": 0.8555696738670056, "grad_norm": 0.7474762797355652, "learning_rate": 1.2380735382012576e-06, "loss": 0.5151, "step": 505 }, { "epoch": 0.8640406607369758, "grad_norm": 0.7843493819236755, "learning_rate": 1.0993547699744366e-06, "loss": 0.5114, "step": 510 }, { "epoch": 0.8725116476069462, "grad_norm": 0.7787851095199585, "learning_rate": 9.684242633747642e-07, "loss": 0.5111, "step": 515 }, { "epoch": 0.8809826344769166, "grad_norm": 0.7845005989074707, "learning_rate": 8.453965854941748e-07, "loss": 0.5046, "step": 520 }, { "epoch": 0.8894536213468869, "grad_norm": 0.7967577576637268, "learning_rate": 7.303793882745181e-07, "loss": 0.5016, "step": 525 }, { "epoch": 0.8979246082168573, "grad_norm": 0.7523807883262634, "learning_rate": 6.234733143097215e-07, "loss": 0.4974, "step": 530 }, { "epoch": 0.9063955950868277, "grad_norm": 0.7827950119972229, "learning_rate": 5.247719087812897e-07, "loss": 0.4984, "step": 535 }, { "epoch": 0.9148665819567979, "grad_norm": 0.7315457463264465, "learning_rate": 4.343615376042065e-07, "loss": 0.5147, "step": 540 }, { "epoch": 0.9233375688267683, "grad_norm": 0.8033891916275024, "learning_rate": 3.5232131185484075e-07, "loss": 0.5116, "step": 545 }, { "epoch": 0.9318085556967387, "grad_norm": 0.7409123778343201, "learning_rate": 2.78723018547008e-07, "loss": 0.4918, "step": 550 }, { "epoch": 0.940279542566709, "grad_norm": 0.7420827150344849, "learning_rate": 2.1363105781673888e-07, "loss": 0.5066, "step": 555 }, { "epoch": 0.9487505294366794, "grad_norm": 0.7862848043441772, "learning_rate": 1.5710238657074218e-07, "loss": 0.5183, "step": 560 }, { "epoch": 0.9572215163066498, "grad_norm": 0.7402486205101013, "learning_rate": 1.0918646864784166e-07, "loss": 0.5182, "step": 565 }, { "epoch": 0.96569250317662, "grad_norm": 0.7287072539329529, "learning_rate": 6.99252315370269e-08, "loss": 0.4976, "step": 570 }, { "epoch": 0.9741634900465904, "grad_norm": 0.7665913105010986, "learning_rate": 3.9353029689974676e-08, "loss": 0.5127, "step": 575 }, { "epoch": 0.9826344769165608, "grad_norm": 0.6921188831329346, "learning_rate": 1.7496614460135174e-08, "loss": 0.5173, "step": 580 }, { "epoch": 0.9911054637865311, "grad_norm": 0.7035255432128906, "learning_rate": 4.375110694713192e-09, "loss": 0.4961, "step": 585 }, { "epoch": 0.9995764506565015, "grad_norm": 0.7657430768013, "learning_rate": 0.0, "loss": 0.5122, "step": 590 }, { "epoch": 0.9995764506565015, "step": 590, "total_flos": 8.380405809686774e+17, "train_loss": 0.5979039826635587, "train_runtime": 5676.8996, "train_samples_per_second": 6.654, "train_steps_per_second": 0.104 } ], "logging_steps": 5, "max_steps": 590, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.380405809686774e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }