{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 10, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008, "grad_norm": 8.746766090393066, "learning_rate": 2.9998815663057244e-06, "loss": 1.3502, "step": 5 }, { "epoch": 0.016, "grad_norm": 6.427896499633789, "learning_rate": 2.99952628392495e-06, "loss": 0.6921, "step": 10 }, { "epoch": 0.016, "eval_loss": 0.6720694303512573, "eval_runtime": 13.3015, "eval_samples_per_second": 150.359, "eval_steps_per_second": 9.397, "step": 10 }, { "epoch": 0.024, "grad_norm": 4.996713638305664, "learning_rate": 2.9989342089608837e-06, "loss": 0.6382, "step": 15 }, { "epoch": 0.032, "grad_norm": 5.306614875793457, "learning_rate": 2.9981054349090266e-06, "loss": 0.6141, "step": 20 }, { "epoch": 0.032, "eval_loss": 0.6662230491638184, "eval_runtime": 12.0717, "eval_samples_per_second": 165.676, "eval_steps_per_second": 10.355, "step": 20 }, { "epoch": 0.04, "grad_norm": 4.872737407684326, "learning_rate": 2.9970400926424076e-06, "loss": 0.6373, "step": 25 }, { "epoch": 0.048, "grad_norm": 4.585391998291016, "learning_rate": 2.995738350390921e-06, "loss": 0.6665, "step": 30 }, { "epoch": 0.048, "eval_loss": 0.6603240370750427, "eval_runtime": 12.0003, "eval_samples_per_second": 166.663, "eval_steps_per_second": 10.416, "step": 30 }, { "epoch": 0.056, "grad_norm": 4.579038619995117, "learning_rate": 2.9942004137147588e-06, "loss": 0.6827, "step": 35 }, { "epoch": 0.064, "grad_norm": 4.587752819061279, "learning_rate": 2.9924265254719506e-06, "loss": 0.6116, "step": 40 }, { "epoch": 0.064, "eval_loss": 0.661081075668335, "eval_runtime": 12.0453, "eval_samples_per_second": 166.04, "eval_steps_per_second": 10.377, "step": 40 }, { "epoch": 0.072, "grad_norm": 4.197625160217285, "learning_rate": 2.9904169657800125e-06, "loss": 0.6483, "step": 45 }, { "epoch": 0.08, "grad_norm": 4.366331100463867, "learning_rate": 2.988172051971717e-06, "loss": 0.6102, "step": 50 }, { "epoch": 0.08, "eval_loss": 0.6579549908638, "eval_runtime": 12.021, "eval_samples_per_second": 166.376, "eval_steps_per_second": 10.398, "step": 50 }, { "epoch": 0.088, "grad_norm": 4.392234802246094, "learning_rate": 2.985692138544977e-06, "loss": 0.6567, "step": 55 }, { "epoch": 0.096, "grad_norm": 4.914165019989014, "learning_rate": 2.982977617106871e-06, "loss": 0.6886, "step": 60 }, { "epoch": 0.096, "eval_loss": 0.6593431234359741, "eval_runtime": 12.0251, "eval_samples_per_second": 166.319, "eval_steps_per_second": 10.395, "step": 60 }, { "epoch": 0.104, "grad_norm": 4.842955589294434, "learning_rate": 2.980028916311802e-06, "loss": 0.6573, "step": 65 }, { "epoch": 0.112, "grad_norm": 4.415096282958984, "learning_rate": 2.9768465017938084e-06, "loss": 0.6415, "step": 70 }, { "epoch": 0.112, "eval_loss": 0.6595576405525208, "eval_runtime": 12.0577, "eval_samples_per_second": 165.869, "eval_steps_per_second": 10.367, "step": 70 }, { "epoch": 0.12, "grad_norm": 4.4561028480529785, "learning_rate": 2.9734308760930334e-06, "loss": 0.696, "step": 75 }, { "epoch": 0.128, "grad_norm": 4.971661567687988, "learning_rate": 2.9697825785763704e-06, "loss": 0.6214, "step": 80 }, { "epoch": 0.128, "eval_loss": 0.6594560742378235, "eval_runtime": 12.0017, "eval_samples_per_second": 166.643, "eval_steps_per_second": 10.415, "step": 80 }, { "epoch": 0.136, "grad_norm": 4.301797389984131, "learning_rate": 2.9659021853522904e-06, "loss": 0.6265, "step": 85 }, { "epoch": 0.144, "grad_norm": 4.3297319412231445, "learning_rate": 2.961790309179866e-06, "loss": 0.6816, "step": 90 }, { "epoch": 0.144, "eval_loss": 0.6583801507949829, "eval_runtime": 12.0447, "eval_samples_per_second": 166.048, "eval_steps_per_second": 10.378, "step": 90 }, { "epoch": 0.152, "grad_norm": 4.100594997406006, "learning_rate": 2.957447599372011e-06, "loss": 0.6395, "step": 95 }, { "epoch": 0.16, "grad_norm": 4.249173641204834, "learning_rate": 2.9528747416929465e-06, "loss": 0.6481, "step": 100 }, { "epoch": 0.16, "eval_loss": 0.6597488522529602, "eval_runtime": 12.1055, "eval_samples_per_second": 165.214, "eval_steps_per_second": 10.326, "step": 100 }, { "epoch": 0.168, "grad_norm": 4.542850017547607, "learning_rate": 2.9480724582499107e-06, "loss": 0.6964, "step": 105 }, { "epoch": 0.176, "grad_norm": 3.8737800121307373, "learning_rate": 2.943041507379129e-06, "loss": 0.6022, "step": 110 }, { "epoch": 0.176, "eval_loss": 0.6589648723602295, "eval_runtime": 12.0836, "eval_samples_per_second": 165.514, "eval_steps_per_second": 10.345, "step": 110 }, { "epoch": 0.184, "grad_norm": 3.998706340789795, "learning_rate": 2.937782683526064e-06, "loss": 0.6373, "step": 115 }, { "epoch": 0.192, "grad_norm": 4.192784309387207, "learning_rate": 2.9322968171199645e-06, "loss": 0.6703, "step": 120 }, { "epoch": 0.192, "eval_loss": 0.6606726050376892, "eval_runtime": 12.0359, "eval_samples_per_second": 166.17, "eval_steps_per_second": 10.386, "step": 120 }, { "epoch": 0.2, "grad_norm": 4.3980231285095215, "learning_rate": 2.9265847744427307e-06, "loss": 0.5916, "step": 125 }, { "epoch": 0.208, "grad_norm": 4.94151496887207, "learning_rate": 2.9206474574921165e-06, "loss": 0.6742, "step": 130 }, { "epoch": 0.208, "eval_loss": 0.6614590287208557, "eval_runtime": 12.0042, "eval_samples_per_second": 166.608, "eval_steps_per_second": 10.413, "step": 130 }, { "epoch": 0.216, "grad_norm": 3.713902473449707, "learning_rate": 2.914485803839297e-06, "loss": 0.6717, "step": 135 }, { "epoch": 0.224, "grad_norm": 4.350754737854004, "learning_rate": 2.9081007864808113e-06, "loss": 0.6369, "step": 140 }, { "epoch": 0.224, "eval_loss": 0.6614840030670166, "eval_runtime": 11.9865, "eval_samples_per_second": 166.854, "eval_steps_per_second": 10.428, "step": 140 }, { "epoch": 0.232, "grad_norm": 3.956383466720581, "learning_rate": 2.9014934136849183e-06, "loss": 0.642, "step": 145 }, { "epoch": 0.24, "grad_norm": 4.635798454284668, "learning_rate": 2.894664728832377e-06, "loss": 0.7142, "step": 150 }, { "epoch": 0.24, "eval_loss": 0.660183846950531, "eval_runtime": 12.0567, "eval_samples_per_second": 165.883, "eval_steps_per_second": 10.368, "step": 150 }, { "epoch": 0.248, "grad_norm": 4.187668800354004, "learning_rate": 2.887615810251687e-06, "loss": 0.6241, "step": 155 }, { "epoch": 0.256, "grad_norm": 4.165324687957764, "learning_rate": 2.8803477710488056e-06, "loss": 0.6707, "step": 160 }, { "epoch": 0.256, "eval_loss": 0.6611347794532776, "eval_runtime": 12.5135, "eval_samples_per_second": 159.828, "eval_steps_per_second": 9.989, "step": 160 }, { "epoch": 0.264, "grad_norm": 4.027588844299316, "learning_rate": 2.8728617589313763e-06, "loss": 0.6436, "step": 165 }, { "epoch": 0.272, "grad_norm": 3.7992372512817383, "learning_rate": 2.8651589560274937e-06, "loss": 0.6629, "step": 170 }, { "epoch": 0.272, "eval_loss": 0.6608501672744751, "eval_runtime": 12.007, "eval_samples_per_second": 166.57, "eval_steps_per_second": 10.411, "step": 170 }, { "epoch": 0.28, "grad_norm": 3.8681631088256836, "learning_rate": 2.8572405786990296e-06, "loss": 0.6505, "step": 175 }, { "epoch": 0.288, "grad_norm": 3.890542507171631, "learning_rate": 2.8491078773495566e-06, "loss": 0.6299, "step": 180 }, { "epoch": 0.288, "eval_loss": 0.6610468626022339, "eval_runtime": 12.01, "eval_samples_per_second": 166.528, "eval_steps_per_second": 10.408, "step": 180 }, { "epoch": 0.296, "grad_norm": 3.706123113632202, "learning_rate": 2.840762136226896e-06, "loss": 0.6188, "step": 185 }, { "epoch": 0.304, "grad_norm": 4.024528503417969, "learning_rate": 2.832204673220317e-06, "loss": 0.6351, "step": 190 }, { "epoch": 0.304, "eval_loss": 0.6607259511947632, "eval_runtime": 12.56, "eval_samples_per_second": 159.235, "eval_steps_per_second": 9.952, "step": 190 }, { "epoch": 0.312, "grad_norm": 3.7646734714508057, "learning_rate": 2.8234368396524304e-06, "loss": 0.5825, "step": 195 }, { "epoch": 0.32, "grad_norm": 4.327579021453857, "learning_rate": 2.814460020065795e-06, "loss": 0.5885, "step": 200 }, { "epoch": 0.32, "eval_loss": 0.6609508991241455, "eval_runtime": 12.0117, "eval_samples_per_second": 166.504, "eval_steps_per_second": 10.407, "step": 200 }, { "epoch": 0.328, "grad_norm": 3.8449947834014893, "learning_rate": 2.8052756320042887e-06, "loss": 0.6157, "step": 205 }, { "epoch": 0.336, "grad_norm": 4.472646236419678, "learning_rate": 2.795885125789253e-06, "loss": 0.6613, "step": 210 }, { "epoch": 0.336, "eval_loss": 0.6619015336036682, "eval_runtime": 12.0457, "eval_samples_per_second": 166.034, "eval_steps_per_second": 10.377, "step": 210 }, { "epoch": 0.344, "grad_norm": 4.221950054168701, "learning_rate": 2.7862899842904785e-06, "loss": 0.662, "step": 215 }, { "epoch": 0.352, "grad_norm": 3.822866201400757, "learning_rate": 2.776491722692038e-06, "loss": 0.6151, "step": 220 }, { "epoch": 0.352, "eval_loss": 0.6602036356925964, "eval_runtime": 12.088, "eval_samples_per_second": 165.453, "eval_steps_per_second": 10.341, "step": 220 }, { "epoch": 0.36, "grad_norm": 4.046660423278809, "learning_rate": 2.7664918882530226e-06, "loss": 0.6348, "step": 225 }, { "epoch": 0.368, "grad_norm": 3.865205764770508, "learning_rate": 2.756292060063213e-06, "loss": 0.6342, "step": 230 }, { "epoch": 0.368, "eval_loss": 0.6608572602272034, "eval_runtime": 12.111, "eval_samples_per_second": 165.138, "eval_steps_per_second": 10.321, "step": 230 }, { "epoch": 0.376, "grad_norm": 3.76444673538208, "learning_rate": 2.745893848793719e-06, "loss": 0.658, "step": 235 }, { "epoch": 0.384, "grad_norm": 4.313933849334717, "learning_rate": 2.735298896442641e-06, "loss": 0.6376, "step": 240 }, { "epoch": 0.384, "eval_loss": 0.6601213216781616, "eval_runtime": 12.0967, "eval_samples_per_second": 165.335, "eval_steps_per_second": 10.333, "step": 240 }, { "epoch": 0.392, "grad_norm": 3.985441207885742, "learning_rate": 2.7245088760757763e-06, "loss": 0.6508, "step": 245 }, { "epoch": 0.4, "grad_norm": 3.7975800037384033, "learning_rate": 2.713525491562421e-06, "loss": 0.679, "step": 250 }, { "epoch": 0.4, "eval_loss": 0.6601463556289673, "eval_runtime": 12.1046, "eval_samples_per_second": 165.227, "eval_steps_per_second": 10.327, "step": 250 }, { "epoch": 0.408, "grad_norm": 4.147428512573242, "learning_rate": 2.702350477306315e-06, "loss": 0.6564, "step": 255 }, { "epoch": 0.416, "grad_norm": 4.010830879211426, "learning_rate": 2.690985597971753e-06, "loss": 0.6911, "step": 260 }, { "epoch": 0.416, "eval_loss": 0.6592859029769897, "eval_runtime": 12.2518, "eval_samples_per_second": 163.242, "eval_steps_per_second": 10.203, "step": 260 }, { "epoch": 0.424, "grad_norm": 4.228829383850098, "learning_rate": 2.679432648204928e-06, "loss": 0.6194, "step": 265 }, { "epoch": 0.432, "grad_norm": 3.865734577178955, "learning_rate": 2.6676934523505355e-06, "loss": 0.6717, "step": 270 }, { "epoch": 0.432, "eval_loss": 0.6591557860374451, "eval_runtime": 12.0618, "eval_samples_per_second": 165.813, "eval_steps_per_second": 10.363, "step": 270 }, { "epoch": 0.44, "grad_norm": 4.116940498352051, "learning_rate": 2.655769864163684e-06, "loss": 0.6292, "step": 275 }, { "epoch": 0.448, "grad_norm": 3.9206840991973877, "learning_rate": 2.643663766517172e-06, "loss": 0.6758, "step": 280 }, { "epoch": 0.448, "eval_loss": 0.6602749228477478, "eval_runtime": 12.1339, "eval_samples_per_second": 164.827, "eval_steps_per_second": 10.302, "step": 280 }, { "epoch": 0.456, "grad_norm": 3.7658212184906006, "learning_rate": 2.6313770711041557e-06, "loss": 0.6698, "step": 285 }, { "epoch": 0.464, "grad_norm": 3.8347365856170654, "learning_rate": 2.6189117181362736e-06, "loss": 0.6243, "step": 290 }, { "epoch": 0.464, "eval_loss": 0.660269021987915, "eval_runtime": 12.0287, "eval_samples_per_second": 166.269, "eval_steps_per_second": 10.392, "step": 290 }, { "epoch": 0.472, "grad_norm": 4.060953140258789, "learning_rate": 2.606269676037261e-06, "loss": 0.7274, "step": 295 }, { "epoch": 0.48, "grad_norm": 4.1027655601501465, "learning_rate": 2.5934529411321173e-06, "loss": 0.643, "step": 300 }, { "epoch": 0.48, "eval_loss": 0.6585854291915894, "eval_runtime": 12.0789, "eval_samples_per_second": 165.578, "eval_steps_per_second": 10.349, "step": 300 }, { "epoch": 0.488, "grad_norm": 3.78674578666687, "learning_rate": 2.5804635373318606e-06, "loss": 0.6707, "step": 305 }, { "epoch": 0.496, "grad_norm": 3.7603418827056885, "learning_rate": 2.5673035158139285e-06, "loss": 0.603, "step": 310 }, { "epoch": 0.496, "eval_loss": 0.6572903394699097, "eval_runtime": 12.2284, "eval_samples_per_second": 163.554, "eval_steps_per_second": 10.222, "step": 310 }, { "epoch": 0.504, "grad_norm": 4.131699562072754, "learning_rate": 2.553974954698274e-06, "loss": 0.6348, "step": 315 }, { "epoch": 0.512, "grad_norm": 4.052087783813477, "learning_rate": 2.5404799587192076e-06, "loss": 0.6336, "step": 320 }, { "epoch": 0.512, "eval_loss": 0.6567848920822144, "eval_runtime": 12.1155, "eval_samples_per_second": 165.077, "eval_steps_per_second": 10.317, "step": 320 }, { "epoch": 0.52, "grad_norm": 4.040383338928223, "learning_rate": 2.526820658893033e-06, "loss": 0.6919, "step": 325 }, { "epoch": 0.528, "grad_norm": 4.098529815673828, "learning_rate": 2.5129992121815365e-06, "loss": 0.6198, "step": 330 }, { "epoch": 0.528, "eval_loss": 0.6569080948829651, "eval_runtime": 12.4865, "eval_samples_per_second": 160.173, "eval_steps_per_second": 10.011, "step": 330 }, { "epoch": 0.536, "grad_norm": 3.941558837890625, "learning_rate": 2.4990178011513777e-06, "loss": 0.6361, "step": 335 }, { "epoch": 0.544, "grad_norm": 4.143987655639648, "learning_rate": 2.484878633629435e-06, "loss": 0.6989, "step": 340 }, { "epoch": 0.544, "eval_loss": 0.657779335975647, "eval_runtime": 12.0123, "eval_samples_per_second": 166.496, "eval_steps_per_second": 10.406, "step": 340 }, { "epoch": 0.552, "grad_norm": 3.3898260593414307, "learning_rate": 2.4705839423541666e-06, "loss": 0.658, "step": 345 }, { "epoch": 0.56, "grad_norm": 3.7825887203216553, "learning_rate": 2.456135984623035e-06, "loss": 0.6353, "step": 350 }, { "epoch": 0.56, "eval_loss": 0.6569732427597046, "eval_runtime": 12.0405, "eval_samples_per_second": 166.107, "eval_steps_per_second": 10.382, "step": 350 }, { "epoch": 0.568, "grad_norm": 4.2369537353515625, "learning_rate": 2.441537041936051e-06, "loss": 0.654, "step": 355 }, { "epoch": 0.576, "grad_norm": 3.7942652702331543, "learning_rate": 2.4267894196355018e-06, "loss": 0.6746, "step": 360 }, { "epoch": 0.576, "eval_loss": 0.6568124890327454, "eval_runtime": 12.3613, "eval_samples_per_second": 161.795, "eval_steps_per_second": 10.112, "step": 360 }, { "epoch": 0.584, "grad_norm": 4.185107231140137, "learning_rate": 2.4118954465419083e-06, "loss": 0.5986, "step": 365 }, { "epoch": 0.592, "grad_norm": 3.844982385635376, "learning_rate": 2.3968574745862785e-06, "loss": 0.6883, "step": 370 }, { "epoch": 0.592, "eval_loss": 0.6570600867271423, "eval_runtime": 12.0276, "eval_samples_per_second": 166.284, "eval_steps_per_second": 10.393, "step": 370 }, { "epoch": 0.6, "grad_norm": 3.494849443435669, "learning_rate": 2.3816778784387097e-06, "loss": 0.6193, "step": 375 }, { "epoch": 0.608, "grad_norm": 4.09348726272583, "learning_rate": 2.3663590551334015e-06, "loss": 0.6772, "step": 380 }, { "epoch": 0.608, "eval_loss": 0.6566088795661926, "eval_runtime": 12.1153, "eval_samples_per_second": 165.08, "eval_steps_per_second": 10.317, "step": 380 }, { "epoch": 0.616, "grad_norm": 3.824082612991333, "learning_rate": 2.350903423690135e-06, "loss": 0.6153, "step": 385 }, { "epoch": 0.624, "grad_norm": 4.054186820983887, "learning_rate": 2.3353134247322823e-06, "loss": 0.6563, "step": 390 }, { "epoch": 0.624, "eval_loss": 0.6563527584075928, "eval_runtime": 12.0218, "eval_samples_per_second": 166.364, "eval_steps_per_second": 10.398, "step": 390 }, { "epoch": 0.632, "grad_norm": 3.5578935146331787, "learning_rate": 2.3195915201014038e-06, "loss": 0.7107, "step": 395 }, { "epoch": 0.64, "grad_norm": 3.8796043395996094, "learning_rate": 2.303740192468495e-06, "loss": 0.6077, "step": 400 }, { "epoch": 0.64, "eval_loss": 0.6553810238838196, "eval_runtime": 12.1088, "eval_samples_per_second": 165.169, "eval_steps_per_second": 10.323, "step": 400 }, { "epoch": 0.648, "grad_norm": 3.7618532180786133, "learning_rate": 2.2877619449419438e-06, "loss": 0.6272, "step": 405 }, { "epoch": 0.656, "grad_norm": 3.750394344329834, "learning_rate": 2.2716593006722595e-06, "loss": 0.6291, "step": 410 }, { "epoch": 0.656, "eval_loss": 0.65521240234375, "eval_runtime": 12.0523, "eval_samples_per_second": 165.943, "eval_steps_per_second": 10.371, "step": 410 }, { "epoch": 0.664, "grad_norm": 4.360326290130615, "learning_rate": 2.2554348024536415e-06, "loss": 0.699, "step": 415 }, { "epoch": 0.672, "grad_norm": 3.571743965148926, "learning_rate": 2.2390910123224374e-06, "loss": 0.6073, "step": 420 }, { "epoch": 0.672, "eval_loss": 0.6547145843505859, "eval_runtime": 12.0682, "eval_samples_per_second": 165.725, "eval_steps_per_second": 10.358, "step": 420 }, { "epoch": 0.68, "grad_norm": 3.8361449241638184, "learning_rate": 2.222630511152573e-06, "loss": 0.5729, "step": 425 }, { "epoch": 0.688, "grad_norm": 3.7221009731292725, "learning_rate": 2.2060558982479992e-06, "loss": 0.6598, "step": 430 }, { "epoch": 0.688, "eval_loss": 0.6550743579864502, "eval_runtime": 12.0566, "eval_samples_per_second": 165.885, "eval_steps_per_second": 10.368, "step": 430 }, { "epoch": 0.696, "grad_norm": 4.329174518585205, "learning_rate": 2.1893697909322322e-06, "loss": 0.6447, "step": 435 }, { "epoch": 0.704, "grad_norm": 3.9657375812530518, "learning_rate": 2.1725748241350487e-06, "loss": 0.593, "step": 440 }, { "epoch": 0.704, "eval_loss": 0.6547417640686035, "eval_runtime": 12.1368, "eval_samples_per_second": 164.789, "eval_steps_per_second": 10.299, "step": 440 }, { "epoch": 0.712, "grad_norm": 3.378925323486328, "learning_rate": 2.1556736499763994e-06, "loss": 0.6351, "step": 445 }, { "epoch": 0.72, "grad_norm": 3.746727228164673, "learning_rate": 2.138668937347609e-06, "loss": 0.6352, "step": 450 }, { "epoch": 0.72, "eval_loss": 0.6547327637672424, "eval_runtime": 12.0347, "eval_samples_per_second": 166.187, "eval_steps_per_second": 10.387, "step": 450 }, { "epoch": 0.728, "grad_norm": 3.8396756649017334, "learning_rate": 2.1215633714899263e-06, "loss": 0.683, "step": 455 }, { "epoch": 0.736, "grad_norm": 3.8135344982147217, "learning_rate": 2.1043596535704943e-06, "loss": 0.6216, "step": 460 }, { "epoch": 0.736, "eval_loss": 0.6539892554283142, "eval_runtime": 12.0015, "eval_samples_per_second": 166.645, "eval_steps_per_second": 10.415, "step": 460 }, { "epoch": 0.744, "grad_norm": 3.2499091625213623, "learning_rate": 2.0870605002558037e-06, "loss": 0.6512, "step": 465 }, { "epoch": 0.752, "grad_norm": 3.963940382003784, "learning_rate": 2.069668643282702e-06, "loss": 0.6937, "step": 470 }, { "epoch": 0.752, "eval_loss": 0.6535360813140869, "eval_runtime": 13.6104, "eval_samples_per_second": 146.947, "eval_steps_per_second": 9.184, "step": 470 }, { "epoch": 0.76, "grad_norm": 4.249936103820801, "learning_rate": 2.0521868290270174e-06, "loss": 0.659, "step": 475 }, { "epoch": 0.768, "grad_norm": 3.558105230331421, "learning_rate": 2.034617818069876e-06, "loss": 0.669, "step": 480 }, { "epoch": 0.768, "eval_loss": 0.6529609560966492, "eval_runtime": 12.1045, "eval_samples_per_second": 165.227, "eval_steps_per_second": 10.327, "step": 480 }, { "epoch": 0.776, "grad_norm": 4.027170658111572, "learning_rate": 2.0169643847617756e-06, "loss": 0.6846, "step": 485 }, { "epoch": 0.784, "grad_norm": 3.395838737487793, "learning_rate": 1.99922931678448e-06, "loss": 0.6052, "step": 490 }, { "epoch": 0.784, "eval_loss": 0.6525455117225647, "eval_runtime": 12.7081, "eval_samples_per_second": 157.38, "eval_steps_per_second": 9.836, "step": 490 }, { "epoch": 0.792, "grad_norm": 3.7162697315216064, "learning_rate": 1.981415414710814e-06, "loss": 0.6307, "step": 495 }, { "epoch": 0.8, "grad_norm": 3.496490955352783, "learning_rate": 1.963525491562421e-06, "loss": 0.6218, "step": 500 }, { "epoch": 0.8, "eval_loss": 0.652510404586792, "eval_runtime": 12.0132, "eval_samples_per_second": 166.484, "eval_steps_per_second": 10.405, "step": 500 }, { "epoch": 0.808, "grad_norm": 4.182400703430176, "learning_rate": 1.9455623723655522e-06, "loss": 0.6504, "step": 505 }, { "epoch": 0.816, "grad_norm": 3.4778692722320557, "learning_rate": 1.927528893704964e-06, "loss": 0.6341, "step": 510 }, { "epoch": 0.816, "eval_loss": 0.6526325941085815, "eval_runtime": 12.1893, "eval_samples_per_second": 164.078, "eval_steps_per_second": 10.255, "step": 510 }, { "epoch": 0.824, "grad_norm": 3.5207061767578125, "learning_rate": 1.909427903275988e-06, "loss": 0.6774, "step": 515 }, { "epoch": 0.832, "grad_norm": 3.6349990367889404, "learning_rate": 1.8912622594348455e-06, "loss": 0.6681, "step": 520 }, { "epoch": 0.832, "eval_loss": 0.6522479057312012, "eval_runtime": 12.3219, "eval_samples_per_second": 162.312, "eval_steps_per_second": 10.145, "step": 520 }, { "epoch": 0.84, "grad_norm": 4.17547082901001, "learning_rate": 1.8730348307472826e-06, "loss": 0.639, "step": 525 }, { "epoch": 0.848, "grad_norm": 3.4751803874969482, "learning_rate": 1.8547484955355872e-06, "loss": 0.6203, "step": 530 }, { "epoch": 0.848, "eval_loss": 0.6515837907791138, "eval_runtime": 13.3997, "eval_samples_per_second": 149.257, "eval_steps_per_second": 9.329, "step": 530 }, { "epoch": 0.856, "grad_norm": 3.854713201522827, "learning_rate": 1.836406141424072e-06, "loss": 0.6283, "step": 535 }, { "epoch": 0.864, "grad_norm": 3.728891134262085, "learning_rate": 1.8180106648830824e-06, "loss": 0.6682, "step": 540 }, { "epoch": 0.864, "eval_loss": 0.6505850553512573, "eval_runtime": 14.2953, "eval_samples_per_second": 139.906, "eval_steps_per_second": 8.744, "step": 540 }, { "epoch": 0.872, "grad_norm": 3.6460957527160645, "learning_rate": 1.7995649707716105e-06, "loss": 0.6677, "step": 545 }, { "epoch": 0.88, "grad_norm": 3.6266672611236572, "learning_rate": 1.7810719718785873e-06, "loss": 0.6212, "step": 550 }, { "epoch": 0.88, "eval_loss": 0.6500898599624634, "eval_runtime": 13.827, "eval_samples_per_second": 144.644, "eval_steps_per_second": 9.04, "step": 550 }, { "epoch": 0.888, "grad_norm": 3.6777870655059814, "learning_rate": 1.7625345884629143e-06, "loss": 0.6827, "step": 555 }, { "epoch": 0.896, "grad_norm": 3.7290449142456055, "learning_rate": 1.7439557477923257e-06, "loss": 0.6887, "step": 560 }, { "epoch": 0.896, "eval_loss": 0.6501717567443848, "eval_runtime": 13.4717, "eval_samples_per_second": 148.459, "eval_steps_per_second": 9.279, "step": 560 }, { "epoch": 0.904, "grad_norm": 3.18540620803833, "learning_rate": 1.7253383836811356e-06, "loss": 0.575, "step": 565 }, { "epoch": 0.912, "grad_norm": 4.008258819580078, "learning_rate": 1.706685436026957e-06, "loss": 0.64, "step": 570 }, { "epoch": 0.912, "eval_loss": 0.6503917574882507, "eval_runtime": 13.6, "eval_samples_per_second": 147.058, "eval_steps_per_second": 9.191, "step": 570 }, { "epoch": 0.92, "grad_norm": 3.878329277038574, "learning_rate": 1.6879998503464564e-06, "loss": 0.6653, "step": 575 }, { "epoch": 0.928, "grad_norm": 3.8032970428466797, "learning_rate": 1.6692845773102223e-06, "loss": 0.6176, "step": 580 }, { "epoch": 0.928, "eval_loss": 0.6500183343887329, "eval_runtime": 13.5348, "eval_samples_per_second": 147.768, "eval_steps_per_second": 9.235, "step": 580 }, { "epoch": 0.936, "grad_norm": 3.704298973083496, "learning_rate": 1.6505425722768222e-06, "loss": 0.6302, "step": 585 }, { "epoch": 0.944, "grad_norm": 3.4791512489318848, "learning_rate": 1.6317767948261151e-06, "loss": 0.6285, "step": 590 }, { "epoch": 0.944, "eval_loss": 0.6499763131141663, "eval_runtime": 13.3791, "eval_samples_per_second": 149.487, "eval_steps_per_second": 9.343, "step": 590 }, { "epoch": 0.952, "grad_norm": 3.9690394401550293, "learning_rate": 1.6129902082918993e-06, "loss": 0.6118, "step": 595 }, { "epoch": 0.96, "grad_norm": 3.715287208557129, "learning_rate": 1.5941857792939703e-06, "loss": 0.6661, "step": 600 }, { "epoch": 0.96, "eval_loss": 0.6488626599311829, "eval_runtime": 13.0584, "eval_samples_per_second": 153.158, "eval_steps_per_second": 9.572, "step": 600 }, { "epoch": 0.968, "grad_norm": 3.739778518676758, "learning_rate": 1.5753664772696545e-06, "loss": 0.6188, "step": 605 }, { "epoch": 0.976, "grad_norm": 3.546541213989258, "learning_rate": 1.556535274004902e-06, "loss": 0.6537, "step": 610 }, { "epoch": 0.976, "eval_loss": 0.6488344669342041, "eval_runtime": 12.2525, "eval_samples_per_second": 163.231, "eval_steps_per_second": 10.202, "step": 610 }, { "epoch": 0.984, "grad_norm": 3.7501473426818848, "learning_rate": 1.5376951431650064e-06, "loss": 0.6508, "step": 615 }, { "epoch": 0.992, "grad_norm": 3.7694437503814697, "learning_rate": 1.518849059825029e-06, "loss": 0.657, "step": 620 }, { "epoch": 0.992, "eval_loss": 0.648171603679657, "eval_runtime": 11.9787, "eval_samples_per_second": 166.963, "eval_steps_per_second": 10.435, "step": 620 }, { "epoch": 1.0, "grad_norm": 3.7407610416412354, "learning_rate": 1.5e-06, "loss": 0.582, "step": 625 }, { "epoch": 1.008, "grad_norm": 3.2185583114624023, "learning_rate": 1.481150940174971e-06, "loss": 0.4004, "step": 630 }, { "epoch": 1.008, "eval_loss": 0.6502845287322998, "eval_runtime": 11.9952, "eval_samples_per_second": 166.733, "eval_steps_per_second": 10.421, "step": 630 }, { "epoch": 1.016, "grad_norm": 3.261690855026245, "learning_rate": 1.4623048568349939e-06, "loss": 0.3738, "step": 635 }, { "epoch": 1.024, "grad_norm": 5.511118412017822, "learning_rate": 1.4434647259950982e-06, "loss": 0.4014, "step": 640 }, { "epoch": 1.024, "eval_loss": 0.7170341610908508, "eval_runtime": 11.9625, "eval_samples_per_second": 167.189, "eval_steps_per_second": 10.449, "step": 640 }, { "epoch": 1.032, "grad_norm": 4.647660255432129, "learning_rate": 1.4246335227303458e-06, "loss": 0.3661, "step": 645 }, { "epoch": 1.04, "grad_norm": 3.8759918212890625, "learning_rate": 1.40581422070603e-06, "loss": 0.4179, "step": 650 }, { "epoch": 1.04, "eval_loss": 0.6922851800918579, "eval_runtime": 11.9429, "eval_samples_per_second": 167.463, "eval_steps_per_second": 10.466, "step": 650 }, { "epoch": 1.048, "grad_norm": 4.271312236785889, "learning_rate": 1.3870097917081012e-06, "loss": 0.3519, "step": 655 }, { "epoch": 1.056, "grad_norm": 4.502732276916504, "learning_rate": 1.3682232051738854e-06, "loss": 0.3998, "step": 660 }, { "epoch": 1.056, "eval_loss": 0.6920506358146667, "eval_runtime": 12.2751, "eval_samples_per_second": 162.931, "eval_steps_per_second": 10.183, "step": 660 }, { "epoch": 1.064, "grad_norm": 4.114953994750977, "learning_rate": 1.3494574277231775e-06, "loss": 0.399, "step": 665 }, { "epoch": 1.072, "grad_norm": 4.416531085968018, "learning_rate": 1.3307154226897775e-06, "loss": 0.3705, "step": 670 }, { "epoch": 1.072, "eval_loss": 0.7054136991500854, "eval_runtime": 12.407, "eval_samples_per_second": 161.199, "eval_steps_per_second": 10.075, "step": 670 }, { "epoch": 1.08, "grad_norm": 3.7480475902557373, "learning_rate": 1.3120001496535434e-06, "loss": 0.3634, "step": 675 }, { "epoch": 1.088, "grad_norm": 4.652655124664307, "learning_rate": 1.293314563973043e-06, "loss": 0.3513, "step": 680 }, { "epoch": 1.088, "eval_loss": 0.7035665512084961, "eval_runtime": 12.0092, "eval_samples_per_second": 166.539, "eval_steps_per_second": 10.409, "step": 680 }, { "epoch": 1.096, "grad_norm": 3.6166396141052246, "learning_rate": 1.2746616163188644e-06, "loss": 0.3481, "step": 685 }, { "epoch": 1.104, "grad_norm": 4.5175981521606445, "learning_rate": 1.2560442522076746e-06, "loss": 0.3815, "step": 690 }, { "epoch": 1.104, "eval_loss": 0.7024725079536438, "eval_runtime": 12.0834, "eval_samples_per_second": 165.516, "eval_steps_per_second": 10.345, "step": 690 }, { "epoch": 1.112, "grad_norm": 4.5786213874816895, "learning_rate": 1.2374654115370858e-06, "loss": 0.3672, "step": 695 }, { "epoch": 1.12, "grad_norm": 4.556806564331055, "learning_rate": 1.2189280281214128e-06, "loss": 0.3684, "step": 700 }, { "epoch": 1.12, "eval_loss": 0.704890787601471, "eval_runtime": 11.9719, "eval_samples_per_second": 167.058, "eval_steps_per_second": 10.441, "step": 700 }, { "epoch": 1.1280000000000001, "grad_norm": 4.46769380569458, "learning_rate": 1.2004350292283896e-06, "loss": 0.3424, "step": 705 }, { "epoch": 1.1360000000000001, "grad_norm": 5.269836902618408, "learning_rate": 1.1819893351169183e-06, "loss": 0.3914, "step": 710 }, { "epoch": 1.1360000000000001, "eval_loss": 0.7068949937820435, "eval_runtime": 11.9987, "eval_samples_per_second": 166.685, "eval_steps_per_second": 10.418, "step": 710 }, { "epoch": 1.144, "grad_norm": 4.130289554595947, "learning_rate": 1.1635938585759285e-06, "loss": 0.3516, "step": 715 }, { "epoch": 1.152, "grad_norm": 4.418029308319092, "learning_rate": 1.1452515044644133e-06, "loss": 0.4082, "step": 720 }, { "epoch": 1.152, "eval_loss": 0.7017790675163269, "eval_runtime": 11.9747, "eval_samples_per_second": 167.019, "eval_steps_per_second": 10.439, "step": 720 }, { "epoch": 1.16, "grad_norm": 3.497861623764038, "learning_rate": 1.1269651692527181e-06, "loss": 0.3627, "step": 725 }, { "epoch": 1.168, "grad_norm": 4.658440589904785, "learning_rate": 1.108737740565155e-06, "loss": 0.3494, "step": 730 }, { "epoch": 1.168, "eval_loss": 0.7041600346565247, "eval_runtime": 11.9932, "eval_samples_per_second": 166.762, "eval_steps_per_second": 10.423, "step": 730 }, { "epoch": 1.176, "grad_norm": 4.345962047576904, "learning_rate": 1.0905720967240124e-06, "loss": 0.3762, "step": 735 }, { "epoch": 1.184, "grad_norm": 3.5613420009613037, "learning_rate": 1.0724711062950359e-06, "loss": 0.3715, "step": 740 }, { "epoch": 1.184, "eval_loss": 0.707082986831665, "eval_runtime": 12.0082, "eval_samples_per_second": 166.553, "eval_steps_per_second": 10.41, "step": 740 }, { "epoch": 1.192, "grad_norm": 4.606480598449707, "learning_rate": 1.0544376276344478e-06, "loss": 0.386, "step": 745 }, { "epoch": 1.2, "grad_norm": 4.682101249694824, "learning_rate": 1.036474508437579e-06, "loss": 0.3675, "step": 750 }, { "epoch": 1.2, "eval_loss": 0.7084864377975464, "eval_runtime": 12.0192, "eval_samples_per_second": 166.4, "eval_steps_per_second": 10.4, "step": 750 }, { "epoch": 1.208, "grad_norm": 5.7233710289001465, "learning_rate": 1.018584585289186e-06, "loss": 0.354, "step": 755 }, { "epoch": 1.216, "grad_norm": 4.105285167694092, "learning_rate": 1.0007706832155202e-06, "loss": 0.3319, "step": 760 }, { "epoch": 1.216, "eval_loss": 0.7111691236495972, "eval_runtime": 12.5364, "eval_samples_per_second": 159.535, "eval_steps_per_second": 9.971, "step": 760 }, { "epoch": 1.224, "grad_norm": 4.163219451904297, "learning_rate": 9.830356152382247e-07, "loss": 0.357, "step": 765 }, { "epoch": 1.232, "grad_norm": 5.115902423858643, "learning_rate": 9.65382181930124e-07, "loss": 0.3823, "step": 770 }, { "epoch": 1.232, "eval_loss": 0.7141273021697998, "eval_runtime": 12.0055, "eval_samples_per_second": 166.591, "eval_steps_per_second": 10.412, "step": 770 }, { "epoch": 1.24, "grad_norm": 4.919798851013184, "learning_rate": 9.478131709729831e-07, "loss": 0.3669, "step": 775 }, { "epoch": 1.248, "grad_norm": 3.693423271179199, "learning_rate": 9.303313567172986e-07, "loss": 0.3571, "step": 780 }, { "epoch": 1.248, "eval_loss": 0.7113474011421204, "eval_runtime": 11.9538, "eval_samples_per_second": 167.311, "eval_steps_per_second": 10.457, "step": 780 }, { "epoch": 1.256, "grad_norm": 4.305978298187256, "learning_rate": 9.129394997441964e-07, "loss": 0.3765, "step": 785 }, { "epoch": 1.264, "grad_norm": 4.349783897399902, "learning_rate": 8.956403464295061e-07, "loss": 0.3503, "step": 790 }, { "epoch": 1.264, "eval_loss": 0.7126505970954895, "eval_runtime": 12.0036, "eval_samples_per_second": 166.617, "eval_steps_per_second": 10.414, "step": 790 }, { "epoch": 1.272, "grad_norm": 4.4210662841796875, "learning_rate": 8.784366285100739e-07, "loss": 0.3583, "step": 795 }, { "epoch": 1.28, "grad_norm": 4.530226707458496, "learning_rate": 8.613310626523911e-07, "loss": 0.3742, "step": 800 }, { "epoch": 1.28, "eval_loss": 0.7158926129341125, "eval_runtime": 11.9999, "eval_samples_per_second": 166.668, "eval_steps_per_second": 10.417, "step": 800 }, { "epoch": 1.288, "grad_norm": 3.9921834468841553, "learning_rate": 8.443263500236011e-07, "loss": 0.371, "step": 805 }, { "epoch": 1.296, "grad_norm": 5.109288215637207, "learning_rate": 8.274251758649519e-07, "loss": 0.4087, "step": 810 }, { "epoch": 1.296, "eval_loss": 0.7139343619346619, "eval_runtime": 12.0284, "eval_samples_per_second": 166.274, "eval_steps_per_second": 10.392, "step": 810 }, { "epoch": 1.304, "grad_norm": 4.64515495300293, "learning_rate": 8.106302090677683e-07, "loss": 0.3806, "step": 815 }, { "epoch": 1.312, "grad_norm": 6.393331527709961, "learning_rate": 7.939441017520012e-07, "loss": 0.3781, "step": 820 }, { "epoch": 1.312, "eval_loss": 0.7073184847831726, "eval_runtime": 12.02, "eval_samples_per_second": 166.389, "eval_steps_per_second": 10.399, "step": 820 }, { "epoch": 1.32, "grad_norm": 4.550942897796631, "learning_rate": 7.773694888474268e-07, "loss": 0.3891, "step": 825 }, { "epoch": 1.328, "grad_norm": 4.257821083068848, "learning_rate": 7.609089876775628e-07, "loss": 0.3475, "step": 830 }, { "epoch": 1.328, "eval_loss": 0.7128930687904358, "eval_runtime": 12.01, "eval_samples_per_second": 166.528, "eval_steps_per_second": 10.408, "step": 830 }, { "epoch": 1.336, "grad_norm": 3.6469428539276123, "learning_rate": 7.445651975463588e-07, "loss": 0.3546, "step": 835 }, { "epoch": 1.3439999999999999, "grad_norm": 6.628065586090088, "learning_rate": 7.283406993277403e-07, "loss": 0.3724, "step": 840 }, { "epoch": 1.3439999999999999, "eval_loss": 0.7112878561019897, "eval_runtime": 12.3921, "eval_samples_per_second": 161.393, "eval_steps_per_second": 10.087, "step": 840 }, { "epoch": 1.3519999999999999, "grad_norm": 4.343158721923828, "learning_rate": 7.122380550580563e-07, "loss": 0.3659, "step": 845 }, { "epoch": 1.3599999999999999, "grad_norm": 5.028806686401367, "learning_rate": 6.962598075315047e-07, "loss": 0.3612, "step": 850 }, { "epoch": 1.3599999999999999, "eval_loss": 0.7130332589149475, "eval_runtime": 12.01, "eval_samples_per_second": 166.528, "eval_steps_per_second": 10.408, "step": 850 }, { "epoch": 1.3679999999999999, "grad_norm": 4.212679386138916, "learning_rate": 6.804084798985965e-07, "loss": 0.3495, "step": 855 }, { "epoch": 1.376, "grad_norm": 4.199646949768066, "learning_rate": 6.646865752677186e-07, "loss": 0.3254, "step": 860 }, { "epoch": 1.376, "eval_loss": 0.7138640880584717, "eval_runtime": 12.0153, "eval_samples_per_second": 166.455, "eval_steps_per_second": 10.403, "step": 860 }, { "epoch": 1.384, "grad_norm": 4.8986921310424805, "learning_rate": 6.490965763098655e-07, "loss": 0.3986, "step": 865 }, { "epoch": 1.392, "grad_norm": 4.102914810180664, "learning_rate": 6.336409448665989e-07, "loss": 0.3626, "step": 870 }, { "epoch": 1.392, "eval_loss": 0.7144864797592163, "eval_runtime": 11.9979, "eval_samples_per_second": 166.696, "eval_steps_per_second": 10.418, "step": 870 }, { "epoch": 1.4, "grad_norm": 5.214079856872559, "learning_rate": 6.183221215612905e-07, "loss": 0.3739, "step": 875 }, { "epoch": 1.408, "grad_norm": 5.322539806365967, "learning_rate": 6.031425254137223e-07, "loss": 0.351, "step": 880 }, { "epoch": 1.408, "eval_loss": 0.7146536707878113, "eval_runtime": 12.0493, "eval_samples_per_second": 165.985, "eval_steps_per_second": 10.374, "step": 880 }, { "epoch": 1.416, "grad_norm": 4.931739330291748, "learning_rate": 5.881045534580923e-07, "loss": 0.3971, "step": 885 }, { "epoch": 1.424, "grad_norm": 3.9749743938446045, "learning_rate": 5.732105803644987e-07, "loss": 0.3357, "step": 890 }, { "epoch": 1.424, "eval_loss": 0.7105372548103333, "eval_runtime": 12.0068, "eval_samples_per_second": 166.572, "eval_steps_per_second": 10.411, "step": 890 }, { "epoch": 1.432, "grad_norm": 4.823004245758057, "learning_rate": 5.584629580639495e-07, "loss": 0.4003, "step": 895 }, { "epoch": 1.44, "grad_norm": 4.934309482574463, "learning_rate": 5.438640153769653e-07, "loss": 0.371, "step": 900 }, { "epoch": 1.44, "eval_loss": 0.7078642845153809, "eval_runtime": 12.0222, "eval_samples_per_second": 166.359, "eval_steps_per_second": 10.397, "step": 900 }, { "epoch": 1.448, "grad_norm": 4.136751174926758, "learning_rate": 5.29416057645834e-07, "loss": 0.3825, "step": 905 }, { "epoch": 1.456, "grad_norm": 4.164857864379883, "learning_rate": 5.151213663705655e-07, "loss": 0.3566, "step": 910 }, { "epoch": 1.456, "eval_loss": 0.7069818377494812, "eval_runtime": 12.3015, "eval_samples_per_second": 162.582, "eval_steps_per_second": 10.161, "step": 910 }, { "epoch": 1.464, "grad_norm": 4.379539489746094, "learning_rate": 5.009821988486227e-07, "loss": 0.3733, "step": 915 }, { "epoch": 1.472, "grad_norm": 4.559187412261963, "learning_rate": 4.870007878184633e-07, "loss": 0.3762, "step": 920 }, { "epoch": 1.472, "eval_loss": 0.711800754070282, "eval_runtime": 11.9873, "eval_samples_per_second": 166.843, "eval_steps_per_second": 10.428, "step": 920 }, { "epoch": 1.48, "grad_norm": 4.373340606689453, "learning_rate": 4.731793411069669e-07, "loss": 0.3768, "step": 925 }, { "epoch": 1.488, "grad_norm": 4.8367838859558105, "learning_rate": 4.5952004128079276e-07, "loss": 0.3755, "step": 930 }, { "epoch": 1.488, "eval_loss": 0.7125899791717529, "eval_runtime": 12.4233, "eval_samples_per_second": 160.988, "eval_steps_per_second": 10.062, "step": 930 }, { "epoch": 1.496, "grad_norm": 6.753279685974121, "learning_rate": 4.460250453017264e-07, "loss": 0.374, "step": 935 }, { "epoch": 1.504, "grad_norm": 4.952857971191406, "learning_rate": 4.3269648418607197e-07, "loss": 0.3595, "step": 940 }, { "epoch": 1.504, "eval_loss": 0.7107095718383789, "eval_runtime": 11.9826, "eval_samples_per_second": 166.909, "eval_steps_per_second": 10.432, "step": 940 }, { "epoch": 1.512, "grad_norm": 4.235396385192871, "learning_rate": 4.1953646266813963e-07, "loss": 0.4008, "step": 945 }, { "epoch": 1.52, "grad_norm": 4.504904747009277, "learning_rate": 4.06547058867883e-07, "loss": 0.3828, "step": 950 }, { "epoch": 1.52, "eval_loss": 0.7118133902549744, "eval_runtime": 12.4744, "eval_samples_per_second": 160.328, "eval_steps_per_second": 10.02, "step": 950 }, { "epoch": 1.528, "grad_norm": 3.8867733478546143, "learning_rate": 3.9373032396273926e-07, "loss": 0.3498, "step": 955 }, { "epoch": 1.536, "grad_norm": 4.430863857269287, "learning_rate": 3.8108828186372685e-07, "loss": 0.3793, "step": 960 }, { "epoch": 1.536, "eval_loss": 0.7172751426696777, "eval_runtime": 12.065, "eval_samples_per_second": 165.768, "eval_steps_per_second": 10.361, "step": 960 }, { "epoch": 1.544, "grad_norm": 4.033322334289551, "learning_rate": 3.686229288958442e-07, "loss": 0.3693, "step": 965 }, { "epoch": 1.552, "grad_norm": 4.288609027862549, "learning_rate": 3.56336233482828e-07, "loss": 0.3446, "step": 970 }, { "epoch": 1.552, "eval_loss": 0.7149741053581238, "eval_runtime": 12.0179, "eval_samples_per_second": 166.418, "eval_steps_per_second": 10.401, "step": 970 }, { "epoch": 1.56, "grad_norm": 4.438851356506348, "learning_rate": 3.442301358363163e-07, "loss": 0.3672, "step": 975 }, { "epoch": 1.568, "grad_norm": 4.459841728210449, "learning_rate": 3.32306547649465e-07, "loss": 0.3707, "step": 980 }, { "epoch": 1.568, "eval_loss": 0.7135123014450073, "eval_runtime": 11.9963, "eval_samples_per_second": 166.718, "eval_steps_per_second": 10.42, "step": 980 }, { "epoch": 1.576, "grad_norm": 3.922914743423462, "learning_rate": 3.2056735179507165e-07, "loss": 0.3938, "step": 985 }, { "epoch": 1.584, "grad_norm": 4.287975788116455, "learning_rate": 3.0901440202824693e-07, "loss": 0.3604, "step": 990 }, { "epoch": 1.584, "eval_loss": 0.714073657989502, "eval_runtime": 11.9966, "eval_samples_per_second": 166.714, "eval_steps_per_second": 10.42, "step": 990 }, { "epoch": 1.592, "grad_norm": 4.261760711669922, "learning_rate": 2.976495226936849e-07, "loss": 0.3538, "step": 995 }, { "epoch": 1.6, "grad_norm": 4.7926812171936035, "learning_rate": 2.86474508437579e-07, "loss": 0.3441, "step": 1000 }, { "epoch": 1.6, "eval_loss": 0.7137433290481567, "eval_runtime": 12.033, "eval_samples_per_second": 166.21, "eval_steps_per_second": 10.388, "step": 1000 }, { "epoch": 1.608, "grad_norm": 4.072969436645508, "learning_rate": 2.754911239242241e-07, "loss": 0.3653, "step": 1005 }, { "epoch": 1.616, "grad_norm": 4.100124835968018, "learning_rate": 2.647011035573588e-07, "loss": 0.3705, "step": 1010 }, { "epoch": 1.616, "eval_loss": 0.7153717279434204, "eval_runtime": 12.9324, "eval_samples_per_second": 154.65, "eval_steps_per_second": 9.666, "step": 1010 }, { "epoch": 1.624, "grad_norm": 4.28076696395874, "learning_rate": 2.5410615120628085e-07, "loss": 0.3299, "step": 1015 }, { "epoch": 1.6320000000000001, "grad_norm": 4.782041072845459, "learning_rate": 2.437079399367875e-07, "loss": 0.3857, "step": 1020 }, { "epoch": 1.6320000000000001, "eval_loss": 0.7188824415206909, "eval_runtime": 13.608, "eval_samples_per_second": 146.972, "eval_steps_per_second": 9.186, "step": 1020 }, { "epoch": 1.6400000000000001, "grad_norm": 4.323799133300781, "learning_rate": 2.3350811174697772e-07, "loss": 0.3702, "step": 1025 }, { "epoch": 1.6480000000000001, "grad_norm": 4.875490188598633, "learning_rate": 2.235082773079624e-07, "loss": 0.3952, "step": 1030 }, { "epoch": 1.6480000000000001, "eval_loss": 0.7147963643074036, "eval_runtime": 14.1342, "eval_samples_per_second": 141.501, "eval_steps_per_second": 8.844, "step": 1030 }, { "epoch": 1.6560000000000001, "grad_norm": 4.45432710647583, "learning_rate": 2.1371001570952186e-07, "loss": 0.4075, "step": 1035 }, { "epoch": 1.6640000000000001, "grad_norm": 4.542659759521484, "learning_rate": 2.0411487421074708e-07, "loss": 0.3815, "step": 1040 }, { "epoch": 1.6640000000000001, "eval_loss": 0.7115746140480042, "eval_runtime": 15.5186, "eval_samples_per_second": 128.878, "eval_steps_per_second": 8.055, "step": 1040 }, { "epoch": 1.6720000000000002, "grad_norm": 4.134467601776123, "learning_rate": 1.9472436799571147e-07, "loss": 0.3594, "step": 1045 }, { "epoch": 1.6800000000000002, "grad_norm": 4.484969615936279, "learning_rate": 1.8553997993420495e-07, "loss": 0.3507, "step": 1050 }, { "epoch": 1.6800000000000002, "eval_loss": 0.7108085751533508, "eval_runtime": 13.7067, "eval_samples_per_second": 145.914, "eval_steps_per_second": 9.12, "step": 1050 }, { "epoch": 1.688, "grad_norm": 5.112241744995117, "learning_rate": 1.7656316034757024e-07, "loss": 0.3736, "step": 1055 }, { "epoch": 1.696, "grad_norm": 4.2886786460876465, "learning_rate": 1.6779532677968329e-07, "loss": 0.3662, "step": 1060 }, { "epoch": 1.696, "eval_loss": 0.7123615741729736, "eval_runtime": 13.8684, "eval_samples_per_second": 144.212, "eval_steps_per_second": 9.013, "step": 1060 }, { "epoch": 1.704, "grad_norm": 4.763908863067627, "learning_rate": 1.5923786377310435e-07, "loss": 0.3736, "step": 1065 }, { "epoch": 1.712, "grad_norm": 4.227341651916504, "learning_rate": 1.508921226504434e-07, "loss": 0.3581, "step": 1070 }, { "epoch": 1.712, "eval_loss": 0.7136136293411255, "eval_runtime": 13.5861, "eval_samples_per_second": 147.209, "eval_steps_per_second": 9.201, "step": 1070 }, { "epoch": 1.72, "grad_norm": 4.712426662445068, "learning_rate": 1.4275942130097098e-07, "loss": 0.374, "step": 1075 }, { "epoch": 1.728, "grad_norm": 4.427713871002197, "learning_rate": 1.348410439725065e-07, "loss": 0.3867, "step": 1080 }, { "epoch": 1.728, "eval_loss": 0.7132411003112793, "eval_runtime": 13.1772, "eval_samples_per_second": 151.778, "eval_steps_per_second": 9.486, "step": 1080 }, { "epoch": 1.736, "grad_norm": 4.723916530609131, "learning_rate": 1.271382410686237e-07, "loss": 0.3859, "step": 1085 }, { "epoch": 1.744, "grad_norm": 4.608342170715332, "learning_rate": 1.1965222895119444e-07, "loss": 0.3707, "step": 1090 }, { "epoch": 1.744, "eval_loss": 0.7126567959785461, "eval_runtime": 13.2064, "eval_samples_per_second": 151.442, "eval_steps_per_second": 9.465, "step": 1090 }, { "epoch": 1.752, "grad_norm": 4.681605815887451, "learning_rate": 1.123841897483131e-07, "loss": 0.3439, "step": 1095 }, { "epoch": 1.76, "grad_norm": 5.060537338256836, "learning_rate": 1.0533527116762298e-07, "loss": 0.4078, "step": 1100 }, { "epoch": 1.76, "eval_loss": 0.7122145891189575, "eval_runtime": 12.046, "eval_samples_per_second": 166.03, "eval_steps_per_second": 10.377, "step": 1100 }, { "epoch": 1.768, "grad_norm": 4.388428211212158, "learning_rate": 9.850658631508197e-08, "loss": 0.3318, "step": 1105 }, { "epoch": 1.776, "grad_norm": 4.610175609588623, "learning_rate": 9.18992135191889e-08, "loss": 0.3713, "step": 1110 }, { "epoch": 1.776, "eval_loss": 0.711074709892273, "eval_runtime": 12.0327, "eval_samples_per_second": 166.213, "eval_steps_per_second": 10.388, "step": 1110 }, { "epoch": 1.784, "grad_norm": 4.19280481338501, "learning_rate": 8.551419616070322e-08, "loss": 0.3503, "step": 1115 }, { "epoch": 1.792, "grad_norm": 4.062910079956055, "learning_rate": 7.935254250788366e-08, "loss": 0.3525, "step": 1120 }, { "epoch": 1.792, "eval_loss": 0.7110276818275452, "eval_runtime": 12.001, "eval_samples_per_second": 166.653, "eval_steps_per_second": 10.416, "step": 1120 }, { "epoch": 1.8, "grad_norm": 4.942102909088135, "learning_rate": 7.341522555726971e-08, "loss": 0.3293, "step": 1125 }, { "epoch": 1.808, "grad_norm": 4.574679851531982, "learning_rate": 6.770318288003558e-08, "loss": 0.3873, "step": 1130 }, { "epoch": 1.808, "eval_loss": 0.7115213871002197, "eval_runtime": 12.0697, "eval_samples_per_second": 165.704, "eval_steps_per_second": 10.357, "step": 1130 }, { "epoch": 1.8159999999999998, "grad_norm": 4.616149425506592, "learning_rate": 6.221731647393609e-08, "loss": 0.3684, "step": 1135 }, { "epoch": 1.8239999999999998, "grad_norm": 5.204924583435059, "learning_rate": 5.6958492620871105e-08, "loss": 0.4008, "step": 1140 }, { "epoch": 1.8239999999999998, "eval_loss": 0.7119244337081909, "eval_runtime": 12.3474, "eval_samples_per_second": 161.977, "eval_steps_per_second": 10.124, "step": 1140 }, { "epoch": 1.8319999999999999, "grad_norm": 4.088404178619385, "learning_rate": 5.192754175008918e-08, "loss": 0.3352, "step": 1145 }, { "epoch": 1.8399999999999999, "grad_norm": 4.6251606941223145, "learning_rate": 4.712525830705339e-08, "loss": 0.3889, "step": 1150 }, { "epoch": 1.8399999999999999, "eval_loss": 0.711883008480072, "eval_runtime": 12.5273, "eval_samples_per_second": 159.651, "eval_steps_per_second": 9.978, "step": 1150 }, { "epoch": 1.8479999999999999, "grad_norm": 4.4832072257995605, "learning_rate": 4.255240062798904e-08, "loss": 0.3426, "step": 1155 }, { "epoch": 1.8559999999999999, "grad_norm": 4.493223190307617, "learning_rate": 3.820969082013415e-08, "loss": 0.3591, "step": 1160 }, { "epoch": 1.8559999999999999, "eval_loss": 0.711625337600708, "eval_runtime": 12.5893, "eval_samples_per_second": 158.865, "eval_steps_per_second": 9.929, "step": 1160 }, { "epoch": 1.8639999999999999, "grad_norm": 3.978353261947632, "learning_rate": 3.409781464770978e-08, "loss": 0.3594, "step": 1165 }, { "epoch": 1.8719999999999999, "grad_norm": 4.505449295043945, "learning_rate": 3.021742142362971e-08, "loss": 0.3843, "step": 1170 }, { "epoch": 1.8719999999999999, "eval_loss": 0.7115885019302368, "eval_runtime": 12.4393, "eval_samples_per_second": 160.781, "eval_steps_per_second": 10.049, "step": 1170 }, { "epoch": 1.88, "grad_norm": 4.039797782897949, "learning_rate": 2.6569123906967087e-08, "loss": 0.3353, "step": 1175 }, { "epoch": 1.888, "grad_norm": 5.2295732498168945, "learning_rate": 2.3153498206192002e-08, "loss": 0.3713, "step": 1180 }, { "epoch": 1.888, "eval_loss": 0.7115111351013184, "eval_runtime": 12.0362, "eval_samples_per_second": 166.166, "eval_steps_per_second": 10.385, "step": 1180 }, { "epoch": 1.896, "grad_norm": 4.890398979187012, "learning_rate": 1.9971083688197945e-08, "loss": 0.3765, "step": 1185 }, { "epoch": 1.904, "grad_norm": 4.141822814941406, "learning_rate": 1.7022382893129074e-08, "loss": 0.3659, "step": 1190 }, { "epoch": 1.904, "eval_loss": 0.7115259766578674, "eval_runtime": 12.0459, "eval_samples_per_second": 166.032, "eval_steps_per_second": 10.377, "step": 1190 }, { "epoch": 1.912, "grad_norm": 4.194301128387451, "learning_rate": 1.430786145502322e-08, "loss": 0.3534, "step": 1195 }, { "epoch": 1.92, "grad_norm": 4.5958051681518555, "learning_rate": 1.1827948028283353e-08, "loss": 0.3588, "step": 1200 }, { "epoch": 1.92, "eval_loss": 0.7115083932876587, "eval_runtime": 12.3331, "eval_samples_per_second": 162.165, "eval_steps_per_second": 10.135, "step": 1200 }, { "epoch": 1.928, "grad_norm": 4.880290508270264, "learning_rate": 9.583034219987408e-09, "loss": 0.3672, "step": 1205 }, { "epoch": 1.936, "grad_norm": 3.973886251449585, "learning_rate": 7.57347452804974e-09, "loss": 0.3556, "step": 1210 }, { "epoch": 1.936, "eval_loss": 0.7114896774291992, "eval_runtime": 12.1956, "eval_samples_per_second": 163.993, "eval_steps_per_second": 10.25, "step": 1210 }, { "epoch": 1.944, "grad_norm": 3.817405939102173, "learning_rate": 5.799586285241243e-09, "loss": 0.3265, "step": 1215 }, { "epoch": 1.952, "grad_norm": 4.495524883270264, "learning_rate": 4.261649609079099e-09, "loss": 0.3278, "step": 1220 }, { "epoch": 1.952, "eval_loss": 0.7115508317947388, "eval_runtime": 11.9967, "eval_samples_per_second": 166.712, "eval_steps_per_second": 10.42, "step": 1220 }, { "epoch": 1.96, "grad_norm": 4.464719772338867, "learning_rate": 2.9599073575926614e-09, "loss": 0.3817, "step": 1225 }, { "epoch": 1.968, "grad_norm": 4.08857536315918, "learning_rate": 1.8945650909737986e-09, "loss": 0.3642, "step": 1230 }, { "epoch": 1.968, "eval_loss": 0.7115161418914795, "eval_runtime": 12.0169, "eval_samples_per_second": 166.433, "eval_steps_per_second": 10.402, "step": 1230 }, { "epoch": 1.976, "grad_norm": 4.385359764099121, "learning_rate": 1.0657910391161929e-09, "loss": 0.3696, "step": 1235 }, { "epoch": 1.984, "grad_norm": 4.141005992889404, "learning_rate": 4.737160750500902e-10, "loss": 0.3718, "step": 1240 }, { "epoch": 1.984, "eval_loss": 0.7114637494087219, "eval_runtime": 12.0025, "eval_samples_per_second": 166.632, "eval_steps_per_second": 10.414, "step": 1240 }, { "epoch": 1.992, "grad_norm": 5.190242767333984, "learning_rate": 1.184336942758324e-10, "loss": 0.3882, "step": 1245 }, { "epoch": 2.0, "grad_norm": 4.157950401306152, "learning_rate": 0.0, "loss": 0.3611, "step": 1250 }, { "epoch": 2.0, "eval_loss": 0.7115355134010315, "eval_runtime": 11.9974, "eval_samples_per_second": 166.703, "eval_steps_per_second": 10.419, "step": 1250 }, { "epoch": 2.0, "step": 1250, "total_flos": 1.8775693783885414e+17, "train_loss": 0.5100525261878968, "train_runtime": 5131.1185, "train_samples_per_second": 3.898, "train_steps_per_second": 0.244 } ], "logging_steps": 5, "max_steps": 1250, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8775693783885414e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }