{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6073619631901841, "eval_steps": 66, "global_step": 198, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003067484662576687, "grad_norm": 1.0254060683433053, "learning_rate": 5e-06, "loss": 1.9557, "step": 1 }, { "epoch": 0.003067484662576687, "eval_loss": 2.6437082290649414, "eval_runtime": 55.4152, "eval_samples_per_second": 1.805, "eval_steps_per_second": 0.126, "step": 1 }, { "epoch": 0.006134969325153374, "grad_norm": 0.5293660177597584, "learning_rate": 1e-05, "loss": 1.9268, "step": 2 }, { "epoch": 0.009202453987730062, "grad_norm": 0.6031237810490027, "learning_rate": 1.5e-05, "loss": 1.9666, "step": 3 }, { "epoch": 0.012269938650306749, "grad_norm": 0.5216691776821837, "learning_rate": 2e-05, "loss": 1.9176, "step": 4 }, { "epoch": 0.015337423312883436, "grad_norm": 0.45736012052053565, "learning_rate": 2.5e-05, "loss": 1.9172, "step": 5 }, { "epoch": 0.018404907975460124, "grad_norm": 0.4721331330094363, "learning_rate": 3e-05, "loss": 1.9038, "step": 6 }, { "epoch": 0.02147239263803681, "grad_norm": 0.4699970169077475, "learning_rate": 3.5e-05, "loss": 1.972, "step": 7 }, { "epoch": 0.024539877300613498, "grad_norm": 0.5998147513619175, "learning_rate": 4e-05, "loss": 1.9115, "step": 8 }, { "epoch": 0.027607361963190184, "grad_norm": 0.39982194363235835, "learning_rate": 4.5e-05, "loss": 1.9362, "step": 9 }, { "epoch": 0.03067484662576687, "grad_norm": 0.41316001445589784, "learning_rate": 5e-05, "loss": 1.9367, "step": 10 }, { "epoch": 0.03374233128834356, "grad_norm": 1.978145485337434, "learning_rate": 5.500000000000001e-05, "loss": 1.9018, "step": 11 }, { "epoch": 0.03680981595092025, "grad_norm": 0.5763394527514556, "learning_rate": 6e-05, "loss": 1.9239, "step": 12 }, { "epoch": 0.03987730061349693, "grad_norm": 0.6656094180752898, "learning_rate": 6.500000000000001e-05, "loss": 1.8601, "step": 13 }, { "epoch": 0.04294478527607362, "grad_norm": 0.3779888950718134, "learning_rate": 7e-05, "loss": 1.9467, "step": 14 }, { "epoch": 0.046012269938650305, "grad_norm": 0.4210293643738542, "learning_rate": 7.500000000000001e-05, "loss": 1.9491, "step": 15 }, { "epoch": 0.049079754601226995, "grad_norm": 0.284470526924256, "learning_rate": 8e-05, "loss": 1.96, "step": 16 }, { "epoch": 0.05214723926380368, "grad_norm": 0.4511944107373649, "learning_rate": 8.5e-05, "loss": 1.9688, "step": 17 }, { "epoch": 0.05521472392638037, "grad_norm": 0.5213533339486691, "learning_rate": 9e-05, "loss": 1.8883, "step": 18 }, { "epoch": 0.05828220858895705, "grad_norm": 0.3529095514608687, "learning_rate": 9.5e-05, "loss": 1.9652, "step": 19 }, { "epoch": 0.06134969325153374, "grad_norm": 0.37388599933304034, "learning_rate": 0.0001, "loss": 1.9701, "step": 20 }, { "epoch": 0.06441717791411043, "grad_norm": 0.6715118705762056, "learning_rate": 9.999762843192279e-05, "loss": 1.9591, "step": 21 }, { "epoch": 0.06748466257668712, "grad_norm": 0.3339477252516958, "learning_rate": 9.999051397766162e-05, "loss": 1.8851, "step": 22 }, { "epoch": 0.0705521472392638, "grad_norm": 0.38292464677189253, "learning_rate": 9.997865738710147e-05, "loss": 1.9505, "step": 23 }, { "epoch": 0.0736196319018405, "grad_norm": 0.46332198422774334, "learning_rate": 9.996205990996288e-05, "loss": 1.8819, "step": 24 }, { "epoch": 0.07668711656441718, "grad_norm": 0.32033971816842144, "learning_rate": 9.994072329567015e-05, "loss": 1.9778, "step": 25 }, { "epoch": 0.07975460122699386, "grad_norm": 0.32764211011622874, "learning_rate": 9.991464979316699e-05, "loss": 2.0035, "step": 26 }, { "epoch": 0.08282208588957055, "grad_norm": 0.35749570152374016, "learning_rate": 9.988384215067945e-05, "loss": 1.897, "step": 27 }, { "epoch": 0.08588957055214724, "grad_norm": 0.47517571287279864, "learning_rate": 9.984830361542625e-05, "loss": 1.9916, "step": 28 }, { "epoch": 0.08895705521472393, "grad_norm": 0.37844919890358947, "learning_rate": 9.980803793327656e-05, "loss": 1.9787, "step": 29 }, { "epoch": 0.09202453987730061, "grad_norm": 0.3392783686369942, "learning_rate": 9.976304934835509e-05, "loss": 1.9915, "step": 30 }, { "epoch": 0.0950920245398773, "grad_norm": 0.3672803421436023, "learning_rate": 9.97133426025948e-05, "loss": 1.9237, "step": 31 }, { "epoch": 0.09815950920245399, "grad_norm": 0.3717328207326788, "learning_rate": 9.965892293523712e-05, "loss": 1.8755, "step": 32 }, { "epoch": 0.10122699386503067, "grad_norm": 0.41380648649234975, "learning_rate": 9.959979608227961e-05, "loss": 2.021, "step": 33 }, { "epoch": 0.10429447852760736, "grad_norm": 1.0263652968268477, "learning_rate": 9.95359682758715e-05, "loss": 1.9528, "step": 34 }, { "epoch": 0.10736196319018405, "grad_norm": 0.9592485389518621, "learning_rate": 9.946744624365668e-05, "loss": 1.9055, "step": 35 }, { "epoch": 0.11042944785276074, "grad_norm": 0.43725271995243464, "learning_rate": 9.939423720806468e-05, "loss": 1.9306, "step": 36 }, { "epoch": 0.11349693251533742, "grad_norm": 0.3175345165915247, "learning_rate": 9.931634888554937e-05, "loss": 1.9159, "step": 37 }, { "epoch": 0.1165644171779141, "grad_norm": 0.4731845530714391, "learning_rate": 9.923378948577559e-05, "loss": 1.993, "step": 38 }, { "epoch": 0.1196319018404908, "grad_norm": 0.3274613986874974, "learning_rate": 9.914656771075387e-05, "loss": 1.8971, "step": 39 }, { "epoch": 0.12269938650306748, "grad_norm": 0.4175774555118117, "learning_rate": 9.90546927539232e-05, "loss": 1.9529, "step": 40 }, { "epoch": 0.12576687116564417, "grad_norm": 0.4723214170983414, "learning_rate": 9.895817429918203e-05, "loss": 1.9775, "step": 41 }, { "epoch": 0.12883435582822086, "grad_norm": 0.5517874328207245, "learning_rate": 9.885702251986753e-05, "loss": 1.9704, "step": 42 }, { "epoch": 0.13190184049079753, "grad_norm": 0.7112812651734346, "learning_rate": 9.875124807768324e-05, "loss": 1.9396, "step": 43 }, { "epoch": 0.13496932515337423, "grad_norm": 0.4122128687502141, "learning_rate": 9.864086212157544e-05, "loss": 1.9495, "step": 44 }, { "epoch": 0.13803680981595093, "grad_norm": 0.33784719392668305, "learning_rate": 9.852587628655787e-05, "loss": 1.8904, "step": 45 }, { "epoch": 0.1411042944785276, "grad_norm": 0.281184642101553, "learning_rate": 9.840630269248549e-05, "loss": 1.9156, "step": 46 }, { "epoch": 0.1441717791411043, "grad_norm": 0.7601259994555819, "learning_rate": 9.828215394277687e-05, "loss": 1.9516, "step": 47 }, { "epoch": 0.147239263803681, "grad_norm": 0.36449789385058556, "learning_rate": 9.815344312308587e-05, "loss": 1.9182, "step": 48 }, { "epoch": 0.15030674846625766, "grad_norm": 0.32613788602651017, "learning_rate": 9.80201837999223e-05, "loss": 1.9367, "step": 49 }, { "epoch": 0.15337423312883436, "grad_norm": 0.4437625986967123, "learning_rate": 9.788239001922206e-05, "loss": 1.8838, "step": 50 }, { "epoch": 0.15644171779141106, "grad_norm": 0.7368917728925937, "learning_rate": 9.774007630486651e-05, "loss": 1.9125, "step": 51 }, { "epoch": 0.15950920245398773, "grad_norm": 0.43661779665549927, "learning_rate": 9.759325765715176e-05, "loss": 1.9309, "step": 52 }, { "epoch": 0.16257668711656442, "grad_norm": 0.27925292993087114, "learning_rate": 9.744194955120748e-05, "loss": 1.9374, "step": 53 }, { "epoch": 0.1656441717791411, "grad_norm": 0.46390992287233235, "learning_rate": 9.728616793536588e-05, "loss": 1.9425, "step": 54 }, { "epoch": 0.1687116564417178, "grad_norm": 0.2514992126441497, "learning_rate": 9.712592922948057e-05, "loss": 1.9482, "step": 55 }, { "epoch": 0.17177914110429449, "grad_norm": 0.2703640459793386, "learning_rate": 9.6961250323196e-05, "loss": 1.8895, "step": 56 }, { "epoch": 0.17484662576687116, "grad_norm": 0.561176184389631, "learning_rate": 9.679214857416717e-05, "loss": 1.928, "step": 57 }, { "epoch": 0.17791411042944785, "grad_norm": 0.29671160399395613, "learning_rate": 9.661864180623003e-05, "loss": 1.9542, "step": 58 }, { "epoch": 0.18098159509202455, "grad_norm": 0.28259623949277235, "learning_rate": 9.644074830752293e-05, "loss": 1.9519, "step": 59 }, { "epoch": 0.18404907975460122, "grad_norm": 0.32102511884381013, "learning_rate": 9.625848682855884e-05, "loss": 1.8776, "step": 60 }, { "epoch": 0.18711656441717792, "grad_norm": 1.6811025479349568, "learning_rate": 9.607187658024912e-05, "loss": 1.9016, "step": 61 }, { "epoch": 0.1901840490797546, "grad_norm": 0.2951789033160566, "learning_rate": 9.588093723187857e-05, "loss": 1.9204, "step": 62 }, { "epoch": 0.19325153374233128, "grad_norm": 0.35508359779387055, "learning_rate": 9.568568890903221e-05, "loss": 1.9144, "step": 63 }, { "epoch": 0.19631901840490798, "grad_norm": 0.3620090919465414, "learning_rate": 9.548615219147405e-05, "loss": 1.8699, "step": 64 }, { "epoch": 0.19938650306748465, "grad_norm": 0.3475528667692185, "learning_rate": 9.528234811097782e-05, "loss": 1.855, "step": 65 }, { "epoch": 0.20245398773006135, "grad_norm": 0.2922421805064443, "learning_rate": 9.507429814911024e-05, "loss": 1.8648, "step": 66 }, { "epoch": 0.20245398773006135, "eval_loss": 2.6012535095214844, "eval_runtime": 55.5905, "eval_samples_per_second": 1.799, "eval_steps_per_second": 0.126, "step": 66 }, { "epoch": 0.20552147239263804, "grad_norm": 0.525841804476554, "learning_rate": 9.486202423496679e-05, "loss": 1.8319, "step": 67 }, { "epoch": 0.2085889570552147, "grad_norm": 0.33648300397500824, "learning_rate": 9.46455487428603e-05, "loss": 1.889, "step": 68 }, { "epoch": 0.2116564417177914, "grad_norm": 0.2982307248009996, "learning_rate": 9.442489448996261e-05, "loss": 1.9004, "step": 69 }, { "epoch": 0.2147239263803681, "grad_norm": 1.3863327829569763, "learning_rate": 9.42000847338996e-05, "loss": 1.9529, "step": 70 }, { "epoch": 0.21779141104294478, "grad_norm": 0.3507002144386185, "learning_rate": 9.397114317029975e-05, "loss": 1.9561, "step": 71 }, { "epoch": 0.22085889570552147, "grad_norm": 0.26047398296778806, "learning_rate": 9.373809393029654e-05, "loss": 1.9666, "step": 72 }, { "epoch": 0.22392638036809817, "grad_norm": 0.31142946623961487, "learning_rate": 9.350096157798505e-05, "loss": 1.9669, "step": 73 }, { "epoch": 0.22699386503067484, "grad_norm": 0.6059103096641723, "learning_rate": 9.325977110783264e-05, "loss": 1.8732, "step": 74 }, { "epoch": 0.23006134969325154, "grad_norm": 0.2988013721693877, "learning_rate": 9.301454794204464e-05, "loss": 1.9106, "step": 75 }, { "epoch": 0.2331288343558282, "grad_norm": 0.3322046656491888, "learning_rate": 9.276531792788471e-05, "loss": 1.9082, "step": 76 }, { "epoch": 0.2361963190184049, "grad_norm": 0.4251032871261752, "learning_rate": 9.251210733495039e-05, "loss": 1.873, "step": 77 }, { "epoch": 0.2392638036809816, "grad_norm": 0.5316920231449993, "learning_rate": 9.225494285240432e-05, "loss": 1.9237, "step": 78 }, { "epoch": 0.24233128834355827, "grad_norm": 0.3879744017362554, "learning_rate": 9.199385158616103e-05, "loss": 1.9097, "step": 79 }, { "epoch": 0.24539877300613497, "grad_norm": 0.34345641723744996, "learning_rate": 9.172886105602998e-05, "loss": 1.8854, "step": 80 }, { "epoch": 0.24846625766871167, "grad_norm": 0.28939057442749516, "learning_rate": 9.145999919281481e-05, "loss": 1.8964, "step": 81 }, { "epoch": 0.25153374233128833, "grad_norm": 1.3304291601448779, "learning_rate": 9.118729433536938e-05, "loss": 1.9008, "step": 82 }, { "epoch": 0.254601226993865, "grad_norm": 0.31217347045844684, "learning_rate": 9.091077522761079e-05, "loss": 1.9452, "step": 83 }, { "epoch": 0.25766871165644173, "grad_norm": 0.437112787156602, "learning_rate": 9.063047101548962e-05, "loss": 1.8645, "step": 84 }, { "epoch": 0.2607361963190184, "grad_norm": 0.29101868827151584, "learning_rate": 9.034641124391795e-05, "loss": 1.9555, "step": 85 }, { "epoch": 0.26380368098159507, "grad_norm": 0.3581357829575129, "learning_rate": 9.005862585365517e-05, "loss": 1.8963, "step": 86 }, { "epoch": 0.2668711656441718, "grad_norm": 0.2870730838141048, "learning_rate": 8.976714517815216e-05, "loss": 1.9004, "step": 87 }, { "epoch": 0.26993865030674846, "grad_norm": 0.432917577879272, "learning_rate": 8.947199994035401e-05, "loss": 1.9512, "step": 88 }, { "epoch": 0.27300613496932513, "grad_norm": 0.2818163590615669, "learning_rate": 8.917322124946182e-05, "loss": 1.951, "step": 89 }, { "epoch": 0.27607361963190186, "grad_norm": 0.35253042451634276, "learning_rate": 8.88708405976536e-05, "loss": 1.8632, "step": 90 }, { "epoch": 0.2791411042944785, "grad_norm": 0.2590173941857926, "learning_rate": 8.856488985676495e-05, "loss": 1.9345, "step": 91 }, { "epoch": 0.2822085889570552, "grad_norm": 0.27658536342174034, "learning_rate": 8.825540127492967e-05, "loss": 1.9323, "step": 92 }, { "epoch": 0.2852760736196319, "grad_norm": 0.4745120742354108, "learning_rate": 8.794240747318066e-05, "loss": 1.9018, "step": 93 }, { "epoch": 0.2883435582822086, "grad_norm": 0.26070920298493305, "learning_rate": 8.762594144201167e-05, "loss": 1.9387, "step": 94 }, { "epoch": 0.29141104294478526, "grad_norm": 0.5280391087971116, "learning_rate": 8.73060365378999e-05, "loss": 1.862, "step": 95 }, { "epoch": 0.294478527607362, "grad_norm": 0.2507206580092369, "learning_rate": 8.698272647979012e-05, "loss": 1.9286, "step": 96 }, { "epoch": 0.29754601226993865, "grad_norm": 0.26686171742356907, "learning_rate": 8.665604534554075e-05, "loss": 1.8256, "step": 97 }, { "epoch": 0.3006134969325153, "grad_norm": 0.2528790515143118, "learning_rate": 8.632602756833172e-05, "loss": 1.9627, "step": 98 }, { "epoch": 0.30368098159509205, "grad_norm": 0.3485782871675419, "learning_rate": 8.599270793303524e-05, "loss": 1.8465, "step": 99 }, { "epoch": 0.3067484662576687, "grad_norm": 0.26793745211248754, "learning_rate": 8.565612157254943e-05, "loss": 1.8918, "step": 100 }, { "epoch": 0.3098159509202454, "grad_norm": 0.25037629545985934, "learning_rate": 8.531630396409507e-05, "loss": 1.8663, "step": 101 }, { "epoch": 0.3128834355828221, "grad_norm": 0.2592216678438039, "learning_rate": 8.497329092547627e-05, "loss": 1.9302, "step": 102 }, { "epoch": 0.3159509202453988, "grad_norm": 0.26334854065125896, "learning_rate": 8.46271186113051e-05, "loss": 1.8775, "step": 103 }, { "epoch": 0.31901840490797545, "grad_norm": 0.2626800828290798, "learning_rate": 8.42778235091909e-05, "loss": 1.9522, "step": 104 }, { "epoch": 0.3220858895705521, "grad_norm": 0.24256073020090993, "learning_rate": 8.392544243589427e-05, "loss": 1.9295, "step": 105 }, { "epoch": 0.32515337423312884, "grad_norm": 0.2484627790629833, "learning_rate": 8.357001253344653e-05, "loss": 1.9287, "step": 106 }, { "epoch": 0.3282208588957055, "grad_norm": 0.31955912356468386, "learning_rate": 8.32115712652348e-05, "loss": 1.9886, "step": 107 }, { "epoch": 0.3312883435582822, "grad_norm": 0.2434642052279205, "learning_rate": 8.285015641205325e-05, "loss": 1.9623, "step": 108 }, { "epoch": 0.3343558282208589, "grad_norm": 0.28552157930226957, "learning_rate": 8.248580606812096e-05, "loss": 1.8705, "step": 109 }, { "epoch": 0.3374233128834356, "grad_norm": 0.27716036272992295, "learning_rate": 8.211855863706654e-05, "loss": 1.8958, "step": 110 }, { "epoch": 0.34049079754601225, "grad_norm": 0.40776621930987433, "learning_rate": 8.174845282788041e-05, "loss": 1.9219, "step": 111 }, { "epoch": 0.34355828220858897, "grad_norm": 0.27546145956009194, "learning_rate": 8.137552765083466e-05, "loss": 1.8948, "step": 112 }, { "epoch": 0.34662576687116564, "grad_norm": 0.2463745150403918, "learning_rate": 8.09998224133713e-05, "loss": 1.907, "step": 113 }, { "epoch": 0.3496932515337423, "grad_norm": 0.2530717713867962, "learning_rate": 8.062137671595911e-05, "loss": 1.8945, "step": 114 }, { "epoch": 0.35276073619631904, "grad_norm": 0.26804689577846247, "learning_rate": 8.024023044791964e-05, "loss": 1.8984, "step": 115 }, { "epoch": 0.3558282208588957, "grad_norm": 0.2922869142073029, "learning_rate": 7.985642378322276e-05, "loss": 1.9499, "step": 116 }, { "epoch": 0.3588957055214724, "grad_norm": 0.2302050850660013, "learning_rate": 7.946999717625221e-05, "loss": 1.9398, "step": 117 }, { "epoch": 0.3619631901840491, "grad_norm": 0.4179152288704764, "learning_rate": 7.908099135754152e-05, "loss": 1.909, "step": 118 }, { "epoch": 0.36503067484662577, "grad_norm": 0.2448034947982603, "learning_rate": 7.868944732948101e-05, "loss": 1.9202, "step": 119 }, { "epoch": 0.36809815950920244, "grad_norm": 0.3642159637354568, "learning_rate": 7.829540636199591e-05, "loss": 1.9188, "step": 120 }, { "epoch": 0.37116564417177916, "grad_norm": 0.2751031027135651, "learning_rate": 7.789890998819643e-05, "loss": 1.8903, "step": 121 }, { "epoch": 0.37423312883435583, "grad_norm": 0.2519348027896112, "learning_rate": 7.75e-05, "loss": 1.9422, "step": 122 }, { "epoch": 0.3773006134969325, "grad_norm": 0.2724753380540709, "learning_rate": 7.709871844372639e-05, "loss": 1.9314, "step": 123 }, { "epoch": 0.3803680981595092, "grad_norm": 0.2831411354349516, "learning_rate": 7.669510761566571e-05, "loss": 1.8467, "step": 124 }, { "epoch": 0.3834355828220859, "grad_norm": 0.34065192298819646, "learning_rate": 7.628921005762047e-05, "loss": 1.9109, "step": 125 }, { "epoch": 0.38650306748466257, "grad_norm": 0.2744987049992245, "learning_rate": 7.588106855242135e-05, "loss": 1.8961, "step": 126 }, { "epoch": 0.3895705521472393, "grad_norm": 0.24972903865472293, "learning_rate": 7.547072611941795e-05, "loss": 1.9183, "step": 127 }, { "epoch": 0.39263803680981596, "grad_norm": 0.2717954573790397, "learning_rate": 7.505822600994424e-05, "loss": 1.9925, "step": 128 }, { "epoch": 0.39570552147239263, "grad_norm": 0.2710599653280406, "learning_rate": 7.46436117027598e-05, "loss": 1.9588, "step": 129 }, { "epoch": 0.3987730061349693, "grad_norm": 0.3038954677693998, "learning_rate": 7.422692689946714e-05, "loss": 1.9182, "step": 130 }, { "epoch": 0.401840490797546, "grad_norm": 0.2587552748890865, "learning_rate": 7.380821551990525e-05, "loss": 1.9383, "step": 131 }, { "epoch": 0.4049079754601227, "grad_norm": 0.25905002770576757, "learning_rate": 7.338752169752042e-05, "loss": 1.9514, "step": 132 }, { "epoch": 0.4049079754601227, "eval_loss": 2.577134370803833, "eval_runtime": 55.6924, "eval_samples_per_second": 1.796, "eval_steps_per_second": 0.126, "step": 132 }, { "epoch": 0.40797546012269936, "grad_norm": 0.2703996506167688, "learning_rate": 7.29648897747144e-05, "loss": 1.9516, "step": 133 }, { "epoch": 0.4110429447852761, "grad_norm": 0.2499546230234631, "learning_rate": 7.254036429817058e-05, "loss": 2.0144, "step": 134 }, { "epoch": 0.41411042944785276, "grad_norm": 0.2755759481735348, "learning_rate": 7.211399001415866e-05, "loss": 1.8909, "step": 135 }, { "epoch": 0.4171779141104294, "grad_norm": 0.25578131710544816, "learning_rate": 7.168581186381824e-05, "loss": 1.9747, "step": 136 }, { "epoch": 0.42024539877300615, "grad_norm": 0.27719697668216164, "learning_rate": 7.12558749784219e-05, "loss": 1.9548, "step": 137 }, { "epoch": 0.4233128834355828, "grad_norm": 0.3398789070245734, "learning_rate": 7.082422467461816e-05, "loss": 1.9209, "step": 138 }, { "epoch": 0.4263803680981595, "grad_norm": 0.3891484871642631, "learning_rate": 7.03909064496551e-05, "loss": 1.8979, "step": 139 }, { "epoch": 0.4294478527607362, "grad_norm": 0.28744028744457395, "learning_rate": 6.995596597658468e-05, "loss": 1.8568, "step": 140 }, { "epoch": 0.4325153374233129, "grad_norm": 0.465137214109235, "learning_rate": 6.951944909944877e-05, "loss": 1.9201, "step": 141 }, { "epoch": 0.43558282208588955, "grad_norm": 0.26138177619827196, "learning_rate": 6.908140182844695e-05, "loss": 1.9864, "step": 142 }, { "epoch": 0.4386503067484663, "grad_norm": 0.2580799320688176, "learning_rate": 6.864187033508695e-05, "loss": 1.9603, "step": 143 }, { "epoch": 0.44171779141104295, "grad_norm": 0.2342374798488655, "learning_rate": 6.820090094731808e-05, "loss": 1.8695, "step": 144 }, { "epoch": 0.4447852760736196, "grad_norm": 0.31939812381318156, "learning_rate": 6.775854014464799e-05, "loss": 1.89, "step": 145 }, { "epoch": 0.44785276073619634, "grad_norm": 0.3745349673551468, "learning_rate": 6.731483455324374e-05, "loss": 1.9072, "step": 146 }, { "epoch": 0.450920245398773, "grad_norm": 0.2398137142916484, "learning_rate": 6.686983094101712e-05, "loss": 1.9224, "step": 147 }, { "epoch": 0.4539877300613497, "grad_norm": 0.7029063348936169, "learning_rate": 6.642357621269535e-05, "loss": 1.9042, "step": 148 }, { "epoch": 0.4570552147239264, "grad_norm": 0.9822378439608801, "learning_rate": 6.597611740487698e-05, "loss": 1.9367, "step": 149 }, { "epoch": 0.4601226993865031, "grad_norm": 0.30640641324748263, "learning_rate": 6.55275016810742e-05, "loss": 1.8906, "step": 150 }, { "epoch": 0.46319018404907975, "grad_norm": 0.28453603828616697, "learning_rate": 6.507777632674165e-05, "loss": 1.9607, "step": 151 }, { "epoch": 0.4662576687116564, "grad_norm": 0.6855412180718642, "learning_rate": 6.462698874429239e-05, "loss": 1.8572, "step": 152 }, { "epoch": 0.46932515337423314, "grad_norm": 0.2849104974414773, "learning_rate": 6.417518644810155e-05, "loss": 1.9385, "step": 153 }, { "epoch": 0.4723926380368098, "grad_norm": 0.31769414398981494, "learning_rate": 6.372241705949815e-05, "loss": 1.8972, "step": 154 }, { "epoch": 0.4754601226993865, "grad_norm": 0.6853208214886923, "learning_rate": 6.326872830174567e-05, "loss": 1.873, "step": 155 }, { "epoch": 0.4785276073619632, "grad_norm": 0.3810470202905365, "learning_rate": 6.281416799501188e-05, "loss": 2.0, "step": 156 }, { "epoch": 0.4815950920245399, "grad_norm": 0.3784628917790679, "learning_rate": 6.235878405132842e-05, "loss": 1.8814, "step": 157 }, { "epoch": 0.48466257668711654, "grad_norm": 0.3427014353184805, "learning_rate": 6.190262446954085e-05, "loss": 1.9223, "step": 158 }, { "epoch": 0.48773006134969327, "grad_norm": 0.46855229041092994, "learning_rate": 6.144573733024922e-05, "loss": 1.9059, "step": 159 }, { "epoch": 0.49079754601226994, "grad_norm": 0.29232827174073656, "learning_rate": 6.0988170790740416e-05, "loss": 1.8491, "step": 160 }, { "epoch": 0.4938650306748466, "grad_norm": 0.30132959369450213, "learning_rate": 6.052997307991214e-05, "loss": 1.9595, "step": 161 }, { "epoch": 0.49693251533742333, "grad_norm": 0.3195413242096082, "learning_rate": 6.007119249318945e-05, "loss": 1.9063, "step": 162 }, { "epoch": 0.5, "grad_norm": 0.34517635749728204, "learning_rate": 5.961187738743432e-05, "loss": 1.9111, "step": 163 }, { "epoch": 0.5030674846625767, "grad_norm": 0.2593428730143879, "learning_rate": 5.9152076175848594e-05, "loss": 1.9011, "step": 164 }, { "epoch": 0.5061349693251533, "grad_norm": 0.31658622781595325, "learning_rate": 5.86918373228712e-05, "loss": 1.9918, "step": 165 }, { "epoch": 0.50920245398773, "grad_norm": 0.6628038110211543, "learning_rate": 5.8231209339069746e-05, "loss": 1.9152, "step": 166 }, { "epoch": 0.5122699386503068, "grad_norm": 0.2797312671008732, "learning_rate": 5.777024077602744e-05, "loss": 1.868, "step": 167 }, { "epoch": 0.5153374233128835, "grad_norm": 0.26640093514522606, "learning_rate": 5.730898022122554e-05, "loss": 1.8938, "step": 168 }, { "epoch": 0.5184049079754601, "grad_norm": 0.4054825634426873, "learning_rate": 5.6847476292922155e-05, "loss": 1.9428, "step": 169 }, { "epoch": 0.5214723926380368, "grad_norm": 0.29142731230985613, "learning_rate": 5.6385777635027684e-05, "loss": 1.8903, "step": 170 }, { "epoch": 0.5245398773006135, "grad_norm": 0.3511142336480421, "learning_rate": 5.5923932911977575e-05, "loss": 1.9386, "step": 171 }, { "epoch": 0.5276073619631901, "grad_norm": 0.5560176165666619, "learning_rate": 5.5461990803603045e-05, "loss": 1.9562, "step": 172 }, { "epoch": 0.5306748466257669, "grad_norm": 0.3171565471545065, "learning_rate": 5.500000000000001e-05, "loss": 1.9565, "step": 173 }, { "epoch": 0.5337423312883436, "grad_norm": 0.29095744910567595, "learning_rate": 5.4538009196396966e-05, "loss": 1.9282, "step": 174 }, { "epoch": 0.5368098159509203, "grad_norm": 0.41192796716349284, "learning_rate": 5.407606708802244e-05, "loss": 1.918, "step": 175 }, { "epoch": 0.5398773006134969, "grad_norm": 0.5305521764688194, "learning_rate": 5.361422236497235e-05, "loss": 1.9096, "step": 176 }, { "epoch": 0.5429447852760736, "grad_norm": 0.6434585908707302, "learning_rate": 5.315252370707786e-05, "loss": 1.8935, "step": 177 }, { "epoch": 0.5460122699386503, "grad_norm": 0.2939723397914849, "learning_rate": 5.2691019778774465e-05, "loss": 1.9531, "step": 178 }, { "epoch": 0.549079754601227, "grad_norm": 0.4989500512121766, "learning_rate": 5.2229759223972574e-05, "loss": 1.9341, "step": 179 }, { "epoch": 0.5521472392638037, "grad_norm": 0.6024485433735285, "learning_rate": 5.1768790660930265e-05, "loss": 1.9001, "step": 180 }, { "epoch": 0.5552147239263804, "grad_norm": 0.47950946229716923, "learning_rate": 5.130816267712881e-05, "loss": 1.9209, "step": 181 }, { "epoch": 0.558282208588957, "grad_norm": 1.2341600337232164, "learning_rate": 5.0847923824151424e-05, "loss": 1.977, "step": 182 }, { "epoch": 0.5613496932515337, "grad_norm": 0.3100804420788902, "learning_rate": 5.038812261256569e-05, "loss": 1.9594, "step": 183 }, { "epoch": 0.5644171779141104, "grad_norm": 0.5390046601483737, "learning_rate": 4.992880750681056e-05, "loss": 1.8533, "step": 184 }, { "epoch": 0.5674846625766872, "grad_norm": 0.43167483611230206, "learning_rate": 4.9470026920087876e-05, "loss": 1.8782, "step": 185 }, { "epoch": 0.5705521472392638, "grad_norm": 0.3684508227191539, "learning_rate": 4.901182920925961e-05, "loss": 1.8684, "step": 186 }, { "epoch": 0.5736196319018405, "grad_norm": 0.2936392864589777, "learning_rate": 4.8554262669750794e-05, "loss": 1.8586, "step": 187 }, { "epoch": 0.5766871165644172, "grad_norm": 0.3204686860443095, "learning_rate": 4.809737553045916e-05, "loss": 1.8977, "step": 188 }, { "epoch": 0.5797546012269938, "grad_norm": 0.3024045894502796, "learning_rate": 4.764121594867157e-05, "loss": 1.8882, "step": 189 }, { "epoch": 0.5828220858895705, "grad_norm": 0.40522790311176354, "learning_rate": 4.718583200498814e-05, "loss": 1.924, "step": 190 }, { "epoch": 0.5858895705521472, "grad_norm": 0.5053931616075322, "learning_rate": 4.673127169825433e-05, "loss": 1.8868, "step": 191 }, { "epoch": 0.588957055214724, "grad_norm": 0.3211686422583536, "learning_rate": 4.627758294050185e-05, "loss": 1.9068, "step": 192 }, { "epoch": 0.5920245398773006, "grad_norm": 0.24127093990601076, "learning_rate": 4.582481355189846e-05, "loss": 1.895, "step": 193 }, { "epoch": 0.5950920245398773, "grad_norm": 0.4074710701692581, "learning_rate": 4.537301125570763e-05, "loss": 1.8969, "step": 194 }, { "epoch": 0.598159509202454, "grad_norm": 0.25841948774460555, "learning_rate": 4.492222367325837e-05, "loss": 1.94, "step": 195 }, { "epoch": 0.6012269938650306, "grad_norm": 0.2943706481314386, "learning_rate": 4.447249831892583e-05, "loss": 1.9482, "step": 196 }, { "epoch": 0.6042944785276073, "grad_norm": 0.3110992148589072, "learning_rate": 4.402388259512303e-05, "loss": 1.9495, "step": 197 }, { "epoch": 0.6073619631901841, "grad_norm": 0.3723312760498351, "learning_rate": 4.357642378730466e-05, "loss": 1.9213, "step": 198 }, { "epoch": 0.6073619631901841, "eval_loss": 2.594010353088379, "eval_runtime": 55.7716, "eval_samples_per_second": 1.793, "eval_steps_per_second": 0.126, "step": 198 } ], "logging_steps": 1, "max_steps": 326, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 66, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 216215096131584.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }