{ "best_global_step": 400, "best_metric": 0.060702841728925705, "best_model_checkpoint": null, "epoch": 0.9979259259259259, "eval_steps": 50, "global_step": 421, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "step": 0, "train/combined_loss": 4.310243457555771, "train/cross_entropy_loss": 7.370486944913864, "train/kl_divergence_loss": 1.25, "train/step_duration_seconds": 64.96706748008728, "train/steps_per_hour": 0.0, "train/total_elapsed_hours": 0.018046407500902813 }, { "epoch": 0.0023703703703703703, "grad_norm": 26752.0, "learning_rate": 0.0, "loss": 68.9639, "step": 1 }, { "epoch": 0.0023703703703703703, "step": 1, "train/combined_loss": 4.25145959854126, "train/cross_entropy_loss": 7.2763566970825195, "train/kl_divergence_loss": 1.2265625, "train/step_duration_seconds": 7.274043560028076, "train/steps_per_hour": 49.83312094637097, "train/total_elapsed_hours": 0.020066975156466167 }, { "epoch": 0.004740740740740741, "grad_norm": 27392.0, "learning_rate": 4.651162790697675e-07, "loss": 68.0234, "step": 2 }, { "epoch": 0.004740740740740741, "step": 2, "train/combined_loss": 4.210591048002243, "train/cross_entropy_loss": 7.187784105539322, "train/kl_divergence_loss": 1.2333984375, "train/step_duration_seconds": 7.055061340332031, "train/steps_per_hour": 90.79883463670956, "train/total_elapsed_hours": 0.02202671441766951 }, { "epoch": 0.0071111111111111115, "grad_norm": 27264.0, "learning_rate": 9.30232558139535e-07, "loss": 67.3695, "step": 3 }, { "epoch": 0.0071111111111111115, "step": 3, "train/combined_loss": 4.220440715551376, "train/cross_entropy_loss": 7.188928246498108, "train/kl_divergence_loss": 1.251953125, "train/step_duration_seconds": 7.075124740600586, "train/steps_per_hour": 125.04154064617586, "train/total_elapsed_hours": 0.023992026845614117 }, { "epoch": 0.009481481481481481, "grad_norm": 26240.0, "learning_rate": 1.3953488372093025e-06, "loss": 67.5271, "step": 4 }, { "epoch": 0.009481481481481481, "step": 4, "train/combined_loss": 4.050247877836227, "train/cross_entropy_loss": 6.869538724422455, "train/kl_divergence_loss": 1.23095703125, "train/step_duration_seconds": 7.09927773475647, "train/steps_per_hour": 154.0591795404681, "train/total_elapsed_hours": 0.025964048438602023 }, { "epoch": 0.011851851851851851, "grad_norm": 27264.0, "learning_rate": 1.86046511627907e-06, "loss": 64.804, "step": 5 }, { "epoch": 0.011851851851851851, "step": 5, "train/combined_loss": 3.8814118206501007, "train/cross_entropy_loss": 6.518194735050201, "train/kl_divergence_loss": 1.24462890625, "train/step_duration_seconds": 7.140614032745361, "train/steps_per_hour": 178.9065439356696, "train/total_elapsed_hours": 0.027947552336586846 }, { "epoch": 0.014222222222222223, "grad_norm": 28160.0, "learning_rate": 2.3255813953488376e-06, "loss": 62.1026, "step": 6 }, { "epoch": 0.014222222222222223, "step": 6, "train/combined_loss": 3.3225543051958084, "train/cross_entropy_loss": 5.414151579141617, "train/kl_divergence_loss": 1.23095703125, "train/step_duration_seconds": 7.164709091186523, "train/steps_per_hour": 200.41586755906124, "train/total_elapsed_hours": 0.02993774930636088 }, { "epoch": 0.016592592592592593, "grad_norm": 22912.0, "learning_rate": 2.790697674418605e-06, "loss": 53.1609, "step": 7 }, { "epoch": 0.016592592592592593, "step": 7, "train/combined_loss": 2.8946415334939957, "train/cross_entropy_loss": 4.530005723237991, "train/kl_divergence_loss": 1.25927734375, "train/step_duration_seconds": 7.1137425899505615, "train/steps_per_hour": 219.34092560159692, "train/total_elapsed_hours": 0.03191378891468048 }, { "epoch": 0.018962962962962963, "grad_norm": 6400.0, "learning_rate": 3.2558139534883724e-06, "loss": 46.3143, "step": 8 }, { "epoch": 0.018962962962962963, "step": 8, "train/combined_loss": 2.7050345838069916, "train/cross_entropy_loss": 4.093662917613983, "train/kl_divergence_loss": 1.31640625, "train/step_duration_seconds": 7.135659217834473, "train/steps_per_hour": 236.01663067159015, "train/total_elapsed_hours": 0.03389591647519006 }, { "epoch": 0.021333333333333333, "grad_norm": 9088.0, "learning_rate": 3.72093023255814e-06, "loss": 43.2806, "step": 9 }, { "epoch": 0.021333333333333333, "step": 9, "train/combined_loss": 2.491789221763611, "train/cross_entropy_loss": 3.7003750950098038, "train/kl_divergence_loss": 1.283203125, "train/step_duration_seconds": 7.159760475158691, "train/steps_per_hour": 250.802995746654, "train/total_elapsed_hours": 0.035884738829400804 }, { "epoch": 0.023703703703703703, "grad_norm": 14784.0, "learning_rate": 4.186046511627907e-06, "loss": 39.8686, "step": 10 }, { "epoch": 0.023703703703703703, "step": 10, "train/combined_loss": 2.106148824095726, "train/cross_entropy_loss": 2.877825006842613, "train/kl_divergence_loss": 1.33447265625, "train/step_duration_seconds": 7.181847333908081, "train/steps_per_hour": 263.99366796803935, "train/total_elapsed_hours": 0.037879696422153046 }, { "epoch": 0.026074074074074072, "grad_norm": 10880.0, "learning_rate": 4.651162790697675e-06, "loss": 33.6984, "step": 11 }, { "epoch": 0.026074074074074072, "step": 11, "train/combined_loss": 1.4576978087425232, "train/cross_entropy_loss": 1.571645624935627, "train/kl_divergence_loss": 1.34375, "train/step_duration_seconds": 7.17723274230957, "train/steps_per_hour": 275.8733309353751, "train/total_elapsed_hours": 0.03987337218390571 }, { "epoch": 0.028444444444444446, "grad_norm": 4704.0, "learning_rate": 5.116279069767442e-06, "loss": 23.3232, "step": 12 }, { "epoch": 0.028444444444444446, "step": 12, "train/combined_loss": 1.0812036469578743, "train/cross_entropy_loss": 0.8298877663910389, "train/kl_divergence_loss": 1.33251953125, "train/step_duration_seconds": 7.202847242355347, "train/steps_per_hour": 286.5728916364825, "train/total_elapsed_hours": 0.04187416308455997 }, { "epoch": 0.030814814814814816, "grad_norm": 1288.0, "learning_rate": 5.58139534883721e-06, "loss": 17.2993, "step": 13 }, { "epoch": 0.030814814814814816, "step": 13, "train/combined_loss": 0.917561486363411, "train/cross_entropy_loss": 0.5206698887050152, "train/kl_divergence_loss": 1.314453125, "train/step_duration_seconds": 7.210479736328125, "train/steps_per_hour": 296.28229001155563, "train/total_elapsed_hours": 0.043877074122428895 }, { "epoch": 0.033185185185185186, "grad_norm": 1064.0, "learning_rate": 6.046511627906977e-06, "loss": 14.681, "step": 14 }, { "epoch": 0.033185185185185186, "step": 14, "train/combined_loss": 0.8026629276573658, "train/cross_entropy_loss": 0.42270869202911854, "train/kl_divergence_loss": 1.1826171875, "train/step_duration_seconds": 7.176325559616089, "train/steps_per_hour": 305.20706432882776, "train/total_elapsed_hours": 0.04587049788898892 }, { "epoch": 0.035555555555555556, "grad_norm": 680.0, "learning_rate": 6.511627906976745e-06, "loss": 12.8426, "step": 15 }, { "epoch": 0.035555555555555556, "step": 15, "train/combined_loss": 0.7154323048889637, "train/cross_entropy_loss": 0.39229039661586285, "train/kl_divergence_loss": 1.03857421875, "train/step_duration_seconds": 7.2133026123046875, "train/steps_per_hour": 313.3212079729679, "train/total_elapsed_hours": 0.047874193059073554 }, { "epoch": 0.037925925925925925, "grad_norm": 516.0, "learning_rate": 6.976744186046513e-06, "loss": 11.4469, "step": 16 }, { "epoch": 0.037925925925925925, "step": 16, "train/combined_loss": 0.6362243294715881, "train/cross_entropy_loss": 0.37010490894317627, "train/kl_divergence_loss": 0.90234375, "train/step_duration_seconds": 7.195964336395264, "train/steps_per_hour": 320.81440635372326, "train/total_elapsed_hours": 0.04987307204140557 }, { "epoch": 0.040296296296296295, "grad_norm": 506.0, "learning_rate": 7.44186046511628e-06, "loss": 10.1796, "step": 17 }, { "epoch": 0.040296296296296295, "step": 17, "train/combined_loss": 0.5937556512653828, "train/cross_entropy_loss": 0.3642691094428301, "train/kl_divergence_loss": 0.8232421875, "train/step_duration_seconds": 7.201917409896851, "train/steps_per_hour": 327.71965844625936, "train/total_elapsed_hours": 0.05187360465526581 }, { "epoch": 0.042666666666666665, "grad_norm": 324.0, "learning_rate": 7.906976744186048e-06, "loss": 9.5001, "step": 18 }, { "epoch": 0.042666666666666665, "step": 18, "train/combined_loss": 0.5263483263552189, "train/cross_entropy_loss": 0.314903661608696, "train/kl_divergence_loss": 0.73779296875, "train/step_duration_seconds": 7.208310604095459, "train/steps_per_hour": 334.1010656793036, "train/total_elapsed_hours": 0.053875913156403436 }, { "epoch": 0.045037037037037035, "grad_norm": 219.0, "learning_rate": 8.372093023255815e-06, "loss": 8.4216, "step": 19 }, { "epoch": 0.045037037037037035, "step": 19, "train/combined_loss": 0.43465932086110115, "train/cross_entropy_loss": 0.2343088500201702, "train/kl_divergence_loss": 0.635009765625, "train/step_duration_seconds": 7.1786744594573975, "train/steps_per_hour": 340.075239062999, "train/total_elapsed_hours": 0.0558699893951416 }, { "epoch": 0.047407407407407405, "grad_norm": 146.0, "learning_rate": 8.837209302325582e-06, "loss": 6.9546, "step": 20 }, { "epoch": 0.047407407407407405, "step": 20, "train/combined_loss": 0.3771132677793503, "train/cross_entropy_loss": 0.18440232705324888, "train/kl_divergence_loss": 0.56982421875, "train/step_duration_seconds": 7.214315176010132, "train/steps_per_hour": 345.57852934665823, "train/total_elapsed_hours": 0.057873965832922196 }, { "epoch": 0.049777777777777775, "grad_norm": 109.5, "learning_rate": 9.30232558139535e-06, "loss": 6.0338, "step": 21 }, { "epoch": 0.049777777777777775, "step": 21, "train/combined_loss": 0.3578077703714371, "train/cross_entropy_loss": 0.15311553701758385, "train/kl_divergence_loss": 0.5625, "train/step_duration_seconds": 7.201047420501709, "train/steps_per_hour": 350.73504254237196, "train/total_elapsed_hours": 0.05987425678306156 }, { "epoch": 0.052148148148148145, "grad_norm": 100.5, "learning_rate": 9.767441860465117e-06, "loss": 5.7249, "step": 22 }, { "epoch": 0.052148148148148145, "step": 22, "train/combined_loss": 0.31270523741841316, "train/cross_entropy_loss": 0.11625519627705216, "train/kl_divergence_loss": 0.5091552734375, "train/step_duration_seconds": 7.181376218795776, "train/steps_per_hour": 355.5895570404658, "train/total_elapsed_hours": 0.06186908351050483 }, { "epoch": 0.05451851851851852, "grad_norm": 75.0, "learning_rate": 1.0232558139534884e-05, "loss": 5.0033, "step": 23 }, { "epoch": 0.05451851851851852, "step": 23, "train/combined_loss": 0.28502367064356804, "train/cross_entropy_loss": 0.10581392887979746, "train/kl_divergence_loss": 0.4642333984375, "train/step_duration_seconds": 7.173454284667969, "train/steps_per_hour": 360.15321399626356, "train/total_elapsed_hours": 0.06386170970069037 }, { "epoch": 0.05688888888888889, "grad_norm": 73.5, "learning_rate": 1.0697674418604651e-05, "loss": 4.5604, "step": 24 }, { "epoch": 0.05688888888888889, "step": 24, "train/combined_loss": 0.2662050947546959, "train/cross_entropy_loss": 0.09075977467000484, "train/kl_divergence_loss": 0.441650390625, "train/step_duration_seconds": 7.196916818618774, "train/steps_per_hour": 364.404632061752, "train/total_elapsed_hours": 0.06586085326141781 }, { "epoch": 0.05925925925925926, "grad_norm": 53.5, "learning_rate": 1.116279069767442e-05, "loss": 4.2593, "step": 25 }, { "epoch": 0.05925925925925926, "step": 25, "train/combined_loss": 0.2555910013616085, "train/cross_entropy_loss": 0.07075231382623315, "train/kl_divergence_loss": 0.4404296875, "train/step_duration_seconds": 7.2051169872283936, "train/steps_per_hour": 368.39319239049337, "train/total_elapsed_hours": 0.06786227464675904 }, { "epoch": 0.06162962962962963, "grad_norm": 45.0, "learning_rate": 1.1627906976744187e-05, "loss": 4.0895, "step": 26 }, { "epoch": 0.06162962962962963, "step": 26, "train/combined_loss": 0.25060202460736036, "train/cross_entropy_loss": 0.06773236347362399, "train/kl_divergence_loss": 0.4334716796875, "train/step_duration_seconds": 7.195337772369385, "train/steps_per_hour": 372.1676986924219, "train/total_elapsed_hours": 0.06986097958352831 }, { "epoch": 0.064, "grad_norm": 34.75, "learning_rate": 1.2093023255813954e-05, "loss": 4.0096, "step": 27 }, { "epoch": 0.064, "step": 27, "train/combined_loss": 0.22914704959839582, "train/cross_entropy_loss": 0.06779117416590452, "train/kl_divergence_loss": 0.3905029296875, "train/step_duration_seconds": 7.1646952629089355, "train/steps_per_hour": 375.7767476973663, "train/total_elapsed_hours": 0.07185117271211412 }, { "epoch": 0.06637037037037037, "grad_norm": 48.5, "learning_rate": 1.2558139534883723e-05, "loss": 3.6664, "step": 28 }, { "epoch": 0.06637037037037037, "step": 28, "train/combined_loss": 0.22941692359745502, "train/cross_entropy_loss": 0.0525838378816843, "train/kl_divergence_loss": 0.40625, "train/step_duration_seconds": 7.199108123779297, "train/steps_per_hour": 379.1421706885833, "train/total_elapsed_hours": 0.07385092496871948 }, { "epoch": 0.06874074074074074, "grad_norm": 72.5, "learning_rate": 1.302325581395349e-05, "loss": 3.6707, "step": 29 }, { "epoch": 0.06874074074074074, "step": 29, "train/combined_loss": 0.22196358162909746, "train/cross_entropy_loss": 0.048419348895549774, "train/kl_divergence_loss": 0.3955078125, "train/step_duration_seconds": 7.179231882095337, "train/steps_per_hour": 382.35797131195636, "train/total_elapsed_hours": 0.0758451560470793 }, { "epoch": 0.07111111111111111, "grad_norm": 63.75, "learning_rate": 1.3488372093023257e-05, "loss": 3.5514, "step": 30 }, { "epoch": 0.07111111111111111, "step": 30, "train/combined_loss": 0.2120195608586073, "train/cross_entropy_loss": 0.056119201704859734, "train/kl_divergence_loss": 0.367919921875, "train/step_duration_seconds": 7.1586713790893555, "train/steps_per_hour": 385.43727586928117, "train/total_elapsed_hours": 0.07783367587460412 }, { "epoch": 0.07348148148148148, "grad_norm": 34.5, "learning_rate": 1.3953488372093025e-05, "loss": 3.3923, "step": 31 }, { "epoch": 0.07348148148148148, "step": 31, "train/combined_loss": 0.20117830298841, "train/cross_entropy_loss": 0.0440802276134491, "train/kl_divergence_loss": 0.3582763671875, "train/step_duration_seconds": 7.1888298988342285, "train/steps_per_hour": 388.3224034144493, "train/total_elapsed_hours": 0.07983057306872474 }, { "epoch": 0.07585185185185185, "grad_norm": 37.75, "learning_rate": 1.441860465116279e-05, "loss": 3.2189, "step": 32 }, { "epoch": 0.07585185185185185, "step": 32, "train/combined_loss": 0.20670769922435284, "train/cross_entropy_loss": 0.03853747947141528, "train/kl_divergence_loss": 0.3748779296875, "train/step_duration_seconds": 7.163522005081177, "train/steps_per_hour": 391.1003153449009, "train/total_elapsed_hours": 0.0818204402923584 }, { "epoch": 0.07822222222222222, "grad_norm": 114.5, "learning_rate": 1.488372093023256e-05, "loss": 3.3073, "step": 33 }, { "epoch": 0.07822222222222222, "step": 33, "train/combined_loss": 0.19256712403148413, "train/cross_entropy_loss": 0.040407692082226276, "train/kl_divergence_loss": 0.3447265625, "train/step_duration_seconds": 7.171715974807739, "train/steps_per_hour": 393.73562507193196, "train/total_elapsed_hours": 0.08381258361869388 }, { "epoch": 0.08059259259259259, "grad_norm": 48.25, "learning_rate": 1.5348837209302328e-05, "loss": 3.0811, "step": 34 }, { "epoch": 0.08059259259259259, "step": 34, "train/combined_loss": 0.2014658311381936, "train/cross_entropy_loss": 0.0435566701926291, "train/kl_divergence_loss": 0.359375, "train/step_duration_seconds": 7.14280104637146, "train/steps_per_hour": 396.28566102564344, "train/total_elapsed_hours": 0.08579669502046373 }, { "epoch": 0.08296296296296296, "grad_norm": 105.5, "learning_rate": 1.5813953488372095e-05, "loss": 3.2235, "step": 35 }, { "epoch": 0.08296296296296296, "step": 35, "train/combined_loss": 0.1921772612258792, "train/cross_entropy_loss": 0.03669826895929873, "train/kl_divergence_loss": 0.34765625, "train/step_duration_seconds": 7.161575078964233, "train/steps_per_hour": 398.6967335955144, "train/total_elapsed_hours": 0.08778602143128712 }, { "epoch": 0.08533333333333333, "grad_norm": 76.5, "learning_rate": 1.6279069767441862e-05, "loss": 3.0748, "step": 36 }, { "epoch": 0.08533333333333333, "step": 36, "train/combined_loss": 0.18726484011858702, "train/cross_entropy_loss": 0.029070683754980564, "train/kl_divergence_loss": 0.345458984375, "train/step_duration_seconds": 7.136035680770874, "train/steps_per_hour": 401.0326431715552, "train/total_elapsed_hours": 0.08976825356483459 }, { "epoch": 0.0877037037037037, "grad_norm": 33.25, "learning_rate": 1.674418604651163e-05, "loss": 2.9962, "step": 37 }, { "epoch": 0.0877037037037037, "step": 37, "train/combined_loss": 0.1922608781605959, "train/cross_entropy_loss": 0.03149440907873213, "train/kl_divergence_loss": 0.35302734375, "train/step_duration_seconds": 7.1254284381866455, "train/steps_per_hour": 403.28057085391987, "train/total_elapsed_hours": 0.09174753924210867 }, { "epoch": 0.09007407407407407, "grad_norm": 50.0, "learning_rate": 1.7209302325581396e-05, "loss": 3.0762, "step": 38 }, { "epoch": 0.09007407407407407, "step": 38, "train/combined_loss": 0.17505795694887638, "train/cross_entropy_loss": 0.02919307304546237, "train/kl_divergence_loss": 0.3209228515625, "train/step_duration_seconds": 7.103011608123779, "train/steps_per_hour": 405.4604942984025, "train/total_elapsed_hours": 0.09372059802214304 }, { "epoch": 0.09244444444444444, "grad_norm": 30.75, "learning_rate": 1.7674418604651163e-05, "loss": 2.8009, "step": 39 }, { "epoch": 0.09244444444444444, "step": 39, "train/combined_loss": 0.17248188611119986, "train/cross_entropy_loss": 0.017937407130375504, "train/kl_divergence_loss": 0.3270263671875, "train/step_duration_seconds": 7.110121965408325, "train/steps_per_hour": 407.5421126867549, "train/total_elapsed_hours": 0.09569563190142313 }, { "epoch": 0.09481481481481481, "grad_norm": 38.5, "learning_rate": 1.813953488372093e-05, "loss": 2.7597, "step": 40 }, { "epoch": 0.09481481481481481, "step": 40, "train/combined_loss": 0.17410407960414886, "train/cross_entropy_loss": 0.016787268687039614, "train/kl_divergence_loss": 0.3314208984375, "train/step_duration_seconds": 7.180423736572266, "train/steps_per_hour": 409.4576778026899, "train/total_elapsed_hours": 0.09769019405047098 }, { "epoch": 0.09718518518518518, "grad_norm": 31.25, "learning_rate": 1.86046511627907e-05, "loss": 2.7857, "step": 41 }, { "epoch": 0.09718518518518518, "step": 41, "train/combined_loss": 0.16598839219659567, "train/cross_entropy_loss": 0.023749235086143017, "train/kl_divergence_loss": 0.3082275390625, "train/step_duration_seconds": 7.1705567836761475, "train/steps_per_hour": 411.30789585265995, "train/total_elapsed_hours": 0.09968201537926992 }, { "epoch": 0.09955555555555555, "grad_norm": 29.5, "learning_rate": 1.9069767441860468e-05, "loss": 2.6558, "step": 42 }, { "epoch": 0.09955555555555555, "step": 42, "train/combined_loss": 0.16566006373614073, "train/cross_entropy_loss": 0.030416806926950812, "train/kl_divergence_loss": 0.3009033203125, "train/step_duration_seconds": 7.185399770736694, "train/steps_per_hour": 413.06887057061425, "train/total_elapsed_hours": 0.1016779597600301 }, { "epoch": 0.10192592592592592, "grad_norm": 35.5, "learning_rate": 1.9534883720930235e-05, "loss": 2.6506, "step": 43 }, { "epoch": 0.10192592592592592, "step": 43, "train/combined_loss": 0.1595935821533203, "train/cross_entropy_loss": 0.024265281856060028, "train/kl_divergence_loss": 0.294921875, "train/step_duration_seconds": 7.195603370666504, "train/steps_per_hour": 414.75070139036376, "train/total_elapsed_hours": 0.10367673847410414 }, { "epoch": 0.10429629629629629, "grad_norm": 17.5, "learning_rate": 2e-05, "loss": 2.5535, "step": 44 }, { "epoch": 0.10429629629629629, "step": 44, "train/combined_loss": 0.1587589643895626, "train/cross_entropy_loss": 0.013806993083562702, "train/kl_divergence_loss": 0.3037109375, "train/step_duration_seconds": 7.19236421585083, "train/steps_per_hour": 416.37245606383044, "train/total_elapsed_hours": 0.10567461742295159 }, { "epoch": 0.10666666666666667, "grad_norm": 32.25, "learning_rate": 1.999965463076377e-05, "loss": 2.5401, "step": 45 }, { "epoch": 0.10666666666666667, "step": 45, "train/combined_loss": 0.15903105773031712, "train/cross_entropy_loss": 0.013862900086678565, "train/kl_divergence_loss": 0.30419921875, "train/step_duration_seconds": 7.199136018753052, "train/steps_per_hour": 417.9267256968682, "train/total_elapsed_hours": 0.10767437742816077 }, { "epoch": 0.10903703703703704, "grad_norm": 30.25, "learning_rate": 1.999861854691106e-05, "loss": 2.5445, "step": 46 }, { "epoch": 0.10903703703703704, "step": 46, "train/combined_loss": 0.16162511333823204, "train/cross_entropy_loss": 0.021858620457351208, "train/kl_divergence_loss": 0.3013916015625, "train/step_duration_seconds": 7.177664041519165, "train/steps_per_hour": 419.44712621402573, "train/total_elapsed_hours": 0.10966817299524943 }, { "epoch": 0.11140740740740741, "grad_norm": 10.4375, "learning_rate": 1.9996891820008165e-05, "loss": 2.586, "step": 47 }, { "epoch": 0.11140740740740741, "step": 47, "train/combined_loss": 0.15759198181331158, "train/cross_entropy_loss": 0.032835332211107016, "train/kl_divergence_loss": 0.2823486328125, "train/step_duration_seconds": 7.197060823440552, "train/steps_per_hour": 420.89292205888296, "train/total_elapsed_hours": 0.11166735655731624 }, { "epoch": 0.11377777777777778, "grad_norm": 34.5, "learning_rate": 1.999447456932676e-05, "loss": 2.5215, "step": 48 }, { "epoch": 0.11377777777777778, "step": 48, "train/combined_loss": 0.15159989707171917, "train/cross_entropy_loss": 0.02695468720048666, "train/kl_divergence_loss": 0.2762451171875, "train/step_duration_seconds": 7.164118528366089, "train/steps_per_hour": 422.32185886743343, "train/total_elapsed_hours": 0.11365738948186238 }, { "epoch": 0.11614814814814815, "grad_norm": 31.5, "learning_rate": 1.9991366961835643e-05, "loss": 2.4256, "step": 49 }, { "epoch": 0.11614814814814815, "step": 49, "train/combined_loss": 0.15017856005579233, "train/cross_entropy_loss": 0.019473322783596814, "train/kl_divergence_loss": 0.2808837890625, "train/step_duration_seconds": 7.150151968002319, "train/steps_per_hour": 423.7158323839195, "train/total_elapsed_hours": 0.11564354280630748 }, { "epoch": 0.11851851851851852, "grad_norm": 23.375, "learning_rate": 1.9987569212189224e-05, "loss": 2.4029, "step": 50 }, { "epoch": 0.11851851851851852, "eval_combined_loss": 0.14935019636029997, "eval_cross_entropy_loss": 0.01737226772059997, "eval_kl_divergence_loss": 0.281328125, "eval_loss": 0.14935019612312317, "eval_runtime": 220.3536, "eval_samples_per_second": 6.807, "eval_steps_per_second": 3.404, "step": 50 }, { "epoch": 0.11851851851851852, "step": 50, "train/combined_loss": 0.14872634038329124, "train/cross_entropy_loss": 0.017301325569860637, "train/kl_divergence_loss": 0.2801513671875, "train/step_duration_seconds": 227.54744958877563, "train/steps_per_hour": 279.5620551165938, "train/total_elapsed_hours": 0.17885116769207848 }, { "epoch": 0.12088888888888889, "grad_norm": 20.25, "learning_rate": 1.9983081582712684e-05, "loss": 2.3796, "step": 51 }, { "epoch": 0.12088888888888889, "step": 51, "train/combined_loss": 0.15616787131875753, "train/cross_entropy_loss": 0.019611137569881976, "train/kl_divergence_loss": 0.292724609375, "train/step_duration_seconds": 7.1779563426971436, "train/steps_per_hour": 282.00938628976195, "train/total_elapsed_hours": 0.1808450444539388 }, { "epoch": 0.12325925925925926, "grad_norm": 27.375, "learning_rate": 1.997790438338385e-05, "loss": 2.4987, "step": 52 }, { "epoch": 0.12325925925925926, "step": 52, "train/combined_loss": 0.1437931666150689, "train/cross_entropy_loss": 0.019641992752440274, "train/kl_divergence_loss": 0.2679443359375, "train/step_duration_seconds": 7.16746187210083, "train/steps_per_hour": 284.4078751961097, "train/total_elapsed_hours": 0.1828360060850779 }, { "epoch": 0.12562962962962962, "grad_norm": 15.4375, "learning_rate": 1.9972037971811802e-05, "loss": 2.3007, "step": 53 }, { "epoch": 0.12562962962962962, "step": 53, "train/combined_loss": 0.14357721991837025, "train/cross_entropy_loss": 0.02165149967186153, "train/kl_divergence_loss": 0.2655029296875, "train/step_duration_seconds": 7.177205562591553, "train/steps_per_hour": 286.75049177904856, "train/total_elapsed_hours": 0.1848296742969089 }, { "epoch": 0.128, "grad_norm": 20.125, "learning_rate": 1.9965482753212154e-05, "loss": 2.2972, "step": 54 }, { "epoch": 0.128, "step": 54, "train/combined_loss": 0.14043164812028408, "train/cross_entropy_loss": 0.023294932674616575, "train/kl_divergence_loss": 0.257568359375, "train/step_duration_seconds": 7.157373428344727, "train/steps_per_hour": 289.05163369286066, "train/total_elapsed_hours": 0.1868178335825602 }, { "epoch": 0.13037037037037036, "grad_norm": 17.125, "learning_rate": 1.995823918037908e-05, "loss": 2.2469, "step": 55 }, { "epoch": 0.13037037037037036, "step": 55, "train/combined_loss": 0.1433863490819931, "train/cross_entropy_loss": 0.02163596951868385, "train/kl_divergence_loss": 0.26513671875, "train/step_duration_seconds": 7.159351348876953, "train/steps_per_hour": 291.30346508519057, "train/total_elapsed_hours": 0.1888065422905816 }, { "epoch": 0.13274074074074074, "grad_norm": 12.5, "learning_rate": 1.9950307753654016e-05, "loss": 2.2942, "step": 56 }, { "epoch": 0.13274074074074074, "step": 56, "train/combined_loss": 0.1372276395559311, "train/cross_entropy_loss": 0.019206264754757285, "train/kl_divergence_loss": 0.2552490234375, "train/step_duration_seconds": 7.149013042449951, "train/steps_per_hour": 293.5127714276043, "train/total_elapsed_hours": 0.1907923792468177 }, { "epoch": 0.1351111111111111, "grad_norm": 15.0625, "learning_rate": 1.994168902089112e-05, "loss": 2.1956, "step": 57 }, { "epoch": 0.1351111111111111, "step": 57, "train/combined_loss": 0.13671019952744246, "train/cross_entropy_loss": 0.020124488859437406, "train/kl_divergence_loss": 0.2532958984375, "train/step_duration_seconds": 7.1693689823150635, "train/steps_per_hour": 295.6678886749936, "train/total_elapsed_hours": 0.1927838706307941 }, { "epoch": 0.13748148148148148, "grad_norm": 21.25, "learning_rate": 1.9932383577419432e-05, "loss": 2.1874, "step": 58 }, { "epoch": 0.13748148148148148, "step": 58, "train/combined_loss": 0.13713468238711357, "train/cross_entropy_loss": 0.020057943649590015, "train/kl_divergence_loss": 0.25421142578125, "train/step_duration_seconds": 7.144319295883179, "train/steps_per_hour": 297.7895740672935, "train/total_elapsed_hours": 0.19476840376853943 }, { "epoch": 0.13985185185185184, "grad_norm": 19.0, "learning_rate": 1.9922392066001724e-05, "loss": 2.1942, "step": 59 }, { "epoch": 0.13985185185185184, "step": 59, "train/combined_loss": 0.1343515245243907, "train/cross_entropy_loss": 0.019130286993458867, "train/kl_divergence_loss": 0.24957275390625, "train/step_duration_seconds": 7.157663822174072, "train/steps_per_hour": 299.86280963512706, "train/total_elapsed_hours": 0.19675664371914334 }, { "epoch": 0.14222222222222222, "grad_norm": 16.625, "learning_rate": 1.991171517679013e-05, "loss": 2.1496, "step": 60 }, { "epoch": 0.14222222222222222, "step": 60, "train/combined_loss": 0.13391043990850449, "train/cross_entropy_loss": 0.020872646826319396, "train/kl_divergence_loss": 0.2469482421875, "train/step_duration_seconds": 7.151079893112183, "train/steps_per_hour": 301.8973420742143, "train/total_elapsed_hours": 0.1987430548005634 }, { "epoch": 0.1445925925925926, "grad_norm": 5.65625, "learning_rate": 1.9900353647278466e-05, "loss": 2.1426, "step": 61 }, { "epoch": 0.1445925925925926, "step": 61, "train/combined_loss": 0.13261367939412594, "train/cross_entropy_loss": 0.022612601169385016, "train/kl_divergence_loss": 0.24261474609375, "train/step_duration_seconds": 7.164619207382202, "train/steps_per_hour": 303.88591351636484, "train/total_elapsed_hours": 0.200733226802614 }, { "epoch": 0.14696296296296296, "grad_norm": 8.8125, "learning_rate": 1.9888308262251286e-05, "loss": 2.1218, "step": 62 }, { "epoch": 0.14696296296296296, "step": 62, "train/combined_loss": 0.1269391467794776, "train/cross_entropy_loss": 0.022066760691814125, "train/kl_divergence_loss": 0.2318115234375, "train/step_duration_seconds": 7.142378091812134, "train/steps_per_hour": 305.84476139080533, "train/total_elapsed_hours": 0.20271722071700626 }, { "epoch": 0.14933333333333335, "grad_norm": 20.25, "learning_rate": 1.9875579853729677e-05, "loss": 2.031, "step": 63 }, { "epoch": 0.14933333333333335, "step": 63, "train/combined_loss": 0.1287386268377304, "train/cross_entropy_loss": 0.02023360482417047, "train/kl_divergence_loss": 0.23724365234375, "train/step_duration_seconds": 7.179792881011963, "train/steps_per_hour": 307.75001344506256, "train/total_elapsed_hours": 0.20471160762839846 }, { "epoch": 0.1517037037037037, "grad_norm": 8.125, "learning_rate": 1.9862169300913784e-05, "loss": 2.0598, "step": 64 }, { "epoch": 0.1517037037037037, "step": 64, "train/combined_loss": 0.1224580081179738, "train/cross_entropy_loss": 0.019146979437209666, "train/kl_divergence_loss": 0.22576904296875, "train/step_duration_seconds": 7.164458274841309, "train/steps_per_hour": 309.624880616572, "train/total_elapsed_hours": 0.2067017349269655 }, { "epoch": 0.15407407407407409, "grad_norm": 17.625, "learning_rate": 1.9848077530122083e-05, "loss": 1.9593, "step": 65 }, { "epoch": 0.15407407407407409, "step": 65, "train/combined_loss": 0.1219773581251502, "train/cross_entropy_loss": 0.02142053795978427, "train/kl_divergence_loss": 0.2225341796875, "train/step_duration_seconds": 7.1500184535980225, "train/steps_per_hour": 311.46997603082906, "train/total_elapsed_hours": 0.20868785116407607 }, { "epoch": 0.15644444444444444, "grad_norm": 14.625, "learning_rate": 1.9833305514727396e-05, "loss": 1.9516, "step": 66 }, { "epoch": 0.15644444444444444, "step": 66, "train/combined_loss": 0.11912317294627428, "train/cross_entropy_loss": 0.019252198981121182, "train/kl_divergence_loss": 0.218994140625, "train/step_duration_seconds": 7.173556327819824, "train/steps_per_hour": 313.2705599924478, "train/total_elapsed_hours": 0.21068050569958158 }, { "epoch": 0.15881481481481483, "grad_norm": 11.75, "learning_rate": 1.981785427508966e-05, "loss": 1.906, "step": 67 }, { "epoch": 0.15881481481481483, "step": 67, "train/combined_loss": 0.11908936966210604, "train/cross_entropy_loss": 0.02040529961232096, "train/kl_divergence_loss": 0.2177734375, "train/step_duration_seconds": 7.164828062057495, "train/steps_per_hour": 315.0409941178432, "train/total_elapsed_hours": 0.21267073571681977 }, { "epoch": 0.16118518518518518, "grad_norm": 7.46875, "learning_rate": 1.9801724878485438e-05, "loss": 1.9054, "step": 68 }, { "epoch": 0.16118518518518518, "step": 68, "train/combined_loss": 0.12613936699926853, "train/cross_entropy_loss": 0.025533129926770926, "train/kl_divergence_loss": 0.22674560546875, "train/step_duration_seconds": 7.197604179382324, "train/steps_per_hour": 316.7651640171972, "train/total_elapsed_hours": 0.21467007021109263 }, { "epoch": 0.16355555555555557, "grad_norm": 37.25, "learning_rate": 1.9784918439034216e-05, "loss": 2.0182, "step": 69 }, { "epoch": 0.16355555555555557, "step": 69, "train/combined_loss": 0.11615706328302622, "train/cross_entropy_loss": 0.024733559926971793, "train/kl_divergence_loss": 0.20758056640625, "train/step_duration_seconds": 7.158296346664429, "train/steps_per_hour": 318.4735632448237, "train/total_elapsed_hours": 0.21665848586294387 }, { "epoch": 0.16592592592592592, "grad_norm": 25.25, "learning_rate": 1.9767436117621416e-05, "loss": 1.8585, "step": 70 }, { "epoch": 0.16592592592592592, "step": 70, "train/combined_loss": 0.11343861371278763, "train/cross_entropy_loss": 0.018869414925575256, "train/kl_divergence_loss": 0.2080078125, "train/step_duration_seconds": 7.1701250076293945, "train/steps_per_hour": 320.1460784421751, "train/total_elapsed_hours": 0.21865018725395202 }, { "epoch": 0.1682962962962963, "grad_norm": 37.75, "learning_rate": 1.9749279121818235e-05, "loss": 1.815, "step": 71 }, { "epoch": 0.1682962962962963, "step": 71, "train/combined_loss": 0.1213934626430273, "train/cross_entropy_loss": 0.01866583281662315, "train/kl_divergence_loss": 0.22412109375, "train/step_duration_seconds": 7.159663677215576, "train/steps_per_hour": 321.7926366627231, "train/total_elapsed_hours": 0.22063898271984525 }, { "epoch": 0.17066666666666666, "grad_norm": 29.125, "learning_rate": 1.973044870579824e-05, "loss": 1.9423, "step": 72 }, { "epoch": 0.17066666666666666, "step": 72, "train/combined_loss": 0.11181758902966976, "train/cross_entropy_loss": 0.019289474468678236, "train/kl_divergence_loss": 0.204345703125, "train/step_duration_seconds": 7.168922185897827, "train/steps_per_hour": 323.4060405602183, "train/total_elapsed_hours": 0.22263034999370576 }, { "epoch": 0.17303703703703704, "grad_norm": 26.5, "learning_rate": 1.9710946170250702e-05, "loss": 1.7891, "step": 73 }, { "epoch": 0.17303703703703704, "step": 73, "train/combined_loss": 0.11201242171227932, "train/cross_entropy_loss": 0.020838803611695766, "train/kl_divergence_loss": 0.20318603515625, "train/step_duration_seconds": 7.184260606765747, "train/steps_per_hour": 324.9846730527734, "train/total_elapsed_hours": 0.22462597794002956 }, { "epoch": 0.1754074074074074, "grad_norm": 28.75, "learning_rate": 1.969077286229078e-05, "loss": 1.7922, "step": 74 }, { "epoch": 0.1754074074074074, "step": 74, "train/combined_loss": 0.11633206205442548, "train/cross_entropy_loss": 0.022031799773685634, "train/kl_divergence_loss": 0.21063232421875, "train/step_duration_seconds": 7.156804323196411, "train/steps_per_hour": 326.54649232377625, "train/total_elapsed_hours": 0.22661397914091747 }, { "epoch": 0.17777777777777778, "grad_norm": 6.15625, "learning_rate": 1.9669930175366474e-05, "loss": 1.8613, "step": 75 }, { "epoch": 0.17777777777777778, "step": 75, "train/combined_loss": 0.10736048221588135, "train/cross_entropy_loss": 0.019408458843827248, "train/kl_divergence_loss": 0.1953125, "train/step_duration_seconds": 7.157041549682617, "train/steps_per_hour": 328.08105279118894, "train/total_elapsed_hours": 0.22860204623805153 }, { "epoch": 0.18014814814814814, "grad_norm": 19.875, "learning_rate": 1.964841954916235e-05, "loss": 1.7178, "step": 76 }, { "epoch": 0.18014814814814814, "step": 76, "train/combined_loss": 0.11280431784689426, "train/cross_entropy_loss": 0.021934314048849046, "train/kl_divergence_loss": 0.20367431640625, "train/step_duration_seconds": 7.154073476791382, "train/steps_per_hour": 329.5903308097024, "train/total_elapsed_hours": 0.23058928887049357 }, { "epoch": 0.18251851851851852, "grad_norm": 8.625, "learning_rate": 1.962624246950012e-05, "loss": 1.8049, "step": 77 }, { "epoch": 0.18251851851851852, "step": 77, "train/combined_loss": 0.10877907322719693, "train/cross_entropy_loss": 0.021696327603422105, "train/kl_divergence_loss": 0.19586181640625, "train/step_duration_seconds": 7.163645029067993, "train/steps_per_hour": 331.0700321531476, "train/total_elapsed_hours": 0.2325791902674569 }, { "epoch": 0.18488888888888888, "grad_norm": 6.375, "learning_rate": 1.9603400468236e-05, "loss": 1.7405, "step": 78 }, { "epoch": 0.18488888888888888, "step": 78, "train/combined_loss": 0.11846707155928016, "train/cross_entropy_loss": 0.022090390557423234, "train/kl_divergence_loss": 0.21484375, "train/step_duration_seconds": 7.1567511558532715, "train/steps_per_hour": 332.52734290219325, "train/total_elapsed_hours": 0.23456717669963836 }, { "epoch": 0.18725925925925926, "grad_norm": 23.5, "learning_rate": 1.957989512315489e-05, "loss": 1.8955, "step": 79 }, { "epoch": 0.18725925925925926, "step": 79, "train/combined_loss": 0.10807515401393175, "train/cross_entropy_loss": 0.020105381147004664, "train/kl_divergence_loss": 0.196044921875, "train/step_duration_seconds": 7.179587125778198, "train/steps_per_hour": 333.9512044172201, "train/total_elapsed_hours": 0.23656150645679896 }, { "epoch": 0.18962962962962962, "grad_norm": 12.5625, "learning_rate": 1.955572805786141e-05, "loss": 1.7292, "step": 80 }, { "epoch": 0.18962962962962962, "step": 80, "train/combined_loss": 0.10483178775757551, "train/cross_entropy_loss": 0.022346684243530035, "train/kl_divergence_loss": 0.18731689453125, "train/step_duration_seconds": 7.17809271812439, "train/steps_per_hour": 335.3518424811757, "train/total_elapsed_hours": 0.23855542110072242 }, { "epoch": 0.192, "grad_norm": 43.25, "learning_rate": 1.9530900941667733e-05, "loss": 1.6773, "step": 81 }, { "epoch": 0.192, "step": 81, "train/combined_loss": 0.1046543437987566, "train/cross_entropy_loss": 0.02217490249313414, "train/kl_divergence_loss": 0.1871337890625, "train/step_duration_seconds": 7.168772459030151, "train/steps_per_hour": 336.7328849090869, "train/total_elapsed_hours": 0.24054674678378635 }, { "epoch": 0.19437037037037036, "grad_norm": 36.75, "learning_rate": 1.9505415489478293e-05, "loss": 1.6745, "step": 82 }, { "epoch": 0.19437037037037036, "step": 82, "train/combined_loss": 0.10497199138626456, "train/cross_entropy_loss": 0.019270156044512987, "train/kl_divergence_loss": 0.190673828125, "train/step_duration_seconds": 7.172302484512329, "train/steps_per_hour": 338.08988274573534, "train/total_elapsed_hours": 0.24253905302948423 }, { "epoch": 0.19674074074074074, "grad_norm": 19.375, "learning_rate": 1.947927346167132e-05, "loss": 1.6796, "step": 83 }, { "epoch": 0.19674074074074074, "step": 83, "train/combined_loss": 0.1053922800347209, "train/cross_entropy_loss": 0.01895106490701437, "train/kl_divergence_loss": 0.19183349609375, "train/step_duration_seconds": 7.172720909118652, "train/steps_per_hour": 339.42460711386434, "train/total_elapsed_hours": 0.2445314755042394 }, { "epoch": 0.1991111111111111, "grad_norm": 26.75, "learning_rate": 1.945247666397725e-05, "loss": 1.6863, "step": 84 }, { "epoch": 0.1991111111111111, "step": 84, "train/combined_loss": 0.10413095075637102, "train/cross_entropy_loss": 0.019968447857536376, "train/kl_divergence_loss": 0.18829345703125, "train/step_duration_seconds": 7.1637890338897705, "train/steps_per_hour": 340.74118612261555, "train/total_elapsed_hours": 0.24652141690254212 }, { "epoch": 0.20148148148148148, "grad_norm": 10.1875, "learning_rate": 1.9425026947353994e-05, "loss": 1.6661, "step": 85 }, { "epoch": 0.20148148148148148, "step": 85, "train/combined_loss": 0.1039673495106399, "train/cross_entropy_loss": 0.023974738782271743, "train/kl_divergence_loss": 0.1839599609375, "train/step_duration_seconds": 7.150022268295288, "train/steps_per_hour": 342.04194361300205, "train/total_elapsed_hours": 0.2485075341992908 }, { "epoch": 0.20385185185185184, "grad_norm": 52.25, "learning_rate": 1.9396926207859085e-05, "loss": 1.6635, "step": 86 }, { "epoch": 0.20385185185185184, "step": 86, "train/combined_loss": 0.10607085470110178, "train/cross_entropy_loss": 0.02512999135069549, "train/kl_divergence_loss": 0.18701171875, "train/step_duration_seconds": 7.172277927398682, "train/steps_per_hour": 343.31360127461863, "train/total_elapsed_hours": 0.2504998336235682 }, { "epoch": 0.20622222222222222, "grad_norm": 56.75, "learning_rate": 1.936817638651871e-05, "loss": 1.6971, "step": 87 }, { "epoch": 0.20622222222222222, "step": 87, "train/combined_loss": 0.10299085499718785, "train/cross_entropy_loss": 0.023303485824726522, "train/kl_divergence_loss": 0.18267822265625, "train/step_duration_seconds": 7.151454925537109, "train/steps_per_hour": 344.57308438928993, "train/total_elapsed_hours": 0.25248634888066185 }, { "epoch": 0.20859259259259258, "grad_norm": 27.0, "learning_rate": 1.9338779469193638e-05, "loss": 1.6479, "step": 88 }, { "epoch": 0.20859259259259258, "step": 88, "train/combined_loss": 0.10254441620782018, "train/cross_entropy_loss": 0.019541956367902458, "train/kl_divergence_loss": 0.185546875, "train/step_duration_seconds": 7.150182485580444, "train/steps_per_hour": 345.81338378782823, "train/total_elapsed_hours": 0.254472510682212 }, { "epoch": 0.21096296296296296, "grad_norm": 57.25, "learning_rate": 1.9308737486442045e-05, "loss": 1.6407, "step": 89 }, { "epoch": 0.21096296296296296, "step": 89, "train/combined_loss": 0.1078284471295774, "train/cross_entropy_loss": 0.01973404036834836, "train/kl_divergence_loss": 0.1959228515625, "train/step_duration_seconds": 7.157840967178345, "train/steps_per_hour": 347.0315933491895, "train/total_elapsed_hours": 0.2564607998397615 }, { "epoch": 0.21333333333333335, "grad_norm": 59.25, "learning_rate": 1.9278052513379256e-05, "loss": 1.7253, "step": 90 }, { "epoch": 0.21333333333333335, "step": 90, "train/combined_loss": 0.1028821705840528, "train/cross_entropy_loss": 0.01917986525222659, "train/kl_divergence_loss": 0.18658447265625, "train/step_duration_seconds": 7.162302017211914, "train/steps_per_hour": 348.2293895098418, "train/total_elapsed_hours": 0.25845032817787594 }, { "epoch": 0.2157037037037037, "grad_norm": 36.5, "learning_rate": 1.9246726669534416e-05, "loss": 1.6461, "step": 91 }, { "epoch": 0.2157037037037037, "step": 91, "train/combined_loss": 0.10168306482955813, "train/cross_entropy_loss": 0.02093204390257597, "train/kl_divergence_loss": 0.18243408203125, "train/step_duration_seconds": 7.153521299362183, "train/steps_per_hour": 349.4121578181558, "train/total_elapsed_hours": 0.26043741742769877 }, { "epoch": 0.2180740740740741, "grad_norm": 27.5, "learning_rate": 1.921476211870408e-05, "loss": 1.6269, "step": 92 }, { "epoch": 0.2180740740740741, "step": 92, "train/combined_loss": 0.09777736244723201, "train/cross_entropy_loss": 0.0219097004737705, "train/kl_divergence_loss": 0.17364501953125, "train/step_duration_seconds": 7.162153959274292, "train/steps_per_hour": 350.5738107405354, "train/total_elapsed_hours": 0.2624269046386083 }, { "epoch": 0.22044444444444444, "grad_norm": 34.5, "learning_rate": 1.9182161068802742e-05, "loss": 1.5644, "step": 93 }, { "epoch": 0.22044444444444444, "step": 93, "train/combined_loss": 0.10163608659058809, "train/cross_entropy_loss": 0.02199775306507945, "train/kl_divergence_loss": 0.1812744140625, "train/step_duration_seconds": 7.154653549194336, "train/steps_per_hour": 351.72075430393164, "train/total_elapsed_hours": 0.26441430840227337 }, { "epoch": 0.22281481481481483, "grad_norm": 20.5, "learning_rate": 1.9148925771710347e-05, "loss": 1.6262, "step": 94 }, { "epoch": 0.22281481481481483, "step": 94, "train/combined_loss": 0.09920958010479808, "train/cross_entropy_loss": 0.019586148322559893, "train/kl_divergence_loss": 0.1788330078125, "train/step_duration_seconds": 7.162054061889648, "train/steps_per_hour": 352.84786230208005, "train/total_elapsed_hours": 0.2664037678639094 }, { "epoch": 0.22518518518518518, "grad_norm": 31.625, "learning_rate": 1.9115058523116734e-05, "loss": 1.5874, "step": 95 }, { "epoch": 0.22518518518518518, "step": 95, "train/combined_loss": 0.09802744071930647, "train/cross_entropy_loss": 0.01880878652445972, "train/kl_divergence_loss": 0.17724609375, "train/step_duration_seconds": 7.153456926345825, "train/steps_per_hour": 353.9614104256406, "train/total_elapsed_hours": 0.2683908392323388 }, { "epoch": 0.22755555555555557, "grad_norm": 38.75, "learning_rate": 1.908056166236305e-05, "loss": 1.5684, "step": 96 }, { "epoch": 0.22755555555555557, "step": 96, "train/combined_loss": 0.09743851656094193, "train/cross_entropy_loss": 0.018851646571420133, "train/kl_divergence_loss": 0.176025390625, "train/step_duration_seconds": 7.1597981452941895, "train/steps_per_hour": 355.0562779811278, "train/total_elapsed_hours": 0.2703796720504761 }, { "epoch": 0.22992592592592592, "grad_norm": 26.125, "learning_rate": 1.9045437572280193e-05, "loss": 1.559, "step": 97 }, { "epoch": 0.22992592592592592, "step": 97, "train/combined_loss": 0.0965579142794013, "train/cross_entropy_loss": 0.02038634184282273, "train/kl_divergence_loss": 0.1727294921875, "train/step_duration_seconds": 7.157320976257324, "train/steps_per_hour": 356.13605583694607, "train/total_elapsed_hours": 0.2723678167661031 }, { "epoch": 0.2322962962962963, "grad_norm": 18.125, "learning_rate": 1.900968867902419e-05, "loss": 1.5449, "step": 98 }, { "epoch": 0.2322962962962963, "step": 98, "train/combined_loss": 0.09678681008517742, "train/cross_entropy_loss": 0.020966205513104796, "train/kl_divergence_loss": 0.172607421875, "train/step_duration_seconds": 7.1417396068573, "train/steps_per_hour": 357.2058194544125, "train/total_elapsed_hours": 0.2743516333235635 }, { "epoch": 0.23466666666666666, "grad_norm": 27.5, "learning_rate": 1.8973317451908642e-05, "loss": 1.5486, "step": 99 }, { "epoch": 0.23466666666666666, "step": 99, "train/combined_loss": 0.10045615630224347, "train/cross_entropy_loss": 0.02183516975492239, "train/kl_divergence_loss": 0.1790771484375, "train/step_duration_seconds": 7.167134761810303, "train/steps_per_hour": 358.25107804442126, "train/total_elapsed_hours": 0.276342504090733 }, { "epoch": 0.23703703703703705, "grad_norm": 17.75, "learning_rate": 1.8936326403234125e-05, "loss": 1.6073, "step": 100 }, { "epoch": 0.23703703703703705, "eval_combined_loss": 0.09612167934452494, "eval_cross_entropy_loss": 0.02090351493904988, "eval_kl_divergence_loss": 0.17133984375, "eval_loss": 0.09612167626619339, "eval_runtime": 219.9162, "eval_samples_per_second": 6.821, "eval_steps_per_second": 3.41, "step": 100 }, { "epoch": 0.23703703703703705, "step": 100, "train/combined_loss": 0.09815677208825946, "train/cross_entropy_loss": 0.020898508373647928, "train/kl_divergence_loss": 0.1754150390625, "train/step_duration_seconds": 227.11371684074402, "train/steps_per_hour": 294.61186048448315, "train/total_elapsed_hours": 0.33942964765760636 }, { "epoch": 0.2394074074074074, "grad_norm": 21.125, "learning_rate": 1.8898718088114688e-05, "loss": 1.5705, "step": 101 }, { "epoch": 0.2394074074074074, "step": 101, "train/combined_loss": 0.09819618007168174, "train/cross_entropy_loss": 0.020428007235750556, "train/kl_divergence_loss": 0.17596435546875, "train/step_duration_seconds": 7.171290159225464, "train/steps_per_hour": 295.82187680336136, "train/total_elapsed_hours": 0.34142167270183565 }, { "epoch": 0.24177777777777779, "grad_norm": 25.25, "learning_rate": 1.8860495104301346e-05, "loss": 1.5711, "step": 102 }, { "epoch": 0.24177777777777779, "step": 102, "train/combined_loss": 0.09767304686829448, "train/cross_entropy_loss": 0.020236226613633335, "train/kl_divergence_loss": 0.17510986328125, "train/step_duration_seconds": 7.163306951522827, "train/steps_per_hour": 297.0197733169854, "train/total_elapsed_hours": 0.34341148018836976 }, { "epoch": 0.24414814814814814, "grad_norm": 12.8125, "learning_rate": 1.8821660092002642e-05, "loss": 1.5628, "step": 103 }, { "epoch": 0.24414814814814814, "step": 103, "train/combined_loss": 0.09357908833771944, "train/cross_entropy_loss": 0.021325660520233214, "train/kl_divergence_loss": 0.16583251953125, "train/step_duration_seconds": 7.165699005126953, "train/steps_per_hour": 298.2032943460889, "train/total_elapsed_hours": 0.34540195213423835 }, { "epoch": 0.24651851851851853, "grad_norm": 33.25, "learning_rate": 1.8782215733702286e-05, "loss": 1.4973, "step": 104 }, { "epoch": 0.24651851851851853, "step": 104, "train/combined_loss": 0.09911046642810106, "train/cross_entropy_loss": 0.022012439789250493, "train/kl_divergence_loss": 0.17620849609375, "train/step_duration_seconds": 7.175975322723389, "train/steps_per_hour": 299.3707928769077, "train/total_elapsed_hours": 0.34739527861277264 }, { "epoch": 0.24888888888888888, "grad_norm": 38.0, "learning_rate": 1.874216475397386e-05, "loss": 1.5858, "step": 105 }, { "epoch": 0.24888888888888888, "step": 105, "train/combined_loss": 0.09527313988655806, "train/cross_entropy_loss": 0.02172303292900324, "train/kl_divergence_loss": 0.1688232421875, "train/step_duration_seconds": 7.173555850982666, "train/steps_per_hour": 300.5255479022072, "train/total_elapsed_hours": 0.3493879330158234 }, { "epoch": 0.25125925925925924, "grad_norm": 30.0, "learning_rate": 1.870150991929261e-05, "loss": 1.5244, "step": 106 }, { "epoch": 0.25125925925925924, "step": 106, "train/combined_loss": 0.09378110896795988, "train/cross_entropy_loss": 0.02069209818728268, "train/kl_divergence_loss": 0.1668701171875, "train/step_duration_seconds": 7.196229696273804, "train/steps_per_hour": 301.66179874940855, "train/total_elapsed_hours": 0.35138688570923277 }, { "epoch": 0.25362962962962965, "grad_norm": 9.4375, "learning_rate": 1.866025403784439e-05, "loss": 1.5005, "step": 107 }, { "epoch": 0.25362962962962965, "step": 107, "train/combined_loss": 0.09853040147572756, "train/cross_entropy_loss": 0.019936785800382495, "train/kl_divergence_loss": 0.1771240234375, "train/step_duration_seconds": 7.178696632385254, "train/steps_per_hour": 302.78936801023747, "train/total_elapsed_hours": 0.35338096810711755 }, { "epoch": 0.256, "grad_norm": 17.625, "learning_rate": 1.8618399959331642e-05, "loss": 1.5765, "step": 108 }, { "epoch": 0.256, "step": 108, "train/combined_loss": 0.09358909726142883, "train/cross_entropy_loss": 0.020552222267724574, "train/kl_divergence_loss": 0.1666259765625, "train/step_duration_seconds": 7.157944679260254, "train/steps_per_hour": 303.90921284525064, "train/total_elapsed_hours": 0.3553692860735787 }, { "epoch": 0.25837037037037036, "grad_norm": 9.6875, "learning_rate": 1.8575950574776595e-05, "loss": 1.4974, "step": 109 }, { "epoch": 0.25837037037037036, "step": 109, "train/combined_loss": 0.09076927369460464, "train/cross_entropy_loss": 0.0206498735351488, "train/kl_divergence_loss": 0.160888671875, "train/step_duration_seconds": 7.1567769050598145, "train/steps_per_hour": 305.0168730415088, "train/total_elapsed_hours": 0.35735727965831754 }, { "epoch": 0.2607407407407407, "grad_norm": 22.375, "learning_rate": 1.8532908816321557e-05, "loss": 1.4523, "step": 110 }, { "epoch": 0.2607407407407407, "step": 110, "train/combined_loss": 0.09263847023248672, "train/cross_entropy_loss": 0.021519619156606495, "train/kl_divergence_loss": 0.16375732421875, "train/step_duration_seconds": 7.173721790313721, "train/steps_per_hour": 306.1082679129725, "train/total_elapsed_hours": 0.3593499801556269 }, { "epoch": 0.26311111111111113, "grad_norm": 23.875, "learning_rate": 1.8489277657026377e-05, "loss": 1.4822, "step": 111 }, { "epoch": 0.26311111111111113, "step": 111, "train/combined_loss": 0.0902865119278431, "train/cross_entropy_loss": 0.020905061159282923, "train/kl_divergence_loss": 0.15966796875, "train/step_duration_seconds": 7.147202253341675, "train/steps_per_hour": 307.1938879594452, "train/total_elapsed_hours": 0.3613353141148885 }, { "epoch": 0.2654814814814815, "grad_norm": 11.125, "learning_rate": 1.844506011066308e-05, "loss": 1.4446, "step": 112 }, { "epoch": 0.2654814814814815, "step": 112, "train/combined_loss": 0.09026105608791113, "train/cross_entropy_loss": 0.019145151949487627, "train/kl_divergence_loss": 0.161376953125, "train/step_duration_seconds": 7.135612726211548, "train/steps_per_hour": 308.27037497741026, "train/total_elapsed_hours": 0.3633174287610584 }, { "epoch": 0.26785185185185184, "grad_norm": 35.25, "learning_rate": 1.8400259231507716e-05, "loss": 1.4442, "step": 113 }, { "epoch": 0.26785185185185184, "step": 113, "train/combined_loss": 0.09185996558517218, "train/cross_entropy_loss": 0.019108111271634698, "train/kl_divergence_loss": 0.16461181640625, "train/step_duration_seconds": 7.138214826583862, "train/steps_per_hour": 309.3345678925583, "train/total_elapsed_hours": 0.36530026621288725 }, { "epoch": 0.2702222222222222, "grad_norm": 42.0, "learning_rate": 1.8354878114129368e-05, "loss": 1.4698, "step": 114 }, { "epoch": 0.2702222222222222, "step": 114, "train/combined_loss": 0.08998283930122852, "train/cross_entropy_loss": 0.018710799398832023, "train/kl_divergence_loss": 0.1612548828125, "train/step_duration_seconds": 7.160963296890259, "train/steps_per_hour": 310.38193032311864, "train/total_elapsed_hours": 0.36728942268424564 }, { "epoch": 0.2725925925925926, "grad_norm": 32.75, "learning_rate": 1.8308919893176397e-05, "loss": 1.4397, "step": 115 }, { "epoch": 0.2725925925925926, "step": 115, "train/combined_loss": 0.08985556894913316, "train/cross_entropy_loss": 0.02016523655038327, "train/kl_divergence_loss": 0.1595458984375, "train/step_duration_seconds": 7.191335916519165, "train/steps_per_hour": 311.4108945594422, "train/total_elapsed_hours": 0.36928701599438984 }, { "epoch": 0.27496296296296296, "grad_norm": 9.5, "learning_rate": 1.826238774315995e-05, "loss": 1.4377, "step": 116 }, { "epoch": 0.27496296296296296, "step": 116, "train/combined_loss": 0.090081796515733, "train/cross_entropy_loss": 0.0222046107519418, "train/kl_divergence_loss": 0.157958984375, "train/step_duration_seconds": 7.165991544723511, "train/steps_per_hour": 312.43471090794657, "train/total_elapsed_hours": 0.3712775692012575 }, { "epoch": 0.2773333333333333, "grad_norm": 17.25, "learning_rate": 1.8215284878234644e-05, "loss": 1.4413, "step": 117 }, { "epoch": 0.2773333333333333, "step": 117, "train/combined_loss": 0.09464696934446692, "train/cross_entropy_loss": 0.023217282141558826, "train/kl_divergence_loss": 0.16607666015625, "train/step_duration_seconds": 7.209496736526489, "train/steps_per_hour": 313.43745997881086, "train/total_elapsed_hours": 0.373280207183626 }, { "epoch": 0.2797037037037037, "grad_norm": 10.8125, "learning_rate": 1.816761455197657e-05, "loss": 1.5144, "step": 118 }, { "epoch": 0.2797037037037037, "step": 118, "train/combined_loss": 0.08674649894237518, "train/cross_entropy_loss": 0.018219567835330963, "train/kl_divergence_loss": 0.1552734375, "train/step_duration_seconds": 7.183828592300415, "train/steps_per_hour": 314.43548101801, "train/total_elapsed_hours": 0.3752757151259316 }, { "epoch": 0.2820740740740741, "grad_norm": 22.625, "learning_rate": 1.811938005715857e-05, "loss": 1.3879, "step": 119 }, { "epoch": 0.2820740740740741, "step": 119, "train/combined_loss": 0.08797085983678699, "train/cross_entropy_loss": 0.017738587921485305, "train/kl_divergence_loss": 0.158203125, "train/step_duration_seconds": 7.165632963180542, "train/steps_per_hour": 315.4271701636993, "train/total_elapsed_hours": 0.3772661687268151 }, { "epoch": 0.28444444444444444, "grad_norm": 24.0, "learning_rate": 1.8070584725522763e-05, "loss": 1.4075, "step": 120 }, { "epoch": 0.28444444444444444, "step": 120, "train/combined_loss": 0.09114356013014913, "train/cross_entropy_loss": 0.018773937365040183, "train/kl_divergence_loss": 0.16351318359375, "train/step_duration_seconds": 7.167950868606567, "train/steps_per_hour": 316.4079127749189, "train/total_elapsed_hours": 0.37925726619031697 }, { "epoch": 0.2868148148148148, "grad_norm": 12.625, "learning_rate": 1.802123192755044e-05, "loss": 1.4583, "step": 121 }, { "epoch": 0.2868148148148148, "step": 121, "train/combined_loss": 0.09176525427028537, "train/cross_entropy_loss": 0.024289784720167518, "train/kl_divergence_loss": 0.15924072265625, "train/step_duration_seconds": 7.175249338150024, "train/steps_per_hour": 317.3767236816989, "train/total_elapsed_hours": 0.38125039100646974 }, { "epoch": 0.2891851851851852, "grad_norm": 31.75, "learning_rate": 1.7971325072229227e-05, "loss": 1.4682, "step": 122 }, { "epoch": 0.2891851851851852, "step": 122, "train/combined_loss": 0.0890495995990932, "train/cross_entropy_loss": 0.025206139660440385, "train/kl_divergence_loss": 0.15289306640625, "train/step_duration_seconds": 7.177129745483398, "train/steps_per_hour": 318.3350237785182, "train/total_elapsed_hours": 0.3832440381579929 }, { "epoch": 0.29155555555555557, "grad_norm": 38.75, "learning_rate": 1.7920867606817625e-05, "loss": 1.4248, "step": 123 }, { "epoch": 0.29155555555555557, "step": 123, "train/combined_loss": 0.0867073736153543, "train/cross_entropy_loss": 0.023878611740656197, "train/kl_divergence_loss": 0.1495361328125, "train/step_duration_seconds": 7.162760257720947, "train/steps_per_hour": 319.2867134529585, "train/total_elapsed_hours": 0.3852336937851376 }, { "epoch": 0.2939259259259259, "grad_norm": 33.0, "learning_rate": 1.7869863016606893e-05, "loss": 1.3873, "step": 124 }, { "epoch": 0.2939259259259259, "step": 124, "train/combined_loss": 0.08373823016881943, "train/cross_entropy_loss": 0.020015520974993706, "train/kl_divergence_loss": 0.1474609375, "train/step_duration_seconds": 7.192008972167969, "train/steps_per_hour": 320.221904230669, "train/total_elapsed_hours": 0.38723147405518427 }, { "epoch": 0.2962962962962963, "grad_norm": 5.34375, "learning_rate": 1.78183148246803e-05, "loss": 1.3398, "step": 125 }, { "epoch": 0.2962962962962963, "step": 125, "train/combined_loss": 0.08616493362933397, "train/cross_entropy_loss": 0.016812282847240567, "train/kl_divergence_loss": 0.155517578125, "train/step_duration_seconds": 7.1917195320129395, "train/steps_per_hour": 321.14756131827295, "train/total_elapsed_hours": 0.38922917392518785 }, { "epoch": 0.2986666666666667, "grad_norm": 39.0, "learning_rate": 1.7766226591669787e-05, "loss": 1.3786, "step": 126 }, { "epoch": 0.2986666666666667, "step": 126, "train/combined_loss": 0.09160786820575595, "train/cross_entropy_loss": 0.016711829113774, "train/kl_divergence_loss": 0.16650390625, "train/step_duration_seconds": 7.179956436157227, "train/steps_per_hour": 322.0664550428592, "train/total_elapsed_hours": 0.39122360626856484 }, { "epoch": 0.30103703703703705, "grad_norm": 44.0, "learning_rate": 1.771360191551e-05, "loss": 1.4657, "step": 127 }, { "epoch": 0.30103703703703705, "step": 127, "train/combined_loss": 0.08970451634377241, "train/cross_entropy_loss": 0.017787940218113363, "train/kl_divergence_loss": 0.16162109375, "train/step_duration_seconds": 7.176481485366821, "train/steps_per_hour": 322.9768202044927, "train/total_elapsed_hours": 0.3932170733478334 }, { "epoch": 0.3034074074074074, "grad_norm": 36.0, "learning_rate": 1.766044443118978e-05, "loss": 1.4353, "step": 128 }, { "epoch": 0.3034074074074074, "step": 128, "train/combined_loss": 0.08426619321107864, "train/cross_entropy_loss": 0.019362466409802437, "train/kl_divergence_loss": 0.149169921875, "train/step_duration_seconds": 7.183627605438232, "train/steps_per_hour": 323.876374745131, "train/total_elapsed_hours": 0.39521252546045516 }, { "epoch": 0.30577777777777776, "grad_norm": 18.25, "learning_rate": 1.760675781050109e-05, "loss": 1.3483, "step": 129 }, { "epoch": 0.30577777777777776, "step": 129, "train/combined_loss": 0.09070024406537414, "train/cross_entropy_loss": 0.02429600094910711, "train/kl_divergence_loss": 0.1571044921875, "train/step_duration_seconds": 7.154268741607666, "train/steps_per_hour": 324.77355916793476, "train/total_elapsed_hours": 0.39719982233312395 }, { "epoch": 0.30814814814814817, "grad_norm": 35.0, "learning_rate": 1.755254576178535e-05, "loss": 1.4512, "step": 130 }, { "epoch": 0.30814814814814817, "step": 130, "train/combined_loss": 0.09242757642641664, "train/cross_entropy_loss": 0.025797537760809064, "train/kl_divergence_loss": 0.1590576171875, "train/step_duration_seconds": 7.181564092636108, "train/steps_per_hour": 325.6556251715386, "train/total_elapsed_hours": 0.3991947012477451 }, { "epoch": 0.3105185185185185, "grad_norm": 40.5, "learning_rate": 1.7497812029677344e-05, "loss": 1.4788, "step": 131 }, { "epoch": 0.3105185185185185, "step": 131, "train/combined_loss": 0.08590791560709476, "train/cross_entropy_loss": 0.024782140040770173, "train/kl_divergence_loss": 0.14703369140625, "train/step_duration_seconds": 7.177969694137573, "train/steps_per_hour": 326.52973182564483, "train/total_elapsed_hours": 0.4011885817183389 }, { "epoch": 0.3128888888888889, "grad_norm": 36.5, "learning_rate": 1.7442560394846518e-05, "loss": 1.3745, "step": 132 }, { "epoch": 0.3128888888888889, "step": 132, "train/combined_loss": 0.08688511373475194, "train/cross_entropy_loss": 0.022158897132612765, "train/kl_divergence_loss": 0.151611328125, "train/step_duration_seconds": 7.1588640213012695, "train/steps_per_hour": 327.39950253665864, "train/total_elapsed_hours": 0.4031771550575892 }, { "epoch": 0.31525925925925924, "grad_norm": 16.0, "learning_rate": 1.738679467373586e-05, "loss": 1.3902, "step": 133 }, { "epoch": 0.31525925925925924, "step": 133, "train/combined_loss": 0.08377803396433592, "train/cross_entropy_loss": 0.017897860845550895, "train/kl_divergence_loss": 0.149658203125, "train/step_duration_seconds": 7.175210237503052, "train/steps_per_hour": 328.257056778055, "train/total_elapsed_hours": 0.4051702690124512 }, { "epoch": 0.31762962962962965, "grad_norm": 34.5, "learning_rate": 1.7330518718298263e-05, "loss": 1.3404, "step": 134 }, { "epoch": 0.31762962962962965, "step": 134, "train/combined_loss": 0.08916169637814164, "train/cross_entropy_loss": 0.017007473739795387, "train/kl_divergence_loss": 0.16131591796875, "train/step_duration_seconds": 7.173584222793579, "train/steps_per_hour": 329.106580437697, "train/total_elapsed_hours": 0.4071629312965605 }, { "epoch": 0.32, "grad_norm": 42.75, "learning_rate": 1.7273736415730488e-05, "loss": 1.4266, "step": 135 }, { "epoch": 0.32, "step": 135, "train/combined_loss": 0.08420996041968465, "train/cross_entropy_loss": 0.016442388528957963, "train/kl_divergence_loss": 0.1519775390625, "train/step_duration_seconds": 7.169410705566406, "train/steps_per_hour": 329.948764311339, "train/total_elapsed_hours": 0.40915443427032894 }, { "epoch": 0.32237037037037036, "grad_norm": 37.25, "learning_rate": 1.7216451688204623e-05, "loss": 1.3474, "step": 136 }, { "epoch": 0.32237037037037036, "step": 136, "train/combined_loss": 0.08458211086690426, "train/cross_entropy_loss": 0.017736003384925425, "train/kl_divergence_loss": 0.15142822265625, "train/step_duration_seconds": 7.171813011169434, "train/steps_per_hour": 330.78225259442917, "train/total_elapsed_hours": 0.4111466045512093 }, { "epoch": 0.3247407407407407, "grad_norm": 17.75, "learning_rate": 1.7158668492597186e-05, "loss": 1.3533, "step": 137 }, { "epoch": 0.3247407407407407, "step": 137, "train/combined_loss": 0.08640648704022169, "train/cross_entropy_loss": 0.022422353271394968, "train/kl_divergence_loss": 0.150390625, "train/step_duration_seconds": 7.173238754272461, "train/steps_per_hour": 331.60738477276607, "train/total_elapsed_hours": 0.4131391708718406 }, { "epoch": 0.32711111111111113, "grad_norm": 24.25, "learning_rate": 1.7100390820215805e-05, "loss": 1.3825, "step": 138 }, { "epoch": 0.32711111111111113, "step": 138, "train/combined_loss": 0.08674443326890469, "train/cross_entropy_loss": 0.025173434522002935, "train/kl_divergence_loss": 0.1483154296875, "train/step_duration_seconds": 7.182944059371948, "train/steps_per_hour": 332.422437151218, "train/total_elapsed_hours": 0.41513443311055503 }, { "epoch": 0.3294814814814815, "grad_norm": 30.125, "learning_rate": 1.704162269652352e-05, "loss": 1.3879, "step": 139 }, { "epoch": 0.3294814814814815, "step": 139, "train/combined_loss": 0.08193621598184109, "train/cross_entropy_loss": 0.02483434451278299, "train/kl_divergence_loss": 0.1390380859375, "train/step_duration_seconds": 7.187625885009766, "train/steps_per_hour": 333.2286533026057, "train/total_elapsed_hours": 0.4171309958563911 }, { "epoch": 0.33185185185185184, "grad_norm": 26.25, "learning_rate": 1.698236818086073e-05, "loss": 1.311, "step": 140 }, { "epoch": 0.33185185185185184, "step": 140, "train/combined_loss": 0.08312624553218484, "train/cross_entropy_loss": 0.021843309281393886, "train/kl_divergence_loss": 0.1444091796875, "train/step_duration_seconds": 7.165472507476807, "train/steps_per_hour": 334.0320927727315, "train/total_elapsed_hours": 0.4191214048862457 }, { "epoch": 0.3342222222222222, "grad_norm": 10.0, "learning_rate": 1.6922631366164795e-05, "loss": 1.33, "step": 141 }, { "epoch": 0.3342222222222222, "step": 141, "train/combined_loss": 0.08382831746712327, "train/cross_entropy_loss": 0.016838762094266713, "train/kl_divergence_loss": 0.15081787109375, "train/step_duration_seconds": 7.17199444770813, "train/steps_per_hour": 334.8264967927197, "train/total_elapsed_hours": 0.42111362556616466 }, { "epoch": 0.3365925925925926, "grad_norm": 25.0, "learning_rate": 1.686241637868734e-05, "loss": 1.3413, "step": 142 }, { "epoch": 0.3365925925925926, "step": 142, "train/combined_loss": 0.08172068372368813, "train/cross_entropy_loss": 0.015309049864299595, "train/kl_divergence_loss": 0.14813232421875, "train/step_duration_seconds": 7.141969680786133, "train/steps_per_hour": 335.62003551235273, "train/total_elapsed_hours": 0.4230975060330497 }, { "epoch": 0.33896296296296297, "grad_norm": 30.125, "learning_rate": 1.6801727377709195e-05, "loss": 1.3075, "step": 143 }, { "epoch": 0.33896296296296297, "step": 143, "train/combined_loss": 0.08254175027832389, "train/cross_entropy_loss": 0.016035647364333272, "train/kl_divergence_loss": 0.1490478515625, "train/step_duration_seconds": 7.139028072357178, "train/steps_per_hour": 336.40681390445945, "train/total_elapsed_hours": 0.42508056938648225 }, { "epoch": 0.3413333333333333, "grad_norm": 26.625, "learning_rate": 1.6740568555253153e-05, "loss": 1.3207, "step": 144 }, { "epoch": 0.3413333333333333, "step": 144, "train/combined_loss": 0.0809242157265544, "train/cross_entropy_loss": 0.019209268386475742, "train/kl_divergence_loss": 0.14263916015625, "train/step_duration_seconds": 7.164944648742676, "train/steps_per_hour": 337.180601627168, "train/total_elapsed_hours": 0.42707083178891075 }, { "epoch": 0.3437037037037037, "grad_norm": 10.6875, "learning_rate": 1.6678944135794375e-05, "loss": 1.2948, "step": 145 }, { "epoch": 0.3437037037037037, "step": 145, "train/combined_loss": 0.08305090665817261, "train/cross_entropy_loss": 0.027735116658732295, "train/kl_divergence_loss": 0.13836669921875, "train/step_duration_seconds": 7.151780843734741, "train/steps_per_hour": 337.95009082754103, "train/total_elapsed_hours": 0.4290574375788371 }, { "epoch": 0.3460740740740741, "grad_norm": 24.875, "learning_rate": 1.6616858375968596e-05, "loss": 1.3288, "step": 146 }, { "epoch": 0.3460740740740741, "step": 146, "train/combined_loss": 0.08167848456650972, "train/cross_entropy_loss": 0.028225134126842022, "train/kl_divergence_loss": 0.1351318359375, "train/step_duration_seconds": 7.142008066177368, "train/steps_per_hour": 338.7146203300441, "train/total_elapsed_hours": 0.4310413287083308 }, { "epoch": 0.34844444444444445, "grad_norm": 29.625, "learning_rate": 1.6554315564278102e-05, "loss": 1.3069, "step": 147 }, { "epoch": 0.34844444444444445, "step": 147, "train/combined_loss": 0.08066110266372561, "train/cross_entropy_loss": 0.02490862738341093, "train/kl_divergence_loss": 0.13641357421875, "train/step_duration_seconds": 7.151546478271484, "train/steps_per_hour": 339.47006737646905, "train/total_elapsed_hours": 0.43302786939673954 }, { "epoch": 0.3508148148148148, "grad_norm": 23.625, "learning_rate": 1.649132002079552e-05, "loss": 1.2906, "step": 148 }, { "epoch": 0.3508148148148148, "step": 148, "train/combined_loss": 0.08027565246447921, "train/cross_entropy_loss": 0.01925491786096245, "train/kl_divergence_loss": 0.14129638671875, "train/step_duration_seconds": 7.16980504989624, "train/steps_per_hour": 340.21464820376787, "train/total_elapsed_hours": 0.4350194819105996 }, { "epoch": 0.35318518518518516, "grad_norm": 13.9375, "learning_rate": 1.6427876096865394e-05, "loss": 1.2844, "step": 149 }, { "epoch": 0.35318518518518516, "step": 149, "train/combined_loss": 0.0810198406688869, "train/cross_entropy_loss": 0.01714221539441496, "train/kl_divergence_loss": 0.1448974609375, "train/step_duration_seconds": 7.167724132537842, "train/steps_per_hour": 340.95289337704213, "train/total_elapsed_hours": 0.4370105163918601 }, { "epoch": 0.35555555555555557, "grad_norm": 16.5, "learning_rate": 1.6363988174803638e-05, "loss": 1.2963, "step": 150 }, { "epoch": 0.35555555555555557, "eval_combined_loss": 0.08181474480902155, "eval_cross_entropy_loss": 0.01787818753470977, "eval_kl_divergence_loss": 0.14575130208333334, "eval_loss": 0.08181475102901459, "eval_runtime": 218.6737, "eval_samples_per_second": 6.86, "eval_steps_per_second": 3.43, "step": 150 }, { "epoch": 0.35555555555555557, "step": 150, "train/combined_loss": 0.08019543159753084, "train/cross_entropy_loss": 0.017629636102356017, "train/kl_divergence_loss": 0.14276123046875, "train/step_duration_seconds": 225.85686349868774, "train/steps_per_hour": 300.1509555021127, "train/total_elapsed_hours": 0.4997485340303845 }, { "epoch": 0.3579259259259259, "grad_norm": 23.125, "learning_rate": 1.6299660667594814e-05, "loss": 1.2831, "step": 151 }, { "epoch": 0.3579259259259259, "step": 151, "train/combined_loss": 0.08127154828980565, "train/cross_entropy_loss": 0.01935462059918791, "train/kl_divergence_loss": 0.1431884765625, "train/step_duration_seconds": 7.190611124038696, "train/steps_per_hour": 300.9491301723214, "train/total_elapsed_hours": 0.5017459260092841 }, { "epoch": 0.3602962962962963, "grad_norm": 20.375, "learning_rate": 1.6234898018587336e-05, "loss": 1.3003, "step": 152 }, { "epoch": 0.3602962962962963, "step": 152, "train/combined_loss": 0.08766383724287152, "train/cross_entropy_loss": 0.020847689942456782, "train/kl_divergence_loss": 0.15447998046875, "train/step_duration_seconds": 7.20133113861084, "train/steps_per_hour": 301.7391914865756, "train/total_elapsed_hours": 0.5037462957700094 }, { "epoch": 0.3626666666666667, "grad_norm": 12.0, "learning_rate": 1.6169704701186528e-05, "loss": 1.4026, "step": 153 }, { "epoch": 0.3626666666666667, "step": 153, "train/combined_loss": 0.07755696773529053, "train/cross_entropy_loss": 0.021324871107935905, "train/kl_divergence_loss": 0.1337890625, "train/step_duration_seconds": 7.173044919967651, "train/steps_per_hour": 302.52770304516605, "train/total_elapsed_hours": 0.5057388082477782 }, { "epoch": 0.36503703703703705, "grad_norm": 17.125, "learning_rate": 1.6104085218545633e-05, "loss": 1.2409, "step": 154 }, { "epoch": 0.36503703703703705, "step": 154, "train/combined_loss": 0.07878367276862264, "train/cross_entropy_loss": 0.022069291560910642, "train/kl_divergence_loss": 0.135498046875, "train/step_duration_seconds": 7.17523193359375, "train/steps_per_hour": 303.30966291023043, "train/total_elapsed_hours": 0.507731928229332 }, { "epoch": 0.3674074074074074, "grad_norm": 16.875, "learning_rate": 1.6038044103254775e-05, "loss": 1.2605, "step": 155 }, { "epoch": 0.3674074074074074, "step": 155, "train/combined_loss": 0.07806963194161654, "train/cross_entropy_loss": 0.021861913381144404, "train/kl_divergence_loss": 0.13427734375, "train/step_duration_seconds": 7.181210279464722, "train/steps_per_hour": 304.08451686962036, "train/total_elapsed_hours": 0.5097267088625166 }, { "epoch": 0.36977777777777776, "grad_norm": 11.75, "learning_rate": 1.5971585917027864e-05, "loss": 1.2491, "step": 156 }, { "epoch": 0.36977777777777776, "step": 156, "train/combined_loss": 0.07714226096868515, "train/cross_entropy_loss": 0.020495465025305748, "train/kl_divergence_loss": 0.1337890625, "train/step_duration_seconds": 7.1563475131988525, "train/steps_per_hour": 304.8574442281318, "train/total_elapsed_hours": 0.5117145831717386 }, { "epoch": 0.3721481481481482, "grad_norm": 13.125, "learning_rate": 1.5904715250387498e-05, "loss": 1.2343, "step": 157 }, { "epoch": 0.3721481481481482, "step": 157, "train/combined_loss": 0.07875645952299237, "train/cross_entropy_loss": 0.01914622518233955, "train/kl_divergence_loss": 0.13836669921875, "train/step_duration_seconds": 7.15238881111145, "train/steps_per_hour": 305.62504381863846, "train/total_elapsed_hours": 0.5137013578414917 }, { "epoch": 0.37451851851851853, "grad_norm": 7.9375, "learning_rate": 1.5837436722347902e-05, "loss": 1.2601, "step": 158 }, { "epoch": 0.37451851851851853, "step": 158, "train/combined_loss": 0.07961196266114712, "train/cross_entropy_loss": 0.019941705162636936, "train/kl_divergence_loss": 0.1392822265625, "train/step_duration_seconds": 7.165693759918213, "train/steps_per_hour": 306.3845330098647, "train/total_elapsed_hours": 0.5156918283303579 }, { "epoch": 0.3768888888888889, "grad_norm": 8.625, "learning_rate": 1.576975498009583e-05, "loss": 1.2738, "step": 159 }, { "epoch": 0.3768888888888889, "step": 159, "train/combined_loss": 0.08134815841913223, "train/cross_entropy_loss": 0.021583035704679787, "train/kl_divergence_loss": 0.14111328125, "train/step_duration_seconds": 7.175953149795532, "train/steps_per_hour": 307.1364910017458, "train/total_elapsed_hours": 0.5176851486497455 }, { "epoch": 0.37925925925925924, "grad_norm": 6.90625, "learning_rate": 1.570167469866962e-05, "loss": 1.3016, "step": 160 }, { "epoch": 0.37925925925925924, "step": 160, "train/combined_loss": 0.08167480118572712, "train/cross_entropy_loss": 0.020954578067176044, "train/kl_divergence_loss": 0.14239501953125, "train/step_duration_seconds": 7.178318023681641, "train/steps_per_hour": 307.88229126869635, "train/total_elapsed_hours": 0.5196791258785459 }, { "epoch": 0.38162962962962965, "grad_norm": 6.625, "learning_rate": 1.563320058063622e-05, "loss": 1.3068, "step": 161 }, { "epoch": 0.38162962962962965, "step": 161, "train/combined_loss": 0.08032544003799558, "train/cross_entropy_loss": 0.021124517312273383, "train/kl_divergence_loss": 0.1395263671875, "train/step_duration_seconds": 7.179118394851685, "train/steps_per_hour": 308.62225870273886, "train/total_elapsed_hours": 0.5216733254326714 }, { "epoch": 0.384, "grad_norm": 7.875, "learning_rate": 1.5564337355766412e-05, "loss": 1.2852, "step": 162 }, { "epoch": 0.384, "step": 162, "train/combined_loss": 0.07905747788026929, "train/cross_entropy_loss": 0.02096895850263536, "train/kl_divergence_loss": 0.13714599609375, "train/step_duration_seconds": 7.179133653640747, "train/steps_per_hour": 309.3565878328245, "train/total_elapsed_hours": 0.5236675292253494 }, { "epoch": 0.38637037037037036, "grad_norm": 4.15625, "learning_rate": 1.5495089780708062e-05, "loss": 1.2649, "step": 163 }, { "epoch": 0.38637037037037036, "step": 163, "train/combined_loss": 0.07707322854548693, "train/cross_entropy_loss": 0.019991177483461797, "train/kl_divergence_loss": 0.1341552734375, "train/step_duration_seconds": 7.165940761566162, "train/steps_per_hour": 310.0875071111404, "train/total_elapsed_hours": 0.5256580683257844 }, { "epoch": 0.3887407407407407, "grad_norm": 9.125, "learning_rate": 1.5425462638657597e-05, "loss": 1.2332, "step": 164 }, { "epoch": 0.3887407407407407, "step": 164, "train/combined_loss": 0.07535458076745272, "train/cross_entropy_loss": 0.019971861504018307, "train/kl_divergence_loss": 0.1307373046875, "train/step_duration_seconds": 7.187286376953125, "train/steps_per_hour": 310.80941899189014, "train/total_elapsed_hours": 0.527654536763827 }, { "epoch": 0.39111111111111113, "grad_norm": 9.0625, "learning_rate": 1.5355460739029585e-05, "loss": 1.2057, "step": 165 }, { "epoch": 0.39111111111111113, "step": 165, "train/combined_loss": 0.07617681892588735, "train/cross_entropy_loss": 0.021128055173903704, "train/kl_divergence_loss": 0.1312255859375, "train/step_duration_seconds": 7.1567230224609375, "train/steps_per_hour": 311.53088207288556, "train/total_elapsed_hours": 0.5296425153811772 }, { "epoch": 0.3934814814814815, "grad_norm": 8.8125, "learning_rate": 1.5285088917124555e-05, "loss": 1.2188, "step": 166 }, { "epoch": 0.3934814814814815, "step": 166, "train/combined_loss": 0.08005631249397993, "train/cross_entropy_loss": 0.021440746961161494, "train/kl_divergence_loss": 0.138671875, "train/step_duration_seconds": 7.164660930633545, "train/steps_per_hour": 312.24565441642335, "train/total_elapsed_hours": 0.5316326989730199 }, { "epoch": 0.39585185185185184, "grad_norm": 5.625, "learning_rate": 1.5214352033794981e-05, "loss": 1.2809, "step": 167 }, { "epoch": 0.39585185185185184, "step": 167, "train/combined_loss": 0.07657396793365479, "train/cross_entropy_loss": 0.020762681495398283, "train/kl_divergence_loss": 0.13238525390625, "train/step_duration_seconds": 7.171409845352173, "train/steps_per_hour": 312.9539957186981, "train/total_elapsed_hours": 0.5336247572633955 }, { "epoch": 0.3982222222222222, "grad_norm": 11.25, "learning_rate": 1.5143254975109538e-05, "loss": 1.2252, "step": 168 }, { "epoch": 0.3982222222222222, "step": 168, "train/combined_loss": 0.07494777115061879, "train/cross_entropy_loss": 0.02077566913794726, "train/kl_divergence_loss": 0.129119873046875, "train/step_duration_seconds": 7.178438186645508, "train/steps_per_hour": 313.65592484340516, "train/total_elapsed_hours": 0.5356187678707971 }, { "epoch": 0.4005925925925926, "grad_norm": 11.0625, "learning_rate": 1.5071802652015592e-05, "loss": 1.1992, "step": 169 }, { "epoch": 0.4005925925925926, "step": 169, "train/combined_loss": 0.07396322628483176, "train/cross_entropy_loss": 0.02079021732788533, "train/kl_divergence_loss": 0.12713623046875, "train/step_duration_seconds": 7.178174734115601, "train/steps_per_hour": 314.3526898358104, "train/total_elapsed_hours": 0.5376127052969403 }, { "epoch": 0.40296296296296297, "grad_norm": 3.921875, "learning_rate": 1.5000000000000002e-05, "loss": 1.1834, "step": 170 }, { "epoch": 0.40296296296296297, "step": 170, "train/combined_loss": 0.07202337961643934, "train/cross_entropy_loss": 0.01926037878729403, "train/kl_divergence_loss": 0.124786376953125, "train/step_duration_seconds": 7.181441783905029, "train/steps_per_hour": 315.043775657627, "train/total_elapsed_hours": 0.5396075502369139 }, { "epoch": 0.4053333333333333, "grad_norm": 7.46875, "learning_rate": 1.4927851978748177e-05, "loss": 1.1524, "step": 171 }, { "epoch": 0.4053333333333333, "step": 171, "train/combined_loss": 0.07204042701050639, "train/cross_entropy_loss": 0.020179486949928105, "train/kl_divergence_loss": 0.1239013671875, "train/step_duration_seconds": 7.173416376113892, "train/steps_per_hour": 315.7310702015876, "train/total_elapsed_hours": 0.5416001658969455 }, { "epoch": 0.4077037037037037, "grad_norm": 5.59375, "learning_rate": 1.4855363571801523e-05, "loss": 1.1526, "step": 172 }, { "epoch": 0.4077037037037037, "step": 172, "train/combined_loss": 0.072266333270818, "train/cross_entropy_loss": 0.02215718082152307, "train/kl_divergence_loss": 0.12237548828125, "train/step_duration_seconds": 7.157419681549072, "train/steps_per_hour": 316.4159124956855, "train/total_elapsed_hours": 0.5435883380307092 }, { "epoch": 0.4100740740740741, "grad_norm": 7.0625, "learning_rate": 1.4782539786213184e-05, "loss": 1.1563, "step": 173 }, { "epoch": 0.4100740740740741, "step": 173, "train/combined_loss": 0.07013190537691116, "train/cross_entropy_loss": 0.020634910091757774, "train/kl_divergence_loss": 0.11962890625, "train/step_duration_seconds": 7.1777660846710205, "train/steps_per_hour": 317.09247858077316, "train/total_elapsed_hours": 0.5455821619431178 }, { "epoch": 0.41244444444444445, "grad_norm": 4.3125, "learning_rate": 1.4709385652202204e-05, "loss": 1.1221, "step": 174 }, { "epoch": 0.41244444444444445, "step": 174, "train/combined_loss": 0.06954938173294067, "train/cross_entropy_loss": 0.018493298441171646, "train/kl_divergence_loss": 0.12060546875, "train/step_duration_seconds": 7.157723426818848, "train/steps_per_hour": 317.767348521783, "train/total_elapsed_hours": 0.5475704184505674 }, { "epoch": 0.4148148148148148, "grad_norm": 7.09375, "learning_rate": 1.4635906222806058e-05, "loss": 1.1128, "step": 175 }, { "epoch": 0.4148148148148148, "step": 175, "train/combined_loss": 0.07863931078463793, "train/cross_entropy_loss": 0.02086504956241697, "train/kl_divergence_loss": 0.13641357421875, "train/step_duration_seconds": 7.178933382034302, "train/steps_per_hour": 318.4339213839359, "train/total_elapsed_hours": 0.5495645666122436 }, { "epoch": 0.41718518518518516, "grad_norm": 5.84375, "learning_rate": 1.4562106573531632e-05, "loss": 1.2582, "step": 176 }, { "epoch": 0.41718518518518516, "step": 176, "train/combined_loss": 0.0717529603280127, "train/cross_entropy_loss": 0.022137517924420536, "train/kl_divergence_loss": 0.121368408203125, "train/step_duration_seconds": 7.176782131195068, "train/steps_per_hour": 319.09602000259764, "train/total_elapsed_hours": 0.5515581172042423 }, { "epoch": 0.41955555555555557, "grad_norm": 9.5625, "learning_rate": 1.4487991802004625e-05, "loss": 1.148, "step": 177 }, { "epoch": 0.41955555555555557, "step": 177, "train/combined_loss": 0.0723364045843482, "train/cross_entropy_loss": 0.021931110764853656, "train/kl_divergence_loss": 0.12274169921875, "train/step_duration_seconds": 7.177384376525879, "train/steps_per_hour": 319.75325304866163, "train/total_elapsed_hours": 0.5535518350866105 }, { "epoch": 0.4219259259259259, "grad_norm": 4.0, "learning_rate": 1.4413567027617442e-05, "loss": 1.1574, "step": 178 }, { "epoch": 0.4219259259259259, "step": 178, "train/combined_loss": 0.0698565854690969, "train/cross_entropy_loss": 0.017886995803564787, "train/kl_divergence_loss": 0.121826171875, "train/step_duration_seconds": 7.158464431762695, "train/steps_per_hour": 320.4087999085678, "train/total_elapsed_hours": 0.5555402974287669 }, { "epoch": 0.4242962962962963, "grad_norm": 14.1875, "learning_rate": 1.4338837391175582e-05, "loss": 1.1177, "step": 179 }, { "epoch": 0.4242962962962963, "step": 179, "train/combined_loss": 0.06887802015990019, "train/cross_entropy_loss": 0.017760923714376986, "train/kl_divergence_loss": 0.1199951171875, "train/step_duration_seconds": 7.177714586257935, "train/steps_per_hour": 321.05659141033044, "train/total_elapsed_hours": 0.5575341070360608 }, { "epoch": 0.4266666666666667, "grad_norm": 11.625, "learning_rate": 1.4263808054542541e-05, "loss": 1.102, "step": 180 }, { "epoch": 0.4266666666666667, "step": 180, "train/combined_loss": 0.07172887865453959, "train/cross_entropy_loss": 0.02205883653368801, "train/kl_divergence_loss": 0.12139892578125, "train/step_duration_seconds": 7.169630289077759, "train/steps_per_hour": 321.7010573913621, "train/total_elapsed_hours": 0.559525671005249 }, { "epoch": 0.42903703703703705, "grad_norm": 9.0, "learning_rate": 1.418848420028325e-05, "loss": 1.1477, "step": 181 }, { "epoch": 0.42903703703703705, "step": 181, "train/combined_loss": 0.07292318437248468, "train/cross_entropy_loss": 0.024111752747558057, "train/kl_divergence_loss": 0.121734619140625, "train/step_duration_seconds": 7.1448750495910645, "train/steps_per_hour": 322.34489934858004, "train/total_elapsed_hours": 0.5615103585190243 }, { "epoch": 0.4314074074074074, "grad_norm": 10.0625, "learning_rate": 1.4112871031306118e-05, "loss": 1.1668, "step": 182 }, { "epoch": 0.4314074074074074, "step": 182, "train/combined_loss": 0.07089130999520421, "train/cross_entropy_loss": 0.022153714206069708, "train/kl_divergence_loss": 0.11962890625, "train/step_duration_seconds": 7.131852149963379, "train/steps_per_hour": 322.9862794273204, "train/total_elapsed_hours": 0.5634914285606808 }, { "epoch": 0.43377777777777776, "grad_norm": 4.28125, "learning_rate": 1.4036973770503623e-05, "loss": 1.1343, "step": 183 }, { "epoch": 0.43377777777777776, "step": 183, "train/combined_loss": 0.06743072532117367, "train/cross_entropy_loss": 0.01742980582639575, "train/kl_divergence_loss": 0.117431640625, "train/step_duration_seconds": 7.159062623977661, "train/steps_per_hour": 323.61883980323483, "train/total_elapsed_hours": 0.5654800570673413 }, { "epoch": 0.4361481481481482, "grad_norm": 11.0625, "learning_rate": 1.396079766039157e-05, "loss": 1.0789, "step": 184 }, { "epoch": 0.4361481481481482, "step": 184, "train/combined_loss": 0.06741259898990393, "train/cross_entropy_loss": 0.018187020672485232, "train/kl_divergence_loss": 0.11663818359375, "train/step_duration_seconds": 7.166898250579834, "train/steps_per_hour": 324.24572304400647, "train/total_elapsed_hours": 0.5674708621369468 }, { "epoch": 0.43851851851851853, "grad_norm": 9.25, "learning_rate": 1.3884347962746949e-05, "loss": 1.0786, "step": 185 }, { "epoch": 0.43851851851851853, "step": 185, "train/combined_loss": 0.06627723574638367, "train/cross_entropy_loss": 0.022691186517477036, "train/kl_divergence_loss": 0.10986328125, "train/step_duration_seconds": 7.1953465938568115, "train/steps_per_hour": 324.863715114037, "train/total_elapsed_hours": 0.5694695695241292 }, { "epoch": 0.4408888888888889, "grad_norm": 12.25, "learning_rate": 1.3807629958244498e-05, "loss": 1.0604, "step": 186 }, { "epoch": 0.4408888888888889, "step": 186, "train/combined_loss": 0.06682039611041546, "train/cross_entropy_loss": 0.02396061283070594, "train/kl_divergence_loss": 0.10968017578125, "train/step_duration_seconds": 7.1774444580078125, "train/steps_per_hour": 325.4802166069086, "train/total_elapsed_hours": 0.571463304095798 }, { "epoch": 0.44325925925925924, "grad_norm": 12.75, "learning_rate": 1.373064894609194e-05, "loss": 1.0691, "step": 187 }, { "epoch": 0.44325925925925924, "step": 187, "train/combined_loss": 0.07040743064135313, "train/cross_entropy_loss": 0.02216251229401678, "train/kl_divergence_loss": 0.11865234375, "train/step_duration_seconds": 7.175534009933472, "train/steps_per_hour": 326.09273309370775, "train/total_elapsed_hours": 0.5734565079874463 }, { "epoch": 0.44562962962962965, "grad_norm": 7.28125, "learning_rate": 1.3653410243663953e-05, "loss": 1.1265, "step": 188 }, { "epoch": 0.44562962962962965, "step": 188, "train/combined_loss": 0.07080179871991277, "train/cross_entropy_loss": 0.018343106610700488, "train/kl_divergence_loss": 0.123260498046875, "train/step_duration_seconds": 7.184988975524902, "train/steps_per_hour": 326.69951532210575, "train/total_elapsed_hours": 0.5754523382584253 }, { "epoch": 0.448, "grad_norm": 12.5625, "learning_rate": 1.3575919186134862e-05, "loss": 1.1328, "step": 189 }, { "epoch": 0.448, "step": 189, "train/combined_loss": 0.06690460816025734, "train/cross_entropy_loss": 0.017476210254244506, "train/kl_divergence_loss": 0.1163330078125, "train/step_duration_seconds": 7.175193548202515, "train/steps_per_hour": 327.30364537972457, "train/total_elapsed_hours": 0.5774454475773705 }, { "epoch": 0.45037037037037037, "grad_norm": 13.875, "learning_rate": 1.349818112611015e-05, "loss": 1.0705, "step": 190 }, { "epoch": 0.45037037037037037, "step": 190, "train/combined_loss": 0.06556698912754655, "train/cross_entropy_loss": 0.018676697509363294, "train/kl_divergence_loss": 0.112457275390625, "train/step_duration_seconds": 7.1955156326293945, "train/steps_per_hour": 327.9004248735381, "train/total_elapsed_hours": 0.5794442019197676 }, { "epoch": 0.4527407407407407, "grad_norm": 8.6875, "learning_rate": 1.342020143325669e-05, "loss": 1.0491, "step": 191 }, { "epoch": 0.4527407407407407, "step": 191, "train/combined_loss": 0.06877634488046169, "train/cross_entropy_loss": 0.024057817296124995, "train/kl_divergence_loss": 0.113494873046875, "train/step_duration_seconds": 7.176729202270508, "train/steps_per_hour": 328.49604967101124, "train/total_elapsed_hours": 0.5814377378092872 }, { "epoch": 0.45511111111111113, "grad_norm": 19.0, "learning_rate": 1.3341985493931877e-05, "loss": 1.1004, "step": 192 }, { "epoch": 0.45511111111111113, "step": 192, "train/combined_loss": 0.06570423394441605, "train/cross_entropy_loss": 0.024963149800896645, "train/kl_divergence_loss": 0.1064453125, "train/step_duration_seconds": 7.157531499862671, "train/steps_per_hour": 329.0906120315698, "train/total_elapsed_hours": 0.5834259410036935 }, { "epoch": 0.4574814814814815, "grad_norm": 23.625, "learning_rate": 1.3263538710811559e-05, "loss": 1.0513, "step": 193 }, { "epoch": 0.4574814814814815, "step": 193, "train/combined_loss": 0.0659687272273004, "train/cross_entropy_loss": 0.024729204480536282, "train/kl_divergence_loss": 0.107208251953125, "train/step_duration_seconds": 7.212608098983765, "train/steps_per_hour": 329.6725202740739, "train/total_elapsed_hours": 0.5854294432534112 }, { "epoch": 0.45985185185185184, "grad_norm": 18.625, "learning_rate": 1.3184866502516846e-05, "loss": 1.0555, "step": 194 }, { "epoch": 0.45985185185185184, "step": 194, "train/combined_loss": 0.07015136396512389, "train/cross_entropy_loss": 0.021467273705638945, "train/kl_divergence_loss": 0.11883544921875, "train/step_duration_seconds": 7.18483304977417, "train/steps_per_hour": 330.25479673403044, "train/total_elapsed_hours": 0.5874252302116818 }, { "epoch": 0.4622222222222222, "grad_norm": 10.75, "learning_rate": 1.3105974303239838e-05, "loss": 1.1224, "step": 195 }, { "epoch": 0.4622222222222222, "step": 195, "train/combined_loss": 0.0675101918168366, "train/cross_entropy_loss": 0.01786339597310871, "train/kl_divergence_loss": 0.117156982421875, "train/step_duration_seconds": 7.176580190658569, "train/steps_per_hour": 330.83441673191504, "train/total_elapsed_hours": 0.589418724709087 }, { "epoch": 0.4645925925925926, "grad_norm": 11.875, "learning_rate": 1.3026867562368262e-05, "loss": 1.0802, "step": 196 }, { "epoch": 0.4645925925925926, "step": 196, "train/combined_loss": 0.06660758936777711, "train/cross_entropy_loss": 0.017553551122546196, "train/kl_divergence_loss": 0.11566162109375, "train/step_duration_seconds": 7.176948308944702, "train/steps_per_hour": 331.4100719369871, "train/total_elapsed_hours": 0.5914123214615716 }, { "epoch": 0.46696296296296297, "grad_norm": 10.8125, "learning_rate": 1.2947551744109044e-05, "loss": 1.0657, "step": 197 }, { "epoch": 0.46696296296296297, "step": 197, "train/combined_loss": 0.07065472798421979, "train/cross_entropy_loss": 0.01936120947357267, "train/kl_divergence_loss": 0.1219482421875, "train/step_duration_seconds": 7.178069353103638, "train/steps_per_hour": 331.98168500479466, "train/total_elapsed_hours": 0.5934062296152115 }, { "epoch": 0.4693333333333333, "grad_norm": 8.75, "learning_rate": 1.2868032327110904e-05, "loss": 1.1305, "step": 198 }, { "epoch": 0.4693333333333333, "step": 198, "train/combined_loss": 0.06524005252867937, "train/cross_entropy_loss": 0.02314978139474988, "train/kl_divergence_loss": 0.107330322265625, "train/step_duration_seconds": 7.202880859375, "train/steps_per_hour": 332.5456201761273, "train/total_elapsed_hours": 0.5954070298539268 }, { "epoch": 0.4717037037037037, "grad_norm": 8.375, "learning_rate": 1.2788314804085904e-05, "loss": 1.0438, "step": 199 }, { "epoch": 0.4717037037037037, "step": 199, "train/combined_loss": 0.06441081315279007, "train/cross_entropy_loss": 0.02335287816822529, "train/kl_divergence_loss": 0.10546875, "train/step_duration_seconds": 7.1744384765625, "train/steps_per_hour": 333.1101833065549, "train/total_elapsed_hours": 0.5973999294307497 }, { "epoch": 0.4740740740740741, "grad_norm": 9.3125, "learning_rate": 1.2708404681430054e-05, "loss": 1.0306, "step": 200 }, { "epoch": 0.4740740740740741, "eval_combined_loss": 0.06667867637053132, "eval_cross_entropy_loss": 0.021343029824395975, "eval_kl_divergence_loss": 0.11201432291666667, "eval_loss": 0.06667868047952652, "eval_runtime": 220.1815, "eval_samples_per_second": 6.813, "eval_steps_per_second": 3.406, "step": 200 }, { "epoch": 0.4740740740740741, "step": 200, "train/combined_loss": 0.06510467641055584, "train/cross_entropy_loss": 0.021170052816160023, "train/kl_divergence_loss": 0.109039306640625, "train/step_duration_seconds": 227.3877534866333, "train/steps_per_hour": 302.77194026156553, "train/total_elapsed_hours": 0.6605631942881478 }, { "epoch": 0.47644444444444445, "grad_norm": 3.859375, "learning_rate": 1.2628307478842955e-05, "loss": 1.0417, "step": 201 }, { "epoch": 0.47644444444444445, "step": 201, "train/combined_loss": 0.06429090350866318, "train/cross_entropy_loss": 0.016765402629971504, "train/kl_divergence_loss": 0.11181640625, "train/step_duration_seconds": 7.160262107849121, "train/steps_per_hour": 303.37234311953483, "train/total_elapsed_hours": 0.6625521559847726 }, { "epoch": 0.4788148148148148, "grad_norm": 14.9375, "learning_rate": 1.2548028728946548e-05, "loss": 1.0287, "step": 202 }, { "epoch": 0.4788148148148148, "step": 202, "train/combined_loss": 0.06479989876970649, "train/cross_entropy_loss": 0.01619648071937263, "train/kl_divergence_loss": 0.1134033203125, "train/step_duration_seconds": 7.1755499839782715, "train/steps_per_hour": 303.96720953523806, "train/total_elapsed_hours": 0.6645453643136554 }, { "epoch": 0.48118518518518516, "grad_norm": 16.625, "learning_rate": 1.2467573976902936e-05, "loss": 1.0368, "step": 203 }, { "epoch": 0.48118518518518516, "step": 203, "train/combined_loss": 0.066513289231807, "train/cross_entropy_loss": 0.01745650765951723, "train/kl_divergence_loss": 0.115570068359375, "train/step_duration_seconds": 7.177863121032715, "train/steps_per_hour": 304.55822459747407, "train/total_elapsed_hours": 0.666539215180609 }, { "epoch": 0.48355555555555557, "grad_norm": 10.75, "learning_rate": 1.238694878003138e-05, "loss": 1.0642, "step": 204 }, { "epoch": 0.48355555555555557, "step": 204, "train/combined_loss": 0.06533924676477909, "train/cross_entropy_loss": 0.0218833324033767, "train/kl_divergence_loss": 0.108795166015625, "train/step_duration_seconds": 7.178657293319702, "train/steps_per_hour": 305.14561364915744, "train/total_elapsed_hours": 0.6685332866509756 }, { "epoch": 0.48592592592592593, "grad_norm": 12.5625, "learning_rate": 1.2306158707424402e-05, "loss": 1.0454, "step": 205 }, { "epoch": 0.48592592592592593, "step": 205, "train/combined_loss": 0.06534022279083729, "train/cross_entropy_loss": 0.024326685117557645, "train/kl_divergence_loss": 0.106353759765625, "train/step_duration_seconds": 7.156978607177734, "train/steps_per_hour": 305.7322547589479, "train/total_elapsed_hours": 0.6705213362640805 }, { "epoch": 0.4882962962962963, "grad_norm": 16.375, "learning_rate": 1.2225209339563144e-05, "loss": 1.0454, "step": 206 }, { "epoch": 0.4882962962962963, "step": 206, "train/combined_loss": 0.06436126446351409, "train/cross_entropy_loss": 0.024413448525592685, "train/kl_divergence_loss": 0.10430908203125, "train/step_duration_seconds": 7.1745688915252686, "train/steps_per_hour": 306.3132018994396, "train/total_elapsed_hours": 0.672514272067282 }, { "epoch": 0.49066666666666664, "grad_norm": 12.25, "learning_rate": 1.2144106267931877e-05, "loss": 1.0298, "step": 207 }, { "epoch": 0.49066666666666664, "step": 207, "train/combined_loss": 0.06408173590898514, "train/cross_entropy_loss": 0.021962294937111437, "train/kl_divergence_loss": 0.106201171875, "train/step_duration_seconds": 7.178909778594971, "train/steps_per_hour": 306.89016742276357, "train/total_elapsed_hours": 0.6745084136724472 }, { "epoch": 0.49303703703703705, "grad_norm": 11.4375, "learning_rate": 1.2062855094631777e-05, "loss": 1.0253, "step": 208 }, { "epoch": 0.49303703703703705, "step": 208, "train/combined_loss": 0.06565954210236669, "train/cross_entropy_loss": 0.019319579121656716, "train/kl_divergence_loss": 0.11199951171875, "train/step_duration_seconds": 7.163602352142334, "train/steps_per_hour": 307.46566401066696, "train/total_elapsed_hours": 0.6764983032147089 }, { "epoch": 0.4954074074074074, "grad_norm": 9.375, "learning_rate": 1.1981461431993978e-05, "loss": 1.0506, "step": 209 }, { "epoch": 0.4954074074074074, "step": 209, "train/combined_loss": 0.0647607441060245, "train/cross_entropy_loss": 0.01816285285167396, "train/kl_divergence_loss": 0.111358642578125, "train/step_duration_seconds": 7.1943066120147705, "train/steps_per_hour": 308.03391278116044, "train/total_elapsed_hours": 0.6784967217180464 }, { "epoch": 0.49777777777777776, "grad_norm": 9.25, "learning_rate": 1.1899930902191904e-05, "loss": 1.0362, "step": 210 }, { "epoch": 0.49777777777777776, "step": 210, "train/combined_loss": 0.06525316601619124, "train/cross_entropy_loss": 0.01887303462717682, "train/kl_divergence_loss": 0.11163330078125, "train/step_duration_seconds": 7.1762001514434814, "train/steps_per_hour": 308.6011048708952, "train/total_elapsed_hours": 0.6804901106490029 }, { "epoch": 0.5001481481481481, "grad_norm": 10.0, "learning_rate": 1.181826913685291e-05, "loss": 1.0441, "step": 211 }, { "epoch": 0.5001481481481481, "step": 211, "train/combined_loss": 0.06376577913761139, "train/cross_entropy_loss": 0.02038433833513409, "train/kl_divergence_loss": 0.107147216796875, "train/step_duration_seconds": 7.198651552200317, "train/steps_per_hour": 309.1621585576703, "train/total_elapsed_hours": 0.6824897360801697 }, { "epoch": 0.5025185185185185, "grad_norm": 6.84375, "learning_rate": 1.1736481776669307e-05, "loss": 1.0203, "step": 212 }, { "epoch": 0.5025185185185185, "step": 212, "train/combined_loss": 0.0649118721485138, "train/cross_entropy_loss": 0.02285963052418083, "train/kl_divergence_loss": 0.106964111328125, "train/step_duration_seconds": 7.178544998168945, "train/steps_per_hour": 309.72246139738036, "train/total_elapsed_hours": 0.6844837763574388 }, { "epoch": 0.5048888888888889, "grad_norm": 13.125, "learning_rate": 1.1654574471008712e-05, "loss": 1.0386, "step": 213 }, { "epoch": 0.5048888888888889, "step": 213, "train/combined_loss": 0.07054620841518044, "train/cross_entropy_loss": 0.022958871792070568, "train/kl_divergence_loss": 0.118133544921875, "train/step_duration_seconds": 7.145255088806152, "train/steps_per_hour": 310.28368884315285, "train/total_elapsed_hours": 0.6864685694376628 }, { "epoch": 0.5072592592592593, "grad_norm": 15.4375, "learning_rate": 1.1572552877523855e-05, "loss": 1.1287, "step": 214 }, { "epoch": 0.5072592592592593, "step": 214, "train/combined_loss": 0.06366756092756987, "train/cross_entropy_loss": 0.022598788724280894, "train/kl_divergence_loss": 0.104736328125, "train/step_duration_seconds": 7.188857316970825, "train/steps_per_hour": 310.8362118431136, "train/total_elapsed_hours": 0.6884654742479325 }, { "epoch": 0.5096296296296297, "grad_norm": 11.3125, "learning_rate": 1.1490422661761744e-05, "loss": 1.0187, "step": 215 }, { "epoch": 0.5096296296296297, "step": 215, "train/combined_loss": 0.06394149828702211, "train/cross_entropy_loss": 0.021193550201132894, "train/kl_divergence_loss": 0.106689453125, "train/step_duration_seconds": 7.176607847213745, "train/steps_per_hour": 311.3870734397048, "train/total_elapsed_hours": 0.690458976427714 }, { "epoch": 0.512, "grad_norm": 13.9375, "learning_rate": 1.1408189496772369e-05, "loss": 1.0231, "step": 216 }, { "epoch": 0.512, "step": 216, "train/combined_loss": 0.06880563637241721, "train/cross_entropy_loss": 0.020667911507189274, "train/kl_divergence_loss": 0.116943359375, "train/step_duration_seconds": 7.1657140254974365, "train/steps_per_hour": 311.93612646502845, "train/total_elapsed_hours": 0.6924494525459077 }, { "epoch": 0.5143703703703704, "grad_norm": 13.875, "learning_rate": 1.1325859062716795e-05, "loss": 1.1009, "step": 217 }, { "epoch": 0.5143703703703704, "step": 217, "train/combined_loss": 0.06414779741317034, "train/cross_entropy_loss": 0.018890082137659192, "train/kl_divergence_loss": 0.109405517578125, "train/step_duration_seconds": 7.167219400405884, "train/steps_per_hour": 312.4818438221589, "train/total_elapsed_hours": 0.6944403468237983 }, { "epoch": 0.5167407407407407, "grad_norm": 10.3125, "learning_rate": 1.1243437046474854e-05, "loss": 1.0264, "step": 218 }, { "epoch": 0.5167407407407407, "step": 218, "train/combined_loss": 0.0651923450641334, "train/cross_entropy_loss": 0.01881242578383535, "train/kl_divergence_loss": 0.111572265625, "train/step_duration_seconds": 7.16694712638855, "train/steps_per_hour": 313.02447507913155, "train/total_elapsed_hours": 0.6964311654700174 }, { "epoch": 0.5191111111111111, "grad_norm": 11.4375, "learning_rate": 1.1160929141252303e-05, "loss": 1.0431, "step": 219 }, { "epoch": 0.5191111111111111, "step": 219, "train/combined_loss": 0.06706041377037764, "train/cross_entropy_loss": 0.019374735886231065, "train/kl_divergence_loss": 0.11474609375, "train/step_duration_seconds": 7.1320960521698, "train/steps_per_hour": 313.5683592299994, "train/total_elapsed_hours": 0.6984123032622868 }, { "epoch": 0.5214814814814814, "grad_norm": 13.125, "learning_rate": 1.1078341046187588e-05, "loss": 1.073, "step": 220 }, { "epoch": 0.5214814814814814, "step": 220, "train/combined_loss": 0.06729212449863553, "train/cross_entropy_loss": 0.0204179905122146, "train/kl_divergence_loss": 0.114166259765625, "train/step_duration_seconds": 7.177785634994507, "train/steps_per_hour": 314.10347476777997, "train/total_elapsed_hours": 0.7004061326053408 }, { "epoch": 0.5238518518518519, "grad_norm": 8.5625, "learning_rate": 1.0995678465958168e-05, "loss": 1.0767, "step": 221 }, { "epoch": 0.5238518518518519, "step": 221, "train/combined_loss": 0.06687723798677325, "train/cross_entropy_loss": 0.022487382288090885, "train/kl_divergence_loss": 0.11126708984375, "train/step_duration_seconds": 7.177214860916138, "train/steps_per_hour": 314.6356233732219, "train/total_elapsed_hours": 0.7023998034000397 }, { "epoch": 0.5262222222222223, "grad_norm": 12.4375, "learning_rate": 1.0912947110386484e-05, "loss": 1.07, "step": 222 }, { "epoch": 0.5262222222222223, "step": 222, "train/combined_loss": 0.0689978925511241, "train/cross_entropy_loss": 0.02382952021434903, "train/kl_divergence_loss": 0.114166259765625, "train/step_duration_seconds": 7.179453611373901, "train/steps_per_hour": 315.1644814155012, "train/total_elapsed_hours": 0.7043940960698658 }, { "epoch": 0.5285925925925926, "grad_norm": 15.25, "learning_rate": 1.0830152694045553e-05, "loss": 1.104, "step": 223 }, { "epoch": 0.5285925925925926, "step": 223, "train/combined_loss": 0.0666323616169393, "train/cross_entropy_loss": 0.022882646531797945, "train/kl_divergence_loss": 0.110382080078125, "train/step_duration_seconds": 7.199350595474243, "train/steps_per_hour": 315.68788327603403, "train/total_elapsed_hours": 0.7063939156797198 }, { "epoch": 0.530962962962963, "grad_norm": 11.0625, "learning_rate": 1.0747300935864245e-05, "loss": 1.0661, "step": 224 }, { "epoch": 0.530962962962963, "step": 224, "train/combined_loss": 0.06530413264408708, "train/cross_entropy_loss": 0.021294296951964498, "train/kl_divergence_loss": 0.10931396484375, "train/step_duration_seconds": 7.174778938293457, "train/steps_per_hour": 316.211376709367, "train/total_elapsed_hours": 0.7083869098292457 }, { "epoch": 0.5333333333333333, "grad_norm": 8.875, "learning_rate": 1.0664397558732245e-05, "loss": 1.0449, "step": 225 }, { "epoch": 0.5333333333333333, "step": 225, "train/combined_loss": 0.0637249075807631, "train/cross_entropy_loss": 0.020424663205631077, "train/kl_divergence_loss": 0.107025146484375, "train/step_duration_seconds": 7.199081659317017, "train/steps_per_hour": 316.72892290474715, "train/total_elapsed_hours": 0.7103866547346115 }, { "epoch": 0.5357037037037037, "grad_norm": 9.5, "learning_rate": 1.0581448289104759e-05, "loss": 1.0196, "step": 226 }, { "epoch": 0.5357037037037037, "step": 226, "train/combined_loss": 0.06304276920855045, "train/cross_entropy_loss": 0.019396084127947688, "train/kl_divergence_loss": 0.106689453125, "train/step_duration_seconds": 7.180125951766968, "train/steps_per_hour": 317.2459083502985, "train/total_elapsed_hours": 0.7123811341656578 }, { "epoch": 0.538074074074074, "grad_norm": 6.03125, "learning_rate": 1.0498458856606972e-05, "loss": 1.0087, "step": 227 }, { "epoch": 0.538074074074074, "step": 227, "train/combined_loss": 0.06372014014050364, "train/cross_entropy_loss": 0.019560642424039543, "train/kl_divergence_loss": 0.107879638671875, "train/step_duration_seconds": 7.176924228668213, "train/steps_per_hour": 317.76040263033815, "train/total_elapsed_hours": 0.7143747242291768 }, { "epoch": 0.5404444444444444, "grad_norm": 9.5625, "learning_rate": 1.0415434993638269e-05, "loss": 1.0195, "step": 228 }, { "epoch": 0.5404444444444444, "step": 228, "train/combined_loss": 0.06290752394124866, "train/cross_entropy_loss": 0.019766463432461023, "train/kl_divergence_loss": 0.106048583984375, "train/step_duration_seconds": 7.157245635986328, "train/steps_per_hour": 318.27446193106454, "train/total_elapsed_hours": 0.7163628480169508 }, { "epoch": 0.5428148148148149, "grad_norm": 13.3125, "learning_rate": 1.0332382434976267e-05, "loss": 1.0065, "step": 229 }, { "epoch": 0.5428148148148149, "step": 229, "train/combined_loss": 0.06345422472804785, "train/cross_entropy_loss": 0.02067675837315619, "train/kl_divergence_loss": 0.106231689453125, "train/step_duration_seconds": 7.180465459823608, "train/steps_per_hour": 318.782813491763, "train/total_elapsed_hours": 0.7183574217557908 }, { "epoch": 0.5451851851851852, "grad_norm": 7.78125, "learning_rate": 1.0249306917380731e-05, "loss": 1.0153, "step": 230 }, { "epoch": 0.5451851851851852, "step": 230, "train/combined_loss": 0.07120905723422766, "train/cross_entropy_loss": 0.022484031855128706, "train/kl_divergence_loss": 0.11993408203125, "train/step_duration_seconds": 7.1848931312561035, "train/steps_per_hour": 319.287804772737, "train/total_elapsed_hours": 0.7203532254033619 }, { "epoch": 0.5475555555555556, "grad_norm": 10.6875, "learning_rate": 1.0166214179197265e-05, "loss": 1.1393, "step": 231 }, { "epoch": 0.5475555555555556, "step": 231, "train/combined_loss": 0.0662783239968121, "train/cross_entropy_loss": 0.021716808201745152, "train/kl_divergence_loss": 0.11083984375, "train/step_duration_seconds": 7.186007499694824, "train/steps_per_hour": 319.7898684983009, "train/total_elapsed_hours": 0.7223493385977215 }, { "epoch": 0.5499259259259259, "grad_norm": 12.9375, "learning_rate": 1.0083109959960974e-05, "loss": 1.0605, "step": 232 }, { "epoch": 0.5499259259259259, "step": 232, "train/combined_loss": 0.06430092873051763, "train/cross_entropy_loss": 0.021363087464123964, "train/kl_divergence_loss": 0.10723876953125, "train/step_duration_seconds": 7.199544906616211, "train/steps_per_hour": 320.28750235103695, "train/total_elapsed_hours": 0.7243492121828927 }, { "epoch": 0.5522962962962963, "grad_norm": 11.625, "learning_rate": 1e-05, "loss": 1.0288, "step": 233 }, { "epoch": 0.5522962962962963, "step": 233, "train/combined_loss": 0.06349486531689763, "train/cross_entropy_loss": 0.021063214750029147, "train/kl_divergence_loss": 0.105926513671875, "train/step_duration_seconds": 7.180714845657349, "train/steps_per_hour": 320.78470593967154, "train/total_elapsed_hours": 0.7263438551955753 }, { "epoch": 0.5546666666666666, "grad_norm": 5.09375, "learning_rate": 9.916890040039031e-06, "loss": 1.0159, "step": 234 }, { "epoch": 0.5546666666666666, "step": 234, "train/combined_loss": 0.0634385438170284, "train/cross_entropy_loss": 0.02043177606537938, "train/kl_divergence_loss": 0.1064453125, "train/step_duration_seconds": 7.174914836883545, "train/steps_per_hour": 321.2798969078875, "train/total_elapsed_hours": 0.7283368870947096 }, { "epoch": 0.557037037037037, "grad_norm": 7.46875, "learning_rate": 9.833785820802739e-06, "loss": 1.015, "step": 235 }, { "epoch": 0.557037037037037, "step": 235, "train/combined_loss": 0.06511542480438948, "train/cross_entropy_loss": 0.01987928501330316, "train/kl_divergence_loss": 0.1103515625, "train/step_duration_seconds": 7.176663398742676, "train/steps_per_hour": 321.77217117883936, "train/total_elapsed_hours": 0.7303304047054715 }, { "epoch": 0.5594074074074074, "grad_norm": 5.6875, "learning_rate": 9.750693082619274e-06, "loss": 1.0418, "step": 236 }, { "epoch": 0.5594074074074074, "step": 236, "train/combined_loss": 0.06252926005981863, "train/cross_entropy_loss": 0.019498219480738044, "train/kl_divergence_loss": 0.105560302734375, "train/step_duration_seconds": 7.1978747844696045, "train/steps_per_hour": 322.2591725337913, "train/total_elapsed_hours": 0.7323298143678242 }, { "epoch": 0.5617777777777778, "grad_norm": 7.125, "learning_rate": 9.667617565023734e-06, "loss": 1.0005, "step": 237 }, { "epoch": 0.5617777777777778, "step": 237, "train/combined_loss": 0.06657313695177436, "train/cross_entropy_loss": 0.020200716448016465, "train/kl_divergence_loss": 0.112945556640625, "train/step_duration_seconds": 7.176342487335205, "train/steps_per_hour": 322.7461507067669, "train/total_elapsed_hours": 0.7343232428365284 }, { "epoch": 0.5641481481481482, "grad_norm": 8.0625, "learning_rate": 9.584565006361735e-06, "loss": 1.0652, "step": 238 }, { "epoch": 0.5641481481481482, "step": 238, "train/combined_loss": 0.06469497783109546, "train/cross_entropy_loss": 0.020838933647610247, "train/kl_divergence_loss": 0.108551025390625, "train/step_duration_seconds": 7.179325580596924, "train/steps_per_hour": 323.23012833277306, "train/total_elapsed_hours": 0.7363174999422497 }, { "epoch": 0.5665185185185185, "grad_norm": 3.3125, "learning_rate": 9.501541143393028e-06, "loss": 1.0351, "step": 239 }, { "epoch": 0.5665185185185185, "step": 239, "train/combined_loss": 0.06340441107749939, "train/cross_entropy_loss": 0.021584213944151998, "train/kl_divergence_loss": 0.105224609375, "train/step_duration_seconds": 7.157347917556763, "train/steps_per_hour": 323.7141681182355, "train/total_elapsed_hours": 0.7383056521415711 }, { "epoch": 0.5688888888888889, "grad_norm": 12.8125, "learning_rate": 9.418551710895243e-06, "loss": 1.0145, "step": 240 }, { "epoch": 0.5688888888888889, "step": 240, "train/combined_loss": 0.06330876937136054, "train/cross_entropy_loss": 0.02142344566527754, "train/kl_divergence_loss": 0.105194091796875, "train/step_duration_seconds": 7.197352170944214, "train/steps_per_hour": 324.1907416897731, "train/total_elapsed_hours": 0.7403049166335 }, { "epoch": 0.5712592592592592, "grad_norm": 14.3125, "learning_rate": 9.33560244126776e-06, "loss": 1.0129, "step": 241 }, { "epoch": 0.5712592592592592, "step": 241, "train/combined_loss": 0.0631729164160788, "train/cross_entropy_loss": 0.021395884454250336, "train/kl_divergence_loss": 0.104949951171875, "train/step_duration_seconds": 7.1834022998809814, "train/steps_per_hour": 324.666442947215, "train/total_elapsed_hours": 0.7423003061612448 }, { "epoch": 0.5736296296296296, "grad_norm": 8.6875, "learning_rate": 9.252699064135759e-06, "loss": 1.0108, "step": 242 }, { "epoch": 0.5736296296296296, "step": 242, "train/combined_loss": 0.06237048772163689, "train/cross_entropy_loss": 0.020492929965257645, "train/kl_divergence_loss": 0.104248046875, "train/step_duration_seconds": 7.1805572509765625, "train/steps_per_hour": 325.13993881325246, "train/total_elapsed_hours": 0.7442949053976271 }, { "epoch": 0.576, "grad_norm": 8.4375, "learning_rate": 9.169847305954448e-06, "loss": 0.9979, "step": 243 }, { "epoch": 0.576, "step": 243, "train/combined_loss": 0.0639663846231997, "train/cross_entropy_loss": 0.020327785867266357, "train/kl_divergence_loss": 0.10760498046875, "train/step_duration_seconds": 7.17130970954895, "train/steps_per_hour": 325.61202443655657, "train/total_elapsed_hours": 0.7462869358725018 }, { "epoch": 0.5783703703703704, "grad_norm": 12.6875, "learning_rate": 9.087052889613519e-06, "loss": 1.0235, "step": 244 }, { "epoch": 0.5783703703703704, "step": 244, "train/combined_loss": 0.06497443979606032, "train/cross_entropy_loss": 0.020543363760225475, "train/kl_divergence_loss": 0.109405517578125, "train/step_duration_seconds": 7.176990032196045, "train/steps_per_hour": 326.0809089404372, "train/total_elapsed_hours": 0.7482805442147785 }, { "epoch": 0.5807407407407408, "grad_norm": 8.9375, "learning_rate": 9.004321534041836e-06, "loss": 1.0396, "step": 245 }, { "epoch": 0.5807407407407408, "step": 245, "train/combined_loss": 0.06485264329239726, "train/cross_entropy_loss": 0.020574429305270314, "train/kl_divergence_loss": 0.109130859375, "train/step_duration_seconds": 7.177663564682007, "train/steps_per_hour": 326.54722020012497, "train/total_elapsed_hours": 0.7502743396494124 }, { "epoch": 0.5831111111111111, "grad_norm": 5.09375, "learning_rate": 8.921658953812416e-06, "loss": 1.0376, "step": 246 }, { "epoch": 0.5831111111111111, "step": 246, "train/combined_loss": 0.06371145462617278, "train/cross_entropy_loss": 0.020306207472458482, "train/kl_divergence_loss": 0.10711669921875, "train/step_duration_seconds": 7.178081512451172, "train/steps_per_hour": 327.0110091897602, "train/total_elapsed_hours": 0.7522682511806488 }, { "epoch": 0.5854814814814815, "grad_norm": 6.78125, "learning_rate": 8.839070858747697e-06, "loss": 1.0194, "step": 247 }, { "epoch": 0.5854814814814815, "step": 247, "train/combined_loss": 0.061982935993000865, "train/cross_entropy_loss": 0.02035869611427188, "train/kl_divergence_loss": 0.103607177734375, "train/step_duration_seconds": 7.1770946979522705, "train/steps_per_hour": 327.47246511357855, "train/total_elapsed_hours": 0.7542618885967467 }, { "epoch": 0.5878518518518518, "grad_norm": 5.625, "learning_rate": 8.756562953525151e-06, "loss": 0.9917, "step": 248 }, { "epoch": 0.5878518518518518, "step": 248, "train/combined_loss": 0.06333467178046703, "train/cross_entropy_loss": 0.020956451655365527, "train/kl_divergence_loss": 0.105712890625, "train/step_duration_seconds": 7.17914080619812, "train/steps_per_hour": 327.9312416048011, "train/total_elapsed_hours": 0.7562560943762461 }, { "epoch": 0.5902222222222222, "grad_norm": 2.671875, "learning_rate": 8.674140937283208e-06, "loss": 1.0134, "step": 249 }, { "epoch": 0.5902222222222222, "step": 249, "train/combined_loss": 0.06772775668650866, "train/cross_entropy_loss": 0.02138080890290439, "train/kl_divergence_loss": 0.11407470703125, "train/step_duration_seconds": 7.154328107833862, "train/steps_per_hour": 328.3905899588576, "train/total_elapsed_hours": 0.7582434077395334 }, { "epoch": 0.5925925925925926, "grad_norm": 2.65625, "learning_rate": 8.591810503227634e-06, "loss": 1.0836, "step": 250 }, { "epoch": 0.5925925925925926, "eval_combined_loss": 0.06404994005337358, "eval_cross_entropy_loss": 0.020620713440080485, "eval_kl_divergence_loss": 0.10747916666666667, "eval_loss": 0.06404994428157806, "eval_runtime": 220.1998, "eval_samples_per_second": 6.812, "eval_steps_per_second": 3.406, "step": 250 }, { "epoch": 0.5925925925925926, "step": 250, "train/combined_loss": 0.06369326380081475, "train/cross_entropy_loss": 0.020636040600948036, "train/kl_divergence_loss": 0.10675048828125, "train/step_duration_seconds": 227.41642260551453, "train/steps_per_hour": 304.35298930640323, "train/total_elapsed_hours": 0.8214146362410651 }, { "epoch": 0.5949629629629629, "grad_norm": 3.0625, "learning_rate": 8.509577338238255e-06, "loss": 1.0191, "step": 251 }, { "epoch": 0.5949629629629629, "step": 251, "train/combined_loss": 0.0633452923502773, "train/cross_entropy_loss": 0.020184239139780402, "train/kl_divergence_loss": 0.10650634765625, "train/step_duration_seconds": 7.193126916885376, "train/steps_per_hour": 304.82890506058317, "train/total_elapsed_hours": 0.8234127270513111 }, { "epoch": 0.5973333333333334, "grad_norm": 3.140625, "learning_rate": 8.427447122476148e-06, "loss": 1.0135, "step": 252 }, { "epoch": 0.5973333333333334, "step": 252, "train/combined_loss": 0.0627688483800739, "train/cross_entropy_loss": 0.020252052345313132, "train/kl_divergence_loss": 0.10528564453125, "train/step_duration_seconds": 7.173017740249634, "train/steps_per_hour": 305.3045828117105, "train/total_elapsed_hours": 0.8254052319791582 }, { "epoch": 0.5997037037037037, "grad_norm": 2.359375, "learning_rate": 8.34542552899129e-06, "loss": 1.0043, "step": 253 }, { "epoch": 0.5997037037037037, "step": 253, "train/combined_loss": 0.06621215213090181, "train/cross_entropy_loss": 0.02149291045498103, "train/kl_divergence_loss": 0.110931396484375, "train/step_duration_seconds": 7.1814353466033936, "train/steps_per_hour": 305.77710542260405, "train/total_elapsed_hours": 0.8274000751309925 }, { "epoch": 0.6020740740740741, "grad_norm": 2.703125, "learning_rate": 8.263518223330698e-06, "loss": 1.0594, "step": 254 }, { "epoch": 0.6020740740740741, "step": 254, "train/combined_loss": 0.06436787801794708, "train/cross_entropy_loss": 0.021680091507732868, "train/kl_divergence_loss": 0.1070556640625, "train/step_duration_seconds": 7.1533849239349365, "train/steps_per_hour": 306.25023211085755, "train/total_elapsed_hours": 0.8293871264987521 }, { "epoch": 0.6044444444444445, "grad_norm": 3.6875, "learning_rate": 8.181730863147094e-06, "loss": 1.0299, "step": 255 }, { "epoch": 0.6044444444444445, "step": 255, "train/combined_loss": 0.0639684284105897, "train/cross_entropy_loss": 0.020576014067046344, "train/kl_divergence_loss": 0.10736083984375, "train/step_duration_seconds": 7.142830848693848, "train/steps_per_hour": 306.7221787763317, "train/total_elapsed_hours": 0.8313712461789449 }, { "epoch": 0.6068148148148148, "grad_norm": 2.359375, "learning_rate": 8.100069097808103e-06, "loss": 1.0235, "step": 256 }, { "epoch": 0.6068148148148148, "step": 256, "train/combined_loss": 0.062460833229124546, "train/cross_entropy_loss": 0.020459996070712805, "train/kl_divergence_loss": 0.104461669921875, "train/step_duration_seconds": 7.155627012252808, "train/steps_per_hour": 307.1905678921464, "train/total_elapsed_hours": 0.8333589203490152 }, { "epoch": 0.6091851851851852, "grad_norm": 3.3125, "learning_rate": 8.018538568006027e-06, "loss": 0.9994, "step": 257 }, { "epoch": 0.6091851851851852, "step": 257, "train/combined_loss": 0.06057729944586754, "train/cross_entropy_loss": 0.020568661391735077, "train/kl_divergence_loss": 0.1005859375, "train/step_duration_seconds": 7.156771659851074, "train/steps_per_hour": 307.6566108780016, "train/total_elapsed_hours": 0.8353469124767515 }, { "epoch": 0.6115555555555555, "grad_norm": 5.40625, "learning_rate": 7.937144905368226e-06, "loss": 0.9692, "step": 258 }, { "epoch": 0.6115555555555555, "step": 258, "train/combined_loss": 0.06223560217767954, "train/cross_entropy_loss": 0.020253673777915537, "train/kl_divergence_loss": 0.104217529296875, "train/step_duration_seconds": 7.184788942337036, "train/steps_per_hour": 308.11757712721027, "train/total_elapsed_hours": 0.8373426871829562 }, { "epoch": 0.6139259259259259, "grad_norm": 5.28125, "learning_rate": 7.855893732068124e-06, "loss": 0.9958, "step": 259 }, { "epoch": 0.6139259259259259, "step": 259, "train/combined_loss": 0.06394938984885812, "train/cross_entropy_loss": 0.020690529490821064, "train/kl_divergence_loss": 0.107208251953125, "train/step_duration_seconds": 7.194268226623535, "train/steps_per_hour": 308.5753831613492, "train/total_elapsed_hours": 0.839341095023685 }, { "epoch": 0.6162962962962963, "grad_norm": 2.875, "learning_rate": 7.774790660436857e-06, "loss": 1.0232, "step": 260 }, { "epoch": 0.6162962962962963, "step": 260, "train/combined_loss": 0.060645608929917216, "train/cross_entropy_loss": 0.020613725995644927, "train/kl_divergence_loss": 0.100677490234375, "train/step_duration_seconds": 7.176680564880371, "train/steps_per_hour": 309.0328088515058, "train/total_elapsed_hours": 0.8413346174028185 }, { "epoch": 0.6186666666666667, "grad_norm": 3.09375, "learning_rate": 7.6938412925756e-06, "loss": 0.9703, "step": 261 }, { "epoch": 0.6186666666666667, "step": 261, "train/combined_loss": 0.06048685312271118, "train/cross_entropy_loss": 0.02075397619046271, "train/kl_divergence_loss": 0.1002197265625, "train/step_duration_seconds": 7.175957441329956, "train/steps_per_hour": 309.48814566254214, "train/total_elapsed_hours": 0.843327938914299 }, { "epoch": 0.621037037037037, "grad_norm": 4.84375, "learning_rate": 7.613051219968624e-06, "loss": 0.9678, "step": 262 }, { "epoch": 0.621037037037037, "step": 262, "train/combined_loss": 0.06825654301792383, "train/cross_entropy_loss": 0.02109560859389603, "train/kl_divergence_loss": 0.11541748046875, "train/step_duration_seconds": 7.180307388305664, "train/steps_per_hour": 309.94089201150274, "train/total_elapsed_hours": 0.8453224687443839 }, { "epoch": 0.6234074074074074, "grad_norm": 2.875, "learning_rate": 7.532426023097063e-06, "loss": 1.0921, "step": 263 }, { "epoch": 0.6234074074074074, "step": 263, "train/combined_loss": 0.06100269500166178, "train/cross_entropy_loss": 0.020168230053968728, "train/kl_divergence_loss": 0.101837158203125, "train/step_duration_seconds": 7.166715145111084, "train/steps_per_hour": 310.39288999163625, "train/total_elapsed_hours": 0.8473132229513592 }, { "epoch": 0.6257777777777778, "grad_norm": 5.75, "learning_rate": 7.451971271053455e-06, "loss": 0.976, "step": 264 }, { "epoch": 0.6257777777777778, "step": 264, "train/combined_loss": 0.06548905186355114, "train/cross_entropy_loss": 0.021114823641255498, "train/kl_divergence_loss": 0.10986328125, "train/step_duration_seconds": 7.166426181793213, "train/steps_per_hour": 310.8427983981813, "train/total_elapsed_hours": 0.8493038968907463 }, { "epoch": 0.6281481481481481, "grad_norm": 3.21875, "learning_rate": 7.371692521157048e-06, "loss": 1.0478, "step": 265 }, { "epoch": 0.6281481481481481, "step": 265, "train/combined_loss": 0.06573160830885172, "train/cross_entropy_loss": 0.021050618845038116, "train/kl_divergence_loss": 0.11041259765625, "train/step_duration_seconds": 7.166961431503296, "train/steps_per_hour": 311.2905482984119, "train/total_elapsed_hours": 0.8512947195106082 }, { "epoch": 0.6305185185185185, "grad_norm": 5.25, "learning_rate": 7.291595318569951e-06, "loss": 1.0517, "step": 266 }, { "epoch": 0.6305185185185185, "step": 266, "train/combined_loss": 0.06105546629987657, "train/cross_entropy_loss": 0.020487397676333785, "train/kl_divergence_loss": 0.10162353515625, "train/step_duration_seconds": 7.191303968429565, "train/steps_per_hour": 311.733738567441, "train/total_elapsed_hours": 0.8532923039462831 }, { "epoch": 0.6328888888888888, "grad_norm": 5.125, "learning_rate": 7.2116851959140965e-06, "loss": 0.9769, "step": 267 }, { "epoch": 0.6328888888888888, "step": 267, "train/combined_loss": 0.06089310604147613, "train/cross_entropy_loss": 0.020650955964811146, "train/kl_divergence_loss": 0.10113525390625, "train/step_duration_seconds": 7.178247451782227, "train/steps_per_hour": 312.17618240392204, "train/total_elapsed_hours": 0.8552862615717782 }, { "epoch": 0.6352592592592593, "grad_norm": 5.3125, "learning_rate": 7.131967672889101e-06, "loss": 0.9743, "step": 268 }, { "epoch": 0.6352592592592593, "step": 268, "train/combined_loss": 0.06254504946991801, "train/cross_entropy_loss": 0.02102515858132392, "train/kl_divergence_loss": 0.10406494140625, "train/step_duration_seconds": 7.1988208293914795, "train/steps_per_hour": 312.6144841102412, "train/total_elapsed_hours": 0.8572859340243869 }, { "epoch": 0.6376296296296297, "grad_norm": 3.8125, "learning_rate": 7.052448255890958e-06, "loss": 1.0007, "step": 269 }, { "epoch": 0.6376296296296297, "step": 269, "train/combined_loss": 0.06616902281530201, "train/cross_entropy_loss": 0.02134561410639435, "train/kl_divergence_loss": 0.110992431640625, "train/step_duration_seconds": 7.174272060394287, "train/steps_per_hour": 313.05323016517417, "train/total_elapsed_hours": 0.8592787873744965 }, { "epoch": 0.64, "grad_norm": 4.3125, "learning_rate": 6.973132437631743e-06, "loss": 1.0587, "step": 270 }, { "epoch": 0.64, "step": 270, "train/combined_loss": 0.0661325603723526, "train/cross_entropy_loss": 0.021059064893051982, "train/kl_divergence_loss": 0.1112060546875, "train/step_duration_seconds": 7.177139759063721, "train/steps_per_hour": 313.48965589105467, "train/total_elapsed_hours": 0.8612724373075697 }, { "epoch": 0.6423703703703704, "grad_norm": 3.859375, "learning_rate": 6.8940256967601625e-06, "loss": 1.0581, "step": 271 }, { "epoch": 0.6423703703703704, "step": 271, "train/combined_loss": 0.06747469631955028, "train/cross_entropy_loss": 0.021271413774229586, "train/kl_divergence_loss": 0.113677978515625, "train/step_duration_seconds": 7.159501552581787, "train/steps_per_hour": 313.92584752922323, "train/total_elapsed_hours": 0.8632611877388424 }, { "epoch": 0.6447407407407407, "grad_norm": 6.90625, "learning_rate": 6.815133497483157e-06, "loss": 1.0796, "step": 272 }, { "epoch": 0.6447407407407407, "step": 272, "train/combined_loss": 0.06747178034856915, "train/cross_entropy_loss": 0.02111299301031977, "train/kl_divergence_loss": 0.11383056640625, "train/step_duration_seconds": 7.15754246711731, "train/steps_per_hour": 314.36023173496596, "train/total_elapsed_hours": 0.8652493939797083 }, { "epoch": 0.6471111111111111, "grad_norm": 6.0625, "learning_rate": 6.736461289188445e-06, "loss": 1.0795, "step": 273 }, { "epoch": 0.6471111111111111, "step": 273, "train/combined_loss": 0.06073831953108311, "train/cross_entropy_loss": 0.020249835564754903, "train/kl_divergence_loss": 0.101226806640625, "train/step_duration_seconds": 7.178404092788696, "train/steps_per_hour": 314.7905207894922, "train/total_elapsed_hours": 0.8672433951165941 }, { "epoch": 0.6494814814814814, "grad_norm": 4.9375, "learning_rate": 6.6580145060681255e-06, "loss": 0.9718, "step": 274 }, { "epoch": 0.6494814814814814, "step": 274, "train/combined_loss": 0.06485259486362338, "train/cross_entropy_loss": 0.020971058984287083, "train/kl_divergence_loss": 0.108734130859375, "train/step_duration_seconds": 7.158320426940918, "train/steps_per_hour": 315.22085880551066, "train/total_elapsed_hours": 0.8692318174574111 }, { "epoch": 0.6518518518518519, "grad_norm": 4.5625, "learning_rate": 6.579798566743314e-06, "loss": 1.0376, "step": 275 }, { "epoch": 0.6518518518518519, "step": 275, "train/combined_loss": 0.05971498414874077, "train/cross_entropy_loss": 0.020308872684836388, "train/kl_divergence_loss": 0.09912109375, "train/step_duration_seconds": 7.181738376617432, "train/steps_per_hour": 315.64687567999425, "train/total_elapsed_hours": 0.8712267447842492 }, { "epoch": 0.6542222222222223, "grad_norm": 5.625, "learning_rate": 6.501818873889856e-06, "loss": 0.9554, "step": 276 }, { "epoch": 0.6542222222222223, "step": 276, "train/combined_loss": 0.06411758903414011, "train/cross_entropy_loss": 0.020782789448276162, "train/kl_divergence_loss": 0.107452392578125, "train/step_duration_seconds": 7.175511598587036, "train/steps_per_hour": 316.07157210067743, "train/total_elapsed_hours": 0.8732199424505234 }, { "epoch": 0.6565925925925926, "grad_norm": 1.8203125, "learning_rate": 6.424080813865139e-06, "loss": 1.0259, "step": 277 }, { "epoch": 0.6565925925925926, "step": 277, "train/combined_loss": 0.05957591161131859, "train/cross_entropy_loss": 0.020519012585282326, "train/kl_divergence_loss": 0.0986328125, "train/step_duration_seconds": 7.175374507904053, "train/steps_per_hour": 316.4943478972161, "train/total_elapsed_hours": 0.8752131020360523 }, { "epoch": 0.658962962962963, "grad_norm": 5.40625, "learning_rate": 6.34658975633605e-06, "loss": 0.9532, "step": 278 }, { "epoch": 0.658962962962963, "step": 278, "train/combined_loss": 0.06180824153125286, "train/cross_entropy_loss": 0.02071121137123555, "train/kl_divergence_loss": 0.1029052734375, "train/step_duration_seconds": 7.177731037139893, "train/steps_per_hour": 316.9149659696548, "train/total_elapsed_hours": 0.8772069162130356 }, { "epoch": 0.6613333333333333, "grad_norm": 3.59375, "learning_rate": 6.269351053908061e-06, "loss": 0.9889, "step": 279 }, { "epoch": 0.6613333333333333, "step": 279, "train/combined_loss": 0.062360771000385284, "train/cross_entropy_loss": 0.020839707227423787, "train/kl_divergence_loss": 0.1038818359375, "train/step_duration_seconds": 7.177475214004517, "train/steps_per_hour": 317.3337019711024, "train/total_elapsed_hours": 0.8792006593280368 }, { "epoch": 0.6637037037037037, "grad_norm": 5.40625, "learning_rate": 6.192370041755505e-06, "loss": 0.9978, "step": 280 }, { "epoch": 0.6637037037037037, "step": 280, "train/combined_loss": 0.06398251187056303, "train/cross_entropy_loss": 0.020573665155097842, "train/kl_divergence_loss": 0.107391357421875, "train/step_duration_seconds": 7.203123092651367, "train/steps_per_hour": 317.7479741776063, "train/total_elapsed_hours": 0.8812015268537733 }, { "epoch": 0.666074074074074, "grad_norm": 5.96875, "learning_rate": 6.115652037253054e-06, "loss": 1.0237, "step": 281 }, { "epoch": 0.666074074074074, "step": 281, "train/combined_loss": 0.05968505213968456, "train/cross_entropy_loss": 0.02018797560594976, "train/kl_divergence_loss": 0.09918212890625, "train/step_duration_seconds": 7.152847766876221, "train/steps_per_hour": 318.16540023581325, "train/total_elapsed_hours": 0.883188429011239 }, { "epoch": 0.6684444444444444, "grad_norm": 4.65625, "learning_rate": 6.039202339608432e-06, "loss": 0.955, "step": 282 }, { "epoch": 0.6684444444444444, "step": 282, "train/combined_loss": 0.07058762316592038, "train/cross_entropy_loss": 0.02191255264915526, "train/kl_divergence_loss": 0.1192626953125, "train/step_duration_seconds": 7.178384065628052, "train/steps_per_hour": 318.578399398522, "train/total_elapsed_hours": 0.8851824245850245 }, { "epoch": 0.6708148148148149, "grad_norm": 8.5625, "learning_rate": 5.963026229496378e-06, "loss": 1.1294, "step": 283 }, { "epoch": 0.6708148148148149, "step": 283, "train/combined_loss": 0.06238678935915232, "train/cross_entropy_loss": 0.021471575018949807, "train/kl_divergence_loss": 0.103302001953125, "train/step_duration_seconds": 7.157472133636475, "train/steps_per_hour": 318.9916306953401, "train/total_elapsed_hours": 0.8871706112888125 }, { "epoch": 0.6731851851851852, "grad_norm": 10.6875, "learning_rate": 5.887128968693887e-06, "loss": 0.9982, "step": 284 }, { "epoch": 0.6731851851851852, "step": 284, "train/combined_loss": 0.06445175595581532, "train/cross_entropy_loss": 0.021542673697695136, "train/kl_divergence_loss": 0.10736083984375, "train/step_duration_seconds": 7.179536581039429, "train/steps_per_hour": 319.40081235138257, "train/total_elapsed_hours": 0.8891649270057678 }, { "epoch": 0.6755555555555556, "grad_norm": 7.71875, "learning_rate": 5.811515799716754e-06, "loss": 1.0312, "step": 285 }, { "epoch": 0.6755555555555556, "step": 285, "train/combined_loss": 0.060818693600595, "train/cross_entropy_loss": 0.020441102096810937, "train/kl_divergence_loss": 0.1011962890625, "train/step_duration_seconds": 7.156339406967163, "train/steps_per_hour": 319.81047503815245, "train/total_elapsed_hours": 0.8911527990632587 }, { "epoch": 0.6779259259259259, "grad_norm": 2.265625, "learning_rate": 5.736191945457463e-06, "loss": 0.9731, "step": 286 }, { "epoch": 0.6779259259259259, "step": 286, "train/combined_loss": 0.06460838648490608, "train/cross_entropy_loss": 0.021092993556521833, "train/kl_divergence_loss": 0.108123779296875, "train/step_duration_seconds": 7.178438901901245, "train/steps_per_hour": 320.21611322984927, "train/total_elapsed_hours": 0.8931468098693424 }, { "epoch": 0.6802962962962963, "grad_norm": 3.359375, "learning_rate": 5.66116260882442e-06, "loss": 1.0337, "step": 287 }, { "epoch": 0.6802962962962963, "step": 287, "train/combined_loss": 0.06716977385804057, "train/cross_entropy_loss": 0.02099726488813758, "train/kl_divergence_loss": 0.11334228515625, "train/step_duration_seconds": 7.17467188835144, "train/steps_per_hour": 320.6203190221968, "train/total_elapsed_hours": 0.8951397742827734 }, { "epoch": 0.6826666666666666, "grad_norm": 3.203125, "learning_rate": 5.586432972382561e-06, "loss": 1.0747, "step": 288 }, { "epoch": 0.6826666666666666, "step": 288, "train/combined_loss": 0.06678420398384333, "train/cross_entropy_loss": 0.02086699299979955, "train/kl_divergence_loss": 0.112701416015625, "train/step_duration_seconds": 7.181233882904053, "train/steps_per_hour": 321.0220766963932, "train/total_elapsed_hours": 0.897134561472469 }, { "epoch": 0.685037037037037, "grad_norm": 3.8125, "learning_rate": 5.512008197995379e-06, "loss": 1.0685, "step": 289 }, { "epoch": 0.685037037037037, "step": 289, "train/combined_loss": 0.06064820708706975, "train/cross_entropy_loss": 0.020740994019433856, "train/kl_divergence_loss": 0.100555419921875, "train/step_duration_seconds": 7.175872325897217, "train/steps_per_hour": 321.4225841160013, "train/total_elapsed_hours": 0.8991278593407737 }, { "epoch": 0.6874074074074074, "grad_norm": 1.890625, "learning_rate": 5.43789342646837e-06, "loss": 0.9704, "step": 290 }, { "epoch": 0.6874074074074074, "step": 290, "train/combined_loss": 0.06774787046015263, "train/cross_entropy_loss": 0.021848278120160103, "train/kl_divergence_loss": 0.1136474609375, "train/step_duration_seconds": 7.154938459396362, "train/steps_per_hour": 321.8233964113103, "train/total_elapsed_hours": 0.9011153422461615 }, { "epoch": 0.6897777777777778, "grad_norm": 2.453125, "learning_rate": 5.364093777193944e-06, "loss": 1.084, "step": 291 }, { "epoch": 0.6897777777777778, "step": 291, "train/combined_loss": 0.06057584332302213, "train/cross_entropy_loss": 0.020870924927294254, "train/kl_divergence_loss": 0.10028076171875, "train/step_duration_seconds": 7.165515184402466, "train/steps_per_hour": 322.2213962971687, "train/total_elapsed_hours": 0.9031057631307178 }, { "epoch": 0.6921481481481482, "grad_norm": 4.4375, "learning_rate": 5.290614347797802e-06, "loss": 0.9692, "step": 292 }, { "epoch": 0.6921481481481482, "step": 292, "train/combined_loss": 0.0660083363763988, "train/cross_entropy_loss": 0.02096320828422904, "train/kl_divergence_loss": 0.111053466796875, "train/step_duration_seconds": 7.173615455627441, "train/steps_per_hour": 322.61684365124677, "train/total_elapsed_hours": 0.9050984340906143 }, { "epoch": 0.6945185185185185, "grad_norm": 2.5625, "learning_rate": 5.217460213786822e-06, "loss": 1.0561, "step": 293 }, { "epoch": 0.6945185185185185, "step": 293, "train/combined_loss": 0.06140920426696539, "train/cross_entropy_loss": 0.020706592011265457, "train/kl_divergence_loss": 0.10211181640625, "train/step_duration_seconds": 7.179595708847046, "train/steps_per_hour": 323.00996205395205, "train/total_elapsed_hours": 0.9070927662319607 }, { "epoch": 0.6968888888888889, "grad_norm": 2.34375, "learning_rate": 5.144636428198477e-06, "loss": 0.9825, "step": 294 }, { "epoch": 0.6968888888888889, "step": 294, "train/combined_loss": 0.061046687653288245, "train/cross_entropy_loss": 0.020591912092640996, "train/kl_divergence_loss": 0.10150146484375, "train/step_duration_seconds": 7.179559946060181, "train/steps_per_hour": 323.40135916435133, "train/total_elapsed_hours": 0.9090870884391996 }, { "epoch": 0.6992592592592592, "grad_norm": 3.296875, "learning_rate": 5.072148021251822e-06, "loss": 0.9767, "step": 295 }, { "epoch": 0.6992592592592592, "step": 295, "train/combined_loss": 0.06156940385699272, "train/cross_entropy_loss": 0.020782849984243512, "train/kl_divergence_loss": 0.10235595703125, "train/step_duration_seconds": 7.190384149551392, "train/steps_per_hour": 323.78997420643765, "train/total_elapsed_hours": 0.9110844173696306 }, { "epoch": 0.7016296296296296, "grad_norm": 3.484375, "learning_rate": 5.000000000000003e-06, "loss": 0.9851, "step": 296 }, { "epoch": 0.7016296296296296, "step": 296, "train/combined_loss": 0.05902678519487381, "train/cross_entropy_loss": 0.02039731852710247, "train/kl_divergence_loss": 0.09765625, "train/step_duration_seconds": 7.165556907653809, "train/steps_per_hour": 324.1793375982033, "train/total_elapsed_hours": 0.9130748498439789 }, { "epoch": 0.704, "grad_norm": 3.515625, "learning_rate": 4.92819734798441e-06, "loss": 0.9444, "step": 297 }, { "epoch": 0.704, "step": 297, "train/combined_loss": 0.0633976545650512, "train/cross_entropy_loss": 0.0211129350354895, "train/kl_divergence_loss": 0.105682373046875, "train/step_duration_seconds": 7.194438457489014, "train/steps_per_hour": 324.5641615666989, "train/total_elapsed_hours": 0.9150733049710592 }, { "epoch": 0.7063703703703703, "grad_norm": 5.28125, "learning_rate": 4.856745024890466e-06, "loss": 1.0144, "step": 298 }, { "epoch": 0.7063703703703703, "step": 298, "train/combined_loss": 0.06398635334335268, "train/cross_entropy_loss": 0.0211001462303102, "train/kl_divergence_loss": 0.10687255859375, "train/step_duration_seconds": 7.178032159805298, "train/steps_per_hour": 324.9489231458107, "train/total_elapsed_hours": 0.9170672027932273 }, { "epoch": 0.7087407407407408, "grad_norm": 3.671875, "learning_rate": 4.78564796620502e-06, "loss": 1.0238, "step": 299 }, { "epoch": 0.7087407407407408, "step": 299, "train/combined_loss": 0.058621928095817566, "train/cross_entropy_loss": 0.020564164966344833, "train/kl_divergence_loss": 0.0966796875, "train/step_duration_seconds": 7.176652193069458, "train/steps_per_hour": 325.33215093908757, "train/total_elapsed_hours": 0.9190607172913021 }, { "epoch": 0.7111111111111111, "grad_norm": 2.484375, "learning_rate": 4.714911082875446e-06, "loss": 0.938, "step": 300 }, { "epoch": 0.7111111111111111, "eval_combined_loss": 0.06135369462271532, "eval_cross_entropy_loss": 0.020845410078763962, "eval_kl_divergence_loss": 0.10186197916666667, "eval_loss": 0.06135369837284088, "eval_runtime": 220.2651, "eval_samples_per_second": 6.81, "eval_steps_per_second": 3.405, "step": 300 }, { "epoch": 0.7111111111111111, "step": 300, "train/combined_loss": 0.06502728187479079, "train/cross_entropy_loss": 0.02125939668621868, "train/kl_divergence_loss": 0.108795166015625, "train/step_duration_seconds": 227.4951696395874, "train/steps_per_hour": 305.4200389971287, "train/total_elapsed_hours": 0.9822538199689653 }, { "epoch": 0.7134814814814815, "grad_norm": 3.328125, "learning_rate": 4.644539260970417e-06, "loss": 1.0404, "step": 301 }, { "epoch": 0.7134814814814815, "step": 301, "train/combined_loss": 0.05955993290990591, "train/cross_entropy_loss": 0.020364978816360235, "train/kl_divergence_loss": 0.0987548828125, "train/step_duration_seconds": 7.195945978164673, "train/steps_per_hour": 305.81577462801977, "train/total_elapsed_hours": 0.9842526938517888 }, { "epoch": 0.7158518518518519, "grad_norm": 2.375, "learning_rate": 4.5745373613424075e-06, "loss": 0.953, "step": 302 }, { "epoch": 0.7158518518518519, "step": 302, "train/combined_loss": 0.06077713891863823, "train/cross_entropy_loss": 0.020449545118026435, "train/kl_divergence_loss": 0.101104736328125, "train/step_duration_seconds": 7.195575952529907, "train/steps_per_hour": 306.2099380662498, "train/total_elapsed_hours": 0.9862514649497138 }, { "epoch": 0.7182222222222222, "grad_norm": 4.125, "learning_rate": 4.504910219291941e-06, "loss": 0.9724, "step": 303 }, { "epoch": 0.7182222222222222, "step": 303, "train/combined_loss": 0.05961341969668865, "train/cross_entropy_loss": 0.020319371833465993, "train/kl_divergence_loss": 0.098907470703125, "train/step_duration_seconds": 7.177309036254883, "train/steps_per_hour": 306.6040813356026, "train/total_elapsed_hours": 0.9882451619042291 }, { "epoch": 0.7205925925925926, "grad_norm": 3.734375, "learning_rate": 4.435662644233594e-06, "loss": 0.9538, "step": 304 }, { "epoch": 0.7205925925925926, "step": 304, "train/combined_loss": 0.059920859755948186, "train/cross_entropy_loss": 0.020384933333843946, "train/kl_divergence_loss": 0.099456787109375, "train/step_duration_seconds": 7.177339553833008, "train/steps_per_hour": 306.9966348805165, "train/total_elapsed_hours": 0.9902388673358493 }, { "epoch": 0.7229629629629629, "grad_norm": 3.734375, "learning_rate": 4.3667994193637794e-06, "loss": 0.9587, "step": 305 }, { "epoch": 0.7229629629629629, "step": 305, "train/combined_loss": 0.0592358959838748, "train/cross_entropy_loss": 0.020632435218431056, "train/kl_divergence_loss": 0.09783935546875, "train/step_duration_seconds": 7.177339315414429, "train/steps_per_hour": 307.38761092036276, "train/total_elapsed_hours": 0.9922325727012422 }, { "epoch": 0.7253333333333334, "grad_norm": 2.609375, "learning_rate": 4.298325301330383e-06, "loss": 0.9478, "step": 306 }, { "epoch": 0.7253333333333334, "step": 306, "train/combined_loss": 0.06081084324978292, "train/cross_entropy_loss": 0.020883160177618265, "train/kl_divergence_loss": 0.100738525390625, "train/step_duration_seconds": 7.180126905441284, "train/steps_per_hour": 307.776779219795, "train/total_elapsed_hours": 0.9942270523971981 }, { "epoch": 0.7277037037037037, "grad_norm": 2.84375, "learning_rate": 4.23024501990417e-06, "loss": 0.973, "step": 307 }, { "epoch": 0.7277037037037037, "step": 307, "train/combined_loss": 0.05935222376137972, "train/cross_entropy_loss": 0.020437843864783645, "train/kl_divergence_loss": 0.0982666015625, "train/step_duration_seconds": 7.176982641220093, "train/steps_per_hour": 308.1646594287626, "train/total_elapsed_hours": 0.996220658686426 }, { "epoch": 0.7300740740740741, "grad_norm": 2.671875, "learning_rate": 4.162563277652104e-06, "loss": 0.9496, "step": 308 }, { "epoch": 0.7300740740740741, "step": 308, "train/combined_loss": 0.06314325472339988, "train/cross_entropy_loss": 0.020665171090513468, "train/kl_divergence_loss": 0.105621337890625, "train/step_duration_seconds": 7.161325693130493, "train/steps_per_hour": 308.55233465191134, "train/total_elapsed_hours": 0.9982099158234067 }, { "epoch": 0.7324444444444445, "grad_norm": 4.65625, "learning_rate": 4.095284749612504e-06, "loss": 1.0103, "step": 309 }, { "epoch": 0.7324444444444445, "step": 309, "train/combined_loss": 0.059852408710867167, "train/cross_entropy_loss": 0.02018699492327869, "train/kl_divergence_loss": 0.099517822265625, "train/step_duration_seconds": 7.173473358154297, "train/steps_per_hour": 308.93742555494475, "train/total_elapsed_hours": 1.0002025473117828 }, { "epoch": 0.7348148148148148, "grad_norm": 6.5, "learning_rate": 4.028414082972141e-06, "loss": 0.9576, "step": 310 }, { "epoch": 0.7348148148148148, "step": 310, "train/combined_loss": 0.061497040558606386, "train/cross_entropy_loss": 0.02069915970787406, "train/kl_divergence_loss": 0.102294921875, "train/step_duration_seconds": 7.176191329956055, "train/steps_per_hour": 309.3207521077788, "train/total_elapsed_hours": 1.002195933792326 }, { "epoch": 0.7371851851851852, "grad_norm": 5.625, "learning_rate": 3.961955896745224e-06, "loss": 0.984, "step": 311 }, { "epoch": 0.7371851851851852, "step": 311, "train/combined_loss": 0.06270680762827396, "train/cross_entropy_loss": 0.02095194417051971, "train/kl_divergence_loss": 0.104461669921875, "train/step_duration_seconds": 7.178727865219116, "train/steps_per_hour": 309.7023394964521, "train/total_elapsed_hours": 1.004190024865998 }, { "epoch": 0.7395555555555555, "grad_norm": 6.78125, "learning_rate": 3.89591478145437e-06, "loss": 1.0033, "step": 312 }, { "epoch": 0.7395555555555555, "step": 312, "train/combined_loss": 0.06055857567116618, "train/cross_entropy_loss": 0.02050069870892912, "train/kl_divergence_loss": 0.100616455078125, "train/step_duration_seconds": 7.1979944705963135, "train/steps_per_hour": 310.0807650969411, "train/total_elapsed_hours": 1.0061894677744971 }, { "epoch": 0.7419259259259259, "grad_norm": 1.8046875, "learning_rate": 3.830295298813475e-06, "loss": 0.9689, "step": 313 }, { "epoch": 0.7419259259259259, "step": 313, "train/combined_loss": 0.06014604773372412, "train/cross_entropy_loss": 0.020926860976032913, "train/kl_divergence_loss": 0.099365234375, "train/step_duration_seconds": 7.178622245788574, "train/steps_per_hour": 310.45934677419245, "train/total_elapsed_hours": 1.0081835295094383 }, { "epoch": 0.7442962962962963, "grad_norm": 6.34375, "learning_rate": 3.7651019814126656e-06, "loss": 0.9623, "step": 314 }, { "epoch": 0.7442962962962963, "step": 314, "train/combined_loss": 0.05981010291725397, "train/cross_entropy_loss": 0.020895838970318437, "train/kl_divergence_loss": 0.098724365234375, "train/step_duration_seconds": 7.296232223510742, "train/steps_per_hour": 310.8263816166535, "train/total_elapsed_hours": 1.0102102606826358 }, { "epoch": 0.7466666666666667, "grad_norm": 9.625, "learning_rate": 3.7003393324051874e-06, "loss": 0.957, "step": 315 }, { "epoch": 0.7466666666666667, "step": 315, "train/combined_loss": 0.060554551193490624, "train/cross_entropy_loss": 0.021896454854868352, "train/kl_divergence_loss": 0.099212646484375, "train/step_duration_seconds": 7.158878326416016, "train/steps_per_hour": 311.20367676354846, "train/total_elapsed_hours": 1.0121988379955291 }, { "epoch": 0.7490370370370371, "grad_norm": 7.0625, "learning_rate": 3.636011825196365e-06, "loss": 0.9689, "step": 316 }, { "epoch": 0.7490370370370371, "step": 316, "train/combined_loss": 0.05863487347960472, "train/cross_entropy_loss": 0.020590057596564293, "train/kl_divergence_loss": 0.0966796875, "train/step_duration_seconds": 7.176958084106445, "train/steps_per_hour": 311.5779494376515, "train/total_elapsed_hours": 1.0141924374633364 }, { "epoch": 0.7514074074074074, "grad_norm": 6.59375, "learning_rate": 3.5721239031346067e-06, "loss": 0.9382, "step": 317 }, { "epoch": 0.7514074074074074, "step": 317, "train/combined_loss": 0.059863541973754764, "train/cross_entropy_loss": 0.020819613593630493, "train/kl_divergence_loss": 0.098907470703125, "train/step_duration_seconds": 7.1567628383636475, "train/steps_per_hour": 311.95247569565794, "train/total_elapsed_hours": 1.0161804271406598 }, { "epoch": 0.7537777777777778, "grad_norm": 2.578125, "learning_rate": 3.5086799792044812e-06, "loss": 0.9578, "step": 318 }, { "epoch": 0.7537777777777778, "step": 318, "train/combined_loss": 0.06011883169412613, "train/cross_entropy_loss": 0.02053673774935305, "train/kl_divergence_loss": 0.099700927734375, "train/step_duration_seconds": 7.177694797515869, "train/steps_per_hour": 312.3237558362263, "train/total_elapsed_hours": 1.0181742312510809 }, { "epoch": 0.7561481481481481, "grad_norm": 4.21875, "learning_rate": 3.4456844357218977e-06, "loss": 0.9619, "step": 319 }, { "epoch": 0.7561481481481481, "step": 319, "train/combined_loss": 0.060971920378506184, "train/cross_entropy_loss": 0.020503410720266402, "train/kl_divergence_loss": 0.1014404296875, "train/step_duration_seconds": 7.183193206787109, "train/steps_per_hour": 312.69311658008917, "train/total_elapsed_hours": 1.0201695626974105 }, { "epoch": 0.7585185185185185, "grad_norm": 6.0625, "learning_rate": 3.3831416240314085e-06, "loss": 0.9756, "step": 320 }, { "epoch": 0.7585185185185185, "step": 320, "train/combined_loss": 0.05914177093654871, "train/cross_entropy_loss": 0.020505218068137765, "train/kl_divergence_loss": 0.0977783203125, "train/step_duration_seconds": 7.172834634780884, "train/steps_per_hour": 313.0619165575121, "train/total_elapsed_hours": 1.0221620167626275 }, { "epoch": 0.7608888888888888, "grad_norm": 7.09375, "learning_rate": 3.3210558642056277e-06, "loss": 0.9463, "step": 321 }, { "epoch": 0.7608888888888888, "step": 321, "train/combined_loss": 0.06318964948877692, "train/cross_entropy_loss": 0.02078847971279174, "train/kl_divergence_loss": 0.1055908203125, "train/step_duration_seconds": 7.178191423416138, "train/steps_per_hour": 313.4288261803181, "train/total_elapsed_hours": 1.0241559588246876 }, { "epoch": 0.7632592592592593, "grad_norm": 5.625, "learning_rate": 3.2594314447468457e-06, "loss": 1.011, "step": 322 }, { "epoch": 0.7632592592592593, "step": 322, "train/combined_loss": 0.06236264854669571, "train/cross_entropy_loss": 0.02066035382449627, "train/kl_divergence_loss": 0.10406494140625, "train/step_duration_seconds": 7.178012847900391, "train/steps_per_hour": 313.7943250662448, "train/total_elapsed_hours": 1.0261498512824376 }, { "epoch": 0.7656296296296297, "grad_norm": 6.65625, "learning_rate": 3.1982726222908046e-06, "loss": 0.9978, "step": 323 }, { "epoch": 0.7656296296296297, "step": 323, "train/combined_loss": 0.06084095500409603, "train/cross_entropy_loss": 0.02054665272589773, "train/kl_divergence_loss": 0.10113525390625, "train/step_duration_seconds": 7.176525115966797, "train/steps_per_hour": 314.1585325936901, "train/total_elapsed_hours": 1.0281433304813172 }, { "epoch": 0.768, "grad_norm": 4.46875, "learning_rate": 3.1375836213126653e-06, "loss": 0.9735, "step": 324 }, { "epoch": 0.768, "step": 324, "train/combined_loss": 0.06315464107319713, "train/cross_entropy_loss": 0.020748980692587793, "train/kl_divergence_loss": 0.105560302734375, "train/step_duration_seconds": 7.15761399269104, "train/steps_per_hour": 314.52293440305624, "train/total_elapsed_hours": 1.0301315565903981 }, { "epoch": 0.7703703703703704, "grad_norm": 8.0, "learning_rate": 3.077368633835205e-06, "loss": 1.0105, "step": 325 }, { "epoch": 0.7703703703703704, "step": 325, "train/combined_loss": 0.05943355988711119, "train/cross_entropy_loss": 0.020478446152992547, "train/kl_divergence_loss": 0.098388671875, "train/step_duration_seconds": 7.163183212280273, "train/steps_per_hour": 314.88546031009383, "train/total_elapsed_hours": 1.0321213297049205 }, { "epoch": 0.7727407407407407, "grad_norm": 7.96875, "learning_rate": 3.017631819139273e-06, "loss": 0.9509, "step": 326 }, { "epoch": 0.7727407407407407, "step": 326, "train/combined_loss": 0.06256748456507921, "train/cross_entropy_loss": 0.020612266613170505, "train/kl_divergence_loss": 0.104522705078125, "train/step_duration_seconds": 7.17126202583313, "train/steps_per_hour": 315.24590700472396, "train/total_elapsed_hours": 1.0341133469343184 }, { "epoch": 0.7751111111111111, "grad_norm": 4.03125, "learning_rate": 2.958377303476483e-06, "loss": 1.0011, "step": 327 }, { "epoch": 0.7751111111111111, "step": 327, "train/combined_loss": 0.059798732632771134, "train/cross_entropy_loss": 0.020720509812235832, "train/kl_divergence_loss": 0.098876953125, "train/step_duration_seconds": 7.176945686340332, "train/steps_per_hour": 315.60448679780745, "train/total_elapsed_hours": 1.0361069429583019 }, { "epoch": 0.7774814814814814, "grad_norm": 2.3125, "learning_rate": 2.8996091797841976e-06, "loss": 0.9568, "step": 328 }, { "epoch": 0.7774814814814814, "step": 328, "train/combined_loss": 0.058641964104026556, "train/cross_entropy_loss": 0.020512689370661974, "train/kl_divergence_loss": 0.096771240234375, "train/step_duration_seconds": 7.181287527084351, "train/steps_per_hour": 315.96132225407996, "train/total_elapsed_hours": 1.0381017450491588 }, { "epoch": 0.7798518518518519, "grad_norm": 3.1875, "learning_rate": 2.8413315074028157e-06, "loss": 0.9383, "step": 329 }, { "epoch": 0.7798518518518519, "step": 329, "train/combined_loss": 0.05962651362642646, "train/cross_entropy_loss": 0.02113901753909886, "train/kl_divergence_loss": 0.098114013671875, "train/step_duration_seconds": 7.1774678230285645, "train/steps_per_hour": 316.31711164339544, "train/total_elapsed_hours": 1.0400954861111111 }, { "epoch": 0.7822222222222223, "grad_norm": 3.0625, "learning_rate": 2.783548311795379e-06, "loss": 0.954, "step": 330 }, { "epoch": 0.7822222222222223, "step": 330, "train/combined_loss": 0.059158258605748415, "train/cross_entropy_loss": 0.020599229959771037, "train/kl_divergence_loss": 0.09771728515625, "train/step_duration_seconds": 7.179843902587891, "train/steps_per_hour": 316.67133906098195, "train/total_elapsed_hours": 1.0420898871951634 }, { "epoch": 0.7845925925925926, "grad_norm": 3.875, "learning_rate": 2.726263584269513e-06, "loss": 0.9465, "step": 331 }, { "epoch": 0.7845925925925926, "step": 331, "train/combined_loss": 0.05943922000005841, "train/cross_entropy_loss": 0.020550800720229745, "train/kl_divergence_loss": 0.09832763671875, "train/step_duration_seconds": 7.175478458404541, "train/steps_per_hour": 317.02458139366485, "train/total_elapsed_hours": 1.0440830756558312 }, { "epoch": 0.786962962962963, "grad_norm": 3.3125, "learning_rate": 2.669481281701739e-06, "loss": 0.951, "step": 332 }, { "epoch": 0.786962962962963, "step": 332, "train/combined_loss": 0.06072757695801556, "train/cross_entropy_loss": 0.020319899427704513, "train/kl_divergence_loss": 0.10113525390625, "train/step_duration_seconds": 7.182747840881348, "train/steps_per_hour": 317.3758649537043, "train/total_elapsed_hours": 1.0460782833894093 }, { "epoch": 0.7893333333333333, "grad_norm": 4.21875, "learning_rate": 2.6132053262641467e-06, "loss": 0.9716, "step": 333 }, { "epoch": 0.7893333333333333, "step": 333, "train/combined_loss": 0.05954708158969879, "train/cross_entropy_loss": 0.02046134858392179, "train/kl_divergence_loss": 0.0986328125, "train/step_duration_seconds": 7.171588897705078, "train/steps_per_hour": 317.72675072895083, "train/total_elapsed_hours": 1.0480703914165497 }, { "epoch": 0.7917037037037037, "grad_norm": 2.21875, "learning_rate": 2.5574396051534835e-06, "loss": 0.9528, "step": 334 }, { "epoch": 0.7917037037037037, "step": 334, "train/combined_loss": 0.05950666521675885, "train/cross_entropy_loss": 0.020380519214086235, "train/kl_divergence_loss": 0.0986328125, "train/step_duration_seconds": 7.180173397064209, "train/steps_per_hour": 318.0755828336615, "train/total_elapsed_hours": 1.0500648840268454 }, { "epoch": 0.794074074074074, "grad_norm": 2.78125, "learning_rate": 2.502187970322657e-06, "loss": 0.9521, "step": 335 }, { "epoch": 0.794074074074074, "step": 335, "train/combined_loss": 0.06000912608578801, "train/cross_entropy_loss": 0.020530943875201046, "train/kl_divergence_loss": 0.0994873046875, "train/step_duration_seconds": 7.176333665847778, "train/steps_per_hour": 318.4234151295545, "train/total_elapsed_hours": 1.0520583100451364 }, { "epoch": 0.7964444444444444, "grad_norm": 1.8984375, "learning_rate": 2.447454238214654e-06, "loss": 0.9601, "step": 336 }, { "epoch": 0.7964444444444444, "step": 336, "train/combined_loss": 0.061862445436418056, "train/cross_entropy_loss": 0.020911171450279653, "train/kl_divergence_loss": 0.102813720703125, "train/step_duration_seconds": 7.18380331993103, "train/steps_per_hour": 318.7693042840402, "train/total_elapsed_hours": 1.0540538109673394 }, { "epoch": 0.7988148148148149, "grad_norm": 5.3125, "learning_rate": 2.3932421894989167e-06, "loss": 0.9898, "step": 337 }, { "epoch": 0.7988148148148149, "step": 337, "train/combined_loss": 0.06748714856803417, "train/cross_entropy_loss": 0.02215080999303609, "train/kl_divergence_loss": 0.112823486328125, "train/step_duration_seconds": 7.178979873657227, "train/steps_per_hour": 319.1142911319986, "train/total_elapsed_hours": 1.0560479720433553 }, { "epoch": 0.8011851851851852, "grad_norm": 2.765625, "learning_rate": 2.339555568810221e-06, "loss": 1.0798, "step": 338 }, { "epoch": 0.8011851851851852, "step": 338, "train/combined_loss": 0.059276150073856115, "train/cross_entropy_loss": 0.020651907310821116, "train/kl_divergence_loss": 0.097900390625, "train/step_duration_seconds": 7.173582077026367, "train/steps_per_hour": 319.4584302570375, "train/total_elapsed_hours": 1.058040633731418 }, { "epoch": 0.8035555555555556, "grad_norm": 1.3515625, "learning_rate": 2.2863980844900036e-06, "loss": 0.9484, "step": 339 }, { "epoch": 0.8035555555555556, "step": 339, "train/combined_loss": 0.0625988682731986, "train/cross_entropy_loss": 0.02107176184654236, "train/kl_divergence_loss": 0.1041259765625, "train/step_duration_seconds": 7.17778754234314, "train/steps_per_hour": 319.8009231203147, "train/total_elapsed_hours": 1.0600344636042913 }, { "epoch": 0.8059259259259259, "grad_norm": 2.296875, "learning_rate": 2.2337734083302164e-06, "loss": 1.0016, "step": 340 }, { "epoch": 0.8059259259259259, "step": 340, "train/combined_loss": 0.058292608708143234, "train/cross_entropy_loss": 0.02039380930364132, "train/kl_divergence_loss": 0.09619140625, "train/step_duration_seconds": 7.182078838348389, "train/steps_per_hour": 320.14177067664195, "train/total_elapsed_hours": 1.0620294855038326 }, { "epoch": 0.8082962962962963, "grad_norm": 4.625, "learning_rate": 2.1816851753197023e-06, "loss": 0.9327, "step": 341 }, { "epoch": 0.8082962962962963, "step": 341, "train/combined_loss": 0.0593375526368618, "train/cross_entropy_loss": 0.020500056445598602, "train/kl_divergence_loss": 0.098175048828125, "train/step_duration_seconds": 7.171487331390381, "train/steps_per_hour": 320.4822262207178, "train/total_elapsed_hours": 1.0640215653181075 }, { "epoch": 0.8106666666666666, "grad_norm": 3.59375, "learning_rate": 2.130136983393112e-06, "loss": 0.9494, "step": 342 }, { "epoch": 0.8106666666666666, "step": 342, "train/combined_loss": 0.060327990911901, "train/cross_entropy_loss": 0.020588844665326178, "train/kl_divergence_loss": 0.100067138671875, "train/step_duration_seconds": 7.198361873626709, "train/steps_per_hour": 320.81916267981495, "train/total_elapsed_hours": 1.066021110283004 }, { "epoch": 0.813037037037037, "grad_norm": 1.53125, "learning_rate": 2.0791323931823783e-06, "loss": 0.9652, "step": 343 }, { "epoch": 0.813037037037037, "step": 343, "train/combined_loss": 0.05895965825766325, "train/cross_entropy_loss": 0.02056823973543942, "train/kl_divergence_loss": 0.09735107421875, "train/step_duration_seconds": 7.177961349487305, "train/steps_per_hour": 321.1565415410552, "train/total_elapsed_hours": 1.0680149884356394 }, { "epoch": 0.8154074074074074, "grad_norm": 2.78125, "learning_rate": 2.0286749277707783e-06, "loss": 0.9434, "step": 344 }, { "epoch": 0.8154074074074074, "step": 344, "train/combined_loss": 0.06692604720592499, "train/cross_entropy_loss": 0.02130326582118869, "train/kl_divergence_loss": 0.112548828125, "train/step_duration_seconds": 7.182729482650757, "train/steps_per_hour": 321.49226509337905, "train/total_elapsed_hours": 1.0700101910697088 }, { "epoch": 0.8177777777777778, "grad_norm": 2.78125, "learning_rate": 1.9787680724495617e-06, "loss": 1.0708, "step": 345 }, { "epoch": 0.8177777777777778, "step": 345, "train/combined_loss": 0.06856274465098977, "train/cross_entropy_loss": 0.02140283351764083, "train/kl_divergence_loss": 0.11572265625, "train/step_duration_seconds": 7.21046257019043, "train/steps_per_hour": 321.8244262652279, "train/total_elapsed_hours": 1.0720130973392064 }, { "epoch": 0.8201481481481482, "grad_norm": 2.53125, "learning_rate": 1.929415274477239e-06, "loss": 1.097, "step": 346 }, { "epoch": 0.8201481481481482, "step": 346, "train/combined_loss": 0.06000364082865417, "train/cross_entropy_loss": 0.020581011194735765, "train/kl_divergence_loss": 0.09942626953125, "train/step_duration_seconds": 7.178732395172119, "train/steps_per_hour": 322.1579923556434, "train/total_elapsed_hours": 1.0740071896711985 }, { "epoch": 0.8225185185185185, "grad_norm": 1.375, "learning_rate": 1.880619942841435e-06, "loss": 0.9601, "step": 347 }, { "epoch": 0.8225185185185185, "step": 347, "train/combined_loss": 0.06414050119929016, "train/cross_entropy_loss": 0.02085912844631821, "train/kl_divergence_loss": 0.107421875, "train/step_duration_seconds": 7.1721296310424805, "train/steps_per_hour": 322.4908717904752, "train/total_elapsed_hours": 1.0759994479020436 }, { "epoch": 0.8248888888888889, "grad_norm": 4.46875, "learning_rate": 1.8323854480234348e-06, "loss": 1.0262, "step": 348 }, { "epoch": 0.8248888888888889, "step": 348, "train/combined_loss": 0.059865488670766354, "train/cross_entropy_loss": 0.020457297330722213, "train/kl_divergence_loss": 0.099273681640625, "train/step_duration_seconds": 7.182539701461792, "train/steps_per_hour": 322.8216548617558, "train/total_elapsed_hours": 1.0779945978191163 }, { "epoch": 0.8272592592592592, "grad_norm": 0.95703125, "learning_rate": 1.7847151217653624e-06, "loss": 0.9578, "step": 349 }, { "epoch": 0.8272592592592592, "step": 349, "train/combined_loss": 0.061520870542153716, "train/cross_entropy_loss": 0.020655267755500972, "train/kl_divergence_loss": 0.102386474609375, "train/step_duration_seconds": 7.174387454986572, "train/steps_per_hour": 323.15189335194066, "train/total_elapsed_hours": 1.0799874832232794 }, { "epoch": 0.8296296296296296, "grad_norm": 1.3515625, "learning_rate": 1.7376122568400533e-06, "loss": 0.9843, "step": 350 }, { "epoch": 0.8296296296296296, "eval_combined_loss": 0.060819300456593436, "eval_cross_entropy_loss": 0.02078313216318687, "eval_kl_divergence_loss": 0.10085546875, "eval_loss": 0.060819294303655624, "eval_runtime": 220.1882, "eval_samples_per_second": 6.812, "eval_steps_per_second": 3.406, "step": 350 }, { "epoch": 0.8296296296296296, "step": 350, "train/combined_loss": 0.05945373326539993, "train/cross_entropy_loss": 0.020640864269807935, "train/kl_divergence_loss": 0.0982666015625, "train/step_duration_seconds": 227.37273049354553, "train/steps_per_hour": 306.17246086025364, "train/total_elapsed_hours": 1.143146575027042 }, { "epoch": 0.832, "grad_norm": 1.421875, "learning_rate": 1.6910801068236015e-06, "loss": 0.9513, "step": 351 }, { "epoch": 0.832, "step": 351, "train/combined_loss": 0.05878610094077885, "train/cross_entropy_loss": 0.02025164384394884, "train/kl_divergence_loss": 0.097320556640625, "train/step_duration_seconds": 7.182687520980835, "train/steps_per_hour": 306.51226810501225, "train/total_elapsed_hours": 1.1451417660050922 }, { "epoch": 0.8343703703703703, "grad_norm": 3.078125, "learning_rate": 1.6451218858706374e-06, "loss": 0.9406, "step": 352 }, { "epoch": 0.8343703703703703, "step": 352, "train/combined_loss": 0.05914012948051095, "train/cross_entropy_loss": 0.02041038265451789, "train/kl_divergence_loss": 0.097869873046875, "train/step_duration_seconds": 7.178761959075928, "train/steps_per_hour": 306.85118499420435, "train/total_elapsed_hours": 1.14713586654928 }, { "epoch": 0.8367407407407408, "grad_norm": 4.1875, "learning_rate": 1.599740768492286e-06, "loss": 0.9462, "step": 353 }, { "epoch": 0.8367407407407408, "step": 353, "train/combined_loss": 0.06183216394856572, "train/cross_entropy_loss": 0.02106422872748226, "train/kl_divergence_loss": 0.10260009765625, "train/step_duration_seconds": 7.19196081161499, "train/steps_per_hour": 307.1879455332317, "train/total_elapsed_hours": 1.1491336334413953 }, { "epoch": 0.8391111111111111, "grad_norm": 1.5234375, "learning_rate": 1.5549398893369216e-06, "loss": 0.9893, "step": 354 }, { "epoch": 0.8391111111111111, "step": 354, "train/combined_loss": 0.0688268430531025, "train/cross_entropy_loss": 0.022144652903079987, "train/kl_divergence_loss": 0.115509033203125, "train/step_duration_seconds": 7.177990674972534, "train/steps_per_hour": 307.5245738890482, "train/total_elapsed_hours": 1.1511275197399986 }, { "epoch": 0.8414814814814815, "grad_norm": 2.40625, "learning_rate": 1.5107223429736273e-06, "loss": 1.1012, "step": 355 }, { "epoch": 0.8414814814814815, "step": 355, "train/combined_loss": 0.060881074983626604, "train/cross_entropy_loss": 0.021084662177599967, "train/kl_divergence_loss": 0.100677490234375, "train/step_duration_seconds": 7.157759428024292, "train/steps_per_hour": 307.8615384801584, "train/total_elapsed_hours": 1.1531157862477832 }, { "epoch": 0.8438518518518519, "grad_norm": 1.6796875, "learning_rate": 1.467091183678444e-06, "loss": 0.9741, "step": 356 }, { "epoch": 0.8438518518518519, "step": 356, "train/combined_loss": 0.05890939268283546, "train/cross_entropy_loss": 0.02049823058769107, "train/kl_divergence_loss": 0.097320556640625, "train/step_duration_seconds": 7.175466537475586, "train/steps_per_hour": 308.1960306908749, "train/total_elapsed_hours": 1.155108971397082 }, { "epoch": 0.8462222222222222, "grad_norm": 3.640625, "learning_rate": 1.424049425223405e-06, "loss": 0.9426, "step": 357 }, { "epoch": 0.8462222222222222, "step": 357, "train/combined_loss": 0.06022225972265005, "train/cross_entropy_loss": 0.020865662721917033, "train/kl_divergence_loss": 0.099578857421875, "train/step_duration_seconds": 7.179744005203247, "train/steps_per_hour": 308.5290537144967, "train/total_elapsed_hours": 1.1571033447318606 }, { "epoch": 0.8485925925925926, "grad_norm": 1.8671875, "learning_rate": 1.3816000406683604e-06, "loss": 0.9636, "step": 358 }, { "epoch": 0.8485925925925926, "step": 358, "train/combined_loss": 0.06817956361919641, "train/cross_entropy_loss": 0.021460447693243623, "train/kl_divergence_loss": 0.114898681640625, "train/step_duration_seconds": 7.1635658740997314, "train/steps_per_hour": 308.8621282082033, "train/total_elapsed_hours": 1.1590932241413328 }, { "epoch": 0.8509629629629629, "grad_norm": 3.296875, "learning_rate": 1.339745962155613e-06, "loss": 1.0909, "step": 359 }, { "epoch": 0.8509629629629629, "step": 359, "train/combined_loss": 0.05856375303119421, "train/cross_entropy_loss": 0.020386786898598075, "train/kl_divergence_loss": 0.09674072265625, "train/step_duration_seconds": 7.170348644256592, "train/steps_per_hour": 309.19355931513985, "train/total_elapsed_hours": 1.1610849876536264 }, { "epoch": 0.8533333333333334, "grad_norm": 2.328125, "learning_rate": 1.2984900807073919e-06, "loss": 0.937, "step": 360 }, { "epoch": 0.8533333333333334, "step": 360, "train/combined_loss": 0.05872082710266113, "train/cross_entropy_loss": 0.020578863797709346, "train/kl_divergence_loss": 0.09686279296875, "train/step_duration_seconds": 7.177663326263428, "train/steps_per_hour": 309.5233145467673, "train/total_elapsed_hours": 1.1630787830220328 }, { "epoch": 0.8557037037037037, "grad_norm": 1.4140625, "learning_rate": 1.2578352460261456e-06, "loss": 0.9395, "step": 361 }, { "epoch": 0.8557037037037037, "step": 361, "train/combined_loss": 0.05911425780504942, "train/cross_entropy_loss": 0.02054174430668354, "train/kl_divergence_loss": 0.097686767578125, "train/step_duration_seconds": 7.158376932144165, "train/steps_per_hour": 309.8533659473708, "train/total_elapsed_hours": 1.1650672210587396 }, { "epoch": 0.8580740740740741, "grad_norm": 1.6953125, "learning_rate": 1.2177842662977136e-06, "loss": 0.9458, "step": 362 }, { "epoch": 0.8580740740740741, "step": 362, "train/combined_loss": 0.06256715022027493, "train/cross_entropy_loss": 0.02085573854856193, "train/kl_divergence_loss": 0.104278564453125, "train/step_duration_seconds": 7.160759925842285, "train/steps_per_hour": 310.182116727655, "train/total_elapsed_hours": 1.1670563210381402 }, { "epoch": 0.8604444444444445, "grad_norm": 2.109375, "learning_rate": 1.1783399079973578e-06, "loss": 1.0011, "step": 363 }, { "epoch": 0.8604444444444445, "step": 363, "train/combined_loss": 0.05845469981431961, "train/cross_entropy_loss": 0.02022971585392952, "train/kl_divergence_loss": 0.0966796875, "train/step_duration_seconds": 7.147622346878052, "train/steps_per_hour": 310.51071808599374, "train/total_elapsed_hours": 1.1690417716900507 }, { "epoch": 0.8628148148148148, "grad_norm": 1.40625, "learning_rate": 1.1395048956986577e-06, "loss": 0.9353, "step": 364 }, { "epoch": 0.8628148148148148, "step": 364, "train/combined_loss": 0.05994793586432934, "train/cross_entropy_loss": 0.020683227223344147, "train/kl_divergence_loss": 0.099212646484375, "train/step_duration_seconds": 7.1444926261901855, "train/steps_per_hour": 310.83843593717893, "train/total_elapsed_hours": 1.1710263529751035 }, { "epoch": 0.8651851851851852, "grad_norm": 5.96875, "learning_rate": 1.1012819118853147e-06, "loss": 0.9592, "step": 365 }, { "epoch": 0.8651851851851852, "step": 365, "train/combined_loss": 0.05859701009467244, "train/cross_entropy_loss": 0.02045329543761909, "train/kl_divergence_loss": 0.09674072265625, "train/step_duration_seconds": 7.179033279418945, "train/steps_per_hour": 311.1624997276206, "train/total_elapsed_hours": 1.1730205288860533 }, { "epoch": 0.8675555555555555, "grad_norm": 1.375, "learning_rate": 1.0636735967658785e-06, "loss": 0.9376, "step": 366 }, { "epoch": 0.8675555555555555, "step": 366, "train/combined_loss": 0.06006666086614132, "train/cross_entropy_loss": 0.020523948594927788, "train/kl_divergence_loss": 0.099609375, "train/step_duration_seconds": 7.176509141921997, "train/steps_per_hour": 311.48564941676807, "train/total_elapsed_hours": 1.1750140036476984 }, { "epoch": 0.8699259259259259, "grad_norm": 1.8828125, "learning_rate": 1.026682548091361e-06, "loss": 0.9611, "step": 367 }, { "epoch": 0.8699259259259259, "step": 367, "train/combined_loss": 0.0603836253285408, "train/cross_entropy_loss": 0.020761147141456604, "train/kl_divergence_loss": 0.100006103515625, "train/step_duration_seconds": 7.1585469245910645, "train/steps_per_hour": 311.8090262847088, "train/total_elapsed_hours": 1.1770024889045292 }, { "epoch": 0.8722962962962963, "grad_norm": 2.28125, "learning_rate": 9.903113209758098e-07, "loss": 0.9661, "step": 368 }, { "epoch": 0.8722962962962963, "step": 368, "train/combined_loss": 0.061565724201500416, "train/cross_entropy_loss": 0.020806010346859694, "train/kl_divergence_loss": 0.102325439453125, "train/step_duration_seconds": 7.178318738937378, "train/steps_per_hour": 312.1298583233999, "train/total_elapsed_hours": 1.1789964663320118 }, { "epoch": 0.8746666666666667, "grad_norm": 2.21875, "learning_rate": 9.545624277198085e-07, "loss": 0.9851, "step": 369 }, { "epoch": 0.8746666666666667, "step": 369, "train/combined_loss": 0.05880419351160526, "train/cross_entropy_loss": 0.02074559754692018, "train/kl_divergence_loss": 0.09686279296875, "train/step_duration_seconds": 7.1789703369140625, "train/steps_per_hour": 312.44955909393565, "train/total_elapsed_hours": 1.1809906247589324 }, { "epoch": 0.8770370370370371, "grad_norm": 2.8125, "learning_rate": 9.194383376369509e-07, "loss": 0.9409, "step": 370 }, { "epoch": 0.8770370370370371, "step": 370, "train/combined_loss": 0.059380816062912345, "train/cross_entropy_loss": 0.0207696893485263, "train/kl_divergence_loss": 0.097991943359375, "train/step_duration_seconds": 7.177523374557495, "train/steps_per_hour": 312.7682882917324, "train/total_elapsed_hours": 1.182984381251865 }, { "epoch": 0.8794074074074074, "grad_norm": 1.765625, "learning_rate": 8.849414768832687e-07, "loss": 0.9501, "step": 371 }, { "epoch": 0.8794074074074074, "step": 371, "train/combined_loss": 0.058346427977085114, "train/cross_entropy_loss": 0.02050144597887993, "train/kl_divergence_loss": 0.09619140625, "train/step_duration_seconds": 7.176830053329468, "train/steps_per_hour": 313.08599583369795, "train/total_elapsed_hours": 1.1849779451555675 }, { "epoch": 0.8817777777777778, "grad_norm": 2.546875, "learning_rate": 8.510742282896545e-07, "loss": 0.9335, "step": 372 }, { "epoch": 0.8817777777777778, "step": 372, "train/combined_loss": 0.061172885121777654, "train/cross_entropy_loss": 0.02075275091920048, "train/kl_divergence_loss": 0.101593017578125, "train/step_duration_seconds": 7.177177906036377, "train/steps_per_hour": 313.40261065917605, "train/total_elapsed_hours": 1.1869716056850221 }, { "epoch": 0.8841481481481481, "grad_norm": 2.578125, "learning_rate": 8.178389311972612e-07, "loss": 0.9788, "step": 373 }, { "epoch": 0.8841481481481481, "step": 373, "train/combined_loss": 0.06040294258855283, "train/cross_entropy_loss": 0.02064719540067017, "train/kl_divergence_loss": 0.10015869140625, "train/step_duration_seconds": 7.17975640296936, "train/steps_per_hour": 313.7179746952193, "train/total_elapsed_hours": 1.1889659824636247 }, { "epoch": 0.8865185185185185, "grad_norm": 2.109375, "learning_rate": 7.852378812959227e-07, "loss": 0.9664, "step": 374 }, { "epoch": 0.8865185185185185, "step": 374, "train/combined_loss": 0.061964265536516905, "train/cross_entropy_loss": 0.021664125844836235, "train/kl_divergence_loss": 0.102264404296875, "train/step_duration_seconds": 7.165625333786011, "train/steps_per_hour": 314.0333175421587, "train/total_elapsed_hours": 1.1909564339452319 }, { "epoch": 0.8888888888888888, "grad_norm": 3.921875, "learning_rate": 7.532733304655848e-07, "loss": 0.9914, "step": 375 }, { "epoch": 0.8888888888888888, "step": 375, "train/combined_loss": 0.0588826653547585, "train/cross_entropy_loss": 0.020475288503803313, "train/kl_divergence_loss": 0.0972900390625, "train/step_duration_seconds": 7.169605493545532, "train/steps_per_hour": 314.34731674868476, "train/total_elapsed_hours": 1.1929479910267724 }, { "epoch": 0.8912592592592593, "grad_norm": 1.4453125, "learning_rate": 7.219474866207465e-07, "loss": 0.9421, "step": 376 }, { "epoch": 0.8912592592592593, "step": 376, "train/combined_loss": 0.05903471680358052, "train/cross_entropy_loss": 0.02038266253657639, "train/kl_divergence_loss": 0.097686767578125, "train/step_duration_seconds": 7.174905776977539, "train/steps_per_hour": 314.65988159919425, "train/total_elapsed_hours": 1.194941020409266 }, { "epoch": 0.8936296296296297, "grad_norm": 2.6875, "learning_rate": 6.912625135579587e-07, "loss": 0.9446, "step": 377 }, { "epoch": 0.8936296296296297, "step": 377, "train/combined_loss": 0.06489993864670396, "train/cross_entropy_loss": 0.021004713140428066, "train/kl_divergence_loss": 0.108795166015625, "train/step_duration_seconds": 7.203597068786621, "train/steps_per_hour": 314.96930831081517, "train/total_elapsed_hours": 1.1969420195950402 }, { "epoch": 0.896, "grad_norm": 1.5, "learning_rate": 6.612205308063646e-07, "loss": 1.0384, "step": 378 }, { "epoch": 0.896, "step": 378, "train/combined_loss": 0.06032265955582261, "train/cross_entropy_loss": 0.020669732824899256, "train/kl_divergence_loss": 0.0999755859375, "train/step_duration_seconds": 7.174722909927368, "train/steps_per_hour": 315.27981131041514, "train/total_elapsed_hours": 1.1989349981811313 }, { "epoch": 0.8983703703703704, "grad_norm": 1.28125, "learning_rate": 6.318236134812917e-07, "loss": 0.9652, "step": 379 }, { "epoch": 0.8983703703703704, "step": 379, "train/combined_loss": 0.060588925145566463, "train/cross_entropy_loss": 0.02068346820306033, "train/kl_divergence_loss": 0.100494384765625, "train/step_duration_seconds": 7.157525300979614, "train/steps_per_hour": 315.5905391030105, "train/total_elapsed_hours": 1.2009231996536256 }, { "epoch": 0.9007407407407407, "grad_norm": 2.03125, "learning_rate": 6.030737921409169e-07, "loss": 0.9694, "step": 380 }, { "epoch": 0.9007407407407407, "step": 380, "train/combined_loss": 0.05855354503728449, "train/cross_entropy_loss": 0.020335851586423814, "train/kl_divergence_loss": 0.096771240234375, "train/step_duration_seconds": 7.176971912384033, "train/steps_per_hour": 315.89882115214573, "train/total_elapsed_hours": 1.2029168029626212 }, { "epoch": 0.9031111111111111, "grad_norm": 2.109375, "learning_rate": 5.749730526460073e-07, "loss": 0.9369, "step": 381 }, { "epoch": 0.9031111111111111, "step": 381, "train/combined_loss": 0.05888266093097627, "train/cross_entropy_loss": 0.02044476370792836, "train/kl_divergence_loss": 0.097320556640625, "train/step_duration_seconds": 7.179260730743408, "train/steps_per_hour": 316.20591620635895, "train/total_elapsed_hours": 1.2049110420544942 }, { "epoch": 0.9054814814814814, "grad_norm": 1.78125, "learning_rate": 5.475233360227516e-07, "loss": 0.9421, "step": 382 }, { "epoch": 0.9054814814814814, "step": 382, "train/combined_loss": 0.059826530516147614, "train/cross_entropy_loss": 0.020623518154025078, "train/kl_divergence_loss": 0.099029541015625, "train/step_duration_seconds": 7.175317049026489, "train/steps_per_hour": 316.512283686395, "train/total_elapsed_hours": 1.2069041856792238 }, { "epoch": 0.9078518518518518, "grad_norm": 3.328125, "learning_rate": 5.207265383286831e-07, "loss": 0.9572, "step": 383 }, { "epoch": 0.9078518518518518, "step": 383, "train/combined_loss": 0.06832170393317938, "train/cross_entropy_loss": 0.02159213600680232, "train/kl_divergence_loss": 0.11505126953125, "train/step_duration_seconds": 7.181811809539795, "train/steps_per_hour": 316.8171681300854, "train/total_elapsed_hours": 1.208899133404096 }, { "epoch": 0.9102222222222223, "grad_norm": 2.3125, "learning_rate": 4.945845105217118e-07, "loss": 1.0931, "step": 384 }, { "epoch": 0.9102222222222223, "step": 384, "train/combined_loss": 0.06399638252332807, "train/cross_entropy_loss": 0.021303314133547246, "train/kl_divergence_loss": 0.106689453125, "train/step_duration_seconds": 7.173643112182617, "train/steps_per_hour": 317.1216422308217, "train/total_elapsed_hours": 1.210891812046369 }, { "epoch": 0.9125925925925926, "grad_norm": 2.109375, "learning_rate": 4.6909905833226965e-07, "loss": 1.0239, "step": 385 }, { "epoch": 0.9125925925925926, "step": 385, "train/combined_loss": 0.058574909809976816, "train/cross_entropy_loss": 0.02040909300558269, "train/kl_divergence_loss": 0.09674072265625, "train/step_duration_seconds": 7.179981470108032, "train/steps_per_hour": 317.4246550931726, "train/total_elapsed_hours": 1.2128862513436212 }, { "epoch": 0.914962962962963, "grad_norm": 1.4140625, "learning_rate": 4.4427194213859216e-07, "loss": 0.9372, "step": 386 }, { "epoch": 0.914962962962963, "step": 386, "train/combined_loss": 0.059789648512378335, "train/cross_entropy_loss": 0.020763377659022808, "train/kl_divergence_loss": 0.09881591796875, "train/step_duration_seconds": 7.180516481399536, "train/steps_per_hour": 317.72663419127423, "train/total_elapsed_hours": 1.214880839255121 }, { "epoch": 0.9173333333333333, "grad_norm": 4.0625, "learning_rate": 4.2010487684511105e-07, "loss": 0.9566, "step": 387 }, { "epoch": 0.9173333333333333, "step": 387, "train/combined_loss": 0.06063654413446784, "train/cross_entropy_loss": 0.020656629814766347, "train/kl_divergence_loss": 0.100616455078125, "train/step_duration_seconds": 7.174912929534912, "train/steps_per_hour": 318.0280301371019, "train/total_elapsed_hours": 1.2168738706244362 }, { "epoch": 0.9197037037037037, "grad_norm": 1.765625, "learning_rate": 3.965995317640026e-07, "loss": 0.9702, "step": 388 }, { "epoch": 0.9197037037037037, "step": 388, "train/combined_loss": 0.05888652987778187, "train/cross_entropy_loss": 0.020452499389648438, "train/kl_divergence_loss": 0.097320556640625, "train/step_duration_seconds": 7.1626060009002686, "train/steps_per_hour": 318.32933325469384, "train/total_elapsed_hours": 1.218863483402464 }, { "epoch": 0.922074074074074, "grad_norm": 1.3984375, "learning_rate": 3.7375753049987974e-07, "loss": 0.9422, "step": 389 }, { "epoch": 0.922074074074074, "step": 389, "train/combined_loss": 0.0582260861992836, "train/cross_entropy_loss": 0.0202607661485672, "train/kl_divergence_loss": 0.09619140625, "train/step_duration_seconds": 7.176948070526123, "train/steps_per_hour": 318.6286145563663, "train/total_elapsed_hours": 1.2208570800887213 }, { "epoch": 0.9244444444444444, "grad_norm": 1.3359375, "learning_rate": 3.515804508376508e-07, "loss": 0.9316, "step": 390 }, { "epoch": 0.9244444444444444, "step": 390, "train/combined_loss": 0.06077071442268789, "train/cross_entropy_loss": 0.0208639376796782, "train/kl_divergence_loss": 0.100677490234375, "train/step_duration_seconds": 7.199989318847656, "train/steps_per_hour": 318.9252507888751, "train/total_elapsed_hours": 1.2228570771217346 }, { "epoch": 0.9268148148148149, "grad_norm": 1.5078125, "learning_rate": 3.3006982463352764e-07, "loss": 0.9723, "step": 391 }, { "epoch": 0.9268148148148149, "step": 391, "train/combined_loss": 0.06101406365633011, "train/cross_entropy_loss": 0.021350633818656206, "train/kl_divergence_loss": 0.100677490234375, "train/step_duration_seconds": 7.170354127883911, "train/steps_per_hour": 319.2230637303404, "train/total_elapsed_hours": 1.2248488421572579 }, { "epoch": 0.9291851851851852, "grad_norm": 3.0, "learning_rate": 3.0922713770922155e-07, "loss": 0.9762, "step": 392 }, { "epoch": 0.9291851851851852, "step": 392, "train/combined_loss": 0.0617949569132179, "train/cross_entropy_loss": 0.02153913036454469, "train/kl_divergence_loss": 0.10205078125, "train/step_duration_seconds": 7.201368808746338, "train/steps_per_hour": 319.5176659434464, "train/total_elapsed_hours": 1.2268492223819096 }, { "epoch": 0.9315555555555556, "grad_norm": 2.125, "learning_rate": 2.8905382974930173e-07, "loss": 0.9887, "step": 393 }, { "epoch": 0.9315555555555556, "step": 393, "train/combined_loss": 0.06196058611385524, "train/cross_entropy_loss": 0.021473662578500807, "train/kl_divergence_loss": 0.102447509765625, "train/step_duration_seconds": 7.19616436958313, "train/steps_per_hour": 319.81168526316264, "train/total_elapsed_hours": 1.2288481569290162 }, { "epoch": 0.9339259259259259, "grad_norm": 2.75, "learning_rate": 2.6955129420176193e-07, "loss": 0.9914, "step": 394 }, { "epoch": 0.9339259259259259, "step": 394, "train/combined_loss": 0.06139179831370711, "train/cross_entropy_loss": 0.0205191905843094, "train/kl_divergence_loss": 0.102264404296875, "train/step_duration_seconds": 7.170999526977539, "train/steps_per_hour": 320.10656754066326, "train/total_elapsed_hours": 1.2308401012420653 }, { "epoch": 0.9362962962962963, "grad_norm": 1.375, "learning_rate": 2.507208781817638e-07, "loss": 0.9823, "step": 395 }, { "epoch": 0.9362962962962963, "step": 395, "train/combined_loss": 0.05912953009828925, "train/cross_entropy_loss": 0.020389189943671227, "train/kl_divergence_loss": 0.097869873046875, "train/step_duration_seconds": 7.163103818893433, "train/steps_per_hour": 320.4010669129712, "train/total_elapsed_hours": 1.2328298523028691 }, { "epoch": 0.9386666666666666, "grad_norm": 1.46875, "learning_rate": 2.3256388237858806e-07, "loss": 0.9461, "step": 396 }, { "epoch": 0.9386666666666666, "step": 396, "train/combined_loss": 0.05818613991141319, "train/cross_entropy_loss": 0.020180873572826385, "train/kl_divergence_loss": 0.09619140625, "train/step_duration_seconds": 7.165131568908691, "train/steps_per_hour": 320.6944709054444, "train/total_elapsed_hours": 1.234820166627566 }, { "epoch": 0.941037037037037, "grad_norm": 1.9609375, "learning_rate": 2.1508156096578748e-07, "loss": 0.931, "step": 397 }, { "epoch": 0.941037037037037, "step": 397, "train/combined_loss": 0.06258085602894425, "train/cross_entropy_loss": 0.02100521558895707, "train/kl_divergence_loss": 0.104156494140625, "train/step_duration_seconds": 7.152077913284302, "train/steps_per_hour": 320.9878716432912, "train/total_elapsed_hours": 1.2368068549368116 }, { "epoch": 0.9434074074074074, "grad_norm": 2.71875, "learning_rate": 1.9827512151456175e-07, "loss": 1.0013, "step": 398 }, { "epoch": 0.9434074074074074, "step": 398, "train/combined_loss": 0.0631152824498713, "train/cross_entropy_loss": 0.020975436200387776, "train/kl_divergence_loss": 0.105255126953125, "train/step_duration_seconds": 7.156713008880615, "train/steps_per_hour": 321.2799973921229, "train/total_elapsed_hours": 1.2387948307726118 }, { "epoch": 0.9457777777777778, "grad_norm": 1.671875, "learning_rate": 1.82145724910342e-07, "loss": 1.0098, "step": 399 }, { "epoch": 0.9457777777777778, "step": 399, "train/combined_loss": 0.05855529848486185, "train/cross_entropy_loss": 0.020369877573102713, "train/kl_divergence_loss": 0.09674072265625, "train/step_duration_seconds": 7.177468776702881, "train/steps_per_hour": 321.56969283241614, "train/total_elapsed_hours": 1.2407885720994738 }, { "epoch": 0.9481481481481482, "grad_norm": 1.3203125, "learning_rate": 1.6669448527260602e-07, "loss": 0.9369, "step": 400 }, { "epoch": 0.9481481481481482, "eval_combined_loss": 0.06070284065480033, "eval_cross_entropy_loss": 0.020584717767934003, "eval_kl_divergence_loss": 0.10082096354166667, "eval_loss": 0.060702841728925705, "eval_runtime": 220.2816, "eval_samples_per_second": 6.809, "eval_steps_per_second": 3.405, "step": 400 }, { "epoch": 0.9481481481481482, "step": 400, "train/combined_loss": 0.059124535880982876, "train/cross_entropy_loss": 0.020409714779816568, "train/kl_divergence_loss": 0.09783935546875, "train/step_duration_seconds": 227.46868801116943, "train/steps_per_hour": 306.75450754086955, "train/total_elapsed_hours": 1.303974318769243 }, { "epoch": 0.9505185185185185, "grad_norm": 2.390625, "learning_rate": 1.519224698779198e-07, "loss": 0.946, "step": 401 }, { "epoch": 0.9505185185185185, "step": 401, "train/combined_loss": 0.0637058550491929, "train/cross_entropy_loss": 0.020966400275938213, "train/kl_divergence_loss": 0.1064453125, "train/step_duration_seconds": 7.181180477142334, "train/steps_per_hour": 307.05167735238854, "train/total_elapsed_hours": 1.3059690911240047 }, { "epoch": 0.9528888888888889, "grad_norm": 2.125, "learning_rate": 1.3783069908621772e-07, "loss": 1.0193, "step": 402 }, { "epoch": 0.9528888888888889, "step": 402, "train/combined_loss": 0.06105339783243835, "train/cross_entropy_loss": 0.0205748132430017, "train/kl_divergence_loss": 0.101531982421875, "train/step_duration_seconds": 7.198254823684692, "train/steps_per_hour": 307.3468262521633, "train/total_elapsed_hours": 1.307968606352806 }, { "epoch": 0.9552592592592593, "grad_norm": 1.6484375, "learning_rate": 1.2442014627032318e-07, "loss": 0.9769, "step": 403 }, { "epoch": 0.9552592592592593, "step": 403, "train/combined_loss": 0.0588757898658514, "train/cross_entropy_loss": 0.02040050830692053, "train/kl_divergence_loss": 0.09735107421875, "train/step_duration_seconds": 7.179206609725952, "train/steps_per_hour": 307.6423167469107, "train/total_elapsed_hours": 1.3099628304110633 }, { "epoch": 0.9576296296296296, "grad_norm": 1.734375, "learning_rate": 1.1169173774871478e-07, "loss": 0.942, "step": 404 }, { "epoch": 0.9576296296296296, "step": 404, "train/combined_loss": 0.059780715964734554, "train/cross_entropy_loss": 0.020562409423291683, "train/kl_divergence_loss": 0.0989990234375, "train/step_duration_seconds": 7.192591905593872, "train/steps_per_hour": 307.93603622552814, "train/total_elapsed_hours": 1.3119607726070617 }, { "epoch": 0.96, "grad_norm": 2.953125, "learning_rate": 9.964635272153633e-08, "loss": 0.9565, "step": 405 }, { "epoch": 0.96, "step": 405, "train/combined_loss": 0.0602156778331846, "train/cross_entropy_loss": 0.02063887706026435, "train/kl_divergence_loss": 0.09979248046875, "train/step_duration_seconds": 7.179564952850342, "train/steps_per_hour": 308.22971132705254, "train/total_elapsed_hours": 1.3139550962050757 }, { "epoch": 0.9623703703703703, "grad_norm": 1.125, "learning_rate": 8.82848232098732e-08, "loss": 0.9635, "step": 406 }, { "epoch": 0.9623703703703703, "step": 406, "train/combined_loss": 0.05820171535015106, "train/cross_entropy_loss": 0.020212026312947273, "train/kl_divergence_loss": 0.09619140625, "train/step_duration_seconds": 7.195192575454712, "train/steps_per_hour": 308.5214785588609, "train/total_elapsed_hours": 1.3159537608093685 }, { "epoch": 0.9647407407407408, "grad_norm": 1.328125, "learning_rate": 7.760793399827937e-08, "loss": 0.9312, "step": 407 }, { "epoch": 0.9647407407407408, "step": 407, "train/combined_loss": 0.061329676769673824, "train/cross_entropy_loss": 0.02094426902476698, "train/kl_divergence_loss": 0.101715087890625, "train/step_duration_seconds": 7.17832612991333, "train/steps_per_hour": 308.81345865085285, "train/total_elapsed_hours": 1.3179477402899 }, { "epoch": 0.9671111111111111, "grad_norm": 1.8828125, "learning_rate": 6.761642258056977e-08, "loss": 0.9813, "step": 408 }, { "epoch": 0.9671111111111111, "step": 408, "train/combined_loss": 0.06031193025410175, "train/cross_entropy_loss": 0.020617756876163185, "train/kl_divergence_loss": 0.100006103515625, "train/step_duration_seconds": 7.178639888763428, "train/steps_per_hour": 309.1045361691286, "train/total_elapsed_hours": 1.3199418069256676 }, { "epoch": 0.9694814814814815, "grad_norm": 1.421875, "learning_rate": 5.831097910887873e-08, "loss": 0.965, "step": 409 }, { "epoch": 0.9694814814814815, "step": 409, "train/combined_loss": 0.059776231879368424, "train/cross_entropy_loss": 0.020767063251696527, "train/kl_divergence_loss": 0.098785400390625, "train/step_duration_seconds": 7.177491903305054, "train/steps_per_hour": 309.3948101729231, "train/total_elapsed_hours": 1.3219355546765856 }, { "epoch": 0.9718518518518519, "grad_norm": 1.6484375, "learning_rate": 4.9692246345985905e-08, "loss": 0.9564, "step": 410 }, { "epoch": 0.9718518518518519, "step": 410, "train/combined_loss": 0.06045236345380545, "train/cross_entropy_loss": 0.020746038877405226, "train/kl_divergence_loss": 0.10015869140625, "train/step_duration_seconds": 7.197791576385498, "train/steps_per_hour": 309.68289092850614, "train/total_elapsed_hours": 1.3239349412255816 }, { "epoch": 0.9742222222222222, "grad_norm": 1.8125, "learning_rate": 4.176081962092182e-08, "loss": 0.9672, "step": 411 }, { "epoch": 0.9742222222222222, "step": 411, "train/combined_loss": 0.06638129102066159, "train/cross_entropy_loss": 0.021220834576524794, "train/kl_divergence_loss": 0.111541748046875, "train/step_duration_seconds": 7.180782318115234, "train/steps_per_hour": 309.97120742767606, "train/total_elapsed_hours": 1.3259296029806138 }, { "epoch": 0.9765925925925926, "grad_norm": 3.25, "learning_rate": 3.451724678784518e-08, "loss": 1.0621, "step": 412 }, { "epoch": 0.9765925925925926, "step": 412, "train/combined_loss": 0.060466301161795855, "train/cross_entropy_loss": 0.020804431289434433, "train/kl_divergence_loss": 0.100128173828125, "train/step_duration_seconds": 7.173940181732178, "train/steps_per_hour": 310.2591018309556, "train/total_elapsed_hours": 1.327922364142206 }, { "epoch": 0.9789629629629629, "grad_norm": 2.140625, "learning_rate": 2.796202818819871e-08, "loss": 0.9675, "step": 413 }, { "epoch": 0.9789629629629629, "step": 413, "train/combined_loss": 0.06092071859166026, "train/cross_entropy_loss": 0.02085877349600196, "train/kl_divergence_loss": 0.100982666015625, "train/step_duration_seconds": 7.189931392669678, "train/steps_per_hour": 310.5450962243895, "train/total_elapsed_hours": 1.3299195673068365 }, { "epoch": 0.9813333333333333, "grad_norm": 1.3984375, "learning_rate": 2.2095616616150117e-08, "loss": 0.9747, "step": 414 }, { "epoch": 0.9813333333333333, "step": 414, "train/combined_loss": 0.05822563171386719, "train/cross_entropy_loss": 0.020259857177734375, "train/kl_divergence_loss": 0.09619140625, "train/step_duration_seconds": 7.186516523361206, "train/steps_per_hour": 310.8304542928701, "train/total_elapsed_hours": 1.331915821896659 }, { "epoch": 0.9837037037037037, "grad_norm": 2.515625, "learning_rate": 1.6918417287318245e-08, "loss": 0.9316, "step": 415 }, { "epoch": 0.9837037037037037, "step": 415, "train/combined_loss": 0.05974208423867822, "train/cross_entropy_loss": 0.020485147717408836, "train/kl_divergence_loss": 0.0989990234375, "train/step_duration_seconds": 7.198460102081299, "train/steps_per_hour": 311.1141844684285, "train/total_elapsed_hours": 1.333915394147237 }, { "epoch": 0.9860740740740741, "grad_norm": 1.1171875, "learning_rate": 1.2430787810776556e-08, "loss": 0.9559, "step": 416 }, { "epoch": 0.9860740740740741, "step": 416, "train/combined_loss": 0.05951492628082633, "train/cross_entropy_loss": 0.02033600490540266, "train/kl_divergence_loss": 0.09869384765625, "train/step_duration_seconds": 7.179587125778198, "train/steps_per_hour": 311.3982872915823, "train/total_elapsed_hours": 1.3359097239043978 }, { "epoch": 0.9884444444444445, "grad_norm": 1.2578125, "learning_rate": 8.633038164358454e-09, "loss": 0.9522, "step": 417 }, { "epoch": 0.9884444444444445, "step": 417, "train/combined_loss": 0.060025526909157634, "train/cross_entropy_loss": 0.020472198841162026, "train/kl_divergence_loss": 0.099578857421875, "train/step_duration_seconds": 7.1582019329071045, "train/steps_per_hour": 311.6829270070737, "train/total_elapsed_hours": 1.3378981133302053 }, { "epoch": 0.9908148148148148, "grad_norm": 1.3125, "learning_rate": 5.525430673244403e-09, "loss": 0.9604, "step": 418 }, { "epoch": 0.9908148148148148, "step": 418, "train/combined_loss": 0.06418039370328188, "train/cross_entropy_loss": 0.0208778785308823, "train/kl_divergence_loss": 0.10748291015625, "train/step_duration_seconds": 7.19930624961853, "train/steps_per_hour": 311.964063505697, "train/total_elapsed_hours": 1.339897920621766 }, { "epoch": 0.9931851851851852, "grad_norm": 1.4765625, "learning_rate": 3.1081799918375454e-09, "loss": 1.0269, "step": 419 }, { "epoch": 0.9931851851851852, "step": 419, "train/combined_loss": 0.0633124178275466, "train/cross_entropy_loss": 0.020789876696653664, "train/kl_divergence_loss": 0.1058349609375, "train/step_duration_seconds": 7.172863245010376, "train/steps_per_hour": 312.2460712308407, "train/total_elapsed_hours": 1.341890382634269 }, { "epoch": 0.9955555555555555, "grad_norm": 1.3515625, "learning_rate": 1.3814530889433298e-09, "loss": 1.013, "step": 420 }, { "epoch": 0.9955555555555555, "step": 420, "train/combined_loss": 0.05849831993691623, "train/cross_entropy_loss": 0.020225399872288108, "train/kl_divergence_loss": 0.096771240234375, "train/step_duration_seconds": 7.199819087982178, "train/steps_per_hour": 312.5255014342565, "train/total_elapsed_hours": 1.3438903323809306 }, { "epoch": 0.9979259259259259, "grad_norm": 1.015625, "learning_rate": 3.4536923623096353e-10, "loss": 0.936, "step": 421 } ], "logging_steps": 1, "max_steps": 421, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0845947087447654e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }