Malikeh1375's picture
Upload fine-tuned Qwen2.5-1.5B safety and harmful content model
fb4ba08 verified
{
"best_global_step": 400,
"best_metric": 0.060702841728925705,
"best_model_checkpoint": null,
"epoch": 0.9979259259259259,
"eval_steps": 50,
"global_step": 421,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0,
"step": 0,
"train/combined_loss": 4.310243457555771,
"train/cross_entropy_loss": 7.370486944913864,
"train/kl_divergence_loss": 1.25,
"train/step_duration_seconds": 64.96706748008728,
"train/steps_per_hour": 0.0,
"train/total_elapsed_hours": 0.018046407500902813
},
{
"epoch": 0.0023703703703703703,
"grad_norm": 26752.0,
"learning_rate": 0.0,
"loss": 68.9639,
"step": 1
},
{
"epoch": 0.0023703703703703703,
"step": 1,
"train/combined_loss": 4.25145959854126,
"train/cross_entropy_loss": 7.2763566970825195,
"train/kl_divergence_loss": 1.2265625,
"train/step_duration_seconds": 7.274043560028076,
"train/steps_per_hour": 49.83312094637097,
"train/total_elapsed_hours": 0.020066975156466167
},
{
"epoch": 0.004740740740740741,
"grad_norm": 27392.0,
"learning_rate": 4.651162790697675e-07,
"loss": 68.0234,
"step": 2
},
{
"epoch": 0.004740740740740741,
"step": 2,
"train/combined_loss": 4.210591048002243,
"train/cross_entropy_loss": 7.187784105539322,
"train/kl_divergence_loss": 1.2333984375,
"train/step_duration_seconds": 7.055061340332031,
"train/steps_per_hour": 90.79883463670956,
"train/total_elapsed_hours": 0.02202671441766951
},
{
"epoch": 0.0071111111111111115,
"grad_norm": 27264.0,
"learning_rate": 9.30232558139535e-07,
"loss": 67.3695,
"step": 3
},
{
"epoch": 0.0071111111111111115,
"step": 3,
"train/combined_loss": 4.220440715551376,
"train/cross_entropy_loss": 7.188928246498108,
"train/kl_divergence_loss": 1.251953125,
"train/step_duration_seconds": 7.075124740600586,
"train/steps_per_hour": 125.04154064617586,
"train/total_elapsed_hours": 0.023992026845614117
},
{
"epoch": 0.009481481481481481,
"grad_norm": 26240.0,
"learning_rate": 1.3953488372093025e-06,
"loss": 67.5271,
"step": 4
},
{
"epoch": 0.009481481481481481,
"step": 4,
"train/combined_loss": 4.050247877836227,
"train/cross_entropy_loss": 6.869538724422455,
"train/kl_divergence_loss": 1.23095703125,
"train/step_duration_seconds": 7.09927773475647,
"train/steps_per_hour": 154.0591795404681,
"train/total_elapsed_hours": 0.025964048438602023
},
{
"epoch": 0.011851851851851851,
"grad_norm": 27264.0,
"learning_rate": 1.86046511627907e-06,
"loss": 64.804,
"step": 5
},
{
"epoch": 0.011851851851851851,
"step": 5,
"train/combined_loss": 3.8814118206501007,
"train/cross_entropy_loss": 6.518194735050201,
"train/kl_divergence_loss": 1.24462890625,
"train/step_duration_seconds": 7.140614032745361,
"train/steps_per_hour": 178.9065439356696,
"train/total_elapsed_hours": 0.027947552336586846
},
{
"epoch": 0.014222222222222223,
"grad_norm": 28160.0,
"learning_rate": 2.3255813953488376e-06,
"loss": 62.1026,
"step": 6
},
{
"epoch": 0.014222222222222223,
"step": 6,
"train/combined_loss": 3.3225543051958084,
"train/cross_entropy_loss": 5.414151579141617,
"train/kl_divergence_loss": 1.23095703125,
"train/step_duration_seconds": 7.164709091186523,
"train/steps_per_hour": 200.41586755906124,
"train/total_elapsed_hours": 0.02993774930636088
},
{
"epoch": 0.016592592592592593,
"grad_norm": 22912.0,
"learning_rate": 2.790697674418605e-06,
"loss": 53.1609,
"step": 7
},
{
"epoch": 0.016592592592592593,
"step": 7,
"train/combined_loss": 2.8946415334939957,
"train/cross_entropy_loss": 4.530005723237991,
"train/kl_divergence_loss": 1.25927734375,
"train/step_duration_seconds": 7.1137425899505615,
"train/steps_per_hour": 219.34092560159692,
"train/total_elapsed_hours": 0.03191378891468048
},
{
"epoch": 0.018962962962962963,
"grad_norm": 6400.0,
"learning_rate": 3.2558139534883724e-06,
"loss": 46.3143,
"step": 8
},
{
"epoch": 0.018962962962962963,
"step": 8,
"train/combined_loss": 2.7050345838069916,
"train/cross_entropy_loss": 4.093662917613983,
"train/kl_divergence_loss": 1.31640625,
"train/step_duration_seconds": 7.135659217834473,
"train/steps_per_hour": 236.01663067159015,
"train/total_elapsed_hours": 0.03389591647519006
},
{
"epoch": 0.021333333333333333,
"grad_norm": 9088.0,
"learning_rate": 3.72093023255814e-06,
"loss": 43.2806,
"step": 9
},
{
"epoch": 0.021333333333333333,
"step": 9,
"train/combined_loss": 2.491789221763611,
"train/cross_entropy_loss": 3.7003750950098038,
"train/kl_divergence_loss": 1.283203125,
"train/step_duration_seconds": 7.159760475158691,
"train/steps_per_hour": 250.802995746654,
"train/total_elapsed_hours": 0.035884738829400804
},
{
"epoch": 0.023703703703703703,
"grad_norm": 14784.0,
"learning_rate": 4.186046511627907e-06,
"loss": 39.8686,
"step": 10
},
{
"epoch": 0.023703703703703703,
"step": 10,
"train/combined_loss": 2.106148824095726,
"train/cross_entropy_loss": 2.877825006842613,
"train/kl_divergence_loss": 1.33447265625,
"train/step_duration_seconds": 7.181847333908081,
"train/steps_per_hour": 263.99366796803935,
"train/total_elapsed_hours": 0.037879696422153046
},
{
"epoch": 0.026074074074074072,
"grad_norm": 10880.0,
"learning_rate": 4.651162790697675e-06,
"loss": 33.6984,
"step": 11
},
{
"epoch": 0.026074074074074072,
"step": 11,
"train/combined_loss": 1.4576978087425232,
"train/cross_entropy_loss": 1.571645624935627,
"train/kl_divergence_loss": 1.34375,
"train/step_duration_seconds": 7.17723274230957,
"train/steps_per_hour": 275.8733309353751,
"train/total_elapsed_hours": 0.03987337218390571
},
{
"epoch": 0.028444444444444446,
"grad_norm": 4704.0,
"learning_rate": 5.116279069767442e-06,
"loss": 23.3232,
"step": 12
},
{
"epoch": 0.028444444444444446,
"step": 12,
"train/combined_loss": 1.0812036469578743,
"train/cross_entropy_loss": 0.8298877663910389,
"train/kl_divergence_loss": 1.33251953125,
"train/step_duration_seconds": 7.202847242355347,
"train/steps_per_hour": 286.5728916364825,
"train/total_elapsed_hours": 0.04187416308455997
},
{
"epoch": 0.030814814814814816,
"grad_norm": 1288.0,
"learning_rate": 5.58139534883721e-06,
"loss": 17.2993,
"step": 13
},
{
"epoch": 0.030814814814814816,
"step": 13,
"train/combined_loss": 0.917561486363411,
"train/cross_entropy_loss": 0.5206698887050152,
"train/kl_divergence_loss": 1.314453125,
"train/step_duration_seconds": 7.210479736328125,
"train/steps_per_hour": 296.28229001155563,
"train/total_elapsed_hours": 0.043877074122428895
},
{
"epoch": 0.033185185185185186,
"grad_norm": 1064.0,
"learning_rate": 6.046511627906977e-06,
"loss": 14.681,
"step": 14
},
{
"epoch": 0.033185185185185186,
"step": 14,
"train/combined_loss": 0.8026629276573658,
"train/cross_entropy_loss": 0.42270869202911854,
"train/kl_divergence_loss": 1.1826171875,
"train/step_duration_seconds": 7.176325559616089,
"train/steps_per_hour": 305.20706432882776,
"train/total_elapsed_hours": 0.04587049788898892
},
{
"epoch": 0.035555555555555556,
"grad_norm": 680.0,
"learning_rate": 6.511627906976745e-06,
"loss": 12.8426,
"step": 15
},
{
"epoch": 0.035555555555555556,
"step": 15,
"train/combined_loss": 0.7154323048889637,
"train/cross_entropy_loss": 0.39229039661586285,
"train/kl_divergence_loss": 1.03857421875,
"train/step_duration_seconds": 7.2133026123046875,
"train/steps_per_hour": 313.3212079729679,
"train/total_elapsed_hours": 0.047874193059073554
},
{
"epoch": 0.037925925925925925,
"grad_norm": 516.0,
"learning_rate": 6.976744186046513e-06,
"loss": 11.4469,
"step": 16
},
{
"epoch": 0.037925925925925925,
"step": 16,
"train/combined_loss": 0.6362243294715881,
"train/cross_entropy_loss": 0.37010490894317627,
"train/kl_divergence_loss": 0.90234375,
"train/step_duration_seconds": 7.195964336395264,
"train/steps_per_hour": 320.81440635372326,
"train/total_elapsed_hours": 0.04987307204140557
},
{
"epoch": 0.040296296296296295,
"grad_norm": 506.0,
"learning_rate": 7.44186046511628e-06,
"loss": 10.1796,
"step": 17
},
{
"epoch": 0.040296296296296295,
"step": 17,
"train/combined_loss": 0.5937556512653828,
"train/cross_entropy_loss": 0.3642691094428301,
"train/kl_divergence_loss": 0.8232421875,
"train/step_duration_seconds": 7.201917409896851,
"train/steps_per_hour": 327.71965844625936,
"train/total_elapsed_hours": 0.05187360465526581
},
{
"epoch": 0.042666666666666665,
"grad_norm": 324.0,
"learning_rate": 7.906976744186048e-06,
"loss": 9.5001,
"step": 18
},
{
"epoch": 0.042666666666666665,
"step": 18,
"train/combined_loss": 0.5263483263552189,
"train/cross_entropy_loss": 0.314903661608696,
"train/kl_divergence_loss": 0.73779296875,
"train/step_duration_seconds": 7.208310604095459,
"train/steps_per_hour": 334.1010656793036,
"train/total_elapsed_hours": 0.053875913156403436
},
{
"epoch": 0.045037037037037035,
"grad_norm": 219.0,
"learning_rate": 8.372093023255815e-06,
"loss": 8.4216,
"step": 19
},
{
"epoch": 0.045037037037037035,
"step": 19,
"train/combined_loss": 0.43465932086110115,
"train/cross_entropy_loss": 0.2343088500201702,
"train/kl_divergence_loss": 0.635009765625,
"train/step_duration_seconds": 7.1786744594573975,
"train/steps_per_hour": 340.075239062999,
"train/total_elapsed_hours": 0.0558699893951416
},
{
"epoch": 0.047407407407407405,
"grad_norm": 146.0,
"learning_rate": 8.837209302325582e-06,
"loss": 6.9546,
"step": 20
},
{
"epoch": 0.047407407407407405,
"step": 20,
"train/combined_loss": 0.3771132677793503,
"train/cross_entropy_loss": 0.18440232705324888,
"train/kl_divergence_loss": 0.56982421875,
"train/step_duration_seconds": 7.214315176010132,
"train/steps_per_hour": 345.57852934665823,
"train/total_elapsed_hours": 0.057873965832922196
},
{
"epoch": 0.049777777777777775,
"grad_norm": 109.5,
"learning_rate": 9.30232558139535e-06,
"loss": 6.0338,
"step": 21
},
{
"epoch": 0.049777777777777775,
"step": 21,
"train/combined_loss": 0.3578077703714371,
"train/cross_entropy_loss": 0.15311553701758385,
"train/kl_divergence_loss": 0.5625,
"train/step_duration_seconds": 7.201047420501709,
"train/steps_per_hour": 350.73504254237196,
"train/total_elapsed_hours": 0.05987425678306156
},
{
"epoch": 0.052148148148148145,
"grad_norm": 100.5,
"learning_rate": 9.767441860465117e-06,
"loss": 5.7249,
"step": 22
},
{
"epoch": 0.052148148148148145,
"step": 22,
"train/combined_loss": 0.31270523741841316,
"train/cross_entropy_loss": 0.11625519627705216,
"train/kl_divergence_loss": 0.5091552734375,
"train/step_duration_seconds": 7.181376218795776,
"train/steps_per_hour": 355.5895570404658,
"train/total_elapsed_hours": 0.06186908351050483
},
{
"epoch": 0.05451851851851852,
"grad_norm": 75.0,
"learning_rate": 1.0232558139534884e-05,
"loss": 5.0033,
"step": 23
},
{
"epoch": 0.05451851851851852,
"step": 23,
"train/combined_loss": 0.28502367064356804,
"train/cross_entropy_loss": 0.10581392887979746,
"train/kl_divergence_loss": 0.4642333984375,
"train/step_duration_seconds": 7.173454284667969,
"train/steps_per_hour": 360.15321399626356,
"train/total_elapsed_hours": 0.06386170970069037
},
{
"epoch": 0.05688888888888889,
"grad_norm": 73.5,
"learning_rate": 1.0697674418604651e-05,
"loss": 4.5604,
"step": 24
},
{
"epoch": 0.05688888888888889,
"step": 24,
"train/combined_loss": 0.2662050947546959,
"train/cross_entropy_loss": 0.09075977467000484,
"train/kl_divergence_loss": 0.441650390625,
"train/step_duration_seconds": 7.196916818618774,
"train/steps_per_hour": 364.404632061752,
"train/total_elapsed_hours": 0.06586085326141781
},
{
"epoch": 0.05925925925925926,
"grad_norm": 53.5,
"learning_rate": 1.116279069767442e-05,
"loss": 4.2593,
"step": 25
},
{
"epoch": 0.05925925925925926,
"step": 25,
"train/combined_loss": 0.2555910013616085,
"train/cross_entropy_loss": 0.07075231382623315,
"train/kl_divergence_loss": 0.4404296875,
"train/step_duration_seconds": 7.2051169872283936,
"train/steps_per_hour": 368.39319239049337,
"train/total_elapsed_hours": 0.06786227464675904
},
{
"epoch": 0.06162962962962963,
"grad_norm": 45.0,
"learning_rate": 1.1627906976744187e-05,
"loss": 4.0895,
"step": 26
},
{
"epoch": 0.06162962962962963,
"step": 26,
"train/combined_loss": 0.25060202460736036,
"train/cross_entropy_loss": 0.06773236347362399,
"train/kl_divergence_loss": 0.4334716796875,
"train/step_duration_seconds": 7.195337772369385,
"train/steps_per_hour": 372.1676986924219,
"train/total_elapsed_hours": 0.06986097958352831
},
{
"epoch": 0.064,
"grad_norm": 34.75,
"learning_rate": 1.2093023255813954e-05,
"loss": 4.0096,
"step": 27
},
{
"epoch": 0.064,
"step": 27,
"train/combined_loss": 0.22914704959839582,
"train/cross_entropy_loss": 0.06779117416590452,
"train/kl_divergence_loss": 0.3905029296875,
"train/step_duration_seconds": 7.1646952629089355,
"train/steps_per_hour": 375.7767476973663,
"train/total_elapsed_hours": 0.07185117271211412
},
{
"epoch": 0.06637037037037037,
"grad_norm": 48.5,
"learning_rate": 1.2558139534883723e-05,
"loss": 3.6664,
"step": 28
},
{
"epoch": 0.06637037037037037,
"step": 28,
"train/combined_loss": 0.22941692359745502,
"train/cross_entropy_loss": 0.0525838378816843,
"train/kl_divergence_loss": 0.40625,
"train/step_duration_seconds": 7.199108123779297,
"train/steps_per_hour": 379.1421706885833,
"train/total_elapsed_hours": 0.07385092496871948
},
{
"epoch": 0.06874074074074074,
"grad_norm": 72.5,
"learning_rate": 1.302325581395349e-05,
"loss": 3.6707,
"step": 29
},
{
"epoch": 0.06874074074074074,
"step": 29,
"train/combined_loss": 0.22196358162909746,
"train/cross_entropy_loss": 0.048419348895549774,
"train/kl_divergence_loss": 0.3955078125,
"train/step_duration_seconds": 7.179231882095337,
"train/steps_per_hour": 382.35797131195636,
"train/total_elapsed_hours": 0.0758451560470793
},
{
"epoch": 0.07111111111111111,
"grad_norm": 63.75,
"learning_rate": 1.3488372093023257e-05,
"loss": 3.5514,
"step": 30
},
{
"epoch": 0.07111111111111111,
"step": 30,
"train/combined_loss": 0.2120195608586073,
"train/cross_entropy_loss": 0.056119201704859734,
"train/kl_divergence_loss": 0.367919921875,
"train/step_duration_seconds": 7.1586713790893555,
"train/steps_per_hour": 385.43727586928117,
"train/total_elapsed_hours": 0.07783367587460412
},
{
"epoch": 0.07348148148148148,
"grad_norm": 34.5,
"learning_rate": 1.3953488372093025e-05,
"loss": 3.3923,
"step": 31
},
{
"epoch": 0.07348148148148148,
"step": 31,
"train/combined_loss": 0.20117830298841,
"train/cross_entropy_loss": 0.0440802276134491,
"train/kl_divergence_loss": 0.3582763671875,
"train/step_duration_seconds": 7.1888298988342285,
"train/steps_per_hour": 388.3224034144493,
"train/total_elapsed_hours": 0.07983057306872474
},
{
"epoch": 0.07585185185185185,
"grad_norm": 37.75,
"learning_rate": 1.441860465116279e-05,
"loss": 3.2189,
"step": 32
},
{
"epoch": 0.07585185185185185,
"step": 32,
"train/combined_loss": 0.20670769922435284,
"train/cross_entropy_loss": 0.03853747947141528,
"train/kl_divergence_loss": 0.3748779296875,
"train/step_duration_seconds": 7.163522005081177,
"train/steps_per_hour": 391.1003153449009,
"train/total_elapsed_hours": 0.0818204402923584
},
{
"epoch": 0.07822222222222222,
"grad_norm": 114.5,
"learning_rate": 1.488372093023256e-05,
"loss": 3.3073,
"step": 33
},
{
"epoch": 0.07822222222222222,
"step": 33,
"train/combined_loss": 0.19256712403148413,
"train/cross_entropy_loss": 0.040407692082226276,
"train/kl_divergence_loss": 0.3447265625,
"train/step_duration_seconds": 7.171715974807739,
"train/steps_per_hour": 393.73562507193196,
"train/total_elapsed_hours": 0.08381258361869388
},
{
"epoch": 0.08059259259259259,
"grad_norm": 48.25,
"learning_rate": 1.5348837209302328e-05,
"loss": 3.0811,
"step": 34
},
{
"epoch": 0.08059259259259259,
"step": 34,
"train/combined_loss": 0.2014658311381936,
"train/cross_entropy_loss": 0.0435566701926291,
"train/kl_divergence_loss": 0.359375,
"train/step_duration_seconds": 7.14280104637146,
"train/steps_per_hour": 396.28566102564344,
"train/total_elapsed_hours": 0.08579669502046373
},
{
"epoch": 0.08296296296296296,
"grad_norm": 105.5,
"learning_rate": 1.5813953488372095e-05,
"loss": 3.2235,
"step": 35
},
{
"epoch": 0.08296296296296296,
"step": 35,
"train/combined_loss": 0.1921772612258792,
"train/cross_entropy_loss": 0.03669826895929873,
"train/kl_divergence_loss": 0.34765625,
"train/step_duration_seconds": 7.161575078964233,
"train/steps_per_hour": 398.6967335955144,
"train/total_elapsed_hours": 0.08778602143128712
},
{
"epoch": 0.08533333333333333,
"grad_norm": 76.5,
"learning_rate": 1.6279069767441862e-05,
"loss": 3.0748,
"step": 36
},
{
"epoch": 0.08533333333333333,
"step": 36,
"train/combined_loss": 0.18726484011858702,
"train/cross_entropy_loss": 0.029070683754980564,
"train/kl_divergence_loss": 0.345458984375,
"train/step_duration_seconds": 7.136035680770874,
"train/steps_per_hour": 401.0326431715552,
"train/total_elapsed_hours": 0.08976825356483459
},
{
"epoch": 0.0877037037037037,
"grad_norm": 33.25,
"learning_rate": 1.674418604651163e-05,
"loss": 2.9962,
"step": 37
},
{
"epoch": 0.0877037037037037,
"step": 37,
"train/combined_loss": 0.1922608781605959,
"train/cross_entropy_loss": 0.03149440907873213,
"train/kl_divergence_loss": 0.35302734375,
"train/step_duration_seconds": 7.1254284381866455,
"train/steps_per_hour": 403.28057085391987,
"train/total_elapsed_hours": 0.09174753924210867
},
{
"epoch": 0.09007407407407407,
"grad_norm": 50.0,
"learning_rate": 1.7209302325581396e-05,
"loss": 3.0762,
"step": 38
},
{
"epoch": 0.09007407407407407,
"step": 38,
"train/combined_loss": 0.17505795694887638,
"train/cross_entropy_loss": 0.02919307304546237,
"train/kl_divergence_loss": 0.3209228515625,
"train/step_duration_seconds": 7.103011608123779,
"train/steps_per_hour": 405.4604942984025,
"train/total_elapsed_hours": 0.09372059802214304
},
{
"epoch": 0.09244444444444444,
"grad_norm": 30.75,
"learning_rate": 1.7674418604651163e-05,
"loss": 2.8009,
"step": 39
},
{
"epoch": 0.09244444444444444,
"step": 39,
"train/combined_loss": 0.17248188611119986,
"train/cross_entropy_loss": 0.017937407130375504,
"train/kl_divergence_loss": 0.3270263671875,
"train/step_duration_seconds": 7.110121965408325,
"train/steps_per_hour": 407.5421126867549,
"train/total_elapsed_hours": 0.09569563190142313
},
{
"epoch": 0.09481481481481481,
"grad_norm": 38.5,
"learning_rate": 1.813953488372093e-05,
"loss": 2.7597,
"step": 40
},
{
"epoch": 0.09481481481481481,
"step": 40,
"train/combined_loss": 0.17410407960414886,
"train/cross_entropy_loss": 0.016787268687039614,
"train/kl_divergence_loss": 0.3314208984375,
"train/step_duration_seconds": 7.180423736572266,
"train/steps_per_hour": 409.4576778026899,
"train/total_elapsed_hours": 0.09769019405047098
},
{
"epoch": 0.09718518518518518,
"grad_norm": 31.25,
"learning_rate": 1.86046511627907e-05,
"loss": 2.7857,
"step": 41
},
{
"epoch": 0.09718518518518518,
"step": 41,
"train/combined_loss": 0.16598839219659567,
"train/cross_entropy_loss": 0.023749235086143017,
"train/kl_divergence_loss": 0.3082275390625,
"train/step_duration_seconds": 7.1705567836761475,
"train/steps_per_hour": 411.30789585265995,
"train/total_elapsed_hours": 0.09968201537926992
},
{
"epoch": 0.09955555555555555,
"grad_norm": 29.5,
"learning_rate": 1.9069767441860468e-05,
"loss": 2.6558,
"step": 42
},
{
"epoch": 0.09955555555555555,
"step": 42,
"train/combined_loss": 0.16566006373614073,
"train/cross_entropy_loss": 0.030416806926950812,
"train/kl_divergence_loss": 0.3009033203125,
"train/step_duration_seconds": 7.185399770736694,
"train/steps_per_hour": 413.06887057061425,
"train/total_elapsed_hours": 0.1016779597600301
},
{
"epoch": 0.10192592592592592,
"grad_norm": 35.5,
"learning_rate": 1.9534883720930235e-05,
"loss": 2.6506,
"step": 43
},
{
"epoch": 0.10192592592592592,
"step": 43,
"train/combined_loss": 0.1595935821533203,
"train/cross_entropy_loss": 0.024265281856060028,
"train/kl_divergence_loss": 0.294921875,
"train/step_duration_seconds": 7.195603370666504,
"train/steps_per_hour": 414.75070139036376,
"train/total_elapsed_hours": 0.10367673847410414
},
{
"epoch": 0.10429629629629629,
"grad_norm": 17.5,
"learning_rate": 2e-05,
"loss": 2.5535,
"step": 44
},
{
"epoch": 0.10429629629629629,
"step": 44,
"train/combined_loss": 0.1587589643895626,
"train/cross_entropy_loss": 0.013806993083562702,
"train/kl_divergence_loss": 0.3037109375,
"train/step_duration_seconds": 7.19236421585083,
"train/steps_per_hour": 416.37245606383044,
"train/total_elapsed_hours": 0.10567461742295159
},
{
"epoch": 0.10666666666666667,
"grad_norm": 32.25,
"learning_rate": 1.999965463076377e-05,
"loss": 2.5401,
"step": 45
},
{
"epoch": 0.10666666666666667,
"step": 45,
"train/combined_loss": 0.15903105773031712,
"train/cross_entropy_loss": 0.013862900086678565,
"train/kl_divergence_loss": 0.30419921875,
"train/step_duration_seconds": 7.199136018753052,
"train/steps_per_hour": 417.9267256968682,
"train/total_elapsed_hours": 0.10767437742816077
},
{
"epoch": 0.10903703703703704,
"grad_norm": 30.25,
"learning_rate": 1.999861854691106e-05,
"loss": 2.5445,
"step": 46
},
{
"epoch": 0.10903703703703704,
"step": 46,
"train/combined_loss": 0.16162511333823204,
"train/cross_entropy_loss": 0.021858620457351208,
"train/kl_divergence_loss": 0.3013916015625,
"train/step_duration_seconds": 7.177664041519165,
"train/steps_per_hour": 419.44712621402573,
"train/total_elapsed_hours": 0.10966817299524943
},
{
"epoch": 0.11140740740740741,
"grad_norm": 10.4375,
"learning_rate": 1.9996891820008165e-05,
"loss": 2.586,
"step": 47
},
{
"epoch": 0.11140740740740741,
"step": 47,
"train/combined_loss": 0.15759198181331158,
"train/cross_entropy_loss": 0.032835332211107016,
"train/kl_divergence_loss": 0.2823486328125,
"train/step_duration_seconds": 7.197060823440552,
"train/steps_per_hour": 420.89292205888296,
"train/total_elapsed_hours": 0.11166735655731624
},
{
"epoch": 0.11377777777777778,
"grad_norm": 34.5,
"learning_rate": 1.999447456932676e-05,
"loss": 2.5215,
"step": 48
},
{
"epoch": 0.11377777777777778,
"step": 48,
"train/combined_loss": 0.15159989707171917,
"train/cross_entropy_loss": 0.02695468720048666,
"train/kl_divergence_loss": 0.2762451171875,
"train/step_duration_seconds": 7.164118528366089,
"train/steps_per_hour": 422.32185886743343,
"train/total_elapsed_hours": 0.11365738948186238
},
{
"epoch": 0.11614814814814815,
"grad_norm": 31.5,
"learning_rate": 1.9991366961835643e-05,
"loss": 2.4256,
"step": 49
},
{
"epoch": 0.11614814814814815,
"step": 49,
"train/combined_loss": 0.15017856005579233,
"train/cross_entropy_loss": 0.019473322783596814,
"train/kl_divergence_loss": 0.2808837890625,
"train/step_duration_seconds": 7.150151968002319,
"train/steps_per_hour": 423.7158323839195,
"train/total_elapsed_hours": 0.11564354280630748
},
{
"epoch": 0.11851851851851852,
"grad_norm": 23.375,
"learning_rate": 1.9987569212189224e-05,
"loss": 2.4029,
"step": 50
},
{
"epoch": 0.11851851851851852,
"eval_combined_loss": 0.14935019636029997,
"eval_cross_entropy_loss": 0.01737226772059997,
"eval_kl_divergence_loss": 0.281328125,
"eval_loss": 0.14935019612312317,
"eval_runtime": 220.3536,
"eval_samples_per_second": 6.807,
"eval_steps_per_second": 3.404,
"step": 50
},
{
"epoch": 0.11851851851851852,
"step": 50,
"train/combined_loss": 0.14872634038329124,
"train/cross_entropy_loss": 0.017301325569860637,
"train/kl_divergence_loss": 0.2801513671875,
"train/step_duration_seconds": 227.54744958877563,
"train/steps_per_hour": 279.5620551165938,
"train/total_elapsed_hours": 0.17885116769207848
},
{
"epoch": 0.12088888888888889,
"grad_norm": 20.25,
"learning_rate": 1.9983081582712684e-05,
"loss": 2.3796,
"step": 51
},
{
"epoch": 0.12088888888888889,
"step": 51,
"train/combined_loss": 0.15616787131875753,
"train/cross_entropy_loss": 0.019611137569881976,
"train/kl_divergence_loss": 0.292724609375,
"train/step_duration_seconds": 7.1779563426971436,
"train/steps_per_hour": 282.00938628976195,
"train/total_elapsed_hours": 0.1808450444539388
},
{
"epoch": 0.12325925925925926,
"grad_norm": 27.375,
"learning_rate": 1.997790438338385e-05,
"loss": 2.4987,
"step": 52
},
{
"epoch": 0.12325925925925926,
"step": 52,
"train/combined_loss": 0.1437931666150689,
"train/cross_entropy_loss": 0.019641992752440274,
"train/kl_divergence_loss": 0.2679443359375,
"train/step_duration_seconds": 7.16746187210083,
"train/steps_per_hour": 284.4078751961097,
"train/total_elapsed_hours": 0.1828360060850779
},
{
"epoch": 0.12562962962962962,
"grad_norm": 15.4375,
"learning_rate": 1.9972037971811802e-05,
"loss": 2.3007,
"step": 53
},
{
"epoch": 0.12562962962962962,
"step": 53,
"train/combined_loss": 0.14357721991837025,
"train/cross_entropy_loss": 0.02165149967186153,
"train/kl_divergence_loss": 0.2655029296875,
"train/step_duration_seconds": 7.177205562591553,
"train/steps_per_hour": 286.75049177904856,
"train/total_elapsed_hours": 0.1848296742969089
},
{
"epoch": 0.128,
"grad_norm": 20.125,
"learning_rate": 1.9965482753212154e-05,
"loss": 2.2972,
"step": 54
},
{
"epoch": 0.128,
"step": 54,
"train/combined_loss": 0.14043164812028408,
"train/cross_entropy_loss": 0.023294932674616575,
"train/kl_divergence_loss": 0.257568359375,
"train/step_duration_seconds": 7.157373428344727,
"train/steps_per_hour": 289.05163369286066,
"train/total_elapsed_hours": 0.1868178335825602
},
{
"epoch": 0.13037037037037036,
"grad_norm": 17.125,
"learning_rate": 1.995823918037908e-05,
"loss": 2.2469,
"step": 55
},
{
"epoch": 0.13037037037037036,
"step": 55,
"train/combined_loss": 0.1433863490819931,
"train/cross_entropy_loss": 0.02163596951868385,
"train/kl_divergence_loss": 0.26513671875,
"train/step_duration_seconds": 7.159351348876953,
"train/steps_per_hour": 291.30346508519057,
"train/total_elapsed_hours": 0.1888065422905816
},
{
"epoch": 0.13274074074074074,
"grad_norm": 12.5,
"learning_rate": 1.9950307753654016e-05,
"loss": 2.2942,
"step": 56
},
{
"epoch": 0.13274074074074074,
"step": 56,
"train/combined_loss": 0.1372276395559311,
"train/cross_entropy_loss": 0.019206264754757285,
"train/kl_divergence_loss": 0.2552490234375,
"train/step_duration_seconds": 7.149013042449951,
"train/steps_per_hour": 293.5127714276043,
"train/total_elapsed_hours": 0.1907923792468177
},
{
"epoch": 0.1351111111111111,
"grad_norm": 15.0625,
"learning_rate": 1.994168902089112e-05,
"loss": 2.1956,
"step": 57
},
{
"epoch": 0.1351111111111111,
"step": 57,
"train/combined_loss": 0.13671019952744246,
"train/cross_entropy_loss": 0.020124488859437406,
"train/kl_divergence_loss": 0.2532958984375,
"train/step_duration_seconds": 7.1693689823150635,
"train/steps_per_hour": 295.6678886749936,
"train/total_elapsed_hours": 0.1927838706307941
},
{
"epoch": 0.13748148148148148,
"grad_norm": 21.25,
"learning_rate": 1.9932383577419432e-05,
"loss": 2.1874,
"step": 58
},
{
"epoch": 0.13748148148148148,
"step": 58,
"train/combined_loss": 0.13713468238711357,
"train/cross_entropy_loss": 0.020057943649590015,
"train/kl_divergence_loss": 0.25421142578125,
"train/step_duration_seconds": 7.144319295883179,
"train/steps_per_hour": 297.7895740672935,
"train/total_elapsed_hours": 0.19476840376853943
},
{
"epoch": 0.13985185185185184,
"grad_norm": 19.0,
"learning_rate": 1.9922392066001724e-05,
"loss": 2.1942,
"step": 59
},
{
"epoch": 0.13985185185185184,
"step": 59,
"train/combined_loss": 0.1343515245243907,
"train/cross_entropy_loss": 0.019130286993458867,
"train/kl_divergence_loss": 0.24957275390625,
"train/step_duration_seconds": 7.157663822174072,
"train/steps_per_hour": 299.86280963512706,
"train/total_elapsed_hours": 0.19675664371914334
},
{
"epoch": 0.14222222222222222,
"grad_norm": 16.625,
"learning_rate": 1.991171517679013e-05,
"loss": 2.1496,
"step": 60
},
{
"epoch": 0.14222222222222222,
"step": 60,
"train/combined_loss": 0.13391043990850449,
"train/cross_entropy_loss": 0.020872646826319396,
"train/kl_divergence_loss": 0.2469482421875,
"train/step_duration_seconds": 7.151079893112183,
"train/steps_per_hour": 301.8973420742143,
"train/total_elapsed_hours": 0.1987430548005634
},
{
"epoch": 0.1445925925925926,
"grad_norm": 5.65625,
"learning_rate": 1.9900353647278466e-05,
"loss": 2.1426,
"step": 61
},
{
"epoch": 0.1445925925925926,
"step": 61,
"train/combined_loss": 0.13261367939412594,
"train/cross_entropy_loss": 0.022612601169385016,
"train/kl_divergence_loss": 0.24261474609375,
"train/step_duration_seconds": 7.164619207382202,
"train/steps_per_hour": 303.88591351636484,
"train/total_elapsed_hours": 0.200733226802614
},
{
"epoch": 0.14696296296296296,
"grad_norm": 8.8125,
"learning_rate": 1.9888308262251286e-05,
"loss": 2.1218,
"step": 62
},
{
"epoch": 0.14696296296296296,
"step": 62,
"train/combined_loss": 0.1269391467794776,
"train/cross_entropy_loss": 0.022066760691814125,
"train/kl_divergence_loss": 0.2318115234375,
"train/step_duration_seconds": 7.142378091812134,
"train/steps_per_hour": 305.84476139080533,
"train/total_elapsed_hours": 0.20271722071700626
},
{
"epoch": 0.14933333333333335,
"grad_norm": 20.25,
"learning_rate": 1.9875579853729677e-05,
"loss": 2.031,
"step": 63
},
{
"epoch": 0.14933333333333335,
"step": 63,
"train/combined_loss": 0.1287386268377304,
"train/cross_entropy_loss": 0.02023360482417047,
"train/kl_divergence_loss": 0.23724365234375,
"train/step_duration_seconds": 7.179792881011963,
"train/steps_per_hour": 307.75001344506256,
"train/total_elapsed_hours": 0.20471160762839846
},
{
"epoch": 0.1517037037037037,
"grad_norm": 8.125,
"learning_rate": 1.9862169300913784e-05,
"loss": 2.0598,
"step": 64
},
{
"epoch": 0.1517037037037037,
"step": 64,
"train/combined_loss": 0.1224580081179738,
"train/cross_entropy_loss": 0.019146979437209666,
"train/kl_divergence_loss": 0.22576904296875,
"train/step_duration_seconds": 7.164458274841309,
"train/steps_per_hour": 309.624880616572,
"train/total_elapsed_hours": 0.2067017349269655
},
{
"epoch": 0.15407407407407409,
"grad_norm": 17.625,
"learning_rate": 1.9848077530122083e-05,
"loss": 1.9593,
"step": 65
},
{
"epoch": 0.15407407407407409,
"step": 65,
"train/combined_loss": 0.1219773581251502,
"train/cross_entropy_loss": 0.02142053795978427,
"train/kl_divergence_loss": 0.2225341796875,
"train/step_duration_seconds": 7.1500184535980225,
"train/steps_per_hour": 311.46997603082906,
"train/total_elapsed_hours": 0.20868785116407607
},
{
"epoch": 0.15644444444444444,
"grad_norm": 14.625,
"learning_rate": 1.9833305514727396e-05,
"loss": 1.9516,
"step": 66
},
{
"epoch": 0.15644444444444444,
"step": 66,
"train/combined_loss": 0.11912317294627428,
"train/cross_entropy_loss": 0.019252198981121182,
"train/kl_divergence_loss": 0.218994140625,
"train/step_duration_seconds": 7.173556327819824,
"train/steps_per_hour": 313.2705599924478,
"train/total_elapsed_hours": 0.21068050569958158
},
{
"epoch": 0.15881481481481483,
"grad_norm": 11.75,
"learning_rate": 1.981785427508966e-05,
"loss": 1.906,
"step": 67
},
{
"epoch": 0.15881481481481483,
"step": 67,
"train/combined_loss": 0.11908936966210604,
"train/cross_entropy_loss": 0.02040529961232096,
"train/kl_divergence_loss": 0.2177734375,
"train/step_duration_seconds": 7.164828062057495,
"train/steps_per_hour": 315.0409941178432,
"train/total_elapsed_hours": 0.21267073571681977
},
{
"epoch": 0.16118518518518518,
"grad_norm": 7.46875,
"learning_rate": 1.9801724878485438e-05,
"loss": 1.9054,
"step": 68
},
{
"epoch": 0.16118518518518518,
"step": 68,
"train/combined_loss": 0.12613936699926853,
"train/cross_entropy_loss": 0.025533129926770926,
"train/kl_divergence_loss": 0.22674560546875,
"train/step_duration_seconds": 7.197604179382324,
"train/steps_per_hour": 316.7651640171972,
"train/total_elapsed_hours": 0.21467007021109263
},
{
"epoch": 0.16355555555555557,
"grad_norm": 37.25,
"learning_rate": 1.9784918439034216e-05,
"loss": 2.0182,
"step": 69
},
{
"epoch": 0.16355555555555557,
"step": 69,
"train/combined_loss": 0.11615706328302622,
"train/cross_entropy_loss": 0.024733559926971793,
"train/kl_divergence_loss": 0.20758056640625,
"train/step_duration_seconds": 7.158296346664429,
"train/steps_per_hour": 318.4735632448237,
"train/total_elapsed_hours": 0.21665848586294387
},
{
"epoch": 0.16592592592592592,
"grad_norm": 25.25,
"learning_rate": 1.9767436117621416e-05,
"loss": 1.8585,
"step": 70
},
{
"epoch": 0.16592592592592592,
"step": 70,
"train/combined_loss": 0.11343861371278763,
"train/cross_entropy_loss": 0.018869414925575256,
"train/kl_divergence_loss": 0.2080078125,
"train/step_duration_seconds": 7.1701250076293945,
"train/steps_per_hour": 320.1460784421751,
"train/total_elapsed_hours": 0.21865018725395202
},
{
"epoch": 0.1682962962962963,
"grad_norm": 37.75,
"learning_rate": 1.9749279121818235e-05,
"loss": 1.815,
"step": 71
},
{
"epoch": 0.1682962962962963,
"step": 71,
"train/combined_loss": 0.1213934626430273,
"train/cross_entropy_loss": 0.01866583281662315,
"train/kl_divergence_loss": 0.22412109375,
"train/step_duration_seconds": 7.159663677215576,
"train/steps_per_hour": 321.7926366627231,
"train/total_elapsed_hours": 0.22063898271984525
},
{
"epoch": 0.17066666666666666,
"grad_norm": 29.125,
"learning_rate": 1.973044870579824e-05,
"loss": 1.9423,
"step": 72
},
{
"epoch": 0.17066666666666666,
"step": 72,
"train/combined_loss": 0.11181758902966976,
"train/cross_entropy_loss": 0.019289474468678236,
"train/kl_divergence_loss": 0.204345703125,
"train/step_duration_seconds": 7.168922185897827,
"train/steps_per_hour": 323.4060405602183,
"train/total_elapsed_hours": 0.22263034999370576
},
{
"epoch": 0.17303703703703704,
"grad_norm": 26.5,
"learning_rate": 1.9710946170250702e-05,
"loss": 1.7891,
"step": 73
},
{
"epoch": 0.17303703703703704,
"step": 73,
"train/combined_loss": 0.11201242171227932,
"train/cross_entropy_loss": 0.020838803611695766,
"train/kl_divergence_loss": 0.20318603515625,
"train/step_duration_seconds": 7.184260606765747,
"train/steps_per_hour": 324.9846730527734,
"train/total_elapsed_hours": 0.22462597794002956
},
{
"epoch": 0.1754074074074074,
"grad_norm": 28.75,
"learning_rate": 1.969077286229078e-05,
"loss": 1.7922,
"step": 74
},
{
"epoch": 0.1754074074074074,
"step": 74,
"train/combined_loss": 0.11633206205442548,
"train/cross_entropy_loss": 0.022031799773685634,
"train/kl_divergence_loss": 0.21063232421875,
"train/step_duration_seconds": 7.156804323196411,
"train/steps_per_hour": 326.54649232377625,
"train/total_elapsed_hours": 0.22661397914091747
},
{
"epoch": 0.17777777777777778,
"grad_norm": 6.15625,
"learning_rate": 1.9669930175366474e-05,
"loss": 1.8613,
"step": 75
},
{
"epoch": 0.17777777777777778,
"step": 75,
"train/combined_loss": 0.10736048221588135,
"train/cross_entropy_loss": 0.019408458843827248,
"train/kl_divergence_loss": 0.1953125,
"train/step_duration_seconds": 7.157041549682617,
"train/steps_per_hour": 328.08105279118894,
"train/total_elapsed_hours": 0.22860204623805153
},
{
"epoch": 0.18014814814814814,
"grad_norm": 19.875,
"learning_rate": 1.964841954916235e-05,
"loss": 1.7178,
"step": 76
},
{
"epoch": 0.18014814814814814,
"step": 76,
"train/combined_loss": 0.11280431784689426,
"train/cross_entropy_loss": 0.021934314048849046,
"train/kl_divergence_loss": 0.20367431640625,
"train/step_duration_seconds": 7.154073476791382,
"train/steps_per_hour": 329.5903308097024,
"train/total_elapsed_hours": 0.23058928887049357
},
{
"epoch": 0.18251851851851852,
"grad_norm": 8.625,
"learning_rate": 1.962624246950012e-05,
"loss": 1.8049,
"step": 77
},
{
"epoch": 0.18251851851851852,
"step": 77,
"train/combined_loss": 0.10877907322719693,
"train/cross_entropy_loss": 0.021696327603422105,
"train/kl_divergence_loss": 0.19586181640625,
"train/step_duration_seconds": 7.163645029067993,
"train/steps_per_hour": 331.0700321531476,
"train/total_elapsed_hours": 0.2325791902674569
},
{
"epoch": 0.18488888888888888,
"grad_norm": 6.375,
"learning_rate": 1.9603400468236e-05,
"loss": 1.7405,
"step": 78
},
{
"epoch": 0.18488888888888888,
"step": 78,
"train/combined_loss": 0.11846707155928016,
"train/cross_entropy_loss": 0.022090390557423234,
"train/kl_divergence_loss": 0.21484375,
"train/step_duration_seconds": 7.1567511558532715,
"train/steps_per_hour": 332.52734290219325,
"train/total_elapsed_hours": 0.23456717669963836
},
{
"epoch": 0.18725925925925926,
"grad_norm": 23.5,
"learning_rate": 1.957989512315489e-05,
"loss": 1.8955,
"step": 79
},
{
"epoch": 0.18725925925925926,
"step": 79,
"train/combined_loss": 0.10807515401393175,
"train/cross_entropy_loss": 0.020105381147004664,
"train/kl_divergence_loss": 0.196044921875,
"train/step_duration_seconds": 7.179587125778198,
"train/steps_per_hour": 333.9512044172201,
"train/total_elapsed_hours": 0.23656150645679896
},
{
"epoch": 0.18962962962962962,
"grad_norm": 12.5625,
"learning_rate": 1.955572805786141e-05,
"loss": 1.7292,
"step": 80
},
{
"epoch": 0.18962962962962962,
"step": 80,
"train/combined_loss": 0.10483178775757551,
"train/cross_entropy_loss": 0.022346684243530035,
"train/kl_divergence_loss": 0.18731689453125,
"train/step_duration_seconds": 7.17809271812439,
"train/steps_per_hour": 335.3518424811757,
"train/total_elapsed_hours": 0.23855542110072242
},
{
"epoch": 0.192,
"grad_norm": 43.25,
"learning_rate": 1.9530900941667733e-05,
"loss": 1.6773,
"step": 81
},
{
"epoch": 0.192,
"step": 81,
"train/combined_loss": 0.1046543437987566,
"train/cross_entropy_loss": 0.02217490249313414,
"train/kl_divergence_loss": 0.1871337890625,
"train/step_duration_seconds": 7.168772459030151,
"train/steps_per_hour": 336.7328849090869,
"train/total_elapsed_hours": 0.24054674678378635
},
{
"epoch": 0.19437037037037036,
"grad_norm": 36.75,
"learning_rate": 1.9505415489478293e-05,
"loss": 1.6745,
"step": 82
},
{
"epoch": 0.19437037037037036,
"step": 82,
"train/combined_loss": 0.10497199138626456,
"train/cross_entropy_loss": 0.019270156044512987,
"train/kl_divergence_loss": 0.190673828125,
"train/step_duration_seconds": 7.172302484512329,
"train/steps_per_hour": 338.08988274573534,
"train/total_elapsed_hours": 0.24253905302948423
},
{
"epoch": 0.19674074074074074,
"grad_norm": 19.375,
"learning_rate": 1.947927346167132e-05,
"loss": 1.6796,
"step": 83
},
{
"epoch": 0.19674074074074074,
"step": 83,
"train/combined_loss": 0.1053922800347209,
"train/cross_entropy_loss": 0.01895106490701437,
"train/kl_divergence_loss": 0.19183349609375,
"train/step_duration_seconds": 7.172720909118652,
"train/steps_per_hour": 339.42460711386434,
"train/total_elapsed_hours": 0.2445314755042394
},
{
"epoch": 0.1991111111111111,
"grad_norm": 26.75,
"learning_rate": 1.945247666397725e-05,
"loss": 1.6863,
"step": 84
},
{
"epoch": 0.1991111111111111,
"step": 84,
"train/combined_loss": 0.10413095075637102,
"train/cross_entropy_loss": 0.019968447857536376,
"train/kl_divergence_loss": 0.18829345703125,
"train/step_duration_seconds": 7.1637890338897705,
"train/steps_per_hour": 340.74118612261555,
"train/total_elapsed_hours": 0.24652141690254212
},
{
"epoch": 0.20148148148148148,
"grad_norm": 10.1875,
"learning_rate": 1.9425026947353994e-05,
"loss": 1.6661,
"step": 85
},
{
"epoch": 0.20148148148148148,
"step": 85,
"train/combined_loss": 0.1039673495106399,
"train/cross_entropy_loss": 0.023974738782271743,
"train/kl_divergence_loss": 0.1839599609375,
"train/step_duration_seconds": 7.150022268295288,
"train/steps_per_hour": 342.04194361300205,
"train/total_elapsed_hours": 0.2485075341992908
},
{
"epoch": 0.20385185185185184,
"grad_norm": 52.25,
"learning_rate": 1.9396926207859085e-05,
"loss": 1.6635,
"step": 86
},
{
"epoch": 0.20385185185185184,
"step": 86,
"train/combined_loss": 0.10607085470110178,
"train/cross_entropy_loss": 0.02512999135069549,
"train/kl_divergence_loss": 0.18701171875,
"train/step_duration_seconds": 7.172277927398682,
"train/steps_per_hour": 343.31360127461863,
"train/total_elapsed_hours": 0.2504998336235682
},
{
"epoch": 0.20622222222222222,
"grad_norm": 56.75,
"learning_rate": 1.936817638651871e-05,
"loss": 1.6971,
"step": 87
},
{
"epoch": 0.20622222222222222,
"step": 87,
"train/combined_loss": 0.10299085499718785,
"train/cross_entropy_loss": 0.023303485824726522,
"train/kl_divergence_loss": 0.18267822265625,
"train/step_duration_seconds": 7.151454925537109,
"train/steps_per_hour": 344.57308438928993,
"train/total_elapsed_hours": 0.25248634888066185
},
{
"epoch": 0.20859259259259258,
"grad_norm": 27.0,
"learning_rate": 1.9338779469193638e-05,
"loss": 1.6479,
"step": 88
},
{
"epoch": 0.20859259259259258,
"step": 88,
"train/combined_loss": 0.10254441620782018,
"train/cross_entropy_loss": 0.019541956367902458,
"train/kl_divergence_loss": 0.185546875,
"train/step_duration_seconds": 7.150182485580444,
"train/steps_per_hour": 345.81338378782823,
"train/total_elapsed_hours": 0.254472510682212
},
{
"epoch": 0.21096296296296296,
"grad_norm": 57.25,
"learning_rate": 1.9308737486442045e-05,
"loss": 1.6407,
"step": 89
},
{
"epoch": 0.21096296296296296,
"step": 89,
"train/combined_loss": 0.1078284471295774,
"train/cross_entropy_loss": 0.01973404036834836,
"train/kl_divergence_loss": 0.1959228515625,
"train/step_duration_seconds": 7.157840967178345,
"train/steps_per_hour": 347.0315933491895,
"train/total_elapsed_hours": 0.2564607998397615
},
{
"epoch": 0.21333333333333335,
"grad_norm": 59.25,
"learning_rate": 1.9278052513379256e-05,
"loss": 1.7253,
"step": 90
},
{
"epoch": 0.21333333333333335,
"step": 90,
"train/combined_loss": 0.1028821705840528,
"train/cross_entropy_loss": 0.01917986525222659,
"train/kl_divergence_loss": 0.18658447265625,
"train/step_duration_seconds": 7.162302017211914,
"train/steps_per_hour": 348.2293895098418,
"train/total_elapsed_hours": 0.25845032817787594
},
{
"epoch": 0.2157037037037037,
"grad_norm": 36.5,
"learning_rate": 1.9246726669534416e-05,
"loss": 1.6461,
"step": 91
},
{
"epoch": 0.2157037037037037,
"step": 91,
"train/combined_loss": 0.10168306482955813,
"train/cross_entropy_loss": 0.02093204390257597,
"train/kl_divergence_loss": 0.18243408203125,
"train/step_duration_seconds": 7.153521299362183,
"train/steps_per_hour": 349.4121578181558,
"train/total_elapsed_hours": 0.26043741742769877
},
{
"epoch": 0.2180740740740741,
"grad_norm": 27.5,
"learning_rate": 1.921476211870408e-05,
"loss": 1.6269,
"step": 92
},
{
"epoch": 0.2180740740740741,
"step": 92,
"train/combined_loss": 0.09777736244723201,
"train/cross_entropy_loss": 0.0219097004737705,
"train/kl_divergence_loss": 0.17364501953125,
"train/step_duration_seconds": 7.162153959274292,
"train/steps_per_hour": 350.5738107405354,
"train/total_elapsed_hours": 0.2624269046386083
},
{
"epoch": 0.22044444444444444,
"grad_norm": 34.5,
"learning_rate": 1.9182161068802742e-05,
"loss": 1.5644,
"step": 93
},
{
"epoch": 0.22044444444444444,
"step": 93,
"train/combined_loss": 0.10163608659058809,
"train/cross_entropy_loss": 0.02199775306507945,
"train/kl_divergence_loss": 0.1812744140625,
"train/step_duration_seconds": 7.154653549194336,
"train/steps_per_hour": 351.72075430393164,
"train/total_elapsed_hours": 0.26441430840227337
},
{
"epoch": 0.22281481481481483,
"grad_norm": 20.5,
"learning_rate": 1.9148925771710347e-05,
"loss": 1.6262,
"step": 94
},
{
"epoch": 0.22281481481481483,
"step": 94,
"train/combined_loss": 0.09920958010479808,
"train/cross_entropy_loss": 0.019586148322559893,
"train/kl_divergence_loss": 0.1788330078125,
"train/step_duration_seconds": 7.162054061889648,
"train/steps_per_hour": 352.84786230208005,
"train/total_elapsed_hours": 0.2664037678639094
},
{
"epoch": 0.22518518518518518,
"grad_norm": 31.625,
"learning_rate": 1.9115058523116734e-05,
"loss": 1.5874,
"step": 95
},
{
"epoch": 0.22518518518518518,
"step": 95,
"train/combined_loss": 0.09802744071930647,
"train/cross_entropy_loss": 0.01880878652445972,
"train/kl_divergence_loss": 0.17724609375,
"train/step_duration_seconds": 7.153456926345825,
"train/steps_per_hour": 353.9614104256406,
"train/total_elapsed_hours": 0.2683908392323388
},
{
"epoch": 0.22755555555555557,
"grad_norm": 38.75,
"learning_rate": 1.908056166236305e-05,
"loss": 1.5684,
"step": 96
},
{
"epoch": 0.22755555555555557,
"step": 96,
"train/combined_loss": 0.09743851656094193,
"train/cross_entropy_loss": 0.018851646571420133,
"train/kl_divergence_loss": 0.176025390625,
"train/step_duration_seconds": 7.1597981452941895,
"train/steps_per_hour": 355.0562779811278,
"train/total_elapsed_hours": 0.2703796720504761
},
{
"epoch": 0.22992592592592592,
"grad_norm": 26.125,
"learning_rate": 1.9045437572280193e-05,
"loss": 1.559,
"step": 97
},
{
"epoch": 0.22992592592592592,
"step": 97,
"train/combined_loss": 0.0965579142794013,
"train/cross_entropy_loss": 0.02038634184282273,
"train/kl_divergence_loss": 0.1727294921875,
"train/step_duration_seconds": 7.157320976257324,
"train/steps_per_hour": 356.13605583694607,
"train/total_elapsed_hours": 0.2723678167661031
},
{
"epoch": 0.2322962962962963,
"grad_norm": 18.125,
"learning_rate": 1.900968867902419e-05,
"loss": 1.5449,
"step": 98
},
{
"epoch": 0.2322962962962963,
"step": 98,
"train/combined_loss": 0.09678681008517742,
"train/cross_entropy_loss": 0.020966205513104796,
"train/kl_divergence_loss": 0.172607421875,
"train/step_duration_seconds": 7.1417396068573,
"train/steps_per_hour": 357.2058194544125,
"train/total_elapsed_hours": 0.2743516333235635
},
{
"epoch": 0.23466666666666666,
"grad_norm": 27.5,
"learning_rate": 1.8973317451908642e-05,
"loss": 1.5486,
"step": 99
},
{
"epoch": 0.23466666666666666,
"step": 99,
"train/combined_loss": 0.10045615630224347,
"train/cross_entropy_loss": 0.02183516975492239,
"train/kl_divergence_loss": 0.1790771484375,
"train/step_duration_seconds": 7.167134761810303,
"train/steps_per_hour": 358.25107804442126,
"train/total_elapsed_hours": 0.276342504090733
},
{
"epoch": 0.23703703703703705,
"grad_norm": 17.75,
"learning_rate": 1.8936326403234125e-05,
"loss": 1.6073,
"step": 100
},
{
"epoch": 0.23703703703703705,
"eval_combined_loss": 0.09612167934452494,
"eval_cross_entropy_loss": 0.02090351493904988,
"eval_kl_divergence_loss": 0.17133984375,
"eval_loss": 0.09612167626619339,
"eval_runtime": 219.9162,
"eval_samples_per_second": 6.821,
"eval_steps_per_second": 3.41,
"step": 100
},
{
"epoch": 0.23703703703703705,
"step": 100,
"train/combined_loss": 0.09815677208825946,
"train/cross_entropy_loss": 0.020898508373647928,
"train/kl_divergence_loss": 0.1754150390625,
"train/step_duration_seconds": 227.11371684074402,
"train/steps_per_hour": 294.61186048448315,
"train/total_elapsed_hours": 0.33942964765760636
},
{
"epoch": 0.2394074074074074,
"grad_norm": 21.125,
"learning_rate": 1.8898718088114688e-05,
"loss": 1.5705,
"step": 101
},
{
"epoch": 0.2394074074074074,
"step": 101,
"train/combined_loss": 0.09819618007168174,
"train/cross_entropy_loss": 0.020428007235750556,
"train/kl_divergence_loss": 0.17596435546875,
"train/step_duration_seconds": 7.171290159225464,
"train/steps_per_hour": 295.82187680336136,
"train/total_elapsed_hours": 0.34142167270183565
},
{
"epoch": 0.24177777777777779,
"grad_norm": 25.25,
"learning_rate": 1.8860495104301346e-05,
"loss": 1.5711,
"step": 102
},
{
"epoch": 0.24177777777777779,
"step": 102,
"train/combined_loss": 0.09767304686829448,
"train/cross_entropy_loss": 0.020236226613633335,
"train/kl_divergence_loss": 0.17510986328125,
"train/step_duration_seconds": 7.163306951522827,
"train/steps_per_hour": 297.0197733169854,
"train/total_elapsed_hours": 0.34341148018836976
},
{
"epoch": 0.24414814814814814,
"grad_norm": 12.8125,
"learning_rate": 1.8821660092002642e-05,
"loss": 1.5628,
"step": 103
},
{
"epoch": 0.24414814814814814,
"step": 103,
"train/combined_loss": 0.09357908833771944,
"train/cross_entropy_loss": 0.021325660520233214,
"train/kl_divergence_loss": 0.16583251953125,
"train/step_duration_seconds": 7.165699005126953,
"train/steps_per_hour": 298.2032943460889,
"train/total_elapsed_hours": 0.34540195213423835
},
{
"epoch": 0.24651851851851853,
"grad_norm": 33.25,
"learning_rate": 1.8782215733702286e-05,
"loss": 1.4973,
"step": 104
},
{
"epoch": 0.24651851851851853,
"step": 104,
"train/combined_loss": 0.09911046642810106,
"train/cross_entropy_loss": 0.022012439789250493,
"train/kl_divergence_loss": 0.17620849609375,
"train/step_duration_seconds": 7.175975322723389,
"train/steps_per_hour": 299.3707928769077,
"train/total_elapsed_hours": 0.34739527861277264
},
{
"epoch": 0.24888888888888888,
"grad_norm": 38.0,
"learning_rate": 1.874216475397386e-05,
"loss": 1.5858,
"step": 105
},
{
"epoch": 0.24888888888888888,
"step": 105,
"train/combined_loss": 0.09527313988655806,
"train/cross_entropy_loss": 0.02172303292900324,
"train/kl_divergence_loss": 0.1688232421875,
"train/step_duration_seconds": 7.173555850982666,
"train/steps_per_hour": 300.5255479022072,
"train/total_elapsed_hours": 0.3493879330158234
},
{
"epoch": 0.25125925925925924,
"grad_norm": 30.0,
"learning_rate": 1.870150991929261e-05,
"loss": 1.5244,
"step": 106
},
{
"epoch": 0.25125925925925924,
"step": 106,
"train/combined_loss": 0.09378110896795988,
"train/cross_entropy_loss": 0.02069209818728268,
"train/kl_divergence_loss": 0.1668701171875,
"train/step_duration_seconds": 7.196229696273804,
"train/steps_per_hour": 301.66179874940855,
"train/total_elapsed_hours": 0.35138688570923277
},
{
"epoch": 0.25362962962962965,
"grad_norm": 9.4375,
"learning_rate": 1.866025403784439e-05,
"loss": 1.5005,
"step": 107
},
{
"epoch": 0.25362962962962965,
"step": 107,
"train/combined_loss": 0.09853040147572756,
"train/cross_entropy_loss": 0.019936785800382495,
"train/kl_divergence_loss": 0.1771240234375,
"train/step_duration_seconds": 7.178696632385254,
"train/steps_per_hour": 302.78936801023747,
"train/total_elapsed_hours": 0.35338096810711755
},
{
"epoch": 0.256,
"grad_norm": 17.625,
"learning_rate": 1.8618399959331642e-05,
"loss": 1.5765,
"step": 108
},
{
"epoch": 0.256,
"step": 108,
"train/combined_loss": 0.09358909726142883,
"train/cross_entropy_loss": 0.020552222267724574,
"train/kl_divergence_loss": 0.1666259765625,
"train/step_duration_seconds": 7.157944679260254,
"train/steps_per_hour": 303.90921284525064,
"train/total_elapsed_hours": 0.3553692860735787
},
{
"epoch": 0.25837037037037036,
"grad_norm": 9.6875,
"learning_rate": 1.8575950574776595e-05,
"loss": 1.4974,
"step": 109
},
{
"epoch": 0.25837037037037036,
"step": 109,
"train/combined_loss": 0.09076927369460464,
"train/cross_entropy_loss": 0.0206498735351488,
"train/kl_divergence_loss": 0.160888671875,
"train/step_duration_seconds": 7.1567769050598145,
"train/steps_per_hour": 305.0168730415088,
"train/total_elapsed_hours": 0.35735727965831754
},
{
"epoch": 0.2607407407407407,
"grad_norm": 22.375,
"learning_rate": 1.8532908816321557e-05,
"loss": 1.4523,
"step": 110
},
{
"epoch": 0.2607407407407407,
"step": 110,
"train/combined_loss": 0.09263847023248672,
"train/cross_entropy_loss": 0.021519619156606495,
"train/kl_divergence_loss": 0.16375732421875,
"train/step_duration_seconds": 7.173721790313721,
"train/steps_per_hour": 306.1082679129725,
"train/total_elapsed_hours": 0.3593499801556269
},
{
"epoch": 0.26311111111111113,
"grad_norm": 23.875,
"learning_rate": 1.8489277657026377e-05,
"loss": 1.4822,
"step": 111
},
{
"epoch": 0.26311111111111113,
"step": 111,
"train/combined_loss": 0.0902865119278431,
"train/cross_entropy_loss": 0.020905061159282923,
"train/kl_divergence_loss": 0.15966796875,
"train/step_duration_seconds": 7.147202253341675,
"train/steps_per_hour": 307.1938879594452,
"train/total_elapsed_hours": 0.3613353141148885
},
{
"epoch": 0.2654814814814815,
"grad_norm": 11.125,
"learning_rate": 1.844506011066308e-05,
"loss": 1.4446,
"step": 112
},
{
"epoch": 0.2654814814814815,
"step": 112,
"train/combined_loss": 0.09026105608791113,
"train/cross_entropy_loss": 0.019145151949487627,
"train/kl_divergence_loss": 0.161376953125,
"train/step_duration_seconds": 7.135612726211548,
"train/steps_per_hour": 308.27037497741026,
"train/total_elapsed_hours": 0.3633174287610584
},
{
"epoch": 0.26785185185185184,
"grad_norm": 35.25,
"learning_rate": 1.8400259231507716e-05,
"loss": 1.4442,
"step": 113
},
{
"epoch": 0.26785185185185184,
"step": 113,
"train/combined_loss": 0.09185996558517218,
"train/cross_entropy_loss": 0.019108111271634698,
"train/kl_divergence_loss": 0.16461181640625,
"train/step_duration_seconds": 7.138214826583862,
"train/steps_per_hour": 309.3345678925583,
"train/total_elapsed_hours": 0.36530026621288725
},
{
"epoch": 0.2702222222222222,
"grad_norm": 42.0,
"learning_rate": 1.8354878114129368e-05,
"loss": 1.4698,
"step": 114
},
{
"epoch": 0.2702222222222222,
"step": 114,
"train/combined_loss": 0.08998283930122852,
"train/cross_entropy_loss": 0.018710799398832023,
"train/kl_divergence_loss": 0.1612548828125,
"train/step_duration_seconds": 7.160963296890259,
"train/steps_per_hour": 310.38193032311864,
"train/total_elapsed_hours": 0.36728942268424564
},
{
"epoch": 0.2725925925925926,
"grad_norm": 32.75,
"learning_rate": 1.8308919893176397e-05,
"loss": 1.4397,
"step": 115
},
{
"epoch": 0.2725925925925926,
"step": 115,
"train/combined_loss": 0.08985556894913316,
"train/cross_entropy_loss": 0.02016523655038327,
"train/kl_divergence_loss": 0.1595458984375,
"train/step_duration_seconds": 7.191335916519165,
"train/steps_per_hour": 311.4108945594422,
"train/total_elapsed_hours": 0.36928701599438984
},
{
"epoch": 0.27496296296296296,
"grad_norm": 9.5,
"learning_rate": 1.826238774315995e-05,
"loss": 1.4377,
"step": 116
},
{
"epoch": 0.27496296296296296,
"step": 116,
"train/combined_loss": 0.090081796515733,
"train/cross_entropy_loss": 0.0222046107519418,
"train/kl_divergence_loss": 0.157958984375,
"train/step_duration_seconds": 7.165991544723511,
"train/steps_per_hour": 312.43471090794657,
"train/total_elapsed_hours": 0.3712775692012575
},
{
"epoch": 0.2773333333333333,
"grad_norm": 17.25,
"learning_rate": 1.8215284878234644e-05,
"loss": 1.4413,
"step": 117
},
{
"epoch": 0.2773333333333333,
"step": 117,
"train/combined_loss": 0.09464696934446692,
"train/cross_entropy_loss": 0.023217282141558826,
"train/kl_divergence_loss": 0.16607666015625,
"train/step_duration_seconds": 7.209496736526489,
"train/steps_per_hour": 313.43745997881086,
"train/total_elapsed_hours": 0.373280207183626
},
{
"epoch": 0.2797037037037037,
"grad_norm": 10.8125,
"learning_rate": 1.816761455197657e-05,
"loss": 1.5144,
"step": 118
},
{
"epoch": 0.2797037037037037,
"step": 118,
"train/combined_loss": 0.08674649894237518,
"train/cross_entropy_loss": 0.018219567835330963,
"train/kl_divergence_loss": 0.1552734375,
"train/step_duration_seconds": 7.183828592300415,
"train/steps_per_hour": 314.43548101801,
"train/total_elapsed_hours": 0.3752757151259316
},
{
"epoch": 0.2820740740740741,
"grad_norm": 22.625,
"learning_rate": 1.811938005715857e-05,
"loss": 1.3879,
"step": 119
},
{
"epoch": 0.2820740740740741,
"step": 119,
"train/combined_loss": 0.08797085983678699,
"train/cross_entropy_loss": 0.017738587921485305,
"train/kl_divergence_loss": 0.158203125,
"train/step_duration_seconds": 7.165632963180542,
"train/steps_per_hour": 315.4271701636993,
"train/total_elapsed_hours": 0.3772661687268151
},
{
"epoch": 0.28444444444444444,
"grad_norm": 24.0,
"learning_rate": 1.8070584725522763e-05,
"loss": 1.4075,
"step": 120
},
{
"epoch": 0.28444444444444444,
"step": 120,
"train/combined_loss": 0.09114356013014913,
"train/cross_entropy_loss": 0.018773937365040183,
"train/kl_divergence_loss": 0.16351318359375,
"train/step_duration_seconds": 7.167950868606567,
"train/steps_per_hour": 316.4079127749189,
"train/total_elapsed_hours": 0.37925726619031697
},
{
"epoch": 0.2868148148148148,
"grad_norm": 12.625,
"learning_rate": 1.802123192755044e-05,
"loss": 1.4583,
"step": 121
},
{
"epoch": 0.2868148148148148,
"step": 121,
"train/combined_loss": 0.09176525427028537,
"train/cross_entropy_loss": 0.024289784720167518,
"train/kl_divergence_loss": 0.15924072265625,
"train/step_duration_seconds": 7.175249338150024,
"train/steps_per_hour": 317.3767236816989,
"train/total_elapsed_hours": 0.38125039100646974
},
{
"epoch": 0.2891851851851852,
"grad_norm": 31.75,
"learning_rate": 1.7971325072229227e-05,
"loss": 1.4682,
"step": 122
},
{
"epoch": 0.2891851851851852,
"step": 122,
"train/combined_loss": 0.0890495995990932,
"train/cross_entropy_loss": 0.025206139660440385,
"train/kl_divergence_loss": 0.15289306640625,
"train/step_duration_seconds": 7.177129745483398,
"train/steps_per_hour": 318.3350237785182,
"train/total_elapsed_hours": 0.3832440381579929
},
{
"epoch": 0.29155555555555557,
"grad_norm": 38.75,
"learning_rate": 1.7920867606817625e-05,
"loss": 1.4248,
"step": 123
},
{
"epoch": 0.29155555555555557,
"step": 123,
"train/combined_loss": 0.0867073736153543,
"train/cross_entropy_loss": 0.023878611740656197,
"train/kl_divergence_loss": 0.1495361328125,
"train/step_duration_seconds": 7.162760257720947,
"train/steps_per_hour": 319.2867134529585,
"train/total_elapsed_hours": 0.3852336937851376
},
{
"epoch": 0.2939259259259259,
"grad_norm": 33.0,
"learning_rate": 1.7869863016606893e-05,
"loss": 1.3873,
"step": 124
},
{
"epoch": 0.2939259259259259,
"step": 124,
"train/combined_loss": 0.08373823016881943,
"train/cross_entropy_loss": 0.020015520974993706,
"train/kl_divergence_loss": 0.1474609375,
"train/step_duration_seconds": 7.192008972167969,
"train/steps_per_hour": 320.221904230669,
"train/total_elapsed_hours": 0.38723147405518427
},
{
"epoch": 0.2962962962962963,
"grad_norm": 5.34375,
"learning_rate": 1.78183148246803e-05,
"loss": 1.3398,
"step": 125
},
{
"epoch": 0.2962962962962963,
"step": 125,
"train/combined_loss": 0.08616493362933397,
"train/cross_entropy_loss": 0.016812282847240567,
"train/kl_divergence_loss": 0.155517578125,
"train/step_duration_seconds": 7.1917195320129395,
"train/steps_per_hour": 321.14756131827295,
"train/total_elapsed_hours": 0.38922917392518785
},
{
"epoch": 0.2986666666666667,
"grad_norm": 39.0,
"learning_rate": 1.7766226591669787e-05,
"loss": 1.3786,
"step": 126
},
{
"epoch": 0.2986666666666667,
"step": 126,
"train/combined_loss": 0.09160786820575595,
"train/cross_entropy_loss": 0.016711829113774,
"train/kl_divergence_loss": 0.16650390625,
"train/step_duration_seconds": 7.179956436157227,
"train/steps_per_hour": 322.0664550428592,
"train/total_elapsed_hours": 0.39122360626856484
},
{
"epoch": 0.30103703703703705,
"grad_norm": 44.0,
"learning_rate": 1.771360191551e-05,
"loss": 1.4657,
"step": 127
},
{
"epoch": 0.30103703703703705,
"step": 127,
"train/combined_loss": 0.08970451634377241,
"train/cross_entropy_loss": 0.017787940218113363,
"train/kl_divergence_loss": 0.16162109375,
"train/step_duration_seconds": 7.176481485366821,
"train/steps_per_hour": 322.9768202044927,
"train/total_elapsed_hours": 0.3932170733478334
},
{
"epoch": 0.3034074074074074,
"grad_norm": 36.0,
"learning_rate": 1.766044443118978e-05,
"loss": 1.4353,
"step": 128
},
{
"epoch": 0.3034074074074074,
"step": 128,
"train/combined_loss": 0.08426619321107864,
"train/cross_entropy_loss": 0.019362466409802437,
"train/kl_divergence_loss": 0.149169921875,
"train/step_duration_seconds": 7.183627605438232,
"train/steps_per_hour": 323.876374745131,
"train/total_elapsed_hours": 0.39521252546045516
},
{
"epoch": 0.30577777777777776,
"grad_norm": 18.25,
"learning_rate": 1.760675781050109e-05,
"loss": 1.3483,
"step": 129
},
{
"epoch": 0.30577777777777776,
"step": 129,
"train/combined_loss": 0.09070024406537414,
"train/cross_entropy_loss": 0.02429600094910711,
"train/kl_divergence_loss": 0.1571044921875,
"train/step_duration_seconds": 7.154268741607666,
"train/steps_per_hour": 324.77355916793476,
"train/total_elapsed_hours": 0.39719982233312395
},
{
"epoch": 0.30814814814814817,
"grad_norm": 35.0,
"learning_rate": 1.755254576178535e-05,
"loss": 1.4512,
"step": 130
},
{
"epoch": 0.30814814814814817,
"step": 130,
"train/combined_loss": 0.09242757642641664,
"train/cross_entropy_loss": 0.025797537760809064,
"train/kl_divergence_loss": 0.1590576171875,
"train/step_duration_seconds": 7.181564092636108,
"train/steps_per_hour": 325.6556251715386,
"train/total_elapsed_hours": 0.3991947012477451
},
{
"epoch": 0.3105185185185185,
"grad_norm": 40.5,
"learning_rate": 1.7497812029677344e-05,
"loss": 1.4788,
"step": 131
},
{
"epoch": 0.3105185185185185,
"step": 131,
"train/combined_loss": 0.08590791560709476,
"train/cross_entropy_loss": 0.024782140040770173,
"train/kl_divergence_loss": 0.14703369140625,
"train/step_duration_seconds": 7.177969694137573,
"train/steps_per_hour": 326.52973182564483,
"train/total_elapsed_hours": 0.4011885817183389
},
{
"epoch": 0.3128888888888889,
"grad_norm": 36.5,
"learning_rate": 1.7442560394846518e-05,
"loss": 1.3745,
"step": 132
},
{
"epoch": 0.3128888888888889,
"step": 132,
"train/combined_loss": 0.08688511373475194,
"train/cross_entropy_loss": 0.022158897132612765,
"train/kl_divergence_loss": 0.151611328125,
"train/step_duration_seconds": 7.1588640213012695,
"train/steps_per_hour": 327.39950253665864,
"train/total_elapsed_hours": 0.4031771550575892
},
{
"epoch": 0.31525925925925924,
"grad_norm": 16.0,
"learning_rate": 1.738679467373586e-05,
"loss": 1.3902,
"step": 133
},
{
"epoch": 0.31525925925925924,
"step": 133,
"train/combined_loss": 0.08377803396433592,
"train/cross_entropy_loss": 0.017897860845550895,
"train/kl_divergence_loss": 0.149658203125,
"train/step_duration_seconds": 7.175210237503052,
"train/steps_per_hour": 328.257056778055,
"train/total_elapsed_hours": 0.4051702690124512
},
{
"epoch": 0.31762962962962965,
"grad_norm": 34.5,
"learning_rate": 1.7330518718298263e-05,
"loss": 1.3404,
"step": 134
},
{
"epoch": 0.31762962962962965,
"step": 134,
"train/combined_loss": 0.08916169637814164,
"train/cross_entropy_loss": 0.017007473739795387,
"train/kl_divergence_loss": 0.16131591796875,
"train/step_duration_seconds": 7.173584222793579,
"train/steps_per_hour": 329.106580437697,
"train/total_elapsed_hours": 0.4071629312965605
},
{
"epoch": 0.32,
"grad_norm": 42.75,
"learning_rate": 1.7273736415730488e-05,
"loss": 1.4266,
"step": 135
},
{
"epoch": 0.32,
"step": 135,
"train/combined_loss": 0.08420996041968465,
"train/cross_entropy_loss": 0.016442388528957963,
"train/kl_divergence_loss": 0.1519775390625,
"train/step_duration_seconds": 7.169410705566406,
"train/steps_per_hour": 329.948764311339,
"train/total_elapsed_hours": 0.40915443427032894
},
{
"epoch": 0.32237037037037036,
"grad_norm": 37.25,
"learning_rate": 1.7216451688204623e-05,
"loss": 1.3474,
"step": 136
},
{
"epoch": 0.32237037037037036,
"step": 136,
"train/combined_loss": 0.08458211086690426,
"train/cross_entropy_loss": 0.017736003384925425,
"train/kl_divergence_loss": 0.15142822265625,
"train/step_duration_seconds": 7.171813011169434,
"train/steps_per_hour": 330.78225259442917,
"train/total_elapsed_hours": 0.4111466045512093
},
{
"epoch": 0.3247407407407407,
"grad_norm": 17.75,
"learning_rate": 1.7158668492597186e-05,
"loss": 1.3533,
"step": 137
},
{
"epoch": 0.3247407407407407,
"step": 137,
"train/combined_loss": 0.08640648704022169,
"train/cross_entropy_loss": 0.022422353271394968,
"train/kl_divergence_loss": 0.150390625,
"train/step_duration_seconds": 7.173238754272461,
"train/steps_per_hour": 331.60738477276607,
"train/total_elapsed_hours": 0.4131391708718406
},
{
"epoch": 0.32711111111111113,
"grad_norm": 24.25,
"learning_rate": 1.7100390820215805e-05,
"loss": 1.3825,
"step": 138
},
{
"epoch": 0.32711111111111113,
"step": 138,
"train/combined_loss": 0.08674443326890469,
"train/cross_entropy_loss": 0.025173434522002935,
"train/kl_divergence_loss": 0.1483154296875,
"train/step_duration_seconds": 7.182944059371948,
"train/steps_per_hour": 332.422437151218,
"train/total_elapsed_hours": 0.41513443311055503
},
{
"epoch": 0.3294814814814815,
"grad_norm": 30.125,
"learning_rate": 1.704162269652352e-05,
"loss": 1.3879,
"step": 139
},
{
"epoch": 0.3294814814814815,
"step": 139,
"train/combined_loss": 0.08193621598184109,
"train/cross_entropy_loss": 0.02483434451278299,
"train/kl_divergence_loss": 0.1390380859375,
"train/step_duration_seconds": 7.187625885009766,
"train/steps_per_hour": 333.2286533026057,
"train/total_elapsed_hours": 0.4171309958563911
},
{
"epoch": 0.33185185185185184,
"grad_norm": 26.25,
"learning_rate": 1.698236818086073e-05,
"loss": 1.311,
"step": 140
},
{
"epoch": 0.33185185185185184,
"step": 140,
"train/combined_loss": 0.08312624553218484,
"train/cross_entropy_loss": 0.021843309281393886,
"train/kl_divergence_loss": 0.1444091796875,
"train/step_duration_seconds": 7.165472507476807,
"train/steps_per_hour": 334.0320927727315,
"train/total_elapsed_hours": 0.4191214048862457
},
{
"epoch": 0.3342222222222222,
"grad_norm": 10.0,
"learning_rate": 1.6922631366164795e-05,
"loss": 1.33,
"step": 141
},
{
"epoch": 0.3342222222222222,
"step": 141,
"train/combined_loss": 0.08382831746712327,
"train/cross_entropy_loss": 0.016838762094266713,
"train/kl_divergence_loss": 0.15081787109375,
"train/step_duration_seconds": 7.17199444770813,
"train/steps_per_hour": 334.8264967927197,
"train/total_elapsed_hours": 0.42111362556616466
},
{
"epoch": 0.3365925925925926,
"grad_norm": 25.0,
"learning_rate": 1.686241637868734e-05,
"loss": 1.3413,
"step": 142
},
{
"epoch": 0.3365925925925926,
"step": 142,
"train/combined_loss": 0.08172068372368813,
"train/cross_entropy_loss": 0.015309049864299595,
"train/kl_divergence_loss": 0.14813232421875,
"train/step_duration_seconds": 7.141969680786133,
"train/steps_per_hour": 335.62003551235273,
"train/total_elapsed_hours": 0.4230975060330497
},
{
"epoch": 0.33896296296296297,
"grad_norm": 30.125,
"learning_rate": 1.6801727377709195e-05,
"loss": 1.3075,
"step": 143
},
{
"epoch": 0.33896296296296297,
"step": 143,
"train/combined_loss": 0.08254175027832389,
"train/cross_entropy_loss": 0.016035647364333272,
"train/kl_divergence_loss": 0.1490478515625,
"train/step_duration_seconds": 7.139028072357178,
"train/steps_per_hour": 336.40681390445945,
"train/total_elapsed_hours": 0.42508056938648225
},
{
"epoch": 0.3413333333333333,
"grad_norm": 26.625,
"learning_rate": 1.6740568555253153e-05,
"loss": 1.3207,
"step": 144
},
{
"epoch": 0.3413333333333333,
"step": 144,
"train/combined_loss": 0.0809242157265544,
"train/cross_entropy_loss": 0.019209268386475742,
"train/kl_divergence_loss": 0.14263916015625,
"train/step_duration_seconds": 7.164944648742676,
"train/steps_per_hour": 337.180601627168,
"train/total_elapsed_hours": 0.42707083178891075
},
{
"epoch": 0.3437037037037037,
"grad_norm": 10.6875,
"learning_rate": 1.6678944135794375e-05,
"loss": 1.2948,
"step": 145
},
{
"epoch": 0.3437037037037037,
"step": 145,
"train/combined_loss": 0.08305090665817261,
"train/cross_entropy_loss": 0.027735116658732295,
"train/kl_divergence_loss": 0.13836669921875,
"train/step_duration_seconds": 7.151780843734741,
"train/steps_per_hour": 337.95009082754103,
"train/total_elapsed_hours": 0.4290574375788371
},
{
"epoch": 0.3460740740740741,
"grad_norm": 24.875,
"learning_rate": 1.6616858375968596e-05,
"loss": 1.3288,
"step": 146
},
{
"epoch": 0.3460740740740741,
"step": 146,
"train/combined_loss": 0.08167848456650972,
"train/cross_entropy_loss": 0.028225134126842022,
"train/kl_divergence_loss": 0.1351318359375,
"train/step_duration_seconds": 7.142008066177368,
"train/steps_per_hour": 338.7146203300441,
"train/total_elapsed_hours": 0.4310413287083308
},
{
"epoch": 0.34844444444444445,
"grad_norm": 29.625,
"learning_rate": 1.6554315564278102e-05,
"loss": 1.3069,
"step": 147
},
{
"epoch": 0.34844444444444445,
"step": 147,
"train/combined_loss": 0.08066110266372561,
"train/cross_entropy_loss": 0.02490862738341093,
"train/kl_divergence_loss": 0.13641357421875,
"train/step_duration_seconds": 7.151546478271484,
"train/steps_per_hour": 339.47006737646905,
"train/total_elapsed_hours": 0.43302786939673954
},
{
"epoch": 0.3508148148148148,
"grad_norm": 23.625,
"learning_rate": 1.649132002079552e-05,
"loss": 1.2906,
"step": 148
},
{
"epoch": 0.3508148148148148,
"step": 148,
"train/combined_loss": 0.08027565246447921,
"train/cross_entropy_loss": 0.01925491786096245,
"train/kl_divergence_loss": 0.14129638671875,
"train/step_duration_seconds": 7.16980504989624,
"train/steps_per_hour": 340.21464820376787,
"train/total_elapsed_hours": 0.4350194819105996
},
{
"epoch": 0.35318518518518516,
"grad_norm": 13.9375,
"learning_rate": 1.6427876096865394e-05,
"loss": 1.2844,
"step": 149
},
{
"epoch": 0.35318518518518516,
"step": 149,
"train/combined_loss": 0.0810198406688869,
"train/cross_entropy_loss": 0.01714221539441496,
"train/kl_divergence_loss": 0.1448974609375,
"train/step_duration_seconds": 7.167724132537842,
"train/steps_per_hour": 340.95289337704213,
"train/total_elapsed_hours": 0.4370105163918601
},
{
"epoch": 0.35555555555555557,
"grad_norm": 16.5,
"learning_rate": 1.6363988174803638e-05,
"loss": 1.2963,
"step": 150
},
{
"epoch": 0.35555555555555557,
"eval_combined_loss": 0.08181474480902155,
"eval_cross_entropy_loss": 0.01787818753470977,
"eval_kl_divergence_loss": 0.14575130208333334,
"eval_loss": 0.08181475102901459,
"eval_runtime": 218.6737,
"eval_samples_per_second": 6.86,
"eval_steps_per_second": 3.43,
"step": 150
},
{
"epoch": 0.35555555555555557,
"step": 150,
"train/combined_loss": 0.08019543159753084,
"train/cross_entropy_loss": 0.017629636102356017,
"train/kl_divergence_loss": 0.14276123046875,
"train/step_duration_seconds": 225.85686349868774,
"train/steps_per_hour": 300.1509555021127,
"train/total_elapsed_hours": 0.4997485340303845
},
{
"epoch": 0.3579259259259259,
"grad_norm": 23.125,
"learning_rate": 1.6299660667594814e-05,
"loss": 1.2831,
"step": 151
},
{
"epoch": 0.3579259259259259,
"step": 151,
"train/combined_loss": 0.08127154828980565,
"train/cross_entropy_loss": 0.01935462059918791,
"train/kl_divergence_loss": 0.1431884765625,
"train/step_duration_seconds": 7.190611124038696,
"train/steps_per_hour": 300.9491301723214,
"train/total_elapsed_hours": 0.5017459260092841
},
{
"epoch": 0.3602962962962963,
"grad_norm": 20.375,
"learning_rate": 1.6234898018587336e-05,
"loss": 1.3003,
"step": 152
},
{
"epoch": 0.3602962962962963,
"step": 152,
"train/combined_loss": 0.08766383724287152,
"train/cross_entropy_loss": 0.020847689942456782,
"train/kl_divergence_loss": 0.15447998046875,
"train/step_duration_seconds": 7.20133113861084,
"train/steps_per_hour": 301.7391914865756,
"train/total_elapsed_hours": 0.5037462957700094
},
{
"epoch": 0.3626666666666667,
"grad_norm": 12.0,
"learning_rate": 1.6169704701186528e-05,
"loss": 1.4026,
"step": 153
},
{
"epoch": 0.3626666666666667,
"step": 153,
"train/combined_loss": 0.07755696773529053,
"train/cross_entropy_loss": 0.021324871107935905,
"train/kl_divergence_loss": 0.1337890625,
"train/step_duration_seconds": 7.173044919967651,
"train/steps_per_hour": 302.52770304516605,
"train/total_elapsed_hours": 0.5057388082477782
},
{
"epoch": 0.36503703703703705,
"grad_norm": 17.125,
"learning_rate": 1.6104085218545633e-05,
"loss": 1.2409,
"step": 154
},
{
"epoch": 0.36503703703703705,
"step": 154,
"train/combined_loss": 0.07878367276862264,
"train/cross_entropy_loss": 0.022069291560910642,
"train/kl_divergence_loss": 0.135498046875,
"train/step_duration_seconds": 7.17523193359375,
"train/steps_per_hour": 303.30966291023043,
"train/total_elapsed_hours": 0.507731928229332
},
{
"epoch": 0.3674074074074074,
"grad_norm": 16.875,
"learning_rate": 1.6038044103254775e-05,
"loss": 1.2605,
"step": 155
},
{
"epoch": 0.3674074074074074,
"step": 155,
"train/combined_loss": 0.07806963194161654,
"train/cross_entropy_loss": 0.021861913381144404,
"train/kl_divergence_loss": 0.13427734375,
"train/step_duration_seconds": 7.181210279464722,
"train/steps_per_hour": 304.08451686962036,
"train/total_elapsed_hours": 0.5097267088625166
},
{
"epoch": 0.36977777777777776,
"grad_norm": 11.75,
"learning_rate": 1.5971585917027864e-05,
"loss": 1.2491,
"step": 156
},
{
"epoch": 0.36977777777777776,
"step": 156,
"train/combined_loss": 0.07714226096868515,
"train/cross_entropy_loss": 0.020495465025305748,
"train/kl_divergence_loss": 0.1337890625,
"train/step_duration_seconds": 7.1563475131988525,
"train/steps_per_hour": 304.8574442281318,
"train/total_elapsed_hours": 0.5117145831717386
},
{
"epoch": 0.3721481481481482,
"grad_norm": 13.125,
"learning_rate": 1.5904715250387498e-05,
"loss": 1.2343,
"step": 157
},
{
"epoch": 0.3721481481481482,
"step": 157,
"train/combined_loss": 0.07875645952299237,
"train/cross_entropy_loss": 0.01914622518233955,
"train/kl_divergence_loss": 0.13836669921875,
"train/step_duration_seconds": 7.15238881111145,
"train/steps_per_hour": 305.62504381863846,
"train/total_elapsed_hours": 0.5137013578414917
},
{
"epoch": 0.37451851851851853,
"grad_norm": 7.9375,
"learning_rate": 1.5837436722347902e-05,
"loss": 1.2601,
"step": 158
},
{
"epoch": 0.37451851851851853,
"step": 158,
"train/combined_loss": 0.07961196266114712,
"train/cross_entropy_loss": 0.019941705162636936,
"train/kl_divergence_loss": 0.1392822265625,
"train/step_duration_seconds": 7.165693759918213,
"train/steps_per_hour": 306.3845330098647,
"train/total_elapsed_hours": 0.5156918283303579
},
{
"epoch": 0.3768888888888889,
"grad_norm": 8.625,
"learning_rate": 1.576975498009583e-05,
"loss": 1.2738,
"step": 159
},
{
"epoch": 0.3768888888888889,
"step": 159,
"train/combined_loss": 0.08134815841913223,
"train/cross_entropy_loss": 0.021583035704679787,
"train/kl_divergence_loss": 0.14111328125,
"train/step_duration_seconds": 7.175953149795532,
"train/steps_per_hour": 307.1364910017458,
"train/total_elapsed_hours": 0.5176851486497455
},
{
"epoch": 0.37925925925925924,
"grad_norm": 6.90625,
"learning_rate": 1.570167469866962e-05,
"loss": 1.3016,
"step": 160
},
{
"epoch": 0.37925925925925924,
"step": 160,
"train/combined_loss": 0.08167480118572712,
"train/cross_entropy_loss": 0.020954578067176044,
"train/kl_divergence_loss": 0.14239501953125,
"train/step_duration_seconds": 7.178318023681641,
"train/steps_per_hour": 307.88229126869635,
"train/total_elapsed_hours": 0.5196791258785459
},
{
"epoch": 0.38162962962962965,
"grad_norm": 6.625,
"learning_rate": 1.563320058063622e-05,
"loss": 1.3068,
"step": 161
},
{
"epoch": 0.38162962962962965,
"step": 161,
"train/combined_loss": 0.08032544003799558,
"train/cross_entropy_loss": 0.021124517312273383,
"train/kl_divergence_loss": 0.1395263671875,
"train/step_duration_seconds": 7.179118394851685,
"train/steps_per_hour": 308.62225870273886,
"train/total_elapsed_hours": 0.5216733254326714
},
{
"epoch": 0.384,
"grad_norm": 7.875,
"learning_rate": 1.5564337355766412e-05,
"loss": 1.2852,
"step": 162
},
{
"epoch": 0.384,
"step": 162,
"train/combined_loss": 0.07905747788026929,
"train/cross_entropy_loss": 0.02096895850263536,
"train/kl_divergence_loss": 0.13714599609375,
"train/step_duration_seconds": 7.179133653640747,
"train/steps_per_hour": 309.3565878328245,
"train/total_elapsed_hours": 0.5236675292253494
},
{
"epoch": 0.38637037037037036,
"grad_norm": 4.15625,
"learning_rate": 1.5495089780708062e-05,
"loss": 1.2649,
"step": 163
},
{
"epoch": 0.38637037037037036,
"step": 163,
"train/combined_loss": 0.07707322854548693,
"train/cross_entropy_loss": 0.019991177483461797,
"train/kl_divergence_loss": 0.1341552734375,
"train/step_duration_seconds": 7.165940761566162,
"train/steps_per_hour": 310.0875071111404,
"train/total_elapsed_hours": 0.5256580683257844
},
{
"epoch": 0.3887407407407407,
"grad_norm": 9.125,
"learning_rate": 1.5425462638657597e-05,
"loss": 1.2332,
"step": 164
},
{
"epoch": 0.3887407407407407,
"step": 164,
"train/combined_loss": 0.07535458076745272,
"train/cross_entropy_loss": 0.019971861504018307,
"train/kl_divergence_loss": 0.1307373046875,
"train/step_duration_seconds": 7.187286376953125,
"train/steps_per_hour": 310.80941899189014,
"train/total_elapsed_hours": 0.527654536763827
},
{
"epoch": 0.39111111111111113,
"grad_norm": 9.0625,
"learning_rate": 1.5355460739029585e-05,
"loss": 1.2057,
"step": 165
},
{
"epoch": 0.39111111111111113,
"step": 165,
"train/combined_loss": 0.07617681892588735,
"train/cross_entropy_loss": 0.021128055173903704,
"train/kl_divergence_loss": 0.1312255859375,
"train/step_duration_seconds": 7.1567230224609375,
"train/steps_per_hour": 311.53088207288556,
"train/total_elapsed_hours": 0.5296425153811772
},
{
"epoch": 0.3934814814814815,
"grad_norm": 8.8125,
"learning_rate": 1.5285088917124555e-05,
"loss": 1.2188,
"step": 166
},
{
"epoch": 0.3934814814814815,
"step": 166,
"train/combined_loss": 0.08005631249397993,
"train/cross_entropy_loss": 0.021440746961161494,
"train/kl_divergence_loss": 0.138671875,
"train/step_duration_seconds": 7.164660930633545,
"train/steps_per_hour": 312.24565441642335,
"train/total_elapsed_hours": 0.5316326989730199
},
{
"epoch": 0.39585185185185184,
"grad_norm": 5.625,
"learning_rate": 1.5214352033794981e-05,
"loss": 1.2809,
"step": 167
},
{
"epoch": 0.39585185185185184,
"step": 167,
"train/combined_loss": 0.07657396793365479,
"train/cross_entropy_loss": 0.020762681495398283,
"train/kl_divergence_loss": 0.13238525390625,
"train/step_duration_seconds": 7.171409845352173,
"train/steps_per_hour": 312.9539957186981,
"train/total_elapsed_hours": 0.5336247572633955
},
{
"epoch": 0.3982222222222222,
"grad_norm": 11.25,
"learning_rate": 1.5143254975109538e-05,
"loss": 1.2252,
"step": 168
},
{
"epoch": 0.3982222222222222,
"step": 168,
"train/combined_loss": 0.07494777115061879,
"train/cross_entropy_loss": 0.02077566913794726,
"train/kl_divergence_loss": 0.129119873046875,
"train/step_duration_seconds": 7.178438186645508,
"train/steps_per_hour": 313.65592484340516,
"train/total_elapsed_hours": 0.5356187678707971
},
{
"epoch": 0.4005925925925926,
"grad_norm": 11.0625,
"learning_rate": 1.5071802652015592e-05,
"loss": 1.1992,
"step": 169
},
{
"epoch": 0.4005925925925926,
"step": 169,
"train/combined_loss": 0.07396322628483176,
"train/cross_entropy_loss": 0.02079021732788533,
"train/kl_divergence_loss": 0.12713623046875,
"train/step_duration_seconds": 7.178174734115601,
"train/steps_per_hour": 314.3526898358104,
"train/total_elapsed_hours": 0.5376127052969403
},
{
"epoch": 0.40296296296296297,
"grad_norm": 3.921875,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.1834,
"step": 170
},
{
"epoch": 0.40296296296296297,
"step": 170,
"train/combined_loss": 0.07202337961643934,
"train/cross_entropy_loss": 0.01926037878729403,
"train/kl_divergence_loss": 0.124786376953125,
"train/step_duration_seconds": 7.181441783905029,
"train/steps_per_hour": 315.043775657627,
"train/total_elapsed_hours": 0.5396075502369139
},
{
"epoch": 0.4053333333333333,
"grad_norm": 7.46875,
"learning_rate": 1.4927851978748177e-05,
"loss": 1.1524,
"step": 171
},
{
"epoch": 0.4053333333333333,
"step": 171,
"train/combined_loss": 0.07204042701050639,
"train/cross_entropy_loss": 0.020179486949928105,
"train/kl_divergence_loss": 0.1239013671875,
"train/step_duration_seconds": 7.173416376113892,
"train/steps_per_hour": 315.7310702015876,
"train/total_elapsed_hours": 0.5416001658969455
},
{
"epoch": 0.4077037037037037,
"grad_norm": 5.59375,
"learning_rate": 1.4855363571801523e-05,
"loss": 1.1526,
"step": 172
},
{
"epoch": 0.4077037037037037,
"step": 172,
"train/combined_loss": 0.072266333270818,
"train/cross_entropy_loss": 0.02215718082152307,
"train/kl_divergence_loss": 0.12237548828125,
"train/step_duration_seconds": 7.157419681549072,
"train/steps_per_hour": 316.4159124956855,
"train/total_elapsed_hours": 0.5435883380307092
},
{
"epoch": 0.4100740740740741,
"grad_norm": 7.0625,
"learning_rate": 1.4782539786213184e-05,
"loss": 1.1563,
"step": 173
},
{
"epoch": 0.4100740740740741,
"step": 173,
"train/combined_loss": 0.07013190537691116,
"train/cross_entropy_loss": 0.020634910091757774,
"train/kl_divergence_loss": 0.11962890625,
"train/step_duration_seconds": 7.1777660846710205,
"train/steps_per_hour": 317.09247858077316,
"train/total_elapsed_hours": 0.5455821619431178
},
{
"epoch": 0.41244444444444445,
"grad_norm": 4.3125,
"learning_rate": 1.4709385652202204e-05,
"loss": 1.1221,
"step": 174
},
{
"epoch": 0.41244444444444445,
"step": 174,
"train/combined_loss": 0.06954938173294067,
"train/cross_entropy_loss": 0.018493298441171646,
"train/kl_divergence_loss": 0.12060546875,
"train/step_duration_seconds": 7.157723426818848,
"train/steps_per_hour": 317.767348521783,
"train/total_elapsed_hours": 0.5475704184505674
},
{
"epoch": 0.4148148148148148,
"grad_norm": 7.09375,
"learning_rate": 1.4635906222806058e-05,
"loss": 1.1128,
"step": 175
},
{
"epoch": 0.4148148148148148,
"step": 175,
"train/combined_loss": 0.07863931078463793,
"train/cross_entropy_loss": 0.02086504956241697,
"train/kl_divergence_loss": 0.13641357421875,
"train/step_duration_seconds": 7.178933382034302,
"train/steps_per_hour": 318.4339213839359,
"train/total_elapsed_hours": 0.5495645666122436
},
{
"epoch": 0.41718518518518516,
"grad_norm": 5.84375,
"learning_rate": 1.4562106573531632e-05,
"loss": 1.2582,
"step": 176
},
{
"epoch": 0.41718518518518516,
"step": 176,
"train/combined_loss": 0.0717529603280127,
"train/cross_entropy_loss": 0.022137517924420536,
"train/kl_divergence_loss": 0.121368408203125,
"train/step_duration_seconds": 7.176782131195068,
"train/steps_per_hour": 319.09602000259764,
"train/total_elapsed_hours": 0.5515581172042423
},
{
"epoch": 0.41955555555555557,
"grad_norm": 9.5625,
"learning_rate": 1.4487991802004625e-05,
"loss": 1.148,
"step": 177
},
{
"epoch": 0.41955555555555557,
"step": 177,
"train/combined_loss": 0.0723364045843482,
"train/cross_entropy_loss": 0.021931110764853656,
"train/kl_divergence_loss": 0.12274169921875,
"train/step_duration_seconds": 7.177384376525879,
"train/steps_per_hour": 319.75325304866163,
"train/total_elapsed_hours": 0.5535518350866105
},
{
"epoch": 0.4219259259259259,
"grad_norm": 4.0,
"learning_rate": 1.4413567027617442e-05,
"loss": 1.1574,
"step": 178
},
{
"epoch": 0.4219259259259259,
"step": 178,
"train/combined_loss": 0.0698565854690969,
"train/cross_entropy_loss": 0.017886995803564787,
"train/kl_divergence_loss": 0.121826171875,
"train/step_duration_seconds": 7.158464431762695,
"train/steps_per_hour": 320.4087999085678,
"train/total_elapsed_hours": 0.5555402974287669
},
{
"epoch": 0.4242962962962963,
"grad_norm": 14.1875,
"learning_rate": 1.4338837391175582e-05,
"loss": 1.1177,
"step": 179
},
{
"epoch": 0.4242962962962963,
"step": 179,
"train/combined_loss": 0.06887802015990019,
"train/cross_entropy_loss": 0.017760923714376986,
"train/kl_divergence_loss": 0.1199951171875,
"train/step_duration_seconds": 7.177714586257935,
"train/steps_per_hour": 321.05659141033044,
"train/total_elapsed_hours": 0.5575341070360608
},
{
"epoch": 0.4266666666666667,
"grad_norm": 11.625,
"learning_rate": 1.4263808054542541e-05,
"loss": 1.102,
"step": 180
},
{
"epoch": 0.4266666666666667,
"step": 180,
"train/combined_loss": 0.07172887865453959,
"train/cross_entropy_loss": 0.02205883653368801,
"train/kl_divergence_loss": 0.12139892578125,
"train/step_duration_seconds": 7.169630289077759,
"train/steps_per_hour": 321.7010573913621,
"train/total_elapsed_hours": 0.559525671005249
},
{
"epoch": 0.42903703703703705,
"grad_norm": 9.0,
"learning_rate": 1.418848420028325e-05,
"loss": 1.1477,
"step": 181
},
{
"epoch": 0.42903703703703705,
"step": 181,
"train/combined_loss": 0.07292318437248468,
"train/cross_entropy_loss": 0.024111752747558057,
"train/kl_divergence_loss": 0.121734619140625,
"train/step_duration_seconds": 7.1448750495910645,
"train/steps_per_hour": 322.34489934858004,
"train/total_elapsed_hours": 0.5615103585190243
},
{
"epoch": 0.4314074074074074,
"grad_norm": 10.0625,
"learning_rate": 1.4112871031306118e-05,
"loss": 1.1668,
"step": 182
},
{
"epoch": 0.4314074074074074,
"step": 182,
"train/combined_loss": 0.07089130999520421,
"train/cross_entropy_loss": 0.022153714206069708,
"train/kl_divergence_loss": 0.11962890625,
"train/step_duration_seconds": 7.131852149963379,
"train/steps_per_hour": 322.9862794273204,
"train/total_elapsed_hours": 0.5634914285606808
},
{
"epoch": 0.43377777777777776,
"grad_norm": 4.28125,
"learning_rate": 1.4036973770503623e-05,
"loss": 1.1343,
"step": 183
},
{
"epoch": 0.43377777777777776,
"step": 183,
"train/combined_loss": 0.06743072532117367,
"train/cross_entropy_loss": 0.01742980582639575,
"train/kl_divergence_loss": 0.117431640625,
"train/step_duration_seconds": 7.159062623977661,
"train/steps_per_hour": 323.61883980323483,
"train/total_elapsed_hours": 0.5654800570673413
},
{
"epoch": 0.4361481481481482,
"grad_norm": 11.0625,
"learning_rate": 1.396079766039157e-05,
"loss": 1.0789,
"step": 184
},
{
"epoch": 0.4361481481481482,
"step": 184,
"train/combined_loss": 0.06741259898990393,
"train/cross_entropy_loss": 0.018187020672485232,
"train/kl_divergence_loss": 0.11663818359375,
"train/step_duration_seconds": 7.166898250579834,
"train/steps_per_hour": 324.24572304400647,
"train/total_elapsed_hours": 0.5674708621369468
},
{
"epoch": 0.43851851851851853,
"grad_norm": 9.25,
"learning_rate": 1.3884347962746949e-05,
"loss": 1.0786,
"step": 185
},
{
"epoch": 0.43851851851851853,
"step": 185,
"train/combined_loss": 0.06627723574638367,
"train/cross_entropy_loss": 0.022691186517477036,
"train/kl_divergence_loss": 0.10986328125,
"train/step_duration_seconds": 7.1953465938568115,
"train/steps_per_hour": 324.863715114037,
"train/total_elapsed_hours": 0.5694695695241292
},
{
"epoch": 0.4408888888888889,
"grad_norm": 12.25,
"learning_rate": 1.3807629958244498e-05,
"loss": 1.0604,
"step": 186
},
{
"epoch": 0.4408888888888889,
"step": 186,
"train/combined_loss": 0.06682039611041546,
"train/cross_entropy_loss": 0.02396061283070594,
"train/kl_divergence_loss": 0.10968017578125,
"train/step_duration_seconds": 7.1774444580078125,
"train/steps_per_hour": 325.4802166069086,
"train/total_elapsed_hours": 0.571463304095798
},
{
"epoch": 0.44325925925925924,
"grad_norm": 12.75,
"learning_rate": 1.373064894609194e-05,
"loss": 1.0691,
"step": 187
},
{
"epoch": 0.44325925925925924,
"step": 187,
"train/combined_loss": 0.07040743064135313,
"train/cross_entropy_loss": 0.02216251229401678,
"train/kl_divergence_loss": 0.11865234375,
"train/step_duration_seconds": 7.175534009933472,
"train/steps_per_hour": 326.09273309370775,
"train/total_elapsed_hours": 0.5734565079874463
},
{
"epoch": 0.44562962962962965,
"grad_norm": 7.28125,
"learning_rate": 1.3653410243663953e-05,
"loss": 1.1265,
"step": 188
},
{
"epoch": 0.44562962962962965,
"step": 188,
"train/combined_loss": 0.07080179871991277,
"train/cross_entropy_loss": 0.018343106610700488,
"train/kl_divergence_loss": 0.123260498046875,
"train/step_duration_seconds": 7.184988975524902,
"train/steps_per_hour": 326.69951532210575,
"train/total_elapsed_hours": 0.5754523382584253
},
{
"epoch": 0.448,
"grad_norm": 12.5625,
"learning_rate": 1.3575919186134862e-05,
"loss": 1.1328,
"step": 189
},
{
"epoch": 0.448,
"step": 189,
"train/combined_loss": 0.06690460816025734,
"train/cross_entropy_loss": 0.017476210254244506,
"train/kl_divergence_loss": 0.1163330078125,
"train/step_duration_seconds": 7.175193548202515,
"train/steps_per_hour": 327.30364537972457,
"train/total_elapsed_hours": 0.5774454475773705
},
{
"epoch": 0.45037037037037037,
"grad_norm": 13.875,
"learning_rate": 1.349818112611015e-05,
"loss": 1.0705,
"step": 190
},
{
"epoch": 0.45037037037037037,
"step": 190,
"train/combined_loss": 0.06556698912754655,
"train/cross_entropy_loss": 0.018676697509363294,
"train/kl_divergence_loss": 0.112457275390625,
"train/step_duration_seconds": 7.1955156326293945,
"train/steps_per_hour": 327.9004248735381,
"train/total_elapsed_hours": 0.5794442019197676
},
{
"epoch": 0.4527407407407407,
"grad_norm": 8.6875,
"learning_rate": 1.342020143325669e-05,
"loss": 1.0491,
"step": 191
},
{
"epoch": 0.4527407407407407,
"step": 191,
"train/combined_loss": 0.06877634488046169,
"train/cross_entropy_loss": 0.024057817296124995,
"train/kl_divergence_loss": 0.113494873046875,
"train/step_duration_seconds": 7.176729202270508,
"train/steps_per_hour": 328.49604967101124,
"train/total_elapsed_hours": 0.5814377378092872
},
{
"epoch": 0.45511111111111113,
"grad_norm": 19.0,
"learning_rate": 1.3341985493931877e-05,
"loss": 1.1004,
"step": 192
},
{
"epoch": 0.45511111111111113,
"step": 192,
"train/combined_loss": 0.06570423394441605,
"train/cross_entropy_loss": 0.024963149800896645,
"train/kl_divergence_loss": 0.1064453125,
"train/step_duration_seconds": 7.157531499862671,
"train/steps_per_hour": 329.0906120315698,
"train/total_elapsed_hours": 0.5834259410036935
},
{
"epoch": 0.4574814814814815,
"grad_norm": 23.625,
"learning_rate": 1.3263538710811559e-05,
"loss": 1.0513,
"step": 193
},
{
"epoch": 0.4574814814814815,
"step": 193,
"train/combined_loss": 0.0659687272273004,
"train/cross_entropy_loss": 0.024729204480536282,
"train/kl_divergence_loss": 0.107208251953125,
"train/step_duration_seconds": 7.212608098983765,
"train/steps_per_hour": 329.6725202740739,
"train/total_elapsed_hours": 0.5854294432534112
},
{
"epoch": 0.45985185185185184,
"grad_norm": 18.625,
"learning_rate": 1.3184866502516846e-05,
"loss": 1.0555,
"step": 194
},
{
"epoch": 0.45985185185185184,
"step": 194,
"train/combined_loss": 0.07015136396512389,
"train/cross_entropy_loss": 0.021467273705638945,
"train/kl_divergence_loss": 0.11883544921875,
"train/step_duration_seconds": 7.18483304977417,
"train/steps_per_hour": 330.25479673403044,
"train/total_elapsed_hours": 0.5874252302116818
},
{
"epoch": 0.4622222222222222,
"grad_norm": 10.75,
"learning_rate": 1.3105974303239838e-05,
"loss": 1.1224,
"step": 195
},
{
"epoch": 0.4622222222222222,
"step": 195,
"train/combined_loss": 0.0675101918168366,
"train/cross_entropy_loss": 0.01786339597310871,
"train/kl_divergence_loss": 0.117156982421875,
"train/step_duration_seconds": 7.176580190658569,
"train/steps_per_hour": 330.83441673191504,
"train/total_elapsed_hours": 0.589418724709087
},
{
"epoch": 0.4645925925925926,
"grad_norm": 11.875,
"learning_rate": 1.3026867562368262e-05,
"loss": 1.0802,
"step": 196
},
{
"epoch": 0.4645925925925926,
"step": 196,
"train/combined_loss": 0.06660758936777711,
"train/cross_entropy_loss": 0.017553551122546196,
"train/kl_divergence_loss": 0.11566162109375,
"train/step_duration_seconds": 7.176948308944702,
"train/steps_per_hour": 331.4100719369871,
"train/total_elapsed_hours": 0.5914123214615716
},
{
"epoch": 0.46696296296296297,
"grad_norm": 10.8125,
"learning_rate": 1.2947551744109044e-05,
"loss": 1.0657,
"step": 197
},
{
"epoch": 0.46696296296296297,
"step": 197,
"train/combined_loss": 0.07065472798421979,
"train/cross_entropy_loss": 0.01936120947357267,
"train/kl_divergence_loss": 0.1219482421875,
"train/step_duration_seconds": 7.178069353103638,
"train/steps_per_hour": 331.98168500479466,
"train/total_elapsed_hours": 0.5934062296152115
},
{
"epoch": 0.4693333333333333,
"grad_norm": 8.75,
"learning_rate": 1.2868032327110904e-05,
"loss": 1.1305,
"step": 198
},
{
"epoch": 0.4693333333333333,
"step": 198,
"train/combined_loss": 0.06524005252867937,
"train/cross_entropy_loss": 0.02314978139474988,
"train/kl_divergence_loss": 0.107330322265625,
"train/step_duration_seconds": 7.202880859375,
"train/steps_per_hour": 332.5456201761273,
"train/total_elapsed_hours": 0.5954070298539268
},
{
"epoch": 0.4717037037037037,
"grad_norm": 8.375,
"learning_rate": 1.2788314804085904e-05,
"loss": 1.0438,
"step": 199
},
{
"epoch": 0.4717037037037037,
"step": 199,
"train/combined_loss": 0.06441081315279007,
"train/cross_entropy_loss": 0.02335287816822529,
"train/kl_divergence_loss": 0.10546875,
"train/step_duration_seconds": 7.1744384765625,
"train/steps_per_hour": 333.1101833065549,
"train/total_elapsed_hours": 0.5973999294307497
},
{
"epoch": 0.4740740740740741,
"grad_norm": 9.3125,
"learning_rate": 1.2708404681430054e-05,
"loss": 1.0306,
"step": 200
},
{
"epoch": 0.4740740740740741,
"eval_combined_loss": 0.06667867637053132,
"eval_cross_entropy_loss": 0.021343029824395975,
"eval_kl_divergence_loss": 0.11201432291666667,
"eval_loss": 0.06667868047952652,
"eval_runtime": 220.1815,
"eval_samples_per_second": 6.813,
"eval_steps_per_second": 3.406,
"step": 200
},
{
"epoch": 0.4740740740740741,
"step": 200,
"train/combined_loss": 0.06510467641055584,
"train/cross_entropy_loss": 0.021170052816160023,
"train/kl_divergence_loss": 0.109039306640625,
"train/step_duration_seconds": 227.3877534866333,
"train/steps_per_hour": 302.77194026156553,
"train/total_elapsed_hours": 0.6605631942881478
},
{
"epoch": 0.47644444444444445,
"grad_norm": 3.859375,
"learning_rate": 1.2628307478842955e-05,
"loss": 1.0417,
"step": 201
},
{
"epoch": 0.47644444444444445,
"step": 201,
"train/combined_loss": 0.06429090350866318,
"train/cross_entropy_loss": 0.016765402629971504,
"train/kl_divergence_loss": 0.11181640625,
"train/step_duration_seconds": 7.160262107849121,
"train/steps_per_hour": 303.37234311953483,
"train/total_elapsed_hours": 0.6625521559847726
},
{
"epoch": 0.4788148148148148,
"grad_norm": 14.9375,
"learning_rate": 1.2548028728946548e-05,
"loss": 1.0287,
"step": 202
},
{
"epoch": 0.4788148148148148,
"step": 202,
"train/combined_loss": 0.06479989876970649,
"train/cross_entropy_loss": 0.01619648071937263,
"train/kl_divergence_loss": 0.1134033203125,
"train/step_duration_seconds": 7.1755499839782715,
"train/steps_per_hour": 303.96720953523806,
"train/total_elapsed_hours": 0.6645453643136554
},
{
"epoch": 0.48118518518518516,
"grad_norm": 16.625,
"learning_rate": 1.2467573976902936e-05,
"loss": 1.0368,
"step": 203
},
{
"epoch": 0.48118518518518516,
"step": 203,
"train/combined_loss": 0.066513289231807,
"train/cross_entropy_loss": 0.01745650765951723,
"train/kl_divergence_loss": 0.115570068359375,
"train/step_duration_seconds": 7.177863121032715,
"train/steps_per_hour": 304.55822459747407,
"train/total_elapsed_hours": 0.666539215180609
},
{
"epoch": 0.48355555555555557,
"grad_norm": 10.75,
"learning_rate": 1.238694878003138e-05,
"loss": 1.0642,
"step": 204
},
{
"epoch": 0.48355555555555557,
"step": 204,
"train/combined_loss": 0.06533924676477909,
"train/cross_entropy_loss": 0.0218833324033767,
"train/kl_divergence_loss": 0.108795166015625,
"train/step_duration_seconds": 7.178657293319702,
"train/steps_per_hour": 305.14561364915744,
"train/total_elapsed_hours": 0.6685332866509756
},
{
"epoch": 0.48592592592592593,
"grad_norm": 12.5625,
"learning_rate": 1.2306158707424402e-05,
"loss": 1.0454,
"step": 205
},
{
"epoch": 0.48592592592592593,
"step": 205,
"train/combined_loss": 0.06534022279083729,
"train/cross_entropy_loss": 0.024326685117557645,
"train/kl_divergence_loss": 0.106353759765625,
"train/step_duration_seconds": 7.156978607177734,
"train/steps_per_hour": 305.7322547589479,
"train/total_elapsed_hours": 0.6705213362640805
},
{
"epoch": 0.4882962962962963,
"grad_norm": 16.375,
"learning_rate": 1.2225209339563144e-05,
"loss": 1.0454,
"step": 206
},
{
"epoch": 0.4882962962962963,
"step": 206,
"train/combined_loss": 0.06436126446351409,
"train/cross_entropy_loss": 0.024413448525592685,
"train/kl_divergence_loss": 0.10430908203125,
"train/step_duration_seconds": 7.1745688915252686,
"train/steps_per_hour": 306.3132018994396,
"train/total_elapsed_hours": 0.672514272067282
},
{
"epoch": 0.49066666666666664,
"grad_norm": 12.25,
"learning_rate": 1.2144106267931877e-05,
"loss": 1.0298,
"step": 207
},
{
"epoch": 0.49066666666666664,
"step": 207,
"train/combined_loss": 0.06408173590898514,
"train/cross_entropy_loss": 0.021962294937111437,
"train/kl_divergence_loss": 0.106201171875,
"train/step_duration_seconds": 7.178909778594971,
"train/steps_per_hour": 306.89016742276357,
"train/total_elapsed_hours": 0.6745084136724472
},
{
"epoch": 0.49303703703703705,
"grad_norm": 11.4375,
"learning_rate": 1.2062855094631777e-05,
"loss": 1.0253,
"step": 208
},
{
"epoch": 0.49303703703703705,
"step": 208,
"train/combined_loss": 0.06565954210236669,
"train/cross_entropy_loss": 0.019319579121656716,
"train/kl_divergence_loss": 0.11199951171875,
"train/step_duration_seconds": 7.163602352142334,
"train/steps_per_hour": 307.46566401066696,
"train/total_elapsed_hours": 0.6764983032147089
},
{
"epoch": 0.4954074074074074,
"grad_norm": 9.375,
"learning_rate": 1.1981461431993978e-05,
"loss": 1.0506,
"step": 209
},
{
"epoch": 0.4954074074074074,
"step": 209,
"train/combined_loss": 0.0647607441060245,
"train/cross_entropy_loss": 0.01816285285167396,
"train/kl_divergence_loss": 0.111358642578125,
"train/step_duration_seconds": 7.1943066120147705,
"train/steps_per_hour": 308.03391278116044,
"train/total_elapsed_hours": 0.6784967217180464
},
{
"epoch": 0.49777777777777776,
"grad_norm": 9.25,
"learning_rate": 1.1899930902191904e-05,
"loss": 1.0362,
"step": 210
},
{
"epoch": 0.49777777777777776,
"step": 210,
"train/combined_loss": 0.06525316601619124,
"train/cross_entropy_loss": 0.01887303462717682,
"train/kl_divergence_loss": 0.11163330078125,
"train/step_duration_seconds": 7.1762001514434814,
"train/steps_per_hour": 308.6011048708952,
"train/total_elapsed_hours": 0.6804901106490029
},
{
"epoch": 0.5001481481481481,
"grad_norm": 10.0,
"learning_rate": 1.181826913685291e-05,
"loss": 1.0441,
"step": 211
},
{
"epoch": 0.5001481481481481,
"step": 211,
"train/combined_loss": 0.06376577913761139,
"train/cross_entropy_loss": 0.02038433833513409,
"train/kl_divergence_loss": 0.107147216796875,
"train/step_duration_seconds": 7.198651552200317,
"train/steps_per_hour": 309.1621585576703,
"train/total_elapsed_hours": 0.6824897360801697
},
{
"epoch": 0.5025185185185185,
"grad_norm": 6.84375,
"learning_rate": 1.1736481776669307e-05,
"loss": 1.0203,
"step": 212
},
{
"epoch": 0.5025185185185185,
"step": 212,
"train/combined_loss": 0.0649118721485138,
"train/cross_entropy_loss": 0.02285963052418083,
"train/kl_divergence_loss": 0.106964111328125,
"train/step_duration_seconds": 7.178544998168945,
"train/steps_per_hour": 309.72246139738036,
"train/total_elapsed_hours": 0.6844837763574388
},
{
"epoch": 0.5048888888888889,
"grad_norm": 13.125,
"learning_rate": 1.1654574471008712e-05,
"loss": 1.0386,
"step": 213
},
{
"epoch": 0.5048888888888889,
"step": 213,
"train/combined_loss": 0.07054620841518044,
"train/cross_entropy_loss": 0.022958871792070568,
"train/kl_divergence_loss": 0.118133544921875,
"train/step_duration_seconds": 7.145255088806152,
"train/steps_per_hour": 310.28368884315285,
"train/total_elapsed_hours": 0.6864685694376628
},
{
"epoch": 0.5072592592592593,
"grad_norm": 15.4375,
"learning_rate": 1.1572552877523855e-05,
"loss": 1.1287,
"step": 214
},
{
"epoch": 0.5072592592592593,
"step": 214,
"train/combined_loss": 0.06366756092756987,
"train/cross_entropy_loss": 0.022598788724280894,
"train/kl_divergence_loss": 0.104736328125,
"train/step_duration_seconds": 7.188857316970825,
"train/steps_per_hour": 310.8362118431136,
"train/total_elapsed_hours": 0.6884654742479325
},
{
"epoch": 0.5096296296296297,
"grad_norm": 11.3125,
"learning_rate": 1.1490422661761744e-05,
"loss": 1.0187,
"step": 215
},
{
"epoch": 0.5096296296296297,
"step": 215,
"train/combined_loss": 0.06394149828702211,
"train/cross_entropy_loss": 0.021193550201132894,
"train/kl_divergence_loss": 0.106689453125,
"train/step_duration_seconds": 7.176607847213745,
"train/steps_per_hour": 311.3870734397048,
"train/total_elapsed_hours": 0.690458976427714
},
{
"epoch": 0.512,
"grad_norm": 13.9375,
"learning_rate": 1.1408189496772369e-05,
"loss": 1.0231,
"step": 216
},
{
"epoch": 0.512,
"step": 216,
"train/combined_loss": 0.06880563637241721,
"train/cross_entropy_loss": 0.020667911507189274,
"train/kl_divergence_loss": 0.116943359375,
"train/step_duration_seconds": 7.1657140254974365,
"train/steps_per_hour": 311.93612646502845,
"train/total_elapsed_hours": 0.6924494525459077
},
{
"epoch": 0.5143703703703704,
"grad_norm": 13.875,
"learning_rate": 1.1325859062716795e-05,
"loss": 1.1009,
"step": 217
},
{
"epoch": 0.5143703703703704,
"step": 217,
"train/combined_loss": 0.06414779741317034,
"train/cross_entropy_loss": 0.018890082137659192,
"train/kl_divergence_loss": 0.109405517578125,
"train/step_duration_seconds": 7.167219400405884,
"train/steps_per_hour": 312.4818438221589,
"train/total_elapsed_hours": 0.6944403468237983
},
{
"epoch": 0.5167407407407407,
"grad_norm": 10.3125,
"learning_rate": 1.1243437046474854e-05,
"loss": 1.0264,
"step": 218
},
{
"epoch": 0.5167407407407407,
"step": 218,
"train/combined_loss": 0.0651923450641334,
"train/cross_entropy_loss": 0.01881242578383535,
"train/kl_divergence_loss": 0.111572265625,
"train/step_duration_seconds": 7.16694712638855,
"train/steps_per_hour": 313.02447507913155,
"train/total_elapsed_hours": 0.6964311654700174
},
{
"epoch": 0.5191111111111111,
"grad_norm": 11.4375,
"learning_rate": 1.1160929141252303e-05,
"loss": 1.0431,
"step": 219
},
{
"epoch": 0.5191111111111111,
"step": 219,
"train/combined_loss": 0.06706041377037764,
"train/cross_entropy_loss": 0.019374735886231065,
"train/kl_divergence_loss": 0.11474609375,
"train/step_duration_seconds": 7.1320960521698,
"train/steps_per_hour": 313.5683592299994,
"train/total_elapsed_hours": 0.6984123032622868
},
{
"epoch": 0.5214814814814814,
"grad_norm": 13.125,
"learning_rate": 1.1078341046187588e-05,
"loss": 1.073,
"step": 220
},
{
"epoch": 0.5214814814814814,
"step": 220,
"train/combined_loss": 0.06729212449863553,
"train/cross_entropy_loss": 0.0204179905122146,
"train/kl_divergence_loss": 0.114166259765625,
"train/step_duration_seconds": 7.177785634994507,
"train/steps_per_hour": 314.10347476777997,
"train/total_elapsed_hours": 0.7004061326053408
},
{
"epoch": 0.5238518518518519,
"grad_norm": 8.5625,
"learning_rate": 1.0995678465958168e-05,
"loss": 1.0767,
"step": 221
},
{
"epoch": 0.5238518518518519,
"step": 221,
"train/combined_loss": 0.06687723798677325,
"train/cross_entropy_loss": 0.022487382288090885,
"train/kl_divergence_loss": 0.11126708984375,
"train/step_duration_seconds": 7.177214860916138,
"train/steps_per_hour": 314.6356233732219,
"train/total_elapsed_hours": 0.7023998034000397
},
{
"epoch": 0.5262222222222223,
"grad_norm": 12.4375,
"learning_rate": 1.0912947110386484e-05,
"loss": 1.07,
"step": 222
},
{
"epoch": 0.5262222222222223,
"step": 222,
"train/combined_loss": 0.0689978925511241,
"train/cross_entropy_loss": 0.02382952021434903,
"train/kl_divergence_loss": 0.114166259765625,
"train/step_duration_seconds": 7.179453611373901,
"train/steps_per_hour": 315.1644814155012,
"train/total_elapsed_hours": 0.7043940960698658
},
{
"epoch": 0.5285925925925926,
"grad_norm": 15.25,
"learning_rate": 1.0830152694045553e-05,
"loss": 1.104,
"step": 223
},
{
"epoch": 0.5285925925925926,
"step": 223,
"train/combined_loss": 0.0666323616169393,
"train/cross_entropy_loss": 0.022882646531797945,
"train/kl_divergence_loss": 0.110382080078125,
"train/step_duration_seconds": 7.199350595474243,
"train/steps_per_hour": 315.68788327603403,
"train/total_elapsed_hours": 0.7063939156797198
},
{
"epoch": 0.530962962962963,
"grad_norm": 11.0625,
"learning_rate": 1.0747300935864245e-05,
"loss": 1.0661,
"step": 224
},
{
"epoch": 0.530962962962963,
"step": 224,
"train/combined_loss": 0.06530413264408708,
"train/cross_entropy_loss": 0.021294296951964498,
"train/kl_divergence_loss": 0.10931396484375,
"train/step_duration_seconds": 7.174778938293457,
"train/steps_per_hour": 316.211376709367,
"train/total_elapsed_hours": 0.7083869098292457
},
{
"epoch": 0.5333333333333333,
"grad_norm": 8.875,
"learning_rate": 1.0664397558732245e-05,
"loss": 1.0449,
"step": 225
},
{
"epoch": 0.5333333333333333,
"step": 225,
"train/combined_loss": 0.0637249075807631,
"train/cross_entropy_loss": 0.020424663205631077,
"train/kl_divergence_loss": 0.107025146484375,
"train/step_duration_seconds": 7.199081659317017,
"train/steps_per_hour": 316.72892290474715,
"train/total_elapsed_hours": 0.7103866547346115
},
{
"epoch": 0.5357037037037037,
"grad_norm": 9.5,
"learning_rate": 1.0581448289104759e-05,
"loss": 1.0196,
"step": 226
},
{
"epoch": 0.5357037037037037,
"step": 226,
"train/combined_loss": 0.06304276920855045,
"train/cross_entropy_loss": 0.019396084127947688,
"train/kl_divergence_loss": 0.106689453125,
"train/step_duration_seconds": 7.180125951766968,
"train/steps_per_hour": 317.2459083502985,
"train/total_elapsed_hours": 0.7123811341656578
},
{
"epoch": 0.538074074074074,
"grad_norm": 6.03125,
"learning_rate": 1.0498458856606972e-05,
"loss": 1.0087,
"step": 227
},
{
"epoch": 0.538074074074074,
"step": 227,
"train/combined_loss": 0.06372014014050364,
"train/cross_entropy_loss": 0.019560642424039543,
"train/kl_divergence_loss": 0.107879638671875,
"train/step_duration_seconds": 7.176924228668213,
"train/steps_per_hour": 317.76040263033815,
"train/total_elapsed_hours": 0.7143747242291768
},
{
"epoch": 0.5404444444444444,
"grad_norm": 9.5625,
"learning_rate": 1.0415434993638269e-05,
"loss": 1.0195,
"step": 228
},
{
"epoch": 0.5404444444444444,
"step": 228,
"train/combined_loss": 0.06290752394124866,
"train/cross_entropy_loss": 0.019766463432461023,
"train/kl_divergence_loss": 0.106048583984375,
"train/step_duration_seconds": 7.157245635986328,
"train/steps_per_hour": 318.27446193106454,
"train/total_elapsed_hours": 0.7163628480169508
},
{
"epoch": 0.5428148148148149,
"grad_norm": 13.3125,
"learning_rate": 1.0332382434976267e-05,
"loss": 1.0065,
"step": 229
},
{
"epoch": 0.5428148148148149,
"step": 229,
"train/combined_loss": 0.06345422472804785,
"train/cross_entropy_loss": 0.02067675837315619,
"train/kl_divergence_loss": 0.106231689453125,
"train/step_duration_seconds": 7.180465459823608,
"train/steps_per_hour": 318.782813491763,
"train/total_elapsed_hours": 0.7183574217557908
},
{
"epoch": 0.5451851851851852,
"grad_norm": 7.78125,
"learning_rate": 1.0249306917380731e-05,
"loss": 1.0153,
"step": 230
},
{
"epoch": 0.5451851851851852,
"step": 230,
"train/combined_loss": 0.07120905723422766,
"train/cross_entropy_loss": 0.022484031855128706,
"train/kl_divergence_loss": 0.11993408203125,
"train/step_duration_seconds": 7.1848931312561035,
"train/steps_per_hour": 319.287804772737,
"train/total_elapsed_hours": 0.7203532254033619
},
{
"epoch": 0.5475555555555556,
"grad_norm": 10.6875,
"learning_rate": 1.0166214179197265e-05,
"loss": 1.1393,
"step": 231
},
{
"epoch": 0.5475555555555556,
"step": 231,
"train/combined_loss": 0.0662783239968121,
"train/cross_entropy_loss": 0.021716808201745152,
"train/kl_divergence_loss": 0.11083984375,
"train/step_duration_seconds": 7.186007499694824,
"train/steps_per_hour": 319.7898684983009,
"train/total_elapsed_hours": 0.7223493385977215
},
{
"epoch": 0.5499259259259259,
"grad_norm": 12.9375,
"learning_rate": 1.0083109959960974e-05,
"loss": 1.0605,
"step": 232
},
{
"epoch": 0.5499259259259259,
"step": 232,
"train/combined_loss": 0.06430092873051763,
"train/cross_entropy_loss": 0.021363087464123964,
"train/kl_divergence_loss": 0.10723876953125,
"train/step_duration_seconds": 7.199544906616211,
"train/steps_per_hour": 320.28750235103695,
"train/total_elapsed_hours": 0.7243492121828927
},
{
"epoch": 0.5522962962962963,
"grad_norm": 11.625,
"learning_rate": 1e-05,
"loss": 1.0288,
"step": 233
},
{
"epoch": 0.5522962962962963,
"step": 233,
"train/combined_loss": 0.06349486531689763,
"train/cross_entropy_loss": 0.021063214750029147,
"train/kl_divergence_loss": 0.105926513671875,
"train/step_duration_seconds": 7.180714845657349,
"train/steps_per_hour": 320.78470593967154,
"train/total_elapsed_hours": 0.7263438551955753
},
{
"epoch": 0.5546666666666666,
"grad_norm": 5.09375,
"learning_rate": 9.916890040039031e-06,
"loss": 1.0159,
"step": 234
},
{
"epoch": 0.5546666666666666,
"step": 234,
"train/combined_loss": 0.0634385438170284,
"train/cross_entropy_loss": 0.02043177606537938,
"train/kl_divergence_loss": 0.1064453125,
"train/step_duration_seconds": 7.174914836883545,
"train/steps_per_hour": 321.2798969078875,
"train/total_elapsed_hours": 0.7283368870947096
},
{
"epoch": 0.557037037037037,
"grad_norm": 7.46875,
"learning_rate": 9.833785820802739e-06,
"loss": 1.015,
"step": 235
},
{
"epoch": 0.557037037037037,
"step": 235,
"train/combined_loss": 0.06511542480438948,
"train/cross_entropy_loss": 0.01987928501330316,
"train/kl_divergence_loss": 0.1103515625,
"train/step_duration_seconds": 7.176663398742676,
"train/steps_per_hour": 321.77217117883936,
"train/total_elapsed_hours": 0.7303304047054715
},
{
"epoch": 0.5594074074074074,
"grad_norm": 5.6875,
"learning_rate": 9.750693082619274e-06,
"loss": 1.0418,
"step": 236
},
{
"epoch": 0.5594074074074074,
"step": 236,
"train/combined_loss": 0.06252926005981863,
"train/cross_entropy_loss": 0.019498219480738044,
"train/kl_divergence_loss": 0.105560302734375,
"train/step_duration_seconds": 7.1978747844696045,
"train/steps_per_hour": 322.2591725337913,
"train/total_elapsed_hours": 0.7323298143678242
},
{
"epoch": 0.5617777777777778,
"grad_norm": 7.125,
"learning_rate": 9.667617565023734e-06,
"loss": 1.0005,
"step": 237
},
{
"epoch": 0.5617777777777778,
"step": 237,
"train/combined_loss": 0.06657313695177436,
"train/cross_entropy_loss": 0.020200716448016465,
"train/kl_divergence_loss": 0.112945556640625,
"train/step_duration_seconds": 7.176342487335205,
"train/steps_per_hour": 322.7461507067669,
"train/total_elapsed_hours": 0.7343232428365284
},
{
"epoch": 0.5641481481481482,
"grad_norm": 8.0625,
"learning_rate": 9.584565006361735e-06,
"loss": 1.0652,
"step": 238
},
{
"epoch": 0.5641481481481482,
"step": 238,
"train/combined_loss": 0.06469497783109546,
"train/cross_entropy_loss": 0.020838933647610247,
"train/kl_divergence_loss": 0.108551025390625,
"train/step_duration_seconds": 7.179325580596924,
"train/steps_per_hour": 323.23012833277306,
"train/total_elapsed_hours": 0.7363174999422497
},
{
"epoch": 0.5665185185185185,
"grad_norm": 3.3125,
"learning_rate": 9.501541143393028e-06,
"loss": 1.0351,
"step": 239
},
{
"epoch": 0.5665185185185185,
"step": 239,
"train/combined_loss": 0.06340441107749939,
"train/cross_entropy_loss": 0.021584213944151998,
"train/kl_divergence_loss": 0.105224609375,
"train/step_duration_seconds": 7.157347917556763,
"train/steps_per_hour": 323.7141681182355,
"train/total_elapsed_hours": 0.7383056521415711
},
{
"epoch": 0.5688888888888889,
"grad_norm": 12.8125,
"learning_rate": 9.418551710895243e-06,
"loss": 1.0145,
"step": 240
},
{
"epoch": 0.5688888888888889,
"step": 240,
"train/combined_loss": 0.06330876937136054,
"train/cross_entropy_loss": 0.02142344566527754,
"train/kl_divergence_loss": 0.105194091796875,
"train/step_duration_seconds": 7.197352170944214,
"train/steps_per_hour": 324.1907416897731,
"train/total_elapsed_hours": 0.7403049166335
},
{
"epoch": 0.5712592592592592,
"grad_norm": 14.3125,
"learning_rate": 9.33560244126776e-06,
"loss": 1.0129,
"step": 241
},
{
"epoch": 0.5712592592592592,
"step": 241,
"train/combined_loss": 0.0631729164160788,
"train/cross_entropy_loss": 0.021395884454250336,
"train/kl_divergence_loss": 0.104949951171875,
"train/step_duration_seconds": 7.1834022998809814,
"train/steps_per_hour": 324.666442947215,
"train/total_elapsed_hours": 0.7423003061612448
},
{
"epoch": 0.5736296296296296,
"grad_norm": 8.6875,
"learning_rate": 9.252699064135759e-06,
"loss": 1.0108,
"step": 242
},
{
"epoch": 0.5736296296296296,
"step": 242,
"train/combined_loss": 0.06237048772163689,
"train/cross_entropy_loss": 0.020492929965257645,
"train/kl_divergence_loss": 0.104248046875,
"train/step_duration_seconds": 7.1805572509765625,
"train/steps_per_hour": 325.13993881325246,
"train/total_elapsed_hours": 0.7442949053976271
},
{
"epoch": 0.576,
"grad_norm": 8.4375,
"learning_rate": 9.169847305954448e-06,
"loss": 0.9979,
"step": 243
},
{
"epoch": 0.576,
"step": 243,
"train/combined_loss": 0.0639663846231997,
"train/cross_entropy_loss": 0.020327785867266357,
"train/kl_divergence_loss": 0.10760498046875,
"train/step_duration_seconds": 7.17130970954895,
"train/steps_per_hour": 325.61202443655657,
"train/total_elapsed_hours": 0.7462869358725018
},
{
"epoch": 0.5783703703703704,
"grad_norm": 12.6875,
"learning_rate": 9.087052889613519e-06,
"loss": 1.0235,
"step": 244
},
{
"epoch": 0.5783703703703704,
"step": 244,
"train/combined_loss": 0.06497443979606032,
"train/cross_entropy_loss": 0.020543363760225475,
"train/kl_divergence_loss": 0.109405517578125,
"train/step_duration_seconds": 7.176990032196045,
"train/steps_per_hour": 326.0809089404372,
"train/total_elapsed_hours": 0.7482805442147785
},
{
"epoch": 0.5807407407407408,
"grad_norm": 8.9375,
"learning_rate": 9.004321534041836e-06,
"loss": 1.0396,
"step": 245
},
{
"epoch": 0.5807407407407408,
"step": 245,
"train/combined_loss": 0.06485264329239726,
"train/cross_entropy_loss": 0.020574429305270314,
"train/kl_divergence_loss": 0.109130859375,
"train/step_duration_seconds": 7.177663564682007,
"train/steps_per_hour": 326.54722020012497,
"train/total_elapsed_hours": 0.7502743396494124
},
{
"epoch": 0.5831111111111111,
"grad_norm": 5.09375,
"learning_rate": 8.921658953812416e-06,
"loss": 1.0376,
"step": 246
},
{
"epoch": 0.5831111111111111,
"step": 246,
"train/combined_loss": 0.06371145462617278,
"train/cross_entropy_loss": 0.020306207472458482,
"train/kl_divergence_loss": 0.10711669921875,
"train/step_duration_seconds": 7.178081512451172,
"train/steps_per_hour": 327.0110091897602,
"train/total_elapsed_hours": 0.7522682511806488
},
{
"epoch": 0.5854814814814815,
"grad_norm": 6.78125,
"learning_rate": 8.839070858747697e-06,
"loss": 1.0194,
"step": 247
},
{
"epoch": 0.5854814814814815,
"step": 247,
"train/combined_loss": 0.061982935993000865,
"train/cross_entropy_loss": 0.02035869611427188,
"train/kl_divergence_loss": 0.103607177734375,
"train/step_duration_seconds": 7.1770946979522705,
"train/steps_per_hour": 327.47246511357855,
"train/total_elapsed_hours": 0.7542618885967467
},
{
"epoch": 0.5878518518518518,
"grad_norm": 5.625,
"learning_rate": 8.756562953525151e-06,
"loss": 0.9917,
"step": 248
},
{
"epoch": 0.5878518518518518,
"step": 248,
"train/combined_loss": 0.06333467178046703,
"train/cross_entropy_loss": 0.020956451655365527,
"train/kl_divergence_loss": 0.105712890625,
"train/step_duration_seconds": 7.17914080619812,
"train/steps_per_hour": 327.9312416048011,
"train/total_elapsed_hours": 0.7562560943762461
},
{
"epoch": 0.5902222222222222,
"grad_norm": 2.671875,
"learning_rate": 8.674140937283208e-06,
"loss": 1.0134,
"step": 249
},
{
"epoch": 0.5902222222222222,
"step": 249,
"train/combined_loss": 0.06772775668650866,
"train/cross_entropy_loss": 0.02138080890290439,
"train/kl_divergence_loss": 0.11407470703125,
"train/step_duration_seconds": 7.154328107833862,
"train/steps_per_hour": 328.3905899588576,
"train/total_elapsed_hours": 0.7582434077395334
},
{
"epoch": 0.5925925925925926,
"grad_norm": 2.65625,
"learning_rate": 8.591810503227634e-06,
"loss": 1.0836,
"step": 250
},
{
"epoch": 0.5925925925925926,
"eval_combined_loss": 0.06404994005337358,
"eval_cross_entropy_loss": 0.020620713440080485,
"eval_kl_divergence_loss": 0.10747916666666667,
"eval_loss": 0.06404994428157806,
"eval_runtime": 220.1998,
"eval_samples_per_second": 6.812,
"eval_steps_per_second": 3.406,
"step": 250
},
{
"epoch": 0.5925925925925926,
"step": 250,
"train/combined_loss": 0.06369326380081475,
"train/cross_entropy_loss": 0.020636040600948036,
"train/kl_divergence_loss": 0.10675048828125,
"train/step_duration_seconds": 227.41642260551453,
"train/steps_per_hour": 304.35298930640323,
"train/total_elapsed_hours": 0.8214146362410651
},
{
"epoch": 0.5949629629629629,
"grad_norm": 3.0625,
"learning_rate": 8.509577338238255e-06,
"loss": 1.0191,
"step": 251
},
{
"epoch": 0.5949629629629629,
"step": 251,
"train/combined_loss": 0.0633452923502773,
"train/cross_entropy_loss": 0.020184239139780402,
"train/kl_divergence_loss": 0.10650634765625,
"train/step_duration_seconds": 7.193126916885376,
"train/steps_per_hour": 304.82890506058317,
"train/total_elapsed_hours": 0.8234127270513111
},
{
"epoch": 0.5973333333333334,
"grad_norm": 3.140625,
"learning_rate": 8.427447122476148e-06,
"loss": 1.0135,
"step": 252
},
{
"epoch": 0.5973333333333334,
"step": 252,
"train/combined_loss": 0.0627688483800739,
"train/cross_entropy_loss": 0.020252052345313132,
"train/kl_divergence_loss": 0.10528564453125,
"train/step_duration_seconds": 7.173017740249634,
"train/steps_per_hour": 305.3045828117105,
"train/total_elapsed_hours": 0.8254052319791582
},
{
"epoch": 0.5997037037037037,
"grad_norm": 2.359375,
"learning_rate": 8.34542552899129e-06,
"loss": 1.0043,
"step": 253
},
{
"epoch": 0.5997037037037037,
"step": 253,
"train/combined_loss": 0.06621215213090181,
"train/cross_entropy_loss": 0.02149291045498103,
"train/kl_divergence_loss": 0.110931396484375,
"train/step_duration_seconds": 7.1814353466033936,
"train/steps_per_hour": 305.77710542260405,
"train/total_elapsed_hours": 0.8274000751309925
},
{
"epoch": 0.6020740740740741,
"grad_norm": 2.703125,
"learning_rate": 8.263518223330698e-06,
"loss": 1.0594,
"step": 254
},
{
"epoch": 0.6020740740740741,
"step": 254,
"train/combined_loss": 0.06436787801794708,
"train/cross_entropy_loss": 0.021680091507732868,
"train/kl_divergence_loss": 0.1070556640625,
"train/step_duration_seconds": 7.1533849239349365,
"train/steps_per_hour": 306.25023211085755,
"train/total_elapsed_hours": 0.8293871264987521
},
{
"epoch": 0.6044444444444445,
"grad_norm": 3.6875,
"learning_rate": 8.181730863147094e-06,
"loss": 1.0299,
"step": 255
},
{
"epoch": 0.6044444444444445,
"step": 255,
"train/combined_loss": 0.0639684284105897,
"train/cross_entropy_loss": 0.020576014067046344,
"train/kl_divergence_loss": 0.10736083984375,
"train/step_duration_seconds": 7.142830848693848,
"train/steps_per_hour": 306.7221787763317,
"train/total_elapsed_hours": 0.8313712461789449
},
{
"epoch": 0.6068148148148148,
"grad_norm": 2.359375,
"learning_rate": 8.100069097808103e-06,
"loss": 1.0235,
"step": 256
},
{
"epoch": 0.6068148148148148,
"step": 256,
"train/combined_loss": 0.062460833229124546,
"train/cross_entropy_loss": 0.020459996070712805,
"train/kl_divergence_loss": 0.104461669921875,
"train/step_duration_seconds": 7.155627012252808,
"train/steps_per_hour": 307.1905678921464,
"train/total_elapsed_hours": 0.8333589203490152
},
{
"epoch": 0.6091851851851852,
"grad_norm": 3.3125,
"learning_rate": 8.018538568006027e-06,
"loss": 0.9994,
"step": 257
},
{
"epoch": 0.6091851851851852,
"step": 257,
"train/combined_loss": 0.06057729944586754,
"train/cross_entropy_loss": 0.020568661391735077,
"train/kl_divergence_loss": 0.1005859375,
"train/step_duration_seconds": 7.156771659851074,
"train/steps_per_hour": 307.6566108780016,
"train/total_elapsed_hours": 0.8353469124767515
},
{
"epoch": 0.6115555555555555,
"grad_norm": 5.40625,
"learning_rate": 7.937144905368226e-06,
"loss": 0.9692,
"step": 258
},
{
"epoch": 0.6115555555555555,
"step": 258,
"train/combined_loss": 0.06223560217767954,
"train/cross_entropy_loss": 0.020253673777915537,
"train/kl_divergence_loss": 0.104217529296875,
"train/step_duration_seconds": 7.184788942337036,
"train/steps_per_hour": 308.11757712721027,
"train/total_elapsed_hours": 0.8373426871829562
},
{
"epoch": 0.6139259259259259,
"grad_norm": 5.28125,
"learning_rate": 7.855893732068124e-06,
"loss": 0.9958,
"step": 259
},
{
"epoch": 0.6139259259259259,
"step": 259,
"train/combined_loss": 0.06394938984885812,
"train/cross_entropy_loss": 0.020690529490821064,
"train/kl_divergence_loss": 0.107208251953125,
"train/step_duration_seconds": 7.194268226623535,
"train/steps_per_hour": 308.5753831613492,
"train/total_elapsed_hours": 0.839341095023685
},
{
"epoch": 0.6162962962962963,
"grad_norm": 2.875,
"learning_rate": 7.774790660436857e-06,
"loss": 1.0232,
"step": 260
},
{
"epoch": 0.6162962962962963,
"step": 260,
"train/combined_loss": 0.060645608929917216,
"train/cross_entropy_loss": 0.020613725995644927,
"train/kl_divergence_loss": 0.100677490234375,
"train/step_duration_seconds": 7.176680564880371,
"train/steps_per_hour": 309.0328088515058,
"train/total_elapsed_hours": 0.8413346174028185
},
{
"epoch": 0.6186666666666667,
"grad_norm": 3.09375,
"learning_rate": 7.6938412925756e-06,
"loss": 0.9703,
"step": 261
},
{
"epoch": 0.6186666666666667,
"step": 261,
"train/combined_loss": 0.06048685312271118,
"train/cross_entropy_loss": 0.02075397619046271,
"train/kl_divergence_loss": 0.1002197265625,
"train/step_duration_seconds": 7.175957441329956,
"train/steps_per_hour": 309.48814566254214,
"train/total_elapsed_hours": 0.843327938914299
},
{
"epoch": 0.621037037037037,
"grad_norm": 4.84375,
"learning_rate": 7.613051219968624e-06,
"loss": 0.9678,
"step": 262
},
{
"epoch": 0.621037037037037,
"step": 262,
"train/combined_loss": 0.06825654301792383,
"train/cross_entropy_loss": 0.02109560859389603,
"train/kl_divergence_loss": 0.11541748046875,
"train/step_duration_seconds": 7.180307388305664,
"train/steps_per_hour": 309.94089201150274,
"train/total_elapsed_hours": 0.8453224687443839
},
{
"epoch": 0.6234074074074074,
"grad_norm": 2.875,
"learning_rate": 7.532426023097063e-06,
"loss": 1.0921,
"step": 263
},
{
"epoch": 0.6234074074074074,
"step": 263,
"train/combined_loss": 0.06100269500166178,
"train/cross_entropy_loss": 0.020168230053968728,
"train/kl_divergence_loss": 0.101837158203125,
"train/step_duration_seconds": 7.166715145111084,
"train/steps_per_hour": 310.39288999163625,
"train/total_elapsed_hours": 0.8473132229513592
},
{
"epoch": 0.6257777777777778,
"grad_norm": 5.75,
"learning_rate": 7.451971271053455e-06,
"loss": 0.976,
"step": 264
},
{
"epoch": 0.6257777777777778,
"step": 264,
"train/combined_loss": 0.06548905186355114,
"train/cross_entropy_loss": 0.021114823641255498,
"train/kl_divergence_loss": 0.10986328125,
"train/step_duration_seconds": 7.166426181793213,
"train/steps_per_hour": 310.8427983981813,
"train/total_elapsed_hours": 0.8493038968907463
},
{
"epoch": 0.6281481481481481,
"grad_norm": 3.21875,
"learning_rate": 7.371692521157048e-06,
"loss": 1.0478,
"step": 265
},
{
"epoch": 0.6281481481481481,
"step": 265,
"train/combined_loss": 0.06573160830885172,
"train/cross_entropy_loss": 0.021050618845038116,
"train/kl_divergence_loss": 0.11041259765625,
"train/step_duration_seconds": 7.166961431503296,
"train/steps_per_hour": 311.2905482984119,
"train/total_elapsed_hours": 0.8512947195106082
},
{
"epoch": 0.6305185185185185,
"grad_norm": 5.25,
"learning_rate": 7.291595318569951e-06,
"loss": 1.0517,
"step": 266
},
{
"epoch": 0.6305185185185185,
"step": 266,
"train/combined_loss": 0.06105546629987657,
"train/cross_entropy_loss": 0.020487397676333785,
"train/kl_divergence_loss": 0.10162353515625,
"train/step_duration_seconds": 7.191303968429565,
"train/steps_per_hour": 311.733738567441,
"train/total_elapsed_hours": 0.8532923039462831
},
{
"epoch": 0.6328888888888888,
"grad_norm": 5.125,
"learning_rate": 7.2116851959140965e-06,
"loss": 0.9769,
"step": 267
},
{
"epoch": 0.6328888888888888,
"step": 267,
"train/combined_loss": 0.06089310604147613,
"train/cross_entropy_loss": 0.020650955964811146,
"train/kl_divergence_loss": 0.10113525390625,
"train/step_duration_seconds": 7.178247451782227,
"train/steps_per_hour": 312.17618240392204,
"train/total_elapsed_hours": 0.8552862615717782
},
{
"epoch": 0.6352592592592593,
"grad_norm": 5.3125,
"learning_rate": 7.131967672889101e-06,
"loss": 0.9743,
"step": 268
},
{
"epoch": 0.6352592592592593,
"step": 268,
"train/combined_loss": 0.06254504946991801,
"train/cross_entropy_loss": 0.02102515858132392,
"train/kl_divergence_loss": 0.10406494140625,
"train/step_duration_seconds": 7.1988208293914795,
"train/steps_per_hour": 312.6144841102412,
"train/total_elapsed_hours": 0.8572859340243869
},
{
"epoch": 0.6376296296296297,
"grad_norm": 3.8125,
"learning_rate": 7.052448255890958e-06,
"loss": 1.0007,
"step": 269
},
{
"epoch": 0.6376296296296297,
"step": 269,
"train/combined_loss": 0.06616902281530201,
"train/cross_entropy_loss": 0.02134561410639435,
"train/kl_divergence_loss": 0.110992431640625,
"train/step_duration_seconds": 7.174272060394287,
"train/steps_per_hour": 313.05323016517417,
"train/total_elapsed_hours": 0.8592787873744965
},
{
"epoch": 0.64,
"grad_norm": 4.3125,
"learning_rate": 6.973132437631743e-06,
"loss": 1.0587,
"step": 270
},
{
"epoch": 0.64,
"step": 270,
"train/combined_loss": 0.0661325603723526,
"train/cross_entropy_loss": 0.021059064893051982,
"train/kl_divergence_loss": 0.1112060546875,
"train/step_duration_seconds": 7.177139759063721,
"train/steps_per_hour": 313.48965589105467,
"train/total_elapsed_hours": 0.8612724373075697
},
{
"epoch": 0.6423703703703704,
"grad_norm": 3.859375,
"learning_rate": 6.8940256967601625e-06,
"loss": 1.0581,
"step": 271
},
{
"epoch": 0.6423703703703704,
"step": 271,
"train/combined_loss": 0.06747469631955028,
"train/cross_entropy_loss": 0.021271413774229586,
"train/kl_divergence_loss": 0.113677978515625,
"train/step_duration_seconds": 7.159501552581787,
"train/steps_per_hour": 313.92584752922323,
"train/total_elapsed_hours": 0.8632611877388424
},
{
"epoch": 0.6447407407407407,
"grad_norm": 6.90625,
"learning_rate": 6.815133497483157e-06,
"loss": 1.0796,
"step": 272
},
{
"epoch": 0.6447407407407407,
"step": 272,
"train/combined_loss": 0.06747178034856915,
"train/cross_entropy_loss": 0.02111299301031977,
"train/kl_divergence_loss": 0.11383056640625,
"train/step_duration_seconds": 7.15754246711731,
"train/steps_per_hour": 314.36023173496596,
"train/total_elapsed_hours": 0.8652493939797083
},
{
"epoch": 0.6471111111111111,
"grad_norm": 6.0625,
"learning_rate": 6.736461289188445e-06,
"loss": 1.0795,
"step": 273
},
{
"epoch": 0.6471111111111111,
"step": 273,
"train/combined_loss": 0.06073831953108311,
"train/cross_entropy_loss": 0.020249835564754903,
"train/kl_divergence_loss": 0.101226806640625,
"train/step_duration_seconds": 7.178404092788696,
"train/steps_per_hour": 314.7905207894922,
"train/total_elapsed_hours": 0.8672433951165941
},
{
"epoch": 0.6494814814814814,
"grad_norm": 4.9375,
"learning_rate": 6.6580145060681255e-06,
"loss": 0.9718,
"step": 274
},
{
"epoch": 0.6494814814814814,
"step": 274,
"train/combined_loss": 0.06485259486362338,
"train/cross_entropy_loss": 0.020971058984287083,
"train/kl_divergence_loss": 0.108734130859375,
"train/step_duration_seconds": 7.158320426940918,
"train/steps_per_hour": 315.22085880551066,
"train/total_elapsed_hours": 0.8692318174574111
},
{
"epoch": 0.6518518518518519,
"grad_norm": 4.5625,
"learning_rate": 6.579798566743314e-06,
"loss": 1.0376,
"step": 275
},
{
"epoch": 0.6518518518518519,
"step": 275,
"train/combined_loss": 0.05971498414874077,
"train/cross_entropy_loss": 0.020308872684836388,
"train/kl_divergence_loss": 0.09912109375,
"train/step_duration_seconds": 7.181738376617432,
"train/steps_per_hour": 315.64687567999425,
"train/total_elapsed_hours": 0.8712267447842492
},
{
"epoch": 0.6542222222222223,
"grad_norm": 5.625,
"learning_rate": 6.501818873889856e-06,
"loss": 0.9554,
"step": 276
},
{
"epoch": 0.6542222222222223,
"step": 276,
"train/combined_loss": 0.06411758903414011,
"train/cross_entropy_loss": 0.020782789448276162,
"train/kl_divergence_loss": 0.107452392578125,
"train/step_duration_seconds": 7.175511598587036,
"train/steps_per_hour": 316.07157210067743,
"train/total_elapsed_hours": 0.8732199424505234
},
{
"epoch": 0.6565925925925926,
"grad_norm": 1.8203125,
"learning_rate": 6.424080813865139e-06,
"loss": 1.0259,
"step": 277
},
{
"epoch": 0.6565925925925926,
"step": 277,
"train/combined_loss": 0.05957591161131859,
"train/cross_entropy_loss": 0.020519012585282326,
"train/kl_divergence_loss": 0.0986328125,
"train/step_duration_seconds": 7.175374507904053,
"train/steps_per_hour": 316.4943478972161,
"train/total_elapsed_hours": 0.8752131020360523
},
{
"epoch": 0.658962962962963,
"grad_norm": 5.40625,
"learning_rate": 6.34658975633605e-06,
"loss": 0.9532,
"step": 278
},
{
"epoch": 0.658962962962963,
"step": 278,
"train/combined_loss": 0.06180824153125286,
"train/cross_entropy_loss": 0.02071121137123555,
"train/kl_divergence_loss": 0.1029052734375,
"train/step_duration_seconds": 7.177731037139893,
"train/steps_per_hour": 316.9149659696548,
"train/total_elapsed_hours": 0.8772069162130356
},
{
"epoch": 0.6613333333333333,
"grad_norm": 3.59375,
"learning_rate": 6.269351053908061e-06,
"loss": 0.9889,
"step": 279
},
{
"epoch": 0.6613333333333333,
"step": 279,
"train/combined_loss": 0.062360771000385284,
"train/cross_entropy_loss": 0.020839707227423787,
"train/kl_divergence_loss": 0.1038818359375,
"train/step_duration_seconds": 7.177475214004517,
"train/steps_per_hour": 317.3337019711024,
"train/total_elapsed_hours": 0.8792006593280368
},
{
"epoch": 0.6637037037037037,
"grad_norm": 5.40625,
"learning_rate": 6.192370041755505e-06,
"loss": 0.9978,
"step": 280
},
{
"epoch": 0.6637037037037037,
"step": 280,
"train/combined_loss": 0.06398251187056303,
"train/cross_entropy_loss": 0.020573665155097842,
"train/kl_divergence_loss": 0.107391357421875,
"train/step_duration_seconds": 7.203123092651367,
"train/steps_per_hour": 317.7479741776063,
"train/total_elapsed_hours": 0.8812015268537733
},
{
"epoch": 0.666074074074074,
"grad_norm": 5.96875,
"learning_rate": 6.115652037253054e-06,
"loss": 1.0237,
"step": 281
},
{
"epoch": 0.666074074074074,
"step": 281,
"train/combined_loss": 0.05968505213968456,
"train/cross_entropy_loss": 0.02018797560594976,
"train/kl_divergence_loss": 0.09918212890625,
"train/step_duration_seconds": 7.152847766876221,
"train/steps_per_hour": 318.16540023581325,
"train/total_elapsed_hours": 0.883188429011239
},
{
"epoch": 0.6684444444444444,
"grad_norm": 4.65625,
"learning_rate": 6.039202339608432e-06,
"loss": 0.955,
"step": 282
},
{
"epoch": 0.6684444444444444,
"step": 282,
"train/combined_loss": 0.07058762316592038,
"train/cross_entropy_loss": 0.02191255264915526,
"train/kl_divergence_loss": 0.1192626953125,
"train/step_duration_seconds": 7.178384065628052,
"train/steps_per_hour": 318.578399398522,
"train/total_elapsed_hours": 0.8851824245850245
},
{
"epoch": 0.6708148148148149,
"grad_norm": 8.5625,
"learning_rate": 5.963026229496378e-06,
"loss": 1.1294,
"step": 283
},
{
"epoch": 0.6708148148148149,
"step": 283,
"train/combined_loss": 0.06238678935915232,
"train/cross_entropy_loss": 0.021471575018949807,
"train/kl_divergence_loss": 0.103302001953125,
"train/step_duration_seconds": 7.157472133636475,
"train/steps_per_hour": 318.9916306953401,
"train/total_elapsed_hours": 0.8871706112888125
},
{
"epoch": 0.6731851851851852,
"grad_norm": 10.6875,
"learning_rate": 5.887128968693887e-06,
"loss": 0.9982,
"step": 284
},
{
"epoch": 0.6731851851851852,
"step": 284,
"train/combined_loss": 0.06445175595581532,
"train/cross_entropy_loss": 0.021542673697695136,
"train/kl_divergence_loss": 0.10736083984375,
"train/step_duration_seconds": 7.179536581039429,
"train/steps_per_hour": 319.40081235138257,
"train/total_elapsed_hours": 0.8891649270057678
},
{
"epoch": 0.6755555555555556,
"grad_norm": 7.71875,
"learning_rate": 5.811515799716754e-06,
"loss": 1.0312,
"step": 285
},
{
"epoch": 0.6755555555555556,
"step": 285,
"train/combined_loss": 0.060818693600595,
"train/cross_entropy_loss": 0.020441102096810937,
"train/kl_divergence_loss": 0.1011962890625,
"train/step_duration_seconds": 7.156339406967163,
"train/steps_per_hour": 319.81047503815245,
"train/total_elapsed_hours": 0.8911527990632587
},
{
"epoch": 0.6779259259259259,
"grad_norm": 2.265625,
"learning_rate": 5.736191945457463e-06,
"loss": 0.9731,
"step": 286
},
{
"epoch": 0.6779259259259259,
"step": 286,
"train/combined_loss": 0.06460838648490608,
"train/cross_entropy_loss": 0.021092993556521833,
"train/kl_divergence_loss": 0.108123779296875,
"train/step_duration_seconds": 7.178438901901245,
"train/steps_per_hour": 320.21611322984927,
"train/total_elapsed_hours": 0.8931468098693424
},
{
"epoch": 0.6802962962962963,
"grad_norm": 3.359375,
"learning_rate": 5.66116260882442e-06,
"loss": 1.0337,
"step": 287
},
{
"epoch": 0.6802962962962963,
"step": 287,
"train/combined_loss": 0.06716977385804057,
"train/cross_entropy_loss": 0.02099726488813758,
"train/kl_divergence_loss": 0.11334228515625,
"train/step_duration_seconds": 7.17467188835144,
"train/steps_per_hour": 320.6203190221968,
"train/total_elapsed_hours": 0.8951397742827734
},
{
"epoch": 0.6826666666666666,
"grad_norm": 3.203125,
"learning_rate": 5.586432972382561e-06,
"loss": 1.0747,
"step": 288
},
{
"epoch": 0.6826666666666666,
"step": 288,
"train/combined_loss": 0.06678420398384333,
"train/cross_entropy_loss": 0.02086699299979955,
"train/kl_divergence_loss": 0.112701416015625,
"train/step_duration_seconds": 7.181233882904053,
"train/steps_per_hour": 321.0220766963932,
"train/total_elapsed_hours": 0.897134561472469
},
{
"epoch": 0.685037037037037,
"grad_norm": 3.8125,
"learning_rate": 5.512008197995379e-06,
"loss": 1.0685,
"step": 289
},
{
"epoch": 0.685037037037037,
"step": 289,
"train/combined_loss": 0.06064820708706975,
"train/cross_entropy_loss": 0.020740994019433856,
"train/kl_divergence_loss": 0.100555419921875,
"train/step_duration_seconds": 7.175872325897217,
"train/steps_per_hour": 321.4225841160013,
"train/total_elapsed_hours": 0.8991278593407737
},
{
"epoch": 0.6874074074074074,
"grad_norm": 1.890625,
"learning_rate": 5.43789342646837e-06,
"loss": 0.9704,
"step": 290
},
{
"epoch": 0.6874074074074074,
"step": 290,
"train/combined_loss": 0.06774787046015263,
"train/cross_entropy_loss": 0.021848278120160103,
"train/kl_divergence_loss": 0.1136474609375,
"train/step_duration_seconds": 7.154938459396362,
"train/steps_per_hour": 321.8233964113103,
"train/total_elapsed_hours": 0.9011153422461615
},
{
"epoch": 0.6897777777777778,
"grad_norm": 2.453125,
"learning_rate": 5.364093777193944e-06,
"loss": 1.084,
"step": 291
},
{
"epoch": 0.6897777777777778,
"step": 291,
"train/combined_loss": 0.06057584332302213,
"train/cross_entropy_loss": 0.020870924927294254,
"train/kl_divergence_loss": 0.10028076171875,
"train/step_duration_seconds": 7.165515184402466,
"train/steps_per_hour": 322.2213962971687,
"train/total_elapsed_hours": 0.9031057631307178
},
{
"epoch": 0.6921481481481482,
"grad_norm": 4.4375,
"learning_rate": 5.290614347797802e-06,
"loss": 0.9692,
"step": 292
},
{
"epoch": 0.6921481481481482,
"step": 292,
"train/combined_loss": 0.0660083363763988,
"train/cross_entropy_loss": 0.02096320828422904,
"train/kl_divergence_loss": 0.111053466796875,
"train/step_duration_seconds": 7.173615455627441,
"train/steps_per_hour": 322.61684365124677,
"train/total_elapsed_hours": 0.9050984340906143
},
{
"epoch": 0.6945185185185185,
"grad_norm": 2.5625,
"learning_rate": 5.217460213786822e-06,
"loss": 1.0561,
"step": 293
},
{
"epoch": 0.6945185185185185,
"step": 293,
"train/combined_loss": 0.06140920426696539,
"train/cross_entropy_loss": 0.020706592011265457,
"train/kl_divergence_loss": 0.10211181640625,
"train/step_duration_seconds": 7.179595708847046,
"train/steps_per_hour": 323.00996205395205,
"train/total_elapsed_hours": 0.9070927662319607
},
{
"epoch": 0.6968888888888889,
"grad_norm": 2.34375,
"learning_rate": 5.144636428198477e-06,
"loss": 0.9825,
"step": 294
},
{
"epoch": 0.6968888888888889,
"step": 294,
"train/combined_loss": 0.061046687653288245,
"train/cross_entropy_loss": 0.020591912092640996,
"train/kl_divergence_loss": 0.10150146484375,
"train/step_duration_seconds": 7.179559946060181,
"train/steps_per_hour": 323.40135916435133,
"train/total_elapsed_hours": 0.9090870884391996
},
{
"epoch": 0.6992592592592592,
"grad_norm": 3.296875,
"learning_rate": 5.072148021251822e-06,
"loss": 0.9767,
"step": 295
},
{
"epoch": 0.6992592592592592,
"step": 295,
"train/combined_loss": 0.06156940385699272,
"train/cross_entropy_loss": 0.020782849984243512,
"train/kl_divergence_loss": 0.10235595703125,
"train/step_duration_seconds": 7.190384149551392,
"train/steps_per_hour": 323.78997420643765,
"train/total_elapsed_hours": 0.9110844173696306
},
{
"epoch": 0.7016296296296296,
"grad_norm": 3.484375,
"learning_rate": 5.000000000000003e-06,
"loss": 0.9851,
"step": 296
},
{
"epoch": 0.7016296296296296,
"step": 296,
"train/combined_loss": 0.05902678519487381,
"train/cross_entropy_loss": 0.02039731852710247,
"train/kl_divergence_loss": 0.09765625,
"train/step_duration_seconds": 7.165556907653809,
"train/steps_per_hour": 324.1793375982033,
"train/total_elapsed_hours": 0.9130748498439789
},
{
"epoch": 0.704,
"grad_norm": 3.515625,
"learning_rate": 4.92819734798441e-06,
"loss": 0.9444,
"step": 297
},
{
"epoch": 0.704,
"step": 297,
"train/combined_loss": 0.0633976545650512,
"train/cross_entropy_loss": 0.0211129350354895,
"train/kl_divergence_loss": 0.105682373046875,
"train/step_duration_seconds": 7.194438457489014,
"train/steps_per_hour": 324.5641615666989,
"train/total_elapsed_hours": 0.9150733049710592
},
{
"epoch": 0.7063703703703703,
"grad_norm": 5.28125,
"learning_rate": 4.856745024890466e-06,
"loss": 1.0144,
"step": 298
},
{
"epoch": 0.7063703703703703,
"step": 298,
"train/combined_loss": 0.06398635334335268,
"train/cross_entropy_loss": 0.0211001462303102,
"train/kl_divergence_loss": 0.10687255859375,
"train/step_duration_seconds": 7.178032159805298,
"train/steps_per_hour": 324.9489231458107,
"train/total_elapsed_hours": 0.9170672027932273
},
{
"epoch": 0.7087407407407408,
"grad_norm": 3.671875,
"learning_rate": 4.78564796620502e-06,
"loss": 1.0238,
"step": 299
},
{
"epoch": 0.7087407407407408,
"step": 299,
"train/combined_loss": 0.058621928095817566,
"train/cross_entropy_loss": 0.020564164966344833,
"train/kl_divergence_loss": 0.0966796875,
"train/step_duration_seconds": 7.176652193069458,
"train/steps_per_hour": 325.33215093908757,
"train/total_elapsed_hours": 0.9190607172913021
},
{
"epoch": 0.7111111111111111,
"grad_norm": 2.484375,
"learning_rate": 4.714911082875446e-06,
"loss": 0.938,
"step": 300
},
{
"epoch": 0.7111111111111111,
"eval_combined_loss": 0.06135369462271532,
"eval_cross_entropy_loss": 0.020845410078763962,
"eval_kl_divergence_loss": 0.10186197916666667,
"eval_loss": 0.06135369837284088,
"eval_runtime": 220.2651,
"eval_samples_per_second": 6.81,
"eval_steps_per_second": 3.405,
"step": 300
},
{
"epoch": 0.7111111111111111,
"step": 300,
"train/combined_loss": 0.06502728187479079,
"train/cross_entropy_loss": 0.02125939668621868,
"train/kl_divergence_loss": 0.108795166015625,
"train/step_duration_seconds": 227.4951696395874,
"train/steps_per_hour": 305.4200389971287,
"train/total_elapsed_hours": 0.9822538199689653
},
{
"epoch": 0.7134814814814815,
"grad_norm": 3.328125,
"learning_rate": 4.644539260970417e-06,
"loss": 1.0404,
"step": 301
},
{
"epoch": 0.7134814814814815,
"step": 301,
"train/combined_loss": 0.05955993290990591,
"train/cross_entropy_loss": 0.020364978816360235,
"train/kl_divergence_loss": 0.0987548828125,
"train/step_duration_seconds": 7.195945978164673,
"train/steps_per_hour": 305.81577462801977,
"train/total_elapsed_hours": 0.9842526938517888
},
{
"epoch": 0.7158518518518519,
"grad_norm": 2.375,
"learning_rate": 4.5745373613424075e-06,
"loss": 0.953,
"step": 302
},
{
"epoch": 0.7158518518518519,
"step": 302,
"train/combined_loss": 0.06077713891863823,
"train/cross_entropy_loss": 0.020449545118026435,
"train/kl_divergence_loss": 0.101104736328125,
"train/step_duration_seconds": 7.195575952529907,
"train/steps_per_hour": 306.2099380662498,
"train/total_elapsed_hours": 0.9862514649497138
},
{
"epoch": 0.7182222222222222,
"grad_norm": 4.125,
"learning_rate": 4.504910219291941e-06,
"loss": 0.9724,
"step": 303
},
{
"epoch": 0.7182222222222222,
"step": 303,
"train/combined_loss": 0.05961341969668865,
"train/cross_entropy_loss": 0.020319371833465993,
"train/kl_divergence_loss": 0.098907470703125,
"train/step_duration_seconds": 7.177309036254883,
"train/steps_per_hour": 306.6040813356026,
"train/total_elapsed_hours": 0.9882451619042291
},
{
"epoch": 0.7205925925925926,
"grad_norm": 3.734375,
"learning_rate": 4.435662644233594e-06,
"loss": 0.9538,
"step": 304
},
{
"epoch": 0.7205925925925926,
"step": 304,
"train/combined_loss": 0.059920859755948186,
"train/cross_entropy_loss": 0.020384933333843946,
"train/kl_divergence_loss": 0.099456787109375,
"train/step_duration_seconds": 7.177339553833008,
"train/steps_per_hour": 306.9966348805165,
"train/total_elapsed_hours": 0.9902388673358493
},
{
"epoch": 0.7229629629629629,
"grad_norm": 3.734375,
"learning_rate": 4.3667994193637794e-06,
"loss": 0.9587,
"step": 305
},
{
"epoch": 0.7229629629629629,
"step": 305,
"train/combined_loss": 0.0592358959838748,
"train/cross_entropy_loss": 0.020632435218431056,
"train/kl_divergence_loss": 0.09783935546875,
"train/step_duration_seconds": 7.177339315414429,
"train/steps_per_hour": 307.38761092036276,
"train/total_elapsed_hours": 0.9922325727012422
},
{
"epoch": 0.7253333333333334,
"grad_norm": 2.609375,
"learning_rate": 4.298325301330383e-06,
"loss": 0.9478,
"step": 306
},
{
"epoch": 0.7253333333333334,
"step": 306,
"train/combined_loss": 0.06081084324978292,
"train/cross_entropy_loss": 0.020883160177618265,
"train/kl_divergence_loss": 0.100738525390625,
"train/step_duration_seconds": 7.180126905441284,
"train/steps_per_hour": 307.776779219795,
"train/total_elapsed_hours": 0.9942270523971981
},
{
"epoch": 0.7277037037037037,
"grad_norm": 2.84375,
"learning_rate": 4.23024501990417e-06,
"loss": 0.973,
"step": 307
},
{
"epoch": 0.7277037037037037,
"step": 307,
"train/combined_loss": 0.05935222376137972,
"train/cross_entropy_loss": 0.020437843864783645,
"train/kl_divergence_loss": 0.0982666015625,
"train/step_duration_seconds": 7.176982641220093,
"train/steps_per_hour": 308.1646594287626,
"train/total_elapsed_hours": 0.996220658686426
},
{
"epoch": 0.7300740740740741,
"grad_norm": 2.671875,
"learning_rate": 4.162563277652104e-06,
"loss": 0.9496,
"step": 308
},
{
"epoch": 0.7300740740740741,
"step": 308,
"train/combined_loss": 0.06314325472339988,
"train/cross_entropy_loss": 0.020665171090513468,
"train/kl_divergence_loss": 0.105621337890625,
"train/step_duration_seconds": 7.161325693130493,
"train/steps_per_hour": 308.55233465191134,
"train/total_elapsed_hours": 0.9982099158234067
},
{
"epoch": 0.7324444444444445,
"grad_norm": 4.65625,
"learning_rate": 4.095284749612504e-06,
"loss": 1.0103,
"step": 309
},
{
"epoch": 0.7324444444444445,
"step": 309,
"train/combined_loss": 0.059852408710867167,
"train/cross_entropy_loss": 0.02018699492327869,
"train/kl_divergence_loss": 0.099517822265625,
"train/step_duration_seconds": 7.173473358154297,
"train/steps_per_hour": 308.93742555494475,
"train/total_elapsed_hours": 1.0002025473117828
},
{
"epoch": 0.7348148148148148,
"grad_norm": 6.5,
"learning_rate": 4.028414082972141e-06,
"loss": 0.9576,
"step": 310
},
{
"epoch": 0.7348148148148148,
"step": 310,
"train/combined_loss": 0.061497040558606386,
"train/cross_entropy_loss": 0.02069915970787406,
"train/kl_divergence_loss": 0.102294921875,
"train/step_duration_seconds": 7.176191329956055,
"train/steps_per_hour": 309.3207521077788,
"train/total_elapsed_hours": 1.002195933792326
},
{
"epoch": 0.7371851851851852,
"grad_norm": 5.625,
"learning_rate": 3.961955896745224e-06,
"loss": 0.984,
"step": 311
},
{
"epoch": 0.7371851851851852,
"step": 311,
"train/combined_loss": 0.06270680762827396,
"train/cross_entropy_loss": 0.02095194417051971,
"train/kl_divergence_loss": 0.104461669921875,
"train/step_duration_seconds": 7.178727865219116,
"train/steps_per_hour": 309.7023394964521,
"train/total_elapsed_hours": 1.004190024865998
},
{
"epoch": 0.7395555555555555,
"grad_norm": 6.78125,
"learning_rate": 3.89591478145437e-06,
"loss": 1.0033,
"step": 312
},
{
"epoch": 0.7395555555555555,
"step": 312,
"train/combined_loss": 0.06055857567116618,
"train/cross_entropy_loss": 0.02050069870892912,
"train/kl_divergence_loss": 0.100616455078125,
"train/step_duration_seconds": 7.1979944705963135,
"train/steps_per_hour": 310.0807650969411,
"train/total_elapsed_hours": 1.0061894677744971
},
{
"epoch": 0.7419259259259259,
"grad_norm": 1.8046875,
"learning_rate": 3.830295298813475e-06,
"loss": 0.9689,
"step": 313
},
{
"epoch": 0.7419259259259259,
"step": 313,
"train/combined_loss": 0.06014604773372412,
"train/cross_entropy_loss": 0.020926860976032913,
"train/kl_divergence_loss": 0.099365234375,
"train/step_duration_seconds": 7.178622245788574,
"train/steps_per_hour": 310.45934677419245,
"train/total_elapsed_hours": 1.0081835295094383
},
{
"epoch": 0.7442962962962963,
"grad_norm": 6.34375,
"learning_rate": 3.7651019814126656e-06,
"loss": 0.9623,
"step": 314
},
{
"epoch": 0.7442962962962963,
"step": 314,
"train/combined_loss": 0.05981010291725397,
"train/cross_entropy_loss": 0.020895838970318437,
"train/kl_divergence_loss": 0.098724365234375,
"train/step_duration_seconds": 7.296232223510742,
"train/steps_per_hour": 310.8263816166535,
"train/total_elapsed_hours": 1.0102102606826358
},
{
"epoch": 0.7466666666666667,
"grad_norm": 9.625,
"learning_rate": 3.7003393324051874e-06,
"loss": 0.957,
"step": 315
},
{
"epoch": 0.7466666666666667,
"step": 315,
"train/combined_loss": 0.060554551193490624,
"train/cross_entropy_loss": 0.021896454854868352,
"train/kl_divergence_loss": 0.099212646484375,
"train/step_duration_seconds": 7.158878326416016,
"train/steps_per_hour": 311.20367676354846,
"train/total_elapsed_hours": 1.0121988379955291
},
{
"epoch": 0.7490370370370371,
"grad_norm": 7.0625,
"learning_rate": 3.636011825196365e-06,
"loss": 0.9689,
"step": 316
},
{
"epoch": 0.7490370370370371,
"step": 316,
"train/combined_loss": 0.05863487347960472,
"train/cross_entropy_loss": 0.020590057596564293,
"train/kl_divergence_loss": 0.0966796875,
"train/step_duration_seconds": 7.176958084106445,
"train/steps_per_hour": 311.5779494376515,
"train/total_elapsed_hours": 1.0141924374633364
},
{
"epoch": 0.7514074074074074,
"grad_norm": 6.59375,
"learning_rate": 3.5721239031346067e-06,
"loss": 0.9382,
"step": 317
},
{
"epoch": 0.7514074074074074,
"step": 317,
"train/combined_loss": 0.059863541973754764,
"train/cross_entropy_loss": 0.020819613593630493,
"train/kl_divergence_loss": 0.098907470703125,
"train/step_duration_seconds": 7.1567628383636475,
"train/steps_per_hour": 311.95247569565794,
"train/total_elapsed_hours": 1.0161804271406598
},
{
"epoch": 0.7537777777777778,
"grad_norm": 2.578125,
"learning_rate": 3.5086799792044812e-06,
"loss": 0.9578,
"step": 318
},
{
"epoch": 0.7537777777777778,
"step": 318,
"train/combined_loss": 0.06011883169412613,
"train/cross_entropy_loss": 0.02053673774935305,
"train/kl_divergence_loss": 0.099700927734375,
"train/step_duration_seconds": 7.177694797515869,
"train/steps_per_hour": 312.3237558362263,
"train/total_elapsed_hours": 1.0181742312510809
},
{
"epoch": 0.7561481481481481,
"grad_norm": 4.21875,
"learning_rate": 3.4456844357218977e-06,
"loss": 0.9619,
"step": 319
},
{
"epoch": 0.7561481481481481,
"step": 319,
"train/combined_loss": 0.060971920378506184,
"train/cross_entropy_loss": 0.020503410720266402,
"train/kl_divergence_loss": 0.1014404296875,
"train/step_duration_seconds": 7.183193206787109,
"train/steps_per_hour": 312.69311658008917,
"train/total_elapsed_hours": 1.0201695626974105
},
{
"epoch": 0.7585185185185185,
"grad_norm": 6.0625,
"learning_rate": 3.3831416240314085e-06,
"loss": 0.9756,
"step": 320
},
{
"epoch": 0.7585185185185185,
"step": 320,
"train/combined_loss": 0.05914177093654871,
"train/cross_entropy_loss": 0.020505218068137765,
"train/kl_divergence_loss": 0.0977783203125,
"train/step_duration_seconds": 7.172834634780884,
"train/steps_per_hour": 313.0619165575121,
"train/total_elapsed_hours": 1.0221620167626275
},
{
"epoch": 0.7608888888888888,
"grad_norm": 7.09375,
"learning_rate": 3.3210558642056277e-06,
"loss": 0.9463,
"step": 321
},
{
"epoch": 0.7608888888888888,
"step": 321,
"train/combined_loss": 0.06318964948877692,
"train/cross_entropy_loss": 0.02078847971279174,
"train/kl_divergence_loss": 0.1055908203125,
"train/step_duration_seconds": 7.178191423416138,
"train/steps_per_hour": 313.4288261803181,
"train/total_elapsed_hours": 1.0241559588246876
},
{
"epoch": 0.7632592592592593,
"grad_norm": 5.625,
"learning_rate": 3.2594314447468457e-06,
"loss": 1.011,
"step": 322
},
{
"epoch": 0.7632592592592593,
"step": 322,
"train/combined_loss": 0.06236264854669571,
"train/cross_entropy_loss": 0.02066035382449627,
"train/kl_divergence_loss": 0.10406494140625,
"train/step_duration_seconds": 7.178012847900391,
"train/steps_per_hour": 313.7943250662448,
"train/total_elapsed_hours": 1.0261498512824376
},
{
"epoch": 0.7656296296296297,
"grad_norm": 6.65625,
"learning_rate": 3.1982726222908046e-06,
"loss": 0.9978,
"step": 323
},
{
"epoch": 0.7656296296296297,
"step": 323,
"train/combined_loss": 0.06084095500409603,
"train/cross_entropy_loss": 0.02054665272589773,
"train/kl_divergence_loss": 0.10113525390625,
"train/step_duration_seconds": 7.176525115966797,
"train/steps_per_hour": 314.1585325936901,
"train/total_elapsed_hours": 1.0281433304813172
},
{
"epoch": 0.768,
"grad_norm": 4.46875,
"learning_rate": 3.1375836213126653e-06,
"loss": 0.9735,
"step": 324
},
{
"epoch": 0.768,
"step": 324,
"train/combined_loss": 0.06315464107319713,
"train/cross_entropy_loss": 0.020748980692587793,
"train/kl_divergence_loss": 0.105560302734375,
"train/step_duration_seconds": 7.15761399269104,
"train/steps_per_hour": 314.52293440305624,
"train/total_elapsed_hours": 1.0301315565903981
},
{
"epoch": 0.7703703703703704,
"grad_norm": 8.0,
"learning_rate": 3.077368633835205e-06,
"loss": 1.0105,
"step": 325
},
{
"epoch": 0.7703703703703704,
"step": 325,
"train/combined_loss": 0.05943355988711119,
"train/cross_entropy_loss": 0.020478446152992547,
"train/kl_divergence_loss": 0.098388671875,
"train/step_duration_seconds": 7.163183212280273,
"train/steps_per_hour": 314.88546031009383,
"train/total_elapsed_hours": 1.0321213297049205
},
{
"epoch": 0.7727407407407407,
"grad_norm": 7.96875,
"learning_rate": 3.017631819139273e-06,
"loss": 0.9509,
"step": 326
},
{
"epoch": 0.7727407407407407,
"step": 326,
"train/combined_loss": 0.06256748456507921,
"train/cross_entropy_loss": 0.020612266613170505,
"train/kl_divergence_loss": 0.104522705078125,
"train/step_duration_seconds": 7.17126202583313,
"train/steps_per_hour": 315.24590700472396,
"train/total_elapsed_hours": 1.0341133469343184
},
{
"epoch": 0.7751111111111111,
"grad_norm": 4.03125,
"learning_rate": 2.958377303476483e-06,
"loss": 1.0011,
"step": 327
},
{
"epoch": 0.7751111111111111,
"step": 327,
"train/combined_loss": 0.059798732632771134,
"train/cross_entropy_loss": 0.020720509812235832,
"train/kl_divergence_loss": 0.098876953125,
"train/step_duration_seconds": 7.176945686340332,
"train/steps_per_hour": 315.60448679780745,
"train/total_elapsed_hours": 1.0361069429583019
},
{
"epoch": 0.7774814814814814,
"grad_norm": 2.3125,
"learning_rate": 2.8996091797841976e-06,
"loss": 0.9568,
"step": 328
},
{
"epoch": 0.7774814814814814,
"step": 328,
"train/combined_loss": 0.058641964104026556,
"train/cross_entropy_loss": 0.020512689370661974,
"train/kl_divergence_loss": 0.096771240234375,
"train/step_duration_seconds": 7.181287527084351,
"train/steps_per_hour": 315.96132225407996,
"train/total_elapsed_hours": 1.0381017450491588
},
{
"epoch": 0.7798518518518519,
"grad_norm": 3.1875,
"learning_rate": 2.8413315074028157e-06,
"loss": 0.9383,
"step": 329
},
{
"epoch": 0.7798518518518519,
"step": 329,
"train/combined_loss": 0.05962651362642646,
"train/cross_entropy_loss": 0.02113901753909886,
"train/kl_divergence_loss": 0.098114013671875,
"train/step_duration_seconds": 7.1774678230285645,
"train/steps_per_hour": 316.31711164339544,
"train/total_elapsed_hours": 1.0400954861111111
},
{
"epoch": 0.7822222222222223,
"grad_norm": 3.0625,
"learning_rate": 2.783548311795379e-06,
"loss": 0.954,
"step": 330
},
{
"epoch": 0.7822222222222223,
"step": 330,
"train/combined_loss": 0.059158258605748415,
"train/cross_entropy_loss": 0.020599229959771037,
"train/kl_divergence_loss": 0.09771728515625,
"train/step_duration_seconds": 7.179843902587891,
"train/steps_per_hour": 316.67133906098195,
"train/total_elapsed_hours": 1.0420898871951634
},
{
"epoch": 0.7845925925925926,
"grad_norm": 3.875,
"learning_rate": 2.726263584269513e-06,
"loss": 0.9465,
"step": 331
},
{
"epoch": 0.7845925925925926,
"step": 331,
"train/combined_loss": 0.05943922000005841,
"train/cross_entropy_loss": 0.020550800720229745,
"train/kl_divergence_loss": 0.09832763671875,
"train/step_duration_seconds": 7.175478458404541,
"train/steps_per_hour": 317.02458139366485,
"train/total_elapsed_hours": 1.0440830756558312
},
{
"epoch": 0.786962962962963,
"grad_norm": 3.3125,
"learning_rate": 2.669481281701739e-06,
"loss": 0.951,
"step": 332
},
{
"epoch": 0.786962962962963,
"step": 332,
"train/combined_loss": 0.06072757695801556,
"train/cross_entropy_loss": 0.020319899427704513,
"train/kl_divergence_loss": 0.10113525390625,
"train/step_duration_seconds": 7.182747840881348,
"train/steps_per_hour": 317.3758649537043,
"train/total_elapsed_hours": 1.0460782833894093
},
{
"epoch": 0.7893333333333333,
"grad_norm": 4.21875,
"learning_rate": 2.6132053262641467e-06,
"loss": 0.9716,
"step": 333
},
{
"epoch": 0.7893333333333333,
"step": 333,
"train/combined_loss": 0.05954708158969879,
"train/cross_entropy_loss": 0.02046134858392179,
"train/kl_divergence_loss": 0.0986328125,
"train/step_duration_seconds": 7.171588897705078,
"train/steps_per_hour": 317.72675072895083,
"train/total_elapsed_hours": 1.0480703914165497
},
{
"epoch": 0.7917037037037037,
"grad_norm": 2.21875,
"learning_rate": 2.5574396051534835e-06,
"loss": 0.9528,
"step": 334
},
{
"epoch": 0.7917037037037037,
"step": 334,
"train/combined_loss": 0.05950666521675885,
"train/cross_entropy_loss": 0.020380519214086235,
"train/kl_divergence_loss": 0.0986328125,
"train/step_duration_seconds": 7.180173397064209,
"train/steps_per_hour": 318.0755828336615,
"train/total_elapsed_hours": 1.0500648840268454
},
{
"epoch": 0.794074074074074,
"grad_norm": 2.78125,
"learning_rate": 2.502187970322657e-06,
"loss": 0.9521,
"step": 335
},
{
"epoch": 0.794074074074074,
"step": 335,
"train/combined_loss": 0.06000912608578801,
"train/cross_entropy_loss": 0.020530943875201046,
"train/kl_divergence_loss": 0.0994873046875,
"train/step_duration_seconds": 7.176333665847778,
"train/steps_per_hour": 318.4234151295545,
"train/total_elapsed_hours": 1.0520583100451364
},
{
"epoch": 0.7964444444444444,
"grad_norm": 1.8984375,
"learning_rate": 2.447454238214654e-06,
"loss": 0.9601,
"step": 336
},
{
"epoch": 0.7964444444444444,
"step": 336,
"train/combined_loss": 0.061862445436418056,
"train/cross_entropy_loss": 0.020911171450279653,
"train/kl_divergence_loss": 0.102813720703125,
"train/step_duration_seconds": 7.18380331993103,
"train/steps_per_hour": 318.7693042840402,
"train/total_elapsed_hours": 1.0540538109673394
},
{
"epoch": 0.7988148148148149,
"grad_norm": 5.3125,
"learning_rate": 2.3932421894989167e-06,
"loss": 0.9898,
"step": 337
},
{
"epoch": 0.7988148148148149,
"step": 337,
"train/combined_loss": 0.06748714856803417,
"train/cross_entropy_loss": 0.02215080999303609,
"train/kl_divergence_loss": 0.112823486328125,
"train/step_duration_seconds": 7.178979873657227,
"train/steps_per_hour": 319.1142911319986,
"train/total_elapsed_hours": 1.0560479720433553
},
{
"epoch": 0.8011851851851852,
"grad_norm": 2.765625,
"learning_rate": 2.339555568810221e-06,
"loss": 1.0798,
"step": 338
},
{
"epoch": 0.8011851851851852,
"step": 338,
"train/combined_loss": 0.059276150073856115,
"train/cross_entropy_loss": 0.020651907310821116,
"train/kl_divergence_loss": 0.097900390625,
"train/step_duration_seconds": 7.173582077026367,
"train/steps_per_hour": 319.4584302570375,
"train/total_elapsed_hours": 1.058040633731418
},
{
"epoch": 0.8035555555555556,
"grad_norm": 1.3515625,
"learning_rate": 2.2863980844900036e-06,
"loss": 0.9484,
"step": 339
},
{
"epoch": 0.8035555555555556,
"step": 339,
"train/combined_loss": 0.0625988682731986,
"train/cross_entropy_loss": 0.02107176184654236,
"train/kl_divergence_loss": 0.1041259765625,
"train/step_duration_seconds": 7.17778754234314,
"train/steps_per_hour": 319.8009231203147,
"train/total_elapsed_hours": 1.0600344636042913
},
{
"epoch": 0.8059259259259259,
"grad_norm": 2.296875,
"learning_rate": 2.2337734083302164e-06,
"loss": 1.0016,
"step": 340
},
{
"epoch": 0.8059259259259259,
"step": 340,
"train/combined_loss": 0.058292608708143234,
"train/cross_entropy_loss": 0.02039380930364132,
"train/kl_divergence_loss": 0.09619140625,
"train/step_duration_seconds": 7.182078838348389,
"train/steps_per_hour": 320.14177067664195,
"train/total_elapsed_hours": 1.0620294855038326
},
{
"epoch": 0.8082962962962963,
"grad_norm": 4.625,
"learning_rate": 2.1816851753197023e-06,
"loss": 0.9327,
"step": 341
},
{
"epoch": 0.8082962962962963,
"step": 341,
"train/combined_loss": 0.0593375526368618,
"train/cross_entropy_loss": 0.020500056445598602,
"train/kl_divergence_loss": 0.098175048828125,
"train/step_duration_seconds": 7.171487331390381,
"train/steps_per_hour": 320.4822262207178,
"train/total_elapsed_hours": 1.0640215653181075
},
{
"epoch": 0.8106666666666666,
"grad_norm": 3.59375,
"learning_rate": 2.130136983393112e-06,
"loss": 0.9494,
"step": 342
},
{
"epoch": 0.8106666666666666,
"step": 342,
"train/combined_loss": 0.060327990911901,
"train/cross_entropy_loss": 0.020588844665326178,
"train/kl_divergence_loss": 0.100067138671875,
"train/step_duration_seconds": 7.198361873626709,
"train/steps_per_hour": 320.81916267981495,
"train/total_elapsed_hours": 1.066021110283004
},
{
"epoch": 0.813037037037037,
"grad_norm": 1.53125,
"learning_rate": 2.0791323931823783e-06,
"loss": 0.9652,
"step": 343
},
{
"epoch": 0.813037037037037,
"step": 343,
"train/combined_loss": 0.05895965825766325,
"train/cross_entropy_loss": 0.02056823973543942,
"train/kl_divergence_loss": 0.09735107421875,
"train/step_duration_seconds": 7.177961349487305,
"train/steps_per_hour": 321.1565415410552,
"train/total_elapsed_hours": 1.0680149884356394
},
{
"epoch": 0.8154074074074074,
"grad_norm": 2.78125,
"learning_rate": 2.0286749277707783e-06,
"loss": 0.9434,
"step": 344
},
{
"epoch": 0.8154074074074074,
"step": 344,
"train/combined_loss": 0.06692604720592499,
"train/cross_entropy_loss": 0.02130326582118869,
"train/kl_divergence_loss": 0.112548828125,
"train/step_duration_seconds": 7.182729482650757,
"train/steps_per_hour": 321.49226509337905,
"train/total_elapsed_hours": 1.0700101910697088
},
{
"epoch": 0.8177777777777778,
"grad_norm": 2.78125,
"learning_rate": 1.9787680724495617e-06,
"loss": 1.0708,
"step": 345
},
{
"epoch": 0.8177777777777778,
"step": 345,
"train/combined_loss": 0.06856274465098977,
"train/cross_entropy_loss": 0.02140283351764083,
"train/kl_divergence_loss": 0.11572265625,
"train/step_duration_seconds": 7.21046257019043,
"train/steps_per_hour": 321.8244262652279,
"train/total_elapsed_hours": 1.0720130973392064
},
{
"epoch": 0.8201481481481482,
"grad_norm": 2.53125,
"learning_rate": 1.929415274477239e-06,
"loss": 1.097,
"step": 346
},
{
"epoch": 0.8201481481481482,
"step": 346,
"train/combined_loss": 0.06000364082865417,
"train/cross_entropy_loss": 0.020581011194735765,
"train/kl_divergence_loss": 0.09942626953125,
"train/step_duration_seconds": 7.178732395172119,
"train/steps_per_hour": 322.1579923556434,
"train/total_elapsed_hours": 1.0740071896711985
},
{
"epoch": 0.8225185185185185,
"grad_norm": 1.375,
"learning_rate": 1.880619942841435e-06,
"loss": 0.9601,
"step": 347
},
{
"epoch": 0.8225185185185185,
"step": 347,
"train/combined_loss": 0.06414050119929016,
"train/cross_entropy_loss": 0.02085912844631821,
"train/kl_divergence_loss": 0.107421875,
"train/step_duration_seconds": 7.1721296310424805,
"train/steps_per_hour": 322.4908717904752,
"train/total_elapsed_hours": 1.0759994479020436
},
{
"epoch": 0.8248888888888889,
"grad_norm": 4.46875,
"learning_rate": 1.8323854480234348e-06,
"loss": 1.0262,
"step": 348
},
{
"epoch": 0.8248888888888889,
"step": 348,
"train/combined_loss": 0.059865488670766354,
"train/cross_entropy_loss": 0.020457297330722213,
"train/kl_divergence_loss": 0.099273681640625,
"train/step_duration_seconds": 7.182539701461792,
"train/steps_per_hour": 322.8216548617558,
"train/total_elapsed_hours": 1.0779945978191163
},
{
"epoch": 0.8272592592592592,
"grad_norm": 0.95703125,
"learning_rate": 1.7847151217653624e-06,
"loss": 0.9578,
"step": 349
},
{
"epoch": 0.8272592592592592,
"step": 349,
"train/combined_loss": 0.061520870542153716,
"train/cross_entropy_loss": 0.020655267755500972,
"train/kl_divergence_loss": 0.102386474609375,
"train/step_duration_seconds": 7.174387454986572,
"train/steps_per_hour": 323.15189335194066,
"train/total_elapsed_hours": 1.0799874832232794
},
{
"epoch": 0.8296296296296296,
"grad_norm": 1.3515625,
"learning_rate": 1.7376122568400533e-06,
"loss": 0.9843,
"step": 350
},
{
"epoch": 0.8296296296296296,
"eval_combined_loss": 0.060819300456593436,
"eval_cross_entropy_loss": 0.02078313216318687,
"eval_kl_divergence_loss": 0.10085546875,
"eval_loss": 0.060819294303655624,
"eval_runtime": 220.1882,
"eval_samples_per_second": 6.812,
"eval_steps_per_second": 3.406,
"step": 350
},
{
"epoch": 0.8296296296296296,
"step": 350,
"train/combined_loss": 0.05945373326539993,
"train/cross_entropy_loss": 0.020640864269807935,
"train/kl_divergence_loss": 0.0982666015625,
"train/step_duration_seconds": 227.37273049354553,
"train/steps_per_hour": 306.17246086025364,
"train/total_elapsed_hours": 1.143146575027042
},
{
"epoch": 0.832,
"grad_norm": 1.421875,
"learning_rate": 1.6910801068236015e-06,
"loss": 0.9513,
"step": 351
},
{
"epoch": 0.832,
"step": 351,
"train/combined_loss": 0.05878610094077885,
"train/cross_entropy_loss": 0.02025164384394884,
"train/kl_divergence_loss": 0.097320556640625,
"train/step_duration_seconds": 7.182687520980835,
"train/steps_per_hour": 306.51226810501225,
"train/total_elapsed_hours": 1.1451417660050922
},
{
"epoch": 0.8343703703703703,
"grad_norm": 3.078125,
"learning_rate": 1.6451218858706374e-06,
"loss": 0.9406,
"step": 352
},
{
"epoch": 0.8343703703703703,
"step": 352,
"train/combined_loss": 0.05914012948051095,
"train/cross_entropy_loss": 0.02041038265451789,
"train/kl_divergence_loss": 0.097869873046875,
"train/step_duration_seconds": 7.178761959075928,
"train/steps_per_hour": 306.85118499420435,
"train/total_elapsed_hours": 1.14713586654928
},
{
"epoch": 0.8367407407407408,
"grad_norm": 4.1875,
"learning_rate": 1.599740768492286e-06,
"loss": 0.9462,
"step": 353
},
{
"epoch": 0.8367407407407408,
"step": 353,
"train/combined_loss": 0.06183216394856572,
"train/cross_entropy_loss": 0.02106422872748226,
"train/kl_divergence_loss": 0.10260009765625,
"train/step_duration_seconds": 7.19196081161499,
"train/steps_per_hour": 307.1879455332317,
"train/total_elapsed_hours": 1.1491336334413953
},
{
"epoch": 0.8391111111111111,
"grad_norm": 1.5234375,
"learning_rate": 1.5549398893369216e-06,
"loss": 0.9893,
"step": 354
},
{
"epoch": 0.8391111111111111,
"step": 354,
"train/combined_loss": 0.0688268430531025,
"train/cross_entropy_loss": 0.022144652903079987,
"train/kl_divergence_loss": 0.115509033203125,
"train/step_duration_seconds": 7.177990674972534,
"train/steps_per_hour": 307.5245738890482,
"train/total_elapsed_hours": 1.1511275197399986
},
{
"epoch": 0.8414814814814815,
"grad_norm": 2.40625,
"learning_rate": 1.5107223429736273e-06,
"loss": 1.1012,
"step": 355
},
{
"epoch": 0.8414814814814815,
"step": 355,
"train/combined_loss": 0.060881074983626604,
"train/cross_entropy_loss": 0.021084662177599967,
"train/kl_divergence_loss": 0.100677490234375,
"train/step_duration_seconds": 7.157759428024292,
"train/steps_per_hour": 307.8615384801584,
"train/total_elapsed_hours": 1.1531157862477832
},
{
"epoch": 0.8438518518518519,
"grad_norm": 1.6796875,
"learning_rate": 1.467091183678444e-06,
"loss": 0.9741,
"step": 356
},
{
"epoch": 0.8438518518518519,
"step": 356,
"train/combined_loss": 0.05890939268283546,
"train/cross_entropy_loss": 0.02049823058769107,
"train/kl_divergence_loss": 0.097320556640625,
"train/step_duration_seconds": 7.175466537475586,
"train/steps_per_hour": 308.1960306908749,
"train/total_elapsed_hours": 1.155108971397082
},
{
"epoch": 0.8462222222222222,
"grad_norm": 3.640625,
"learning_rate": 1.424049425223405e-06,
"loss": 0.9426,
"step": 357
},
{
"epoch": 0.8462222222222222,
"step": 357,
"train/combined_loss": 0.06022225972265005,
"train/cross_entropy_loss": 0.020865662721917033,
"train/kl_divergence_loss": 0.099578857421875,
"train/step_duration_seconds": 7.179744005203247,
"train/steps_per_hour": 308.5290537144967,
"train/total_elapsed_hours": 1.1571033447318606
},
{
"epoch": 0.8485925925925926,
"grad_norm": 1.8671875,
"learning_rate": 1.3816000406683604e-06,
"loss": 0.9636,
"step": 358
},
{
"epoch": 0.8485925925925926,
"step": 358,
"train/combined_loss": 0.06817956361919641,
"train/cross_entropy_loss": 0.021460447693243623,
"train/kl_divergence_loss": 0.114898681640625,
"train/step_duration_seconds": 7.1635658740997314,
"train/steps_per_hour": 308.8621282082033,
"train/total_elapsed_hours": 1.1590932241413328
},
{
"epoch": 0.8509629629629629,
"grad_norm": 3.296875,
"learning_rate": 1.339745962155613e-06,
"loss": 1.0909,
"step": 359
},
{
"epoch": 0.8509629629629629,
"step": 359,
"train/combined_loss": 0.05856375303119421,
"train/cross_entropy_loss": 0.020386786898598075,
"train/kl_divergence_loss": 0.09674072265625,
"train/step_duration_seconds": 7.170348644256592,
"train/steps_per_hour": 309.19355931513985,
"train/total_elapsed_hours": 1.1610849876536264
},
{
"epoch": 0.8533333333333334,
"grad_norm": 2.328125,
"learning_rate": 1.2984900807073919e-06,
"loss": 0.937,
"step": 360
},
{
"epoch": 0.8533333333333334,
"step": 360,
"train/combined_loss": 0.05872082710266113,
"train/cross_entropy_loss": 0.020578863797709346,
"train/kl_divergence_loss": 0.09686279296875,
"train/step_duration_seconds": 7.177663326263428,
"train/steps_per_hour": 309.5233145467673,
"train/total_elapsed_hours": 1.1630787830220328
},
{
"epoch": 0.8557037037037037,
"grad_norm": 1.4140625,
"learning_rate": 1.2578352460261456e-06,
"loss": 0.9395,
"step": 361
},
{
"epoch": 0.8557037037037037,
"step": 361,
"train/combined_loss": 0.05911425780504942,
"train/cross_entropy_loss": 0.02054174430668354,
"train/kl_divergence_loss": 0.097686767578125,
"train/step_duration_seconds": 7.158376932144165,
"train/steps_per_hour": 309.8533659473708,
"train/total_elapsed_hours": 1.1650672210587396
},
{
"epoch": 0.8580740740740741,
"grad_norm": 1.6953125,
"learning_rate": 1.2177842662977136e-06,
"loss": 0.9458,
"step": 362
},
{
"epoch": 0.8580740740740741,
"step": 362,
"train/combined_loss": 0.06256715022027493,
"train/cross_entropy_loss": 0.02085573854856193,
"train/kl_divergence_loss": 0.104278564453125,
"train/step_duration_seconds": 7.160759925842285,
"train/steps_per_hour": 310.182116727655,
"train/total_elapsed_hours": 1.1670563210381402
},
{
"epoch": 0.8604444444444445,
"grad_norm": 2.109375,
"learning_rate": 1.1783399079973578e-06,
"loss": 1.0011,
"step": 363
},
{
"epoch": 0.8604444444444445,
"step": 363,
"train/combined_loss": 0.05845469981431961,
"train/cross_entropy_loss": 0.02022971585392952,
"train/kl_divergence_loss": 0.0966796875,
"train/step_duration_seconds": 7.147622346878052,
"train/steps_per_hour": 310.51071808599374,
"train/total_elapsed_hours": 1.1690417716900507
},
{
"epoch": 0.8628148148148148,
"grad_norm": 1.40625,
"learning_rate": 1.1395048956986577e-06,
"loss": 0.9353,
"step": 364
},
{
"epoch": 0.8628148148148148,
"step": 364,
"train/combined_loss": 0.05994793586432934,
"train/cross_entropy_loss": 0.020683227223344147,
"train/kl_divergence_loss": 0.099212646484375,
"train/step_duration_seconds": 7.1444926261901855,
"train/steps_per_hour": 310.83843593717893,
"train/total_elapsed_hours": 1.1710263529751035
},
{
"epoch": 0.8651851851851852,
"grad_norm": 5.96875,
"learning_rate": 1.1012819118853147e-06,
"loss": 0.9592,
"step": 365
},
{
"epoch": 0.8651851851851852,
"step": 365,
"train/combined_loss": 0.05859701009467244,
"train/cross_entropy_loss": 0.02045329543761909,
"train/kl_divergence_loss": 0.09674072265625,
"train/step_duration_seconds": 7.179033279418945,
"train/steps_per_hour": 311.1624997276206,
"train/total_elapsed_hours": 1.1730205288860533
},
{
"epoch": 0.8675555555555555,
"grad_norm": 1.375,
"learning_rate": 1.0636735967658785e-06,
"loss": 0.9376,
"step": 366
},
{
"epoch": 0.8675555555555555,
"step": 366,
"train/combined_loss": 0.06006666086614132,
"train/cross_entropy_loss": 0.020523948594927788,
"train/kl_divergence_loss": 0.099609375,
"train/step_duration_seconds": 7.176509141921997,
"train/steps_per_hour": 311.48564941676807,
"train/total_elapsed_hours": 1.1750140036476984
},
{
"epoch": 0.8699259259259259,
"grad_norm": 1.8828125,
"learning_rate": 1.026682548091361e-06,
"loss": 0.9611,
"step": 367
},
{
"epoch": 0.8699259259259259,
"step": 367,
"train/combined_loss": 0.0603836253285408,
"train/cross_entropy_loss": 0.020761147141456604,
"train/kl_divergence_loss": 0.100006103515625,
"train/step_duration_seconds": 7.1585469245910645,
"train/steps_per_hour": 311.8090262847088,
"train/total_elapsed_hours": 1.1770024889045292
},
{
"epoch": 0.8722962962962963,
"grad_norm": 2.28125,
"learning_rate": 9.903113209758098e-07,
"loss": 0.9661,
"step": 368
},
{
"epoch": 0.8722962962962963,
"step": 368,
"train/combined_loss": 0.061565724201500416,
"train/cross_entropy_loss": 0.020806010346859694,
"train/kl_divergence_loss": 0.102325439453125,
"train/step_duration_seconds": 7.178318738937378,
"train/steps_per_hour": 312.1298583233999,
"train/total_elapsed_hours": 1.1789964663320118
},
{
"epoch": 0.8746666666666667,
"grad_norm": 2.21875,
"learning_rate": 9.545624277198085e-07,
"loss": 0.9851,
"step": 369
},
{
"epoch": 0.8746666666666667,
"step": 369,
"train/combined_loss": 0.05880419351160526,
"train/cross_entropy_loss": 0.02074559754692018,
"train/kl_divergence_loss": 0.09686279296875,
"train/step_duration_seconds": 7.1789703369140625,
"train/steps_per_hour": 312.44955909393565,
"train/total_elapsed_hours": 1.1809906247589324
},
{
"epoch": 0.8770370370370371,
"grad_norm": 2.8125,
"learning_rate": 9.194383376369509e-07,
"loss": 0.9409,
"step": 370
},
{
"epoch": 0.8770370370370371,
"step": 370,
"train/combined_loss": 0.059380816062912345,
"train/cross_entropy_loss": 0.0207696893485263,
"train/kl_divergence_loss": 0.097991943359375,
"train/step_duration_seconds": 7.177523374557495,
"train/steps_per_hour": 312.7682882917324,
"train/total_elapsed_hours": 1.182984381251865
},
{
"epoch": 0.8794074074074074,
"grad_norm": 1.765625,
"learning_rate": 8.849414768832687e-07,
"loss": 0.9501,
"step": 371
},
{
"epoch": 0.8794074074074074,
"step": 371,
"train/combined_loss": 0.058346427977085114,
"train/cross_entropy_loss": 0.02050144597887993,
"train/kl_divergence_loss": 0.09619140625,
"train/step_duration_seconds": 7.176830053329468,
"train/steps_per_hour": 313.08599583369795,
"train/total_elapsed_hours": 1.1849779451555675
},
{
"epoch": 0.8817777777777778,
"grad_norm": 2.546875,
"learning_rate": 8.510742282896545e-07,
"loss": 0.9335,
"step": 372
},
{
"epoch": 0.8817777777777778,
"step": 372,
"train/combined_loss": 0.061172885121777654,
"train/cross_entropy_loss": 0.02075275091920048,
"train/kl_divergence_loss": 0.101593017578125,
"train/step_duration_seconds": 7.177177906036377,
"train/steps_per_hour": 313.40261065917605,
"train/total_elapsed_hours": 1.1869716056850221
},
{
"epoch": 0.8841481481481481,
"grad_norm": 2.578125,
"learning_rate": 8.178389311972612e-07,
"loss": 0.9788,
"step": 373
},
{
"epoch": 0.8841481481481481,
"step": 373,
"train/combined_loss": 0.06040294258855283,
"train/cross_entropy_loss": 0.02064719540067017,
"train/kl_divergence_loss": 0.10015869140625,
"train/step_duration_seconds": 7.17975640296936,
"train/steps_per_hour": 313.7179746952193,
"train/total_elapsed_hours": 1.1889659824636247
},
{
"epoch": 0.8865185185185185,
"grad_norm": 2.109375,
"learning_rate": 7.852378812959227e-07,
"loss": 0.9664,
"step": 374
},
{
"epoch": 0.8865185185185185,
"step": 374,
"train/combined_loss": 0.061964265536516905,
"train/cross_entropy_loss": 0.021664125844836235,
"train/kl_divergence_loss": 0.102264404296875,
"train/step_duration_seconds": 7.165625333786011,
"train/steps_per_hour": 314.0333175421587,
"train/total_elapsed_hours": 1.1909564339452319
},
{
"epoch": 0.8888888888888888,
"grad_norm": 3.921875,
"learning_rate": 7.532733304655848e-07,
"loss": 0.9914,
"step": 375
},
{
"epoch": 0.8888888888888888,
"step": 375,
"train/combined_loss": 0.0588826653547585,
"train/cross_entropy_loss": 0.020475288503803313,
"train/kl_divergence_loss": 0.0972900390625,
"train/step_duration_seconds": 7.169605493545532,
"train/steps_per_hour": 314.34731674868476,
"train/total_elapsed_hours": 1.1929479910267724
},
{
"epoch": 0.8912592592592593,
"grad_norm": 1.4453125,
"learning_rate": 7.219474866207465e-07,
"loss": 0.9421,
"step": 376
},
{
"epoch": 0.8912592592592593,
"step": 376,
"train/combined_loss": 0.05903471680358052,
"train/cross_entropy_loss": 0.02038266253657639,
"train/kl_divergence_loss": 0.097686767578125,
"train/step_duration_seconds": 7.174905776977539,
"train/steps_per_hour": 314.65988159919425,
"train/total_elapsed_hours": 1.194941020409266
},
{
"epoch": 0.8936296296296297,
"grad_norm": 2.6875,
"learning_rate": 6.912625135579587e-07,
"loss": 0.9446,
"step": 377
},
{
"epoch": 0.8936296296296297,
"step": 377,
"train/combined_loss": 0.06489993864670396,
"train/cross_entropy_loss": 0.021004713140428066,
"train/kl_divergence_loss": 0.108795166015625,
"train/step_duration_seconds": 7.203597068786621,
"train/steps_per_hour": 314.96930831081517,
"train/total_elapsed_hours": 1.1969420195950402
},
{
"epoch": 0.896,
"grad_norm": 1.5,
"learning_rate": 6.612205308063646e-07,
"loss": 1.0384,
"step": 378
},
{
"epoch": 0.896,
"step": 378,
"train/combined_loss": 0.06032265955582261,
"train/cross_entropy_loss": 0.020669732824899256,
"train/kl_divergence_loss": 0.0999755859375,
"train/step_duration_seconds": 7.174722909927368,
"train/steps_per_hour": 315.27981131041514,
"train/total_elapsed_hours": 1.1989349981811313
},
{
"epoch": 0.8983703703703704,
"grad_norm": 1.28125,
"learning_rate": 6.318236134812917e-07,
"loss": 0.9652,
"step": 379
},
{
"epoch": 0.8983703703703704,
"step": 379,
"train/combined_loss": 0.060588925145566463,
"train/cross_entropy_loss": 0.02068346820306033,
"train/kl_divergence_loss": 0.100494384765625,
"train/step_duration_seconds": 7.157525300979614,
"train/steps_per_hour": 315.5905391030105,
"train/total_elapsed_hours": 1.2009231996536256
},
{
"epoch": 0.9007407407407407,
"grad_norm": 2.03125,
"learning_rate": 6.030737921409169e-07,
"loss": 0.9694,
"step": 380
},
{
"epoch": 0.9007407407407407,
"step": 380,
"train/combined_loss": 0.05855354503728449,
"train/cross_entropy_loss": 0.020335851586423814,
"train/kl_divergence_loss": 0.096771240234375,
"train/step_duration_seconds": 7.176971912384033,
"train/steps_per_hour": 315.89882115214573,
"train/total_elapsed_hours": 1.2029168029626212
},
{
"epoch": 0.9031111111111111,
"grad_norm": 2.109375,
"learning_rate": 5.749730526460073e-07,
"loss": 0.9369,
"step": 381
},
{
"epoch": 0.9031111111111111,
"step": 381,
"train/combined_loss": 0.05888266093097627,
"train/cross_entropy_loss": 0.02044476370792836,
"train/kl_divergence_loss": 0.097320556640625,
"train/step_duration_seconds": 7.179260730743408,
"train/steps_per_hour": 316.20591620635895,
"train/total_elapsed_hours": 1.2049110420544942
},
{
"epoch": 0.9054814814814814,
"grad_norm": 1.78125,
"learning_rate": 5.475233360227516e-07,
"loss": 0.9421,
"step": 382
},
{
"epoch": 0.9054814814814814,
"step": 382,
"train/combined_loss": 0.059826530516147614,
"train/cross_entropy_loss": 0.020623518154025078,
"train/kl_divergence_loss": 0.099029541015625,
"train/step_duration_seconds": 7.175317049026489,
"train/steps_per_hour": 316.512283686395,
"train/total_elapsed_hours": 1.2069041856792238
},
{
"epoch": 0.9078518518518518,
"grad_norm": 3.328125,
"learning_rate": 5.207265383286831e-07,
"loss": 0.9572,
"step": 383
},
{
"epoch": 0.9078518518518518,
"step": 383,
"train/combined_loss": 0.06832170393317938,
"train/cross_entropy_loss": 0.02159213600680232,
"train/kl_divergence_loss": 0.11505126953125,
"train/step_duration_seconds": 7.181811809539795,
"train/steps_per_hour": 316.8171681300854,
"train/total_elapsed_hours": 1.208899133404096
},
{
"epoch": 0.9102222222222223,
"grad_norm": 2.3125,
"learning_rate": 4.945845105217118e-07,
"loss": 1.0931,
"step": 384
},
{
"epoch": 0.9102222222222223,
"step": 384,
"train/combined_loss": 0.06399638252332807,
"train/cross_entropy_loss": 0.021303314133547246,
"train/kl_divergence_loss": 0.106689453125,
"train/step_duration_seconds": 7.173643112182617,
"train/steps_per_hour": 317.1216422308217,
"train/total_elapsed_hours": 1.210891812046369
},
{
"epoch": 0.9125925925925926,
"grad_norm": 2.109375,
"learning_rate": 4.6909905833226965e-07,
"loss": 1.0239,
"step": 385
},
{
"epoch": 0.9125925925925926,
"step": 385,
"train/combined_loss": 0.058574909809976816,
"train/cross_entropy_loss": 0.02040909300558269,
"train/kl_divergence_loss": 0.09674072265625,
"train/step_duration_seconds": 7.179981470108032,
"train/steps_per_hour": 317.4246550931726,
"train/total_elapsed_hours": 1.2128862513436212
},
{
"epoch": 0.914962962962963,
"grad_norm": 1.4140625,
"learning_rate": 4.4427194213859216e-07,
"loss": 0.9372,
"step": 386
},
{
"epoch": 0.914962962962963,
"step": 386,
"train/combined_loss": 0.059789648512378335,
"train/cross_entropy_loss": 0.020763377659022808,
"train/kl_divergence_loss": 0.09881591796875,
"train/step_duration_seconds": 7.180516481399536,
"train/steps_per_hour": 317.72663419127423,
"train/total_elapsed_hours": 1.214880839255121
},
{
"epoch": 0.9173333333333333,
"grad_norm": 4.0625,
"learning_rate": 4.2010487684511105e-07,
"loss": 0.9566,
"step": 387
},
{
"epoch": 0.9173333333333333,
"step": 387,
"train/combined_loss": 0.06063654413446784,
"train/cross_entropy_loss": 0.020656629814766347,
"train/kl_divergence_loss": 0.100616455078125,
"train/step_duration_seconds": 7.174912929534912,
"train/steps_per_hour": 318.0280301371019,
"train/total_elapsed_hours": 1.2168738706244362
},
{
"epoch": 0.9197037037037037,
"grad_norm": 1.765625,
"learning_rate": 3.965995317640026e-07,
"loss": 0.9702,
"step": 388
},
{
"epoch": 0.9197037037037037,
"step": 388,
"train/combined_loss": 0.05888652987778187,
"train/cross_entropy_loss": 0.020452499389648438,
"train/kl_divergence_loss": 0.097320556640625,
"train/step_duration_seconds": 7.1626060009002686,
"train/steps_per_hour": 318.32933325469384,
"train/total_elapsed_hours": 1.218863483402464
},
{
"epoch": 0.922074074074074,
"grad_norm": 1.3984375,
"learning_rate": 3.7375753049987974e-07,
"loss": 0.9422,
"step": 389
},
{
"epoch": 0.922074074074074,
"step": 389,
"train/combined_loss": 0.0582260861992836,
"train/cross_entropy_loss": 0.0202607661485672,
"train/kl_divergence_loss": 0.09619140625,
"train/step_duration_seconds": 7.176948070526123,
"train/steps_per_hour": 318.6286145563663,
"train/total_elapsed_hours": 1.2208570800887213
},
{
"epoch": 0.9244444444444444,
"grad_norm": 1.3359375,
"learning_rate": 3.515804508376508e-07,
"loss": 0.9316,
"step": 390
},
{
"epoch": 0.9244444444444444,
"step": 390,
"train/combined_loss": 0.06077071442268789,
"train/cross_entropy_loss": 0.0208639376796782,
"train/kl_divergence_loss": 0.100677490234375,
"train/step_duration_seconds": 7.199989318847656,
"train/steps_per_hour": 318.9252507888751,
"train/total_elapsed_hours": 1.2228570771217346
},
{
"epoch": 0.9268148148148149,
"grad_norm": 1.5078125,
"learning_rate": 3.3006982463352764e-07,
"loss": 0.9723,
"step": 391
},
{
"epoch": 0.9268148148148149,
"step": 391,
"train/combined_loss": 0.06101406365633011,
"train/cross_entropy_loss": 0.021350633818656206,
"train/kl_divergence_loss": 0.100677490234375,
"train/step_duration_seconds": 7.170354127883911,
"train/steps_per_hour": 319.2230637303404,
"train/total_elapsed_hours": 1.2248488421572579
},
{
"epoch": 0.9291851851851852,
"grad_norm": 3.0,
"learning_rate": 3.0922713770922155e-07,
"loss": 0.9762,
"step": 392
},
{
"epoch": 0.9291851851851852,
"step": 392,
"train/combined_loss": 0.0617949569132179,
"train/cross_entropy_loss": 0.02153913036454469,
"train/kl_divergence_loss": 0.10205078125,
"train/step_duration_seconds": 7.201368808746338,
"train/steps_per_hour": 319.5176659434464,
"train/total_elapsed_hours": 1.2268492223819096
},
{
"epoch": 0.9315555555555556,
"grad_norm": 2.125,
"learning_rate": 2.8905382974930173e-07,
"loss": 0.9887,
"step": 393
},
{
"epoch": 0.9315555555555556,
"step": 393,
"train/combined_loss": 0.06196058611385524,
"train/cross_entropy_loss": 0.021473662578500807,
"train/kl_divergence_loss": 0.102447509765625,
"train/step_duration_seconds": 7.19616436958313,
"train/steps_per_hour": 319.81168526316264,
"train/total_elapsed_hours": 1.2288481569290162
},
{
"epoch": 0.9339259259259259,
"grad_norm": 2.75,
"learning_rate": 2.6955129420176193e-07,
"loss": 0.9914,
"step": 394
},
{
"epoch": 0.9339259259259259,
"step": 394,
"train/combined_loss": 0.06139179831370711,
"train/cross_entropy_loss": 0.0205191905843094,
"train/kl_divergence_loss": 0.102264404296875,
"train/step_duration_seconds": 7.170999526977539,
"train/steps_per_hour": 320.10656754066326,
"train/total_elapsed_hours": 1.2308401012420653
},
{
"epoch": 0.9362962962962963,
"grad_norm": 1.375,
"learning_rate": 2.507208781817638e-07,
"loss": 0.9823,
"step": 395
},
{
"epoch": 0.9362962962962963,
"step": 395,
"train/combined_loss": 0.05912953009828925,
"train/cross_entropy_loss": 0.020389189943671227,
"train/kl_divergence_loss": 0.097869873046875,
"train/step_duration_seconds": 7.163103818893433,
"train/steps_per_hour": 320.4010669129712,
"train/total_elapsed_hours": 1.2328298523028691
},
{
"epoch": 0.9386666666666666,
"grad_norm": 1.46875,
"learning_rate": 2.3256388237858806e-07,
"loss": 0.9461,
"step": 396
},
{
"epoch": 0.9386666666666666,
"step": 396,
"train/combined_loss": 0.05818613991141319,
"train/cross_entropy_loss": 0.020180873572826385,
"train/kl_divergence_loss": 0.09619140625,
"train/step_duration_seconds": 7.165131568908691,
"train/steps_per_hour": 320.6944709054444,
"train/total_elapsed_hours": 1.234820166627566
},
{
"epoch": 0.941037037037037,
"grad_norm": 1.9609375,
"learning_rate": 2.1508156096578748e-07,
"loss": 0.931,
"step": 397
},
{
"epoch": 0.941037037037037,
"step": 397,
"train/combined_loss": 0.06258085602894425,
"train/cross_entropy_loss": 0.02100521558895707,
"train/kl_divergence_loss": 0.104156494140625,
"train/step_duration_seconds": 7.152077913284302,
"train/steps_per_hour": 320.9878716432912,
"train/total_elapsed_hours": 1.2368068549368116
},
{
"epoch": 0.9434074074074074,
"grad_norm": 2.71875,
"learning_rate": 1.9827512151456175e-07,
"loss": 1.0013,
"step": 398
},
{
"epoch": 0.9434074074074074,
"step": 398,
"train/combined_loss": 0.0631152824498713,
"train/cross_entropy_loss": 0.020975436200387776,
"train/kl_divergence_loss": 0.105255126953125,
"train/step_duration_seconds": 7.156713008880615,
"train/steps_per_hour": 321.2799973921229,
"train/total_elapsed_hours": 1.2387948307726118
},
{
"epoch": 0.9457777777777778,
"grad_norm": 1.671875,
"learning_rate": 1.82145724910342e-07,
"loss": 1.0098,
"step": 399
},
{
"epoch": 0.9457777777777778,
"step": 399,
"train/combined_loss": 0.05855529848486185,
"train/cross_entropy_loss": 0.020369877573102713,
"train/kl_divergence_loss": 0.09674072265625,
"train/step_duration_seconds": 7.177468776702881,
"train/steps_per_hour": 321.56969283241614,
"train/total_elapsed_hours": 1.2407885720994738
},
{
"epoch": 0.9481481481481482,
"grad_norm": 1.3203125,
"learning_rate": 1.6669448527260602e-07,
"loss": 0.9369,
"step": 400
},
{
"epoch": 0.9481481481481482,
"eval_combined_loss": 0.06070284065480033,
"eval_cross_entropy_loss": 0.020584717767934003,
"eval_kl_divergence_loss": 0.10082096354166667,
"eval_loss": 0.060702841728925705,
"eval_runtime": 220.2816,
"eval_samples_per_second": 6.809,
"eval_steps_per_second": 3.405,
"step": 400
},
{
"epoch": 0.9481481481481482,
"step": 400,
"train/combined_loss": 0.059124535880982876,
"train/cross_entropy_loss": 0.020409714779816568,
"train/kl_divergence_loss": 0.09783935546875,
"train/step_duration_seconds": 227.46868801116943,
"train/steps_per_hour": 306.75450754086955,
"train/total_elapsed_hours": 1.303974318769243
},
{
"epoch": 0.9505185185185185,
"grad_norm": 2.390625,
"learning_rate": 1.519224698779198e-07,
"loss": 0.946,
"step": 401
},
{
"epoch": 0.9505185185185185,
"step": 401,
"train/combined_loss": 0.0637058550491929,
"train/cross_entropy_loss": 0.020966400275938213,
"train/kl_divergence_loss": 0.1064453125,
"train/step_duration_seconds": 7.181180477142334,
"train/steps_per_hour": 307.05167735238854,
"train/total_elapsed_hours": 1.3059690911240047
},
{
"epoch": 0.9528888888888889,
"grad_norm": 2.125,
"learning_rate": 1.3783069908621772e-07,
"loss": 1.0193,
"step": 402
},
{
"epoch": 0.9528888888888889,
"step": 402,
"train/combined_loss": 0.06105339783243835,
"train/cross_entropy_loss": 0.0205748132430017,
"train/kl_divergence_loss": 0.101531982421875,
"train/step_duration_seconds": 7.198254823684692,
"train/steps_per_hour": 307.3468262521633,
"train/total_elapsed_hours": 1.307968606352806
},
{
"epoch": 0.9552592592592593,
"grad_norm": 1.6484375,
"learning_rate": 1.2442014627032318e-07,
"loss": 0.9769,
"step": 403
},
{
"epoch": 0.9552592592592593,
"step": 403,
"train/combined_loss": 0.0588757898658514,
"train/cross_entropy_loss": 0.02040050830692053,
"train/kl_divergence_loss": 0.09735107421875,
"train/step_duration_seconds": 7.179206609725952,
"train/steps_per_hour": 307.6423167469107,
"train/total_elapsed_hours": 1.3099628304110633
},
{
"epoch": 0.9576296296296296,
"grad_norm": 1.734375,
"learning_rate": 1.1169173774871478e-07,
"loss": 0.942,
"step": 404
},
{
"epoch": 0.9576296296296296,
"step": 404,
"train/combined_loss": 0.059780715964734554,
"train/cross_entropy_loss": 0.020562409423291683,
"train/kl_divergence_loss": 0.0989990234375,
"train/step_duration_seconds": 7.192591905593872,
"train/steps_per_hour": 307.93603622552814,
"train/total_elapsed_hours": 1.3119607726070617
},
{
"epoch": 0.96,
"grad_norm": 2.953125,
"learning_rate": 9.964635272153633e-08,
"loss": 0.9565,
"step": 405
},
{
"epoch": 0.96,
"step": 405,
"train/combined_loss": 0.0602156778331846,
"train/cross_entropy_loss": 0.02063887706026435,
"train/kl_divergence_loss": 0.09979248046875,
"train/step_duration_seconds": 7.179564952850342,
"train/steps_per_hour": 308.22971132705254,
"train/total_elapsed_hours": 1.3139550962050757
},
{
"epoch": 0.9623703703703703,
"grad_norm": 1.125,
"learning_rate": 8.82848232098732e-08,
"loss": 0.9635,
"step": 406
},
{
"epoch": 0.9623703703703703,
"step": 406,
"train/combined_loss": 0.05820171535015106,
"train/cross_entropy_loss": 0.020212026312947273,
"train/kl_divergence_loss": 0.09619140625,
"train/step_duration_seconds": 7.195192575454712,
"train/steps_per_hour": 308.5214785588609,
"train/total_elapsed_hours": 1.3159537608093685
},
{
"epoch": 0.9647407407407408,
"grad_norm": 1.328125,
"learning_rate": 7.760793399827937e-08,
"loss": 0.9312,
"step": 407
},
{
"epoch": 0.9647407407407408,
"step": 407,
"train/combined_loss": 0.061329676769673824,
"train/cross_entropy_loss": 0.02094426902476698,
"train/kl_divergence_loss": 0.101715087890625,
"train/step_duration_seconds": 7.17832612991333,
"train/steps_per_hour": 308.81345865085285,
"train/total_elapsed_hours": 1.3179477402899
},
{
"epoch": 0.9671111111111111,
"grad_norm": 1.8828125,
"learning_rate": 6.761642258056977e-08,
"loss": 0.9813,
"step": 408
},
{
"epoch": 0.9671111111111111,
"step": 408,
"train/combined_loss": 0.06031193025410175,
"train/cross_entropy_loss": 0.020617756876163185,
"train/kl_divergence_loss": 0.100006103515625,
"train/step_duration_seconds": 7.178639888763428,
"train/steps_per_hour": 309.1045361691286,
"train/total_elapsed_hours": 1.3199418069256676
},
{
"epoch": 0.9694814814814815,
"grad_norm": 1.421875,
"learning_rate": 5.831097910887873e-08,
"loss": 0.965,
"step": 409
},
{
"epoch": 0.9694814814814815,
"step": 409,
"train/combined_loss": 0.059776231879368424,
"train/cross_entropy_loss": 0.020767063251696527,
"train/kl_divergence_loss": 0.098785400390625,
"train/step_duration_seconds": 7.177491903305054,
"train/steps_per_hour": 309.3948101729231,
"train/total_elapsed_hours": 1.3219355546765856
},
{
"epoch": 0.9718518518518519,
"grad_norm": 1.6484375,
"learning_rate": 4.9692246345985905e-08,
"loss": 0.9564,
"step": 410
},
{
"epoch": 0.9718518518518519,
"step": 410,
"train/combined_loss": 0.06045236345380545,
"train/cross_entropy_loss": 0.020746038877405226,
"train/kl_divergence_loss": 0.10015869140625,
"train/step_duration_seconds": 7.197791576385498,
"train/steps_per_hour": 309.68289092850614,
"train/total_elapsed_hours": 1.3239349412255816
},
{
"epoch": 0.9742222222222222,
"grad_norm": 1.8125,
"learning_rate": 4.176081962092182e-08,
"loss": 0.9672,
"step": 411
},
{
"epoch": 0.9742222222222222,
"step": 411,
"train/combined_loss": 0.06638129102066159,
"train/cross_entropy_loss": 0.021220834576524794,
"train/kl_divergence_loss": 0.111541748046875,
"train/step_duration_seconds": 7.180782318115234,
"train/steps_per_hour": 309.97120742767606,
"train/total_elapsed_hours": 1.3259296029806138
},
{
"epoch": 0.9765925925925926,
"grad_norm": 3.25,
"learning_rate": 3.451724678784518e-08,
"loss": 1.0621,
"step": 412
},
{
"epoch": 0.9765925925925926,
"step": 412,
"train/combined_loss": 0.060466301161795855,
"train/cross_entropy_loss": 0.020804431289434433,
"train/kl_divergence_loss": 0.100128173828125,
"train/step_duration_seconds": 7.173940181732178,
"train/steps_per_hour": 310.2591018309556,
"train/total_elapsed_hours": 1.327922364142206
},
{
"epoch": 0.9789629629629629,
"grad_norm": 2.140625,
"learning_rate": 2.796202818819871e-08,
"loss": 0.9675,
"step": 413
},
{
"epoch": 0.9789629629629629,
"step": 413,
"train/combined_loss": 0.06092071859166026,
"train/cross_entropy_loss": 0.02085877349600196,
"train/kl_divergence_loss": 0.100982666015625,
"train/step_duration_seconds": 7.189931392669678,
"train/steps_per_hour": 310.5450962243895,
"train/total_elapsed_hours": 1.3299195673068365
},
{
"epoch": 0.9813333333333333,
"grad_norm": 1.3984375,
"learning_rate": 2.2095616616150117e-08,
"loss": 0.9747,
"step": 414
},
{
"epoch": 0.9813333333333333,
"step": 414,
"train/combined_loss": 0.05822563171386719,
"train/cross_entropy_loss": 0.020259857177734375,
"train/kl_divergence_loss": 0.09619140625,
"train/step_duration_seconds": 7.186516523361206,
"train/steps_per_hour": 310.8304542928701,
"train/total_elapsed_hours": 1.331915821896659
},
{
"epoch": 0.9837037037037037,
"grad_norm": 2.515625,
"learning_rate": 1.6918417287318245e-08,
"loss": 0.9316,
"step": 415
},
{
"epoch": 0.9837037037037037,
"step": 415,
"train/combined_loss": 0.05974208423867822,
"train/cross_entropy_loss": 0.020485147717408836,
"train/kl_divergence_loss": 0.0989990234375,
"train/step_duration_seconds": 7.198460102081299,
"train/steps_per_hour": 311.1141844684285,
"train/total_elapsed_hours": 1.333915394147237
},
{
"epoch": 0.9860740740740741,
"grad_norm": 1.1171875,
"learning_rate": 1.2430787810776556e-08,
"loss": 0.9559,
"step": 416
},
{
"epoch": 0.9860740740740741,
"step": 416,
"train/combined_loss": 0.05951492628082633,
"train/cross_entropy_loss": 0.02033600490540266,
"train/kl_divergence_loss": 0.09869384765625,
"train/step_duration_seconds": 7.179587125778198,
"train/steps_per_hour": 311.3982872915823,
"train/total_elapsed_hours": 1.3359097239043978
},
{
"epoch": 0.9884444444444445,
"grad_norm": 1.2578125,
"learning_rate": 8.633038164358454e-09,
"loss": 0.9522,
"step": 417
},
{
"epoch": 0.9884444444444445,
"step": 417,
"train/combined_loss": 0.060025526909157634,
"train/cross_entropy_loss": 0.020472198841162026,
"train/kl_divergence_loss": 0.099578857421875,
"train/step_duration_seconds": 7.1582019329071045,
"train/steps_per_hour": 311.6829270070737,
"train/total_elapsed_hours": 1.3378981133302053
},
{
"epoch": 0.9908148148148148,
"grad_norm": 1.3125,
"learning_rate": 5.525430673244403e-09,
"loss": 0.9604,
"step": 418
},
{
"epoch": 0.9908148148148148,
"step": 418,
"train/combined_loss": 0.06418039370328188,
"train/cross_entropy_loss": 0.0208778785308823,
"train/kl_divergence_loss": 0.10748291015625,
"train/step_duration_seconds": 7.19930624961853,
"train/steps_per_hour": 311.964063505697,
"train/total_elapsed_hours": 1.339897920621766
},
{
"epoch": 0.9931851851851852,
"grad_norm": 1.4765625,
"learning_rate": 3.1081799918375454e-09,
"loss": 1.0269,
"step": 419
},
{
"epoch": 0.9931851851851852,
"step": 419,
"train/combined_loss": 0.0633124178275466,
"train/cross_entropy_loss": 0.020789876696653664,
"train/kl_divergence_loss": 0.1058349609375,
"train/step_duration_seconds": 7.172863245010376,
"train/steps_per_hour": 312.2460712308407,
"train/total_elapsed_hours": 1.341890382634269
},
{
"epoch": 0.9955555555555555,
"grad_norm": 1.3515625,
"learning_rate": 1.3814530889433298e-09,
"loss": 1.013,
"step": 420
},
{
"epoch": 0.9955555555555555,
"step": 420,
"train/combined_loss": 0.05849831993691623,
"train/cross_entropy_loss": 0.020225399872288108,
"train/kl_divergence_loss": 0.096771240234375,
"train/step_duration_seconds": 7.199819087982178,
"train/steps_per_hour": 312.5255014342565,
"train/total_elapsed_hours": 1.3438903323809306
},
{
"epoch": 0.9979259259259259,
"grad_norm": 1.015625,
"learning_rate": 3.4536923623096353e-10,
"loss": 0.936,
"step": 421
}
],
"logging_steps": 1,
"max_steps": 421,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0845947087447654e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}