{
  "best_global_step": 400,
  "best_metric": 0.060702841728925705,
  "best_model_checkpoint": null,
  "epoch": 0.9979259259259259,
  "eval_steps": 50,
  "global_step": 421,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0,
      "step": 0,
      "train/combined_loss": 4.310243457555771,
      "train/cross_entropy_loss": 7.370486944913864,
      "train/kl_divergence_loss": 1.25,
      "train/step_duration_seconds": 64.96706748008728,
      "train/steps_per_hour": 0.0,
      "train/total_elapsed_hours": 0.018046407500902813
    },
    {
      "epoch": 0.0023703703703703703,
      "grad_norm": 26752.0,
      "learning_rate": 0.0,
      "loss": 68.9639,
      "step": 1
    },
    {
      "epoch": 0.0023703703703703703,
      "step": 1,
      "train/combined_loss": 4.25145959854126,
      "train/cross_entropy_loss": 7.2763566970825195,
      "train/kl_divergence_loss": 1.2265625,
      "train/step_duration_seconds": 7.274043560028076,
      "train/steps_per_hour": 49.83312094637097,
      "train/total_elapsed_hours": 0.020066975156466167
    },
    {
      "epoch": 0.004740740740740741,
      "grad_norm": 27392.0,
      "learning_rate": 4.651162790697675e-07,
      "loss": 68.0234,
      "step": 2
    },
    {
      "epoch": 0.004740740740740741,
      "step": 2,
      "train/combined_loss": 4.210591048002243,
      "train/cross_entropy_loss": 7.187784105539322,
      "train/kl_divergence_loss": 1.2333984375,
      "train/step_duration_seconds": 7.055061340332031,
      "train/steps_per_hour": 90.79883463670956,
      "train/total_elapsed_hours": 0.02202671441766951
    },
    {
      "epoch": 0.0071111111111111115,
      "grad_norm": 27264.0,
      "learning_rate": 9.30232558139535e-07,
      "loss": 67.3695,
      "step": 3
    },
    {
      "epoch": 0.0071111111111111115,
      "step": 3,
      "train/combined_loss": 4.220440715551376,
      "train/cross_entropy_loss": 7.188928246498108,
      "train/kl_divergence_loss": 1.251953125,
      "train/step_duration_seconds": 7.075124740600586,
      "train/steps_per_hour": 125.04154064617586,
      "train/total_elapsed_hours": 0.023992026845614117
    },
    {
      "epoch": 0.009481481481481481,
      "grad_norm": 26240.0,
      "learning_rate": 1.3953488372093025e-06,
      "loss": 67.5271,
      "step": 4
    },
    {
      "epoch": 0.009481481481481481,
      "step": 4,
      "train/combined_loss": 4.050247877836227,
      "train/cross_entropy_loss": 6.869538724422455,
      "train/kl_divergence_loss": 1.23095703125,
      "train/step_duration_seconds": 7.09927773475647,
      "train/steps_per_hour": 154.0591795404681,
      "train/total_elapsed_hours": 0.025964048438602023
    },
    {
      "epoch": 0.011851851851851851,
      "grad_norm": 27264.0,
      "learning_rate": 1.86046511627907e-06,
      "loss": 64.804,
      "step": 5
    },
    {
      "epoch": 0.011851851851851851,
      "step": 5,
      "train/combined_loss": 3.8814118206501007,
      "train/cross_entropy_loss": 6.518194735050201,
      "train/kl_divergence_loss": 1.24462890625,
      "train/step_duration_seconds": 7.140614032745361,
      "train/steps_per_hour": 178.9065439356696,
      "train/total_elapsed_hours": 0.027947552336586846
    },
    {
      "epoch": 0.014222222222222223,
      "grad_norm": 28160.0,
      "learning_rate": 2.3255813953488376e-06,
      "loss": 62.1026,
      "step": 6
    },
    {
      "epoch": 0.014222222222222223,
      "step": 6,
      "train/combined_loss": 3.3225543051958084,
      "train/cross_entropy_loss": 5.414151579141617,
      "train/kl_divergence_loss": 1.23095703125,
      "train/step_duration_seconds": 7.164709091186523,
      "train/steps_per_hour": 200.41586755906124,
      "train/total_elapsed_hours": 0.02993774930636088
    },
    {
      "epoch": 0.016592592592592593,
      "grad_norm": 22912.0,
      "learning_rate": 2.790697674418605e-06,
      "loss": 53.1609,
      "step": 7
    },
    {
      "epoch": 0.016592592592592593,
      "step": 7,
      "train/combined_loss": 2.8946415334939957,
      "train/cross_entropy_loss": 4.530005723237991,
      "train/kl_divergence_loss": 1.25927734375,
      "train/step_duration_seconds": 7.1137425899505615,
      "train/steps_per_hour": 219.34092560159692,
      "train/total_elapsed_hours": 0.03191378891468048
    },
    {
      "epoch": 0.018962962962962963,
      "grad_norm": 6400.0,
      "learning_rate": 3.2558139534883724e-06,
      "loss": 46.3143,
      "step": 8
    },
    {
      "epoch": 0.018962962962962963,
      "step": 8,
      "train/combined_loss": 2.7050345838069916,
      "train/cross_entropy_loss": 4.093662917613983,
      "train/kl_divergence_loss": 1.31640625,
      "train/step_duration_seconds": 7.135659217834473,
      "train/steps_per_hour": 236.01663067159015,
      "train/total_elapsed_hours": 0.03389591647519006
    },
    {
      "epoch": 0.021333333333333333,
      "grad_norm": 9088.0,
      "learning_rate": 3.72093023255814e-06,
      "loss": 43.2806,
      "step": 9
    },
    {
      "epoch": 0.021333333333333333,
      "step": 9,
      "train/combined_loss": 2.491789221763611,
      "train/cross_entropy_loss": 3.7003750950098038,
      "train/kl_divergence_loss": 1.283203125,
      "train/step_duration_seconds": 7.159760475158691,
      "train/steps_per_hour": 250.802995746654,
      "train/total_elapsed_hours": 0.035884738829400804
    },
    {
      "epoch": 0.023703703703703703,
      "grad_norm": 14784.0,
      "learning_rate": 4.186046511627907e-06,
      "loss": 39.8686,
      "step": 10
    },
    {
      "epoch": 0.023703703703703703,
      "step": 10,
      "train/combined_loss": 2.106148824095726,
      "train/cross_entropy_loss": 2.877825006842613,
      "train/kl_divergence_loss": 1.33447265625,
      "train/step_duration_seconds": 7.181847333908081,
      "train/steps_per_hour": 263.99366796803935,
      "train/total_elapsed_hours": 0.037879696422153046
    },
    {
      "epoch": 0.026074074074074072,
      "grad_norm": 10880.0,
      "learning_rate": 4.651162790697675e-06,
      "loss": 33.6984,
      "step": 11
    },
    {
      "epoch": 0.026074074074074072,
      "step": 11,
      "train/combined_loss": 1.4576978087425232,
      "train/cross_entropy_loss": 1.571645624935627,
      "train/kl_divergence_loss": 1.34375,
      "train/step_duration_seconds": 7.17723274230957,
      "train/steps_per_hour": 275.8733309353751,
      "train/total_elapsed_hours": 0.03987337218390571
    },
    {
      "epoch": 0.028444444444444446,
      "grad_norm": 4704.0,
      "learning_rate": 5.116279069767442e-06,
      "loss": 23.3232,
      "step": 12
    },
    {
      "epoch": 0.028444444444444446,
      "step": 12,
      "train/combined_loss": 1.0812036469578743,
      "train/cross_entropy_loss": 0.8298877663910389,
      "train/kl_divergence_loss": 1.33251953125,
      "train/step_duration_seconds": 7.202847242355347,
      "train/steps_per_hour": 286.5728916364825,
      "train/total_elapsed_hours": 0.04187416308455997
    },
    {
      "epoch": 0.030814814814814816,
      "grad_norm": 1288.0,
      "learning_rate": 5.58139534883721e-06,
      "loss": 17.2993,
      "step": 13
    },
    {
      "epoch": 0.030814814814814816,
      "step": 13,
      "train/combined_loss": 0.917561486363411,
      "train/cross_entropy_loss": 0.5206698887050152,
      "train/kl_divergence_loss": 1.314453125,
      "train/step_duration_seconds": 7.210479736328125,
      "train/steps_per_hour": 296.28229001155563,
      "train/total_elapsed_hours": 0.043877074122428895
    },
    {
      "epoch": 0.033185185185185186,
      "grad_norm": 1064.0,
      "learning_rate": 6.046511627906977e-06,
      "loss": 14.681,
      "step": 14
    },
    {
      "epoch": 0.033185185185185186,
      "step": 14,
      "train/combined_loss": 0.8026629276573658,
      "train/cross_entropy_loss": 0.42270869202911854,
      "train/kl_divergence_loss": 1.1826171875,
      "train/step_duration_seconds": 7.176325559616089,
      "train/steps_per_hour": 305.20706432882776,
      "train/total_elapsed_hours": 0.04587049788898892
    },
    {
      "epoch": 0.035555555555555556,
      "grad_norm": 680.0,
      "learning_rate": 6.511627906976745e-06,
      "loss": 12.8426,
      "step": 15
    },
    {
      "epoch": 0.035555555555555556,
      "step": 15,
      "train/combined_loss": 0.7154323048889637,
      "train/cross_entropy_loss": 0.39229039661586285,
      "train/kl_divergence_loss": 1.03857421875,
      "train/step_duration_seconds": 7.2133026123046875,
      "train/steps_per_hour": 313.3212079729679,
      "train/total_elapsed_hours": 0.047874193059073554
    },
    {
      "epoch": 0.037925925925925925,
      "grad_norm": 516.0,
      "learning_rate": 6.976744186046513e-06,
      "loss": 11.4469,
      "step": 16
    },
    {
      "epoch": 0.037925925925925925,
      "step": 16,
      "train/combined_loss": 0.6362243294715881,
      "train/cross_entropy_loss": 0.37010490894317627,
      "train/kl_divergence_loss": 0.90234375,
      "train/step_duration_seconds": 7.195964336395264,
      "train/steps_per_hour": 320.81440635372326,
      "train/total_elapsed_hours": 0.04987307204140557
    },
    {
      "epoch": 0.040296296296296295,
      "grad_norm": 506.0,
      "learning_rate": 7.44186046511628e-06,
      "loss": 10.1796,
      "step": 17
    },
    {
      "epoch": 0.040296296296296295,
      "step": 17,
      "train/combined_loss": 0.5937556512653828,
      "train/cross_entropy_loss": 0.3642691094428301,
      "train/kl_divergence_loss": 0.8232421875,
      "train/step_duration_seconds": 7.201917409896851,
      "train/steps_per_hour": 327.71965844625936,
      "train/total_elapsed_hours": 0.05187360465526581
    },
    {
      "epoch": 0.042666666666666665,
      "grad_norm": 324.0,
      "learning_rate": 7.906976744186048e-06,
      "loss": 9.5001,
      "step": 18
    },
    {
      "epoch": 0.042666666666666665,
      "step": 18,
      "train/combined_loss": 0.5263483263552189,
      "train/cross_entropy_loss": 0.314903661608696,
      "train/kl_divergence_loss": 0.73779296875,
      "train/step_duration_seconds": 7.208310604095459,
      "train/steps_per_hour": 334.1010656793036,
      "train/total_elapsed_hours": 0.053875913156403436
    },
    {
      "epoch": 0.045037037037037035,
      "grad_norm": 219.0,
      "learning_rate": 8.372093023255815e-06,
      "loss": 8.4216,
      "step": 19
    },
    {
      "epoch": 0.045037037037037035,
      "step": 19,
      "train/combined_loss": 0.43465932086110115,
      "train/cross_entropy_loss": 0.2343088500201702,
      "train/kl_divergence_loss": 0.635009765625,
      "train/step_duration_seconds": 7.1786744594573975,
      "train/steps_per_hour": 340.075239062999,
      "train/total_elapsed_hours": 0.0558699893951416
    },
    {
      "epoch": 0.047407407407407405,
      "grad_norm": 146.0,
      "learning_rate": 8.837209302325582e-06,
      "loss": 6.9546,
      "step": 20
    },
    {
      "epoch": 0.047407407407407405,
      "step": 20,
      "train/combined_loss": 0.3771132677793503,
      "train/cross_entropy_loss": 0.18440232705324888,
      "train/kl_divergence_loss": 0.56982421875,
      "train/step_duration_seconds": 7.214315176010132,
      "train/steps_per_hour": 345.57852934665823,
      "train/total_elapsed_hours": 0.057873965832922196
    },
    {
      "epoch": 0.049777777777777775,
      "grad_norm": 109.5,
      "learning_rate": 9.30232558139535e-06,
      "loss": 6.0338,
      "step": 21
    },
    {
      "epoch": 0.049777777777777775,
      "step": 21,
      "train/combined_loss": 0.3578077703714371,
      "train/cross_entropy_loss": 0.15311553701758385,
      "train/kl_divergence_loss": 0.5625,
      "train/step_duration_seconds": 7.201047420501709,
      "train/steps_per_hour": 350.73504254237196,
      "train/total_elapsed_hours": 0.05987425678306156
    },
    {
      "epoch": 0.052148148148148145,
      "grad_norm": 100.5,
      "learning_rate": 9.767441860465117e-06,
      "loss": 5.7249,
      "step": 22
    },
    {
      "epoch": 0.052148148148148145,
      "step": 22,
      "train/combined_loss": 0.31270523741841316,
      "train/cross_entropy_loss": 0.11625519627705216,
      "train/kl_divergence_loss": 0.5091552734375,
      "train/step_duration_seconds": 7.181376218795776,
      "train/steps_per_hour": 355.5895570404658,
      "train/total_elapsed_hours": 0.06186908351050483
    },
    {
      "epoch": 0.05451851851851852,
      "grad_norm": 75.0,
      "learning_rate": 1.0232558139534884e-05,
      "loss": 5.0033,
      "step": 23
    },
    {
      "epoch": 0.05451851851851852,
      "step": 23,
      "train/combined_loss": 0.28502367064356804,
      "train/cross_entropy_loss": 0.10581392887979746,
      "train/kl_divergence_loss": 0.4642333984375,
      "train/step_duration_seconds": 7.173454284667969,
      "train/steps_per_hour": 360.15321399626356,
      "train/total_elapsed_hours": 0.06386170970069037
    },
    {
      "epoch": 0.05688888888888889,
      "grad_norm": 73.5,
      "learning_rate": 1.0697674418604651e-05,
      "loss": 4.5604,
      "step": 24
    },
    {
      "epoch": 0.05688888888888889,
      "step": 24,
      "train/combined_loss": 0.2662050947546959,
      "train/cross_entropy_loss": 0.09075977467000484,
      "train/kl_divergence_loss": 0.441650390625,
      "train/step_duration_seconds": 7.196916818618774,
      "train/steps_per_hour": 364.404632061752,
      "train/total_elapsed_hours": 0.06586085326141781
    },
    {
      "epoch": 0.05925925925925926,
      "grad_norm": 53.5,
      "learning_rate": 1.116279069767442e-05,
      "loss": 4.2593,
      "step": 25
    },
    {
      "epoch": 0.05925925925925926,
      "step": 25,
      "train/combined_loss": 0.2555910013616085,
      "train/cross_entropy_loss": 0.07075231382623315,
      "train/kl_divergence_loss": 0.4404296875,
      "train/step_duration_seconds": 7.2051169872283936,
      "train/steps_per_hour": 368.39319239049337,
      "train/total_elapsed_hours": 0.06786227464675904
    },
    {
      "epoch": 0.06162962962962963,
      "grad_norm": 45.0,
      "learning_rate": 1.1627906976744187e-05,
      "loss": 4.0895,
      "step": 26
    },
    {
      "epoch": 0.06162962962962963,
      "step": 26,
      "train/combined_loss": 0.25060202460736036,
      "train/cross_entropy_loss": 0.06773236347362399,
      "train/kl_divergence_loss": 0.4334716796875,
      "train/step_duration_seconds": 7.195337772369385,
      "train/steps_per_hour": 372.1676986924219,
      "train/total_elapsed_hours": 0.06986097958352831
    },
    {
      "epoch": 0.064,
      "grad_norm": 34.75,
      "learning_rate": 1.2093023255813954e-05,
      "loss": 4.0096,
      "step": 27
    },
    {
      "epoch": 0.064,
      "step": 27,
      "train/combined_loss": 0.22914704959839582,
      "train/cross_entropy_loss": 0.06779117416590452,
      "train/kl_divergence_loss": 0.3905029296875,
      "train/step_duration_seconds": 7.1646952629089355,
      "train/steps_per_hour": 375.7767476973663,
      "train/total_elapsed_hours": 0.07185117271211412
    },
    {
      "epoch": 0.06637037037037037,
      "grad_norm": 48.5,
      "learning_rate": 1.2558139534883723e-05,
      "loss": 3.6664,
      "step": 28
    },
    {
      "epoch": 0.06637037037037037,
      "step": 28,
      "train/combined_loss": 0.22941692359745502,
      "train/cross_entropy_loss": 0.0525838378816843,
      "train/kl_divergence_loss": 0.40625,
      "train/step_duration_seconds": 7.199108123779297,
      "train/steps_per_hour": 379.1421706885833,
      "train/total_elapsed_hours": 0.07385092496871948
    },
    {
      "epoch": 0.06874074074074074,
      "grad_norm": 72.5,
      "learning_rate": 1.302325581395349e-05,
      "loss": 3.6707,
      "step": 29
    },
    {
      "epoch": 0.06874074074074074,
      "step": 29,
      "train/combined_loss": 0.22196358162909746,
      "train/cross_entropy_loss": 0.048419348895549774,
      "train/kl_divergence_loss": 0.3955078125,
      "train/step_duration_seconds": 7.179231882095337,
      "train/steps_per_hour": 382.35797131195636,
      "train/total_elapsed_hours": 0.0758451560470793
    },
    {
      "epoch": 0.07111111111111111,
      "grad_norm": 63.75,
      "learning_rate": 1.3488372093023257e-05,
      "loss": 3.5514,
      "step": 30
    },
    {
      "epoch": 0.07111111111111111,
      "step": 30,
      "train/combined_loss": 0.2120195608586073,
      "train/cross_entropy_loss": 0.056119201704859734,
      "train/kl_divergence_loss": 0.367919921875,
      "train/step_duration_seconds": 7.1586713790893555,
      "train/steps_per_hour": 385.43727586928117,
      "train/total_elapsed_hours": 0.07783367587460412
    },
    {
      "epoch": 0.07348148148148148,
      "grad_norm": 34.5,
      "learning_rate": 1.3953488372093025e-05,
      "loss": 3.3923,
      "step": 31
    },
    {
      "epoch": 0.07348148148148148,
      "step": 31,
      "train/combined_loss": 0.20117830298841,
      "train/cross_entropy_loss": 0.0440802276134491,
      "train/kl_divergence_loss": 0.3582763671875,
      "train/step_duration_seconds": 7.1888298988342285,
      "train/steps_per_hour": 388.3224034144493,
      "train/total_elapsed_hours": 0.07983057306872474
    },
    {
      "epoch": 0.07585185185185185,
      "grad_norm": 37.75,
      "learning_rate": 1.441860465116279e-05,
      "loss": 3.2189,
      "step": 32
    },
    {
      "epoch": 0.07585185185185185,
      "step": 32,
      "train/combined_loss": 0.20670769922435284,
      "train/cross_entropy_loss": 0.03853747947141528,
      "train/kl_divergence_loss": 0.3748779296875,
      "train/step_duration_seconds": 7.163522005081177,
      "train/steps_per_hour": 391.1003153449009,
      "train/total_elapsed_hours": 0.0818204402923584
    },
    {
      "epoch": 0.07822222222222222,
      "grad_norm": 114.5,
      "learning_rate": 1.488372093023256e-05,
      "loss": 3.3073,
      "step": 33
    },
    {
      "epoch": 0.07822222222222222,
      "step": 33,
      "train/combined_loss": 0.19256712403148413,
      "train/cross_entropy_loss": 0.040407692082226276,
      "train/kl_divergence_loss": 0.3447265625,
      "train/step_duration_seconds": 7.171715974807739,
      "train/steps_per_hour": 393.73562507193196,
      "train/total_elapsed_hours": 0.08381258361869388
    },
    {
      "epoch": 0.08059259259259259,
      "grad_norm": 48.25,
      "learning_rate": 1.5348837209302328e-05,
      "loss": 3.0811,
      "step": 34
    },
    {
      "epoch": 0.08059259259259259,
      "step": 34,
      "train/combined_loss": 0.2014658311381936,
      "train/cross_entropy_loss": 0.0435566701926291,
      "train/kl_divergence_loss": 0.359375,
      "train/step_duration_seconds": 7.14280104637146,
      "train/steps_per_hour": 396.28566102564344,
      "train/total_elapsed_hours": 0.08579669502046373
    },
    {
      "epoch": 0.08296296296296296,
      "grad_norm": 105.5,
      "learning_rate": 1.5813953488372095e-05,
      "loss": 3.2235,
      "step": 35
    },
    {
      "epoch": 0.08296296296296296,
      "step": 35,
      "train/combined_loss": 0.1921772612258792,
      "train/cross_entropy_loss": 0.03669826895929873,
      "train/kl_divergence_loss": 0.34765625,
      "train/step_duration_seconds": 7.161575078964233,
      "train/steps_per_hour": 398.6967335955144,
      "train/total_elapsed_hours": 0.08778602143128712
    },
    {
      "epoch": 0.08533333333333333,
      "grad_norm": 76.5,
      "learning_rate": 1.6279069767441862e-05,
      "loss": 3.0748,
      "step": 36
    },
    {
      "epoch": 0.08533333333333333,
      "step": 36,
      "train/combined_loss": 0.18726484011858702,
      "train/cross_entropy_loss": 0.029070683754980564,
      "train/kl_divergence_loss": 0.345458984375,
      "train/step_duration_seconds": 7.136035680770874,
      "train/steps_per_hour": 401.0326431715552,
      "train/total_elapsed_hours": 0.08976825356483459
    },
    {
      "epoch": 0.0877037037037037,
      "grad_norm": 33.25,
      "learning_rate": 1.674418604651163e-05,
      "loss": 2.9962,
      "step": 37
    },
    {
      "epoch": 0.0877037037037037,
      "step": 37,
      "train/combined_loss": 0.1922608781605959,
      "train/cross_entropy_loss": 0.03149440907873213,
      "train/kl_divergence_loss": 0.35302734375,
      "train/step_duration_seconds": 7.1254284381866455,
      "train/steps_per_hour": 403.28057085391987,
      "train/total_elapsed_hours": 0.09174753924210867
    },
    {
      "epoch": 0.09007407407407407,
      "grad_norm": 50.0,
      "learning_rate": 1.7209302325581396e-05,
      "loss": 3.0762,
      "step": 38
    },
    {
      "epoch": 0.09007407407407407,
      "step": 38,
      "train/combined_loss": 0.17505795694887638,
      "train/cross_entropy_loss": 0.02919307304546237,
      "train/kl_divergence_loss": 0.3209228515625,
      "train/step_duration_seconds": 7.103011608123779,
      "train/steps_per_hour": 405.4604942984025,
      "train/total_elapsed_hours": 0.09372059802214304
    },
    {
      "epoch": 0.09244444444444444,
      "grad_norm": 30.75,
      "learning_rate": 1.7674418604651163e-05,
      "loss": 2.8009,
      "step": 39
    },
    {
      "epoch": 0.09244444444444444,
      "step": 39,
      "train/combined_loss": 0.17248188611119986,
      "train/cross_entropy_loss": 0.017937407130375504,
      "train/kl_divergence_loss": 0.3270263671875,
      "train/step_duration_seconds": 7.110121965408325,
      "train/steps_per_hour": 407.5421126867549,
      "train/total_elapsed_hours": 0.09569563190142313
    },
    {
      "epoch": 0.09481481481481481,
      "grad_norm": 38.5,
      "learning_rate": 1.813953488372093e-05,
      "loss": 2.7597,
      "step": 40
    },
    {
      "epoch": 0.09481481481481481,
      "step": 40,
      "train/combined_loss": 0.17410407960414886,
      "train/cross_entropy_loss": 0.016787268687039614,
      "train/kl_divergence_loss": 0.3314208984375,
      "train/step_duration_seconds": 7.180423736572266,
      "train/steps_per_hour": 409.4576778026899,
      "train/total_elapsed_hours": 0.09769019405047098
    },
    {
      "epoch": 0.09718518518518518,
      "grad_norm": 31.25,
      "learning_rate": 1.86046511627907e-05,
      "loss": 2.7857,
      "step": 41
    },
    {
      "epoch": 0.09718518518518518,
      "step": 41,
      "train/combined_loss": 0.16598839219659567,
      "train/cross_entropy_loss": 0.023749235086143017,
      "train/kl_divergence_loss": 0.3082275390625,
      "train/step_duration_seconds": 7.1705567836761475,
      "train/steps_per_hour": 411.30789585265995,
      "train/total_elapsed_hours": 0.09968201537926992
    },
    {
      "epoch": 0.09955555555555555,
      "grad_norm": 29.5,
      "learning_rate": 1.9069767441860468e-05,
      "loss": 2.6558,
      "step": 42
    },
    {
      "epoch": 0.09955555555555555,
      "step": 42,
      "train/combined_loss": 0.16566006373614073,
      "train/cross_entropy_loss": 0.030416806926950812,
      "train/kl_divergence_loss": 0.3009033203125,
      "train/step_duration_seconds": 7.185399770736694,
      "train/steps_per_hour": 413.06887057061425,
      "train/total_elapsed_hours": 0.1016779597600301
    },
    {
      "epoch": 0.10192592592592592,
      "grad_norm": 35.5,
      "learning_rate": 1.9534883720930235e-05,
      "loss": 2.6506,
      "step": 43
    },
    {
      "epoch": 0.10192592592592592,
      "step": 43,
      "train/combined_loss": 0.1595935821533203,
      "train/cross_entropy_loss": 0.024265281856060028,
      "train/kl_divergence_loss": 0.294921875,
      "train/step_duration_seconds": 7.195603370666504,
      "train/steps_per_hour": 414.75070139036376,
      "train/total_elapsed_hours": 0.10367673847410414
    },
    {
      "epoch": 0.10429629629629629,
      "grad_norm": 17.5,
      "learning_rate": 2e-05,
      "loss": 2.5535,
      "step": 44
    },
    {
      "epoch": 0.10429629629629629,
      "step": 44,
      "train/combined_loss": 0.1587589643895626,
      "train/cross_entropy_loss": 0.013806993083562702,
      "train/kl_divergence_loss": 0.3037109375,
      "train/step_duration_seconds": 7.19236421585083,
      "train/steps_per_hour": 416.37245606383044,
      "train/total_elapsed_hours": 0.10567461742295159
    },
    {
      "epoch": 0.10666666666666667,
      "grad_norm": 32.25,
      "learning_rate": 1.999965463076377e-05,
      "loss": 2.5401,
      "step": 45
    },
    {
      "epoch": 0.10666666666666667,
      "step": 45,
      "train/combined_loss": 0.15903105773031712,
      "train/cross_entropy_loss": 0.013862900086678565,
      "train/kl_divergence_loss": 0.30419921875,
      "train/step_duration_seconds": 7.199136018753052,
      "train/steps_per_hour": 417.9267256968682,
      "train/total_elapsed_hours": 0.10767437742816077
    },
    {
      "epoch": 0.10903703703703704,
      "grad_norm": 30.25,
      "learning_rate": 1.999861854691106e-05,
      "loss": 2.5445,
      "step": 46
    },
    {
      "epoch": 0.10903703703703704,
      "step": 46,
      "train/combined_loss": 0.16162511333823204,
      "train/cross_entropy_loss": 0.021858620457351208,
      "train/kl_divergence_loss": 0.3013916015625,
      "train/step_duration_seconds": 7.177664041519165,
      "train/steps_per_hour": 419.44712621402573,
      "train/total_elapsed_hours": 0.10966817299524943
    },
    {
      "epoch": 0.11140740740740741,
      "grad_norm": 10.4375,
      "learning_rate": 1.9996891820008165e-05,
      "loss": 2.586,
      "step": 47
    },
    {
      "epoch": 0.11140740740740741,
      "step": 47,
      "train/combined_loss": 0.15759198181331158,
      "train/cross_entropy_loss": 0.032835332211107016,
      "train/kl_divergence_loss": 0.2823486328125,
      "train/step_duration_seconds": 7.197060823440552,
      "train/steps_per_hour": 420.89292205888296,
      "train/total_elapsed_hours": 0.11166735655731624
    },
    {
      "epoch": 0.11377777777777778,
      "grad_norm": 34.5,
      "learning_rate": 1.999447456932676e-05,
      "loss": 2.5215,
      "step": 48
    },
    {
      "epoch": 0.11377777777777778,
      "step": 48,
      "train/combined_loss": 0.15159989707171917,
      "train/cross_entropy_loss": 0.02695468720048666,
      "train/kl_divergence_loss": 0.2762451171875,
      "train/step_duration_seconds": 7.164118528366089,
      "train/steps_per_hour": 422.32185886743343,
      "train/total_elapsed_hours": 0.11365738948186238
    },
    {
      "epoch": 0.11614814814814815,
      "grad_norm": 31.5,
      "learning_rate": 1.9991366961835643e-05,
      "loss": 2.4256,
      "step": 49
    },
    {
      "epoch": 0.11614814814814815,
      "step": 49,
      "train/combined_loss": 0.15017856005579233,
      "train/cross_entropy_loss": 0.019473322783596814,
      "train/kl_divergence_loss": 0.2808837890625,
      "train/step_duration_seconds": 7.150151968002319,
      "train/steps_per_hour": 423.7158323839195,
      "train/total_elapsed_hours": 0.11564354280630748
    },
    {
      "epoch": 0.11851851851851852,
      "grad_norm": 23.375,
      "learning_rate": 1.9987569212189224e-05,
      "loss": 2.4029,
      "step": 50
    },
    {
      "epoch": 0.11851851851851852,
      "eval_combined_loss": 0.14935019636029997,
      "eval_cross_entropy_loss": 0.01737226772059997,
      "eval_kl_divergence_loss": 0.281328125,
      "eval_loss": 0.14935019612312317,
      "eval_runtime": 220.3536,
      "eval_samples_per_second": 6.807,
      "eval_steps_per_second": 3.404,
      "step": 50
    },
    {
      "epoch": 0.11851851851851852,
      "step": 50,
      "train/combined_loss": 0.14872634038329124,
      "train/cross_entropy_loss": 0.017301325569860637,
      "train/kl_divergence_loss": 0.2801513671875,
      "train/step_duration_seconds": 227.54744958877563,
      "train/steps_per_hour": 279.5620551165938,
      "train/total_elapsed_hours": 0.17885116769207848
    },
    {
      "epoch": 0.12088888888888889,
      "grad_norm": 20.25,
      "learning_rate": 1.9983081582712684e-05,
      "loss": 2.3796,
      "step": 51
    },
    {
      "epoch": 0.12088888888888889,
      "step": 51,
      "train/combined_loss": 0.15616787131875753,
      "train/cross_entropy_loss": 0.019611137569881976,
      "train/kl_divergence_loss": 0.292724609375,
      "train/step_duration_seconds": 7.1779563426971436,
      "train/steps_per_hour": 282.00938628976195,
      "train/total_elapsed_hours": 0.1808450444539388
    },
    {
      "epoch": 0.12325925925925926,
      "grad_norm": 27.375,
      "learning_rate": 1.997790438338385e-05,
      "loss": 2.4987,
      "step": 52
    },
    {
      "epoch": 0.12325925925925926,
      "step": 52,
      "train/combined_loss": 0.1437931666150689,
      "train/cross_entropy_loss": 0.019641992752440274,
      "train/kl_divergence_loss": 0.2679443359375,
      "train/step_duration_seconds": 7.16746187210083,
      "train/steps_per_hour": 284.4078751961097,
      "train/total_elapsed_hours": 0.1828360060850779
    },
    {
      "epoch": 0.12562962962962962,
      "grad_norm": 15.4375,
      "learning_rate": 1.9972037971811802e-05,
      "loss": 2.3007,
      "step": 53
    },
    {
      "epoch": 0.12562962962962962,
      "step": 53,
      "train/combined_loss": 0.14357721991837025,
      "train/cross_entropy_loss": 0.02165149967186153,
      "train/kl_divergence_loss": 0.2655029296875,
      "train/step_duration_seconds": 7.177205562591553,
      "train/steps_per_hour": 286.75049177904856,
      "train/total_elapsed_hours": 0.1848296742969089
    },
    {
      "epoch": 0.128,
      "grad_norm": 20.125,
      "learning_rate": 1.9965482753212154e-05,
      "loss": 2.2972,
      "step": 54
    },
    {
      "epoch": 0.128,
      "step": 54,
      "train/combined_loss": 0.14043164812028408,
      "train/cross_entropy_loss": 0.023294932674616575,
      "train/kl_divergence_loss": 0.257568359375,
      "train/step_duration_seconds": 7.157373428344727,
      "train/steps_per_hour": 289.05163369286066,
      "train/total_elapsed_hours": 0.1868178335825602
    },
    {
      "epoch": 0.13037037037037036,
      "grad_norm": 17.125,
      "learning_rate": 1.995823918037908e-05,
      "loss": 2.2469,
      "step": 55
    },
    {
      "epoch": 0.13037037037037036,
      "step": 55,
      "train/combined_loss": 0.1433863490819931,
      "train/cross_entropy_loss": 0.02163596951868385,
      "train/kl_divergence_loss": 0.26513671875,
      "train/step_duration_seconds": 7.159351348876953,
      "train/steps_per_hour": 291.30346508519057,
      "train/total_elapsed_hours": 0.1888065422905816
    },
    {
      "epoch": 0.13274074074074074,
      "grad_norm": 12.5,
      "learning_rate": 1.9950307753654016e-05,
      "loss": 2.2942,
      "step": 56
    },
    {
      "epoch": 0.13274074074074074,
      "step": 56,
      "train/combined_loss": 0.1372276395559311,
      "train/cross_entropy_loss": 0.019206264754757285,
      "train/kl_divergence_loss": 0.2552490234375,
      "train/step_duration_seconds": 7.149013042449951,
      "train/steps_per_hour": 293.5127714276043,
      "train/total_elapsed_hours": 0.1907923792468177
    },
    {
      "epoch": 0.1351111111111111,
      "grad_norm": 15.0625,
      "learning_rate": 1.994168902089112e-05,
      "loss": 2.1956,
      "step": 57
    },
    {
      "epoch": 0.1351111111111111,
      "step": 57,
      "train/combined_loss": 0.13671019952744246,
      "train/cross_entropy_loss": 0.020124488859437406,
      "train/kl_divergence_loss": 0.2532958984375,
      "train/step_duration_seconds": 7.1693689823150635,
      "train/steps_per_hour": 295.6678886749936,
      "train/total_elapsed_hours": 0.1927838706307941
    },
    {
      "epoch": 0.13748148148148148,
      "grad_norm": 21.25,
      "learning_rate": 1.9932383577419432e-05,
      "loss": 2.1874,
      "step": 58
    },
    {
      "epoch": 0.13748148148148148,
      "step": 58,
      "train/combined_loss": 0.13713468238711357,
      "train/cross_entropy_loss": 0.020057943649590015,
      "train/kl_divergence_loss": 0.25421142578125,
      "train/step_duration_seconds": 7.144319295883179,
      "train/steps_per_hour": 297.7895740672935,
      "train/total_elapsed_hours": 0.19476840376853943
    },
    {
      "epoch": 0.13985185185185184,
      "grad_norm": 19.0,
      "learning_rate": 1.9922392066001724e-05,
      "loss": 2.1942,
      "step": 59
    },
    {
      "epoch": 0.13985185185185184,
      "step": 59,
      "train/combined_loss": 0.1343515245243907,
      "train/cross_entropy_loss": 0.019130286993458867,
      "train/kl_divergence_loss": 0.24957275390625,
      "train/step_duration_seconds": 7.157663822174072,
      "train/steps_per_hour": 299.86280963512706,
      "train/total_elapsed_hours": 0.19675664371914334
    },
    {
      "epoch": 0.14222222222222222,
      "grad_norm": 16.625,
      "learning_rate": 1.991171517679013e-05,
      "loss": 2.1496,
      "step": 60
    },
    {
      "epoch": 0.14222222222222222,
      "step": 60,
      "train/combined_loss": 0.13391043990850449,
      "train/cross_entropy_loss": 0.020872646826319396,
      "train/kl_divergence_loss": 0.2469482421875,
      "train/step_duration_seconds": 7.151079893112183,
      "train/steps_per_hour": 301.8973420742143,
      "train/total_elapsed_hours": 0.1987430548005634
    },
    {
      "epoch": 0.1445925925925926,
      "grad_norm": 5.65625,
      "learning_rate": 1.9900353647278466e-05,
      "loss": 2.1426,
      "step": 61
    },
    {
      "epoch": 0.1445925925925926,
      "step": 61,
      "train/combined_loss": 0.13261367939412594,
      "train/cross_entropy_loss": 0.022612601169385016,
      "train/kl_divergence_loss": 0.24261474609375,
      "train/step_duration_seconds": 7.164619207382202,
      "train/steps_per_hour": 303.88591351636484,
      "train/total_elapsed_hours": 0.200733226802614
    },
    {
      "epoch": 0.14696296296296296,
      "grad_norm": 8.8125,
      "learning_rate": 1.9888308262251286e-05,
      "loss": 2.1218,
      "step": 62
    },
    {
      "epoch": 0.14696296296296296,
      "step": 62,
      "train/combined_loss": 0.1269391467794776,
      "train/cross_entropy_loss": 0.022066760691814125,
      "train/kl_divergence_loss": 0.2318115234375,
      "train/step_duration_seconds": 7.142378091812134,
      "train/steps_per_hour": 305.84476139080533,
      "train/total_elapsed_hours": 0.20271722071700626
    },
    {
      "epoch": 0.14933333333333335,
      "grad_norm": 20.25,
      "learning_rate": 1.9875579853729677e-05,
      "loss": 2.031,
      "step": 63
    },
    {
      "epoch": 0.14933333333333335,
      "step": 63,
      "train/combined_loss": 0.1287386268377304,
      "train/cross_entropy_loss": 0.02023360482417047,
      "train/kl_divergence_loss": 0.23724365234375,
      "train/step_duration_seconds": 7.179792881011963,
      "train/steps_per_hour": 307.75001344506256,
      "train/total_elapsed_hours": 0.20471160762839846
    },
    {
      "epoch": 0.1517037037037037,
      "grad_norm": 8.125,
      "learning_rate": 1.9862169300913784e-05,
      "loss": 2.0598,
      "step": 64
    },
    {
      "epoch": 0.1517037037037037,
      "step": 64,
      "train/combined_loss": 0.1224580081179738,
      "train/cross_entropy_loss": 0.019146979437209666,
      "train/kl_divergence_loss": 0.22576904296875,
      "train/step_duration_seconds": 7.164458274841309,
      "train/steps_per_hour": 309.624880616572,
      "train/total_elapsed_hours": 0.2067017349269655
    },
    {
      "epoch": 0.15407407407407409,
      "grad_norm": 17.625,
      "learning_rate": 1.9848077530122083e-05,
      "loss": 1.9593,
      "step": 65
    },
    {
      "epoch": 0.15407407407407409,
      "step": 65,
      "train/combined_loss": 0.1219773581251502,
      "train/cross_entropy_loss": 0.02142053795978427,
      "train/kl_divergence_loss": 0.2225341796875,
      "train/step_duration_seconds": 7.1500184535980225,
      "train/steps_per_hour": 311.46997603082906,
      "train/total_elapsed_hours": 0.20868785116407607
    },
    {
      "epoch": 0.15644444444444444,
      "grad_norm": 14.625,
      "learning_rate": 1.9833305514727396e-05,
      "loss": 1.9516,
      "step": 66
    },
    {
      "epoch": 0.15644444444444444,
      "step": 66,
      "train/combined_loss": 0.11912317294627428,
      "train/cross_entropy_loss": 0.019252198981121182,
      "train/kl_divergence_loss": 0.218994140625,
      "train/step_duration_seconds": 7.173556327819824,
      "train/steps_per_hour": 313.2705599924478,
      "train/total_elapsed_hours": 0.21068050569958158
    },
    {
      "epoch": 0.15881481481481483,
      "grad_norm": 11.75,
      "learning_rate": 1.981785427508966e-05,
      "loss": 1.906,
      "step": 67
    },
    {
      "epoch": 0.15881481481481483,
      "step": 67,
      "train/combined_loss": 0.11908936966210604,
      "train/cross_entropy_loss": 0.02040529961232096,
      "train/kl_divergence_loss": 0.2177734375,
      "train/step_duration_seconds": 7.164828062057495,
      "train/steps_per_hour": 315.0409941178432,
      "train/total_elapsed_hours": 0.21267073571681977
    },
    {
      "epoch": 0.16118518518518518,
      "grad_norm": 7.46875,
      "learning_rate": 1.9801724878485438e-05,
      "loss": 1.9054,
      "step": 68
    },
    {
      "epoch": 0.16118518518518518,
      "step": 68,
      "train/combined_loss": 0.12613936699926853,
      "train/cross_entropy_loss": 0.025533129926770926,
      "train/kl_divergence_loss": 0.22674560546875,
      "train/step_duration_seconds": 7.197604179382324,
      "train/steps_per_hour": 316.7651640171972,
      "train/total_elapsed_hours": 0.21467007021109263
    },
    {
      "epoch": 0.16355555555555557,
      "grad_norm": 37.25,
      "learning_rate": 1.9784918439034216e-05,
      "loss": 2.0182,
      "step": 69
    },
    {
      "epoch": 0.16355555555555557,
      "step": 69,
      "train/combined_loss": 0.11615706328302622,
      "train/cross_entropy_loss": 0.024733559926971793,
      "train/kl_divergence_loss": 0.20758056640625,
      "train/step_duration_seconds": 7.158296346664429,
      "train/steps_per_hour": 318.4735632448237,
      "train/total_elapsed_hours": 0.21665848586294387
    },
    {
      "epoch": 0.16592592592592592,
      "grad_norm": 25.25,
      "learning_rate": 1.9767436117621416e-05,
      "loss": 1.8585,
      "step": 70
    },
    {
      "epoch": 0.16592592592592592,
      "step": 70,
      "train/combined_loss": 0.11343861371278763,
      "train/cross_entropy_loss": 0.018869414925575256,
      "train/kl_divergence_loss": 0.2080078125,
      "train/step_duration_seconds": 7.1701250076293945,
      "train/steps_per_hour": 320.1460784421751,
      "train/total_elapsed_hours": 0.21865018725395202
    },
    {
      "epoch": 0.1682962962962963,
      "grad_norm": 37.75,
      "learning_rate": 1.9749279121818235e-05,
      "loss": 1.815,
      "step": 71
    },
    {
      "epoch": 0.1682962962962963,
      "step": 71,
      "train/combined_loss": 0.1213934626430273,
      "train/cross_entropy_loss": 0.01866583281662315,
      "train/kl_divergence_loss": 0.22412109375,
      "train/step_duration_seconds": 7.159663677215576,
      "train/steps_per_hour": 321.7926366627231,
      "train/total_elapsed_hours": 0.22063898271984525
    },
    {
      "epoch": 0.17066666666666666,
      "grad_norm": 29.125,
      "learning_rate": 1.973044870579824e-05,
      "loss": 1.9423,
      "step": 72
    },
    {
      "epoch": 0.17066666666666666,
      "step": 72,
      "train/combined_loss": 0.11181758902966976,
      "train/cross_entropy_loss": 0.019289474468678236,
      "train/kl_divergence_loss": 0.204345703125,
      "train/step_duration_seconds": 7.168922185897827,
      "train/steps_per_hour": 323.4060405602183,
      "train/total_elapsed_hours": 0.22263034999370576
    },
    {
      "epoch": 0.17303703703703704,
      "grad_norm": 26.5,
      "learning_rate": 1.9710946170250702e-05,
      "loss": 1.7891,
      "step": 73
    },
    {
      "epoch": 0.17303703703703704,
      "step": 73,
      "train/combined_loss": 0.11201242171227932,
      "train/cross_entropy_loss": 0.020838803611695766,
      "train/kl_divergence_loss": 0.20318603515625,
      "train/step_duration_seconds": 7.184260606765747,
      "train/steps_per_hour": 324.9846730527734,
      "train/total_elapsed_hours": 0.22462597794002956
    },
    {
      "epoch": 0.1754074074074074,
      "grad_norm": 28.75,
      "learning_rate": 1.969077286229078e-05,
      "loss": 1.7922,
      "step": 74
    },
    {
      "epoch": 0.1754074074074074,
      "step": 74,
      "train/combined_loss": 0.11633206205442548,
      "train/cross_entropy_loss": 0.022031799773685634,
      "train/kl_divergence_loss": 0.21063232421875,
      "train/step_duration_seconds": 7.156804323196411,
      "train/steps_per_hour": 326.54649232377625,
      "train/total_elapsed_hours": 0.22661397914091747
    },
    {
      "epoch": 0.17777777777777778,
      "grad_norm": 6.15625,
      "learning_rate": 1.9669930175366474e-05,
      "loss": 1.8613,
      "step": 75
    },
    {
      "epoch": 0.17777777777777778,
      "step": 75,
      "train/combined_loss": 0.10736048221588135,
      "train/cross_entropy_loss": 0.019408458843827248,
      "train/kl_divergence_loss": 0.1953125,
      "train/step_duration_seconds": 7.157041549682617,
      "train/steps_per_hour": 328.08105279118894,
      "train/total_elapsed_hours": 0.22860204623805153
    },
    {
      "epoch": 0.18014814814814814,
      "grad_norm": 19.875,
      "learning_rate": 1.964841954916235e-05,
      "loss": 1.7178,
      "step": 76
    },
    {
      "epoch": 0.18014814814814814,
      "step": 76,
      "train/combined_loss": 0.11280431784689426,
      "train/cross_entropy_loss": 0.021934314048849046,
      "train/kl_divergence_loss": 0.20367431640625,
      "train/step_duration_seconds": 7.154073476791382,
      "train/steps_per_hour": 329.5903308097024,
      "train/total_elapsed_hours": 0.23058928887049357
    },
    {
      "epoch": 0.18251851851851852,
      "grad_norm": 8.625,
      "learning_rate": 1.962624246950012e-05,
      "loss": 1.8049,
      "step": 77
    },
    {
      "epoch": 0.18251851851851852,
      "step": 77,
      "train/combined_loss": 0.10877907322719693,
      "train/cross_entropy_loss": 0.021696327603422105,
      "train/kl_divergence_loss": 0.19586181640625,
      "train/step_duration_seconds": 7.163645029067993,
      "train/steps_per_hour": 331.0700321531476,
      "train/total_elapsed_hours": 0.2325791902674569
    },
    {
      "epoch": 0.18488888888888888,
      "grad_norm": 6.375,
      "learning_rate": 1.9603400468236e-05,
      "loss": 1.7405,
      "step": 78
    },
    {
      "epoch": 0.18488888888888888,
      "step": 78,
      "train/combined_loss": 0.11846707155928016,
      "train/cross_entropy_loss": 0.022090390557423234,
      "train/kl_divergence_loss": 0.21484375,
      "train/step_duration_seconds": 7.1567511558532715,
      "train/steps_per_hour": 332.52734290219325,
      "train/total_elapsed_hours": 0.23456717669963836
    },
    {
      "epoch": 0.18725925925925926,
      "grad_norm": 23.5,
      "learning_rate": 1.957989512315489e-05,
      "loss": 1.8955,
      "step": 79
    },
    {
      "epoch": 0.18725925925925926,
      "step": 79,
      "train/combined_loss": 0.10807515401393175,
      "train/cross_entropy_loss": 0.020105381147004664,
      "train/kl_divergence_loss": 0.196044921875,
      "train/step_duration_seconds": 7.179587125778198,
      "train/steps_per_hour": 333.9512044172201,
      "train/total_elapsed_hours": 0.23656150645679896
    },
    {
      "epoch": 0.18962962962962962,
      "grad_norm": 12.5625,
      "learning_rate": 1.955572805786141e-05,
      "loss": 1.7292,
      "step": 80
    },
    {
      "epoch": 0.18962962962962962,
      "step": 80,
      "train/combined_loss": 0.10483178775757551,
      "train/cross_entropy_loss": 0.022346684243530035,
      "train/kl_divergence_loss": 0.18731689453125,
      "train/step_duration_seconds": 7.17809271812439,
      "train/steps_per_hour": 335.3518424811757,
      "train/total_elapsed_hours": 0.23855542110072242
    },
    {
      "epoch": 0.192,
      "grad_norm": 43.25,
      "learning_rate": 1.9530900941667733e-05,
      "loss": 1.6773,
      "step": 81
    },
    {
      "epoch": 0.192,
      "step": 81,
      "train/combined_loss": 0.1046543437987566,
      "train/cross_entropy_loss": 0.02217490249313414,
      "train/kl_divergence_loss": 0.1871337890625,
      "train/step_duration_seconds": 7.168772459030151,
      "train/steps_per_hour": 336.7328849090869,
      "train/total_elapsed_hours": 0.24054674678378635
    },
    {
      "epoch": 0.19437037037037036,
      "grad_norm": 36.75,
      "learning_rate": 1.9505415489478293e-05,
      "loss": 1.6745,
      "step": 82
    },
    {
      "epoch": 0.19437037037037036,
      "step": 82,
      "train/combined_loss": 0.10497199138626456,
      "train/cross_entropy_loss": 0.019270156044512987,
      "train/kl_divergence_loss": 0.190673828125,
      "train/step_duration_seconds": 7.172302484512329,
      "train/steps_per_hour": 338.08988274573534,
      "train/total_elapsed_hours": 0.24253905302948423
    },
    {
      "epoch": 0.19674074074074074,
      "grad_norm": 19.375,
      "learning_rate": 1.947927346167132e-05,
      "loss": 1.6796,
      "step": 83
    },
    {
      "epoch": 0.19674074074074074,
      "step": 83,
      "train/combined_loss": 0.1053922800347209,
      "train/cross_entropy_loss": 0.01895106490701437,
      "train/kl_divergence_loss": 0.19183349609375,
      "train/step_duration_seconds": 7.172720909118652,
      "train/steps_per_hour": 339.42460711386434,
      "train/total_elapsed_hours": 0.2445314755042394
    },
    {
      "epoch": 0.1991111111111111,
      "grad_norm": 26.75,
      "learning_rate": 1.945247666397725e-05,
      "loss": 1.6863,
      "step": 84
    },
    {
      "epoch": 0.1991111111111111,
      "step": 84,
      "train/combined_loss": 0.10413095075637102,
      "train/cross_entropy_loss": 0.019968447857536376,
      "train/kl_divergence_loss": 0.18829345703125,
      "train/step_duration_seconds": 7.1637890338897705,
      "train/steps_per_hour": 340.74118612261555,
      "train/total_elapsed_hours": 0.24652141690254212
    },
    {
      "epoch": 0.20148148148148148,
      "grad_norm": 10.1875,
      "learning_rate": 1.9425026947353994e-05,
      "loss": 1.6661,
      "step": 85
    },
    {
      "epoch": 0.20148148148148148,
      "step": 85,
      "train/combined_loss": 0.1039673495106399,
      "train/cross_entropy_loss": 0.023974738782271743,
      "train/kl_divergence_loss": 0.1839599609375,
      "train/step_duration_seconds": 7.150022268295288,
      "train/steps_per_hour": 342.04194361300205,
      "train/total_elapsed_hours": 0.2485075341992908
    },
    {
      "epoch": 0.20385185185185184,
      "grad_norm": 52.25,
      "learning_rate": 1.9396926207859085e-05,
      "loss": 1.6635,
      "step": 86
    },
    {
      "epoch": 0.20385185185185184,
      "step": 86,
      "train/combined_loss": 0.10607085470110178,
      "train/cross_entropy_loss": 0.02512999135069549,
      "train/kl_divergence_loss": 0.18701171875,
      "train/step_duration_seconds": 7.172277927398682,
      "train/steps_per_hour": 343.31360127461863,
      "train/total_elapsed_hours": 0.2504998336235682
    },
    {
      "epoch": 0.20622222222222222,
      "grad_norm": 56.75,
      "learning_rate": 1.936817638651871e-05,
      "loss": 1.6971,
      "step": 87
    },
    {
      "epoch": 0.20622222222222222,
      "step": 87,
      "train/combined_loss": 0.10299085499718785,
      "train/cross_entropy_loss": 0.023303485824726522,
      "train/kl_divergence_loss": 0.18267822265625,
      "train/step_duration_seconds": 7.151454925537109,
      "train/steps_per_hour": 344.57308438928993,
      "train/total_elapsed_hours": 0.25248634888066185
    },
    {
      "epoch": 0.20859259259259258,
      "grad_norm": 27.0,
      "learning_rate": 1.9338779469193638e-05,
      "loss": 1.6479,
      "step": 88
    },
    {
      "epoch": 0.20859259259259258,
      "step": 88,
      "train/combined_loss": 0.10254441620782018,
      "train/cross_entropy_loss": 0.019541956367902458,
      "train/kl_divergence_loss": 0.185546875,
      "train/step_duration_seconds": 7.150182485580444,
      "train/steps_per_hour": 345.81338378782823,
      "train/total_elapsed_hours": 0.254472510682212
    },
    {
      "epoch": 0.21096296296296296,
      "grad_norm": 57.25,
      "learning_rate": 1.9308737486442045e-05,
      "loss": 1.6407,
      "step": 89
    },
    {
      "epoch": 0.21096296296296296,
      "step": 89,
      "train/combined_loss": 0.1078284471295774,
      "train/cross_entropy_loss": 0.01973404036834836,
      "train/kl_divergence_loss": 0.1959228515625,
      "train/step_duration_seconds": 7.157840967178345,
      "train/steps_per_hour": 347.0315933491895,
      "train/total_elapsed_hours": 0.2564607998397615
    },
    {
      "epoch": 0.21333333333333335,
      "grad_norm": 59.25,
      "learning_rate": 1.9278052513379256e-05,
      "loss": 1.7253,
      "step": 90
    },
    {
      "epoch": 0.21333333333333335,
      "step": 90,
      "train/combined_loss": 0.1028821705840528,
      "train/cross_entropy_loss": 0.01917986525222659,
      "train/kl_divergence_loss": 0.18658447265625,
      "train/step_duration_seconds": 7.162302017211914,
      "train/steps_per_hour": 348.2293895098418,
      "train/total_elapsed_hours": 0.25845032817787594
    },
    {
      "epoch": 0.2157037037037037,
      "grad_norm": 36.5,
      "learning_rate": 1.9246726669534416e-05,
      "loss": 1.6461,
      "step": 91
    },
    {
      "epoch": 0.2157037037037037,
      "step": 91,
      "train/combined_loss": 0.10168306482955813,
      "train/cross_entropy_loss": 0.02093204390257597,
      "train/kl_divergence_loss": 0.18243408203125,
      "train/step_duration_seconds": 7.153521299362183,
      "train/steps_per_hour": 349.4121578181558,
      "train/total_elapsed_hours": 0.26043741742769877
    },
    {
      "epoch": 0.2180740740740741,
      "grad_norm": 27.5,
      "learning_rate": 1.921476211870408e-05,
      "loss": 1.6269,
      "step": 92
    },
    {
      "epoch": 0.2180740740740741,
      "step": 92,
      "train/combined_loss": 0.09777736244723201,
      "train/cross_entropy_loss": 0.0219097004737705,
      "train/kl_divergence_loss": 0.17364501953125,
      "train/step_duration_seconds": 7.162153959274292,
      "train/steps_per_hour": 350.5738107405354,
      "train/total_elapsed_hours": 0.2624269046386083
    },
    {
      "epoch": 0.22044444444444444,
      "grad_norm": 34.5,
      "learning_rate": 1.9182161068802742e-05,
      "loss": 1.5644,
      "step": 93
    },
    {
      "epoch": 0.22044444444444444,
      "step": 93,
      "train/combined_loss": 0.10163608659058809,
      "train/cross_entropy_loss": 0.02199775306507945,
      "train/kl_divergence_loss": 0.1812744140625,
      "train/step_duration_seconds": 7.154653549194336,
      "train/steps_per_hour": 351.72075430393164,
      "train/total_elapsed_hours": 0.26441430840227337
    },
    {
      "epoch": 0.22281481481481483,
      "grad_norm": 20.5,
      "learning_rate": 1.9148925771710347e-05,
      "loss": 1.6262,
      "step": 94
    },
    {
      "epoch": 0.22281481481481483,
      "step": 94,
      "train/combined_loss": 0.09920958010479808,
      "train/cross_entropy_loss": 0.019586148322559893,
      "train/kl_divergence_loss": 0.1788330078125,
      "train/step_duration_seconds": 7.162054061889648,
      "train/steps_per_hour": 352.84786230208005,
      "train/total_elapsed_hours": 0.2664037678639094
    },
    {
      "epoch": 0.22518518518518518,
      "grad_norm": 31.625,
      "learning_rate": 1.9115058523116734e-05,
      "loss": 1.5874,
      "step": 95
    },
    {
      "epoch": 0.22518518518518518,
      "step": 95,
      "train/combined_loss": 0.09802744071930647,
      "train/cross_entropy_loss": 0.01880878652445972,
      "train/kl_divergence_loss": 0.17724609375,
      "train/step_duration_seconds": 7.153456926345825,
      "train/steps_per_hour": 353.9614104256406,
      "train/total_elapsed_hours": 0.2683908392323388
    },
    {
      "epoch": 0.22755555555555557,
      "grad_norm": 38.75,
      "learning_rate": 1.908056166236305e-05,
      "loss": 1.5684,
      "step": 96
    },
    {
      "epoch": 0.22755555555555557,
      "step": 96,
      "train/combined_loss": 0.09743851656094193,
      "train/cross_entropy_loss": 0.018851646571420133,
      "train/kl_divergence_loss": 0.176025390625,
      "train/step_duration_seconds": 7.1597981452941895,
      "train/steps_per_hour": 355.0562779811278,
      "train/total_elapsed_hours": 0.2703796720504761
    },
    {
      "epoch": 0.22992592592592592,
      "grad_norm": 26.125,
      "learning_rate": 1.9045437572280193e-05,
      "loss": 1.559,
      "step": 97
    },
    {
      "epoch": 0.22992592592592592,
      "step": 97,
      "train/combined_loss": 0.0965579142794013,
      "train/cross_entropy_loss": 0.02038634184282273,
      "train/kl_divergence_loss": 0.1727294921875,
      "train/step_duration_seconds": 7.157320976257324,
      "train/steps_per_hour": 356.13605583694607,
      "train/total_elapsed_hours": 0.2723678167661031
    },
    {
      "epoch": 0.2322962962962963,
      "grad_norm": 18.125,
      "learning_rate": 1.900968867902419e-05,
      "loss": 1.5449,
      "step": 98
    },
    {
      "epoch": 0.2322962962962963,
      "step": 98,
      "train/combined_loss": 0.09678681008517742,
      "train/cross_entropy_loss": 0.020966205513104796,
      "train/kl_divergence_loss": 0.172607421875,
      "train/step_duration_seconds": 7.1417396068573,
      "train/steps_per_hour": 357.2058194544125,
      "train/total_elapsed_hours": 0.2743516333235635
    },
    {
      "epoch": 0.23466666666666666,
      "grad_norm": 27.5,
      "learning_rate": 1.8973317451908642e-05,
      "loss": 1.5486,
      "step": 99
    },
    {
      "epoch": 0.23466666666666666,
      "step": 99,
      "train/combined_loss": 0.10045615630224347,
      "train/cross_entropy_loss": 0.02183516975492239,
      "train/kl_divergence_loss": 0.1790771484375,
      "train/step_duration_seconds": 7.167134761810303,
      "train/steps_per_hour": 358.25107804442126,
      "train/total_elapsed_hours": 0.276342504090733
    },
    {
      "epoch": 0.23703703703703705,
      "grad_norm": 17.75,
      "learning_rate": 1.8936326403234125e-05,
      "loss": 1.6073,
      "step": 100
    },
    {
      "epoch": 0.23703703703703705,
      "eval_combined_loss": 0.09612167934452494,
      "eval_cross_entropy_loss": 0.02090351493904988,
      "eval_kl_divergence_loss": 0.17133984375,
      "eval_loss": 0.09612167626619339,
      "eval_runtime": 219.9162,
      "eval_samples_per_second": 6.821,
      "eval_steps_per_second": 3.41,
      "step": 100
    },
    {
      "epoch": 0.23703703703703705,
      "step": 100,
      "train/combined_loss": 0.09815677208825946,
      "train/cross_entropy_loss": 0.020898508373647928,
      "train/kl_divergence_loss": 0.1754150390625,
      "train/step_duration_seconds": 227.11371684074402,
      "train/steps_per_hour": 294.61186048448315,
      "train/total_elapsed_hours": 0.33942964765760636
    },
    {
      "epoch": 0.2394074074074074,
      "grad_norm": 21.125,
      "learning_rate": 1.8898718088114688e-05,
      "loss": 1.5705,
      "step": 101
    },
    {
      "epoch": 0.2394074074074074,
      "step": 101,
      "train/combined_loss": 0.09819618007168174,
      "train/cross_entropy_loss": 0.020428007235750556,
      "train/kl_divergence_loss": 0.17596435546875,
      "train/step_duration_seconds": 7.171290159225464,
      "train/steps_per_hour": 295.82187680336136,
      "train/total_elapsed_hours": 0.34142167270183565
    },
    {
      "epoch": 0.24177777777777779,
      "grad_norm": 25.25,
      "learning_rate": 1.8860495104301346e-05,
      "loss": 1.5711,
      "step": 102
    },
    {
      "epoch": 0.24177777777777779,
      "step": 102,
      "train/combined_loss": 0.09767304686829448,
      "train/cross_entropy_loss": 0.020236226613633335,
      "train/kl_divergence_loss": 0.17510986328125,
      "train/step_duration_seconds": 7.163306951522827,
      "train/steps_per_hour": 297.0197733169854,
      "train/total_elapsed_hours": 0.34341148018836976
    },
    {
      "epoch": 0.24414814814814814,
      "grad_norm": 12.8125,
      "learning_rate": 1.8821660092002642e-05,
      "loss": 1.5628,
      "step": 103
    },
    {
      "epoch": 0.24414814814814814,
      "step": 103,
      "train/combined_loss": 0.09357908833771944,
      "train/cross_entropy_loss": 0.021325660520233214,
      "train/kl_divergence_loss": 0.16583251953125,
      "train/step_duration_seconds": 7.165699005126953,
      "train/steps_per_hour": 298.2032943460889,
      "train/total_elapsed_hours": 0.34540195213423835
    },
    {
      "epoch": 0.24651851851851853,
      "grad_norm": 33.25,
      "learning_rate": 1.8782215733702286e-05,
      "loss": 1.4973,
      "step": 104
    },
    {
      "epoch": 0.24651851851851853,
      "step": 104,
      "train/combined_loss": 0.09911046642810106,
      "train/cross_entropy_loss": 0.022012439789250493,
      "train/kl_divergence_loss": 0.17620849609375,
      "train/step_duration_seconds": 7.175975322723389,
      "train/steps_per_hour": 299.3707928769077,
      "train/total_elapsed_hours": 0.34739527861277264
    },
    {
      "epoch": 0.24888888888888888,
      "grad_norm": 38.0,
      "learning_rate": 1.874216475397386e-05,
      "loss": 1.5858,
      "step": 105
    },
    {
      "epoch": 0.24888888888888888,
      "step": 105,
      "train/combined_loss": 0.09527313988655806,
      "train/cross_entropy_loss": 0.02172303292900324,
      "train/kl_divergence_loss": 0.1688232421875,
      "train/step_duration_seconds": 7.173555850982666,
      "train/steps_per_hour": 300.5255479022072,
      "train/total_elapsed_hours": 0.3493879330158234
    },
    {
      "epoch": 0.25125925925925924,
      "grad_norm": 30.0,
      "learning_rate": 1.870150991929261e-05,
      "loss": 1.5244,
      "step": 106
    },
    {
      "epoch": 0.25125925925925924,
      "step": 106,
      "train/combined_loss": 0.09378110896795988,
      "train/cross_entropy_loss": 0.02069209818728268,
      "train/kl_divergence_loss": 0.1668701171875,
      "train/step_duration_seconds": 7.196229696273804,
      "train/steps_per_hour": 301.66179874940855,
      "train/total_elapsed_hours": 0.35138688570923277
    },
    {
      "epoch": 0.25362962962962965,
      "grad_norm": 9.4375,
      "learning_rate": 1.866025403784439e-05,
      "loss": 1.5005,
      "step": 107
    },
    {
      "epoch": 0.25362962962962965,
      "step": 107,
      "train/combined_loss": 0.09853040147572756,
      "train/cross_entropy_loss": 0.019936785800382495,
      "train/kl_divergence_loss": 0.1771240234375,
      "train/step_duration_seconds": 7.178696632385254,
      "train/steps_per_hour": 302.78936801023747,
      "train/total_elapsed_hours": 0.35338096810711755
    },
    {
      "epoch": 0.256,
      "grad_norm": 17.625,
      "learning_rate": 1.8618399959331642e-05,
      "loss": 1.5765,
      "step": 108
    },
    {
      "epoch": 0.256,
      "step": 108,
      "train/combined_loss": 0.09358909726142883,
      "train/cross_entropy_loss": 0.020552222267724574,
      "train/kl_divergence_loss": 0.1666259765625,
      "train/step_duration_seconds": 7.157944679260254,
      "train/steps_per_hour": 303.90921284525064,
      "train/total_elapsed_hours": 0.3553692860735787
    },
{ |
|
"epoch": 0.25837037037037036, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 1.8575950574776595e-05, |
|
"loss": 1.4974, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.25837037037037036, |
|
"step": 109, |
|
"train/combined_loss": 0.09076927369460464, |
|
"train/cross_entropy_loss": 0.0206498735351488, |
|
"train/kl_divergence_loss": 0.160888671875, |
|
"train/step_duration_seconds": 7.1567769050598145, |
|
"train/steps_per_hour": 305.0168730415088, |
|
"train/total_elapsed_hours": 0.35735727965831754 |
|
}, |
|
{ |
|
"epoch": 0.2607407407407407, |
|
"grad_norm": 22.375, |
|
"learning_rate": 1.8532908816321557e-05, |
|
"loss": 1.4523, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2607407407407407, |
|
"step": 110, |
|
"train/combined_loss": 0.09263847023248672, |
|
"train/cross_entropy_loss": 0.021519619156606495, |
|
"train/kl_divergence_loss": 0.16375732421875, |
|
"train/step_duration_seconds": 7.173721790313721, |
|
"train/steps_per_hour": 306.1082679129725, |
|
"train/total_elapsed_hours": 0.3593499801556269 |
|
}, |
|
{ |
|
"epoch": 0.26311111111111113, |
|
"grad_norm": 23.875, |
|
"learning_rate": 1.8489277657026377e-05, |
|
"loss": 1.4822, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.26311111111111113, |
|
"step": 111, |
|
"train/combined_loss": 0.0902865119278431, |
|
"train/cross_entropy_loss": 0.020905061159282923, |
|
"train/kl_divergence_loss": 0.15966796875, |
|
"train/step_duration_seconds": 7.147202253341675, |
|
"train/steps_per_hour": 307.1938879594452, |
|
"train/total_elapsed_hours": 0.3613353141148885 |
|
}, |
|
{ |
|
"epoch": 0.2654814814814815, |
|
"grad_norm": 11.125, |
|
"learning_rate": 1.844506011066308e-05, |
|
"loss": 1.4446, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.2654814814814815, |
|
"step": 112, |
|
"train/combined_loss": 0.09026105608791113, |
|
"train/cross_entropy_loss": 0.019145151949487627, |
|
"train/kl_divergence_loss": 0.161376953125, |
|
"train/step_duration_seconds": 7.135612726211548, |
|
"train/steps_per_hour": 308.27037497741026, |
|
"train/total_elapsed_hours": 0.3633174287610584 |
|
}, |
|
{ |
|
"epoch": 0.26785185185185184, |
|
"grad_norm": 35.25, |
|
"learning_rate": 1.8400259231507716e-05, |
|
"loss": 1.4442, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.26785185185185184, |
|
"step": 113, |
|
"train/combined_loss": 0.09185996558517218, |
|
"train/cross_entropy_loss": 0.019108111271634698, |
|
"train/kl_divergence_loss": 0.16461181640625, |
|
"train/step_duration_seconds": 7.138214826583862, |
|
"train/steps_per_hour": 309.3345678925583, |
|
"train/total_elapsed_hours": 0.36530026621288725 |
|
}, |
|
{ |
|
"epoch": 0.2702222222222222, |
|
"grad_norm": 42.0, |
|
"learning_rate": 1.8354878114129368e-05, |
|
"loss": 1.4698, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.2702222222222222, |
|
"step": 114, |
|
"train/combined_loss": 0.08998283930122852, |
|
"train/cross_entropy_loss": 0.018710799398832023, |
|
"train/kl_divergence_loss": 0.1612548828125, |
|
"train/step_duration_seconds": 7.160963296890259, |
|
"train/steps_per_hour": 310.38193032311864, |
|
"train/total_elapsed_hours": 0.36728942268424564 |
|
}, |
|
{ |
|
"epoch": 0.2725925925925926, |
|
"grad_norm": 32.75, |
|
"learning_rate": 1.8308919893176397e-05, |
|
"loss": 1.4397, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.2725925925925926, |
|
"step": 115, |
|
"train/combined_loss": 0.08985556894913316, |
|
"train/cross_entropy_loss": 0.02016523655038327, |
|
"train/kl_divergence_loss": 0.1595458984375, |
|
"train/step_duration_seconds": 7.191335916519165, |
|
"train/steps_per_hour": 311.4108945594422, |
|
"train/total_elapsed_hours": 0.36928701599438984 |
|
}, |
|
{ |
|
"epoch": 0.27496296296296296, |
|
"grad_norm": 9.5, |
|
"learning_rate": 1.826238774315995e-05, |
|
"loss": 1.4377, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.27496296296296296, |
|
"step": 116, |
|
"train/combined_loss": 0.090081796515733, |
|
"train/cross_entropy_loss": 0.0222046107519418, |
|
"train/kl_divergence_loss": 0.157958984375, |
|
"train/step_duration_seconds": 7.165991544723511, |
|
"train/steps_per_hour": 312.43471090794657, |
|
"train/total_elapsed_hours": 0.3712775692012575 |
|
}, |
|
{ |
|
"epoch": 0.2773333333333333, |
|
"grad_norm": 17.25, |
|
"learning_rate": 1.8215284878234644e-05, |
|
"loss": 1.4413, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.2773333333333333, |
|
"step": 117, |
|
"train/combined_loss": 0.09464696934446692, |
|
"train/cross_entropy_loss": 0.023217282141558826, |
|
"train/kl_divergence_loss": 0.16607666015625, |
|
"train/step_duration_seconds": 7.209496736526489, |
|
"train/steps_per_hour": 313.43745997881086, |
|
"train/total_elapsed_hours": 0.373280207183626 |
|
}, |
|
{ |
|
"epoch": 0.2797037037037037, |
|
"grad_norm": 10.8125, |
|
"learning_rate": 1.816761455197657e-05, |
|
"loss": 1.5144, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.2797037037037037, |
|
"step": 118, |
|
"train/combined_loss": 0.08674649894237518, |
|
"train/cross_entropy_loss": 0.018219567835330963, |
|
"train/kl_divergence_loss": 0.1552734375, |
|
"train/step_duration_seconds": 7.183828592300415, |
|
"train/steps_per_hour": 314.43548101801, |
|
"train/total_elapsed_hours": 0.3752757151259316 |
|
}, |
|
{ |
|
"epoch": 0.2820740740740741, |
|
"grad_norm": 22.625, |
|
"learning_rate": 1.811938005715857e-05, |
|
"loss": 1.3879, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.2820740740740741, |
|
"step": 119, |
|
"train/combined_loss": 0.08797085983678699, |
|
"train/cross_entropy_loss": 0.017738587921485305, |
|
"train/kl_divergence_loss": 0.158203125, |
|
"train/step_duration_seconds": 7.165632963180542, |
|
"train/steps_per_hour": 315.4271701636993, |
|
"train/total_elapsed_hours": 0.3772661687268151 |
|
}, |
|
{ |
|
"epoch": 0.28444444444444444, |
|
"grad_norm": 24.0, |
|
"learning_rate": 1.8070584725522763e-05, |
|
"loss": 1.4075, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.28444444444444444, |
|
"step": 120, |
|
"train/combined_loss": 0.09114356013014913, |
|
"train/cross_entropy_loss": 0.018773937365040183, |
|
"train/kl_divergence_loss": 0.16351318359375, |
|
"train/step_duration_seconds": 7.167950868606567, |
|
"train/steps_per_hour": 316.4079127749189, |
|
"train/total_elapsed_hours": 0.37925726619031697 |
|
}, |
|
{ |
|
"epoch": 0.2868148148148148, |
|
"grad_norm": 12.625, |
|
"learning_rate": 1.802123192755044e-05, |
|
"loss": 1.4583, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.2868148148148148, |
|
"step": 121, |
|
"train/combined_loss": 0.09176525427028537, |
|
"train/cross_entropy_loss": 0.024289784720167518, |
|
"train/kl_divergence_loss": 0.15924072265625, |
|
"train/step_duration_seconds": 7.175249338150024, |
|
"train/steps_per_hour": 317.3767236816989, |
|
"train/total_elapsed_hours": 0.38125039100646974 |
|
}, |
|
{ |
|
"epoch": 0.2891851851851852, |
|
"grad_norm": 31.75, |
|
"learning_rate": 1.7971325072229227e-05, |
|
"loss": 1.4682, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.2891851851851852, |
|
"step": 122, |
|
"train/combined_loss": 0.0890495995990932, |
|
"train/cross_entropy_loss": 0.025206139660440385, |
|
"train/kl_divergence_loss": 0.15289306640625, |
|
"train/step_duration_seconds": 7.177129745483398, |
|
"train/steps_per_hour": 318.3350237785182, |
|
"train/total_elapsed_hours": 0.3832440381579929 |
|
}, |
|
{ |
|
"epoch": 0.29155555555555557, |
|
"grad_norm": 38.75, |
|
"learning_rate": 1.7920867606817625e-05, |
|
"loss": 1.4248, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.29155555555555557, |
|
"step": 123, |
|
"train/combined_loss": 0.0867073736153543, |
|
"train/cross_entropy_loss": 0.023878611740656197, |
|
"train/kl_divergence_loss": 0.1495361328125, |
|
"train/step_duration_seconds": 7.162760257720947, |
|
"train/steps_per_hour": 319.2867134529585, |
|
"train/total_elapsed_hours": 0.3852336937851376 |
|
}, |
|
{ |
|
"epoch": 0.2939259259259259, |
|
"grad_norm": 33.0, |
|
"learning_rate": 1.7869863016606893e-05, |
|
"loss": 1.3873, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.2939259259259259, |
|
"step": 124, |
|
"train/combined_loss": 0.08373823016881943, |
|
"train/cross_entropy_loss": 0.020015520974993706, |
|
"train/kl_divergence_loss": 0.1474609375, |
|
"train/step_duration_seconds": 7.192008972167969, |
|
"train/steps_per_hour": 320.221904230669, |
|
"train/total_elapsed_hours": 0.38723147405518427 |
|
}, |
|
{ |
|
"epoch": 0.2962962962962963, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 1.78183148246803e-05, |
|
"loss": 1.3398, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.2962962962962963, |
|
"step": 125, |
|
"train/combined_loss": 0.08616493362933397, |
|
"train/cross_entropy_loss": 0.016812282847240567, |
|
"train/kl_divergence_loss": 0.155517578125, |
|
"train/step_duration_seconds": 7.1917195320129395, |
|
"train/steps_per_hour": 321.14756131827295, |
|
"train/total_elapsed_hours": 0.38922917392518785 |
|
}, |
|
{ |
|
"epoch": 0.2986666666666667, |
|
"grad_norm": 39.0, |
|
"learning_rate": 1.7766226591669787e-05, |
|
"loss": 1.3786, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.2986666666666667, |
|
"step": 126, |
|
"train/combined_loss": 0.09160786820575595, |
|
"train/cross_entropy_loss": 0.016711829113774, |
|
"train/kl_divergence_loss": 0.16650390625, |
|
"train/step_duration_seconds": 7.179956436157227, |
|
"train/steps_per_hour": 322.0664550428592, |
|
"train/total_elapsed_hours": 0.39122360626856484 |
|
}, |
|
{ |
|
"epoch": 0.30103703703703705, |
|
"grad_norm": 44.0, |
|
"learning_rate": 1.771360191551e-05, |
|
"loss": 1.4657, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.30103703703703705, |
|
"step": 127, |
|
"train/combined_loss": 0.08970451634377241, |
|
"train/cross_entropy_loss": 0.017787940218113363, |
|
"train/kl_divergence_loss": 0.16162109375, |
|
"train/step_duration_seconds": 7.176481485366821, |
|
"train/steps_per_hour": 322.9768202044927, |
|
"train/total_elapsed_hours": 0.3932170733478334 |
|
}, |
|
{ |
|
"epoch": 0.3034074074074074, |
|
"grad_norm": 36.0, |
|
"learning_rate": 1.766044443118978e-05, |
|
"loss": 1.4353, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.3034074074074074, |
|
"step": 128, |
|
"train/combined_loss": 0.08426619321107864, |
|
"train/cross_entropy_loss": 0.019362466409802437, |
|
"train/kl_divergence_loss": 0.149169921875, |
|
"train/step_duration_seconds": 7.183627605438232, |
|
"train/steps_per_hour": 323.876374745131, |
|
"train/total_elapsed_hours": 0.39521252546045516 |
|
}, |
|
{ |
|
"epoch": 0.30577777777777776, |
|
"grad_norm": 18.25, |
|
"learning_rate": 1.760675781050109e-05, |
|
"loss": 1.3483, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.30577777777777776, |
|
"step": 129, |
|
"train/combined_loss": 0.09070024406537414, |
|
"train/cross_entropy_loss": 0.02429600094910711, |
|
"train/kl_divergence_loss": 0.1571044921875, |
|
"train/step_duration_seconds": 7.154268741607666, |
|
"train/steps_per_hour": 324.77355916793476, |
|
"train/total_elapsed_hours": 0.39719982233312395 |
|
}, |
|
{ |
|
"epoch": 0.30814814814814817, |
|
"grad_norm": 35.0, |
|
"learning_rate": 1.755254576178535e-05, |
|
"loss": 1.4512, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.30814814814814817, |
|
"step": 130, |
|
"train/combined_loss": 0.09242757642641664, |
|
"train/cross_entropy_loss": 0.025797537760809064, |
|
"train/kl_divergence_loss": 0.1590576171875, |
|
"train/step_duration_seconds": 7.181564092636108, |
|
"train/steps_per_hour": 325.6556251715386, |
|
"train/total_elapsed_hours": 0.3991947012477451 |
|
}, |
|
{ |
|
"epoch": 0.3105185185185185, |
|
"grad_norm": 40.5, |
|
"learning_rate": 1.7497812029677344e-05, |
|
"loss": 1.4788, |
|
"step": 131 |
|
}, |
|
    {
      "epoch": 0.3105185185185185,
      "step": 131,
      "train/combined_loss": 0.08590791560709476,
      "train/cross_entropy_loss": 0.024782140040770173,
      "train/kl_divergence_loss": 0.14703369140625,
      "train/step_duration_seconds": 7.177969694137573,
      "train/steps_per_hour": 326.52973182564483,
      "train/total_elapsed_hours": 0.4011885817183389
    },
    {
      "epoch": 0.3128888888888889,
      "grad_norm": 36.5,
      "learning_rate": 1.7442560394846518e-05,
      "loss": 1.3745,
      "step": 132
    },
    {
      "epoch": 0.3128888888888889,
      "step": 132,
      "train/combined_loss": 0.08688511373475194,
      "train/cross_entropy_loss": 0.022158897132612765,
      "train/kl_divergence_loss": 0.151611328125,
      "train/step_duration_seconds": 7.1588640213012695,
      "train/steps_per_hour": 327.39950253665864,
      "train/total_elapsed_hours": 0.4031771550575892
    },
    {
      "epoch": 0.31525925925925924,
      "grad_norm": 16.0,
      "learning_rate": 1.738679467373586e-05,
      "loss": 1.3902,
      "step": 133
    },
    {
      "epoch": 0.31525925925925924,
      "step": 133,
      "train/combined_loss": 0.08377803396433592,
      "train/cross_entropy_loss": 0.017897860845550895,
      "train/kl_divergence_loss": 0.149658203125,
      "train/step_duration_seconds": 7.175210237503052,
      "train/steps_per_hour": 328.257056778055,
      "train/total_elapsed_hours": 0.4051702690124512
    },
    {
      "epoch": 0.31762962962962965,
      "grad_norm": 34.5,
      "learning_rate": 1.7330518718298263e-05,
      "loss": 1.3404,
      "step": 134
    },
    {
      "epoch": 0.31762962962962965,
      "step": 134,
      "train/combined_loss": 0.08916169637814164,
      "train/cross_entropy_loss": 0.017007473739795387,
      "train/kl_divergence_loss": 0.16131591796875,
      "train/step_duration_seconds": 7.173584222793579,
      "train/steps_per_hour": 329.106580437697,
      "train/total_elapsed_hours": 0.4071629312965605
    },
    {
      "epoch": 0.32,
      "grad_norm": 42.75,
      "learning_rate": 1.7273736415730488e-05,
      "loss": 1.4266,
      "step": 135
    },
    {
      "epoch": 0.32,
      "step": 135,
      "train/combined_loss": 0.08420996041968465,
      "train/cross_entropy_loss": 0.016442388528957963,
      "train/kl_divergence_loss": 0.1519775390625,
      "train/step_duration_seconds": 7.169410705566406,
      "train/steps_per_hour": 329.948764311339,
      "train/total_elapsed_hours": 0.40915443427032894
    },
    {
      "epoch": 0.32237037037037036,
      "grad_norm": 37.25,
      "learning_rate": 1.7216451688204623e-05,
      "loss": 1.3474,
      "step": 136
    },
    {
      "epoch": 0.32237037037037036,
      "step": 136,
      "train/combined_loss": 0.08458211086690426,
      "train/cross_entropy_loss": 0.017736003384925425,
      "train/kl_divergence_loss": 0.15142822265625,
      "train/step_duration_seconds": 7.171813011169434,
      "train/steps_per_hour": 330.78225259442917,
      "train/total_elapsed_hours": 0.4111466045512093
    },
    {
      "epoch": 0.3247407407407407,
      "grad_norm": 17.75,
      "learning_rate": 1.7158668492597186e-05,
      "loss": 1.3533,
      "step": 137
    },
    {
      "epoch": 0.3247407407407407,
      "step": 137,
      "train/combined_loss": 0.08640648704022169,
      "train/cross_entropy_loss": 0.022422353271394968,
      "train/kl_divergence_loss": 0.150390625,
      "train/step_duration_seconds": 7.173238754272461,
      "train/steps_per_hour": 331.60738477276607,
      "train/total_elapsed_hours": 0.4131391708718406
    },
    {
      "epoch": 0.32711111111111113,
      "grad_norm": 24.25,
      "learning_rate": 1.7100390820215805e-05,
      "loss": 1.3825,
      "step": 138
    },
    {
      "epoch": 0.32711111111111113,
      "step": 138,
      "train/combined_loss": 0.08674443326890469,
      "train/cross_entropy_loss": 0.025173434522002935,
      "train/kl_divergence_loss": 0.1483154296875,
      "train/step_duration_seconds": 7.182944059371948,
      "train/steps_per_hour": 332.422437151218,
      "train/total_elapsed_hours": 0.41513443311055503
    },
    {
      "epoch": 0.3294814814814815,
      "grad_norm": 30.125,
      "learning_rate": 1.704162269652352e-05,
      "loss": 1.3879,
      "step": 139
    },
    {
      "epoch": 0.3294814814814815,
      "step": 139,
      "train/combined_loss": 0.08193621598184109,
      "train/cross_entropy_loss": 0.02483434451278299,
      "train/kl_divergence_loss": 0.1390380859375,
      "train/step_duration_seconds": 7.187625885009766,
      "train/steps_per_hour": 333.2286533026057,
      "train/total_elapsed_hours": 0.4171309958563911
    },
    {
      "epoch": 0.33185185185185184,
      "grad_norm": 26.25,
      "learning_rate": 1.698236818086073e-05,
      "loss": 1.311,
      "step": 140
    },
    {
      "epoch": 0.33185185185185184,
      "step": 140,
      "train/combined_loss": 0.08312624553218484,
      "train/cross_entropy_loss": 0.021843309281393886,
      "train/kl_divergence_loss": 0.1444091796875,
      "train/step_duration_seconds": 7.165472507476807,
      "train/steps_per_hour": 334.0320927727315,
      "train/total_elapsed_hours": 0.4191214048862457
    },
    {
      "epoch": 0.3342222222222222,
      "grad_norm": 10.0,
      "learning_rate": 1.6922631366164795e-05,
      "loss": 1.33,
      "step": 141
    },
    {
      "epoch": 0.3342222222222222,
      "step": 141,
      "train/combined_loss": 0.08382831746712327,
      "train/cross_entropy_loss": 0.016838762094266713,
      "train/kl_divergence_loss": 0.15081787109375,
      "train/step_duration_seconds": 7.17199444770813,
      "train/steps_per_hour": 334.8264967927197,
      "train/total_elapsed_hours": 0.42111362556616466
    },
    {
      "epoch": 0.3365925925925926,
      "grad_norm": 25.0,
      "learning_rate": 1.686241637868734e-05,
      "loss": 1.3413,
      "step": 142
    },
    {
      "epoch": 0.3365925925925926,
      "step": 142,
      "train/combined_loss": 0.08172068372368813,
      "train/cross_entropy_loss": 0.015309049864299595,
      "train/kl_divergence_loss": 0.14813232421875,
      "train/step_duration_seconds": 7.141969680786133,
      "train/steps_per_hour": 335.62003551235273,
      "train/total_elapsed_hours": 0.4230975060330497
    },
    {
      "epoch": 0.33896296296296297,
      "grad_norm": 30.125,
      "learning_rate": 1.6801727377709195e-05,
      "loss": 1.3075,
      "step": 143
    },
    {
      "epoch": 0.33896296296296297,
      "step": 143,
      "train/combined_loss": 0.08254175027832389,
      "train/cross_entropy_loss": 0.016035647364333272,
      "train/kl_divergence_loss": 0.1490478515625,
      "train/step_duration_seconds": 7.139028072357178,
      "train/steps_per_hour": 336.40681390445945,
      "train/total_elapsed_hours": 0.42508056938648225
    },
    {
      "epoch": 0.3413333333333333,
      "grad_norm": 26.625,
      "learning_rate": 1.6740568555253153e-05,
      "loss": 1.3207,
      "step": 144
    },
    {
      "epoch": 0.3413333333333333,
      "step": 144,
      "train/combined_loss": 0.0809242157265544,
      "train/cross_entropy_loss": 0.019209268386475742,
      "train/kl_divergence_loss": 0.14263916015625,
      "train/step_duration_seconds": 7.164944648742676,
      "train/steps_per_hour": 337.180601627168,
      "train/total_elapsed_hours": 0.42707083178891075
    },
    {
      "epoch": 0.3437037037037037,
      "grad_norm": 10.6875,
      "learning_rate": 1.6678944135794375e-05,
      "loss": 1.2948,
      "step": 145
    },
    {
      "epoch": 0.3437037037037037,
      "step": 145,
      "train/combined_loss": 0.08305090665817261,
      "train/cross_entropy_loss": 0.027735116658732295,
      "train/kl_divergence_loss": 0.13836669921875,
      "train/step_duration_seconds": 7.151780843734741,
      "train/steps_per_hour": 337.95009082754103,
      "train/total_elapsed_hours": 0.4290574375788371
    },
    {
      "epoch": 0.3460740740740741,
      "grad_norm": 24.875,
      "learning_rate": 1.6616858375968596e-05,
      "loss": 1.3288,
      "step": 146
    },
    {
      "epoch": 0.3460740740740741,
      "step": 146,
      "train/combined_loss": 0.08167848456650972,
      "train/cross_entropy_loss": 0.028225134126842022,
      "train/kl_divergence_loss": 0.1351318359375,
      "train/step_duration_seconds": 7.142008066177368,
      "train/steps_per_hour": 338.7146203300441,
      "train/total_elapsed_hours": 0.4310413287083308
    },
    {
      "epoch": 0.34844444444444445,
      "grad_norm": 29.625,
      "learning_rate": 1.6554315564278102e-05,
      "loss": 1.3069,
      "step": 147
    },
    {
      "epoch": 0.34844444444444445,
      "step": 147,
      "train/combined_loss": 0.08066110266372561,
      "train/cross_entropy_loss": 0.02490862738341093,
      "train/kl_divergence_loss": 0.13641357421875,
      "train/step_duration_seconds": 7.151546478271484,
      "train/steps_per_hour": 339.47006737646905,
      "train/total_elapsed_hours": 0.43302786939673954
    },
    {
      "epoch": 0.3508148148148148,
      "grad_norm": 23.625,
      "learning_rate": 1.649132002079552e-05,
      "loss": 1.2906,
      "step": 148
    },
    {
      "epoch": 0.3508148148148148,
      "step": 148,
      "train/combined_loss": 0.08027565246447921,
      "train/cross_entropy_loss": 0.01925491786096245,
      "train/kl_divergence_loss": 0.14129638671875,
      "train/step_duration_seconds": 7.16980504989624,
      "train/steps_per_hour": 340.21464820376787,
      "train/total_elapsed_hours": 0.4350194819105996
    },
    {
      "epoch": 0.35318518518518516,
      "grad_norm": 13.9375,
      "learning_rate": 1.6427876096865394e-05,
      "loss": 1.2844,
      "step": 149
    },
    {
      "epoch": 0.35318518518518516,
      "step": 149,
      "train/combined_loss": 0.0810198406688869,
      "train/cross_entropy_loss": 0.01714221539441496,
      "train/kl_divergence_loss": 0.1448974609375,
      "train/step_duration_seconds": 7.167724132537842,
      "train/steps_per_hour": 340.95289337704213,
      "train/total_elapsed_hours": 0.4370105163918601
    },
    {
      "epoch": 0.35555555555555557,
      "grad_norm": 16.5,
      "learning_rate": 1.6363988174803638e-05,
      "loss": 1.2963,
      "step": 150
    },
    {
      "epoch": 0.35555555555555557,
      "eval_combined_loss": 0.08181474480902155,
      "eval_cross_entropy_loss": 0.01787818753470977,
      "eval_kl_divergence_loss": 0.14575130208333334,
      "eval_loss": 0.08181475102901459,
      "eval_runtime": 218.6737,
      "eval_samples_per_second": 6.86,
      "eval_steps_per_second": 3.43,
      "step": 150
    },
    {
      "epoch": 0.35555555555555557,
      "step": 150,
      "train/combined_loss": 0.08019543159753084,
      "train/cross_entropy_loss": 0.017629636102356017,
      "train/kl_divergence_loss": 0.14276123046875,
      "train/step_duration_seconds": 225.85686349868774,
      "train/steps_per_hour": 300.1509555021127,
      "train/total_elapsed_hours": 0.4997485340303845
    },
    {
      "epoch": 0.3579259259259259,
      "grad_norm": 23.125,
      "learning_rate": 1.6299660667594814e-05,
      "loss": 1.2831,
      "step": 151
    },
    {
      "epoch": 0.3579259259259259,
      "step": 151,
      "train/combined_loss": 0.08127154828980565,
      "train/cross_entropy_loss": 0.01935462059918791,
      "train/kl_divergence_loss": 0.1431884765625,
      "train/step_duration_seconds": 7.190611124038696,
      "train/steps_per_hour": 300.9491301723214,
      "train/total_elapsed_hours": 0.5017459260092841
    },
    {
      "epoch": 0.3602962962962963,
      "grad_norm": 20.375,
      "learning_rate": 1.6234898018587336e-05,
      "loss": 1.3003,
      "step": 152
    },
    {
      "epoch": 0.3602962962962963,
      "step": 152,
      "train/combined_loss": 0.08766383724287152,
      "train/cross_entropy_loss": 0.020847689942456782,
      "train/kl_divergence_loss": 0.15447998046875,
      "train/step_duration_seconds": 7.20133113861084,
      "train/steps_per_hour": 301.7391914865756,
      "train/total_elapsed_hours": 0.5037462957700094
    },
    {
      "epoch": 0.3626666666666667,
      "grad_norm": 12.0,
      "learning_rate": 1.6169704701186528e-05,
      "loss": 1.4026,
      "step": 153
    },
    {
      "epoch": 0.3626666666666667,
      "step": 153,
      "train/combined_loss": 0.07755696773529053,
      "train/cross_entropy_loss": 0.021324871107935905,
      "train/kl_divergence_loss": 0.1337890625,
      "train/step_duration_seconds": 7.173044919967651,
      "train/steps_per_hour": 302.52770304516605,
      "train/total_elapsed_hours": 0.5057388082477782
    },
    {
      "epoch": 0.36503703703703705,
      "grad_norm": 17.125,
      "learning_rate": 1.6104085218545633e-05,
      "loss": 1.2409,
      "step": 154
    },
    {
      "epoch": 0.36503703703703705,
      "step": 154,
      "train/combined_loss": 0.07878367276862264,
      "train/cross_entropy_loss": 0.022069291560910642,
      "train/kl_divergence_loss": 0.135498046875,
      "train/step_duration_seconds": 7.17523193359375,
      "train/steps_per_hour": 303.30966291023043,
      "train/total_elapsed_hours": 0.507731928229332
    },
    {
      "epoch": 0.3674074074074074,
      "grad_norm": 16.875,
      "learning_rate": 1.6038044103254775e-05,
      "loss": 1.2605,
      "step": 155
    },
    {
      "epoch": 0.3674074074074074,
      "step": 155,
      "train/combined_loss": 0.07806963194161654,
      "train/cross_entropy_loss": 0.021861913381144404,
      "train/kl_divergence_loss": 0.13427734375,
      "train/step_duration_seconds": 7.181210279464722,
      "train/steps_per_hour": 304.08451686962036,
      "train/total_elapsed_hours": 0.5097267088625166
    },
    {
      "epoch": 0.36977777777777776,
      "grad_norm": 11.75,
      "learning_rate": 1.5971585917027864e-05,
      "loss": 1.2491,
      "step": 156
    },
    {
      "epoch": 0.36977777777777776,
      "step": 156,
      "train/combined_loss": 0.07714226096868515,
      "train/cross_entropy_loss": 0.020495465025305748,
      "train/kl_divergence_loss": 0.1337890625,
      "train/step_duration_seconds": 7.1563475131988525,
      "train/steps_per_hour": 304.8574442281318,
      "train/total_elapsed_hours": 0.5117145831717386
    },
    {
      "epoch": 0.3721481481481482,
      "grad_norm": 13.125,
      "learning_rate": 1.5904715250387498e-05,
      "loss": 1.2343,
      "step": 157
    },
    {
      "epoch": 0.3721481481481482,
      "step": 157,
      "train/combined_loss": 0.07875645952299237,
      "train/cross_entropy_loss": 0.01914622518233955,
      "train/kl_divergence_loss": 0.13836669921875,
      "train/step_duration_seconds": 7.15238881111145,
      "train/steps_per_hour": 305.62504381863846,
      "train/total_elapsed_hours": 0.5137013578414917
    },
    {
      "epoch": 0.37451851851851853,
      "grad_norm": 7.9375,
      "learning_rate": 1.5837436722347902e-05,
      "loss": 1.2601,
      "step": 158
    },
    {
      "epoch": 0.37451851851851853,
      "step": 158,
      "train/combined_loss": 0.07961196266114712,
      "train/cross_entropy_loss": 0.019941705162636936,
      "train/kl_divergence_loss": 0.1392822265625,
      "train/step_duration_seconds": 7.165693759918213,
      "train/steps_per_hour": 306.3845330098647,
      "train/total_elapsed_hours": 0.5156918283303579
    },
    {
      "epoch": 0.3768888888888889,
      "grad_norm": 8.625,
      "learning_rate": 1.576975498009583e-05,
      "loss": 1.2738,
      "step": 159
    },
    {
      "epoch": 0.3768888888888889,
      "step": 159,
      "train/combined_loss": 0.08134815841913223,
      "train/cross_entropy_loss": 0.021583035704679787,
      "train/kl_divergence_loss": 0.14111328125,
      "train/step_duration_seconds": 7.175953149795532,
      "train/steps_per_hour": 307.1364910017458,
      "train/total_elapsed_hours": 0.5176851486497455
    },
    {
      "epoch": 0.37925925925925924,
      "grad_norm": 6.90625,
      "learning_rate": 1.570167469866962e-05,
      "loss": 1.3016,
      "step": 160
    },
    {
      "epoch": 0.37925925925925924,
      "step": 160,
      "train/combined_loss": 0.08167480118572712,
      "train/cross_entropy_loss": 0.020954578067176044,
      "train/kl_divergence_loss": 0.14239501953125,
      "train/step_duration_seconds": 7.178318023681641,
      "train/steps_per_hour": 307.88229126869635,
      "train/total_elapsed_hours": 0.5196791258785459
    },
    {
      "epoch": 0.38162962962962965,
      "grad_norm": 6.625,
      "learning_rate": 1.563320058063622e-05,
      "loss": 1.3068,
      "step": 161
    },
    {
      "epoch": 0.38162962962962965,
      "step": 161,
      "train/combined_loss": 0.08032544003799558,
      "train/cross_entropy_loss": 0.021124517312273383,
      "train/kl_divergence_loss": 0.1395263671875,
      "train/step_duration_seconds": 7.179118394851685,
      "train/steps_per_hour": 308.62225870273886,
      "train/total_elapsed_hours": 0.5216733254326714
    },
    {
      "epoch": 0.384,
      "grad_norm": 7.875,
      "learning_rate": 1.5564337355766412e-05,
      "loss": 1.2852,
      "step": 162
    },
    {
      "epoch": 0.384,
      "step": 162,
      "train/combined_loss": 0.07905747788026929,
      "train/cross_entropy_loss": 0.02096895850263536,
      "train/kl_divergence_loss": 0.13714599609375,
      "train/step_duration_seconds": 7.179133653640747,
      "train/steps_per_hour": 309.3565878328245,
      "train/total_elapsed_hours": 0.5236675292253494
    },
    {
      "epoch": 0.38637037037037036,
      "grad_norm": 4.15625,
      "learning_rate": 1.5495089780708062e-05,
      "loss": 1.2649,
      "step": 163
    },
    {
      "epoch": 0.38637037037037036,
      "step": 163,
      "train/combined_loss": 0.07707322854548693,
      "train/cross_entropy_loss": 0.019991177483461797,
      "train/kl_divergence_loss": 0.1341552734375,
      "train/step_duration_seconds": 7.165940761566162,
      "train/steps_per_hour": 310.0875071111404,
      "train/total_elapsed_hours": 0.5256580683257844
    },
    {
      "epoch": 0.3887407407407407,
      "grad_norm": 9.125,
      "learning_rate": 1.5425462638657597e-05,
      "loss": 1.2332,
      "step": 164
    },
    {
      "epoch": 0.3887407407407407,
      "step": 164,
      "train/combined_loss": 0.07535458076745272,
      "train/cross_entropy_loss": 0.019971861504018307,
      "train/kl_divergence_loss": 0.1307373046875,
      "train/step_duration_seconds": 7.187286376953125,
      "train/steps_per_hour": 310.80941899189014,
      "train/total_elapsed_hours": 0.527654536763827
    },
    {
      "epoch": 0.39111111111111113,
      "grad_norm": 9.0625,
      "learning_rate": 1.5355460739029585e-05,
      "loss": 1.2057,
      "step": 165
    },
    {
      "epoch": 0.39111111111111113,
      "step": 165,
      "train/combined_loss": 0.07617681892588735,
      "train/cross_entropy_loss": 0.021128055173903704,
      "train/kl_divergence_loss": 0.1312255859375,
      "train/step_duration_seconds": 7.1567230224609375,
      "train/steps_per_hour": 311.53088207288556,
      "train/total_elapsed_hours": 0.5296425153811772
    },
    {
      "epoch": 0.3934814814814815,
      "grad_norm": 8.8125,
      "learning_rate": 1.5285088917124555e-05,
      "loss": 1.2188,
      "step": 166
    },
    {
      "epoch": 0.3934814814814815,
      "step": 166,
      "train/combined_loss": 0.08005631249397993,
      "train/cross_entropy_loss": 0.021440746961161494,
      "train/kl_divergence_loss": 0.138671875,
      "train/step_duration_seconds": 7.164660930633545,
      "train/steps_per_hour": 312.24565441642335,
      "train/total_elapsed_hours": 0.5316326989730199
    },
    {
      "epoch": 0.39585185185185184,
      "grad_norm": 5.625,
      "learning_rate": 1.5214352033794981e-05,
      "loss": 1.2809,
      "step": 167
    },
    {
      "epoch": 0.39585185185185184,
      "step": 167,
      "train/combined_loss": 0.07657396793365479,
      "train/cross_entropy_loss": 0.020762681495398283,
      "train/kl_divergence_loss": 0.13238525390625,
      "train/step_duration_seconds": 7.171409845352173,
      "train/steps_per_hour": 312.9539957186981,
      "train/total_elapsed_hours": 0.5336247572633955
    },
    {
      "epoch": 0.3982222222222222,
      "grad_norm": 11.25,
      "learning_rate": 1.5143254975109538e-05,
      "loss": 1.2252,
      "step": 168
    },
    {
      "epoch": 0.3982222222222222,
      "step": 168,
      "train/combined_loss": 0.07494777115061879,
      "train/cross_entropy_loss": 0.02077566913794726,
      "train/kl_divergence_loss": 0.129119873046875,
      "train/step_duration_seconds": 7.178438186645508,
      "train/steps_per_hour": 313.65592484340516,
      "train/total_elapsed_hours": 0.5356187678707971
    },
    {
      "epoch": 0.4005925925925926,
      "grad_norm": 11.0625,
      "learning_rate": 1.5071802652015592e-05,
      "loss": 1.1992,
      "step": 169
    },
    {
      "epoch": 0.4005925925925926,
      "step": 169,
      "train/combined_loss": 0.07396322628483176,
      "train/cross_entropy_loss": 0.02079021732788533,
      "train/kl_divergence_loss": 0.12713623046875,
      "train/step_duration_seconds": 7.178174734115601,
      "train/steps_per_hour": 314.3526898358104,
      "train/total_elapsed_hours": 0.5376127052969403
    },
    {
      "epoch": 0.40296296296296297,
      "grad_norm": 3.921875,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 1.1834,
      "step": 170
    },
    {
      "epoch": 0.40296296296296297,
      "step": 170,
      "train/combined_loss": 0.07202337961643934,
      "train/cross_entropy_loss": 0.01926037878729403,
      "train/kl_divergence_loss": 0.124786376953125,
      "train/step_duration_seconds": 7.181441783905029,
      "train/steps_per_hour": 315.043775657627,
      "train/total_elapsed_hours": 0.5396075502369139
    },
    {
      "epoch": 0.4053333333333333,
      "grad_norm": 7.46875,
      "learning_rate": 1.4927851978748177e-05,
      "loss": 1.1524,
      "step": 171
    },
    {
      "epoch": 0.4053333333333333,
      "step": 171,
      "train/combined_loss": 0.07204042701050639,
      "train/cross_entropy_loss": 0.020179486949928105,
      "train/kl_divergence_loss": 0.1239013671875,
      "train/step_duration_seconds": 7.173416376113892,
      "train/steps_per_hour": 315.7310702015876,
      "train/total_elapsed_hours": 0.5416001658969455
    },
    {
      "epoch": 0.4077037037037037,
      "grad_norm": 5.59375,
      "learning_rate": 1.4855363571801523e-05,
      "loss": 1.1526,
      "step": 172
    },
    {
      "epoch": 0.4077037037037037,
      "step": 172,
      "train/combined_loss": 0.072266333270818,
      "train/cross_entropy_loss": 0.02215718082152307,
      "train/kl_divergence_loss": 0.12237548828125,
      "train/step_duration_seconds": 7.157419681549072,
      "train/steps_per_hour": 316.4159124956855,
      "train/total_elapsed_hours": 0.5435883380307092
    },
    {
      "epoch": 0.4100740740740741,
      "grad_norm": 7.0625,
      "learning_rate": 1.4782539786213184e-05,
      "loss": 1.1563,
      "step": 173
    },
    {
      "epoch": 0.4100740740740741,
      "step": 173,
      "train/combined_loss": 0.07013190537691116,
      "train/cross_entropy_loss": 0.020634910091757774,
      "train/kl_divergence_loss": 0.11962890625,
      "train/step_duration_seconds": 7.1777660846710205,
      "train/steps_per_hour": 317.09247858077316,
      "train/total_elapsed_hours": 0.5455821619431178
    },
    {
      "epoch": 0.41244444444444445,
      "grad_norm": 4.3125,
      "learning_rate": 1.4709385652202204e-05,
      "loss": 1.1221,
      "step": 174
    },
    {
      "epoch": 0.41244444444444445,
      "step": 174,
      "train/combined_loss": 0.06954938173294067,
      "train/cross_entropy_loss": 0.018493298441171646,
      "train/kl_divergence_loss": 0.12060546875,
      "train/step_duration_seconds": 7.157723426818848,
      "train/steps_per_hour": 317.767348521783,
      "train/total_elapsed_hours": 0.5475704184505674
    },
    {
      "epoch": 0.4148148148148148,
      "grad_norm": 7.09375,
      "learning_rate": 1.4635906222806058e-05,
      "loss": 1.1128,
      "step": 175
    },
    {
      "epoch": 0.4148148148148148,
      "step": 175,
      "train/combined_loss": 0.07863931078463793,
      "train/cross_entropy_loss": 0.02086504956241697,
      "train/kl_divergence_loss": 0.13641357421875,
      "train/step_duration_seconds": 7.178933382034302,
      "train/steps_per_hour": 318.4339213839359,
      "train/total_elapsed_hours": 0.5495645666122436
    },
    {
      "epoch": 0.41718518518518516,
      "grad_norm": 5.84375,
      "learning_rate": 1.4562106573531632e-05,
      "loss": 1.2582,
      "step": 176
    },
    {
      "epoch": 0.41718518518518516,
      "step": 176,
      "train/combined_loss": 0.0717529603280127,
      "train/cross_entropy_loss": 0.022137517924420536,
      "train/kl_divergence_loss": 0.121368408203125,
      "train/step_duration_seconds": 7.176782131195068,
      "train/steps_per_hour": 319.09602000259764,
      "train/total_elapsed_hours": 0.5515581172042423
    },
    {
      "epoch": 0.41955555555555557,
      "grad_norm": 9.5625,
      "learning_rate": 1.4487991802004625e-05,
      "loss": 1.148,
      "step": 177
    },
    {
      "epoch": 0.41955555555555557,
      "step": 177,
      "train/combined_loss": 0.0723364045843482,
      "train/cross_entropy_loss": 0.021931110764853656,
      "train/kl_divergence_loss": 0.12274169921875,
      "train/step_duration_seconds": 7.177384376525879,
      "train/steps_per_hour": 319.75325304866163,
      "train/total_elapsed_hours": 0.5535518350866105
    },
    {
      "epoch": 0.4219259259259259,
      "grad_norm": 4.0,
      "learning_rate": 1.4413567027617442e-05,
      "loss": 1.1574,
      "step": 178
    },
    {
      "epoch": 0.4219259259259259,
      "step": 178,
      "train/combined_loss": 0.0698565854690969,
      "train/cross_entropy_loss": 0.017886995803564787,
      "train/kl_divergence_loss": 0.121826171875,
      "train/step_duration_seconds": 7.158464431762695,
      "train/steps_per_hour": 320.4087999085678,
      "train/total_elapsed_hours": 0.5555402974287669
    },
    {
      "epoch": 0.4242962962962963,
      "grad_norm": 14.1875,
      "learning_rate": 1.4338837391175582e-05,
      "loss": 1.1177,
      "step": 179
    },
    {
      "epoch": 0.4242962962962963,
      "step": 179,
      "train/combined_loss": 0.06887802015990019,
      "train/cross_entropy_loss": 0.017760923714376986,
      "train/kl_divergence_loss": 0.1199951171875,
      "train/step_duration_seconds": 7.177714586257935,
      "train/steps_per_hour": 321.05659141033044,
      "train/total_elapsed_hours": 0.5575341070360608
    },
    {
      "epoch": 0.4266666666666667,
      "grad_norm": 11.625,
      "learning_rate": 1.4263808054542541e-05,
      "loss": 1.102,
      "step": 180
    },
    {
      "epoch": 0.4266666666666667,
      "step": 180,
      "train/combined_loss": 0.07172887865453959,
      "train/cross_entropy_loss": 0.02205883653368801,
      "train/kl_divergence_loss": 0.12139892578125,
      "train/step_duration_seconds": 7.169630289077759,
      "train/steps_per_hour": 321.7010573913621,
      "train/total_elapsed_hours": 0.559525671005249
    },
    {
      "epoch": 0.42903703703703705,
      "grad_norm": 9.0,
      "learning_rate": 1.418848420028325e-05,
      "loss": 1.1477,
      "step": 181
    },
    {
      "epoch": 0.42903703703703705,
      "step": 181,
      "train/combined_loss": 0.07292318437248468,
      "train/cross_entropy_loss": 0.024111752747558057,
      "train/kl_divergence_loss": 0.121734619140625,
      "train/step_duration_seconds": 7.1448750495910645,
      "train/steps_per_hour": 322.34489934858004,
      "train/total_elapsed_hours": 0.5615103585190243
    },
    {
      "epoch": 0.4314074074074074,
      "grad_norm": 10.0625,
      "learning_rate": 1.4112871031306118e-05,
      "loss": 1.1668,
      "step": 182
    },
    {
      "epoch": 0.4314074074074074,
      "step": 182,
      "train/combined_loss": 0.07089130999520421,
      "train/cross_entropy_loss": 0.022153714206069708,
      "train/kl_divergence_loss": 0.11962890625,
      "train/step_duration_seconds": 7.131852149963379,
      "train/steps_per_hour": 322.9862794273204,
      "train/total_elapsed_hours": 0.5634914285606808
    },
    {
      "epoch": 0.43377777777777776,
      "grad_norm": 4.28125,
      "learning_rate": 1.4036973770503623e-05,
      "loss": 1.1343,
      "step": 183
    },
    {
      "epoch": 0.43377777777777776,
      "step": 183,
      "train/combined_loss": 0.06743072532117367,
      "train/cross_entropy_loss": 0.01742980582639575,
      "train/kl_divergence_loss": 0.117431640625,
      "train/step_duration_seconds": 7.159062623977661,
      "train/steps_per_hour": 323.61883980323483,
      "train/total_elapsed_hours": 0.5654800570673413
    },
    {
      "epoch": 0.4361481481481482,
      "grad_norm": 11.0625,
      "learning_rate": 1.396079766039157e-05,
      "loss": 1.0789,
      "step": 184
    },
    {
      "epoch": 0.4361481481481482,
      "step": 184,
      "train/combined_loss": 0.06741259898990393,
      "train/cross_entropy_loss": 0.018187020672485232,
      "train/kl_divergence_loss": 0.11663818359375,
      "train/step_duration_seconds": 7.166898250579834,
      "train/steps_per_hour": 324.24572304400647,
      "train/total_elapsed_hours": 0.5674708621369468
    },
    {
      "epoch": 0.43851851851851853,
      "grad_norm": 9.25,
      "learning_rate": 1.3884347962746949e-05,
      "loss": 1.0786,
      "step": 185
    },
    {
      "epoch": 0.43851851851851853,
      "step": 185,
      "train/combined_loss": 0.06627723574638367,
      "train/cross_entropy_loss": 0.022691186517477036,
      "train/kl_divergence_loss": 0.10986328125,
      "train/step_duration_seconds": 7.1953465938568115,
      "train/steps_per_hour": 324.863715114037,
      "train/total_elapsed_hours": 0.5694695695241292
    },
    {
      "epoch": 0.4408888888888889,
      "grad_norm": 12.25,
      "learning_rate": 1.3807629958244498e-05,
      "loss": 1.0604,
      "step": 186
    },
    {
      "epoch": 0.4408888888888889,
      "step": 186,
      "train/combined_loss": 0.06682039611041546,
      "train/cross_entropy_loss": 0.02396061283070594,
      "train/kl_divergence_loss": 0.10968017578125,
      "train/step_duration_seconds": 7.1774444580078125,
      "train/steps_per_hour": 325.4802166069086,
      "train/total_elapsed_hours": 0.571463304095798
    },
    {
      "epoch": 0.44325925925925924,
      "grad_norm": 12.75,
      "learning_rate": 1.373064894609194e-05,
      "loss": 1.0691,
      "step": 187
    },
    {
      "epoch": 0.44325925925925924,
      "step": 187,
      "train/combined_loss": 0.07040743064135313,
      "train/cross_entropy_loss": 0.02216251229401678,
      "train/kl_divergence_loss": 0.11865234375,
      "train/step_duration_seconds": 7.175534009933472,
      "train/steps_per_hour": 326.09273309370775,
      "train/total_elapsed_hours": 0.5734565079874463
    },
    {
      "epoch": 0.44562962962962965,
      "grad_norm": 7.28125,
      "learning_rate": 1.3653410243663953e-05,
      "loss": 1.1265,
      "step": 188
    },
    {
      "epoch": 0.44562962962962965,
      "step": 188,
      "train/combined_loss": 0.07080179871991277,
      "train/cross_entropy_loss": 0.018343106610700488,
      "train/kl_divergence_loss": 0.123260498046875,
      "train/step_duration_seconds": 7.184988975524902,
      "train/steps_per_hour": 326.69951532210575,
      "train/total_elapsed_hours": 0.5754523382584253
    },
    {
      "epoch": 0.448,
      "grad_norm": 12.5625,
      "learning_rate": 1.3575919186134862e-05,
      "loss": 1.1328,
      "step": 189
    },
    {
      "epoch": 0.448,
      "step": 189,
      "train/combined_loss": 0.06690460816025734,
      "train/cross_entropy_loss": 0.017476210254244506,
      "train/kl_divergence_loss": 0.1163330078125,
      "train/step_duration_seconds": 7.175193548202515,
      "train/steps_per_hour": 327.30364537972457,
      "train/total_elapsed_hours": 0.5774454475773705
    },
    {
      "epoch": 0.45037037037037037,
      "grad_norm": 13.875,
      "learning_rate": 1.349818112611015e-05,
      "loss": 1.0705,
      "step": 190
    },
    {
      "epoch": 0.45037037037037037,
      "step": 190,
      "train/combined_loss": 0.06556698912754655,
      "train/cross_entropy_loss": 0.018676697509363294,
      "train/kl_divergence_loss": 0.112457275390625,
      "train/step_duration_seconds": 7.1955156326293945,
      "train/steps_per_hour": 327.9004248735381,
      "train/total_elapsed_hours": 0.5794442019197676
    },
    {
      "epoch": 0.4527407407407407,
      "grad_norm": 8.6875,
      "learning_rate": 1.342020143325669e-05,
      "loss": 1.0491,
      "step": 191
    },
    {
      "epoch": 0.4527407407407407,
      "step": 191,
      "train/combined_loss": 0.06877634488046169,
      "train/cross_entropy_loss": 0.024057817296124995,
      "train/kl_divergence_loss": 0.113494873046875,
      "train/step_duration_seconds": 7.176729202270508,
      "train/steps_per_hour": 328.49604967101124,
      "train/total_elapsed_hours": 0.5814377378092872
    },
    {
      "epoch": 0.45511111111111113,
      "grad_norm": 19.0,
      "learning_rate": 1.3341985493931877e-05,
      "loss": 1.1004,
      "step": 192
    },
    {
      "epoch": 0.45511111111111113,
      "step": 192,
      "train/combined_loss": 0.06570423394441605,
      "train/cross_entropy_loss": 0.024963149800896645,
      "train/kl_divergence_loss": 0.1064453125,
      "train/step_duration_seconds": 7.157531499862671,
      "train/steps_per_hour": 329.0906120315698,
      "train/total_elapsed_hours": 0.5834259410036935
    },
    {
      "epoch": 0.4574814814814815,
      "grad_norm": 23.625,
      "learning_rate": 1.3263538710811559e-05,
      "loss": 1.0513,
      "step": 193
    },
    {
      "epoch": 0.4574814814814815,
      "step": 193,
      "train/combined_loss": 0.0659687272273004,
      "train/cross_entropy_loss": 0.024729204480536282,
      "train/kl_divergence_loss": 0.107208251953125,
      "train/step_duration_seconds": 7.212608098983765,
      "train/steps_per_hour": 329.6725202740739,
      "train/total_elapsed_hours": 0.5854294432534112
    },
    {
      "epoch": 0.45985185185185184,
      "grad_norm": 18.625,
      "learning_rate": 1.3184866502516846e-05,
      "loss": 1.0555,
      "step": 194
    },
    {
      "epoch": 0.45985185185185184,
      "step": 194,
      "train/combined_loss": 0.07015136396512389,
      "train/cross_entropy_loss": 0.021467273705638945,
      "train/kl_divergence_loss": 0.11883544921875,
      "train/step_duration_seconds": 7.18483304977417,
      "train/steps_per_hour": 330.25479673403044,
      "train/total_elapsed_hours": 0.5874252302116818
    },
    {
      "epoch": 0.4622222222222222,
      "grad_norm": 10.75,
      "learning_rate": 1.3105974303239838e-05,
      "loss": 1.1224,
      "step": 195
    },
    {
      "epoch": 0.4622222222222222,
      "step": 195,
      "train/combined_loss": 0.0675101918168366,
      "train/cross_entropy_loss": 0.01786339597310871,
      "train/kl_divergence_loss": 0.117156982421875,
      "train/step_duration_seconds": 7.176580190658569,
      "train/steps_per_hour": 330.83441673191504,
      "train/total_elapsed_hours": 0.589418724709087
    },
    {
      "epoch": 0.4645925925925926,
      "grad_norm": 11.875,
      "learning_rate": 1.3026867562368262e-05,
      "loss": 1.0802,
      "step": 196
    },
    {
      "epoch": 0.4645925925925926,
      "step": 196,
      "train/combined_loss": 0.06660758936777711,
      "train/cross_entropy_loss": 0.017553551122546196,
      "train/kl_divergence_loss": 0.11566162109375,
      "train/step_duration_seconds": 7.176948308944702,
      "train/steps_per_hour": 331.4100719369871,
      "train/total_elapsed_hours": 0.5914123214615716
    },
    {
      "epoch": 0.46696296296296297,
      "grad_norm": 10.8125,
      "learning_rate": 1.2947551744109044e-05,
      "loss": 1.0657,
      "step": 197
    },
    {
      "epoch": 0.46696296296296297,
      "step": 197,
      "train/combined_loss": 0.07065472798421979,
      "train/cross_entropy_loss": 0.01936120947357267,
      "train/kl_divergence_loss": 0.1219482421875,
      "train/step_duration_seconds": 7.178069353103638,
      "train/steps_per_hour": 331.98168500479466,
      "train/total_elapsed_hours": 0.5934062296152115
    },
    {
      "epoch": 0.4693333333333333,
      "grad_norm": 8.75,
      "learning_rate": 1.2868032327110904e-05,
      "loss": 1.1305,
      "step": 198
    },
    {
      "epoch": 0.4693333333333333,
      "step": 198,
      "train/combined_loss": 0.06524005252867937,
      "train/cross_entropy_loss": 0.02314978139474988,
      "train/kl_divergence_loss": 0.107330322265625,
      "train/step_duration_seconds": 7.202880859375,
      "train/steps_per_hour": 332.5456201761273,
      "train/total_elapsed_hours": 0.5954070298539268
    },
    {
      "epoch": 0.4717037037037037,
      "grad_norm": 8.375,
      "learning_rate": 1.2788314804085904e-05,
      "loss": 1.0438,
      "step": 199
    },
    {
      "epoch": 0.4717037037037037,
      "step": 199,
      "train/combined_loss": 0.06441081315279007,
      "train/cross_entropy_loss": 0.02335287816822529,
      "train/kl_divergence_loss": 0.10546875,
      "train/step_duration_seconds": 7.1744384765625,
      "train/steps_per_hour": 333.1101833065549,
      "train/total_elapsed_hours": 0.5973999294307497
    },
    {
      "epoch": 0.4740740740740741,
      "grad_norm": 9.3125,
      "learning_rate": 1.2708404681430054e-05,
      "loss": 1.0306,
      "step": 200
    },
    {
      "epoch": 0.4740740740740741,
      "eval_combined_loss": 0.06667867637053132,
      "eval_cross_entropy_loss": 0.021343029824395975,
      "eval_kl_divergence_loss": 0.11201432291666667,
      "eval_loss": 0.06667868047952652,
      "eval_runtime": 220.1815,
      "eval_samples_per_second": 6.813,
      "eval_steps_per_second": 3.406,
      "step": 200
    },
    {
      "epoch": 0.4740740740740741,
      "step": 200,
      "train/combined_loss": 0.06510467641055584,
      "train/cross_entropy_loss": 0.021170052816160023,
      "train/kl_divergence_loss": 0.109039306640625,
      "train/step_duration_seconds": 227.3877534866333,
      "train/steps_per_hour": 302.77194026156553,
      "train/total_elapsed_hours": 0.6605631942881478
    },
    {
      "epoch": 0.47644444444444445,
      "grad_norm": 3.859375,
      "learning_rate": 1.2628307478842955e-05,
      "loss": 1.0417,
      "step": 201
    },
    {
      "epoch": 0.47644444444444445,
      "step": 201,
      "train/combined_loss": 0.06429090350866318,
      "train/cross_entropy_loss": 0.016765402629971504,
      "train/kl_divergence_loss": 0.11181640625,
      "train/step_duration_seconds": 7.160262107849121,
      "train/steps_per_hour": 303.37234311953483,
      "train/total_elapsed_hours": 0.6625521559847726
    },
    {
      "epoch": 0.4788148148148148,
      "grad_norm": 14.9375,
      "learning_rate": 1.2548028728946548e-05,
      "loss": 1.0287,
      "step": 202
    },
    {
      "epoch": 0.4788148148148148,
      "step": 202,
      "train/combined_loss": 0.06479989876970649,
      "train/cross_entropy_loss": 0.01619648071937263,
      "train/kl_divergence_loss": 0.1134033203125,
      "train/step_duration_seconds": 7.1755499839782715,
      "train/steps_per_hour": 303.96720953523806,
      "train/total_elapsed_hours": 0.6645453643136554
    },
    {
      "epoch": 0.48118518518518516,
      "grad_norm": 16.625,
      "learning_rate": 1.2467573976902936e-05,
      "loss": 1.0368,
      "step": 203
    },
    {
      "epoch": 0.48118518518518516,
      "step": 203,
      "train/combined_loss": 0.066513289231807,
      "train/cross_entropy_loss": 0.01745650765951723,
      "train/kl_divergence_loss": 0.115570068359375,
      "train/step_duration_seconds": 7.177863121032715,
      "train/steps_per_hour": 304.55822459747407,
      "train/total_elapsed_hours": 0.666539215180609
    },
    {
      "epoch": 0.48355555555555557,
      "grad_norm": 10.75,
      "learning_rate": 1.238694878003138e-05,
      "loss": 1.0642,
      "step": 204
    },
    {
      "epoch": 0.48355555555555557,
      "step": 204,
      "train/combined_loss": 0.06533924676477909,
      "train/cross_entropy_loss": 0.0218833324033767,
      "train/kl_divergence_loss": 0.108795166015625,
      "train/step_duration_seconds": 7.178657293319702,
      "train/steps_per_hour": 305.14561364915744,
      "train/total_elapsed_hours": 0.6685332866509756
    },
    {
      "epoch": 0.48592592592592593,
      "grad_norm": 12.5625,
      "learning_rate": 1.2306158707424402e-05,
      "loss": 1.0454,
      "step": 205
    },
    {
      "epoch": 0.48592592592592593,
      "step": 205,
      "train/combined_loss": 0.06534022279083729,
      "train/cross_entropy_loss": 0.024326685117557645,
      "train/kl_divergence_loss": 0.106353759765625,
      "train/step_duration_seconds": 7.156978607177734,
      "train/steps_per_hour": 305.7322547589479,
      "train/total_elapsed_hours": 0.6705213362640805
    },
    {
      "epoch": 0.4882962962962963,
      "grad_norm": 16.375,
      "learning_rate": 1.2225209339563144e-05,
      "loss": 1.0454,
      "step": 206
    },
    {
      "epoch": 0.4882962962962963,
      "step": 206,
      "train/combined_loss": 0.06436126446351409,
      "train/cross_entropy_loss": 0.024413448525592685,
      "train/kl_divergence_loss": 0.10430908203125,
      "train/step_duration_seconds": 7.1745688915252686,
      "train/steps_per_hour": 306.3132018994396,
      "train/total_elapsed_hours": 0.672514272067282
    },
    {
      "epoch": 0.49066666666666664,
      "grad_norm": 12.25,
      "learning_rate": 1.2144106267931877e-05,
      "loss": 1.0298,
      "step": 207
    },
    {
      "epoch": 0.49066666666666664,
      "step": 207,
      "train/combined_loss": 0.06408173590898514,
      "train/cross_entropy_loss": 0.021962294937111437,
      "train/kl_divergence_loss": 0.106201171875,
      "train/step_duration_seconds": 7.178909778594971,
      "train/steps_per_hour": 306.89016742276357,
      "train/total_elapsed_hours": 0.6745084136724472
    },
    {
      "epoch": 0.49303703703703705,
      "grad_norm": 11.4375,
      "learning_rate": 1.2062855094631777e-05,
      "loss": 1.0253,
      "step": 208
    },
    {
      "epoch": 0.49303703703703705,
      "step": 208,
      "train/combined_loss": 0.06565954210236669,
      "train/cross_entropy_loss": 0.019319579121656716,
      "train/kl_divergence_loss": 0.11199951171875,
      "train/step_duration_seconds": 7.163602352142334,
      "train/steps_per_hour": 307.46566401066696,
      "train/total_elapsed_hours": 0.6764983032147089
    },
    {
      "epoch": 0.4954074074074074,
      "grad_norm": 9.375,
      "learning_rate": 1.1981461431993978e-05,
      "loss": 1.0506,
      "step": 209
    },
    {
      "epoch": 0.4954074074074074,
      "step": 209,
      "train/combined_loss": 0.0647607441060245,
      "train/cross_entropy_loss": 0.01816285285167396,
      "train/kl_divergence_loss": 0.111358642578125,
      "train/step_duration_seconds": 7.1943066120147705,
      "train/steps_per_hour": 308.03391278116044,
      "train/total_elapsed_hours": 0.6784967217180464
    },
    {
      "epoch": 0.49777777777777776,
      "grad_norm": 9.25,
      "learning_rate": 1.1899930902191904e-05,
      "loss": 1.0362,
      "step": 210
    },
    {
      "epoch": 0.49777777777777776,
      "step": 210,
      "train/combined_loss": 0.06525316601619124,
      "train/cross_entropy_loss": 0.01887303462717682,
      "train/kl_divergence_loss": 0.11163330078125,
      "train/step_duration_seconds": 7.1762001514434814,
      "train/steps_per_hour": 308.6011048708952,
      "train/total_elapsed_hours": 0.6804901106490029
    },
    {
      "epoch": 0.5001481481481481,
      "grad_norm": 10.0,
      "learning_rate": 1.181826913685291e-05,
      "loss": 1.0441,
      "step": 211
    },
    {
      "epoch": 0.5001481481481481,
      "step": 211,
      "train/combined_loss": 0.06376577913761139,
      "train/cross_entropy_loss": 0.02038433833513409,
      "train/kl_divergence_loss": 0.107147216796875,
      "train/step_duration_seconds": 7.198651552200317,
      "train/steps_per_hour": 309.1621585576703,
      "train/total_elapsed_hours": 0.6824897360801697
    },
    {
      "epoch": 0.5025185185185185,
      "grad_norm": 6.84375,
      "learning_rate": 1.1736481776669307e-05,
      "loss": 1.0203,
      "step": 212
    },
    {
      "epoch": 0.5025185185185185,
      "step": 212,
      "train/combined_loss": 0.0649118721485138,
      "train/cross_entropy_loss": 0.02285963052418083,
      "train/kl_divergence_loss": 0.106964111328125,
      "train/step_duration_seconds": 7.178544998168945,
      "train/steps_per_hour": 309.72246139738036,
      "train/total_elapsed_hours": 0.6844837763574388
    },
    {
      "epoch": 0.5048888888888889,
      "grad_norm": 13.125,
      "learning_rate": 1.1654574471008712e-05,
      "loss": 1.0386,
      "step": 213
    },
    {
      "epoch": 0.5048888888888889,
      "step": 213,
      "train/combined_loss": 0.07054620841518044,
      "train/cross_entropy_loss": 0.022958871792070568,
      "train/kl_divergence_loss": 0.118133544921875,
      "train/step_duration_seconds": 7.145255088806152,
      "train/steps_per_hour": 310.28368884315285,
      "train/total_elapsed_hours": 0.6864685694376628
    },
    {
      "epoch": 0.5072592592592593,
      "grad_norm": 15.4375,
      "learning_rate": 1.1572552877523855e-05,
      "loss": 1.1287,
      "step": 214
    },
    {
      "epoch": 0.5072592592592593,
      "step": 214,
      "train/combined_loss": 0.06366756092756987,
      "train/cross_entropy_loss": 0.022598788724280894,
      "train/kl_divergence_loss": 0.104736328125,
      "train/step_duration_seconds": 7.188857316970825,
      "train/steps_per_hour": 310.8362118431136,
      "train/total_elapsed_hours": 0.6884654742479325
    },
    {
      "epoch": 0.5096296296296297,
      "grad_norm": 11.3125,
      "learning_rate": 1.1490422661761744e-05,
      "loss": 1.0187,
      "step": 215
    },
    {
      "epoch": 0.5096296296296297,
      "step": 215,
      "train/combined_loss": 0.06394149828702211,
      "train/cross_entropy_loss": 0.021193550201132894,
      "train/kl_divergence_loss": 0.106689453125,
      "train/step_duration_seconds": 7.176607847213745,
      "train/steps_per_hour": 311.3870734397048,
      "train/total_elapsed_hours": 0.690458976427714
    },
    {
      "epoch": 0.512,
      "grad_norm": 13.9375,
      "learning_rate": 1.1408189496772369e-05,
      "loss": 1.0231,
      "step": 216
    },
    {
      "epoch": 0.512,
      "step": 216,
      "train/combined_loss": 0.06880563637241721,
      "train/cross_entropy_loss": 0.020667911507189274,
      "train/kl_divergence_loss": 0.116943359375,
      "train/step_duration_seconds": 7.1657140254974365,
      "train/steps_per_hour": 311.93612646502845,
      "train/total_elapsed_hours": 0.6924494525459077
    },
    {
      "epoch": 0.5143703703703704,
      "grad_norm": 13.875,
      "learning_rate": 1.1325859062716795e-05,
      "loss": 1.1009,
      "step": 217
    },
    {
      "epoch": 0.5143703703703704,
      "step": 217,
      "train/combined_loss": 0.06414779741317034,
      "train/cross_entropy_loss": 0.018890082137659192,
      "train/kl_divergence_loss": 0.109405517578125,
      "train/step_duration_seconds": 7.167219400405884,
      "train/steps_per_hour": 312.4818438221589,
      "train/total_elapsed_hours": 0.6944403468237983
    },
    {
      "epoch": 0.5167407407407407,
      "grad_norm": 10.3125,
      "learning_rate": 1.1243437046474854e-05,
      "loss": 1.0264,
      "step": 218
    },
    {
      "epoch": 0.5167407407407407,
      "step": 218,
      "train/combined_loss": 0.0651923450641334,
      "train/cross_entropy_loss": 0.01881242578383535,
      "train/kl_divergence_loss": 0.111572265625,
      "train/step_duration_seconds": 7.16694712638855,
      "train/steps_per_hour": 313.02447507913155,
      "train/total_elapsed_hours": 0.6964311654700174
    },
    {
      "epoch": 0.5191111111111111,
      "grad_norm": 11.4375,
      "learning_rate": 1.1160929141252303e-05,
      "loss": 1.0431,
      "step": 219
    },
    {
      "epoch": 0.5191111111111111,
      "step": 219,
      "train/combined_loss": 0.06706041377037764,
      "train/cross_entropy_loss": 0.019374735886231065,
      "train/kl_divergence_loss": 0.11474609375,
      "train/step_duration_seconds": 7.1320960521698,
      "train/steps_per_hour": 313.5683592299994,
      "train/total_elapsed_hours": 0.6984123032622868
    },
    {
      "epoch": 0.5214814814814814,
      "grad_norm": 13.125,
      "learning_rate": 1.1078341046187588e-05,
      "loss": 1.073,
      "step": 220
    },
    {
      "epoch": 0.5214814814814814,
      "step": 220,
      "train/combined_loss": 0.06729212449863553,
      "train/cross_entropy_loss": 0.0204179905122146,
      "train/kl_divergence_loss": 0.114166259765625,
      "train/step_duration_seconds": 7.177785634994507,
      "train/steps_per_hour": 314.10347476777997,
      "train/total_elapsed_hours": 0.7004061326053408
    },
    {
      "epoch": 0.5238518518518519,
      "grad_norm": 8.5625,
      "learning_rate": 1.0995678465958168e-05,
      "loss": 1.0767,
      "step": 221
    },
    {
      "epoch": 0.5238518518518519,
      "step": 221,
      "train/combined_loss": 0.06687723798677325,
      "train/cross_entropy_loss": 0.022487382288090885,
      "train/kl_divergence_loss": 0.11126708984375,
      "train/step_duration_seconds": 7.177214860916138,
      "train/steps_per_hour": 314.6356233732219,
      "train/total_elapsed_hours": 0.7023998034000397
    },
    {
      "epoch": 0.5262222222222223,
      "grad_norm": 12.4375,
      "learning_rate": 1.0912947110386484e-05,
      "loss": 1.07,
      "step": 222
    },
    {
      "epoch": 0.5262222222222223,
      "step": 222,
      "train/combined_loss": 0.0689978925511241,
      "train/cross_entropy_loss": 0.02382952021434903,
      "train/kl_divergence_loss": 0.114166259765625,
      "train/step_duration_seconds": 7.179453611373901,
      "train/steps_per_hour": 315.1644814155012,
      "train/total_elapsed_hours": 0.7043940960698658
    },
    {
      "epoch": 0.5285925925925926,
      "grad_norm": 15.25,
      "learning_rate": 1.0830152694045553e-05,
      "loss": 1.104,
      "step": 223
    },
    {
      "epoch": 0.5285925925925926,
      "step": 223,
      "train/combined_loss": 0.0666323616169393,
      "train/cross_entropy_loss": 0.022882646531797945,
      "train/kl_divergence_loss": 0.110382080078125,
      "train/step_duration_seconds": 7.199350595474243,
      "train/steps_per_hour": 315.68788327603403,
      "train/total_elapsed_hours": 0.7063939156797198
    },
    {
      "epoch": 0.530962962962963,
      "grad_norm": 11.0625,
|
"learning_rate": 1.0747300935864245e-05, |
|
"loss": 1.0661, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.530962962962963, |
|
"step": 224, |
|
"train/combined_loss": 0.06530413264408708, |
|
"train/cross_entropy_loss": 0.021294296951964498, |
|
"train/kl_divergence_loss": 0.10931396484375, |
|
"train/step_duration_seconds": 7.174778938293457, |
|
"train/steps_per_hour": 316.211376709367, |
|
"train/total_elapsed_hours": 0.7083869098292457 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 8.875, |
|
"learning_rate": 1.0664397558732245e-05, |
|
"loss": 1.0449, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"step": 225, |
|
"train/combined_loss": 0.0637249075807631, |
|
"train/cross_entropy_loss": 0.020424663205631077, |
|
"train/kl_divergence_loss": 0.107025146484375, |
|
"train/step_duration_seconds": 7.199081659317017, |
|
"train/steps_per_hour": 316.72892290474715, |
|
"train/total_elapsed_hours": 0.7103866547346115 |
|
}, |
|
{ |
|
"epoch": 0.5357037037037037, |
|
"grad_norm": 9.5, |
|
"learning_rate": 1.0581448289104759e-05, |
|
"loss": 1.0196, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.5357037037037037, |
|
"step": 226, |
|
"train/combined_loss": 0.06304276920855045, |
|
"train/cross_entropy_loss": 0.019396084127947688, |
|
"train/kl_divergence_loss": 0.106689453125, |
|
"train/step_duration_seconds": 7.180125951766968, |
|
"train/steps_per_hour": 317.2459083502985, |
|
"train/total_elapsed_hours": 0.7123811341656578 |
|
}, |
|
{ |
|
"epoch": 0.538074074074074, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 1.0498458856606972e-05, |
|
"loss": 1.0087, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.538074074074074, |
|
"step": 227, |
|
"train/combined_loss": 0.06372014014050364, |
|
"train/cross_entropy_loss": 0.019560642424039543, |
|
"train/kl_divergence_loss": 0.107879638671875, |
|
"train/step_duration_seconds": 7.176924228668213, |
|
"train/steps_per_hour": 317.76040263033815, |
|
"train/total_elapsed_hours": 0.7143747242291768 |
|
}, |
|
{ |
|
"epoch": 0.5404444444444444, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 1.0415434993638269e-05, |
|
"loss": 1.0195, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.5404444444444444, |
|
"step": 228, |
|
"train/combined_loss": 0.06290752394124866, |
|
"train/cross_entropy_loss": 0.019766463432461023, |
|
"train/kl_divergence_loss": 0.106048583984375, |
|
"train/step_duration_seconds": 7.157245635986328, |
|
"train/steps_per_hour": 318.27446193106454, |
|
"train/total_elapsed_hours": 0.7163628480169508 |
|
}, |
|
{ |
|
"epoch": 0.5428148148148149, |
|
"grad_norm": 13.3125, |
|
"learning_rate": 1.0332382434976267e-05, |
|
"loss": 1.0065, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.5428148148148149, |
|
"step": 229, |
|
"train/combined_loss": 0.06345422472804785, |
|
"train/cross_entropy_loss": 0.02067675837315619, |
|
"train/kl_divergence_loss": 0.106231689453125, |
|
"train/step_duration_seconds": 7.180465459823608, |
|
"train/steps_per_hour": 318.782813491763, |
|
"train/total_elapsed_hours": 0.7183574217557908 |
|
}, |
|
{ |
|
"epoch": 0.5451851851851852, |
|
"grad_norm": 7.78125, |
|
"learning_rate": 1.0249306917380731e-05, |
|
"loss": 1.0153, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5451851851851852, |
|
"step": 230, |
|
"train/combined_loss": 0.07120905723422766, |
|
"train/cross_entropy_loss": 0.022484031855128706, |
|
"train/kl_divergence_loss": 0.11993408203125, |
|
"train/step_duration_seconds": 7.1848931312561035, |
|
"train/steps_per_hour": 319.287804772737, |
|
"train/total_elapsed_hours": 0.7203532254033619 |
|
}, |
|
{ |
|
"epoch": 0.5475555555555556, |
|
"grad_norm": 10.6875, |
|
"learning_rate": 1.0166214179197265e-05, |
|
"loss": 1.1393, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.5475555555555556, |
|
"step": 231, |
|
"train/combined_loss": 0.0662783239968121, |
|
"train/cross_entropy_loss": 0.021716808201745152, |
|
"train/kl_divergence_loss": 0.11083984375, |
|
"train/step_duration_seconds": 7.186007499694824, |
|
"train/steps_per_hour": 319.7898684983009, |
|
"train/total_elapsed_hours": 0.7223493385977215 |
|
}, |
|
{ |
|
"epoch": 0.5499259259259259, |
|
"grad_norm": 12.9375, |
|
"learning_rate": 1.0083109959960974e-05, |
|
"loss": 1.0605, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.5499259259259259, |
|
"step": 232, |
|
"train/combined_loss": 0.06430092873051763, |
|
"train/cross_entropy_loss": 0.021363087464123964, |
|
"train/kl_divergence_loss": 0.10723876953125, |
|
"train/step_duration_seconds": 7.199544906616211, |
|
"train/steps_per_hour": 320.28750235103695, |
|
"train/total_elapsed_hours": 0.7243492121828927 |
|
}, |
|
{ |
|
"epoch": 0.5522962962962963, |
|
"grad_norm": 11.625, |
|
"learning_rate": 1e-05, |
|
"loss": 1.0288, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.5522962962962963, |
|
"step": 233, |
|
"train/combined_loss": 0.06349486531689763, |
|
"train/cross_entropy_loss": 0.021063214750029147, |
|
"train/kl_divergence_loss": 0.105926513671875, |
|
"train/step_duration_seconds": 7.180714845657349, |
|
"train/steps_per_hour": 320.78470593967154, |
|
"train/total_elapsed_hours": 0.7263438551955753 |
|
}, |
|
{ |
|
"epoch": 0.5546666666666666, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 9.916890040039031e-06, |
|
"loss": 1.0159, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.5546666666666666, |
|
"step": 234, |
|
"train/combined_loss": 0.0634385438170284, |
|
"train/cross_entropy_loss": 0.02043177606537938, |
|
"train/kl_divergence_loss": 0.1064453125, |
|
"train/step_duration_seconds": 7.174914836883545, |
|
"train/steps_per_hour": 321.2798969078875, |
|
"train/total_elapsed_hours": 0.7283368870947096 |
|
}, |
|
{ |
|
"epoch": 0.557037037037037, |
|
"grad_norm": 7.46875, |
|
"learning_rate": 9.833785820802739e-06, |
|
"loss": 1.015, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.557037037037037, |
|
"step": 235, |
|
"train/combined_loss": 0.06511542480438948, |
|
"train/cross_entropy_loss": 0.01987928501330316, |
|
"train/kl_divergence_loss": 0.1103515625, |
|
"train/step_duration_seconds": 7.176663398742676, |
|
"train/steps_per_hour": 321.77217117883936, |
|
"train/total_elapsed_hours": 0.7303304047054715 |
|
}, |
|
{ |
|
"epoch": 0.5594074074074074, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 9.750693082619274e-06, |
|
"loss": 1.0418, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.5594074074074074, |
|
"step": 236, |
|
"train/combined_loss": 0.06252926005981863, |
|
"train/cross_entropy_loss": 0.019498219480738044, |
|
"train/kl_divergence_loss": 0.105560302734375, |
|
"train/step_duration_seconds": 7.1978747844696045, |
|
"train/steps_per_hour": 322.2591725337913, |
|
"train/total_elapsed_hours": 0.7323298143678242 |
|
}, |
|
{ |
|
"epoch": 0.5617777777777778, |
|
"grad_norm": 7.125, |
|
"learning_rate": 9.667617565023734e-06, |
|
"loss": 1.0005, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.5617777777777778, |
|
"step": 237, |
|
"train/combined_loss": 0.06657313695177436, |
|
"train/cross_entropy_loss": 0.020200716448016465, |
|
"train/kl_divergence_loss": 0.112945556640625, |
|
"train/step_duration_seconds": 7.176342487335205, |
|
"train/steps_per_hour": 322.7461507067669, |
|
"train/total_elapsed_hours": 0.7343232428365284 |
|
}, |
|
{ |
|
"epoch": 0.5641481481481482, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 9.584565006361735e-06, |
|
"loss": 1.0652, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.5641481481481482, |
|
"step": 238, |
|
"train/combined_loss": 0.06469497783109546, |
|
"train/cross_entropy_loss": 0.020838933647610247, |
|
"train/kl_divergence_loss": 0.108551025390625, |
|
"train/step_duration_seconds": 7.179325580596924, |
|
"train/steps_per_hour": 323.23012833277306, |
|
"train/total_elapsed_hours": 0.7363174999422497 |
|
}, |
|
{ |
|
"epoch": 0.5665185185185185, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 9.501541143393028e-06, |
|
"loss": 1.0351, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.5665185185185185, |
|
"step": 239, |
|
"train/combined_loss": 0.06340441107749939, |
|
"train/cross_entropy_loss": 0.021584213944151998, |
|
"train/kl_divergence_loss": 0.105224609375, |
|
"train/step_duration_seconds": 7.157347917556763, |
|
"train/steps_per_hour": 323.7141681182355, |
|
"train/total_elapsed_hours": 0.7383056521415711 |
|
}, |
|
{ |
|
"epoch": 0.5688888888888889, |
|
"grad_norm": 12.8125, |
|
"learning_rate": 9.418551710895243e-06, |
|
"loss": 1.0145, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5688888888888889, |
|
"step": 240, |
|
"train/combined_loss": 0.06330876937136054, |
|
"train/cross_entropy_loss": 0.02142344566527754, |
|
"train/kl_divergence_loss": 0.105194091796875, |
|
"train/step_duration_seconds": 7.197352170944214, |
|
"train/steps_per_hour": 324.1907416897731, |
|
"train/total_elapsed_hours": 0.7403049166335 |
|
}, |
|
{ |
|
"epoch": 0.5712592592592592, |
|
"grad_norm": 14.3125, |
|
"learning_rate": 9.33560244126776e-06, |
|
"loss": 1.0129, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.5712592592592592, |
|
"step": 241, |
|
"train/combined_loss": 0.0631729164160788, |
|
"train/cross_entropy_loss": 0.021395884454250336, |
|
"train/kl_divergence_loss": 0.104949951171875, |
|
"train/step_duration_seconds": 7.1834022998809814, |
|
"train/steps_per_hour": 324.666442947215, |
|
"train/total_elapsed_hours": 0.7423003061612448 |
|
}, |
|
{ |
|
"epoch": 0.5736296296296296, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 9.252699064135759e-06, |
|
"loss": 1.0108, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.5736296296296296, |
|
"step": 242, |
|
"train/combined_loss": 0.06237048772163689, |
|
"train/cross_entropy_loss": 0.020492929965257645, |
|
"train/kl_divergence_loss": 0.104248046875, |
|
"train/step_duration_seconds": 7.1805572509765625, |
|
"train/steps_per_hour": 325.13993881325246, |
|
"train/total_elapsed_hours": 0.7442949053976271 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 9.169847305954448e-06, |
|
"loss": 0.9979, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"step": 243, |
|
"train/combined_loss": 0.0639663846231997, |
|
"train/cross_entropy_loss": 0.020327785867266357, |
|
"train/kl_divergence_loss": 0.10760498046875, |
|
"train/step_duration_seconds": 7.17130970954895, |
|
"train/steps_per_hour": 325.61202443655657, |
|
"train/total_elapsed_hours": 0.7462869358725018 |
|
}, |
|
{ |
|
"epoch": 0.5783703703703704, |
|
"grad_norm": 12.6875, |
|
"learning_rate": 9.087052889613519e-06, |
|
"loss": 1.0235, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.5783703703703704, |
|
"step": 244, |
|
"train/combined_loss": 0.06497443979606032, |
|
"train/cross_entropy_loss": 0.020543363760225475, |
|
"train/kl_divergence_loss": 0.109405517578125, |
|
"train/step_duration_seconds": 7.176990032196045, |
|
"train/steps_per_hour": 326.0809089404372, |
|
"train/total_elapsed_hours": 0.7482805442147785 |
|
}, |
|
{ |
|
"epoch": 0.5807407407407408, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 9.004321534041836e-06, |
|
"loss": 1.0396, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.5807407407407408, |
|
"step": 245, |
|
"train/combined_loss": 0.06485264329239726, |
|
"train/cross_entropy_loss": 0.020574429305270314, |
|
"train/kl_divergence_loss": 0.109130859375, |
|
"train/step_duration_seconds": 7.177663564682007, |
|
"train/steps_per_hour": 326.54722020012497, |
|
"train/total_elapsed_hours": 0.7502743396494124 |
|
}, |
|
{ |
|
"epoch": 0.5831111111111111, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 8.921658953812416e-06, |
|
"loss": 1.0376, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.5831111111111111, |
|
"step": 246, |
|
"train/combined_loss": 0.06371145462617278, |
|
"train/cross_entropy_loss": 0.020306207472458482, |
|
"train/kl_divergence_loss": 0.10711669921875, |
|
"train/step_duration_seconds": 7.178081512451172, |
|
"train/steps_per_hour": 327.0110091897602, |
|
"train/total_elapsed_hours": 0.7522682511806488 |
|
}, |
|
{ |
|
"epoch": 0.5854814814814815, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 8.839070858747697e-06, |
|
"loss": 1.0194, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.5854814814814815, |
|
"step": 247, |
|
"train/combined_loss": 0.061982935993000865, |
|
"train/cross_entropy_loss": 0.02035869611427188, |
|
"train/kl_divergence_loss": 0.103607177734375, |
|
"train/step_duration_seconds": 7.1770946979522705, |
|
"train/steps_per_hour": 327.47246511357855, |
|
"train/total_elapsed_hours": 0.7542618885967467 |
|
}, |
|
{ |
|
"epoch": 0.5878518518518518, |
|
"grad_norm": 5.625, |
|
"learning_rate": 8.756562953525151e-06, |
|
"loss": 0.9917, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.5878518518518518, |
|
"step": 248, |
|
"train/combined_loss": 0.06333467178046703, |
|
"train/cross_entropy_loss": 0.020956451655365527, |
|
"train/kl_divergence_loss": 0.105712890625, |
|
"train/step_duration_seconds": 7.17914080619812, |
|
"train/steps_per_hour": 327.9312416048011, |
|
"train/total_elapsed_hours": 0.7562560943762461 |
|
}, |
|
{ |
|
"epoch": 0.5902222222222222, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 8.674140937283208e-06, |
|
"loss": 1.0134, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.5902222222222222, |
|
"step": 249, |
|
"train/combined_loss": 0.06772775668650866, |
|
"train/cross_entropy_loss": 0.02138080890290439, |
|
"train/kl_divergence_loss": 0.11407470703125, |
|
"train/step_duration_seconds": 7.154328107833862, |
|
"train/steps_per_hour": 328.3905899588576, |
|
"train/total_elapsed_hours": 0.7582434077395334 |
|
}, |
|
{ |
|
"epoch": 0.5925925925925926, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 8.591810503227634e-06, |
|
"loss": 1.0836, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5925925925925926, |
|
"eval_combined_loss": 0.06404994005337358, |
|
"eval_cross_entropy_loss": 0.020620713440080485, |
|
"eval_kl_divergence_loss": 0.10747916666666667, |
|
"eval_loss": 0.06404994428157806, |
|
"eval_runtime": 220.1998, |
|
"eval_samples_per_second": 6.812, |
|
"eval_steps_per_second": 3.406, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5925925925925926, |
|
"step": 250, |
|
"train/combined_loss": 0.06369326380081475, |
|
"train/cross_entropy_loss": 0.020636040600948036, |
|
"train/kl_divergence_loss": 0.10675048828125, |
|
"train/step_duration_seconds": 227.41642260551453, |
|
"train/steps_per_hour": 304.35298930640323, |
|
"train/total_elapsed_hours": 0.8214146362410651 |
|
}, |
|
{ |
|
"epoch": 0.5949629629629629, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 8.509577338238255e-06, |
|
"loss": 1.0191, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.5949629629629629, |
|
"step": 251, |
|
"train/combined_loss": 0.0633452923502773, |
|
"train/cross_entropy_loss": 0.020184239139780402, |
|
"train/kl_divergence_loss": 0.10650634765625, |
|
"train/step_duration_seconds": 7.193126916885376, |
|
"train/steps_per_hour": 304.82890506058317, |
|
"train/total_elapsed_hours": 0.8234127270513111 |
|
}, |
|
{ |
|
"epoch": 0.5973333333333334, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 8.427447122476148e-06, |
|
"loss": 1.0135, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.5973333333333334, |
|
"step": 252, |
|
"train/combined_loss": 0.0627688483800739, |
|
"train/cross_entropy_loss": 0.020252052345313132, |
|
"train/kl_divergence_loss": 0.10528564453125, |
|
"train/step_duration_seconds": 7.173017740249634, |
|
"train/steps_per_hour": 305.3045828117105, |
|
"train/total_elapsed_hours": 0.8254052319791582 |
|
}, |
|
{ |
|
"epoch": 0.5997037037037037, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 8.34542552899129e-06, |
|
"loss": 1.0043, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.5997037037037037, |
|
"step": 253, |
|
"train/combined_loss": 0.06621215213090181, |
|
"train/cross_entropy_loss": 0.02149291045498103, |
|
"train/kl_divergence_loss": 0.110931396484375, |
|
"train/step_duration_seconds": 7.1814353466033936, |
|
"train/steps_per_hour": 305.77710542260405, |
|
"train/total_elapsed_hours": 0.8274000751309925 |
|
}, |
|
{ |
|
"epoch": 0.6020740740740741, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 8.263518223330698e-06, |
|
"loss": 1.0594, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.6020740740740741, |
|
"step": 254, |
|
"train/combined_loss": 0.06436787801794708, |
|
"train/cross_entropy_loss": 0.021680091507732868, |
|
"train/kl_divergence_loss": 0.1070556640625, |
|
"train/step_duration_seconds": 7.1533849239349365, |
|
"train/steps_per_hour": 306.25023211085755, |
|
"train/total_elapsed_hours": 0.8293871264987521 |
|
}, |
|
{ |
|
"epoch": 0.6044444444444445, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 8.181730863147094e-06, |
|
"loss": 1.0299, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.6044444444444445, |
|
"step": 255, |
|
"train/combined_loss": 0.0639684284105897, |
|
"train/cross_entropy_loss": 0.020576014067046344, |
|
"train/kl_divergence_loss": 0.10736083984375, |
|
"train/step_duration_seconds": 7.142830848693848, |
|
"train/steps_per_hour": 306.7221787763317, |
|
"train/total_elapsed_hours": 0.8313712461789449 |
|
}, |
|
{ |
|
"epoch": 0.6068148148148148, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 8.100069097808103e-06, |
|
"loss": 1.0235, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6068148148148148, |
|
"step": 256, |
|
"train/combined_loss": 0.062460833229124546, |
|
"train/cross_entropy_loss": 0.020459996070712805, |
|
"train/kl_divergence_loss": 0.104461669921875, |
|
"train/step_duration_seconds": 7.155627012252808, |
|
"train/steps_per_hour": 307.1905678921464, |
|
"train/total_elapsed_hours": 0.8333589203490152 |
|
}, |
|
{ |
|
"epoch": 0.6091851851851852, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 8.018538568006027e-06, |
|
"loss": 0.9994, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.6091851851851852, |
|
"step": 257, |
|
"train/combined_loss": 0.06057729944586754, |
|
"train/cross_entropy_loss": 0.020568661391735077, |
|
"train/kl_divergence_loss": 0.1005859375, |
|
"train/step_duration_seconds": 7.156771659851074, |
|
"train/steps_per_hour": 307.6566108780016, |
|
"train/total_elapsed_hours": 0.8353469124767515 |
|
}, |
|
{ |
|
"epoch": 0.6115555555555555, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 7.937144905368226e-06, |
|
"loss": 0.9692, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.6115555555555555, |
|
"step": 258, |
|
"train/combined_loss": 0.06223560217767954, |
|
"train/cross_entropy_loss": 0.020253673777915537, |
|
"train/kl_divergence_loss": 0.104217529296875, |
|
"train/step_duration_seconds": 7.184788942337036, |
|
"train/steps_per_hour": 308.11757712721027, |
|
"train/total_elapsed_hours": 0.8373426871829562 |
|
}, |
|
{ |
|
"epoch": 0.6139259259259259, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 7.855893732068124e-06, |
|
"loss": 0.9958, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.6139259259259259, |
|
"step": 259, |
|
"train/combined_loss": 0.06394938984885812, |
|
"train/cross_entropy_loss": 0.020690529490821064, |
|
"train/kl_divergence_loss": 0.107208251953125, |
|
"train/step_duration_seconds": 7.194268226623535, |
|
"train/steps_per_hour": 308.5753831613492, |
|
"train/total_elapsed_hours": 0.839341095023685 |
|
}, |
|
{ |
|
"epoch": 0.6162962962962963, |
|
"grad_norm": 2.875, |
|
"learning_rate": 7.774790660436857e-06, |
|
"loss": 1.0232, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6162962962962963, |
|
"step": 260, |
|
"train/combined_loss": 0.060645608929917216, |
|
"train/cross_entropy_loss": 0.020613725995644927, |
|
"train/kl_divergence_loss": 0.100677490234375, |
|
"train/step_duration_seconds": 7.176680564880371, |
|
"train/steps_per_hour": 309.0328088515058, |
|
"train/total_elapsed_hours": 0.8413346174028185 |
|
}, |
|
{ |
|
"epoch": 0.6186666666666667, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 7.6938412925756e-06, |
|
"loss": 0.9703, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.6186666666666667, |
|
"step": 261, |
|
"train/combined_loss": 0.06048685312271118, |
|
"train/cross_entropy_loss": 0.02075397619046271, |
|
"train/kl_divergence_loss": 0.1002197265625, |
|
"train/step_duration_seconds": 7.175957441329956, |
|
"train/steps_per_hour": 309.48814566254214, |
|
"train/total_elapsed_hours": 0.843327938914299 |
|
}, |
|
{ |
|
"epoch": 0.621037037037037, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 7.613051219968624e-06, |
|
"loss": 0.9678, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.621037037037037, |
|
"step": 262, |
|
"train/combined_loss": 0.06825654301792383, |
|
"train/cross_entropy_loss": 0.02109560859389603, |
|
"train/kl_divergence_loss": 0.11541748046875, |
|
"train/step_duration_seconds": 7.180307388305664, |
|
"train/steps_per_hour": 309.94089201150274, |
|
"train/total_elapsed_hours": 0.8453224687443839 |
|
}, |
|
{ |
|
"epoch": 0.6234074074074074, |
|
"grad_norm": 2.875, |
|
"learning_rate": 7.532426023097063e-06, |
|
"loss": 1.0921, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.6234074074074074, |
|
"step": 263, |
|
"train/combined_loss": 0.06100269500166178, |
|
"train/cross_entropy_loss": 0.020168230053968728, |
|
"train/kl_divergence_loss": 0.101837158203125, |
|
"train/step_duration_seconds": 7.166715145111084, |
|
"train/steps_per_hour": 310.39288999163625, |
|
"train/total_elapsed_hours": 0.8473132229513592 |
|
}, |
|
{ |
|
"epoch": 0.6257777777777778, |
|
"grad_norm": 5.75, |
|
"learning_rate": 7.451971271053455e-06, |
|
"loss": 0.976, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.6257777777777778, |
|
"step": 264, |
|
"train/combined_loss": 0.06548905186355114, |
|
"train/cross_entropy_loss": 0.021114823641255498, |
|
"train/kl_divergence_loss": 0.10986328125, |
|
"train/step_duration_seconds": 7.166426181793213, |
|
"train/steps_per_hour": 310.8427983981813, |
|
"train/total_elapsed_hours": 0.8493038968907463 |
|
}, |
|
{ |
|
"epoch": 0.6281481481481481, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 7.371692521157048e-06, |
|
"loss": 1.0478, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.6281481481481481, |
|
"step": 265, |
|
"train/combined_loss": 0.06573160830885172, |
|
"train/cross_entropy_loss": 0.021050618845038116, |
|
"train/kl_divergence_loss": 0.11041259765625, |
|
"train/step_duration_seconds": 7.166961431503296, |
|
"train/steps_per_hour": 311.2905482984119, |
|
"train/total_elapsed_hours": 0.8512947195106082 |
|
}, |
|
{ |
|
"epoch": 0.6305185185185185, |
|
"grad_norm": 5.25, |
|
"learning_rate": 7.291595318569951e-06, |
|
"loss": 1.0517, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.6305185185185185, |
|
"step": 266, |
|
"train/combined_loss": 0.06105546629987657, |
|
"train/cross_entropy_loss": 0.020487397676333785, |
|
"train/kl_divergence_loss": 0.10162353515625, |
|
"train/step_duration_seconds": 7.191303968429565, |
|
"train/steps_per_hour": 311.733738567441, |
|
"train/total_elapsed_hours": 0.8532923039462831 |
|
}, |
|
{ |
|
"epoch": 0.6328888888888888, |
|
"grad_norm": 5.125, |
|
"learning_rate": 7.2116851959140965e-06, |
|
"loss": 0.9769, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.6328888888888888, |
|
"step": 267, |
|
"train/combined_loss": 0.06089310604147613, |
|
"train/cross_entropy_loss": 0.020650955964811146, |
|
"train/kl_divergence_loss": 0.10113525390625, |
|
"train/step_duration_seconds": 7.178247451782227, |
|
"train/steps_per_hour": 312.17618240392204, |
|
"train/total_elapsed_hours": 0.8552862615717782 |
|
}, |
|
{ |
|
"epoch": 0.6352592592592593, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 7.131967672889101e-06, |
|
"loss": 0.9743, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.6352592592592593, |
|
"step": 268, |
|
"train/combined_loss": 0.06254504946991801, |
|
"train/cross_entropy_loss": 0.02102515858132392, |
|
"train/kl_divergence_loss": 0.10406494140625, |
|
"train/step_duration_seconds": 7.1988208293914795, |
|
"train/steps_per_hour": 312.6144841102412, |
|
"train/total_elapsed_hours": 0.8572859340243869 |
|
}, |
|
{ |
|
"epoch": 0.6376296296296297, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 7.052448255890958e-06, |
|
"loss": 1.0007, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.6376296296296297, |
|
"step": 269, |
|
"train/combined_loss": 0.06616902281530201, |
|
"train/cross_entropy_loss": 0.02134561410639435, |
|
"train/kl_divergence_loss": 0.110992431640625, |
|
"train/step_duration_seconds": 7.174272060394287, |
|
"train/steps_per_hour": 313.05323016517417, |
|
"train/total_elapsed_hours": 0.8592787873744965 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 6.973132437631743e-06, |
|
"loss": 1.0587, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"step": 270, |
|
"train/combined_loss": 0.0661325603723526, |
|
"train/cross_entropy_loss": 0.021059064893051982, |
|
"train/kl_divergence_loss": 0.1112060546875, |
|
"train/step_duration_seconds": 7.177139759063721, |
|
"train/steps_per_hour": 313.48965589105467, |
|
"train/total_elapsed_hours": 0.8612724373075697 |
|
}, |
|
{ |
|
"epoch": 0.6423703703703704, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 6.8940256967601625e-06, |
|
"loss": 1.0581, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.6423703703703704, |
|
"step": 271, |
|
"train/combined_loss": 0.06747469631955028, |
|
"train/cross_entropy_loss": 0.021271413774229586, |
|
"train/kl_divergence_loss": 0.113677978515625, |
|
"train/step_duration_seconds": 7.159501552581787, |
|
"train/steps_per_hour": 313.92584752922323, |
|
"train/total_elapsed_hours": 0.8632611877388424 |
|
}, |
|
{ |
|
"epoch": 0.6447407407407407, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 6.815133497483157e-06, |
|
"loss": 1.0796, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.6447407407407407, |
|
"step": 272, |
|
"train/combined_loss": 0.06747178034856915, |
|
"train/cross_entropy_loss": 0.02111299301031977, |
|
"train/kl_divergence_loss": 0.11383056640625, |
|
"train/step_duration_seconds": 7.15754246711731, |
|
"train/steps_per_hour": 314.36023173496596, |
|
"train/total_elapsed_hours": 0.8652493939797083 |
|
}, |
|
{ |
|
"epoch": 0.6471111111111111, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 6.736461289188445e-06, |
|
"loss": 1.0795, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.6471111111111111, |
|
"step": 273, |
|
"train/combined_loss": 0.06073831953108311, |
|
"train/cross_entropy_loss": 0.020249835564754903, |
|
"train/kl_divergence_loss": 0.101226806640625, |
|
"train/step_duration_seconds": 7.178404092788696, |
|
"train/steps_per_hour": 314.7905207894922, |
|
"train/total_elapsed_hours": 0.8672433951165941 |
|
}, |
|
{ |
|
"epoch": 0.6494814814814814, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 6.6580145060681255e-06, |
|
"loss": 0.9718, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.6494814814814814, |
|
"step": 274, |
|
"train/combined_loss": 0.06485259486362338, |
|
"train/cross_entropy_loss": 0.020971058984287083, |
|
"train/kl_divergence_loss": 0.108734130859375, |
|
"train/step_duration_seconds": 7.158320426940918, |
|
"train/steps_per_hour": 315.22085880551066, |
|
"train/total_elapsed_hours": 0.8692318174574111 |
|
}, |
|
{ |
|
"epoch": 0.6518518518518519, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 6.579798566743314e-06, |
|
"loss": 1.0376, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.6518518518518519, |
|
"step": 275, |
|
"train/combined_loss": 0.05971498414874077, |
|
"train/cross_entropy_loss": 0.020308872684836388, |
|
"train/kl_divergence_loss": 0.09912109375, |
|
"train/step_duration_seconds": 7.181738376617432, |
|
"train/steps_per_hour": 315.64687567999425, |
|
"train/total_elapsed_hours": 0.8712267447842492 |
|
}, |
|
{ |
|
"epoch": 0.6542222222222223, |
|
"grad_norm": 5.625, |
|
"learning_rate": 6.501818873889856e-06, |
|
"loss": 0.9554, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.6542222222222223, |
|
"step": 276, |
|
"train/combined_loss": 0.06411758903414011, |
|
"train/cross_entropy_loss": 0.020782789448276162, |
|
"train/kl_divergence_loss": 0.107452392578125, |
|
"train/step_duration_seconds": 7.175511598587036, |
|
"train/steps_per_hour": 316.07157210067743, |
|
"train/total_elapsed_hours": 0.8732199424505234 |
|
}, |
|
{ |
|
"epoch": 0.6565925925925926, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 6.424080813865139e-06, |
|
"loss": 1.0259, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.6565925925925926, |
|
"step": 277, |
|
"train/combined_loss": 0.05957591161131859, |
|
"train/cross_entropy_loss": 0.020519012585282326, |
|
"train/kl_divergence_loss": 0.0986328125, |
|
"train/step_duration_seconds": 7.175374507904053, |
|
"train/steps_per_hour": 316.4943478972161, |
|
"train/total_elapsed_hours": 0.8752131020360523 |
|
}, |
|
{ |
|
"epoch": 0.658962962962963, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 6.34658975633605e-06, |
|
"loss": 0.9532, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.658962962962963, |
|
"step": 278, |
|
"train/combined_loss": 0.06180824153125286, |
|
"train/cross_entropy_loss": 0.02071121137123555, |
|
"train/kl_divergence_loss": 0.1029052734375, |
|
"train/step_duration_seconds": 7.177731037139893, |
|
"train/steps_per_hour": 316.9149659696548, |
|
"train/total_elapsed_hours": 0.8772069162130356 |
|
}, |
|
{ |
|
"epoch": 0.6613333333333333, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 6.269351053908061e-06, |
|
"loss": 0.9889, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.6613333333333333, |
|
"step": 279, |
|
"train/combined_loss": 0.062360771000385284, |
|
"train/cross_entropy_loss": 0.020839707227423787, |
|
"train/kl_divergence_loss": 0.1038818359375, |
|
"train/step_duration_seconds": 7.177475214004517, |
|
"train/steps_per_hour": 317.3337019711024, |
|
"train/total_elapsed_hours": 0.8792006593280368 |
|
}, |
|
{ |
|
"epoch": 0.6637037037037037, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 6.192370041755505e-06, |
|
"loss": 0.9978, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.6637037037037037, |
|
"step": 280, |
|
"train/combined_loss": 0.06398251187056303, |
|
"train/cross_entropy_loss": 0.020573665155097842, |
|
"train/kl_divergence_loss": 0.107391357421875, |
|
"train/step_duration_seconds": 7.203123092651367, |
|
"train/steps_per_hour": 317.7479741776063, |
|
"train/total_elapsed_hours": 0.8812015268537733 |
|
}, |
|
{ |
|
"epoch": 0.666074074074074, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 6.115652037253054e-06, |
|
"loss": 1.0237, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.666074074074074, |
|
"step": 281, |
|
"train/combined_loss": 0.05968505213968456, |
|
"train/cross_entropy_loss": 0.02018797560594976, |
|
"train/kl_divergence_loss": 0.09918212890625, |
|
"train/step_duration_seconds": 7.152847766876221, |
|
"train/steps_per_hour": 318.16540023581325, |
|
"train/total_elapsed_hours": 0.883188429011239 |
|
}, |
|
{ |
|
"epoch": 0.6684444444444444, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 6.039202339608432e-06, |
|
"loss": 0.955, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.6684444444444444, |
|
"step": 282, |
|
"train/combined_loss": 0.07058762316592038, |
|
"train/cross_entropy_loss": 0.02191255264915526, |
|
"train/kl_divergence_loss": 0.1192626953125, |
|
"train/step_duration_seconds": 7.178384065628052, |
|
"train/steps_per_hour": 318.578399398522, |
|
"train/total_elapsed_hours": 0.8851824245850245 |
|
}, |
|
{ |
|
"epoch": 0.6708148148148149, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 5.963026229496378e-06, |
|
"loss": 1.1294, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.6708148148148149, |
|
"step": 283, |
|
"train/combined_loss": 0.06238678935915232, |
|
"train/cross_entropy_loss": 0.021471575018949807, |
|
"train/kl_divergence_loss": 0.103302001953125, |
|
"train/step_duration_seconds": 7.157472133636475, |
|
"train/steps_per_hour": 318.9916306953401, |
|
"train/total_elapsed_hours": 0.8871706112888125 |
|
}, |
|
{ |
|
"epoch": 0.6731851851851852, |
|
"grad_norm": 10.6875, |
|
"learning_rate": 5.887128968693887e-06, |
|
"loss": 0.9982, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.6731851851851852, |
|
"step": 284, |
|
"train/combined_loss": 0.06445175595581532, |
|
"train/cross_entropy_loss": 0.021542673697695136, |
|
"train/kl_divergence_loss": 0.10736083984375, |
|
"train/step_duration_seconds": 7.179536581039429, |
|
"train/steps_per_hour": 319.40081235138257, |
|
"train/total_elapsed_hours": 0.8891649270057678 |
|
}, |
|
{ |
|
"epoch": 0.6755555555555556, |
|
"grad_norm": 7.71875, |
|
"learning_rate": 5.811515799716754e-06, |
|
"loss": 1.0312, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.6755555555555556, |
|
"step": 285, |
|
"train/combined_loss": 0.060818693600595, |
|
"train/cross_entropy_loss": 0.020441102096810937, |
|
"train/kl_divergence_loss": 0.1011962890625, |
|
"train/step_duration_seconds": 7.156339406967163, |
|
"train/steps_per_hour": 319.81047503815245, |
|
"train/total_elapsed_hours": 0.8911527990632587 |
|
}, |
|
{ |
|
"epoch": 0.6779259259259259, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 5.736191945457463e-06, |
|
"loss": 0.9731, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.6779259259259259, |
|
"step": 286, |
|
"train/combined_loss": 0.06460838648490608, |
|
"train/cross_entropy_loss": 0.021092993556521833, |
|
"train/kl_divergence_loss": 0.108123779296875, |
|
"train/step_duration_seconds": 7.178438901901245, |
|
"train/steps_per_hour": 320.21611322984927, |
|
"train/total_elapsed_hours": 0.8931468098693424 |
|
}, |
|
{ |
|
"epoch": 0.6802962962962963, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 5.66116260882442e-06, |
|
"loss": 1.0337, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.6802962962962963, |
|
"step": 287, |
|
"train/combined_loss": 0.06716977385804057, |
|
"train/cross_entropy_loss": 0.02099726488813758, |
|
"train/kl_divergence_loss": 0.11334228515625, |
|
"train/step_duration_seconds": 7.17467188835144, |
|
"train/steps_per_hour": 320.6203190221968, |
|
"train/total_elapsed_hours": 0.8951397742827734 |
|
}, |
|
{ |
|
"epoch": 0.6826666666666666, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 5.586432972382561e-06, |
|
"loss": 1.0747, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.6826666666666666, |
|
"step": 288, |
|
"train/combined_loss": 0.06678420398384333, |
|
"train/cross_entropy_loss": 0.02086699299979955, |
|
"train/kl_divergence_loss": 0.112701416015625, |
|
"train/step_duration_seconds": 7.181233882904053, |
|
"train/steps_per_hour": 321.0220766963932, |
|
"train/total_elapsed_hours": 0.897134561472469 |
|
}, |
|
{ |
|
"epoch": 0.685037037037037, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 5.512008197995379e-06, |
|
"loss": 1.0685, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.685037037037037, |
|
"step": 289, |
|
"train/combined_loss": 0.06064820708706975, |
|
"train/cross_entropy_loss": 0.020740994019433856, |
|
"train/kl_divergence_loss": 0.100555419921875, |
|
"train/step_duration_seconds": 7.175872325897217, |
|
"train/steps_per_hour": 321.4225841160013, |
|
"train/total_elapsed_hours": 0.8991278593407737 |
|
}, |
|
{ |
|
"epoch": 0.6874074074074074, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 5.43789342646837e-06, |
|
"loss": 0.9704, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.6874074074074074, |
|
"step": 290, |
|
"train/combined_loss": 0.06774787046015263, |
|
"train/cross_entropy_loss": 0.021848278120160103, |
|
"train/kl_divergence_loss": 0.1136474609375, |
|
"train/step_duration_seconds": 7.154938459396362, |
|
"train/steps_per_hour": 321.8233964113103, |
|
"train/total_elapsed_hours": 0.9011153422461615 |
|
}, |
|
{ |
|
"epoch": 0.6897777777777778, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 5.364093777193944e-06, |
|
"loss": 1.084, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.6897777777777778, |
|
"step": 291, |
|
"train/combined_loss": 0.06057584332302213, |
|
"train/cross_entropy_loss": 0.020870924927294254, |
|
"train/kl_divergence_loss": 0.10028076171875, |
|
"train/step_duration_seconds": 7.165515184402466, |
|
"train/steps_per_hour": 322.2213962971687, |
|
"train/total_elapsed_hours": 0.9031057631307178 |
|
}, |
|
{ |
|
"epoch": 0.6921481481481482, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 5.290614347797802e-06, |
|
"loss": 0.9692, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.6921481481481482, |
|
"step": 292, |
|
"train/combined_loss": 0.0660083363763988, |
|
"train/cross_entropy_loss": 0.02096320828422904, |
|
"train/kl_divergence_loss": 0.111053466796875, |
|
"train/step_duration_seconds": 7.173615455627441, |
|
"train/steps_per_hour": 322.61684365124677, |
|
"train/total_elapsed_hours": 0.9050984340906143 |
|
}, |
|
{ |
|
"epoch": 0.6945185185185185, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 5.217460213786822e-06, |
|
"loss": 1.0561, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.6945185185185185, |
|
"step": 293, |
|
"train/combined_loss": 0.06140920426696539, |
|
"train/cross_entropy_loss": 0.020706592011265457, |
|
"train/kl_divergence_loss": 0.10211181640625, |
|
"train/step_duration_seconds": 7.179595708847046, |
|
"train/steps_per_hour": 323.00996205395205, |
|
"train/total_elapsed_hours": 0.9070927662319607 |
|
}, |
|
{ |
|
"epoch": 0.6968888888888889, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 5.144636428198477e-06, |
|
"loss": 0.9825, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.6968888888888889, |
|
"step": 294, |
|
"train/combined_loss": 0.061046687653288245, |
|
"train/cross_entropy_loss": 0.020591912092640996, |
|
"train/kl_divergence_loss": 0.10150146484375, |
|
"train/step_duration_seconds": 7.179559946060181, |
|
"train/steps_per_hour": 323.40135916435133, |
|
"train/total_elapsed_hours": 0.9090870884391996 |
|
}, |
|
{ |
|
"epoch": 0.6992592592592592, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 5.072148021251822e-06, |
|
"loss": 0.9767, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.6992592592592592, |
|
"step": 295, |
|
"train/combined_loss": 0.06156940385699272, |
|
"train/cross_entropy_loss": 0.020782849984243512, |
|
"train/kl_divergence_loss": 0.10235595703125, |
|
"train/step_duration_seconds": 7.190384149551392, |
|
"train/steps_per_hour": 323.78997420643765, |
|
"train/total_elapsed_hours": 0.9110844173696306 |
|
}, |
|
{ |
|
"epoch": 0.7016296296296296, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 5.000000000000003e-06, |
|
"loss": 0.9851, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7016296296296296, |
|
"step": 296, |
|
"train/combined_loss": 0.05902678519487381, |
|
"train/cross_entropy_loss": 0.02039731852710247, |
|
"train/kl_divergence_loss": 0.09765625, |
|
"train/step_duration_seconds": 7.165556907653809, |
|
"train/steps_per_hour": 324.1793375982033, |
|
"train/total_elapsed_hours": 0.9130748498439789 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 4.92819734798441e-06, |
|
"loss": 0.9444, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"step": 297, |
|
"train/combined_loss": 0.0633976545650512, |
|
"train/cross_entropy_loss": 0.0211129350354895, |
|
"train/kl_divergence_loss": 0.105682373046875, |
|
"train/step_duration_seconds": 7.194438457489014, |
|
"train/steps_per_hour": 324.5641615666989, |
|
"train/total_elapsed_hours": 0.9150733049710592 |
|
}, |
|
{ |
|
"epoch": 0.7063703703703703, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 4.856745024890466e-06, |
|
"loss": 1.0144, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.7063703703703703, |
|
"step": 298, |
|
"train/combined_loss": 0.06398635334335268, |
|
"train/cross_entropy_loss": 0.0211001462303102, |
|
"train/kl_divergence_loss": 0.10687255859375, |
|
"train/step_duration_seconds": 7.178032159805298, |
|
"train/steps_per_hour": 324.9489231458107, |
|
"train/total_elapsed_hours": 0.9170672027932273 |
|
}, |
|
{ |
|
"epoch": 0.7087407407407408, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 4.78564796620502e-06, |
|
"loss": 1.0238, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.7087407407407408, |
|
"step": 299, |
|
"train/combined_loss": 0.058621928095817566, |
|
"train/cross_entropy_loss": 0.020564164966344833, |
|
"train/kl_divergence_loss": 0.0966796875, |
|
"train/step_duration_seconds": 7.176652193069458, |
|
"train/steps_per_hour": 325.33215093908757, |
|
"train/total_elapsed_hours": 0.9190607172913021 |
|
}, |
|
{ |
|
"epoch": 0.7111111111111111, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 4.714911082875446e-06, |
|
"loss": 0.938, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7111111111111111, |
|
"eval_combined_loss": 0.06135369462271532, |
|
"eval_cross_entropy_loss": 0.020845410078763962, |
|
"eval_kl_divergence_loss": 0.10186197916666667, |
|
"eval_loss": 0.06135369837284088, |
|
"eval_runtime": 220.2651, |
|
"eval_samples_per_second": 6.81, |
|
"eval_steps_per_second": 3.405, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7111111111111111, |
|
"step": 300, |
|
"train/combined_loss": 0.06502728187479079, |
|
"train/cross_entropy_loss": 0.02125939668621868, |
|
"train/kl_divergence_loss": 0.108795166015625, |
|
"train/step_duration_seconds": 227.4951696395874, |
|
"train/steps_per_hour": 305.4200389971287, |
|
"train/total_elapsed_hours": 0.9822538199689653 |
|
}, |
|
{ |
|
"epoch": 0.7134814814814815, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 4.644539260970417e-06, |
|
"loss": 1.0404, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.7134814814814815, |
|
"step": 301, |
|
"train/combined_loss": 0.05955993290990591, |
|
"train/cross_entropy_loss": 0.020364978816360235, |
|
"train/kl_divergence_loss": 0.0987548828125, |
|
"train/step_duration_seconds": 7.195945978164673, |
|
"train/steps_per_hour": 305.81577462801977, |
|
"train/total_elapsed_hours": 0.9842526938517888 |
|
}, |
|
{ |
|
"epoch": 0.7158518518518519, |
|
"grad_norm": 2.375, |
|
"learning_rate": 4.5745373613424075e-06, |
|
"loss": 0.953, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.7158518518518519, |
|
"step": 302, |
|
"train/combined_loss": 0.06077713891863823, |
|
"train/cross_entropy_loss": 0.020449545118026435, |
|
"train/kl_divergence_loss": 0.101104736328125, |
|
"train/step_duration_seconds": 7.195575952529907, |
|
"train/steps_per_hour": 306.2099380662498, |
|
"train/total_elapsed_hours": 0.9862514649497138 |
|
}, |
|
{ |
|
"epoch": 0.7182222222222222, |
|
"grad_norm": 4.125, |
|
"learning_rate": 4.504910219291941e-06, |
|
"loss": 0.9724, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.7182222222222222, |
|
"step": 303, |
|
"train/combined_loss": 0.05961341969668865, |
|
"train/cross_entropy_loss": 0.020319371833465993, |
|
"train/kl_divergence_loss": 0.098907470703125, |
|
"train/step_duration_seconds": 7.177309036254883, |
|
"train/steps_per_hour": 306.6040813356026, |
|
"train/total_elapsed_hours": 0.9882451619042291 |
|
}, |
|
{ |
|
"epoch": 0.7205925925925926, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 4.435662644233594e-06, |
|
"loss": 0.9538, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.7205925925925926, |
|
"step": 304, |
|
"train/combined_loss": 0.059920859755948186, |
|
"train/cross_entropy_loss": 0.020384933333843946, |
|
"train/kl_divergence_loss": 0.099456787109375, |
|
"train/step_duration_seconds": 7.177339553833008, |
|
"train/steps_per_hour": 306.9966348805165, |
|
"train/total_elapsed_hours": 0.9902388673358493 |
|
}, |
|
{ |
|
"epoch": 0.7229629629629629, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 4.3667994193637794e-06, |
|
"loss": 0.9587, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.7229629629629629, |
|
"step": 305, |
|
"train/combined_loss": 0.0592358959838748, |
|
"train/cross_entropy_loss": 0.020632435218431056, |
|
"train/kl_divergence_loss": 0.09783935546875, |
|
"train/step_duration_seconds": 7.177339315414429, |
|
"train/steps_per_hour": 307.38761092036276, |
|
"train/total_elapsed_hours": 0.9922325727012422 |
|
}, |
|
{ |
|
"epoch": 0.7253333333333334, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 4.298325301330383e-06, |
|
"loss": 0.9478, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.7253333333333334, |
|
"step": 306, |
|
"train/combined_loss": 0.06081084324978292, |
|
"train/cross_entropy_loss": 0.020883160177618265, |
|
"train/kl_divergence_loss": 0.100738525390625, |
|
"train/step_duration_seconds": 7.180126905441284, |
|
"train/steps_per_hour": 307.776779219795, |
|
"train/total_elapsed_hours": 0.9942270523971981 |
|
}, |
|
{ |
|
"epoch": 0.7277037037037037, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 4.23024501990417e-06, |
|
"loss": 0.973, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.7277037037037037, |
|
"step": 307, |
|
"train/combined_loss": 0.05935222376137972, |
|
"train/cross_entropy_loss": 0.020437843864783645, |
|
"train/kl_divergence_loss": 0.0982666015625, |
|
"train/step_duration_seconds": 7.176982641220093, |
|
"train/steps_per_hour": 308.1646594287626, |
|
"train/total_elapsed_hours": 0.996220658686426 |
|
}, |
|
{ |
|
"epoch": 0.7300740740740741, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 4.162563277652104e-06, |
|
"loss": 0.9496, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.7300740740740741, |
|
"step": 308, |
|
"train/combined_loss": 0.06314325472339988, |
|
"train/cross_entropy_loss": 0.020665171090513468, |
|
"train/kl_divergence_loss": 0.105621337890625, |
|
"train/step_duration_seconds": 7.161325693130493, |
|
"train/steps_per_hour": 308.55233465191134, |
|
"train/total_elapsed_hours": 0.9982099158234067 |
|
}, |
|
{ |
|
"epoch": 0.7324444444444445, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 4.095284749612504e-06, |
|
"loss": 1.0103, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.7324444444444445, |
|
"step": 309, |
|
"train/combined_loss": 0.059852408710867167, |
|
"train/cross_entropy_loss": 0.02018699492327869, |
|
"train/kl_divergence_loss": 0.099517822265625, |
|
"train/step_duration_seconds": 7.173473358154297, |
|
"train/steps_per_hour": 308.93742555494475, |
|
"train/total_elapsed_hours": 1.0002025473117828 |
|
}, |
|
{ |
|
"epoch": 0.7348148148148148, |
|
"grad_norm": 6.5, |
|
"learning_rate": 4.028414082972141e-06, |
|
"loss": 0.9576, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.7348148148148148, |
|
"step": 310, |
|
"train/combined_loss": 0.061497040558606386, |
|
"train/cross_entropy_loss": 0.02069915970787406, |
|
"train/kl_divergence_loss": 0.102294921875, |
|
"train/step_duration_seconds": 7.176191329956055, |
|
"train/steps_per_hour": 309.3207521077788, |
|
"train/total_elapsed_hours": 1.002195933792326 |
|
}, |
|
{ |
|
"epoch": 0.7371851851851852, |
|
"grad_norm": 5.625, |
|
"learning_rate": 3.961955896745224e-06, |
|
"loss": 0.984, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.7371851851851852, |
|
"step": 311, |
|
"train/combined_loss": 0.06270680762827396, |
|
"train/cross_entropy_loss": 0.02095194417051971, |
|
"train/kl_divergence_loss": 0.104461669921875, |
|
"train/step_duration_seconds": 7.178727865219116, |
|
"train/steps_per_hour": 309.7023394964521, |
|
"train/total_elapsed_hours": 1.004190024865998 |
|
}, |
|
{ |
|
"epoch": 0.7395555555555555, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 3.89591478145437e-06, |
|
"loss": 1.0033, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.7395555555555555, |
|
"step": 312, |
|
"train/combined_loss": 0.06055857567116618, |
|
"train/cross_entropy_loss": 0.02050069870892912, |
|
"train/kl_divergence_loss": 0.100616455078125, |
|
"train/step_duration_seconds": 7.1979944705963135, |
|
"train/steps_per_hour": 310.0807650969411, |
|
"train/total_elapsed_hours": 1.0061894677744971 |
|
}, |
|
{ |
|
"epoch": 0.7419259259259259, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 3.830295298813475e-06, |
|
"loss": 0.9689, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.7419259259259259, |
|
"step": 313, |
|
"train/combined_loss": 0.06014604773372412, |
|
"train/cross_entropy_loss": 0.020926860976032913, |
|
"train/kl_divergence_loss": 0.099365234375, |
|
"train/step_duration_seconds": 7.178622245788574, |
|
"train/steps_per_hour": 310.45934677419245, |
|
"train/total_elapsed_hours": 1.0081835295094383 |
|
}, |
|
{ |
|
"epoch": 0.7442962962962963, |
|
"grad_norm": 6.34375, |
|
"learning_rate": 3.7651019814126656e-06, |
|
"loss": 0.9623, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.7442962962962963, |
|
"step": 314, |
|
"train/combined_loss": 0.05981010291725397, |
|
"train/cross_entropy_loss": 0.020895838970318437, |
|
"train/kl_divergence_loss": 0.098724365234375, |
|
"train/step_duration_seconds": 7.296232223510742, |
|
"train/steps_per_hour": 310.8263816166535, |
|
"train/total_elapsed_hours": 1.0102102606826358 |
|
}, |
|
{ |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 9.625, |
|
"learning_rate": 3.7003393324051874e-06, |
|
"loss": 0.957, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.7466666666666667, |
|
"step": 315, |
|
"train/combined_loss": 0.060554551193490624, |
|
"train/cross_entropy_loss": 0.021896454854868352, |
|
"train/kl_divergence_loss": 0.099212646484375, |
|
"train/step_duration_seconds": 7.158878326416016, |
|
"train/steps_per_hour": 311.20367676354846, |
|
"train/total_elapsed_hours": 1.0121988379955291 |
|
}, |
|
{ |
|
"epoch": 0.7490370370370371, |
|
"grad_norm": 7.0625, |
|
"learning_rate": 3.636011825196365e-06, |
|
"loss": 0.9689, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.7490370370370371, |
|
"step": 316, |
|
"train/combined_loss": 0.05863487347960472, |
|
"train/cross_entropy_loss": 0.020590057596564293, |
|
"train/kl_divergence_loss": 0.0966796875, |
|
"train/step_duration_seconds": 7.176958084106445, |
|
"train/steps_per_hour": 311.5779494376515, |
|
"train/total_elapsed_hours": 1.0141924374633364 |
|
}, |
|
{ |
|
"epoch": 0.7514074074074074, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 3.5721239031346067e-06, |
|
"loss": 0.9382, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.7514074074074074, |
|
"step": 317, |
|
"train/combined_loss": 0.059863541973754764, |
|
"train/cross_entropy_loss": 0.020819613593630493, |
|
"train/kl_divergence_loss": 0.098907470703125, |
|
"train/step_duration_seconds": 7.1567628383636475, |
|
"train/steps_per_hour": 311.95247569565794, |
|
"train/total_elapsed_hours": 1.0161804271406598 |
|
}, |
|
{ |
|
"epoch": 0.7537777777777778, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 3.5086799792044812e-06, |
|
"loss": 0.9578, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.7537777777777778, |
|
"step": 318, |
|
"train/combined_loss": 0.06011883169412613, |
|
"train/cross_entropy_loss": 0.02053673774935305, |
|
"train/kl_divergence_loss": 0.099700927734375, |
|
"train/step_duration_seconds": 7.177694797515869, |
|
"train/steps_per_hour": 312.3237558362263, |
|
"train/total_elapsed_hours": 1.0181742312510809 |
|
}, |
|
{ |
|
"epoch": 0.7561481481481481, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 3.4456844357218977e-06, |
|
"loss": 0.9619, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.7561481481481481, |
|
"step": 319, |
|
"train/combined_loss": 0.060971920378506184, |
|
"train/cross_entropy_loss": 0.020503410720266402, |
|
"train/kl_divergence_loss": 0.1014404296875, |
|
"train/step_duration_seconds": 7.183193206787109, |
|
"train/steps_per_hour": 312.69311658008917, |
|
"train/total_elapsed_hours": 1.0201695626974105 |
|
}, |
|
{ |
|
"epoch": 0.7585185185185185, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 3.3831416240314085e-06, |
|
"loss": 0.9756, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.7585185185185185, |
|
"step": 320, |
|
"train/combined_loss": 0.05914177093654871, |
|
"train/cross_entropy_loss": 0.020505218068137765, |
|
"train/kl_divergence_loss": 0.0977783203125, |
|
"train/step_duration_seconds": 7.172834634780884, |
|
"train/steps_per_hour": 313.0619165575121, |
|
"train/total_elapsed_hours": 1.0221620167626275 |
|
}, |
|
{ |
|
"epoch": 0.7608888888888888, |
|
"grad_norm": 7.09375, |
|
"learning_rate": 3.3210558642056277e-06, |
|
"loss": 0.9463, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.7608888888888888, |
|
"step": 321, |
|
"train/combined_loss": 0.06318964948877692, |
|
"train/cross_entropy_loss": 0.02078847971279174, |
|
"train/kl_divergence_loss": 0.1055908203125, |
|
"train/step_duration_seconds": 7.178191423416138, |
|
"train/steps_per_hour": 313.4288261803181, |
|
"train/total_elapsed_hours": 1.0241559588246876 |
|
}, |
|
{ |
|
"epoch": 0.7632592592592593, |
|
"grad_norm": 5.625, |
|
"learning_rate": 3.2594314447468457e-06, |
|
"loss": 1.011, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.7632592592592593, |
|
"step": 322, |
|
"train/combined_loss": 0.06236264854669571, |
|
"train/cross_entropy_loss": 0.02066035382449627, |
|
"train/kl_divergence_loss": 0.10406494140625, |
|
"train/step_duration_seconds": 7.178012847900391, |
|
"train/steps_per_hour": 313.7943250662448, |
|
"train/total_elapsed_hours": 1.0261498512824376 |
|
}, |
|
{ |
|
"epoch": 0.7656296296296297, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 3.1982726222908046e-06, |
|
"loss": 0.9978, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.7656296296296297, |
|
"step": 323, |
|
"train/combined_loss": 0.06084095500409603, |
|
"train/cross_entropy_loss": 0.02054665272589773, |
|
"train/kl_divergence_loss": 0.10113525390625, |
|
"train/step_duration_seconds": 7.176525115966797, |
|
"train/steps_per_hour": 314.1585325936901, |
|
"train/total_elapsed_hours": 1.0281433304813172 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 3.1375836213126653e-06, |
|
"loss": 0.9735, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"step": 324, |
|
"train/combined_loss": 0.06315464107319713, |
|
"train/cross_entropy_loss": 0.020748980692587793, |
|
"train/kl_divergence_loss": 0.105560302734375, |
|
"train/step_duration_seconds": 7.15761399269104, |
|
"train/steps_per_hour": 314.52293440305624, |
|
"train/total_elapsed_hours": 1.0301315565903981 |
|
}, |
|
{ |
|
"epoch": 0.7703703703703704, |
|
"grad_norm": 8.0, |
|
"learning_rate": 3.077368633835205e-06, |
|
"loss": 1.0105, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.7703703703703704, |
|
"step": 325, |
|
"train/combined_loss": 0.05943355988711119, |
|
"train/cross_entropy_loss": 0.020478446152992547, |
|
"train/kl_divergence_loss": 0.098388671875, |
|
"train/step_duration_seconds": 7.163183212280273, |
|
"train/steps_per_hour": 314.88546031009383, |
|
"train/total_elapsed_hours": 1.0321213297049205 |
|
}, |
|
{ |
|
"epoch": 0.7727407407407407, |
|
"grad_norm": 7.96875, |
|
"learning_rate": 3.017631819139273e-06, |
|
"loss": 0.9509, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.7727407407407407, |
|
"step": 326, |
|
"train/combined_loss": 0.06256748456507921, |
|
"train/cross_entropy_loss": 0.020612266613170505, |
|
"train/kl_divergence_loss": 0.104522705078125, |
|
"train/step_duration_seconds": 7.17126202583313, |
|
"train/steps_per_hour": 315.24590700472396, |
|
"train/total_elapsed_hours": 1.0341133469343184 |
|
}, |
|
{ |
|
"epoch": 0.7751111111111111, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 2.958377303476483e-06, |
|
"loss": 1.0011, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.7751111111111111, |
|
"step": 327, |
|
"train/combined_loss": 0.059798732632771134, |
|
"train/cross_entropy_loss": 0.020720509812235832, |
|
"train/kl_divergence_loss": 0.098876953125, |
|
"train/step_duration_seconds": 7.176945686340332, |
|
"train/steps_per_hour": 315.60448679780745, |
|
"train/total_elapsed_hours": 1.0361069429583019 |
|
}, |
|
{ |
|
"epoch": 0.7774814814814814, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 2.8996091797841976e-06, |
|
"loss": 0.9568, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.7774814814814814, |
|
"step": 328, |
|
"train/combined_loss": 0.058641964104026556, |
|
"train/cross_entropy_loss": 0.020512689370661974, |
|
"train/kl_divergence_loss": 0.096771240234375, |
|
"train/step_duration_seconds": 7.181287527084351, |
|
"train/steps_per_hour": 315.96132225407996, |
|
"train/total_elapsed_hours": 1.0381017450491588 |
|
}, |
|
{ |
|
"epoch": 0.7798518518518519, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 2.8413315074028157e-06, |
|
"loss": 0.9383, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.7798518518518519, |
|
"step": 329, |
|
"train/combined_loss": 0.05962651362642646, |
|
"train/cross_entropy_loss": 0.02113901753909886, |
|
"train/kl_divergence_loss": 0.098114013671875, |
|
"train/step_duration_seconds": 7.1774678230285645, |
|
"train/steps_per_hour": 316.31711164339544, |
|
"train/total_elapsed_hours": 1.0400954861111111 |
|
}, |
|
{ |
|
"epoch": 0.7822222222222223, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 2.783548311795379e-06, |
|
"loss": 0.954, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.7822222222222223, |
|
"step": 330, |
|
"train/combined_loss": 0.059158258605748415, |
|
"train/cross_entropy_loss": 0.020599229959771037, |
|
"train/kl_divergence_loss": 0.09771728515625, |
|
"train/step_duration_seconds": 7.179843902587891, |
|
"train/steps_per_hour": 316.67133906098195, |
|
"train/total_elapsed_hours": 1.0420898871951634 |
|
}, |
|
{ |
|
"epoch": 0.7845925925925926, |
|
"grad_norm": 3.875, |
|
"learning_rate": 2.726263584269513e-06, |
|
"loss": 0.9465, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.7845925925925926, |
|
"step": 331, |
|
"train/combined_loss": 0.05943922000005841, |
|
"train/cross_entropy_loss": 0.020550800720229745, |
|
"train/kl_divergence_loss": 0.09832763671875, |
|
"train/step_duration_seconds": 7.175478458404541, |
|
"train/steps_per_hour": 317.02458139366485, |
|
"train/total_elapsed_hours": 1.0440830756558312 |
|
}, |
|
{ |
|
"epoch": 0.786962962962963, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 2.669481281701739e-06, |
|
"loss": 0.951, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.786962962962963, |
|
"step": 332, |
|
"train/combined_loss": 0.06072757695801556, |
|
"train/cross_entropy_loss": 0.020319899427704513, |
|
"train/kl_divergence_loss": 0.10113525390625, |
|
"train/step_duration_seconds": 7.182747840881348, |
|
"train/steps_per_hour": 317.3758649537043, |
|
"train/total_elapsed_hours": 1.0460782833894093 |
|
}, |
|
{ |
|
"epoch": 0.7893333333333333, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 2.6132053262641467e-06, |
|
"loss": 0.9716, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.7893333333333333, |
|
"step": 333, |
|
"train/combined_loss": 0.05954708158969879, |
|
"train/cross_entropy_loss": 0.02046134858392179, |
|
"train/kl_divergence_loss": 0.0986328125, |
|
"train/step_duration_seconds": 7.171588897705078, |
|
"train/steps_per_hour": 317.72675072895083, |
|
"train/total_elapsed_hours": 1.0480703914165497 |
|
}, |
|
{ |
|
"epoch": 0.7917037037037037, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 2.5574396051534835e-06, |
|
"loss": 0.9528, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.7917037037037037, |
|
"step": 334, |
|
"train/combined_loss": 0.05950666521675885, |
|
"train/cross_entropy_loss": 0.020380519214086235, |
|
"train/kl_divergence_loss": 0.0986328125, |
|
"train/step_duration_seconds": 7.180173397064209, |
|
"train/steps_per_hour": 318.0755828336615, |
|
"train/total_elapsed_hours": 1.0500648840268454 |
|
}, |
|
{ |
|
"epoch": 0.794074074074074, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 2.502187970322657e-06, |
|
"loss": 0.9521, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.794074074074074, |
|
"step": 335, |
|
"train/combined_loss": 0.06000912608578801, |
|
"train/cross_entropy_loss": 0.020530943875201046, |
|
"train/kl_divergence_loss": 0.0994873046875, |
|
"train/step_duration_seconds": 7.176333665847778, |
|
"train/steps_per_hour": 318.4234151295545, |
|
"train/total_elapsed_hours": 1.0520583100451364 |
|
}, |
|
{ |
|
"epoch": 0.7964444444444444, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 2.447454238214654e-06, |
|
"loss": 0.9601, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.7964444444444444, |
|
"step": 336, |
|
"train/combined_loss": 0.061862445436418056, |
|
"train/cross_entropy_loss": 0.020911171450279653, |
|
"train/kl_divergence_loss": 0.102813720703125, |
|
"train/step_duration_seconds": 7.18380331993103, |
|
"train/steps_per_hour": 318.7693042840402, |
|
"train/total_elapsed_hours": 1.0540538109673394 |
|
}, |
|
{ |
|
"epoch": 0.7988148148148149, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 2.3932421894989167e-06, |
|
"loss": 0.9898, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.7988148148148149, |
|
"step": 337, |
|
"train/combined_loss": 0.06748714856803417, |
|
"train/cross_entropy_loss": 0.02215080999303609, |
|
"train/kl_divergence_loss": 0.112823486328125, |
|
"train/step_duration_seconds": 7.178979873657227, |
|
"train/steps_per_hour": 319.1142911319986, |
|
"train/total_elapsed_hours": 1.0560479720433553 |
|
}, |
|
{ |
|
"epoch": 0.8011851851851852, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 2.339555568810221e-06, |
|
"loss": 1.0798, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.8011851851851852, |
|
"step": 338, |
|
"train/combined_loss": 0.059276150073856115, |
|
"train/cross_entropy_loss": 0.020651907310821116, |
|
"train/kl_divergence_loss": 0.097900390625, |
|
"train/step_duration_seconds": 7.173582077026367, |
|
"train/steps_per_hour": 319.4584302570375, |
|
"train/total_elapsed_hours": 1.058040633731418 |
|
}, |
|
{ |
|
"epoch": 0.8035555555555556, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 2.2863980844900036e-06, |
|
"loss": 0.9484, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.8035555555555556, |
|
"step": 339, |
|
"train/combined_loss": 0.0625988682731986, |
|
"train/cross_entropy_loss": 0.02107176184654236, |
|
"train/kl_divergence_loss": 0.1041259765625, |
|
"train/step_duration_seconds": 7.17778754234314, |
|
"train/steps_per_hour": 319.8009231203147, |
|
"train/total_elapsed_hours": 1.0600344636042913 |
|
}, |
|
{ |
|
"epoch": 0.8059259259259259, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 2.2337734083302164e-06, |
|
"loss": 1.0016, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8059259259259259, |
|
"step": 340, |
|
"train/combined_loss": 0.058292608708143234, |
|
"train/cross_entropy_loss": 0.02039380930364132, |
|
"train/kl_divergence_loss": 0.09619140625, |
|
"train/step_duration_seconds": 7.182078838348389, |
|
"train/steps_per_hour": 320.14177067664195, |
|
"train/total_elapsed_hours": 1.0620294855038326 |
|
}, |
|
{ |
|
"epoch": 0.8082962962962963, |
|
"grad_norm": 4.625, |
|
"learning_rate": 2.1816851753197023e-06, |
|
"loss": 0.9327, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.8082962962962963, |
|
"step": 341, |
|
"train/combined_loss": 0.0593375526368618, |
|
"train/cross_entropy_loss": 0.020500056445598602, |
|
"train/kl_divergence_loss": 0.098175048828125, |
|
"train/step_duration_seconds": 7.171487331390381, |
|
"train/steps_per_hour": 320.4822262207178, |
|
"train/total_elapsed_hours": 1.0640215653181075 |
|
}, |
|
{ |
|
"epoch": 0.8106666666666666, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 2.130136983393112e-06, |
|
"loss": 0.9494, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.8106666666666666, |
|
"step": 342, |
|
"train/combined_loss": 0.060327990911901, |
|
"train/cross_entropy_loss": 0.020588844665326178, |
|
"train/kl_divergence_loss": 0.100067138671875, |
|
"train/step_duration_seconds": 7.198361873626709, |
|
"train/steps_per_hour": 320.81916267981495, |
|
"train/total_elapsed_hours": 1.066021110283004 |
|
}, |
|
{ |
|
"epoch": 0.813037037037037, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 2.0791323931823783e-06, |
|
"loss": 0.9652, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.813037037037037, |
|
"step": 343, |
|
"train/combined_loss": 0.05895965825766325, |
|
"train/cross_entropy_loss": 0.02056823973543942, |
|
"train/kl_divergence_loss": 0.09735107421875, |
|
"train/step_duration_seconds": 7.177961349487305, |
|
"train/steps_per_hour": 321.1565415410552, |
|
"train/total_elapsed_hours": 1.0680149884356394 |
|
}, |
|
{ |
|
"epoch": 0.8154074074074074, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 2.0286749277707783e-06, |
|
"loss": 0.9434, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.8154074074074074, |
|
"step": 344, |
|
"train/combined_loss": 0.06692604720592499, |
|
"train/cross_entropy_loss": 0.02130326582118869, |
|
"train/kl_divergence_loss": 0.112548828125, |
|
"train/step_duration_seconds": 7.182729482650757, |
|
"train/steps_per_hour": 321.49226509337905, |
|
"train/total_elapsed_hours": 1.0700101910697088 |
|
}, |
|
{ |
|
"epoch": 0.8177777777777778, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.9787680724495617e-06, |
|
"loss": 1.0708, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.8177777777777778, |
|
"step": 345, |
|
"train/combined_loss": 0.06856274465098977, |
|
"train/cross_entropy_loss": 0.02140283351764083, |
|
"train/kl_divergence_loss": 0.11572265625, |
|
"train/step_duration_seconds": 7.21046257019043, |
|
"train/steps_per_hour": 321.8244262652279, |
|
"train/total_elapsed_hours": 1.0720130973392064 |
|
}, |
|
{ |
|
"epoch": 0.8201481481481482, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.929415274477239e-06, |
|
"loss": 1.097, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.8201481481481482, |
|
"step": 346, |
|
"train/combined_loss": 0.06000364082865417, |
|
"train/cross_entropy_loss": 0.020581011194735765, |
|
"train/kl_divergence_loss": 0.09942626953125, |
|
"train/step_duration_seconds": 7.178732395172119, |
|
"train/steps_per_hour": 322.1579923556434, |
|
"train/total_elapsed_hours": 1.0740071896711985 |
|
}, |
|
{ |
|
"epoch": 0.8225185185185185, |
|
"grad_norm": 1.375, |
|
"learning_rate": 1.880619942841435e-06, |
|
"loss": 0.9601, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.8225185185185185, |
|
"step": 347, |
|
"train/combined_loss": 0.06414050119929016, |
|
"train/cross_entropy_loss": 0.02085912844631821, |
|
"train/kl_divergence_loss": 0.107421875, |
|
"train/step_duration_seconds": 7.1721296310424805, |
|
"train/steps_per_hour": 322.4908717904752, |
|
"train/total_elapsed_hours": 1.0759994479020436 |
|
}, |
|
{ |
|
"epoch": 0.8248888888888889, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.8323854480234348e-06, |
|
"loss": 1.0262, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.8248888888888889, |
|
"step": 348, |
|
"train/combined_loss": 0.059865488670766354, |
|
"train/cross_entropy_loss": 0.020457297330722213, |
|
"train/kl_divergence_loss": 0.099273681640625, |
|
"train/step_duration_seconds": 7.182539701461792, |
|
"train/steps_per_hour": 322.8216548617558, |
|
"train/total_elapsed_hours": 1.0779945978191163 |
|
}, |
|
{ |
|
"epoch": 0.8272592592592592, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 1.7847151217653624e-06, |
|
"loss": 0.9578, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.8272592592592592, |
|
"step": 349, |
|
"train/combined_loss": 0.061520870542153716, |
|
"train/cross_entropy_loss": 0.020655267755500972, |
|
"train/kl_divergence_loss": 0.102386474609375, |
|
"train/step_duration_seconds": 7.174387454986572, |
|
"train/steps_per_hour": 323.15189335194066, |
|
"train/total_elapsed_hours": 1.0799874832232794 |
|
}, |
|
{ |
|
"epoch": 0.8296296296296296, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 1.7376122568400533e-06, |
|
"loss": 0.9843, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8296296296296296, |
|
"eval_combined_loss": 0.060819300456593436, |
|
"eval_cross_entropy_loss": 0.02078313216318687, |
|
"eval_kl_divergence_loss": 0.10085546875, |
|
"eval_loss": 0.060819294303655624, |
|
"eval_runtime": 220.1882, |
|
"eval_samples_per_second": 6.812, |
|
"eval_steps_per_second": 3.406, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8296296296296296, |
|
"step": 350, |
|
"train/combined_loss": 0.05945373326539993, |
|
"train/cross_entropy_loss": 0.020640864269807935, |
|
"train/kl_divergence_loss": 0.0982666015625, |
|
"train/step_duration_seconds": 227.37273049354553, |
|
"train/steps_per_hour": 306.17246086025364, |
|
"train/total_elapsed_hours": 1.143146575027042 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 1.6910801068236015e-06, |
|
"loss": 0.9513, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"step": 351, |
|
"train/combined_loss": 0.05878610094077885, |
|
"train/cross_entropy_loss": 0.02025164384394884, |
|
"train/kl_divergence_loss": 0.097320556640625, |
|
"train/step_duration_seconds": 7.182687520980835, |
|
"train/steps_per_hour": 306.51226810501225, |
|
"train/total_elapsed_hours": 1.1451417660050922 |
|
}, |
|
{ |
|
"epoch": 0.8343703703703703, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 1.6451218858706374e-06, |
|
"loss": 0.9406, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.8343703703703703, |
|
"step": 352, |
|
"train/combined_loss": 0.05914012948051095, |
|
"train/cross_entropy_loss": 0.02041038265451789, |
|
"train/kl_divergence_loss": 0.097869873046875, |
|
"train/step_duration_seconds": 7.178761959075928, |
|
"train/steps_per_hour": 306.85118499420435, |
|
"train/total_elapsed_hours": 1.14713586654928 |
|
}, |
|
{ |
|
"epoch": 0.8367407407407408, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 1.599740768492286e-06, |
|
"loss": 0.9462, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.8367407407407408, |
|
"step": 353, |
|
"train/combined_loss": 0.06183216394856572, |
|
"train/cross_entropy_loss": 0.02106422872748226, |
|
"train/kl_divergence_loss": 0.10260009765625, |
|
"train/step_duration_seconds": 7.19196081161499, |
|
"train/steps_per_hour": 307.1879455332317, |
|
"train/total_elapsed_hours": 1.1491336334413953 |
|
}, |
|
{ |
|
"epoch": 0.8391111111111111, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.5549398893369216e-06, |
|
"loss": 0.9893, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.8391111111111111, |
|
"step": 354, |
|
"train/combined_loss": 0.0688268430531025, |
|
"train/cross_entropy_loss": 0.022144652903079987, |
|
"train/kl_divergence_loss": 0.115509033203125, |
|
"train/step_duration_seconds": 7.177990674972534, |
|
"train/steps_per_hour": 307.5245738890482, |
|
"train/total_elapsed_hours": 1.1511275197399986 |
|
}, |
|
{ |
|
"epoch": 0.8414814814814815, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.5107223429736273e-06, |
|
"loss": 1.1012, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.8414814814814815, |
|
"step": 355, |
|
"train/combined_loss": 0.060881074983626604, |
|
"train/cross_entropy_loss": 0.021084662177599967, |
|
"train/kl_divergence_loss": 0.100677490234375, |
|
"train/step_duration_seconds": 7.157759428024292, |
|
"train/steps_per_hour": 307.8615384801584, |
|
"train/total_elapsed_hours": 1.1531157862477832 |
|
}, |
|
{ |
|
"epoch": 0.8438518518518519, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.467091183678444e-06, |
|
"loss": 0.9741, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.8438518518518519, |
|
"step": 356, |
|
"train/combined_loss": 0.05890939268283546, |
|
"train/cross_entropy_loss": 0.02049823058769107, |
|
"train/kl_divergence_loss": 0.097320556640625, |
|
"train/step_duration_seconds": 7.175466537475586, |
|
"train/steps_per_hour": 308.1960306908749, |
|
"train/total_elapsed_hours": 1.155108971397082 |
|
}, |
|
{ |
|
"epoch": 0.8462222222222222, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 1.424049425223405e-06, |
|
"loss": 0.9426, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.8462222222222222, |
|
"step": 357, |
|
"train/combined_loss": 0.06022225972265005, |
|
"train/cross_entropy_loss": 0.020865662721917033, |
|
"train/kl_divergence_loss": 0.099578857421875, |
|
"train/step_duration_seconds": 7.179744005203247, |
|
"train/steps_per_hour": 308.5290537144967, |
|
"train/total_elapsed_hours": 1.1571033447318606 |
|
}, |
|
{ |
|
"epoch": 0.8485925925925926, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.3816000406683604e-06, |
|
"loss": 0.9636, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.8485925925925926, |
|
"step": 358, |
|
"train/combined_loss": 0.06817956361919641, |
|
"train/cross_entropy_loss": 0.021460447693243623, |
|
"train/kl_divergence_loss": 0.114898681640625, |
|
"train/step_duration_seconds": 7.1635658740997314, |
|
"train/steps_per_hour": 308.8621282082033, |
|
"train/total_elapsed_hours": 1.1590932241413328 |
|
}, |
|
{ |
|
"epoch": 0.8509629629629629, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 1.339745962155613e-06, |
|
"loss": 1.0909, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.8509629629629629, |
|
"step": 359, |
|
"train/combined_loss": 0.05856375303119421, |
|
"train/cross_entropy_loss": 0.020386786898598075, |
|
"train/kl_divergence_loss": 0.09674072265625, |
|
"train/step_duration_seconds": 7.170348644256592, |
|
"train/steps_per_hour": 309.19355931513985, |
|
"train/total_elapsed_hours": 1.1610849876536264 |
|
}, |
|
{ |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.2984900807073919e-06, |
|
"loss": 0.937, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.8533333333333334, |
|
"step": 360, |
|
"train/combined_loss": 0.05872082710266113, |
|
"train/cross_entropy_loss": 0.020578863797709346, |
|
"train/kl_divergence_loss": 0.09686279296875, |
|
"train/step_duration_seconds": 7.177663326263428, |
|
"train/steps_per_hour": 309.5233145467673, |
|
"train/total_elapsed_hours": 1.1630787830220328 |
|
}, |
|
{ |
|
"epoch": 0.8557037037037037, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 1.2578352460261456e-06, |
|
"loss": 0.9395, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.8557037037037037, |
|
"step": 361, |
|
"train/combined_loss": 0.05911425780504942, |
|
"train/cross_entropy_loss": 0.02054174430668354, |
|
"train/kl_divergence_loss": 0.097686767578125, |
|
"train/step_duration_seconds": 7.158376932144165, |
|
"train/steps_per_hour": 309.8533659473708, |
|
"train/total_elapsed_hours": 1.1650672210587396 |
|
}, |
|
{ |
|
"epoch": 0.8580740740740741, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.2177842662977136e-06, |
|
"loss": 0.9458, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.8580740740740741, |
|
"step": 362, |
|
"train/combined_loss": 0.06256715022027493, |
|
"train/cross_entropy_loss": 0.02085573854856193, |
|
"train/kl_divergence_loss": 0.104278564453125, |
|
"train/step_duration_seconds": 7.160759925842285, |
|
"train/steps_per_hour": 310.182116727655, |
|
"train/total_elapsed_hours": 1.1670563210381402 |
|
}, |
|
{ |
|
"epoch": 0.8604444444444445, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.1783399079973578e-06, |
|
"loss": 1.0011, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.8604444444444445, |
|
"step": 363, |
|
"train/combined_loss": 0.05845469981431961, |
|
"train/cross_entropy_loss": 0.02022971585392952, |
|
"train/kl_divergence_loss": 0.0966796875, |
|
"train/step_duration_seconds": 7.147622346878052, |
|
"train/steps_per_hour": 310.51071808599374, |
|
"train/total_elapsed_hours": 1.1690417716900507 |
|
}, |
|
{ |
|
"epoch": 0.8628148148148148, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 1.1395048956986577e-06, |
|
"loss": 0.9353, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.8628148148148148, |
|
"step": 364, |
|
"train/combined_loss": 0.05994793586432934, |
|
"train/cross_entropy_loss": 0.020683227223344147, |
|
"train/kl_divergence_loss": 0.099212646484375, |
|
"train/step_duration_seconds": 7.1444926261901855, |
|
"train/steps_per_hour": 310.83843593717893, |
|
"train/total_elapsed_hours": 1.1710263529751035 |
|
}, |
|
{ |
|
"epoch": 0.8651851851851852, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 1.1012819118853147e-06, |
|
"loss": 0.9592, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.8651851851851852, |
|
"step": 365, |
|
"train/combined_loss": 0.05859701009467244, |
|
"train/cross_entropy_loss": 0.02045329543761909, |
|
"train/kl_divergence_loss": 0.09674072265625, |
|
"train/step_duration_seconds": 7.179033279418945, |
|
"train/steps_per_hour": 311.1624997276206, |
|
"train/total_elapsed_hours": 1.1730205288860533 |
|
}, |
|
{ |
|
"epoch": 0.8675555555555555, |
|
"grad_norm": 1.375, |
|
"learning_rate": 1.0636735967658785e-06, |
|
"loss": 0.9376, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.8675555555555555, |
|
"step": 366, |
|
"train/combined_loss": 0.06006666086614132, |
|
"train/cross_entropy_loss": 0.020523948594927788, |
|
"train/kl_divergence_loss": 0.099609375, |
|
"train/step_duration_seconds": 7.176509141921997, |
|
"train/steps_per_hour": 311.48564941676807, |
|
"train/total_elapsed_hours": 1.1750140036476984 |
|
}, |
|
{ |
|
"epoch": 0.8699259259259259, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.026682548091361e-06, |
|
"loss": 0.9611, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.8699259259259259, |
|
"step": 367, |
|
"train/combined_loss": 0.0603836253285408, |
|
"train/cross_entropy_loss": 0.020761147141456604, |
|
"train/kl_divergence_loss": 0.100006103515625, |
|
"train/step_duration_seconds": 7.1585469245910645, |
|
"train/steps_per_hour": 311.8090262847088, |
|
"train/total_elapsed_hours": 1.1770024889045292 |
|
}, |
|
{ |
|
"epoch": 0.8722962962962963, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 9.903113209758098e-07, |
|
"loss": 0.9661, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.8722962962962963, |
|
"step": 368, |
|
"train/combined_loss": 0.061565724201500416, |
|
"train/cross_entropy_loss": 0.020806010346859694, |
|
"train/kl_divergence_loss": 0.102325439453125, |
|
"train/step_duration_seconds": 7.178318738937378, |
|
"train/steps_per_hour": 312.1298583233999, |
|
"train/total_elapsed_hours": 1.1789964663320118 |
|
}, |
|
{ |
|
"epoch": 0.8746666666666667, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 9.545624277198085e-07, |
|
"loss": 0.9851, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.8746666666666667, |
|
"step": 369, |
|
"train/combined_loss": 0.05880419351160526, |
|
"train/cross_entropy_loss": 0.02074559754692018, |
|
"train/kl_divergence_loss": 0.09686279296875, |
|
"train/step_duration_seconds": 7.1789703369140625, |
|
"train/steps_per_hour": 312.44955909393565, |
|
"train/total_elapsed_hours": 1.1809906247589324 |
|
}, |
|
{ |
|
"epoch": 0.8770370370370371, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 9.194383376369509e-07, |
|
"loss": 0.9409, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.8770370370370371, |
|
"step": 370, |
|
"train/combined_loss": 0.059380816062912345, |
|
"train/cross_entropy_loss": 0.0207696893485263, |
|
"train/kl_divergence_loss": 0.097991943359375, |
|
"train/step_duration_seconds": 7.177523374557495, |
|
"train/steps_per_hour": 312.7682882917324, |
|
"train/total_elapsed_hours": 1.182984381251865 |
|
}, |
|
{ |
|
"epoch": 0.8794074074074074, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 8.849414768832687e-07, |
|
"loss": 0.9501, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.8794074074074074, |
|
"step": 371, |
|
"train/combined_loss": 0.058346427977085114, |
|
"train/cross_entropy_loss": 0.02050144597887993, |
|
"train/kl_divergence_loss": 0.09619140625, |
|
"train/step_duration_seconds": 7.176830053329468, |
|
"train/steps_per_hour": 313.08599583369795, |
|
"train/total_elapsed_hours": 1.1849779451555675 |
|
}, |
|
{ |
|
"epoch": 0.8817777777777778, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 8.510742282896545e-07, |
|
"loss": 0.9335, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.8817777777777778, |
|
"step": 372, |
|
"train/combined_loss": 0.061172885121777654, |
|
"train/cross_entropy_loss": 0.02075275091920048, |
|
"train/kl_divergence_loss": 0.101593017578125, |
|
"train/step_duration_seconds": 7.177177906036377, |
|
"train/steps_per_hour": 313.40261065917605, |
|
"train/total_elapsed_hours": 1.1869716056850221 |
|
}, |
|
{ |
|
"epoch": 0.8841481481481481, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 8.178389311972612e-07, |
|
"loss": 0.9788, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.8841481481481481, |
|
"step": 373, |
|
"train/combined_loss": 0.06040294258855283, |
|
"train/cross_entropy_loss": 0.02064719540067017, |
|
"train/kl_divergence_loss": 0.10015869140625, |
|
"train/step_duration_seconds": 7.17975640296936, |
|
"train/steps_per_hour": 313.7179746952193, |
|
"train/total_elapsed_hours": 1.1889659824636247 |
|
}, |
|
{ |
|
"epoch": 0.8865185185185185, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 7.852378812959227e-07, |
|
"loss": 0.9664, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.8865185185185185, |
|
"step": 374, |
|
"train/combined_loss": 0.061964265536516905, |
|
"train/cross_entropy_loss": 0.021664125844836235, |
|
"train/kl_divergence_loss": 0.102264404296875, |
|
"train/step_duration_seconds": 7.165625333786011, |
|
"train/steps_per_hour": 314.0333175421587, |
|
"train/total_elapsed_hours": 1.1909564339452319 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 7.532733304655848e-07, |
|
"loss": 0.9914, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"step": 375, |
|
"train/combined_loss": 0.0588826653547585, |
|
"train/cross_entropy_loss": 0.020475288503803313, |
|
"train/kl_divergence_loss": 0.0972900390625, |
|
"train/step_duration_seconds": 7.169605493545532, |
|
"train/steps_per_hour": 314.34731674868476, |
|
"train/total_elapsed_hours": 1.1929479910267724 |
|
}, |
|
{ |
|
"epoch": 0.8912592592592593, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 7.219474866207465e-07, |
|
"loss": 0.9421, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.8912592592592593, |
|
"step": 376, |
|
"train/combined_loss": 0.05903471680358052, |
|
"train/cross_entropy_loss": 0.02038266253657639, |
|
"train/kl_divergence_loss": 0.097686767578125, |
|
"train/step_duration_seconds": 7.174905776977539, |
|
"train/steps_per_hour": 314.65988159919425, |
|
"train/total_elapsed_hours": 1.194941020409266 |
|
}, |
|
{ |
|
"epoch": 0.8936296296296297, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 6.912625135579587e-07, |
|
"loss": 0.9446, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.8936296296296297, |
|
"step": 377, |
|
"train/combined_loss": 0.06489993864670396, |
|
"train/cross_entropy_loss": 0.021004713140428066, |
|
"train/kl_divergence_loss": 0.108795166015625, |
|
"train/step_duration_seconds": 7.203597068786621, |
|
"train/steps_per_hour": 314.96930831081517, |
|
"train/total_elapsed_hours": 1.1969420195950402 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 1.5, |
|
"learning_rate": 6.612205308063646e-07, |
|
"loss": 1.0384, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"step": 378, |
|
"train/combined_loss": 0.06032265955582261, |
|
"train/cross_entropy_loss": 0.020669732824899256, |
|
"train/kl_divergence_loss": 0.0999755859375, |
|
"train/step_duration_seconds": 7.174722909927368, |
|
"train/steps_per_hour": 315.27981131041514, |
|
"train/total_elapsed_hours": 1.1989349981811313 |
|
}, |
|
{ |
|
"epoch": 0.8983703703703704, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 6.318236134812917e-07, |
|
"loss": 0.9652, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.8983703703703704, |
|
"step": 379, |
|
"train/combined_loss": 0.060588925145566463, |
|
"train/cross_entropy_loss": 0.02068346820306033, |
|
"train/kl_divergence_loss": 0.100494384765625, |
|
"train/step_duration_seconds": 7.157525300979614, |
|
"train/steps_per_hour": 315.5905391030105, |
|
"train/total_elapsed_hours": 1.2009231996536256 |
|
}, |
|
{ |
|
"epoch": 0.9007407407407407, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 6.030737921409169e-07, |
|
"loss": 0.9694, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9007407407407407, |
|
"step": 380, |
|
"train/combined_loss": 0.05855354503728449, |
|
"train/cross_entropy_loss": 0.020335851586423814, |
|
"train/kl_divergence_loss": 0.096771240234375, |
|
"train/step_duration_seconds": 7.176971912384033, |
|
"train/steps_per_hour": 315.89882115214573, |
|
"train/total_elapsed_hours": 1.2029168029626212 |
|
}, |
|
{ |
|
"epoch": 0.9031111111111111, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 5.749730526460073e-07, |
|
"loss": 0.9369, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.9031111111111111, |
|
"step": 381, |
|
"train/combined_loss": 0.05888266093097627, |
|
"train/cross_entropy_loss": 0.02044476370792836, |
|
"train/kl_divergence_loss": 0.097320556640625, |
|
"train/step_duration_seconds": 7.179260730743408, |
|
"train/steps_per_hour": 316.20591620635895, |
|
"train/total_elapsed_hours": 1.2049110420544942 |
|
}, |
|
{ |
|
"epoch": 0.9054814814814814, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 5.475233360227516e-07, |
|
"loss": 0.9421, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.9054814814814814, |
|
"step": 382, |
|
"train/combined_loss": 0.059826530516147614, |
|
"train/cross_entropy_loss": 0.020623518154025078, |
|
"train/kl_divergence_loss": 0.099029541015625, |
|
"train/step_duration_seconds": 7.175317049026489, |
|
"train/steps_per_hour": 316.512283686395, |
|
"train/total_elapsed_hours": 1.2069041856792238 |
|
}, |
|
{ |
|
"epoch": 0.9078518518518518, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 5.207265383286831e-07, |
|
"loss": 0.9572, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.9078518518518518, |
|
"step": 383, |
|
"train/combined_loss": 0.06832170393317938, |
|
"train/cross_entropy_loss": 0.02159213600680232, |
|
"train/kl_divergence_loss": 0.11505126953125, |
|
"train/step_duration_seconds": 7.181811809539795, |
|
"train/steps_per_hour": 316.8171681300854, |
|
"train/total_elapsed_hours": 1.208899133404096 |
|
}, |
|
{ |
|
"epoch": 0.9102222222222223, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 4.945845105217118e-07, |
|
"loss": 1.0931, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.9102222222222223, |
|
"step": 384, |
|
"train/combined_loss": 0.06399638252332807, |
|
"train/cross_entropy_loss": 0.021303314133547246, |
|
"train/kl_divergence_loss": 0.106689453125, |
|
"train/step_duration_seconds": 7.173643112182617, |
|
"train/steps_per_hour": 317.1216422308217, |
|
"train/total_elapsed_hours": 1.210891812046369 |
|
}, |
|
{ |
|
"epoch": 0.9125925925925926, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.6909905833226965e-07, |
|
"loss": 1.0239, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.9125925925925926, |
|
"step": 385, |
|
"train/combined_loss": 0.058574909809976816, |
|
"train/cross_entropy_loss": 0.02040909300558269, |
|
"train/kl_divergence_loss": 0.09674072265625, |
|
"train/step_duration_seconds": 7.179981470108032, |
|
"train/steps_per_hour": 317.4246550931726, |
|
"train/total_elapsed_hours": 1.2128862513436212 |
|
}, |
|
{ |
|
"epoch": 0.914962962962963, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 4.4427194213859216e-07, |
|
"loss": 0.9372, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.914962962962963, |
|
"step": 386, |
|
"train/combined_loss": 0.059789648512378335, |
|
"train/cross_entropy_loss": 0.020763377659022808, |
|
"train/kl_divergence_loss": 0.09881591796875, |
|
"train/step_duration_seconds": 7.180516481399536, |
|
"train/steps_per_hour": 317.72663419127423, |
|
"train/total_elapsed_hours": 1.214880839255121 |
|
}, |
|
{ |
|
"epoch": 0.9173333333333333, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 4.2010487684511105e-07, |
|
"loss": 0.9566, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.9173333333333333, |
|
"step": 387, |
|
"train/combined_loss": 0.06063654413446784, |
|
"train/cross_entropy_loss": 0.020656629814766347, |
|
"train/kl_divergence_loss": 0.100616455078125, |
|
"train/step_duration_seconds": 7.174912929534912, |
|
"train/steps_per_hour": 318.0280301371019, |
|
"train/total_elapsed_hours": 1.2168738706244362 |
|
}, |
|
{ |
|
"epoch": 0.9197037037037037, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 3.965995317640026e-07, |
|
"loss": 0.9702, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.9197037037037037, |
|
"step": 388, |
|
"train/combined_loss": 0.05888652987778187, |
|
"train/cross_entropy_loss": 0.020452499389648438, |
|
"train/kl_divergence_loss": 0.097320556640625, |
|
"train/step_duration_seconds": 7.1626060009002686, |
|
"train/steps_per_hour": 318.32933325469384, |
|
"train/total_elapsed_hours": 1.218863483402464 |
|
}, |
|
{ |
|
"epoch": 0.922074074074074, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 3.7375753049987974e-07, |
|
"loss": 0.9422, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.922074074074074, |
|
"step": 389, |
|
"train/combined_loss": 0.0582260861992836, |
|
"train/cross_entropy_loss": 0.0202607661485672, |
|
"train/kl_divergence_loss": 0.09619140625, |
|
"train/step_duration_seconds": 7.176948070526123, |
|
"train/steps_per_hour": 318.6286145563663, |
|
"train/total_elapsed_hours": 1.2208570800887213 |
|
}, |
|
{ |
|
"epoch": 0.9244444444444444, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 3.515804508376508e-07, |
|
"loss": 0.9316, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.9244444444444444, |
|
"step": 390, |
|
"train/combined_loss": 0.06077071442268789, |
|
"train/cross_entropy_loss": 0.0208639376796782, |
|
"train/kl_divergence_loss": 0.100677490234375, |
|
"train/step_duration_seconds": 7.199989318847656, |
|
"train/steps_per_hour": 318.9252507888751, |
|
"train/total_elapsed_hours": 1.2228570771217346 |
|
}, |
|
{ |
|
"epoch": 0.9268148148148149, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 3.3006982463352764e-07, |
|
"loss": 0.9723, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.9268148148148149, |
|
"step": 391, |
|
"train/combined_loss": 0.06101406365633011, |
|
"train/cross_entropy_loss": 0.021350633818656206, |
|
"train/kl_divergence_loss": 0.100677490234375, |
|
"train/step_duration_seconds": 7.170354127883911, |
|
"train/steps_per_hour": 319.2230637303404, |
|
"train/total_elapsed_hours": 1.2248488421572579 |
|
}, |
|
{ |
|
"epoch": 0.9291851851851852, |
|
"grad_norm": 3.0, |
|
"learning_rate": 3.0922713770922155e-07, |
|
"loss": 0.9762, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.9291851851851852, |
|
"step": 392, |
|
"train/combined_loss": 0.0617949569132179, |
|
"train/cross_entropy_loss": 0.02153913036454469, |
|
"train/kl_divergence_loss": 0.10205078125, |
|
"train/step_duration_seconds": 7.201368808746338, |
|
"train/steps_per_hour": 319.5176659434464, |
|
"train/total_elapsed_hours": 1.2268492223819096 |
|
}, |
|
{ |
|
"epoch": 0.9315555555555556, |
|
"grad_norm": 2.125, |
|
"learning_rate": 2.8905382974930173e-07, |
|
"loss": 0.9887, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.9315555555555556, |
|
"step": 393, |
|
"train/combined_loss": 0.06196058611385524, |
|
"train/cross_entropy_loss": 0.021473662578500807, |
|
"train/kl_divergence_loss": 0.102447509765625, |
|
"train/step_duration_seconds": 7.19616436958313, |
|
"train/steps_per_hour": 319.81168526316264, |
|
"train/total_elapsed_hours": 1.2288481569290162 |
|
}, |
|
{ |
|
"epoch": 0.9339259259259259, |
|
"grad_norm": 2.75, |
|
"learning_rate": 2.6955129420176193e-07, |
|
"loss": 0.9914, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.9339259259259259, |
|
"step": 394, |
|
"train/combined_loss": 0.06139179831370711, |
|
"train/cross_entropy_loss": 0.0205191905843094, |
|
"train/kl_divergence_loss": 0.102264404296875, |
|
"train/step_duration_seconds": 7.170999526977539, |
|
"train/steps_per_hour": 320.10656754066326, |
|
"train/total_elapsed_hours": 1.2308401012420653 |
|
}, |
|
{ |
|
"epoch": 0.9362962962962963, |
|
"grad_norm": 1.375, |
|
"learning_rate": 2.507208781817638e-07, |
|
"loss": 0.9823, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.9362962962962963, |
|
"step": 395, |
|
"train/combined_loss": 0.05912953009828925, |
|
"train/cross_entropy_loss": 0.020389189943671227, |
|
"train/kl_divergence_loss": 0.097869873046875, |
|
"train/step_duration_seconds": 7.163103818893433, |
|
"train/steps_per_hour": 320.4010669129712, |
|
"train/total_elapsed_hours": 1.2328298523028691 |
|
}, |
|
{ |
|
"epoch": 0.9386666666666666, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 2.3256388237858806e-07, |
|
"loss": 0.9461, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.9386666666666666, |
|
"step": 396, |
|
"train/combined_loss": 0.05818613991141319, |
|
"train/cross_entropy_loss": 0.020180873572826385, |
|
"train/kl_divergence_loss": 0.09619140625, |
|
"train/step_duration_seconds": 7.165131568908691, |
|
"train/steps_per_hour": 320.6944709054444, |
|
"train/total_elapsed_hours": 1.234820166627566 |
|
}, |
|
{ |
|
"epoch": 0.941037037037037, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 2.1508156096578748e-07, |
|
"loss": 0.931, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.941037037037037, |
|
"step": 397, |
|
"train/combined_loss": 0.06258085602894425, |
|
"train/cross_entropy_loss": 0.02100521558895707, |
|
"train/kl_divergence_loss": 0.104156494140625, |
|
"train/step_duration_seconds": 7.152077913284302, |
|
"train/steps_per_hour": 320.9878716432912, |
|
"train/total_elapsed_hours": 1.2368068549368116 |
|
}, |
|
{ |
|
"epoch": 0.9434074074074074, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.9827512151456175e-07, |
|
"loss": 1.0013, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.9434074074074074, |
|
"step": 398, |
|
"train/combined_loss": 0.0631152824498713, |
|
"train/cross_entropy_loss": 0.020975436200387776, |
|
"train/kl_divergence_loss": 0.105255126953125, |
|
"train/step_duration_seconds": 7.156713008880615, |
|
"train/steps_per_hour": 321.2799973921229, |
|
"train/total_elapsed_hours": 1.2387948307726118 |
|
}, |
|
{ |
|
"epoch": 0.9457777777777778, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.82145724910342e-07, |
|
"loss": 1.0098, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.9457777777777778, |
|
"step": 399, |
|
"train/combined_loss": 0.05855529848486185, |
|
"train/cross_entropy_loss": 0.020369877573102713, |
|
"train/kl_divergence_loss": 0.09674072265625, |
|
"train/step_duration_seconds": 7.177468776702881, |
|
"train/steps_per_hour": 321.56969283241614, |
|
"train/total_elapsed_hours": 1.2407885720994738 |
|
}, |
|
{ |
|
"epoch": 0.9481481481481482, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 1.6669448527260602e-07, |
|
"loss": 0.9369, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9481481481481482, |
|
"eval_combined_loss": 0.06070284065480033, |
|
"eval_cross_entropy_loss": 0.020584717767934003, |
|
"eval_kl_divergence_loss": 0.10082096354166667, |
|
"eval_loss": 0.060702841728925705, |
|
"eval_runtime": 220.2816, |
|
"eval_samples_per_second": 6.809, |
|
"eval_steps_per_second": 3.405, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9481481481481482, |
|
"step": 400, |
|
"train/combined_loss": 0.059124535880982876, |
|
"train/cross_entropy_loss": 0.020409714779816568, |
|
"train/kl_divergence_loss": 0.09783935546875, |
|
"train/step_duration_seconds": 227.46868801116943, |
|
"train/steps_per_hour": 306.75450754086955, |
|
"train/total_elapsed_hours": 1.303974318769243 |
|
}, |
|
{ |
|
"epoch": 0.9505185185185185, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.519224698779198e-07, |
|
"loss": 0.946, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.9505185185185185, |
|
"step": 401, |
|
"train/combined_loss": 0.0637058550491929, |
|
"train/cross_entropy_loss": 0.020966400275938213, |
|
"train/kl_divergence_loss": 0.1064453125, |
|
"train/step_duration_seconds": 7.181180477142334, |
|
"train/steps_per_hour": 307.05167735238854, |
|
"train/total_elapsed_hours": 1.3059690911240047 |
|
}, |
|
{ |
|
"epoch": 0.9528888888888889, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.3783069908621772e-07, |
|
"loss": 1.0193, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.9528888888888889, |
|
"step": 402, |
|
"train/combined_loss": 0.06105339783243835, |
|
"train/cross_entropy_loss": 0.0205748132430017, |
|
"train/kl_divergence_loss": 0.101531982421875, |
|
"train/step_duration_seconds": 7.198254823684692, |
|
"train/steps_per_hour": 307.3468262521633, |
|
"train/total_elapsed_hours": 1.307968606352806 |
|
}, |
|
{ |
|
"epoch": 0.9552592592592593, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.2442014627032318e-07, |
|
"loss": 0.9769, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.9552592592592593, |
|
"step": 403, |
|
"train/combined_loss": 0.0588757898658514, |
|
"train/cross_entropy_loss": 0.02040050830692053, |
|
"train/kl_divergence_loss": 0.09735107421875, |
|
"train/step_duration_seconds": 7.179206609725952, |
|
"train/steps_per_hour": 307.6423167469107, |
|
"train/total_elapsed_hours": 1.3099628304110633 |
|
}, |
|
{ |
|
"epoch": 0.9576296296296296, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.1169173774871478e-07, |
|
"loss": 0.942, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.9576296296296296, |
|
"step": 404, |
|
"train/combined_loss": 0.059780715964734554, |
|
"train/cross_entropy_loss": 0.020562409423291683, |
|
"train/kl_divergence_loss": 0.0989990234375, |
|
"train/step_duration_seconds": 7.192591905593872, |
|
"train/steps_per_hour": 307.93603622552814, |
|
"train/total_elapsed_hours": 1.3119607726070617 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 9.964635272153633e-08, |
|
"loss": 0.9565, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"step": 405, |
|
"train/combined_loss": 0.0602156778331846, |
|
"train/cross_entropy_loss": 0.02063887706026435, |
|
"train/kl_divergence_loss": 0.09979248046875, |
|
"train/step_duration_seconds": 7.179564952850342, |
|
"train/steps_per_hour": 308.22971132705254, |
|
"train/total_elapsed_hours": 1.3139550962050757 |
|
}, |
|
{ |
|
"epoch": 0.9623703703703703, |
|
"grad_norm": 1.125, |
|
"learning_rate": 8.82848232098732e-08, |
|
"loss": 0.9635, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.9623703703703703, |
|
"step": 406, |
|
"train/combined_loss": 0.05820171535015106, |
|
"train/cross_entropy_loss": 0.020212026312947273, |
|
"train/kl_divergence_loss": 0.09619140625, |
|
"train/step_duration_seconds": 7.195192575454712, |
|
"train/steps_per_hour": 308.5214785588609, |
|
"train/total_elapsed_hours": 1.3159537608093685 |
|
}, |
|
{ |
|
"epoch": 0.9647407407407408, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 7.760793399827937e-08, |
|
"loss": 0.9312, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.9647407407407408, |
|
"step": 407, |
|
"train/combined_loss": 0.061329676769673824, |
|
"train/cross_entropy_loss": 0.02094426902476698, |
|
"train/kl_divergence_loss": 0.101715087890625, |
|
"train/step_duration_seconds": 7.17832612991333, |
|
"train/steps_per_hour": 308.81345865085285, |
|
"train/total_elapsed_hours": 1.3179477402899 |
|
}, |
|
{ |
|
"epoch": 0.9671111111111111, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 6.761642258056977e-08, |
|
"loss": 0.9813, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.9671111111111111, |
|
"step": 408, |
|
"train/combined_loss": 0.06031193025410175, |
|
"train/cross_entropy_loss": 0.020617756876163185, |
|
"train/kl_divergence_loss": 0.100006103515625, |
|
"train/step_duration_seconds": 7.178639888763428, |
|
"train/steps_per_hour": 309.1045361691286, |
|
"train/total_elapsed_hours": 1.3199418069256676 |
|
}, |
|
{ |
|
"epoch": 0.9694814814814815, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 5.831097910887873e-08, |
|
"loss": 0.965, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.9694814814814815, |
|
"step": 409, |
|
"train/combined_loss": 0.059776231879368424, |
|
"train/cross_entropy_loss": 0.020767063251696527, |
|
"train/kl_divergence_loss": 0.098785400390625, |
|
"train/step_duration_seconds": 7.177491903305054, |
|
"train/steps_per_hour": 309.3948101729231, |
|
"train/total_elapsed_hours": 1.3219355546765856 |
|
}, |
|
{ |
|
"epoch": 0.9718518518518519, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 4.9692246345985905e-08, |
|
"loss": 0.9564, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.9718518518518519, |
|
"step": 410, |
|
"train/combined_loss": 0.06045236345380545, |
|
"train/cross_entropy_loss": 0.020746038877405226, |
|
"train/kl_divergence_loss": 0.10015869140625, |
|
"train/step_duration_seconds": 7.197791576385498, |
|
"train/steps_per_hour": 309.68289092850614, |
|
"train/total_elapsed_hours": 1.3239349412255816 |
|
}, |
|
{ |
|
"epoch": 0.9742222222222222, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 4.176081962092182e-08, |
|
"loss": 0.9672, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.9742222222222222, |
|
"step": 411, |
|
"train/combined_loss": 0.06638129102066159, |
|
"train/cross_entropy_loss": 0.021220834576524794, |
|
"train/kl_divergence_loss": 0.111541748046875, |
|
"train/step_duration_seconds": 7.180782318115234, |
|
"train/steps_per_hour": 309.97120742767606, |
|
"train/total_elapsed_hours": 1.3259296029806138 |
|
}, |
|
{ |
|
"epoch": 0.9765925925925926, |
|
"grad_norm": 3.25, |
|
"learning_rate": 3.451724678784518e-08, |
|
"loss": 1.0621, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.9765925925925926, |
|
"step": 412, |
|
"train/combined_loss": 0.060466301161795855, |
|
"train/cross_entropy_loss": 0.020804431289434433, |
|
"train/kl_divergence_loss": 0.100128173828125, |
|
"train/step_duration_seconds": 7.173940181732178, |
|
"train/steps_per_hour": 310.2591018309556, |
|
"train/total_elapsed_hours": 1.327922364142206 |
|
}, |
|
{ |
|
"epoch": 0.9789629629629629, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2.796202818819871e-08, |
|
"loss": 0.9675, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.9789629629629629, |
|
"step": 413, |
|
"train/combined_loss": 0.06092071859166026, |
|
"train/cross_entropy_loss": 0.02085877349600196, |
|
"train/kl_divergence_loss": 0.100982666015625, |
|
"train/step_duration_seconds": 7.189931392669678, |
|
"train/steps_per_hour": 310.5450962243895, |
|
"train/total_elapsed_hours": 1.3299195673068365 |
|
}, |
|
{ |
|
"epoch": 0.9813333333333333, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 2.2095616616150117e-08, |
|
"loss": 0.9747, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.9813333333333333, |
|
"step": 414, |
|
"train/combined_loss": 0.05822563171386719, |
|
"train/cross_entropy_loss": 0.020259857177734375, |
|
"train/kl_divergence_loss": 0.09619140625, |
|
"train/step_duration_seconds": 7.186516523361206, |
|
"train/steps_per_hour": 310.8304542928701, |
|
"train/total_elapsed_hours": 1.331915821896659 |
|
}, |
|
{ |
|
"epoch": 0.9837037037037037, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.6918417287318245e-08, |
|
"loss": 0.9316, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.9837037037037037, |
|
"step": 415, |
|
"train/combined_loss": 0.05974208423867822, |
|
"train/cross_entropy_loss": 0.020485147717408836, |
|
"train/kl_divergence_loss": 0.0989990234375, |
|
"train/step_duration_seconds": 7.198460102081299, |
|
"train/steps_per_hour": 311.1141844684285, |
|
"train/total_elapsed_hours": 1.333915394147237 |
|
}, |
|
{ |
|
"epoch": 0.9860740740740741, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 1.2430787810776556e-08, |
|
"loss": 0.9559, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.9860740740740741, |
|
"step": 416, |
|
"train/combined_loss": 0.05951492628082633, |
|
"train/cross_entropy_loss": 0.02033600490540266, |
|
"train/kl_divergence_loss": 0.09869384765625, |
|
"train/step_duration_seconds": 7.179587125778198, |
|
"train/steps_per_hour": 311.3982872915823, |
|
"train/total_elapsed_hours": 1.3359097239043978 |
|
}, |
|
{ |
|
"epoch": 0.9884444444444445, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 8.633038164358454e-09, |
|
"loss": 0.9522, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.9884444444444445, |
|
"step": 417, |
|
"train/combined_loss": 0.060025526909157634, |
|
"train/cross_entropy_loss": 0.020472198841162026, |
|
"train/kl_divergence_loss": 0.099578857421875, |
|
"train/step_duration_seconds": 7.1582019329071045, |
|
"train/steps_per_hour": 311.6829270070737, |
|
"train/total_elapsed_hours": 1.3378981133302053 |
|
}, |
|
{ |
|
"epoch": 0.9908148148148148, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 5.525430673244403e-09, |
|
"loss": 0.9604, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.9908148148148148, |
|
"step": 418, |
|
"train/combined_loss": 0.06418039370328188, |
|
"train/cross_entropy_loss": 0.0208778785308823, |
|
"train/kl_divergence_loss": 0.10748291015625, |
|
"train/step_duration_seconds": 7.19930624961853, |
|
"train/steps_per_hour": 311.964063505697, |
|
"train/total_elapsed_hours": 1.339897920621766 |
|
}, |
|
{ |
|
"epoch": 0.9931851851851852, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 3.1081799918375454e-09, |
|
"loss": 1.0269, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.9931851851851852, |
|
"step": 419, |
|
"train/combined_loss": 0.0633124178275466, |
|
"train/cross_entropy_loss": 0.020789876696653664, |
|
"train/kl_divergence_loss": 0.1058349609375, |
|
"train/step_duration_seconds": 7.172863245010376, |
|
"train/steps_per_hour": 312.2460712308407, |
|
"train/total_elapsed_hours": 1.341890382634269 |
|
}, |
|
{ |
|
"epoch": 0.9955555555555555, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 1.3814530889433298e-09, |
|
"loss": 1.013, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.9955555555555555, |
|
"step": 420, |
|
"train/combined_loss": 0.05849831993691623, |
|
"train/cross_entropy_loss": 0.020225399872288108, |
|
"train/kl_divergence_loss": 0.096771240234375, |
|
"train/step_duration_seconds": 7.199819087982178, |
|
"train/steps_per_hour": 312.5255014342565, |
|
"train/total_elapsed_hours": 1.3438903323809306 |
|
}, |
|
{ |
|
"epoch": 0.9979259259259259, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 3.4536923623096353e-10, |
|
"loss": 0.936, |
|
"step": 421 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 421, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0845947087447654e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|