diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24481 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5995203836930456, + "eval_steps": 250, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003996802557953637, + "grad_norm": 27253.678894810444, + "learning_rate": 0.0, + "loss": 88.7727, + "num_input_tokens_seen": 173048, + "step": 1 + }, + { + "epoch": 0.0003996802557953637, + "eval_websight_new_IoU": 0.007802221458405256, + "eval_websight_new_MAE_all": 0.22719886153936386, + "eval_websight_new_MAE_h": 0.1250949464738369, + "eval_websight_new_MAE_w": 0.26685621589422226, + "eval_websight_new_MAE_x": 0.23003952950239182, + "eval_websight_new_MAE_y": 0.28680478781461716, + "eval_websight_new_NUM_probability": 2.108378305276659e-09, + "eval_websight_new_inside_bbox": 0.0, + "eval_websight_new_loss": 42.4260139465332, + "eval_websight_new_loss_ce": 5.392822742462158, + "eval_websight_new_loss_xval": 39.01171875, + "eval_websight_new_runtime": 64.218, + "eval_websight_new_samples_per_second": 0.779, + "eval_websight_new_steps_per_second": 0.031, + "num_input_tokens_seen": 173048, + "step": 1 + }, + { + "epoch": 0.0003996802557953637, + "eval_seeclick_IoU": 0.013260291889309883, + "eval_seeclick_MAE_all": 0.3815549612045288, + "eval_seeclick_MAE_h": 0.40597620606422424, + "eval_seeclick_MAE_w": 0.4406091570854187, + "eval_seeclick_MAE_x": 0.3167571872472763, + "eval_seeclick_MAE_y": 0.3628772497177124, + "eval_seeclick_NUM_probability": 2.7780518996323167e-09, + "eval_seeclick_inside_bbox": 0.015625, + "eval_seeclick_loss": 46.768917083740234, + "eval_seeclick_loss_ce": 6.483319997787476, + "eval_seeclick_loss_xval": 39.03125, + "eval_seeclick_runtime": 84.0912, + "eval_seeclick_samples_per_second": 0.595, + "eval_seeclick_steps_per_second": 0.024, + "num_input_tokens_seen": 173048, + "step": 1 + }, + { + "epoch": 0.0003996802557953637, + "eval_icons_IoU": 9.602530917618424e-05, + "eval_icons_MAE_all": 0.24617066234350204, + "eval_icons_MAE_h": 0.16503974795341492, + "eval_icons_MAE_w": 0.14773830771446228, + "eval_icons_MAE_x": 0.3416582942008972, + "eval_icons_MAE_y": 0.3302464038133621, + "eval_icons_NUM_probability": 4.84057904870383e-10, + "eval_icons_inside_bbox": 0.0, + "eval_icons_loss": 22.6812686920166, + "eval_icons_loss_ce": 5.316617250442505, + "eval_icons_loss_xval": 18.2421875, + "eval_icons_runtime": 82.5017, + "eval_icons_samples_per_second": 0.606, + "eval_icons_steps_per_second": 0.024, + "num_input_tokens_seen": 173048, + "step": 1 + }, + { + "epoch": 0.0003996802557953637, + "loss": 22.062753677368164, + "loss_ce": 5.414316177368164, + "loss_xval": 16.625, + "num_input_tokens_seen": 173048, + "step": 1 + }, + { + "epoch": 0.0007993605115907274, + "grad_norm": 41154.27591780116, + "learning_rate": 6.276845846337281e-07, + "loss": 59.1779, + "num_input_tokens_seen": 346016, + "step": 2 + }, + { + "epoch": 0.0007993605115907274, + "loss": 79.56485748291016, + "loss_ce": 5.002355575561523, + "loss_xval": 74.5, + "num_input_tokens_seen": 346016, + "step": 2 + }, + { + "epoch": 0.001199040767386091, + "grad_norm": 32361.98582055134, + "learning_rate": 9.94856528925194e-07, + "loss": 59.2298, + "num_input_tokens_seen": 519088, + "step": 3 + }, + { + "epoch": 0.001199040767386091, + "loss": 53.204307556152344, + "loss_ce": 5.1574320793151855, + "loss_xval": 48.0, + 
"num_input_tokens_seen": 519088, + "step": 3 + }, + { + "epoch": 0.0015987210231814548, + "grad_norm": 18680.271876014704, + "learning_rate": 1.2553691692674561e-06, + "loss": 41.3081, + "num_input_tokens_seen": 691920, + "step": 4 + }, + { + "epoch": 0.0015987210231814548, + "loss": 39.82504653930664, + "loss_ce": 4.950046539306641, + "loss_xval": 35.0, + "num_input_tokens_seen": 691920, + "step": 4 + }, + { + "epoch": 0.0019984012789768186, + "grad_norm": 5449.561908994174, + "learning_rate": 1.4574384717887574e-06, + "loss": 27.6026, + "num_input_tokens_seen": 864928, + "step": 5 + }, + { + "epoch": 0.0019984012789768186, + "loss": 28.755176544189453, + "loss_ce": 4.9895524978637695, + "loss_xval": 23.75, + "num_input_tokens_seen": 864928, + "step": 5 + }, + { + "epoch": 0.002398081534772182, + "grad_norm": 3825.7071201245612, + "learning_rate": 1.622541113558922e-06, + "loss": 22.1129, + "num_input_tokens_seen": 1037544, + "step": 6 + }, + { + "epoch": 0.002398081534772182, + "loss": 20.575275421142578, + "loss_ce": 5.247150421142578, + "loss_xval": 15.3125, + "num_input_tokens_seen": 1037544, + "step": 6 + }, + { + "epoch": 0.002797761790567546, + "grad_norm": 3619.914754340841, + "learning_rate": 1.762133408171179e-06, + "loss": 21.0579, + "num_input_tokens_seen": 1209928, + "step": 7 + }, + { + "epoch": 0.002797761790567546, + "loss": 20.604846954345703, + "loss_ce": 5.347036361694336, + "loss_xval": 15.25, + "num_input_tokens_seen": 1209928, + "step": 7 + }, + { + "epoch": 0.0031974420463629096, + "grad_norm": 2391.231418170818, + "learning_rate": 1.8830537539011838e-06, + "loss": 19.8215, + "num_input_tokens_seen": 1382680, + "step": 8 + }, + { + "epoch": 0.0031974420463629096, + "loss": 20.428314208984375, + "loss_ce": 5.27987813949585, + "loss_xval": 15.125, + "num_input_tokens_seen": 1382680, + "step": 8 + }, + { + "epoch": 0.0035971223021582736, + "grad_norm": 2056.06429238663, + "learning_rate": 1.989713057850388e-06, + "loss": 17.948, + "num_input_tokens_seen": 1555768, + "step": 9 + }, + { + "epoch": 0.0035971223021582736, + "loss": 18.224111557006836, + "loss_ce": 5.302236557006836, + "loss_xval": 12.9375, + "num_input_tokens_seen": 1555768, + "step": 9 + }, + { + "epoch": 0.003996802557953637, + "grad_norm": 689.953934673361, + "learning_rate": 2.085123056422486e-06, + "loss": 15.8543, + "num_input_tokens_seen": 1729384, + "step": 10 + }, + { + "epoch": 0.003996802557953637, + "loss": 15.537797927856445, + "loss_ce": 5.276078701019287, + "loss_xval": 10.25, + "num_input_tokens_seen": 1729384, + "step": 10 + }, + { + "epoch": 0.004396482813749001, + "grad_norm": 785.0266162674142, + "learning_rate": 2.1714318986131375e-06, + "loss": 15.001, + "num_input_tokens_seen": 1902192, + "step": 11 + }, + { + "epoch": 0.004396482813749001, + "loss": 14.884502410888672, + "loss_ce": 5.197001934051514, + "loss_xval": 9.6875, + "num_input_tokens_seen": 1902192, + "step": 11 + }, + { + "epoch": 0.004796163069544364, + "grad_norm": 2016.36977346388, + "learning_rate": 2.25022569819265e-06, + "loss": 16.9301, + "num_input_tokens_seen": 2075288, + "step": 12 + }, + { + "epoch": 0.004796163069544364, + "loss": 16.57305145263672, + "loss_ce": 5.272271156311035, + "loss_xval": 11.3125, + "num_input_tokens_seen": 2075288, + "step": 12 + }, + { + "epoch": 0.005195843325339729, + "grad_norm": 3023.4049224405285, + "learning_rate": 2.3227089674435414e-06, + "loss": 18.3376, + "num_input_tokens_seen": 2247968, + "step": 13 + }, + { + "epoch": 0.005195843325339729, + "loss": 18.748249053955078, + 
"loss_ce": 5.170123100280762, + "loss_xval": 13.5625, + "num_input_tokens_seen": 2247968, + "step": 13 + }, + { + "epoch": 0.005595523581135092, + "grad_norm": 3434.7471291290594, + "learning_rate": 2.389817992804907e-06, + "loss": 19.9897, + "num_input_tokens_seen": 2420728, + "step": 14 + }, + { + "epoch": 0.005595523581135092, + "loss": 20.42713165283203, + "loss_ce": 5.231818675994873, + "loss_xval": 15.1875, + "num_input_tokens_seen": 2420728, + "step": 14 + }, + { + "epoch": 0.005995203836930456, + "grad_norm": 2501.863798675274, + "learning_rate": 2.4522950007139517e-06, + "loss": 16.8772, + "num_input_tokens_seen": 2593888, + "step": 15 + }, + { + "epoch": 0.005995203836930456, + "loss": 16.648954391479492, + "loss_ce": 5.109891891479492, + "loss_xval": 11.5625, + "num_input_tokens_seen": 2593888, + "step": 15 + }, + { + "epoch": 0.006394884092725819, + "grad_norm": 1750.9020613468717, + "learning_rate": 2.5107383385349122e-06, + "loss": 14.1557, + "num_input_tokens_seen": 2766600, + "step": 16 + }, + { + "epoch": 0.006394884092725819, + "loss": 13.727005958557129, + "loss_ce": 5.047318458557129, + "loss_xval": 8.6875, + "num_input_tokens_seen": 2766600, + "step": 16 + }, + { + "epoch": 0.006794564348521183, + "grad_norm": 227.32613482390494, + "learning_rate": 2.5656374157160176e-06, + "loss": 12.464, + "num_input_tokens_seen": 2939256, + "step": 17 + }, + { + "epoch": 0.006794564348521183, + "loss": 12.111137390136719, + "loss_ce": 5.044731140136719, + "loss_xval": 7.0625, + "num_input_tokens_seen": 2939256, + "step": 17 + }, + { + "epoch": 0.007194244604316547, + "grad_norm": 1252.224814993924, + "learning_rate": 2.6173976424841156e-06, + "loss": 13.1703, + "num_input_tokens_seen": 3112192, + "step": 18 + }, + { + "epoch": 0.007194244604316547, + "loss": 12.594108581542969, + "loss_ce": 4.973014831542969, + "loss_xval": 7.625, + "num_input_tokens_seen": 3112192, + "step": 18 + }, + { + "epoch": 0.007593924860111911, + "grad_norm": 2028.8518634432514, + "learning_rate": 2.6663586168300222e-06, + "loss": 14.8689, + "num_input_tokens_seen": 3282040, + "step": 19 + }, + { + "epoch": 0.007593924860111911, + "loss": 14.035215377807617, + "loss_ce": 4.902403354644775, + "loss_xval": 9.125, + "num_input_tokens_seen": 3282040, + "step": 19 + }, + { + "epoch": 0.007993605115907274, + "grad_norm": 2398.1141656415566, + "learning_rate": 2.712807641056214e-06, + "loss": 15.648, + "num_input_tokens_seen": 3455080, + "step": 20 + }, + { + "epoch": 0.007993605115907274, + "loss": 15.920930862426758, + "loss_ce": 4.975618839263916, + "loss_xval": 10.9375, + "num_input_tokens_seen": 3455080, + "step": 20 + }, + { + "epoch": 0.008393285371702638, + "grad_norm": 2410.923655837602, + "learning_rate": 2.756989937096373e-06, + "loss": 15.3804, + "num_input_tokens_seen": 3628016, + "step": 21 + }, + { + "epoch": 0.008393285371702638, + "loss": 15.361221313476562, + "loss_ce": 4.939347267150879, + "loss_xval": 10.4375, + "num_input_tokens_seen": 3628016, + "step": 21 + }, + { + "epoch": 0.008792965627498001, + "grad_norm": 1750.9570099084196, + "learning_rate": 2.799116483246866e-06, + "loss": 13.9054, + "num_input_tokens_seen": 3797864, + "step": 22 + }, + { + "epoch": 0.008792965627498001, + "loss": 13.663543701171875, + "loss_ce": 4.839325904846191, + "loss_xval": 8.8125, + "num_input_tokens_seen": 3797864, + "step": 22 + }, + { + "epoch": 0.009192645883293365, + "grad_norm": 1467.4142596254105, + "learning_rate": 2.8393701074525802e-06, + "loss": 12.8355, + "num_input_tokens_seen": 3970640, + 
"step": 23 + }, + { + "epoch": 0.009192645883293365, + "loss": 12.972114562988281, + "loss_ce": 4.925239562988281, + "loss_xval": 8.0625, + "num_input_tokens_seen": 3970640, + "step": 23 + }, + { + "epoch": 0.009592326139088728, + "grad_norm": 573.4862114776328, + "learning_rate": 2.8779102828263783e-06, + "loss": 12.2069, + "num_input_tokens_seen": 4143328, + "step": 24 + }, + { + "epoch": 0.009592326139088728, + "loss": 12.610280990600586, + "loss_ce": 4.926686763763428, + "loss_xval": 7.6875, + "num_input_tokens_seen": 4143328, + "step": 24 + }, + { + "epoch": 0.009992006394884092, + "grad_norm": 279.5844578074333, + "learning_rate": 2.914876943577515e-06, + "loss": 11.6072, + "num_input_tokens_seen": 4315720, + "step": 25 + }, + { + "epoch": 0.009992006394884092, + "loss": 12.251864433288574, + "loss_ce": 4.790926933288574, + "loss_xval": 7.46875, + "num_input_tokens_seen": 4315720, + "step": 25 + }, + { + "epoch": 0.010391686650679457, + "grad_norm": 745.4356271224037, + "learning_rate": 2.9503935520772694e-06, + "loss": 11.3671, + "num_input_tokens_seen": 4489000, + "step": 26 + }, + { + "epoch": 0.010391686650679457, + "loss": 11.730391502380371, + "loss_ce": 4.703047275543213, + "loss_xval": 7.03125, + "num_input_tokens_seen": 4489000, + "step": 26 + }, + { + "epoch": 0.01079136690647482, + "grad_norm": 1260.0413305023137, + "learning_rate": 2.9845695867755812e-06, + "loss": 12.7243, + "num_input_tokens_seen": 4661752, + "step": 27 + }, + { + "epoch": 0.01079136690647482, + "loss": 12.388510704040527, + "loss_ce": 4.7322611808776855, + "loss_xval": 7.65625, + "num_input_tokens_seen": 4661752, + "step": 27 + }, + { + "epoch": 0.011191047162270184, + "grad_norm": 1537.0491298782129, + "learning_rate": 3.017502577438635e-06, + "loss": 12.7051, + "num_input_tokens_seen": 4834536, + "step": 28 + }, + { + "epoch": 0.011191047162270184, + "loss": 13.197071075439453, + "loss_ce": 4.650196075439453, + "loss_xval": 8.5625, + "num_input_tokens_seen": 4834536, + "step": 28 + }, + { + "epoch": 0.011590727418065548, + "grad_norm": 1481.2643398938644, + "learning_rate": 3.0492797830851952e-06, + "loss": 12.6727, + "num_input_tokens_seen": 5007272, + "step": 29 + }, + { + "epoch": 0.011590727418065548, + "loss": 11.455208778381348, + "loss_ce": 4.564583778381348, + "loss_xval": 6.875, + "num_input_tokens_seen": 5007272, + "step": 29 + }, + { + "epoch": 0.011990407673860911, + "grad_norm": 1043.4564438418174, + "learning_rate": 3.079979585347679e-06, + "loss": 11.1842, + "num_input_tokens_seen": 5178384, + "step": 30 + }, + { + "epoch": 0.011990407673860911, + "loss": 11.960172653198242, + "loss_ce": 4.397672653198242, + "loss_xval": 7.5625, + "num_input_tokens_seen": 5178384, + "step": 30 + }, + { + "epoch": 0.012390087929656275, + "grad_norm": 792.8424890447506, + "learning_rate": 3.1096726532791336e-06, + "loss": 10.8114, + "num_input_tokens_seen": 5350952, + "step": 31 + }, + { + "epoch": 0.012390087929656275, + "loss": 10.764305114746094, + "loss_ce": 4.383445739746094, + "loss_xval": 6.375, + "num_input_tokens_seen": 5350952, + "step": 31 + }, + { + "epoch": 0.012789768185451638, + "grad_norm": 145.9379236072859, + "learning_rate": 3.13842292316864e-06, + "loss": 10.5285, + "num_input_tokens_seen": 5524152, + "step": 32 + }, + { + "epoch": 0.012789768185451638, + "loss": 11.266403198242188, + "loss_ce": 4.449997901916504, + "loss_xval": 6.8125, + "num_input_tokens_seen": 5524152, + "step": 32 + }, + { + "epoch": 0.013189448441247002, + "grad_norm": 261.9315218162029, + "learning_rate": 
3.1662884275383315e-06, + "loss": 9.8499, + "num_input_tokens_seen": 5697032, + "step": 33 + }, + { + "epoch": 0.013189448441247002, + "loss": 9.855939865112305, + "loss_ce": 4.297346591949463, + "loss_xval": 5.5625, + "num_input_tokens_seen": 5697032, + "step": 33 + }, + { + "epoch": 0.013589128697042365, + "grad_norm": 403.77946842148157, + "learning_rate": 3.1933220003497456e-06, + "loss": 9.9114, + "num_input_tokens_seen": 5869808, + "step": 34 + }, + { + "epoch": 0.013589128697042365, + "loss": 10.173905372619629, + "loss_ce": 4.193436622619629, + "loss_xval": 5.96875, + "num_input_tokens_seen": 5869808, + "step": 34 + }, + { + "epoch": 0.013988808952837729, + "grad_norm": 1035.8631252474581, + "learning_rate": 3.2195718799599367e-06, + "loss": 10.7195, + "num_input_tokens_seen": 6043328, + "step": 35 + }, + { + "epoch": 0.013988808952837729, + "loss": 10.887596130371094, + "loss_ce": 4.102439880371094, + "loss_xval": 6.78125, + "num_input_tokens_seen": 6043328, + "step": 35 + }, + { + "epoch": 0.014388489208633094, + "grad_norm": 1149.9950193906377, + "learning_rate": 3.245082227117844e-06, + "loss": 10.5161, + "num_input_tokens_seen": 6216344, + "step": 36 + }, + { + "epoch": 0.014388489208633094, + "loss": 10.513891220092773, + "loss_ce": 3.982640504837036, + "loss_xval": 6.53125, + "num_input_tokens_seen": 6216344, + "step": 36 + }, + { + "epoch": 0.014788169464428458, + "grad_norm": 889.0168898022567, + "learning_rate": 3.2698935719735842e-06, + "loss": 10.6335, + "num_input_tokens_seen": 6389384, + "step": 37 + }, + { + "epoch": 0.014788169464428458, + "loss": 10.846721649169922, + "loss_ce": 3.956096649169922, + "loss_xval": 6.875, + "num_input_tokens_seen": 6389384, + "step": 37 + }, + { + "epoch": 0.015187849720223821, + "grad_norm": 962.657302962428, + "learning_rate": 3.29404320146375e-06, + "loss": 9.9492, + "num_input_tokens_seen": 6562488, + "step": 38 + }, + { + "epoch": 0.015187849720223821, + "loss": 10.308053970336914, + "loss_ce": 4.020944595336914, + "loss_xval": 6.28125, + "num_input_tokens_seen": 6562488, + "step": 38 + }, + { + "epoch": 0.015587529976019185, + "grad_norm": 194.46637819778587, + "learning_rate": 3.3175654963687346e-06, + "loss": 9.1389, + "num_input_tokens_seen": 6735512, + "step": 39 + }, + { + "epoch": 0.015587529976019185, + "loss": 9.079648971557617, + "loss_ce": 3.8530867099761963, + "loss_xval": 5.21875, + "num_input_tokens_seen": 6735512, + "step": 39 + }, + { + "epoch": 0.01598721023181455, + "grad_norm": 483.308185338452, + "learning_rate": 3.340492225689942e-06, + "loss": 9.9182, + "num_input_tokens_seen": 6908304, + "step": 40 + }, + { + "epoch": 0.01598721023181455, + "loss": 9.45429515838623, + "loss_ce": 3.7667951583862305, + "loss_xval": 5.6875, + "num_input_tokens_seen": 6908304, + "step": 40 + }, + { + "epoch": 0.016386890487609912, + "grad_norm": 664.1523962678483, + "learning_rate": 3.3628528046722993e-06, + "loss": 9.4767, + "num_input_tokens_seen": 7081120, + "step": 41 + }, + { + "epoch": 0.016386890487609912, + "loss": 9.607213020324707, + "loss_ce": 3.615025043487549, + "loss_xval": 6.0, + "num_input_tokens_seen": 7081120, + "step": 41 + }, + { + "epoch": 0.016786570743405275, + "grad_norm": 903.1201640859815, + "learning_rate": 3.3846745217301015e-06, + "loss": 10.2732, + "num_input_tokens_seen": 7253696, + "step": 42 + }, + { + "epoch": 0.016786570743405275, + "loss": 10.472900390625, + "loss_ce": 3.750244617462158, + "loss_xval": 6.71875, + "num_input_tokens_seen": 7253696, + "step": 42 + }, + { + "epoch": 
0.01718625099920064, + "grad_norm": 871.80664974741, + "learning_rate": 3.4059827386678244e-06, + "loss": 10.2289, + "num_input_tokens_seen": 7426136, + "step": 43 + }, + { + "epoch": 0.01718625099920064, + "loss": 10.360870361328125, + "loss_ce": 3.575714588165283, + "loss_xval": 6.78125, + "num_input_tokens_seen": 7426136, + "step": 43 + }, + { + "epoch": 0.017585931254996003, + "grad_norm": 602.4666850744355, + "learning_rate": 3.4268010678805934e-06, + "loss": 9.0809, + "num_input_tokens_seen": 7599104, + "step": 44 + }, + { + "epoch": 0.017585931254996003, + "loss": 9.004478454589844, + "loss_ce": 3.3775248527526855, + "loss_xval": 5.625, + "num_input_tokens_seen": 7599104, + "step": 44 + }, + { + "epoch": 0.017985611510791366, + "grad_norm": 137.35583139064687, + "learning_rate": 3.447151529639145e-06, + "loss": 8.3695, + "num_input_tokens_seen": 7772208, + "step": 45 + }, + { + "epoch": 0.017985611510791366, + "loss": 8.689802169799805, + "loss_ce": 3.4398021697998047, + "loss_xval": 5.25, + "num_input_tokens_seen": 7772208, + "step": 45 + }, + { + "epoch": 0.01838529176658673, + "grad_norm": 478.5359664685662, + "learning_rate": 3.4670546920863086e-06, + "loss": 8.1446, + "num_input_tokens_seen": 7944840, + "step": 46 + }, + { + "epoch": 0.01838529176658673, + "loss": 7.91407585144043, + "loss_ce": 3.4550914764404297, + "loss_xval": 4.46875, + "num_input_tokens_seen": 7944840, + "step": 46 + }, + { + "epoch": 0.018784972022382093, + "grad_norm": 557.9408362107046, + "learning_rate": 3.4865297961764146e-06, + "loss": 8.3954, + "num_input_tokens_seen": 8118024, + "step": 47 + }, + { + "epoch": 0.018784972022382093, + "loss": 8.321405410766602, + "loss_ce": 3.3067569732666016, + "loss_xval": 5.0, + "num_input_tokens_seen": 8118024, + "step": 47 + }, + { + "epoch": 0.019184652278177457, + "grad_norm": 846.6195239782504, + "learning_rate": 3.5055948674601067e-06, + "loss": 8.578, + "num_input_tokens_seen": 8290688, + "step": 48 + }, + { + "epoch": 0.019184652278177457, + "loss": 9.086427688598633, + "loss_ce": 3.121584892272949, + "loss_xval": 5.96875, + "num_input_tokens_seen": 8290688, + "step": 48 + }, + { + "epoch": 0.01958433253397282, + "grad_norm": 590.9741857230598, + "learning_rate": 3.524266816342358e-06, + "loss": 7.7949, + "num_input_tokens_seen": 8463320, + "step": 49 + }, + { + "epoch": 0.01958433253397282, + "loss": 7.50385046005249, + "loss_ce": 3.1073663234710693, + "loss_xval": 4.40625, + "num_input_tokens_seen": 8463320, + "step": 49 + }, + { + "epoch": 0.019984012789768184, + "grad_norm": 261.24703891283673, + "learning_rate": 3.542561528211243e-06, + "loss": 7.8687, + "num_input_tokens_seen": 8636560, + "step": 50 + }, + { + "epoch": 0.019984012789768184, + "loss": 7.583156108856201, + "loss_ce": 3.1534688472747803, + "loss_xval": 4.4375, + "num_input_tokens_seen": 8636560, + "step": 50 + }, + { + "epoch": 0.02038369304556355, + "grad_norm": 322.9570931125352, + "learning_rate": 3.5604939446412112e-06, + "loss": 7.6471, + "num_input_tokens_seen": 8809720, + "step": 51 + }, + { + "epoch": 0.02038369304556355, + "eval_websight_new_IoU": 0.030084313824772835, + "eval_websight_new_MAE_all": 0.1372687742114067, + "eval_websight_new_MAE_h": 0.10130885243415833, + "eval_websight_new_MAE_w": 0.14034898951649666, + "eval_websight_new_MAE_x": 0.08298783376812935, + "eval_websight_new_MAE_y": 0.2244294062256813, + "eval_websight_new_NUM_probability": 1.1027492252679849e-08, + "eval_websight_new_inside_bbox": 0.03125, + "eval_websight_new_loss": 7.040191650390625, + 
"eval_websight_new_loss_ce": 3.3154985904693604, + "eval_websight_new_loss_xval": 3.492431640625, + "eval_websight_new_runtime": 55.9794, + "eval_websight_new_samples_per_second": 0.893, + "eval_websight_new_steps_per_second": 0.036, + "num_input_tokens_seen": 8809720, + "step": 51 + }, + { + "epoch": 0.02038369304556355, + "eval_seeclick_IoU": 0.025128517299890518, + "eval_seeclick_MAE_all": 0.20313503593206406, + "eval_seeclick_MAE_h": 0.13093940913677216, + "eval_seeclick_MAE_w": 0.2555273696780205, + "eval_seeclick_MAE_x": 0.21082086116075516, + "eval_seeclick_MAE_y": 0.21525250375270844, + "eval_seeclick_NUM_probability": 1.2050559661247462e-08, + "eval_seeclick_inside_bbox": 0.07465277798473835, + "eval_seeclick_loss": 10.326189994812012, + "eval_seeclick_loss_ce": 3.7456430196762085, + "eval_seeclick_loss_xval": 7.052734375, + "eval_seeclick_runtime": 82.3558, + "eval_seeclick_samples_per_second": 0.607, + "eval_seeclick_steps_per_second": 0.024, + "num_input_tokens_seen": 8809720, + "step": 51 + }, + { + "epoch": 0.02038369304556355, + "eval_icons_IoU": 0.0, + "eval_icons_MAE_all": 0.12515902519226074, + "eval_icons_MAE_h": 0.027381721884012222, + "eval_icons_MAE_w": 0.05067274160683155, + "eval_icons_MAE_x": 0.18365809321403503, + "eval_icons_MAE_y": 0.23892351984977722, + "eval_icons_NUM_probability": 1.5372147998959917e-08, + "eval_icons_inside_bbox": 0.0, + "eval_icons_loss": 7.085067272186279, + "eval_icons_loss_ce": 3.12102210521698, + "eval_icons_loss_xval": 3.6298828125, + "eval_icons_runtime": 89.2513, + "eval_icons_samples_per_second": 0.56, + "eval_icons_steps_per_second": 0.022, + "num_input_tokens_seen": 8809720, + "step": 51 + }, + { + "epoch": 0.02038369304556355, + "loss": 5.90113639831543, + "loss_ce": 3.1316046714782715, + "loss_xval": 2.765625, + "num_input_tokens_seen": 8809720, + "step": 51 + }, + { + "epoch": 0.020783373301358914, + "grad_norm": 591.8732945437766, + "learning_rate": 3.5780781367109973e-06, + "loss": 7.3836, + "num_input_tokens_seen": 8982736, + "step": 52 + }, + { + "epoch": 0.020783373301358914, + "loss": 7.991217136383057, + "loss_ce": 3.0302796363830566, + "loss_xval": 4.96875, + "num_input_tokens_seen": 8982736, + "step": 52 + }, + { + "epoch": 0.021183053557154278, + "grad_norm": 626.8464467894951, + "learning_rate": 3.5953273713375363e-06, + "loss": 7.577, + "num_input_tokens_seen": 9155480, + "step": 53 + }, + { + "epoch": 0.021183053557154278, + "loss": 7.484159469604492, + "loss_ce": 2.9724409580230713, + "loss_xval": 4.5, + "num_input_tokens_seen": 9155480, + "step": 53 + }, + { + "epoch": 0.02158273381294964, + "grad_norm": 256.7768068798319, + "learning_rate": 3.6122541714093096e-06, + "loss": 6.6296, + "num_input_tokens_seen": 9328328, + "step": 54 + }, + { + "epoch": 0.02158273381294964, + "loss": 6.372915744781494, + "loss_ce": 2.915884494781494, + "loss_xval": 3.453125, + "num_input_tokens_seen": 9328328, + "step": 54 + }, + { + "epoch": 0.021982414068745005, + "grad_norm": 211.77074553660972, + "learning_rate": 3.628870370401895e-06, + "loss": 6.8308, + "num_input_tokens_seen": 9501408, + "step": 55 + }, + { + "epoch": 0.021982414068745005, + "loss": 6.57294225692749, + "loss_ce": 2.8893485069274902, + "loss_xval": 3.6875, + "num_input_tokens_seen": 9501408, + "step": 55 + }, + { + "epoch": 0.02238209432454037, + "grad_norm": 192.3988438237558, + "learning_rate": 3.645187162072364e-06, + "loss": 7.7137, + "num_input_tokens_seen": 9674048, + "step": 56 + }, + { + "epoch": 0.02238209432454037, + "loss": 7.810283660888672, + 
"loss_ce": 2.751690149307251, + "loss_xval": 5.0625, + "num_input_tokens_seen": 9674048, + "step": 56 + }, + { + "epoch": 0.022781774580335732, + "grad_norm": 659.8576946924561, + "learning_rate": 3.6612151457552162e-06, + "loss": 6.6531, + "num_input_tokens_seen": 9847000, + "step": 57 + }, + { + "epoch": 0.022781774580335732, + "loss": 5.987558364868164, + "loss_ce": 2.647714614868164, + "loss_xval": 3.34375, + "num_input_tokens_seen": 9847000, + "step": 57 + }, + { + "epoch": 0.023181454836131096, + "grad_norm": 664.7341132161741, + "learning_rate": 3.6769643677189227e-06, + "loss": 7.0286, + "num_input_tokens_seen": 10019880, + "step": 58 + }, + { + "epoch": 0.023181454836131096, + "loss": 7.144340515136719, + "loss_ce": 2.720512628555298, + "loss_xval": 4.4375, + "num_input_tokens_seen": 10019880, + "step": 58 + }, + { + "epoch": 0.02358113509192646, + "grad_norm": 212.02522543694363, + "learning_rate": 3.692444358987175e-06, + "loss": 7.0622, + "num_input_tokens_seen": 10192832, + "step": 59 + }, + { + "epoch": 0.02358113509192646, + "loss": 6.224069595336914, + "loss_ce": 2.544382333755493, + "loss_xval": 3.6875, + "num_input_tokens_seen": 10192832, + "step": 59 + }, + { + "epoch": 0.023980815347721823, + "grad_norm": 165.01048096537636, + "learning_rate": 3.707664169981407e-06, + "loss": 6.2673, + "num_input_tokens_seen": 10365984, + "step": 60 + }, + { + "epoch": 0.023980815347721823, + "loss": 6.832554817199707, + "loss_ce": 2.652867555618286, + "loss_xval": 4.1875, + "num_input_tokens_seen": 10365984, + "step": 60 + }, + { + "epoch": 0.024380495603517186, + "grad_norm": 483.00531361229116, + "learning_rate": 3.7226324022999028e-06, + "loss": 6.6506, + "num_input_tokens_seen": 10538928, + "step": 61 + }, + { + "epoch": 0.024380495603517186, + "loss": 7.050806045532227, + "loss_ce": 2.4297122955322266, + "loss_xval": 4.625, + "num_input_tokens_seen": 10538928, + "step": 61 + }, + { + "epoch": 0.02478017585931255, + "grad_norm": 572.7726598458358, + "learning_rate": 3.737357237912862e-06, + "loss": 6.9383, + "num_input_tokens_seen": 10711776, + "step": 62 + }, + { + "epoch": 0.02478017585931255, + "loss": 6.301916122436523, + "loss_ce": 2.4972286224365234, + "loss_xval": 3.8125, + "num_input_tokens_seen": 10711776, + "step": 62 + }, + { + "epoch": 0.025179856115107913, + "grad_norm": 199.8374697737695, + "learning_rate": 3.751846466021567e-06, + "loss": 6.6134, + "num_input_tokens_seen": 10884920, + "step": 63 + }, + { + "epoch": 0.025179856115107913, + "loss": 6.5081939697265625, + "loss_ce": 2.4437410831451416, + "loss_xval": 4.0625, + "num_input_tokens_seen": 10884920, + "step": 63 + }, + { + "epoch": 0.025579536370903277, + "grad_norm": 179.37701762975175, + "learning_rate": 3.7661075078023677e-06, + "loss": 6.0898, + "num_input_tokens_seen": 11057376, + "step": 64 + }, + { + "epoch": 0.025579536370903277, + "loss": 5.734729290008545, + "loss_ce": 2.328479290008545, + "loss_xval": 3.40625, + "num_input_tokens_seen": 11057376, + "step": 64 + }, + { + "epoch": 0.02597921662669864, + "grad_norm": 190.80330379121816, + "learning_rate": 3.7801474392322986e-06, + "loss": 5.5901, + "num_input_tokens_seen": 11229816, + "step": 65 + }, + { + "epoch": 0.02597921662669864, + "loss": 5.417404651641846, + "loss_ce": 2.1576390266418457, + "loss_xval": 3.265625, + "num_input_tokens_seen": 11229816, + "step": 65 + }, + { + "epoch": 0.026378896882494004, + "grad_norm": 375.5172678891961, + "learning_rate": 3.793973012172059e-06, + "loss": 5.7052, + "num_input_tokens_seen": 11402552, + "step": 
66 + }, + { + "epoch": 0.026378896882494004, + "loss": 5.661341667175293, + "loss_ce": 2.294153928756714, + "loss_xval": 3.375, + "num_input_tokens_seen": 11402552, + "step": 66 + }, + { + "epoch": 0.026778577138289367, + "grad_norm": 331.5482112197834, + "learning_rate": 3.807590673863634e-06, + "loss": 5.6927, + "num_input_tokens_seen": 11575584, + "step": 67 + }, + { + "epoch": 0.026778577138289367, + "loss": 5.483163833618164, + "loss_ce": 2.371835708618164, + "loss_xval": 3.109375, + "num_input_tokens_seen": 11575584, + "step": 67 + }, + { + "epoch": 0.02717825739408473, + "grad_norm": 118.69682377879631, + "learning_rate": 3.8210065849834735e-06, + "loss": 5.503, + "num_input_tokens_seen": 11744688, + "step": 68 + }, + { + "epoch": 0.02717825739408473, + "loss": 4.790616989135742, + "loss_ce": 2.1099531650543213, + "loss_xval": 2.6875, + "num_input_tokens_seen": 11744688, + "step": 68 + }, + { + "epoch": 0.027577937649880094, + "grad_norm": 209.335686573374, + "learning_rate": 3.834226636377774e-06, + "loss": 4.9825, + "num_input_tokens_seen": 11917488, + "step": 69 + }, + { + "epoch": 0.027577937649880094, + "loss": 5.30062198638916, + "loss_ce": 2.11995792388916, + "loss_xval": 3.1875, + "num_input_tokens_seen": 11917488, + "step": 69 + }, + { + "epoch": 0.027977617905675458, + "grad_norm": 183.04183416930314, + "learning_rate": 3.847256464593665e-06, + "loss": 5.1639, + "num_input_tokens_seen": 12090624, + "step": 70 + }, + { + "epoch": 0.027977617905675458, + "loss": 5.195356369018555, + "loss_ce": 2.033247232437134, + "loss_xval": 3.15625, + "num_input_tokens_seen": 12090624, + "step": 70 + }, + { + "epoch": 0.028377298161470825, + "grad_norm": 258.67109460163607, + "learning_rate": 3.860101466308762e-06, + "loss": 5.6155, + "num_input_tokens_seen": 12263440, + "step": 71 + }, + { + "epoch": 0.028377298161470825, + "loss": 5.879308223724365, + "loss_ce": 2.093175172805786, + "loss_xval": 3.78125, + "num_input_tokens_seen": 12263440, + "step": 71 + }, + { + "epoch": 0.02877697841726619, + "grad_norm": 216.22409083276455, + "learning_rate": 3.872766811751572e-06, + "loss": 5.8319, + "num_input_tokens_seen": 12433088, + "step": 72 + }, + { + "epoch": 0.02877697841726619, + "loss": 5.86053466796875, + "loss_ce": 2.049011468887329, + "loss_xval": 3.8125, + "num_input_tokens_seen": 12433088, + "step": 72 + }, + { + "epoch": 0.029176658673061552, + "grad_norm": 224.64777003998142, + "learning_rate": 3.8852574571962525e-06, + "loss": 5.0139, + "num_input_tokens_seen": 12602080, + "step": 73 + }, + { + "epoch": 0.029176658673061552, + "loss": 5.154097557067871, + "loss_ce": 1.9460902214050293, + "loss_xval": 3.203125, + "num_input_tokens_seen": 12602080, + "step": 73 + }, + { + "epoch": 0.029576338928856916, + "grad_norm": 188.87320237119758, + "learning_rate": 3.897578156607312e-06, + "loss": 4.8818, + "num_input_tokens_seen": 12772032, + "step": 74 + }, + { + "epoch": 0.029576338928856916, + "loss": 4.89943790435791, + "loss_ce": 2.05178165435791, + "loss_xval": 2.84375, + "num_input_tokens_seen": 12772032, + "step": 74 + }, + { + "epoch": 0.02997601918465228, + "grad_norm": 239.00340379876698, + "learning_rate": 3.9097334725027084e-06, + "loss": 4.9708, + "num_input_tokens_seen": 12944640, + "step": 75 + }, + { + "epoch": 0.02997601918465228, + "loss": 5.254262924194336, + "loss_ce": 1.935903549194336, + "loss_xval": 3.3125, + "num_input_tokens_seen": 12944640, + "step": 75 + }, + { + "epoch": 0.030375699440447643, + "grad_norm": 175.64097497094016, + "learning_rate": 
3.921727786097478e-06, + "loss": 4.8671, + "num_input_tokens_seen": 13117608, + "step": 76 + }, + { + "epoch": 0.030375699440447643, + "loss": 4.83261251449585, + "loss_ce": 1.8843704462051392, + "loss_xval": 2.953125, + "num_input_tokens_seen": 13117608, + "step": 76 + }, + { + "epoch": 0.030775379696243006, + "grad_norm": 171.78683132868542, + "learning_rate": 3.933565306784317e-06, + "loss": 4.9704, + "num_input_tokens_seen": 13290680, + "step": 77 + }, + { + "epoch": 0.030775379696243006, + "loss": 4.89105224609375, + "loss_ce": 1.9115601778030396, + "loss_xval": 2.984375, + "num_input_tokens_seen": 13290680, + "step": 77 + }, + { + "epoch": 0.03117505995203837, + "grad_norm": 179.14747541729278, + "learning_rate": 3.945250081002463e-06, + "loss": 4.7534, + "num_input_tokens_seen": 13464144, + "step": 78 + }, + { + "epoch": 0.03117505995203837, + "loss": 4.642127513885498, + "loss_ce": 1.9326547384262085, + "loss_xval": 2.703125, + "num_input_tokens_seen": 13464144, + "step": 78 + }, + { + "epoch": 0.03157474020783373, + "grad_norm": 137.37235263944487, + "learning_rate": 3.956786000541636e-06, + "loss": 4.239, + "num_input_tokens_seen": 13637160, + "step": 79 + }, + { + "epoch": 0.03157474020783373, + "loss": 4.785009860992432, + "loss_ce": 1.7830569744110107, + "loss_xval": 3.0, + "num_input_tokens_seen": 13637160, + "step": 79 + }, + { + "epoch": 0.0319744204636291, + "grad_norm": 186.29416857398633, + "learning_rate": 3.96817681032367e-06, + "loss": 5.1584, + "num_input_tokens_seen": 13810040, + "step": 80 + }, + { + "epoch": 0.0319744204636291, + "loss": 5.067818641662598, + "loss_ce": 1.8412563800811768, + "loss_xval": 3.21875, + "num_input_tokens_seen": 13810040, + "step": 80 + }, + { + "epoch": 0.03237410071942446, + "grad_norm": 260.14001469149053, + "learning_rate": 3.979426115700776e-06, + "loss": 4.9267, + "num_input_tokens_seen": 13982936, + "step": 81 + }, + { + "epoch": 0.03237410071942446, + "loss": 4.497675895690918, + "loss_ce": 1.7955272197723389, + "loss_xval": 2.703125, + "num_input_tokens_seen": 13982936, + "step": 81 + }, + { + "epoch": 0.032773780975219824, + "grad_norm": 216.40429405771704, + "learning_rate": 3.990537389306027e-06, + "loss": 5.5378, + "num_input_tokens_seen": 14156248, + "step": 82 + }, + { + "epoch": 0.032773780975219824, + "loss": 5.3804426193237305, + "loss_ce": 1.7896225452423096, + "loss_xval": 3.59375, + "num_input_tokens_seen": 14156248, + "step": 82 + }, + { + "epoch": 0.03317346123101519, + "grad_norm": 549.5248800116727, + "learning_rate": 4.001513977488632e-06, + "loss": 4.9184, + "num_input_tokens_seen": 14329344, + "step": 83 + }, + { + "epoch": 0.03317346123101519, + "loss": 4.968637466430664, + "loss_ce": 1.767465353012085, + "loss_xval": 3.203125, + "num_input_tokens_seen": 14329344, + "step": 83 + }, + { + "epoch": 0.03357314148681055, + "grad_norm": 194.0581771356025, + "learning_rate": 4.012359106363829e-06, + "loss": 4.3067, + "num_input_tokens_seen": 14502072, + "step": 84 + }, + { + "epoch": 0.03357314148681055, + "loss": 4.593915939331055, + "loss_ce": 1.6945018768310547, + "loss_xval": 2.90625, + "num_input_tokens_seen": 14502072, + "step": 84 + }, + { + "epoch": 0.033972821742605915, + "grad_norm": 516.6266208234828, + "learning_rate": 4.023075887504775e-06, + "loss": 4.4029, + "num_input_tokens_seen": 14675008, + "step": 85 + }, + { + "epoch": 0.033972821742605915, + "loss": 4.0735392570495605, + "loss_ce": 1.7053749561309814, + "loss_xval": 2.375, + "num_input_tokens_seen": 14675008, + "step": 85 + }, + { + "epoch": 
0.03437250199840128, + "grad_norm": 374.0473462885556, + "learning_rate": 4.033667323301552e-06, + "loss": 4.1857, + "num_input_tokens_seen": 14847768, + "step": 86 + }, + { + "epoch": 0.03437250199840128, + "loss": 4.331335067749023, + "loss_ce": 1.7405146360397339, + "loss_xval": 2.59375, + "num_input_tokens_seen": 14847768, + "step": 86 + }, + { + "epoch": 0.03477218225419664, + "grad_norm": 156.7332678793166, + "learning_rate": 4.044136312010388e-06, + "loss": 4.2331, + "num_input_tokens_seen": 15020560, + "step": 87 + }, + { + "epoch": 0.03477218225419664, + "loss": 3.9675381183624268, + "loss_ce": 1.6716396808624268, + "loss_xval": 2.296875, + "num_input_tokens_seen": 15020560, + "step": 87 + }, + { + "epoch": 0.035171862509992005, + "grad_norm": 319.8318187587833, + "learning_rate": 4.0544856525143226e-06, + "loss": 4.8582, + "num_input_tokens_seen": 15193280, + "step": 88 + }, + { + "epoch": 0.035171862509992005, + "loss": 4.755683422088623, + "loss_ce": 1.5857617855072021, + "loss_xval": 3.171875, + "num_input_tokens_seen": 15193280, + "step": 88 + }, + { + "epoch": 0.03557154276578737, + "grad_norm": 644.9710013537776, + "learning_rate": 4.064718048814889e-06, + "loss": 4.9109, + "num_input_tokens_seen": 15366384, + "step": 89 + }, + { + "epoch": 0.03557154276578737, + "loss": 4.91407585144043, + "loss_ce": 1.6425917148590088, + "loss_xval": 3.265625, + "num_input_tokens_seen": 15366384, + "step": 89 + }, + { + "epoch": 0.03597122302158273, + "grad_norm": 246.6542571417044, + "learning_rate": 4.074836114272873e-06, + "loss": 4.3901, + "num_input_tokens_seen": 15539432, + "step": 90 + }, + { + "epoch": 0.03597122302158273, + "loss": 4.276044845581055, + "loss_ce": 1.6398143768310547, + "loss_xval": 2.640625, + "num_input_tokens_seen": 15539432, + "step": 90 + }, + { + "epoch": 0.036370903277378096, + "grad_norm": 455.99645617059537, + "learning_rate": 4.08484237561472e-06, + "loss": 4.7347, + "num_input_tokens_seen": 15712712, + "step": 91 + }, + { + "epoch": 0.036370903277378096, + "loss": 5.283034324645996, + "loss_ce": 1.6326435804367065, + "loss_xval": 3.65625, + "num_input_tokens_seen": 15712712, + "step": 91 + }, + { + "epoch": 0.03677058353317346, + "grad_norm": 196.25249760645295, + "learning_rate": 4.094739276720037e-06, + "loss": 4.3173, + "num_input_tokens_seen": 15885664, + "step": 92 + }, + { + "epoch": 0.03677058353317346, + "loss": 4.700507640838623, + "loss_ce": 1.668281078338623, + "loss_xval": 3.03125, + "num_input_tokens_seen": 15885664, + "step": 92 + }, + { + "epoch": 0.03717026378896882, + "grad_norm": 285.7770874903587, + "learning_rate": 4.1045291822043285e-06, + "loss": 4.0221, + "num_input_tokens_seen": 16058736, + "step": 93 + }, + { + "epoch": 0.03717026378896882, + "loss": 4.253849983215332, + "loss_ce": 1.6098066568374634, + "loss_xval": 2.640625, + "num_input_tokens_seen": 16058736, + "step": 93 + }, + { + "epoch": 0.037569944044764186, + "grad_norm": 204.95844707804565, + "learning_rate": 4.1142143808101425e-06, + "loss": 4.2728, + "num_input_tokens_seen": 16231688, + "step": 94 + }, + { + "epoch": 0.037569944044764186, + "loss": 4.915648460388184, + "loss_ce": 1.6168203353881836, + "loss_xval": 3.296875, + "num_input_tokens_seen": 16231688, + "step": 94 + }, + { + "epoch": 0.03796962430055955, + "grad_norm": 156.8292452749296, + "learning_rate": 4.123797088618779e-06, + "loss": 4.1215, + "num_input_tokens_seen": 16404472, + "step": 95 + }, + { + "epoch": 0.03796962430055955, + "loss": 4.639373779296875, + "loss_ce": 1.5382994413375854, + 
"loss_xval": 3.09375, + "num_input_tokens_seen": 16404472, + "step": 95 + }, + { + "epoch": 0.03836930455635491, + "grad_norm": 131.9876314168009, + "learning_rate": 4.133279452093834e-06, + "loss": 4.1419, + "num_input_tokens_seen": 16577504, + "step": 96 + }, + { + "epoch": 0.03836930455635491, + "loss": 4.308347702026367, + "loss_ce": 1.5310044288635254, + "loss_xval": 2.78125, + "num_input_tokens_seen": 16577504, + "step": 96 + }, + { + "epoch": 0.03876898481215028, + "grad_norm": 110.37162259406017, + "learning_rate": 4.142663550967035e-06, + "loss": 3.6569, + "num_input_tokens_seen": 16750808, + "step": 97 + }, + { + "epoch": 0.03876898481215028, + "loss": 3.53420352935791, + "loss_ce": 1.5429925918579102, + "loss_xval": 1.9921875, + "num_input_tokens_seen": 16750808, + "step": 97 + }, + { + "epoch": 0.03916866506794564, + "grad_norm": 168.5095133634534, + "learning_rate": 4.151951400976087e-06, + "loss": 3.7882, + "num_input_tokens_seen": 16923992, + "step": 98 + }, + { + "epoch": 0.03916866506794564, + "loss": 3.392901659011841, + "loss_ce": 1.4983705282211304, + "loss_xval": 1.890625, + "num_input_tokens_seen": 16923992, + "step": 98 + }, + { + "epoch": 0.039568345323741004, + "grad_norm": 114.59340514251383, + "learning_rate": 4.161144956463525e-06, + "loss": 3.8205, + "num_input_tokens_seen": 17096680, + "step": 99 + }, + { + "epoch": 0.039568345323741004, + "loss": 3.6567561626434326, + "loss_ce": 1.485857605934143, + "loss_xval": 2.171875, + "num_input_tokens_seen": 17096680, + "step": 99 + }, + { + "epoch": 0.03996802557953637, + "grad_norm": 203.8901770985848, + "learning_rate": 4.170246112844972e-06, + "loss": 3.8433, + "num_input_tokens_seen": 17269216, + "step": 100 + }, + { + "epoch": 0.03996802557953637, + "loss": 3.3409595489501953, + "loss_ce": 1.4923267364501953, + "loss_xval": 1.8515625, + "num_input_tokens_seen": 17269216, + "step": 100 + }, + { + "epoch": 0.04036770583533174, + "grad_norm": 124.1343078329466, + "learning_rate": 4.179256708954579e-06, + "loss": 3.6334, + "num_input_tokens_seen": 17442352, + "step": 101 + }, + { + "epoch": 0.04036770583533174, + "loss": 3.501720666885376, + "loss_ce": 1.488048791885376, + "loss_xval": 2.015625, + "num_input_tokens_seen": 17442352, + "step": 101 + }, + { + "epoch": 0.0407673860911271, + "grad_norm": 196.9699571170823, + "learning_rate": 4.188178529274939e-06, + "loss": 3.2125, + "num_input_tokens_seen": 17615376, + "step": 102 + }, + { + "epoch": 0.0407673860911271, + "loss": 3.121706008911133, + "loss_ce": 1.4449481964111328, + "loss_xval": 1.6796875, + "num_input_tokens_seen": 17615376, + "step": 102 + }, + { + "epoch": 0.041167066346922465, + "grad_norm": 119.54443363473506, + "learning_rate": 4.197013306058203e-06, + "loss": 3.4917, + "num_input_tokens_seen": 17787984, + "step": 103 + }, + { + "epoch": 0.041167066346922465, + "loss": 3.2809221744537354, + "loss_ce": 1.4230120182037354, + "loss_xval": 1.859375, + "num_input_tokens_seen": 17787984, + "step": 103 + }, + { + "epoch": 0.04156674660271783, + "grad_norm": 246.81496667478712, + "learning_rate": 4.205762721344725e-06, + "loss": 3.7306, + "num_input_tokens_seen": 17961048, + "step": 104 + }, + { + "epoch": 0.04156674660271783, + "loss": 3.0534510612487793, + "loss_ce": 1.4357751607894897, + "loss_xval": 1.6171875, + "num_input_tokens_seen": 17961048, + "step": 104 + }, + { + "epoch": 0.04196642685851319, + "grad_norm": 140.73203674853707, + "learning_rate": 4.21442840888513e-06, + "loss": 3.6675, + "num_input_tokens_seen": 18133960, + "step": 105 + }, + { 
+ "epoch": 0.04196642685851319, + "loss": 3.460153102874756, + "loss_ce": 1.4249968528747559, + "loss_xval": 2.03125, + "num_input_tokens_seen": 18133960, + "step": 105 + }, + { + "epoch": 0.042366107114308556, + "grad_norm": 133.38932388420073, + "learning_rate": 4.223011955971264e-06, + "loss": 3.7657, + "num_input_tokens_seen": 18306920, + "step": 106 + }, + { + "epoch": 0.042366107114308556, + "loss": 3.373945951461792, + "loss_ce": 1.422774076461792, + "loss_xval": 1.953125, + "num_input_tokens_seen": 18306920, + "step": 106 + }, + { + "epoch": 0.04276578737010392, + "grad_norm": 137.17503761831742, + "learning_rate": 4.231514905181194e-06, + "loss": 3.9627, + "num_input_tokens_seen": 18479872, + "step": 107 + }, + { + "epoch": 0.04276578737010392, + "loss": 3.5468502044677734, + "loss_ce": 1.4228266477584839, + "loss_xval": 2.125, + "num_input_tokens_seen": 18479872, + "step": 107 + }, + { + "epoch": 0.04316546762589928, + "grad_norm": 168.70624239265484, + "learning_rate": 4.239938756043038e-06, + "loss": 3.0579, + "num_input_tokens_seen": 18653056, + "step": 108 + }, + { + "epoch": 0.04316546762589928, + "loss": 2.9931588172912598, + "loss_ce": 1.4638619422912598, + "loss_xval": 1.53125, + "num_input_tokens_seen": 18653056, + "step": 108 + }, + { + "epoch": 0.043565147881694646, + "grad_norm": 116.92635982536697, + "learning_rate": 4.248284966622114e-06, + "loss": 3.3453, + "num_input_tokens_seen": 18825792, + "step": 109 + }, + { + "epoch": 0.043565147881694646, + "loss": 3.643871307373047, + "loss_ce": 1.4241447448730469, + "loss_xval": 2.21875, + "num_input_tokens_seen": 18825792, + "step": 109 + }, + { + "epoch": 0.04396482813749001, + "grad_norm": 211.32307173730896, + "learning_rate": 4.256554955035623e-06, + "loss": 3.7173, + "num_input_tokens_seen": 18998800, + "step": 110 + }, + { + "epoch": 0.04396482813749001, + "loss": 3.9510254859924316, + "loss_ce": 1.4148926734924316, + "loss_xval": 2.53125, + "num_input_tokens_seen": 18998800, + "step": 110 + }, + { + "epoch": 0.04436450839328537, + "grad_norm": 125.8640650841611, + "learning_rate": 4.264750100898777e-06, + "loss": 3.5679, + "num_input_tokens_seen": 19171832, + "step": 111 + }, + { + "epoch": 0.04436450839328537, + "loss": 3.722196102142334, + "loss_ce": 1.392117977142334, + "loss_xval": 2.328125, + "num_input_tokens_seen": 19171832, + "step": 111 + }, + { + "epoch": 0.04476418864908074, + "grad_norm": 159.67886896593234, + "learning_rate": 4.272871746706091e-06, + "loss": 3.2583, + "num_input_tokens_seen": 19344784, + "step": 112 + }, + { + "epoch": 0.04476418864908074, + "loss": 3.2728888988494873, + "loss_ce": 1.4017952680587769, + "loss_xval": 1.875, + "num_input_tokens_seen": 19344784, + "step": 112 + }, + { + "epoch": 0.0451638689048761, + "grad_norm": 158.9863093245826, + "learning_rate": 4.280921199151268e-06, + "loss": 3.9811, + "num_input_tokens_seen": 19517688, + "step": 113 + }, + { + "epoch": 0.0451638689048761, + "loss": 4.066771030426025, + "loss_ce": 1.3548572063446045, + "loss_xval": 2.71875, + "num_input_tokens_seen": 19517688, + "step": 113 + }, + { + "epoch": 0.045563549160671464, + "grad_norm": 132.59333871837742, + "learning_rate": 4.288899730388944e-06, + "loss": 3.2045, + "num_input_tokens_seen": 19690880, + "step": 114 + }, + { + "epoch": 0.045563549160671464, + "loss": 3.301539659500122, + "loss_ce": 1.330348253250122, + "loss_xval": 1.96875, + "num_input_tokens_seen": 19690880, + "step": 114 + }, + { + "epoch": 0.04596322941646683, + "grad_norm": 156.7543956695616, + "learning_rate": 
4.296808579241338e-06, + "loss": 3.0619, + "num_input_tokens_seen": 19863616, + "step": 115 + }, + { + "epoch": 0.04596322941646683, + "loss": 3.2327518463134766, + "loss_ce": 1.3567752838134766, + "loss_xval": 1.875, + "num_input_tokens_seen": 19863616, + "step": 115 + }, + { + "epoch": 0.04636290967226219, + "grad_norm": 137.7793568340198, + "learning_rate": 4.304648952352651e-06, + "loss": 3.3103, + "num_input_tokens_seen": 20036800, + "step": 116 + }, + { + "epoch": 0.04636290967226219, + "loss": 3.387848138809204, + "loss_ce": 1.331695795059204, + "loss_xval": 2.0625, + "num_input_tokens_seen": 20036800, + "step": 116 + }, + { + "epoch": 0.046762589928057555, + "grad_norm": 170.99058754674215, + "learning_rate": 4.312422025293929e-06, + "loss": 3.5094, + "num_input_tokens_seen": 20209848, + "step": 117 + }, + { + "epoch": 0.046762589928057555, + "loss": 3.220695972442627, + "loss_ce": 1.355461597442627, + "loss_xval": 1.8671875, + "num_input_tokens_seen": 20209848, + "step": 117 + }, + { + "epoch": 0.04716227018385292, + "grad_norm": 97.06419206586995, + "learning_rate": 4.320128943620903e-06, + "loss": 2.9942, + "num_input_tokens_seen": 20382728, + "step": 118 + }, + { + "epoch": 0.04716227018385292, + "loss": 3.0818350315093994, + "loss_ce": 1.3162100315093994, + "loss_xval": 1.765625, + "num_input_tokens_seen": 20382728, + "step": 118 + }, + { + "epoch": 0.04756195043964828, + "grad_norm": 338.7848439976261, + "learning_rate": 4.327770823887197e-06, + "loss": 3.9238, + "num_input_tokens_seen": 20555712, + "step": 119 + }, + { + "epoch": 0.04756195043964828, + "loss": 4.059802055358887, + "loss_ce": 1.2927122116088867, + "loss_xval": 2.765625, + "num_input_tokens_seen": 20555712, + "step": 119 + }, + { + "epoch": 0.047961630695443645, + "grad_norm": 427.11268613455303, + "learning_rate": 4.335348754615135e-06, + "loss": 3.0822, + "num_input_tokens_seen": 20728280, + "step": 120 + }, + { + "epoch": 0.047961630695443645, + "loss": 3.1181368827819824, + "loss_ce": 1.2724335193634033, + "loss_xval": 1.84375, + "num_input_tokens_seen": 20728280, + "step": 120 + }, + { + "epoch": 0.04836131095123901, + "grad_norm": 115.90695385095029, + "learning_rate": 4.342863797226275e-06, + "loss": 3.0454, + "num_input_tokens_seen": 20901240, + "step": 121 + }, + { + "epoch": 0.04836131095123901, + "loss": 2.844395875930786, + "loss_ce": 1.2389271259307861, + "loss_xval": 1.609375, + "num_input_tokens_seen": 20901240, + "step": 121 + }, + { + "epoch": 0.04876099120703437, + "grad_norm": 266.16344074709775, + "learning_rate": 4.350316986933631e-06, + "loss": 3.8283, + "num_input_tokens_seen": 21074032, + "step": 122 + }, + { + "epoch": 0.04876099120703437, + "loss": 3.777801752090454, + "loss_ce": 1.2328799962997437, + "loss_xval": 2.546875, + "num_input_tokens_seen": 21074032, + "step": 122 + }, + { + "epoch": 0.049160671462829736, + "grad_norm": 213.32519005243836, + "learning_rate": 4.3577093335974925e-06, + "loss": 3.1261, + "num_input_tokens_seen": 21247224, + "step": 123 + }, + { + "epoch": 0.049160671462829736, + "loss": 3.3567044734954834, + "loss_ce": 1.2375637292861938, + "loss_xval": 2.125, + "num_input_tokens_seen": 21247224, + "step": 123 + }, + { + "epoch": 0.0495603517186251, + "grad_norm": 101.5577996272501, + "learning_rate": 4.36504182254659e-06, + "loss": 3.0516, + "num_input_tokens_seen": 21420088, + "step": 124 + }, + { + "epoch": 0.0495603517186251, + "loss": 3.4608333110809326, + "loss_ce": 1.2235286235809326, + "loss_xval": 2.234375, + "num_input_tokens_seen": 21420088, + 
"step": 124 + }, + { + "epoch": 0.04996003197442046, + "grad_norm": 198.95971382452228, + "learning_rate": 4.3723154153662725e-06, + "loss": 3.2876, + "num_input_tokens_seen": 21593120, + "step": 125 + }, + { + "epoch": 0.04996003197442046, + "loss": 3.0590901374816895, + "loss_ce": 1.2397539615631104, + "loss_xval": 1.8203125, + "num_input_tokens_seen": 21593120, + "step": 125 + }, + { + "epoch": 0.050359712230215826, + "grad_norm": 166.57226050726882, + "learning_rate": 4.379531050655295e-06, + "loss": 2.9623, + "num_input_tokens_seen": 21765976, + "step": 126 + }, + { + "epoch": 0.050359712230215826, + "loss": 2.768925666809082, + "loss_ce": 1.2347460985183716, + "loss_xval": 1.53125, + "num_input_tokens_seen": 21765976, + "step": 126 + }, + { + "epoch": 0.05075939248601119, + "grad_norm": 217.73765671883095, + "learning_rate": 4.386689644752683e-06, + "loss": 3.1025, + "num_input_tokens_seen": 21938808, + "step": 127 + }, + { + "epoch": 0.05075939248601119, + "loss": 2.747063159942627, + "loss_ce": 1.227287769317627, + "loss_xval": 1.5234375, + "num_input_tokens_seen": 21938808, + "step": 127 + }, + { + "epoch": 0.051159072741806554, + "grad_norm": 263.5829876068027, + "learning_rate": 4.3937920924360965e-06, + "loss": 3.3341, + "num_input_tokens_seen": 22111176, + "step": 128 + }, + { + "epoch": 0.051159072741806554, + "loss": 2.9254727363586426, + "loss_ce": 1.2174649238586426, + "loss_xval": 1.7109375, + "num_input_tokens_seen": 22111176, + "step": 128 + }, + { + "epoch": 0.05155875299760192, + "grad_norm": 279.6834850427316, + "learning_rate": 4.4008392675930185e-06, + "loss": 2.9818, + "num_input_tokens_seen": 22284392, + "step": 129 + }, + { + "epoch": 0.05155875299760192, + "loss": 2.40582275390625, + "loss_ce": 1.191955327987671, + "loss_xval": 1.2109375, + "num_input_tokens_seen": 22284392, + "step": 129 + }, + { + "epoch": 0.05195843325339728, + "grad_norm": 273.57321882538974, + "learning_rate": 4.407832023866027e-06, + "loss": 3.3854, + "num_input_tokens_seen": 22457544, + "step": 130 + }, + { + "epoch": 0.05195843325339728, + "loss": 3.0595662593841553, + "loss_ce": 1.2060506343841553, + "loss_xval": 1.8515625, + "num_input_tokens_seen": 22457544, + "step": 130 + }, + { + "epoch": 0.052358113509192644, + "grad_norm": 344.0145083356384, + "learning_rate": 4.414771195273343e-06, + "loss": 3.6465, + "num_input_tokens_seen": 22630440, + "step": 131 + }, + { + "epoch": 0.052358113509192644, + "loss": 4.122766971588135, + "loss_ce": 1.1769661903381348, + "loss_xval": 2.953125, + "num_input_tokens_seen": 22630440, + "step": 131 + }, + { + "epoch": 0.05275779376498801, + "grad_norm": 255.65466358822889, + "learning_rate": 4.421657596805787e-06, + "loss": 3.421, + "num_input_tokens_seen": 22803176, + "step": 132 + }, + { + "epoch": 0.05275779376498801, + "loss": 3.9519777297973633, + "loss_ce": 1.2005128860473633, + "loss_xval": 2.75, + "num_input_tokens_seen": 22803176, + "step": 132 + }, + { + "epoch": 0.05315747402078337, + "grad_norm": 267.6338664362621, + "learning_rate": 4.428492025001201e-06, + "loss": 3.2026, + "num_input_tokens_seen": 22976304, + "step": 133 + }, + { + "epoch": 0.05315747402078337, + "loss": 3.3537511825561523, + "loss_ce": 1.1672275066375732, + "loss_xval": 2.1875, + "num_input_tokens_seen": 22976304, + "step": 133 + }, + { + "epoch": 0.053557154276578735, + "grad_norm": 131.21776577343127, + "learning_rate": 4.435275258497362e-06, + "loss": 2.9131, + "num_input_tokens_seen": 23149344, + "step": 134 + }, + { + "epoch": 0.053557154276578735, + "loss": 
3.031949520111084, + "loss_ce": 1.194547176361084, + "loss_xval": 1.8359375, + "num_input_tokens_seen": 23149344, + "step": 134 + }, + { + "epoch": 0.0539568345323741, + "grad_norm": 514.2271793073204, + "learning_rate": 4.442008058564339e-06, + "loss": 3.4785, + "num_input_tokens_seen": 23322352, + "step": 135 + }, + { + "epoch": 0.0539568345323741, + "loss": 3.484752655029297, + "loss_ce": 1.1751822233200073, + "loss_xval": 2.3125, + "num_input_tokens_seen": 23322352, + "step": 135 + }, + { + "epoch": 0.05435651478816946, + "grad_norm": 163.1858947105468, + "learning_rate": 4.448691169617202e-06, + "loss": 3.0443, + "num_input_tokens_seen": 23495392, + "step": 136 + }, + { + "epoch": 0.05435651478816946, + "loss": 2.978681802749634, + "loss_ce": 1.1910841464996338, + "loss_xval": 1.7890625, + "num_input_tokens_seen": 23495392, + "step": 136 + }, + { + "epoch": 0.054756195043964825, + "grad_norm": 453.2241878703702, + "learning_rate": 4.455325319709954e-06, + "loss": 3.0782, + "num_input_tokens_seen": 23667952, + "step": 137 + }, + { + "epoch": 0.054756195043964825, + "loss": 3.2870519161224365, + "loss_ce": 1.1347081661224365, + "loss_xval": 2.15625, + "num_input_tokens_seen": 23667952, + "step": 137 + }, + { + "epoch": 0.05515587529976019, + "grad_norm": 130.18513835393406, + "learning_rate": 4.461911221011503e-06, + "loss": 2.5414, + "num_input_tokens_seen": 23840968, + "step": 138 + }, + { + "epoch": 0.05515587529976019, + "loss": 2.4765074253082275, + "loss_ce": 1.1742613315582275, + "loss_xval": 1.3046875, + "num_input_tokens_seen": 23840968, + "step": 138 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 458.0524797809028, + "learning_rate": 4.468449570264441e-06, + "loss": 3.1261, + "num_input_tokens_seen": 24013976, + "step": 139 + }, + { + "epoch": 0.05555555555555555, + "loss": 3.3846635818481445, + "loss_ce": 1.151753306388855, + "loss_xval": 2.234375, + "num_input_tokens_seen": 24013976, + "step": 139 + }, + { + "epoch": 0.055955235811350916, + "grad_norm": 128.72335328013372, + "learning_rate": 4.474941049227392e-06, + "loss": 3.1837, + "num_input_tokens_seen": 24186640, + "step": 140 + }, + { + "epoch": 0.055955235811350916, + "loss": 2.599513053894043, + "loss_ce": 1.1634780168533325, + "loss_xval": 1.4375, + "num_input_tokens_seen": 24186640, + "step": 140 + }, + { + "epoch": 0.05635491606714628, + "grad_norm": 426.5509683313082, + "learning_rate": 4.481386325101608e-06, + "loss": 3.1009, + "num_input_tokens_seen": 24360088, + "step": 141 + }, + { + "epoch": 0.05635491606714628, + "loss": 2.746947765350342, + "loss_ce": 1.1336662769317627, + "loss_xval": 1.609375, + "num_input_tokens_seen": 24360088, + "step": 141 + }, + { + "epoch": 0.05675459632294165, + "grad_norm": 139.1436086661555, + "learning_rate": 4.487786050942491e-06, + "loss": 2.714, + "num_input_tokens_seen": 24533144, + "step": 142 + }, + { + "epoch": 0.05675459632294165, + "loss": 2.8101062774658203, + "loss_ce": 1.1851062774658203, + "loss_xval": 1.625, + "num_input_tokens_seen": 24533144, + "step": 142 + }, + { + "epoch": 0.057154276578737014, + "grad_norm": 259.8648252836964, + "learning_rate": 4.494140866056678e-06, + "loss": 3.2545, + "num_input_tokens_seen": 24705472, + "step": 143 + }, + { + "epoch": 0.057154276578737014, + "loss": 3.093147039413452, + "loss_ce": 1.1615064144134521, + "loss_xval": 1.9296875, + "num_input_tokens_seen": 24705472, + "step": 143 + }, + { + "epoch": 0.05755395683453238, + "grad_norm": 188.67765026700243, + "learning_rate": 4.5004513963853e-06, + "loss": 3.1569, + 
"num_input_tokens_seen": 24878424, + "step": 144 + }, + { + "epoch": 0.05755395683453238, + "loss": 3.5168557167053223, + "loss_ce": 1.1731057167053223, + "loss_xval": 2.34375, + "num_input_tokens_seen": 24878424, + "step": 144 + }, + { + "epoch": 0.05795363709032774, + "grad_norm": 128.55755169702456, + "learning_rate": 4.506718254873952e-06, + "loss": 2.8533, + "num_input_tokens_seen": 25051392, + "step": 145 + }, + { + "epoch": 0.05795363709032774, + "loss": 3.2004616260528564, + "loss_ce": 1.1936256885528564, + "loss_xval": 2.0, + "num_input_tokens_seen": 25051392, + "step": 145 + }, + { + "epoch": 0.058353317346123104, + "grad_norm": 178.2596345349547, + "learning_rate": 4.5129420418299804e-06, + "loss": 2.5044, + "num_input_tokens_seen": 25221360, + "step": 146 + }, + { + "epoch": 0.058353317346123104, + "loss": 2.6224756240844727, + "loss_ce": 1.1561671495437622, + "loss_xval": 1.46875, + "num_input_tokens_seen": 25221360, + "step": 146 + }, + { + "epoch": 0.05875299760191847, + "grad_norm": 109.70621446733796, + "learning_rate": 4.519123345267552e-06, + "loss": 2.6378, + "num_input_tokens_seen": 25394160, + "step": 147 + }, + { + "epoch": 0.05875299760191847, + "loss": 3.004621982574463, + "loss_ce": 1.1469557285308838, + "loss_xval": 1.859375, + "num_input_tokens_seen": 25394160, + "step": 147 + }, + { + "epoch": 0.05915267785771383, + "grad_norm": 145.08135486046965, + "learning_rate": 4.52526274124104e-06, + "loss": 2.9199, + "num_input_tokens_seen": 25566928, + "step": 148 + }, + { + "epoch": 0.05915267785771383, + "loss": 3.229592800140381, + "loss_ce": 1.1158232688903809, + "loss_xval": 2.109375, + "num_input_tokens_seen": 25566928, + "step": 148 + }, + { + "epoch": 0.059552358113509195, + "grad_norm": 92.14531111691825, + "learning_rate": 4.5313607941671774e-06, + "loss": 2.3757, + "num_input_tokens_seen": 25739848, + "step": 149 + }, + { + "epoch": 0.059552358113509195, + "loss": 2.58808970451355, + "loss_ce": 1.1012732982635498, + "loss_xval": 1.484375, + "num_input_tokens_seen": 25739848, + "step": 149 + }, + { + "epoch": 0.05995203836930456, + "grad_norm": 218.825158302032, + "learning_rate": 4.537418057136436e-06, + "loss": 3.0667, + "num_input_tokens_seen": 25913056, + "step": 150 + }, + { + "epoch": 0.05995203836930456, + "loss": 2.9010426998138428, + "loss_ce": 1.0685231685638428, + "loss_xval": 1.8359375, + "num_input_tokens_seen": 25913056, + "step": 150 + }, + { + "epoch": 0.06035171862509992, + "grad_norm": 192.8732197306815, + "learning_rate": 4.54343507221407e-06, + "loss": 2.8767, + "num_input_tokens_seen": 26086456, + "step": 151 + }, + { + "epoch": 0.06035171862509992, + "loss": 2.9303150177001953, + "loss_ce": 1.0826586484909058, + "loss_xval": 1.84375, + "num_input_tokens_seen": 26086456, + "step": 151 + }, + { + "epoch": 0.060751398880895285, + "grad_norm": 116.65432798476725, + "learning_rate": 4.549412370731207e-06, + "loss": 2.8269, + "num_input_tokens_seen": 26259336, + "step": 152 + }, + { + "epoch": 0.060751398880895285, + "loss": 2.606137752532959, + "loss_ce": 1.0680519342422485, + "loss_xval": 1.5390625, + "num_input_tokens_seen": 26259336, + "step": 152 + }, + { + "epoch": 0.06115107913669065, + "grad_norm": 129.79247278384622, + "learning_rate": 4.555350473566405e-06, + "loss": 2.6485, + "num_input_tokens_seen": 26432320, + "step": 153 + }, + { + "epoch": 0.06115107913669065, + "loss": 2.66135573387146, + "loss_ce": 1.0431914329528809, + "loss_xval": 1.6171875, + "num_input_tokens_seen": 26432320, + "step": 153 + }, + { + "epoch": 
0.06155075939248601, + "grad_norm": 255.33644741357125, + "learning_rate": 4.561249891418045e-06, + "loss": 2.6596, + "num_input_tokens_seen": 26605232, + "step": 154 + }, + { + "epoch": 0.06155075939248601, + "loss": 2.2152175903320312, + "loss_ce": 1.0363845825195312, + "loss_xval": 1.1796875, + "num_input_tokens_seen": 26605232, + "step": 154 + }, + { + "epoch": 0.061950439648281376, + "grad_norm": 326.2132881570226, + "learning_rate": 4.567111125067892e-06, + "loss": 3.1065, + "num_input_tokens_seen": 26778160, + "step": 155 + }, + { + "epoch": 0.061950439648281376, + "loss": 2.977107524871826, + "loss_ce": 1.0156819820404053, + "loss_xval": 1.9609375, + "num_input_tokens_seen": 26778160, + "step": 155 + }, + { + "epoch": 0.06235011990407674, + "grad_norm": 178.37054244023707, + "learning_rate": 4.572934665636191e-06, + "loss": 2.8082, + "num_input_tokens_seen": 26951312, + "step": 156 + }, + { + "epoch": 0.06235011990407674, + "loss": 3.033820390701294, + "loss_ce": 0.995246171951294, + "loss_xval": 2.03125, + "num_input_tokens_seen": 26951312, + "step": 156 + }, + { + "epoch": 0.0627498001598721, + "grad_norm": 240.25091369114222, + "learning_rate": 4.578720994828615e-06, + "loss": 2.8172, + "num_input_tokens_seen": 27124296, + "step": 157 + }, + { + "epoch": 0.0627498001598721, + "loss": 2.7480130195617676, + "loss_ce": 0.9975247383117676, + "loss_xval": 1.75, + "num_input_tokens_seen": 27124296, + "step": 157 + }, + { + "epoch": 0.06314948041566747, + "grad_norm": 299.6531392086124, + "learning_rate": 4.584470585175365e-06, + "loss": 3.137, + "num_input_tokens_seen": 27297296, + "step": 158 + }, + { + "epoch": 0.06314948041566747, + "loss": 3.1829733848571777, + "loss_ce": 0.9749656915664673, + "loss_xval": 2.203125, + "num_input_tokens_seen": 27297296, + "step": 158 + }, + { + "epoch": 0.06354916067146282, + "grad_norm": 236.05049498447235, + "learning_rate": 4.59018390026273e-06, + "loss": 2.7418, + "num_input_tokens_seen": 27470144, + "step": 159 + }, + { + "epoch": 0.06354916067146282, + "loss": 2.5435566902160645, + "loss_ce": 0.991310715675354, + "loss_xval": 1.5546875, + "num_input_tokens_seen": 27470144, + "step": 159 + }, + { + "epoch": 0.0639488409272582, + "grad_norm": 245.1500933668583, + "learning_rate": 4.595861394957398e-06, + "loss": 2.7991, + "num_input_tokens_seen": 27643168, + "step": 160 + }, + { + "epoch": 0.0639488409272582, + "loss": 2.7963478565216064, + "loss_ce": 1.0024025440216064, + "loss_xval": 1.796875, + "num_input_tokens_seen": 27643168, + "step": 160 + }, + { + "epoch": 0.06434852118305355, + "grad_norm": 269.6347230509653, + "learning_rate": 4.601503515623759e-06, + "loss": 2.5151, + "num_input_tokens_seen": 27816264, + "step": 161 + }, + { + "epoch": 0.06434852118305355, + "loss": 2.5006017684936523, + "loss_ce": 1.0118319988250732, + "loss_xval": 1.4921875, + "num_input_tokens_seen": 27816264, + "step": 161 + }, + { + "epoch": 0.06474820143884892, + "grad_norm": 375.92850399502834, + "learning_rate": 4.607110700334503e-06, + "loss": 2.925, + "num_input_tokens_seen": 27988768, + "step": 162 + }, + { + "epoch": 0.06474820143884892, + "loss": 3.5245378017425537, + "loss_ce": 0.9766863584518433, + "loss_xval": 2.546875, + "num_input_tokens_seen": 27988768, + "step": 162 + }, + { + "epoch": 0.06514788169464429, + "grad_norm": 126.56545474069529, + "learning_rate": 4.6126833790747175e-06, + "loss": 3.1895, + "num_input_tokens_seen": 28161192, + "step": 163 + }, + { + "epoch": 0.06514788169464429, + "loss": 2.75604248046875, + "loss_ce": 
0.9889528155326843, + "loss_xval": 1.765625, + "num_input_tokens_seen": 28161192, + "step": 163 + }, + { + "epoch": 0.06554756195043965, + "grad_norm": 293.2841730627708, + "learning_rate": 4.618221973939755e-06, + "loss": 3.0304, + "num_input_tokens_seen": 28334152, + "step": 164 + }, + { + "epoch": 0.06554756195043965, + "loss": 3.074063777923584, + "loss_ce": 1.013517141342163, + "loss_xval": 2.0625, + "num_input_tokens_seen": 28334152, + "step": 164 + }, + { + "epoch": 0.06594724220623502, + "grad_norm": 93.69476386555672, + "learning_rate": 4.623726899327088e-06, + "loss": 2.5372, + "num_input_tokens_seen": 28507128, + "step": 165 + }, + { + "epoch": 0.06594724220623502, + "loss": 2.73991322517395, + "loss_ce": 0.991866409778595, + "loss_xval": 1.75, + "num_input_tokens_seen": 28507128, + "step": 165 + }, + { + "epoch": 0.06634692246203037, + "grad_norm": 354.54679439550654, + "learning_rate": 4.629198562122361e-06, + "loss": 3.1533, + "num_input_tokens_seen": 28679696, + "step": 166 + }, + { + "epoch": 0.06634692246203037, + "loss": 3.6342062950134277, + "loss_ce": 0.9945579171180725, + "loss_xval": 2.640625, + "num_input_tokens_seen": 28679696, + "step": 166 + }, + { + "epoch": 0.06674660271782575, + "grad_norm": 142.92356385134568, + "learning_rate": 4.63463736187985e-06, + "loss": 2.6586, + "num_input_tokens_seen": 28852824, + "step": 167 + }, + { + "epoch": 0.06674660271782575, + "loss": 2.430410623550415, + "loss_ce": 0.9899808764457703, + "loss_xval": 1.4375, + "num_input_tokens_seen": 28852824, + "step": 167 + }, + { + "epoch": 0.0671462829736211, + "grad_norm": 454.4623707108821, + "learning_rate": 4.640043690997557e-06, + "loss": 3.096, + "num_input_tokens_seen": 29025240, + "step": 168 + }, + { + "epoch": 0.0671462829736211, + "loss": 3.01880145072937, + "loss_ce": 0.9689967632293701, + "loss_xval": 2.046875, + "num_input_tokens_seen": 29025240, + "step": 168 + }, + { + "epoch": 0.06754596322941647, + "grad_norm": 157.19975275134112, + "learning_rate": 4.645417934887083e-06, + "loss": 2.986, + "num_input_tokens_seen": 29198016, + "step": 169 + }, + { + "epoch": 0.06754596322941647, + "loss": 3.3017961978912354, + "loss_ce": 0.9551164507865906, + "loss_xval": 2.34375, + "num_input_tokens_seen": 29198016, + "step": 169 + }, + { + "epoch": 0.06794564348521183, + "grad_norm": 550.0943331937483, + "learning_rate": 4.650760472138503e-06, + "loss": 3.3266, + "num_input_tokens_seen": 29371016, + "step": 170 + }, + { + "epoch": 0.06794564348521183, + "loss": 3.0751547813415527, + "loss_ce": 0.9765218496322632, + "loss_xval": 2.09375, + "num_input_tokens_seen": 29371016, + "step": 170 + }, + { + "epoch": 0.0683453237410072, + "grad_norm": 156.04590864267016, + "learning_rate": 4.65607167468041e-06, + "loss": 2.5659, + "num_input_tokens_seen": 29544104, + "step": 171 + }, + { + "epoch": 0.0683453237410072, + "loss": 2.807107925415039, + "loss_ce": 0.9848423004150391, + "loss_xval": 1.8203125, + "num_input_tokens_seen": 29544104, + "step": 171 + }, + { + "epoch": 0.06874500399680256, + "grad_norm": 387.04024961883766, + "learning_rate": 4.66135190793528e-06, + "loss": 2.8385, + "num_input_tokens_seen": 29717344, + "step": 172 + }, + { + "epoch": 0.06874500399680256, + "loss": 2.743061065673828, + "loss_ce": 0.9437446594238281, + "loss_xval": 1.796875, + "num_input_tokens_seen": 29717344, + "step": 172 + }, + { + "epoch": 0.06914468425259793, + "grad_norm": 352.2869774460929, + "learning_rate": 4.666601530970348e-06, + "loss": 2.9918, + "num_input_tokens_seen": 29890248, + "step": 173 + 
}, + { + "epoch": 0.06914468425259793, + "loss": 3.1348328590393066, + "loss_ce": 1.016424536705017, + "loss_xval": 2.125, + "num_input_tokens_seen": 29890248, + "step": 173 + }, + { + "epoch": 0.06954436450839328, + "grad_norm": 373.6882053758428, + "learning_rate": 4.671820896644117e-06, + "loss": 2.6598, + "num_input_tokens_seen": 30063288, + "step": 174 + }, + { + "epoch": 0.06954436450839328, + "loss": 3.382646083831787, + "loss_ce": 0.938309907913208, + "loss_xval": 2.4375, + "num_input_tokens_seen": 30063288, + "step": 174 + }, + { + "epoch": 0.06994404476418865, + "grad_norm": 377.7377688158846, + "learning_rate": 4.677010351748694e-06, + "loss": 3.2057, + "num_input_tokens_seen": 30236256, + "step": 175 + }, + { + "epoch": 0.06994404476418865, + "loss": 3.4559521675109863, + "loss_ce": 0.8954052925109863, + "loss_xval": 2.5625, + "num_input_tokens_seen": 30236256, + "step": 175 + }, + { + "epoch": 0.07034372501998401, + "grad_norm": 498.3446583121999, + "learning_rate": 4.68217023714805e-06, + "loss": 3.3281, + "num_input_tokens_seen": 30408968, + "step": 176 + }, + { + "epoch": 0.07034372501998401, + "loss": 4.02139139175415, + "loss_ce": 0.8622116446495056, + "loss_xval": 3.15625, + "num_input_tokens_seen": 30408968, + "step": 176 + }, + { + "epoch": 0.07074340527577938, + "grad_norm": 108.56714493238096, + "learning_rate": 4.687300887912368e-06, + "loss": 2.1292, + "num_input_tokens_seen": 30581920, + "step": 177 + }, + { + "epoch": 0.07074340527577938, + "loss": 2.317440986633301, + "loss_ce": 0.8511323928833008, + "loss_xval": 1.46875, + "num_input_tokens_seen": 30581920, + "step": 177 + }, + { + "epoch": 0.07114308553157474, + "grad_norm": 256.71895728118574, + "learning_rate": 4.692402633448618e-06, + "loss": 2.7499, + "num_input_tokens_seen": 30754880, + "step": 178 + }, + { + "epoch": 0.07114308553157474, + "loss": 2.346262216567993, + "loss_ce": 0.8692113757133484, + "loss_xval": 1.4765625, + "num_input_tokens_seen": 30754880, + "step": 178 + }, + { + "epoch": 0.07154276578737011, + "grad_norm": 110.25625300038342, + "learning_rate": 4.6974757976274554e-06, + "loss": 2.7188, + "num_input_tokens_seen": 30927928, + "step": 179 + }, + { + "epoch": 0.07154276578737011, + "loss": 2.735156774520874, + "loss_ce": 0.925098180770874, + "loss_xval": 1.8125, + "num_input_tokens_seen": 30927928, + "step": 179 + }, + { + "epoch": 0.07194244604316546, + "grad_norm": 262.0838823954474, + "learning_rate": 4.702520698906601e-06, + "loss": 2.5287, + "num_input_tokens_seen": 31100800, + "step": 180 + }, + { + "epoch": 0.07194244604316546, + "loss": 2.8189706802368164, + "loss_ce": 0.9381113052368164, + "loss_xval": 1.8828125, + "num_input_tokens_seen": 31100800, + "step": 180 + }, + { + "epoch": 0.07234212629896083, + "grad_norm": 156.72323945008685, + "learning_rate": 4.707537650450795e-06, + "loss": 2.6636, + "num_input_tokens_seen": 31274032, + "step": 181 + }, + { + "epoch": 0.07234212629896083, + "loss": 2.460793972015381, + "loss_ce": 0.9273467063903809, + "loss_xval": 1.53125, + "num_input_tokens_seen": 31274032, + "step": 181 + }, + { + "epoch": 0.07274180655475619, + "grad_norm": 128.82580048107164, + "learning_rate": 4.712526960248448e-06, + "loss": 2.5592, + "num_input_tokens_seen": 31447248, + "step": 182 + }, + { + "epoch": 0.07274180655475619, + "loss": 3.2276525497436523, + "loss_ce": 0.9224766492843628, + "loss_xval": 2.3125, + "num_input_tokens_seen": 31447248, + "step": 182 + }, + { + "epoch": 0.07314148681055156, + "grad_norm": 158.11940464942924, + "learning_rate": 
4.717488931225096e-06, + "loss": 2.8079, + "num_input_tokens_seen": 31619912, + "step": 183 + }, + { + "epoch": 0.07314148681055156, + "loss": 2.667755603790283, + "loss_ce": 0.7932440042495728, + "loss_xval": 1.875, + "num_input_tokens_seen": 31619912, + "step": 183 + }, + { + "epoch": 0.07354116706634692, + "grad_norm": 182.92862063987715, + "learning_rate": 4.722423861353765e-06, + "loss": 2.5388, + "num_input_tokens_seen": 31789160, + "step": 184 + }, + { + "epoch": 0.07354116706634692, + "loss": 2.344510555267334, + "loss_ce": 0.750760555267334, + "loss_xval": 1.59375, + "num_input_tokens_seen": 31789160, + "step": 184 + }, + { + "epoch": 0.07394084732214229, + "grad_norm": 219.64439189932665, + "learning_rate": 4.7273320437623414e-06, + "loss": 2.2814, + "num_input_tokens_seen": 31961976, + "step": 185 + }, + { + "epoch": 0.07394084732214229, + "loss": 2.476329803466797, + "loss_ce": 0.7595328688621521, + "loss_xval": 1.71875, + "num_input_tokens_seen": 31961976, + "step": 185 + }, + { + "epoch": 0.07434052757793765, + "grad_norm": 99.98395758538393, + "learning_rate": 4.7322137668380565e-06, + "loss": 2.3144, + "num_input_tokens_seen": 32135048, + "step": 186 + }, + { + "epoch": 0.07434052757793765, + "loss": 2.1182146072387695, + "loss_ce": 0.8057146072387695, + "loss_xval": 1.3125, + "num_input_tokens_seen": 32135048, + "step": 186 + }, + { + "epoch": 0.07474020783373302, + "grad_norm": 221.2636081842405, + "learning_rate": 4.737069314329155e-06, + "loss": 2.5043, + "num_input_tokens_seen": 32307616, + "step": 187 + }, + { + "epoch": 0.07474020783373302, + "loss": 2.7484989166259766, + "loss_ce": 0.8300418853759766, + "loss_xval": 1.921875, + "num_input_tokens_seen": 32307616, + "step": 187 + }, + { + "epoch": 0.07513988808952837, + "grad_norm": 131.12741561544465, + "learning_rate": 4.7418989654438705e-06, + "loss": 2.515, + "num_input_tokens_seen": 32480272, + "step": 188 + }, + { + "epoch": 0.07513988808952837, + "loss": 2.4454233646392822, + "loss_ce": 0.8226206302642822, + "loss_xval": 1.625, + "num_input_tokens_seen": 32480272, + "step": 188 + }, + { + "epoch": 0.07553956834532374, + "grad_norm": 180.84951677755777, + "learning_rate": 4.746702994946761e-06, + "loss": 2.3479, + "num_input_tokens_seen": 32653328, + "step": 189 + }, + { + "epoch": 0.07553956834532374, + "loss": 2.8146772384643555, + "loss_ce": 0.8351851105690002, + "loss_xval": 1.9765625, + "num_input_tokens_seen": 32653328, + "step": 189 + }, + { + "epoch": 0.0759392486011191, + "grad_norm": 160.96373059065317, + "learning_rate": 4.751481673252507e-06, + "loss": 2.678, + "num_input_tokens_seen": 32826176, + "step": 190 + }, + { + "epoch": 0.0759392486011191, + "loss": 2.8481264114379883, + "loss_ce": 0.8188296556472778, + "loss_xval": 2.03125, + "num_input_tokens_seen": 32826176, + "step": 190 + }, + { + "epoch": 0.07633892885691447, + "grad_norm": 121.61672489552205, + "learning_rate": 4.756235266517256e-06, + "loss": 2.3304, + "num_input_tokens_seen": 32999312, + "step": 191 + }, + { + "epoch": 0.07633892885691447, + "loss": 2.3364672660827637, + "loss_ce": 0.7788498997688293, + "loss_xval": 1.5546875, + "num_input_tokens_seen": 32999312, + "step": 191 + }, + { + "epoch": 0.07673860911270983, + "grad_norm": 130.32186035710663, + "learning_rate": 4.7609640367275626e-06, + "loss": 2.2134, + "num_input_tokens_seen": 33172008, + "step": 192 + }, + { + "epoch": 0.07673860911270983, + "loss": 1.7896391153335571, + "loss_ce": 0.7471586465835571, + "loss_xval": 1.0390625, + "num_input_tokens_seen": 33172008, + 
"step": 192 + }, + { + "epoch": 0.0771382893685052, + "grad_norm": 143.76906168760766, + "learning_rate": 4.765668241787041e-06, + "loss": 2.4567, + "num_input_tokens_seen": 33344800, + "step": 193 + }, + { + "epoch": 0.0771382893685052, + "loss": 2.387838840484619, + "loss_ce": 0.82729172706604, + "loss_xval": 1.5625, + "num_input_tokens_seen": 33344800, + "step": 193 + }, + { + "epoch": 0.07753796962430055, + "grad_norm": 127.81565220139322, + "learning_rate": 4.770348135600763e-06, + "loss": 2.77, + "num_input_tokens_seen": 33517848, + "step": 194 + }, + { + "epoch": 0.07753796962430055, + "loss": 2.490429401397705, + "loss_ce": 0.7819331884384155, + "loss_xval": 1.7109375, + "num_input_tokens_seen": 33517848, + "step": 194 + }, + { + "epoch": 0.07793764988009592, + "grad_norm": 138.22926534009375, + "learning_rate": 4.775003968157493e-06, + "loss": 2.1771, + "num_input_tokens_seen": 33690816, + "step": 195 + }, + { + "epoch": 0.07793764988009592, + "loss": 1.85850191116333, + "loss_ce": 0.7606015801429749, + "loss_xval": 1.1015625, + "num_input_tokens_seen": 33690816, + "step": 195 + }, + { + "epoch": 0.07833733013589128, + "grad_norm": 202.1418801789714, + "learning_rate": 4.779635985609814e-06, + "loss": 2.3614, + "num_input_tokens_seen": 33864144, + "step": 196 + }, + { + "epoch": 0.07833733013589128, + "loss": 2.429080009460449, + "loss_ce": 0.794070303440094, + "loss_xval": 1.6328125, + "num_input_tokens_seen": 33864144, + "step": 196 + }, + { + "epoch": 0.07873701039168665, + "grad_norm": 167.9534199119472, + "learning_rate": 4.784244430352227e-06, + "loss": 2.1861, + "num_input_tokens_seen": 34036976, + "step": 197 + }, + { + "epoch": 0.07873701039168665, + "loss": 2.099493980407715, + "loss_ce": 0.7598943710327148, + "loss_xval": 1.3359375, + "num_input_tokens_seen": 34036976, + "step": 197 + }, + { + "epoch": 0.07913669064748201, + "grad_norm": 177.86964362591564, + "learning_rate": 4.788829541097253e-06, + "loss": 2.3694, + "num_input_tokens_seen": 34209880, + "step": 198 + }, + { + "epoch": 0.07913669064748201, + "loss": 2.0491485595703125, + "loss_ce": 0.7351836562156677, + "loss_xval": 1.3125, + "num_input_tokens_seen": 34209880, + "step": 198 + }, + { + "epoch": 0.07953637090327738, + "grad_norm": 95.02167898698566, + "learning_rate": 4.793391552949641e-06, + "loss": 2.102, + "num_input_tokens_seen": 34382608, + "step": 199 + }, + { + "epoch": 0.07953637090327738, + "loss": 2.334108829498291, + "loss_ce": 0.6905540227890015, + "loss_xval": 1.640625, + "num_input_tokens_seen": 34382608, + "step": 199 + }, + { + "epoch": 0.07993605115907274, + "grad_norm": 168.91885358506565, + "learning_rate": 4.797930697478699e-06, + "loss": 2.0532, + "num_input_tokens_seen": 34555560, + "step": 200 + }, + { + "epoch": 0.07993605115907274, + "loss": 2.0277481079101562, + "loss_ce": 0.6434707641601562, + "loss_xval": 1.3828125, + "num_input_tokens_seen": 34555560, + "step": 200 + }, + { + "epoch": 0.0803357314148681, + "grad_norm": 351.41292703035435, + "learning_rate": 4.802447202788829e-06, + "loss": 2.9673, + "num_input_tokens_seen": 34728224, + "step": 201 + }, + { + "epoch": 0.0803357314148681, + "loss": 3.7609810829162598, + "loss_ce": 0.6657663583755493, + "loss_xval": 3.09375, + "num_input_tokens_seen": 34728224, + "step": 201 + }, + { + "epoch": 0.08073541167066348, + "grad_norm": 442.343019639602, + "learning_rate": 4.806941293588307e-06, + "loss": 2.691, + "num_input_tokens_seen": 34901368, + "step": 202 + }, + { + "epoch": 0.08073541167066348, + "loss": 2.485349178314209, + 
"loss_ce": 0.7714818716049194, + "loss_xval": 1.7109375, + "num_input_tokens_seen": 34901368, + "step": 202 + }, + { + "epoch": 0.08113509192645883, + "grad_norm": 132.89620507776843, + "learning_rate": 4.8114131912563735e-06, + "loss": 2.0727, + "num_input_tokens_seen": 35074232, + "step": 203 + }, + { + "epoch": 0.08113509192645883, + "loss": 2.1044516563415527, + "loss_ce": 0.7382407784461975, + "loss_xval": 1.3671875, + "num_input_tokens_seen": 35074232, + "step": 203 + }, + { + "epoch": 0.0815347721822542, + "grad_norm": 348.2140794906234, + "learning_rate": 4.815863113908667e-06, + "loss": 2.3281, + "num_input_tokens_seen": 35247568, + "step": 204 + }, + { + "epoch": 0.0815347721822542, + "loss": 2.4087095260620117, + "loss_ce": 0.7719906568527222, + "loss_xval": 1.640625, + "num_input_tokens_seen": 35247568, + "step": 204 + }, + { + "epoch": 0.08193445243804956, + "grad_norm": 85.7039982973391, + "learning_rate": 4.8202912764610565e-06, + "loss": 2.7482, + "num_input_tokens_seen": 35417112, + "step": 205 + }, + { + "epoch": 0.08193445243804956, + "loss": 2.6601197719573975, + "loss_ce": 0.737268328666687, + "loss_xval": 1.921875, + "num_input_tokens_seen": 35417112, + "step": 205 + }, + { + "epoch": 0.08233413269384493, + "grad_norm": 356.1569878617076, + "learning_rate": 4.82469789069193e-06, + "loss": 2.7038, + "num_input_tokens_seen": 35589848, + "step": 206 + }, + { + "epoch": 0.08233413269384493, + "loss": 2.493128538131714, + "loss_ce": 0.7455699443817139, + "loss_xval": 1.75, + "num_input_tokens_seen": 35589848, + "step": 206 + }, + { + "epoch": 0.08273381294964029, + "grad_norm": 138.50940557588262, + "learning_rate": 4.829083165302968e-06, + "loss": 2.0245, + "num_input_tokens_seen": 35762768, + "step": 207 + }, + { + "epoch": 0.08273381294964029, + "loss": 1.909895658493042, + "loss_ce": 0.7660967111587524, + "loss_xval": 1.140625, + "num_input_tokens_seen": 35762768, + "step": 207 + }, + { + "epoch": 0.08313349320543566, + "grad_norm": 525.8835783623617, + "learning_rate": 4.833447305978453e-06, + "loss": 2.669, + "num_input_tokens_seen": 35935712, + "step": 208 + }, + { + "epoch": 0.08313349320543566, + "loss": 2.8658926486968994, + "loss_ce": 0.741869330406189, + "loss_xval": 2.125, + "num_input_tokens_seen": 35935712, + "step": 208 + }, + { + "epoch": 0.08353317346123101, + "grad_norm": 117.03141921519646, + "learning_rate": 4.83779051544316e-06, + "loss": 1.9704, + "num_input_tokens_seen": 36108680, + "step": 209 + }, + { + "epoch": 0.08353317346123101, + "loss": 2.2376246452331543, + "loss_ce": 0.7322536706924438, + "loss_xval": 1.5078125, + "num_input_tokens_seen": 36108680, + "step": 209 + }, + { + "epoch": 0.08393285371702638, + "grad_norm": 319.2968263111755, + "learning_rate": 4.842112993518858e-06, + "loss": 2.3714, + "num_input_tokens_seen": 36281832, + "step": 210 + }, + { + "epoch": 0.08393285371702638, + "loss": 2.113748788833618, + "loss_ce": 0.7533972263336182, + "loss_xval": 1.359375, + "num_input_tokens_seen": 36281832, + "step": 210 + }, + { + "epoch": 0.08433253397282174, + "grad_norm": 89.919306127746, + "learning_rate": 4.846414937179485e-06, + "loss": 2.0618, + "num_input_tokens_seen": 36454648, + "step": 211 + }, + { + "epoch": 0.08433253397282174, + "loss": 2.09275484085083, + "loss_ce": 0.7450986504554749, + "loss_xval": 1.34375, + "num_input_tokens_seen": 36454648, + "step": 211 + }, + { + "epoch": 0.08473221422861711, + "grad_norm": 314.58780968384843, + "learning_rate": 4.850696540604993e-06, + "loss": 2.3359, + "num_input_tokens_seen": 
36627424, + "step": 212 + }, + { + "epoch": 0.08473221422861711, + "loss": 2.438669443130493, + "loss_ce": 0.7609350681304932, + "loss_xval": 1.6796875, + "num_input_tokens_seen": 36627424, + "step": 212 + }, + { + "epoch": 0.08513189448441247, + "grad_norm": 107.93286926725172, + "learning_rate": 4.854957995233956e-06, + "loss": 2.1791, + "num_input_tokens_seen": 36800224, + "step": 213 + }, + { + "epoch": 0.08513189448441247, + "loss": 2.3825843334198, + "loss_ce": 0.8034826517105103, + "loss_xval": 1.578125, + "num_input_tokens_seen": 36800224, + "step": 213 + }, + { + "epoch": 0.08553157474020784, + "grad_norm": 214.7337618475198, + "learning_rate": 4.859199489814922e-06, + "loss": 1.6366, + "num_input_tokens_seen": 36973008, + "step": 214 + }, + { + "epoch": 0.08553157474020784, + "loss": 1.71268630027771, + "loss_ce": 0.7490633726119995, + "loss_xval": 0.96484375, + "num_input_tokens_seen": 36973008, + "step": 214 + }, + { + "epoch": 0.0859312549960032, + "grad_norm": 149.02179568380078, + "learning_rate": 4.863421210456582e-06, + "loss": 2.0696, + "num_input_tokens_seen": 37146168, + "step": 215 + }, + { + "epoch": 0.0859312549960032, + "loss": 2.094494581222534, + "loss_ce": 0.7087523937225342, + "loss_xval": 1.3828125, + "num_input_tokens_seen": 37146168, + "step": 215 + }, + { + "epoch": 0.08633093525179857, + "grad_norm": 176.86093117438182, + "learning_rate": 4.867623340676766e-06, + "loss": 2.1813, + "num_input_tokens_seen": 37319032, + "step": 216 + }, + { + "epoch": 0.08633093525179857, + "loss": 2.461796283721924, + "loss_ce": 0.6553997993469238, + "loss_xval": 1.8046875, + "num_input_tokens_seen": 37319032, + "step": 216 + }, + { + "epoch": 0.08673061550759392, + "grad_norm": 80.29115001940868, + "learning_rate": 4.871806061450314e-06, + "loss": 1.9519, + "num_input_tokens_seen": 37490120, + "step": 217 + }, + { + "epoch": 0.08673061550759392, + "loss": 2.127835273742676, + "loss_ce": 0.6957064270973206, + "loss_xval": 1.4296875, + "num_input_tokens_seen": 37490120, + "step": 217 + }, + { + "epoch": 0.08713029576338929, + "grad_norm": 129.0998061036388, + "learning_rate": 4.875969551255842e-06, + "loss": 2.172, + "num_input_tokens_seen": 37663176, + "step": 218 + }, + { + "epoch": 0.08713029576338929, + "loss": 1.8447304964065552, + "loss_ce": 0.7093545198440552, + "loss_xval": 1.1328125, + "num_input_tokens_seen": 37663176, + "step": 218 + }, + { + "epoch": 0.08752997601918465, + "grad_norm": 85.55333189294885, + "learning_rate": 4.8801139861214465e-06, + "loss": 2.1141, + "num_input_tokens_seen": 37835776, + "step": 219 + }, + { + "epoch": 0.08752997601918465, + "loss": 1.9705400466918945, + "loss_ce": 0.6758623123168945, + "loss_xval": 1.296875, + "num_input_tokens_seen": 37835776, + "step": 219 + }, + { + "epoch": 0.08792965627498002, + "grad_norm": 175.8233267376353, + "learning_rate": 4.884239539669352e-06, + "loss": 1.8671, + "num_input_tokens_seen": 38008872, + "step": 220 + }, + { + "epoch": 0.08792965627498002, + "loss": 1.9375450611114502, + "loss_ce": 0.6650841236114502, + "loss_xval": 1.2734375, + "num_input_tokens_seen": 38008872, + "step": 220 + }, + { + "epoch": 0.08832933653077538, + "grad_norm": 145.7069843294875, + "learning_rate": 4.888346383159558e-06, + "loss": 2.1846, + "num_input_tokens_seen": 38181760, + "step": 221 + }, + { + "epoch": 0.08832933653077538, + "loss": 2.246717929840088, + "loss_ce": 0.6512590646743774, + "loss_xval": 1.59375, + "num_input_tokens_seen": 38181760, + "step": 221 + }, + { + "epoch": 0.08872901678657075, + "grad_norm": 
191.30221701933354, + "learning_rate": 4.892434685532505e-06, + "loss": 2.1738, + "num_input_tokens_seen": 38355136, + "step": 222 + }, + { + "epoch": 0.08872901678657075, + "loss": 2.435502052307129, + "loss_ce": 0.6669473648071289, + "loss_xval": 1.765625, + "num_input_tokens_seen": 38355136, + "step": 222 + }, + { + "epoch": 0.0891286970423661, + "grad_norm": 327.47100310412145, + "learning_rate": 4.896504613450767e-06, + "loss": 2.1095, + "num_input_tokens_seen": 38524896, + "step": 223 + }, + { + "epoch": 0.0891286970423661, + "loss": 2.2737462520599365, + "loss_ce": 0.646793007850647, + "loss_xval": 1.625, + "num_input_tokens_seen": 38524896, + "step": 223 + }, + { + "epoch": 0.08952837729816147, + "grad_norm": 134.50077594904644, + "learning_rate": 4.900556331339819e-06, + "loss": 1.9673, + "num_input_tokens_seen": 38697752, + "step": 224 + }, + { + "epoch": 0.08952837729816147, + "loss": 1.9093546867370605, + "loss_ce": 0.6168742179870605, + "loss_xval": 1.2890625, + "num_input_tokens_seen": 38697752, + "step": 224 + }, + { + "epoch": 0.08992805755395683, + "grad_norm": 266.2162340533468, + "learning_rate": 4.904590001427903e-06, + "loss": 2.0047, + "num_input_tokens_seen": 38870744, + "step": 225 + }, + { + "epoch": 0.08992805755395683, + "loss": 1.9634662866592407, + "loss_ce": 0.619227945804596, + "loss_xval": 1.34375, + "num_input_tokens_seen": 38870744, + "step": 225 + }, + { + "epoch": 0.0903277378097522, + "grad_norm": 297.0352644872155, + "learning_rate": 4.908605783784996e-06, + "loss": 2.0244, + "num_input_tokens_seen": 39043776, + "step": 226 + }, + { + "epoch": 0.0903277378097522, + "loss": 1.9414169788360596, + "loss_ce": 0.63819420337677, + "loss_xval": 1.3046875, + "num_input_tokens_seen": 39043776, + "step": 226 + }, + { + "epoch": 0.09072741806554756, + "grad_norm": 69.86260872464577, + "learning_rate": 4.912603836360931e-06, + "loss": 1.8326, + "num_input_tokens_seen": 39216696, + "step": 227 + }, + { + "epoch": 0.09072741806554756, + "loss": 1.9535454511642456, + "loss_ce": 0.583916425704956, + "loss_xval": 1.3671875, + "num_input_tokens_seen": 39216696, + "step": 227 + }, + { + "epoch": 0.09112709832134293, + "grad_norm": 188.86005864559968, + "learning_rate": 4.916584315022672e-06, + "loss": 1.8476, + "num_input_tokens_seen": 39389624, + "step": 228 + }, + { + "epoch": 0.09112709832134293, + "loss": 1.6824400424957275, + "loss_ce": 0.5689146518707275, + "loss_xval": 1.1171875, + "num_input_tokens_seen": 39389624, + "step": 228 + }, + { + "epoch": 0.09152677857713828, + "grad_norm": 104.91403215825835, + "learning_rate": 4.920547373590778e-06, + "loss": 1.9768, + "num_input_tokens_seen": 39562616, + "step": 229 + }, + { + "epoch": 0.09152677857713828, + "loss": 2.0511388778686523, + "loss_ce": 0.6551427245140076, + "loss_xval": 1.3984375, + "num_input_tokens_seen": 39562616, + "step": 229 + }, + { + "epoch": 0.09192645883293366, + "grad_norm": 138.43126034850636, + "learning_rate": 4.924493163875066e-06, + "loss": 1.6764, + "num_input_tokens_seen": 39735632, + "step": 230 + }, + { + "epoch": 0.09192645883293366, + "loss": 1.7603843212127686, + "loss_ce": 0.5760581493377686, + "loss_xval": 1.1875, + "num_input_tokens_seen": 39735632, + "step": 230 + }, + { + "epoch": 0.09232613908872901, + "grad_norm": 112.73730815561031, + "learning_rate": 4.92842183570951e-06, + "loss": 2.2555, + "num_input_tokens_seen": 39908488, + "step": 231 + }, + { + "epoch": 0.09232613908872901, + "loss": 2.1147522926330566, + "loss_ce": 0.5903382301330566, + "loss_xval": 1.5234375, + 
"num_input_tokens_seen": 39908488, + "step": 231 + }, + { + "epoch": 0.09272581934452438, + "grad_norm": 210.33274716994967, + "learning_rate": 4.932333536986379e-06, + "loss": 1.8486, + "num_input_tokens_seen": 40081488, + "step": 232 + }, + { + "epoch": 0.09272581934452438, + "loss": 1.7108758687973022, + "loss_ce": 0.604186475276947, + "loss_xval": 1.109375, + "num_input_tokens_seen": 40081488, + "step": 232 + }, + { + "epoch": 0.09312549960031974, + "grad_norm": 295.7627724529692, + "learning_rate": 4.936228413689641e-06, + "loss": 2.1929, + "num_input_tokens_seen": 40254872, + "step": 233 + }, + { + "epoch": 0.09312549960031974, + "loss": 2.279324531555176, + "loss_ce": 0.6152620315551758, + "loss_xval": 1.6640625, + "num_input_tokens_seen": 40254872, + "step": 233 + }, + { + "epoch": 0.09352517985611511, + "grad_norm": 94.5253042766975, + "learning_rate": 4.940106609927657e-06, + "loss": 1.8654, + "num_input_tokens_seen": 40428056, + "step": 234 + }, + { + "epoch": 0.09352517985611511, + "loss": 1.7779114246368408, + "loss_ce": 0.5865051746368408, + "loss_xval": 1.1875, + "num_input_tokens_seen": 40428056, + "step": 234 + }, + { + "epoch": 0.09392486011191047, + "grad_norm": 218.99104092091028, + "learning_rate": 4.943968267965172e-06, + "loss": 1.9661, + "num_input_tokens_seen": 40600888, + "step": 235 + }, + { + "epoch": 0.09392486011191047, + "loss": 2.0361690521240234, + "loss_ce": 0.5542352199554443, + "loss_xval": 1.484375, + "num_input_tokens_seen": 40600888, + "step": 235 + }, + { + "epoch": 0.09432454036770584, + "grad_norm": 166.17032002216988, + "learning_rate": 4.947813528254631e-06, + "loss": 2.1058, + "num_input_tokens_seen": 40773440, + "step": 236 + }, + { + "epoch": 0.09432454036770584, + "loss": 1.6222901344299316, + "loss_ce": 0.5912842154502869, + "loss_xval": 1.03125, + "num_input_tokens_seen": 40773440, + "step": 236 + }, + { + "epoch": 0.09472422062350119, + "grad_norm": 84.32664481341867, + "learning_rate": 4.95164252946683e-06, + "loss": 1.5917, + "num_input_tokens_seen": 40946384, + "step": 237 + }, + { + "epoch": 0.09472422062350119, + "loss": 1.9710441827774048, + "loss_ce": 0.56601482629776, + "loss_xval": 1.40625, + "num_input_tokens_seen": 40946384, + "step": 237 + }, + { + "epoch": 0.09512390087929656, + "grad_norm": 155.71448367542862, + "learning_rate": 4.955455408520925e-06, + "loss": 1.4781, + "num_input_tokens_seen": 41119280, + "step": 238 + }, + { + "epoch": 0.09512390087929656, + "loss": 1.3114783763885498, + "loss_ce": 0.5365760326385498, + "loss_xval": 0.7734375, + "num_input_tokens_seen": 41119280, + "step": 238 + }, + { + "epoch": 0.09552358113509192, + "grad_norm": 107.51330242252179, + "learning_rate": 4.959252300613805e-06, + "loss": 2.1855, + "num_input_tokens_seen": 41291848, + "step": 239 + }, + { + "epoch": 0.09552358113509192, + "loss": 2.2059006690979004, + "loss_ce": 0.5362229347229004, + "loss_xval": 1.671875, + "num_input_tokens_seen": 41291848, + "step": 239 + }, + { + "epoch": 0.09592326139088729, + "grad_norm": 62.6627619626531, + "learning_rate": 4.963033339248863e-06, + "loss": 1.7001, + "num_input_tokens_seen": 41464768, + "step": 240 + }, + { + "epoch": 0.09592326139088729, + "loss": 1.9659799337387085, + "loss_ce": 0.5311654210090637, + "loss_xval": 1.4375, + "num_input_tokens_seen": 41464768, + "step": 240 + }, + { + "epoch": 0.09632294164668265, + "grad_norm": 140.82973218984645, + "learning_rate": 4.96679865626416e-06, + "loss": 1.886, + "num_input_tokens_seen": 41637768, + "step": 241 + }, + { + "epoch": 
0.09632294164668265, + "loss": 2.0564374923706055, + "loss_ce": 0.550822377204895, + "loss_xval": 1.5078125, + "num_input_tokens_seen": 41637768, + "step": 241 + }, + { + "epoch": 0.09672262190247802, + "grad_norm": 418.2780183282535, + "learning_rate": 4.970548381860003e-06, + "loss": 1.9494, + "num_input_tokens_seen": 41811136, + "step": 242 + }, + { + "epoch": 0.09672262190247802, + "loss": 2.3037490844726562, + "loss_ce": 0.5615614652633667, + "loss_xval": 1.7421875, + "num_input_tokens_seen": 41811136, + "step": 242 + }, + { + "epoch": 0.09712230215827339, + "grad_norm": 696.7436633406282, + "learning_rate": 4.974282644625969e-06, + "loss": 2.7664, + "num_input_tokens_seen": 41983952, + "step": 243 + }, + { + "epoch": 0.09712230215827339, + "loss": 3.1029319763183594, + "loss_ce": 0.5375022888183594, + "loss_xval": 2.5625, + "num_input_tokens_seen": 41983952, + "step": 243 + }, + { + "epoch": 0.09752198241406874, + "grad_norm": 650.7634262460341, + "learning_rate": 4.978001571567359e-06, + "loss": 2.7999, + "num_input_tokens_seen": 42156848, + "step": 244 + }, + { + "epoch": 0.09752198241406874, + "loss": 2.580700397491455, + "loss_ce": 0.5709348917007446, + "loss_xval": 2.015625, + "num_input_tokens_seen": 42156848, + "step": 244 + }, + { + "epoch": 0.09792166266986412, + "grad_norm": 88.52292597475672, + "learning_rate": 4.981705288131116e-06, + "loss": 1.7696, + "num_input_tokens_seen": 42329736, + "step": 245 + }, + { + "epoch": 0.09792166266986412, + "loss": 1.907859206199646, + "loss_ce": 0.625144362449646, + "loss_xval": 1.28125, + "num_input_tokens_seen": 42329736, + "step": 245 + }, + { + "epoch": 0.09832134292565947, + "grad_norm": 462.59926637475985, + "learning_rate": 4.98539391823122e-06, + "loss": 2.623, + "num_input_tokens_seen": 42502616, + "step": 246 + }, + { + "epoch": 0.09832134292565947, + "loss": 2.2705206871032715, + "loss_ce": 0.6870246529579163, + "loss_xval": 1.5859375, + "num_input_tokens_seen": 42502616, + "step": 246 + }, + { + "epoch": 0.09872102318145484, + "grad_norm": 219.64091631843007, + "learning_rate": 4.989067584273563e-06, + "loss": 2.1558, + "num_input_tokens_seen": 42675480, + "step": 247 + }, + { + "epoch": 0.09872102318145484, + "loss": 2.346247911453247, + "loss_ce": 0.6250565648078918, + "loss_xval": 1.71875, + "num_input_tokens_seen": 42675480, + "step": 247 + }, + { + "epoch": 0.0991207034372502, + "grad_norm": 452.2760431268779, + "learning_rate": 4.992726407180318e-06, + "loss": 2.4239, + "num_input_tokens_seen": 42848424, + "step": 248 + }, + { + "epoch": 0.0991207034372502, + "loss": 2.3382084369659424, + "loss_ce": 0.6443606615066528, + "loss_xval": 1.6953125, + "num_input_tokens_seen": 42848424, + "step": 248 + }, + { + "epoch": 0.09952038369304557, + "grad_norm": 289.58749315357915, + "learning_rate": 4.996370506413826e-06, + "loss": 2.1094, + "num_input_tokens_seen": 43021520, + "step": 249 + }, + { + "epoch": 0.09952038369304557, + "loss": 2.083463191986084, + "loss_ce": 0.6484045386314392, + "loss_xval": 1.4375, + "num_input_tokens_seen": 43021520, + "step": 249 + }, + { + "epoch": 0.09992006394884093, + "grad_norm": 305.5139012775951, + "learning_rate": 5e-06, + "loss": 2.3916, + "num_input_tokens_seen": 43194472, + "step": 250 + }, + { + "epoch": 0.09992006394884093, + "eval_websight_new_IoU": 0.02511245897039771, + "eval_websight_new_MAE_all": 0.06440733931958675, + "eval_websight_new_MAE_h": 0.030316845513880253, + "eval_websight_new_MAE_w": 0.1007080115377903, + "eval_websight_new_MAE_x": 0.058023618534207344, + 
"eval_websight_new_MAE_y": 0.06858088076114655, + "eval_websight_new_NUM_probability": 0.0004394065181259066, + "eval_websight_new_inside_bbox": 0.1302083358168602, + "eval_websight_new_loss": 1.9591528177261353, + "eval_websight_new_loss_ce": 0.8309407234191895, + "eval_websight_new_loss_xval": 0.973876953125, + "eval_websight_new_runtime": 59.2945, + "eval_websight_new_samples_per_second": 0.843, + "eval_websight_new_steps_per_second": 0.034, + "num_input_tokens_seen": 43194472, + "step": 250 + }, + { + "epoch": 0.09992006394884093, + "eval_seeclick_IoU": 0.0937136560678482, + "eval_seeclick_MAE_all": 0.11204610392451286, + "eval_seeclick_MAE_h": 0.04166124016046524, + "eval_seeclick_MAE_w": 0.16875187307596207, + "eval_seeclick_MAE_x": 0.1465640515089035, + "eval_seeclick_MAE_y": 0.09120727330446243, + "eval_seeclick_NUM_probability": 0.00042376687633804977, + "eval_seeclick_inside_bbox": 0.2517361119389534, + "eval_seeclick_loss": 4.182728290557861, + "eval_seeclick_loss_ce": 0.9389870762825012, + "eval_seeclick_loss_xval": 3.177978515625, + "eval_seeclick_runtime": 89.2398, + "eval_seeclick_samples_per_second": 0.56, + "eval_seeclick_steps_per_second": 0.022, + "num_input_tokens_seen": 43194472, + "step": 250 + }, + { + "epoch": 0.09992006394884093, + "eval_icons_IoU": 0.0013925364146416541, + "eval_icons_MAE_all": 0.053750623017549515, + "eval_icons_MAE_h": 0.015283203683793545, + "eval_icons_MAE_w": 0.02879231609404087, + "eval_icons_MAE_x": 0.09461009502410889, + "eval_icons_MAE_y": 0.07631688378751278, + "eval_icons_NUM_probability": 0.0005365281249396503, + "eval_icons_inside_bbox": 0.02777777798473835, + "eval_icons_loss": 1.4559746980667114, + "eval_icons_loss_ce": 0.7883208990097046, + "eval_icons_loss_xval": 0.63275146484375, + "eval_icons_runtime": 83.7242, + "eval_icons_samples_per_second": 0.597, + "eval_icons_steps_per_second": 0.024, + "num_input_tokens_seen": 43194472, + "step": 250 + }, + { + "epoch": 0.09992006394884093, + "loss": 1.3925867080688477, + "loss_ce": 0.7909021377563477, + "loss_xval": 0.6015625, + "num_input_tokens_seen": 43194472, + "step": 250 + }, + { + "epoch": 0.1003197442046363, + "grad_norm": 420.7423871157765, + "learning_rate": 5e-06, + "loss": 2.5456, + "num_input_tokens_seen": 43367312, + "step": 251 + }, + { + "epoch": 0.1003197442046363, + "loss": 2.7929365634918213, + "loss_ce": 0.6552413105964661, + "loss_xval": 2.140625, + "num_input_tokens_seen": 43367312, + "step": 251 + }, + { + "epoch": 0.10071942446043165, + "grad_norm": 7157.590435326808, + "learning_rate": 5e-06, + "loss": 3.7782, + "num_input_tokens_seen": 43540136, + "step": 252 + }, + { + "epoch": 0.10071942446043165, + "loss": 3.858090877532959, + "loss_ce": 0.6344579458236694, + "loss_xval": 3.21875, + "num_input_tokens_seen": 43540136, + "step": 252 + }, + { + "epoch": 0.10111910471622702, + "grad_norm": 1371.1194141036922, + "learning_rate": 5e-06, + "loss": 8.4469, + "num_input_tokens_seen": 43713352, + "step": 253 + }, + { + "epoch": 0.10111910471622702, + "loss": 7.681779861450195, + "loss_ce": 1.0880297422409058, + "loss_xval": 6.59375, + "num_input_tokens_seen": 43713352, + "step": 253 + }, + { + "epoch": 0.10151878497202238, + "grad_norm": 389.4763597106653, + "learning_rate": 5e-06, + "loss": 5.2771, + "num_input_tokens_seen": 43886232, + "step": 254 + }, + { + "epoch": 0.10151878497202238, + "loss": 5.221000671386719, + "loss_ce": 1.2112352848052979, + "loss_xval": 4.0, + "num_input_tokens_seen": 43886232, + "step": 254 + }, + { + "epoch": 0.10191846522781775, + 
"grad_norm": 1409.542196888878, + "learning_rate": 5e-06, + "loss": 7.2904, + "num_input_tokens_seen": 44055544, + "step": 255 + }, + { + "epoch": 0.10191846522781775, + "loss": 7.3611297607421875, + "loss_ce": 1.2146453857421875, + "loss_xval": 6.15625, + "num_input_tokens_seen": 44055544, + "step": 255 + }, + { + "epoch": 0.10231814548361311, + "grad_norm": 324.9012802617059, + "learning_rate": 5e-06, + "loss": 5.3018, + "num_input_tokens_seen": 44228576, + "step": 256 + }, + { + "epoch": 0.10231814548361311, + "loss": 5.863863945007324, + "loss_ce": 1.2427700757980347, + "loss_xval": 4.625, + "num_input_tokens_seen": 44228576, + "step": 256 + }, + { + "epoch": 0.10271782573940848, + "grad_norm": 1193.3422190631172, + "learning_rate": 5e-06, + "loss": 6.202, + "num_input_tokens_seen": 44401408, + "step": 257 + }, + { + "epoch": 0.10271782573940848, + "loss": 6.257023811340332, + "loss_ce": 1.2023365497589111, + "loss_xval": 5.0625, + "num_input_tokens_seen": 44401408, + "step": 257 + }, + { + "epoch": 0.10311750599520383, + "grad_norm": 618.3582068326372, + "learning_rate": 5e-06, + "loss": 4.5653, + "num_input_tokens_seen": 44574456, + "step": 258 + }, + { + "epoch": 0.10311750599520383, + "loss": 4.771925449371338, + "loss_ce": 1.170362949371338, + "loss_xval": 3.59375, + "num_input_tokens_seen": 44574456, + "step": 258 + }, + { + "epoch": 0.1035171862509992, + "grad_norm": 519.3942566880261, + "learning_rate": 5e-06, + "loss": 4.8413, + "num_input_tokens_seen": 44747248, + "step": 259 + }, + { + "epoch": 0.1035171862509992, + "loss": 4.076366901397705, + "loss_ce": 1.152538537979126, + "loss_xval": 2.921875, + "num_input_tokens_seen": 44747248, + "step": 259 + }, + { + "epoch": 0.10391686650679456, + "grad_norm": 794.491302525678, + "learning_rate": 5e-06, + "loss": 5.0691, + "num_input_tokens_seen": 44920312, + "step": 260 + }, + { + "epoch": 0.10391686650679456, + "loss": 5.723645210266113, + "loss_ce": 1.1572389602661133, + "loss_xval": 4.5625, + "num_input_tokens_seen": 44920312, + "step": 260 + }, + { + "epoch": 0.10431654676258993, + "grad_norm": 233.2312194034501, + "learning_rate": 5e-06, + "loss": 3.5956, + "num_input_tokens_seen": 45093192, + "step": 261 + }, + { + "epoch": 0.10431654676258993, + "loss": 3.4732885360717773, + "loss_ce": 1.1500463485717773, + "loss_xval": 2.328125, + "num_input_tokens_seen": 45093192, + "step": 261 + }, + { + "epoch": 0.10471622701838529, + "grad_norm": 400.30987966580153, + "learning_rate": 5e-06, + "loss": 4.4252, + "num_input_tokens_seen": 45266064, + "step": 262 + }, + { + "epoch": 0.10471622701838529, + "loss": 4.673203945159912, + "loss_ce": 1.167344331741333, + "loss_xval": 3.5, + "num_input_tokens_seen": 45266064, + "step": 262 + }, + { + "epoch": 0.10511590727418066, + "grad_norm": 546.3231363919651, + "learning_rate": 5e-06, + "loss": 4.737, + "num_input_tokens_seen": 45439016, + "step": 263 + }, + { + "epoch": 0.10511590727418066, + "loss": 5.174367904663086, + "loss_ce": 1.1450711488723755, + "loss_xval": 4.03125, + "num_input_tokens_seen": 45439016, + "step": 263 + }, + { + "epoch": 0.10551558752997602, + "grad_norm": 241.3773646667893, + "learning_rate": 5e-06, + "loss": 3.0476, + "num_input_tokens_seen": 45612712, + "step": 264 + }, + { + "epoch": 0.10551558752997602, + "loss": 3.1645121574401855, + "loss_ce": 1.1503520011901855, + "loss_xval": 2.015625, + "num_input_tokens_seen": 45612712, + "step": 264 + }, + { + "epoch": 0.10591526778577139, + "grad_norm": 291.824625291368, + "learning_rate": 5e-06, + "loss": 3.1524, + 
"num_input_tokens_seen": 45785736, + "step": 265 + }, + { + "epoch": 0.10591526778577139, + "loss": 3.5120248794555664, + "loss_ce": 1.1692512035369873, + "loss_xval": 2.34375, + "num_input_tokens_seen": 45785736, + "step": 265 + }, + { + "epoch": 0.10631494804156674, + "grad_norm": 323.654467074144, + "learning_rate": 5e-06, + "loss": 2.7766, + "num_input_tokens_seen": 45958904, + "step": 266 + }, + { + "epoch": 0.10631494804156674, + "loss": 2.4206995964050293, + "loss_ce": 1.1362760066986084, + "loss_xval": 1.28125, + "num_input_tokens_seen": 45958904, + "step": 266 + }, + { + "epoch": 0.10671462829736211, + "grad_norm": 261.4508945724977, + "learning_rate": 5e-06, + "loss": 2.9974, + "num_input_tokens_seen": 46132264, + "step": 267 + }, + { + "epoch": 0.10671462829736211, + "loss": 2.7609076499938965, + "loss_ce": 1.150556206703186, + "loss_xval": 1.609375, + "num_input_tokens_seen": 46132264, + "step": 267 + }, + { + "epoch": 0.10711430855315747, + "grad_norm": 225.15151507021258, + "learning_rate": 5e-06, + "loss": 2.637, + "num_input_tokens_seen": 46305184, + "step": 268 + }, + { + "epoch": 0.10711430855315747, + "loss": 2.379305362701416, + "loss_ce": 1.1512782573699951, + "loss_xval": 1.2265625, + "num_input_tokens_seen": 46305184, + "step": 268 + }, + { + "epoch": 0.10751398880895284, + "grad_norm": 296.01349816516694, + "learning_rate": 5e-06, + "loss": 2.8834, + "num_input_tokens_seen": 46478368, + "step": 269 + }, + { + "epoch": 0.10751398880895284, + "loss": 3.2463014125823975, + "loss_ce": 1.113977074623108, + "loss_xval": 2.125, + "num_input_tokens_seen": 46478368, + "step": 269 + }, + { + "epoch": 0.1079136690647482, + "grad_norm": 155.6721663099127, + "learning_rate": 5e-06, + "loss": 3.2604, + "num_input_tokens_seen": 46651192, + "step": 270 + }, + { + "epoch": 0.1079136690647482, + "loss": 3.4313535690307617, + "loss_ce": 1.0978577136993408, + "loss_xval": 2.328125, + "num_input_tokens_seen": 46651192, + "step": 270 + }, + { + "epoch": 0.10831334932054357, + "grad_norm": 159.3586976784072, + "learning_rate": 5e-06, + "loss": 2.9097, + "num_input_tokens_seen": 46823960, + "step": 271 + }, + { + "epoch": 0.10831334932054357, + "loss": 2.574904441833496, + "loss_ce": 1.068800926208496, + "loss_xval": 1.5078125, + "num_input_tokens_seen": 46823960, + "step": 271 + }, + { + "epoch": 0.10871302957633892, + "grad_norm": 100.09567673766682, + "learning_rate": 5e-06, + "loss": 2.8126, + "num_input_tokens_seen": 46996704, + "step": 272 + }, + { + "epoch": 0.10871302957633892, + "loss": 3.1134049892425537, + "loss_ce": 1.0472428798675537, + "loss_xval": 2.0625, + "num_input_tokens_seen": 46996704, + "step": 272 + }, + { + "epoch": 0.1091127098321343, + "grad_norm": 180.69118269302496, + "learning_rate": 5e-06, + "loss": 2.3287, + "num_input_tokens_seen": 47169304, + "step": 273 + }, + { + "epoch": 0.1091127098321343, + "loss": 2.5099315643310547, + "loss_ce": 1.0426464080810547, + "loss_xval": 1.46875, + "num_input_tokens_seen": 47169304, + "step": 273 + }, + { + "epoch": 0.10951239008792965, + "grad_norm": 88.69240950555843, + "learning_rate": 5e-06, + "loss": 2.7934, + "num_input_tokens_seen": 47342552, + "step": 274 + }, + { + "epoch": 0.10951239008792965, + "loss": 3.1992688179016113, + "loss_ce": 1.0791513919830322, + "loss_xval": 2.125, + "num_input_tokens_seen": 47342552, + "step": 274 + }, + { + "epoch": 0.10991207034372502, + "grad_norm": 68.31354191387534, + "learning_rate": 5e-06, + "loss": 2.3395, + "num_input_tokens_seen": 47515488, + "step": 275 + }, + { + "epoch": 
0.10991207034372502, + "loss": 2.1528122425079346, + "loss_ce": 1.0068161487579346, + "loss_xval": 1.1484375, + "num_input_tokens_seen": 47515488, + "step": 275 + }, + { + "epoch": 0.11031175059952038, + "grad_norm": 98.29379090415762, + "learning_rate": 5e-06, + "loss": 2.5491, + "num_input_tokens_seen": 47687864, + "step": 276 + }, + { + "epoch": 0.11031175059952038, + "loss": 2.4284067153930664, + "loss_ce": 1.0075082778930664, + "loss_xval": 1.421875, + "num_input_tokens_seen": 47687864, + "step": 276 + }, + { + "epoch": 0.11071143085531575, + "grad_norm": 208.59052885258336, + "learning_rate": 5e-06, + "loss": 2.4267, + "num_input_tokens_seen": 47860776, + "step": 277 + }, + { + "epoch": 0.11071143085531575, + "loss": 2.307206630706787, + "loss_ce": 0.9878706932067871, + "loss_xval": 1.3203125, + "num_input_tokens_seen": 47860776, + "step": 277 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 64.62028957246228, + "learning_rate": 5e-06, + "loss": 1.8629, + "num_input_tokens_seen": 48033416, + "step": 278 + }, + { + "epoch": 0.1111111111111111, + "loss": 1.5986448526382446, + "loss_ce": 0.9834105372428894, + "loss_xval": 0.6171875, + "num_input_tokens_seen": 48033416, + "step": 278 + }, + { + "epoch": 0.11151079136690648, + "grad_norm": 99.98342503248348, + "learning_rate": 5e-06, + "loss": 2.3711, + "num_input_tokens_seen": 48206160, + "step": 279 + }, + { + "epoch": 0.11151079136690648, + "loss": 2.391204357147217, + "loss_ce": 0.9517512321472168, + "loss_xval": 1.4375, + "num_input_tokens_seen": 48206160, + "step": 279 + }, + { + "epoch": 0.11191047162270183, + "grad_norm": 115.96544018388516, + "learning_rate": 5e-06, + "loss": 2.4155, + "num_input_tokens_seen": 48379424, + "step": 280 + }, + { + "epoch": 0.11191047162270183, + "loss": 2.3940885066986084, + "loss_ce": 0.947799563407898, + "loss_xval": 1.4453125, + "num_input_tokens_seen": 48379424, + "step": 280 + }, + { + "epoch": 0.1123101518784972, + "grad_norm": 104.52004608173208, + "learning_rate": 5e-06, + "loss": 2.3488, + "num_input_tokens_seen": 48552112, + "step": 281 + }, + { + "epoch": 0.1123101518784972, + "loss": 1.9197896718978882, + "loss_ce": 0.9357808828353882, + "loss_xval": 0.984375, + "num_input_tokens_seen": 48552112, + "step": 281 + }, + { + "epoch": 0.11270983213429256, + "grad_norm": 64.65365518524405, + "learning_rate": 5e-06, + "loss": 2.0232, + "num_input_tokens_seen": 48724952, + "step": 282 + }, + { + "epoch": 0.11270983213429256, + "loss": 2.230132818222046, + "loss_ce": 0.9171445369720459, + "loss_xval": 1.3125, + "num_input_tokens_seen": 48724952, + "step": 282 + }, + { + "epoch": 0.11310951239008793, + "grad_norm": 81.10524607428229, + "learning_rate": 5e-06, + "loss": 1.8553, + "num_input_tokens_seen": 48897816, + "step": 283 + }, + { + "epoch": 0.11310951239008793, + "loss": 1.7465626001358032, + "loss_ce": 0.9167286157608032, + "loss_xval": 0.828125, + "num_input_tokens_seen": 48897816, + "step": 283 + }, + { + "epoch": 0.1135091926458833, + "grad_norm": 88.08567804606139, + "learning_rate": 5e-06, + "loss": 2.2449, + "num_input_tokens_seen": 49071056, + "step": 284 + }, + { + "epoch": 0.1135091926458833, + "loss": 2.423125743865967, + "loss_ce": 0.8782038688659668, + "loss_xval": 1.546875, + "num_input_tokens_seen": 49071056, + "step": 284 + }, + { + "epoch": 0.11390887290167866, + "grad_norm": 56.69224113163489, + "learning_rate": 5e-06, + "loss": 2.3233, + "num_input_tokens_seen": 49244160, + "step": 285 + }, + { + "epoch": 0.11390887290167866, + "loss": 2.5701351165771484, + 
"loss_ce": 0.8718929290771484, + "loss_xval": 1.6953125, + "num_input_tokens_seen": 49244160, + "step": 285 + }, + { + "epoch": 0.11430855315747403, + "grad_norm": 117.84651651002588, + "learning_rate": 5e-06, + "loss": 2.0369, + "num_input_tokens_seen": 49416784, + "step": 286 + }, + { + "epoch": 0.11430855315747403, + "loss": 2.136676788330078, + "loss_ce": 0.8520088195800781, + "loss_xval": 1.28125, + "num_input_tokens_seen": 49416784, + "step": 286 + }, + { + "epoch": 0.11470823341326938, + "grad_norm": 81.01654553435571, + "learning_rate": 5e-06, + "loss": 2.112, + "num_input_tokens_seen": 49589688, + "step": 287 + }, + { + "epoch": 0.11470823341326938, + "loss": 1.9272973537445068, + "loss_ce": 0.8110864162445068, + "loss_xval": 1.1171875, + "num_input_tokens_seen": 49589688, + "step": 287 + }, + { + "epoch": 0.11510791366906475, + "grad_norm": 105.66559786842068, + "learning_rate": 5e-06, + "loss": 2.2775, + "num_input_tokens_seen": 49762488, + "step": 288 + }, + { + "epoch": 0.11510791366906475, + "loss": 2.5139994621276855, + "loss_ce": 0.7979352474212646, + "loss_xval": 1.71875, + "num_input_tokens_seen": 49762488, + "step": 288 + }, + { + "epoch": 0.11550759392486011, + "grad_norm": 97.41645579108356, + "learning_rate": 5e-06, + "loss": 2.2071, + "num_input_tokens_seen": 49935664, + "step": 289 + }, + { + "epoch": 0.11550759392486011, + "loss": 2.298358678817749, + "loss_ce": 0.8047064542770386, + "loss_xval": 1.4921875, + "num_input_tokens_seen": 49935664, + "step": 289 + }, + { + "epoch": 0.11590727418065548, + "grad_norm": 118.02443280432219, + "learning_rate": 5e-06, + "loss": 1.915, + "num_input_tokens_seen": 50108424, + "step": 290 + }, + { + "epoch": 0.11590727418065548, + "loss": 1.942828893661499, + "loss_ce": 0.788532018661499, + "loss_xval": 1.15625, + "num_input_tokens_seen": 50108424, + "step": 290 + }, + { + "epoch": 0.11630695443645084, + "grad_norm": 56.77553608993919, + "learning_rate": 5e-06, + "loss": 1.932, + "num_input_tokens_seen": 50281560, + "step": 291 + }, + { + "epoch": 0.11630695443645084, + "loss": 1.5962605476379395, + "loss_ce": 0.7529988288879395, + "loss_xval": 0.84375, + "num_input_tokens_seen": 50281560, + "step": 291 + }, + { + "epoch": 0.11670663469224621, + "grad_norm": 86.75564799614305, + "learning_rate": 5e-06, + "loss": 1.9127, + "num_input_tokens_seen": 50454736, + "step": 292 + }, + { + "epoch": 0.11670663469224621, + "loss": 1.680945634841919, + "loss_ce": 0.752234697341919, + "loss_xval": 0.9296875, + "num_input_tokens_seen": 50454736, + "step": 292 + }, + { + "epoch": 0.11710631494804156, + "grad_norm": 85.07023337957867, + "learning_rate": 5e-06, + "loss": 2.2476, + "num_input_tokens_seen": 50627480, + "step": 293 + }, + { + "epoch": 0.11710631494804156, + "loss": 2.6010489463806152, + "loss_ce": 0.7497307062149048, + "loss_xval": 1.8515625, + "num_input_tokens_seen": 50627480, + "step": 293 + }, + { + "epoch": 0.11750599520383694, + "grad_norm": 90.42009042380998, + "learning_rate": 5e-06, + "loss": 1.9658, + "num_input_tokens_seen": 50800168, + "step": 294 + }, + { + "epoch": 0.11750599520383694, + "loss": 1.597025752067566, + "loss_ce": 0.7454632520675659, + "loss_xval": 0.8515625, + "num_input_tokens_seen": 50800168, + "step": 294 + }, + { + "epoch": 0.11790567545963229, + "grad_norm": 82.83818839637908, + "learning_rate": 5e-06, + "loss": 1.7579, + "num_input_tokens_seen": 50973280, + "step": 295 + }, + { + "epoch": 0.11790567545963229, + "loss": 1.7075116634368896, + "loss_ce": 0.7070235013961792, + "loss_xval": 1.0, + 
"num_input_tokens_seen": 50973280, + "step": 295 + }, + { + "epoch": 0.11830535571542766, + "grad_norm": 171.28458309049162, + "learning_rate": 5e-06, + "loss": 1.87, + "num_input_tokens_seen": 51146272, + "step": 296 + }, + { + "epoch": 0.11830535571542766, + "loss": 1.3094793558120728, + "loss_ce": 0.6739814281463623, + "loss_xval": 0.63671875, + "num_input_tokens_seen": 51146272, + "step": 296 + }, + { + "epoch": 0.11870503597122302, + "grad_norm": 108.10320516499523, + "learning_rate": 5e-06, + "loss": 1.9295, + "num_input_tokens_seen": 51319280, + "step": 297 + }, + { + "epoch": 0.11870503597122302, + "loss": 1.6906158924102783, + "loss_ce": 0.6525299549102783, + "loss_xval": 1.0390625, + "num_input_tokens_seen": 51319280, + "step": 297 + }, + { + "epoch": 0.11910471622701839, + "grad_norm": 209.76838886070274, + "learning_rate": 5e-06, + "loss": 1.8786, + "num_input_tokens_seen": 51492232, + "step": 298 + }, + { + "epoch": 0.11910471622701839, + "loss": 1.683868646621704, + "loss_ce": 0.6523745059967041, + "loss_xval": 1.03125, + "num_input_tokens_seen": 51492232, + "step": 298 + }, + { + "epoch": 0.11950439648281375, + "grad_norm": 473.78852226640424, + "learning_rate": 5e-06, + "loss": 2.029, + "num_input_tokens_seen": 51665016, + "step": 299 + }, + { + "epoch": 0.11950439648281375, + "loss": 2.0970144271850586, + "loss_ce": 0.5896900296211243, + "loss_xval": 1.5078125, + "num_input_tokens_seen": 51665016, + "step": 299 + }, + { + "epoch": 0.11990407673860912, + "grad_norm": 358.2945807266524, + "learning_rate": 5e-06, + "loss": 2.0276, + "num_input_tokens_seen": 51837904, + "step": 300 + }, + { + "epoch": 0.11990407673860912, + "loss": 1.9197452068328857, + "loss_ce": 0.6052920818328857, + "loss_xval": 1.3125, + "num_input_tokens_seen": 51837904, + "step": 300 + }, + { + "epoch": 0.12030375699440447, + "grad_norm": 114.31359760931073, + "learning_rate": 5e-06, + "loss": 1.9761, + "num_input_tokens_seen": 52011016, + "step": 301 + }, + { + "epoch": 0.12030375699440447, + "loss": 2.001715660095215, + "loss_ce": 0.6130439043045044, + "loss_xval": 1.390625, + "num_input_tokens_seen": 52011016, + "step": 301 + }, + { + "epoch": 0.12070343725019984, + "grad_norm": 287.61305918926126, + "learning_rate": 5e-06, + "loss": 2.2437, + "num_input_tokens_seen": 52183608, + "step": 302 + }, + { + "epoch": 0.12070343725019984, + "loss": 2.397970676422119, + "loss_ce": 0.5862032771110535, + "loss_xval": 1.8125, + "num_input_tokens_seen": 52183608, + "step": 302 + }, + { + "epoch": 0.1211031175059952, + "grad_norm": 69.49781036705235, + "learning_rate": 5e-06, + "loss": 2.0359, + "num_input_tokens_seen": 52356680, + "step": 303 + }, + { + "epoch": 0.1211031175059952, + "loss": 2.0368640422821045, + "loss_ce": 0.6423327922821045, + "loss_xval": 1.390625, + "num_input_tokens_seen": 52356680, + "step": 303 + }, + { + "epoch": 0.12150279776179057, + "grad_norm": 249.25827639483862, + "learning_rate": 5e-06, + "loss": 1.9618, + "num_input_tokens_seen": 52529992, + "step": 304 + }, + { + "epoch": 0.12150279776179057, + "loss": 1.9398987293243408, + "loss_ce": 0.6230041980743408, + "loss_xval": 1.3203125, + "num_input_tokens_seen": 52529992, + "step": 304 + }, + { + "epoch": 0.12190247801758593, + "grad_norm": 93.74391986745098, + "learning_rate": 5e-06, + "loss": 1.6661, + "num_input_tokens_seen": 52703208, + "step": 305 + }, + { + "epoch": 0.12190247801758593, + "loss": 1.5010509490966797, + "loss_ce": 0.6054210066795349, + "loss_xval": 0.89453125, + "num_input_tokens_seen": 52703208, + "step": 305 + }, 
+ { + "epoch": 0.1223021582733813, + "grad_norm": 170.71986052407345, + "learning_rate": 5e-06, + "loss": 1.752, + "num_input_tokens_seen": 52875752, + "step": 306 + }, + { + "epoch": 0.1223021582733813, + "loss": 1.5768327713012695, + "loss_ce": 0.6339616775512695, + "loss_xval": 0.94140625, + "num_input_tokens_seen": 52875752, + "step": 306 + }, + { + "epoch": 0.12270183852917665, + "grad_norm": 71.67655156139799, + "learning_rate": 5e-06, + "loss": 1.8598, + "num_input_tokens_seen": 53048824, + "step": 307 + }, + { + "epoch": 0.12270183852917665, + "loss": 1.722001552581787, + "loss_ce": 0.6446090340614319, + "loss_xval": 1.078125, + "num_input_tokens_seen": 53048824, + "step": 307 + }, + { + "epoch": 0.12310151878497202, + "grad_norm": 137.84224023687867, + "learning_rate": 5e-06, + "loss": 2.1068, + "num_input_tokens_seen": 53221872, + "step": 308 + }, + { + "epoch": 0.12310151878497202, + "loss": 2.6642374992370605, + "loss_ce": 0.6627727746963501, + "loss_xval": 2.0, + "num_input_tokens_seen": 53221872, + "step": 308 + }, + { + "epoch": 0.12350119904076738, + "grad_norm": 120.78443169609713, + "learning_rate": 5e-06, + "loss": 1.5313, + "num_input_tokens_seen": 53394760, + "step": 309 + }, + { + "epoch": 0.12350119904076738, + "loss": 1.376590609550476, + "loss_ce": 0.5723915100097656, + "loss_xval": 0.8046875, + "num_input_tokens_seen": 53394760, + "step": 309 + }, + { + "epoch": 0.12390087929656275, + "grad_norm": 149.46277474052187, + "learning_rate": 5e-06, + "loss": 1.587, + "num_input_tokens_seen": 53568000, + "step": 310 + }, + { + "epoch": 0.12390087929656275, + "loss": 1.5170848369598389, + "loss_ce": 0.5693309903144836, + "loss_xval": 0.94921875, + "num_input_tokens_seen": 53568000, + "step": 310 + }, + { + "epoch": 0.12430055955235811, + "grad_norm": 164.71563984387302, + "learning_rate": 5e-06, + "loss": 1.6372, + "num_input_tokens_seen": 53740792, + "step": 311 + }, + { + "epoch": 0.12430055955235811, + "loss": 1.6750078201293945, + "loss_ce": 0.578328013420105, + "loss_xval": 1.09375, + "num_input_tokens_seen": 53740792, + "step": 311 + }, + { + "epoch": 0.12470023980815348, + "grad_norm": 121.0108527291859, + "learning_rate": 5e-06, + "loss": 1.863, + "num_input_tokens_seen": 53913616, + "step": 312 + }, + { + "epoch": 0.12470023980815348, + "loss": 1.7013659477233887, + "loss_ce": 0.5402331352233887, + "loss_xval": 1.1640625, + "num_input_tokens_seen": 53913616, + "step": 312 + }, + { + "epoch": 0.12509992006394885, + "grad_norm": 202.9204004913828, + "learning_rate": 5e-06, + "loss": 1.7032, + "num_input_tokens_seen": 54086312, + "step": 313 + }, + { + "epoch": 0.12509992006394885, + "loss": 1.3899582624435425, + "loss_ce": 0.49823465943336487, + "loss_xval": 0.890625, + "num_input_tokens_seen": 54086312, + "step": 313 + }, + { + "epoch": 0.1254996003197442, + "grad_norm": 137.11461244391282, + "learning_rate": 5e-06, + "loss": 1.8513, + "num_input_tokens_seen": 54259016, + "step": 314 + }, + { + "epoch": 0.1254996003197442, + "loss": 1.7766376733779907, + "loss_ce": 0.5368915796279907, + "loss_xval": 1.2421875, + "num_input_tokens_seen": 54259016, + "step": 314 + }, + { + "epoch": 0.12589928057553956, + "grad_norm": 110.41993010150011, + "learning_rate": 5e-06, + "loss": 1.8425, + "num_input_tokens_seen": 54432112, + "step": 315 + }, + { + "epoch": 0.12589928057553956, + "loss": 1.9745423793792725, + "loss_ce": 0.4867495894432068, + "loss_xval": 1.484375, + "num_input_tokens_seen": 54432112, + "step": 315 + }, + { + "epoch": 0.12629896083133493, + "grad_norm": 
75.64074697829528, + "learning_rate": 5e-06, + "loss": 1.8845, + "num_input_tokens_seen": 54604632, + "step": 316 + }, + { + "epoch": 0.12629896083133493, + "loss": 1.714593768119812, + "loss_ce": 0.4909610152244568, + "loss_xval": 1.2265625, + "num_input_tokens_seen": 54604632, + "step": 316 + }, + { + "epoch": 0.1266986410871303, + "grad_norm": 147.26310133007271, + "learning_rate": 5e-06, + "loss": 1.3572, + "num_input_tokens_seen": 54777480, + "step": 317 + }, + { + "epoch": 0.1266986410871303, + "loss": 1.3109509944915771, + "loss_ce": 0.4650038480758667, + "loss_xval": 0.84765625, + "num_input_tokens_seen": 54777480, + "step": 317 + }, + { + "epoch": 0.12709832134292565, + "grad_norm": 159.3543242719045, + "learning_rate": 5e-06, + "loss": 1.607, + "num_input_tokens_seen": 54947464, + "step": 318 + }, + { + "epoch": 0.12709832134292565, + "loss": 1.791764736175537, + "loss_ce": 0.4758467674255371, + "loss_xval": 1.3125, + "num_input_tokens_seen": 54947464, + "step": 318 + }, + { + "epoch": 0.12749800159872102, + "grad_norm": 53.98166505832464, + "learning_rate": 5e-06, + "loss": 1.5758, + "num_input_tokens_seen": 55120368, + "step": 319 + }, + { + "epoch": 0.12749800159872102, + "loss": 1.6858811378479004, + "loss_ce": 0.45150619745254517, + "loss_xval": 1.234375, + "num_input_tokens_seen": 55120368, + "step": 319 + }, + { + "epoch": 0.1278976818545164, + "grad_norm": 178.3612818823336, + "learning_rate": 5e-06, + "loss": 1.7526, + "num_input_tokens_seen": 55293312, + "step": 320 + }, + { + "epoch": 0.1278976818545164, + "loss": 2.1778581142425537, + "loss_ce": 0.4437272548675537, + "loss_xval": 1.734375, + "num_input_tokens_seen": 55293312, + "step": 320 + }, + { + "epoch": 0.12829736211031176, + "grad_norm": 111.81551605653638, + "learning_rate": 5e-06, + "loss": 1.8185, + "num_input_tokens_seen": 55466064, + "step": 321 + }, + { + "epoch": 0.12829736211031176, + "loss": 1.8632432222366333, + "loss_ce": 0.4272081255912781, + "loss_xval": 1.4375, + "num_input_tokens_seen": 55466064, + "step": 321 + }, + { + "epoch": 0.1286970423661071, + "grad_norm": 115.38757013402358, + "learning_rate": 5e-06, + "loss": 1.433, + "num_input_tokens_seen": 55639048, + "step": 322 + }, + { + "epoch": 0.1286970423661071, + "loss": 1.242377758026123, + "loss_ce": 0.4625926613807678, + "loss_xval": 0.78125, + "num_input_tokens_seen": 55639048, + "step": 322 + }, + { + "epoch": 0.12909672262190247, + "grad_norm": 162.44283787518899, + "learning_rate": 5e-06, + "loss": 1.3945, + "num_input_tokens_seen": 55812136, + "step": 323 + }, + { + "epoch": 0.12909672262190247, + "loss": 1.231302261352539, + "loss_ce": 0.44077491760253906, + "loss_xval": 0.7890625, + "num_input_tokens_seen": 55812136, + "step": 323 + }, + { + "epoch": 0.12949640287769784, + "grad_norm": 125.96863724523318, + "learning_rate": 5e-06, + "loss": 1.2947, + "num_input_tokens_seen": 55984928, + "step": 324 + }, + { + "epoch": 0.12949640287769784, + "loss": 1.1521010398864746, + "loss_ce": 0.4384779930114746, + "loss_xval": 0.71484375, + "num_input_tokens_seen": 55984928, + "step": 324 + }, + { + "epoch": 0.1298960831334932, + "grad_norm": 397.9317077984938, + "learning_rate": 5e-06, + "loss": 1.9657, + "num_input_tokens_seen": 56157800, + "step": 325 + }, + { + "epoch": 0.1298960831334932, + "loss": 1.3564872741699219, + "loss_ce": 0.4021415710449219, + "loss_xval": 0.953125, + "num_input_tokens_seen": 56157800, + "step": 325 + }, + { + "epoch": 0.13029576338928858, + "grad_norm": 249.17242046790918, + "learning_rate": 5e-06, + "loss": 
1.8821, + "num_input_tokens_seen": 56330728, + "step": 326 + }, + { + "epoch": 0.13029576338928858, + "loss": 2.29518461227417, + "loss_ce": 0.3835635483264923, + "loss_xval": 1.9140625, + "num_input_tokens_seen": 56330728, + "step": 326 + }, + { + "epoch": 0.13069544364508393, + "grad_norm": 205.64286668663732, + "learning_rate": 5e-06, + "loss": 1.6595, + "num_input_tokens_seen": 56503560, + "step": 327 + }, + { + "epoch": 0.13069544364508393, + "loss": 2.169922351837158, + "loss_ce": 0.3920902609825134, + "loss_xval": 1.78125, + "num_input_tokens_seen": 56503560, + "step": 327 + }, + { + "epoch": 0.1310951239008793, + "grad_norm": 301.92393796778066, + "learning_rate": 5e-06, + "loss": 1.8751, + "num_input_tokens_seen": 56676496, + "step": 328 + }, + { + "epoch": 0.1310951239008793, + "loss": 1.8017125129699707, + "loss_ce": 0.42768919467926025, + "loss_xval": 1.375, + "num_input_tokens_seen": 56676496, + "step": 328 + }, + { + "epoch": 0.13149480415667467, + "grad_norm": 103.48048918281846, + "learning_rate": 5e-06, + "loss": 1.6365, + "num_input_tokens_seen": 56849424, + "step": 329 + }, + { + "epoch": 0.13149480415667467, + "loss": 1.7497020959854126, + "loss_ce": 0.42157718539237976, + "loss_xval": 1.328125, + "num_input_tokens_seen": 56849424, + "step": 329 + }, + { + "epoch": 0.13189448441247004, + "grad_norm": 171.99654308412548, + "learning_rate": 5e-06, + "loss": 1.9507, + "num_input_tokens_seen": 57021856, + "step": 330 + }, + { + "epoch": 0.13189448441247004, + "loss": 2.358372688293457, + "loss_ce": 0.41257184743881226, + "loss_xval": 1.9453125, + "num_input_tokens_seen": 57021856, + "step": 330 + }, + { + "epoch": 0.13229416466826538, + "grad_norm": 130.04743280621042, + "learning_rate": 5e-06, + "loss": 1.7586, + "num_input_tokens_seen": 57194984, + "step": 331 + }, + { + "epoch": 0.13229416466826538, + "loss": 1.5979559421539307, + "loss_ce": 0.4109441637992859, + "loss_xval": 1.1875, + "num_input_tokens_seen": 57194984, + "step": 331 + }, + { + "epoch": 0.13269384492406075, + "grad_norm": 95.61726348027165, + "learning_rate": 5e-06, + "loss": 1.8431, + "num_input_tokens_seen": 57367960, + "step": 332 + }, + { + "epoch": 0.13269384492406075, + "loss": 2.026491165161133, + "loss_ce": 0.4075947403907776, + "loss_xval": 1.6171875, + "num_input_tokens_seen": 57367960, + "step": 332 + }, + { + "epoch": 0.13309352517985612, + "grad_norm": 179.26524203097776, + "learning_rate": 5e-06, + "loss": 1.8041, + "num_input_tokens_seen": 57540624, + "step": 333 + }, + { + "epoch": 0.13309352517985612, + "loss": 1.6417033672332764, + "loss_ce": 0.41160082817077637, + "loss_xval": 1.2265625, + "num_input_tokens_seen": 57540624, + "step": 333 + }, + { + "epoch": 0.1334932054356515, + "grad_norm": 103.72564142699332, + "learning_rate": 5e-06, + "loss": 1.462, + "num_input_tokens_seen": 57713368, + "step": 334 + }, + { + "epoch": 0.1334932054356515, + "loss": 1.3674639463424683, + "loss_ce": 0.37429988384246826, + "loss_xval": 0.9921875, + "num_input_tokens_seen": 57713368, + "step": 334 + }, + { + "epoch": 0.13389288569144683, + "grad_norm": 226.10650455556296, + "learning_rate": 5e-06, + "loss": 1.5788, + "num_input_tokens_seen": 57886048, + "step": 335 + }, + { + "epoch": 0.13389288569144683, + "loss": 1.418921947479248, + "loss_ce": 0.3932870626449585, + "loss_xval": 1.0234375, + "num_input_tokens_seen": 57886048, + "step": 335 + }, + { + "epoch": 0.1342925659472422, + "grad_norm": 463.12575740719353, + "learning_rate": 5e-06, + "loss": 1.933, + "num_input_tokens_seen": 58058832, + 
"step": 336 + }, + { + "epoch": 0.1342925659472422, + "loss": 1.5288450717926025, + "loss_ce": 0.3584350347518921, + "loss_xval": 1.171875, + "num_input_tokens_seen": 58058832, + "step": 336 + }, + { + "epoch": 0.13469224620303757, + "grad_norm": 290.28182300021234, + "learning_rate": 5e-06, + "loss": 1.7178, + "num_input_tokens_seen": 58231872, + "step": 337 + }, + { + "epoch": 0.13469224620303757, + "loss": 1.5614473819732666, + "loss_ce": 0.3695529103279114, + "loss_xval": 1.1953125, + "num_input_tokens_seen": 58231872, + "step": 337 + }, + { + "epoch": 0.13509192645883294, + "grad_norm": 143.59463691949978, + "learning_rate": 5e-06, + "loss": 1.743, + "num_input_tokens_seen": 58404472, + "step": 338 + }, + { + "epoch": 0.13509192645883294, + "loss": 1.9069617986679077, + "loss_ce": 0.38645392656326294, + "loss_xval": 1.5234375, + "num_input_tokens_seen": 58404472, + "step": 338 + }, + { + "epoch": 0.1354916067146283, + "grad_norm": 345.1671201338629, + "learning_rate": 5e-06, + "loss": 1.9162, + "num_input_tokens_seen": 58577064, + "step": 339 + }, + { + "epoch": 0.1354916067146283, + "loss": 1.4432034492492676, + "loss_ce": 0.3567776679992676, + "loss_xval": 1.0859375, + "num_input_tokens_seen": 58577064, + "step": 339 + }, + { + "epoch": 0.13589128697042366, + "grad_norm": 139.11908462967634, + "learning_rate": 5e-06, + "loss": 1.5235, + "num_input_tokens_seen": 58750152, + "step": 340 + }, + { + "epoch": 0.13589128697042366, + "loss": 1.5724246501922607, + "loss_ce": 0.3890751004219055, + "loss_xval": 1.1796875, + "num_input_tokens_seen": 58750152, + "step": 340 + }, + { + "epoch": 0.13629096722621903, + "grad_norm": 254.4500953030448, + "learning_rate": 5e-06, + "loss": 1.4418, + "num_input_tokens_seen": 58922968, + "step": 341 + }, + { + "epoch": 0.13629096722621903, + "loss": 1.34968101978302, + "loss_ce": 0.38410484790802, + "loss_xval": 0.96484375, + "num_input_tokens_seen": 58922968, + "step": 341 + }, + { + "epoch": 0.1366906474820144, + "grad_norm": 100.59331456194423, + "learning_rate": 5e-06, + "loss": 1.3907, + "num_input_tokens_seen": 59092360, + "step": 342 + }, + { + "epoch": 0.1366906474820144, + "loss": 1.5428651571273804, + "loss_ce": 0.3946716785430908, + "loss_xval": 1.1484375, + "num_input_tokens_seen": 59092360, + "step": 342 + }, + { + "epoch": 0.13709032773780974, + "grad_norm": 171.42095258220323, + "learning_rate": 5e-06, + "loss": 1.9297, + "num_input_tokens_seen": 59265264, + "step": 343 + }, + { + "epoch": 0.13709032773780974, + "loss": 2.0073769092559814, + "loss_ce": 0.3777381181716919, + "loss_xval": 1.6328125, + "num_input_tokens_seen": 59265264, + "step": 343 + }, + { + "epoch": 0.1374900079936051, + "grad_norm": 182.6528152043041, + "learning_rate": 5e-06, + "loss": 1.6114, + "num_input_tokens_seen": 59438328, + "step": 344 + }, + { + "epoch": 0.1374900079936051, + "loss": 1.540045976638794, + "loss_ce": 0.35742881894111633, + "loss_xval": 1.1796875, + "num_input_tokens_seen": 59438328, + "step": 344 + }, + { + "epoch": 0.13788968824940048, + "grad_norm": 234.01657762932578, + "learning_rate": 5e-06, + "loss": 1.4521, + "num_input_tokens_seen": 59611160, + "step": 345 + }, + { + "epoch": 0.13788968824940048, + "loss": 1.1887354850769043, + "loss_ce": 0.4055323600769043, + "loss_xval": 0.78125, + "num_input_tokens_seen": 59611160, + "step": 345 + }, + { + "epoch": 0.13828936850519585, + "grad_norm": 135.3533298151207, + "learning_rate": 5e-06, + "loss": 1.5234, + "num_input_tokens_seen": 59784352, + "step": 346 + }, + { + "epoch": 
0.13828936850519585, + "loss": 1.3803753852844238, + "loss_ce": 0.4030805230140686, + "loss_xval": 0.9765625, + "num_input_tokens_seen": 59784352, + "step": 346 + }, + { + "epoch": 0.1386890487609912, + "grad_norm": 186.93581432030666, + "learning_rate": 5e-06, + "loss": 1.5216, + "num_input_tokens_seen": 59957304, + "step": 347 + }, + { + "epoch": 0.1386890487609912, + "loss": 1.4391629695892334, + "loss_ce": 0.37886011600494385, + "loss_xval": 1.0625, + "num_input_tokens_seen": 59957304, + "step": 347 + }, + { + "epoch": 0.13908872901678657, + "grad_norm": 119.53453883928591, + "learning_rate": 5e-06, + "loss": 2.1058, + "num_input_tokens_seen": 60130160, + "step": 348 + }, + { + "epoch": 0.13908872901678657, + "loss": 2.4009146690368652, + "loss_ce": 0.3730825185775757, + "loss_xval": 2.03125, + "num_input_tokens_seen": 60130160, + "step": 348 + }, + { + "epoch": 0.13948840927258194, + "grad_norm": 82.97817397860176, + "learning_rate": 5e-06, + "loss": 1.5156, + "num_input_tokens_seen": 60302752, + "step": 349 + }, + { + "epoch": 0.13948840927258194, + "loss": 1.3989337682724, + "loss_ce": 0.3671955168247223, + "loss_xval": 1.03125, + "num_input_tokens_seen": 60302752, + "step": 349 + }, + { + "epoch": 0.1398880895283773, + "grad_norm": 178.42499123342392, + "learning_rate": 5e-06, + "loss": 1.2829, + "num_input_tokens_seen": 60475776, + "step": 350 + }, + { + "epoch": 0.1398880895283773, + "loss": 1.3339847326278687, + "loss_ce": 0.38024938106536865, + "loss_xval": 0.953125, + "num_input_tokens_seen": 60475776, + "step": 350 + }, + { + "epoch": 0.14028776978417265, + "grad_norm": 220.105087076434, + "learning_rate": 5e-06, + "loss": 1.8338, + "num_input_tokens_seen": 60648632, + "step": 351 + }, + { + "epoch": 0.14028776978417265, + "loss": 1.7666809558868408, + "loss_ce": 0.39265748858451843, + "loss_xval": 1.375, + "num_input_tokens_seen": 60648632, + "step": 351 + }, + { + "epoch": 0.14068745003996802, + "grad_norm": 117.69512479208115, + "learning_rate": 5e-06, + "loss": 1.5094, + "num_input_tokens_seen": 60821424, + "step": 352 + }, + { + "epoch": 0.14068745003996802, + "loss": 1.3380788564682007, + "loss_ce": 0.3004812002182007, + "loss_xval": 1.0390625, + "num_input_tokens_seen": 60821424, + "step": 352 + }, + { + "epoch": 0.1410871302957634, + "grad_norm": 54.734844905000614, + "learning_rate": 5e-06, + "loss": 1.5579, + "num_input_tokens_seen": 60994352, + "step": 353 + }, + { + "epoch": 0.1410871302957634, + "loss": 1.776560664176941, + "loss_ce": 0.33783990144729614, + "loss_xval": 1.4375, + "num_input_tokens_seen": 60994352, + "step": 353 + }, + { + "epoch": 0.14148681055155876, + "grad_norm": 71.4528673149511, + "learning_rate": 5e-06, + "loss": 1.9286, + "num_input_tokens_seen": 61167184, + "step": 354 + }, + { + "epoch": 0.14148681055155876, + "loss": 1.7447428703308105, + "loss_ce": 0.3264079689979553, + "loss_xval": 1.421875, + "num_input_tokens_seen": 61167184, + "step": 354 + }, + { + "epoch": 0.1418864908073541, + "grad_norm": 144.41906447638016, + "learning_rate": 5e-06, + "loss": 1.3293, + "num_input_tokens_seen": 61340288, + "step": 355 + }, + { + "epoch": 0.1418864908073541, + "loss": 1.5950350761413574, + "loss_ce": 0.30743736028671265, + "loss_xval": 1.2890625, + "num_input_tokens_seen": 61340288, + "step": 355 + }, + { + "epoch": 0.14228617106314947, + "grad_norm": 97.59406386460213, + "learning_rate": 5e-06, + "loss": 1.3858, + "num_input_tokens_seen": 61513160, + "step": 356 + }, + { + "epoch": 0.14228617106314947, + "loss": 1.4868381023406982, + "loss_ce": 
0.26979708671569824, + "loss_xval": 1.21875, + "num_input_tokens_seen": 61513160, + "step": 356 + }, + { + "epoch": 0.14268585131894485, + "grad_norm": 113.54612045662563, + "learning_rate": 5e-06, + "loss": 1.3206, + "num_input_tokens_seen": 61686344, + "step": 357 + }, + { + "epoch": 0.14268585131894485, + "loss": 1.2290370464324951, + "loss_ce": 0.24270889163017273, + "loss_xval": 0.984375, + "num_input_tokens_seen": 61686344, + "step": 357 + }, + { + "epoch": 0.14308553157474022, + "grad_norm": 156.8539131457891, + "learning_rate": 5e-06, + "loss": 1.611, + "num_input_tokens_seen": 61859496, + "step": 358 + }, + { + "epoch": 0.14308553157474022, + "loss": 1.502457618713379, + "loss_ce": 0.25953781604766846, + "loss_xval": 1.2421875, + "num_input_tokens_seen": 61859496, + "step": 358 + }, + { + "epoch": 0.14348521183053556, + "grad_norm": 316.9389502143624, + "learning_rate": 5e-06, + "loss": 1.2872, + "num_input_tokens_seen": 62032208, + "step": 359 + }, + { + "epoch": 0.14348521183053556, + "loss": 1.4252452850341797, + "loss_ce": 0.2567882835865021, + "loss_xval": 1.171875, + "num_input_tokens_seen": 62032208, + "step": 359 + }, + { + "epoch": 0.14388489208633093, + "grad_norm": 271.7727423648551, + "learning_rate": 5e-06, + "loss": 1.571, + "num_input_tokens_seen": 62205280, + "step": 360 + }, + { + "epoch": 0.14388489208633093, + "loss": 1.4632078409194946, + "loss_ce": 0.28254371881484985, + "loss_xval": 1.1796875, + "num_input_tokens_seen": 62205280, + "step": 360 + }, + { + "epoch": 0.1442845723421263, + "grad_norm": 55.46079000927162, + "learning_rate": 5e-06, + "loss": 1.6113, + "num_input_tokens_seen": 62378232, + "step": 361 + }, + { + "epoch": 0.1442845723421263, + "loss": 1.6278996467590332, + "loss_ce": 0.2782902717590332, + "loss_xval": 1.3515625, + "num_input_tokens_seen": 62378232, + "step": 361 + }, + { + "epoch": 0.14468425259792167, + "grad_norm": 436.3956415187697, + "learning_rate": 5e-06, + "loss": 1.3398, + "num_input_tokens_seen": 62550920, + "step": 362 + }, + { + "epoch": 0.14468425259792167, + "loss": 1.3358556032180786, + "loss_ce": 0.2723791003227234, + "loss_xval": 1.0625, + "num_input_tokens_seen": 62550920, + "step": 362 + }, + { + "epoch": 0.145083932853717, + "grad_norm": 755.5556492215647, + "learning_rate": 5e-06, + "loss": 2.0904, + "num_input_tokens_seen": 62724048, + "step": 363 + }, + { + "epoch": 0.145083932853717, + "loss": 1.9951095581054688, + "loss_ce": 0.29784390330314636, + "loss_xval": 1.6953125, + "num_input_tokens_seen": 62724048, + "step": 363 + }, + { + "epoch": 0.14548361310951238, + "grad_norm": 458.5396858968191, + "learning_rate": 5e-06, + "loss": 2.0173, + "num_input_tokens_seen": 62896912, + "step": 364 + }, + { + "epoch": 0.14548361310951238, + "loss": 1.844632863998413, + "loss_ce": 0.3143594264984131, + "loss_xval": 1.53125, + "num_input_tokens_seen": 62896912, + "step": 364 + }, + { + "epoch": 0.14588329336530775, + "grad_norm": 273.6763198754432, + "learning_rate": 5e-06, + "loss": 1.6539, + "num_input_tokens_seen": 63070104, + "step": 365 + }, + { + "epoch": 0.14588329336530775, + "loss": 1.8728796243667603, + "loss_ce": 0.32600462436676025, + "loss_xval": 1.546875, + "num_input_tokens_seen": 63070104, + "step": 365 + }, + { + "epoch": 0.14628297362110312, + "grad_norm": 336.10897238023637, + "learning_rate": 5e-06, + "loss": 1.7954, + "num_input_tokens_seen": 63243208, + "step": 366 + }, + { + "epoch": 0.14628297362110312, + "loss": 1.7886399030685425, + "loss_ce": 0.37140363454818726, + "loss_xval": 1.4140625, + 
"num_input_tokens_seen": 63243208, + "step": 366 + }, + { + "epoch": 0.1466826538768985, + "grad_norm": 235.21217327142335, + "learning_rate": 5e-06, + "loss": 1.4071, + "num_input_tokens_seen": 63415976, + "step": 367 + }, + { + "epoch": 0.1466826538768985, + "loss": 1.2615071535110474, + "loss_ce": 0.37381187081336975, + "loss_xval": 0.88671875, + "num_input_tokens_seen": 63415976, + "step": 367 + }, + { + "epoch": 0.14708233413269384, + "grad_norm": 227.78162811961886, + "learning_rate": 5e-06, + "loss": 1.4289, + "num_input_tokens_seen": 63589032, + "step": 368 + }, + { + "epoch": 0.14708233413269384, + "loss": 1.3692877292633057, + "loss_ce": 0.37600159645080566, + "loss_xval": 0.9921875, + "num_input_tokens_seen": 63589032, + "step": 368 + }, + { + "epoch": 0.1474820143884892, + "grad_norm": 332.6665441626038, + "learning_rate": 5e-06, + "loss": 1.5625, + "num_input_tokens_seen": 63762416, + "step": 369 + }, + { + "epoch": 0.1474820143884892, + "loss": 1.526054859161377, + "loss_ce": 0.3673633337020874, + "loss_xval": 1.15625, + "num_input_tokens_seen": 63762416, + "step": 369 + }, + { + "epoch": 0.14788169464428458, + "grad_norm": 236.40131995240245, + "learning_rate": 5e-06, + "loss": 1.8429, + "num_input_tokens_seen": 63935712, + "step": 370 + }, + { + "epoch": 0.14788169464428458, + "loss": 1.8038097620010376, + "loss_ce": 0.4258800745010376, + "loss_xval": 1.375, + "num_input_tokens_seen": 63935712, + "step": 370 + }, + { + "epoch": 0.14828137490007995, + "grad_norm": 347.91614795997964, + "learning_rate": 5e-06, + "loss": 1.458, + "num_input_tokens_seen": 64109008, + "step": 371 + }, + { + "epoch": 0.14828137490007995, + "loss": 1.3971425294876099, + "loss_ce": 0.383470743894577, + "loss_xval": 1.015625, + "num_input_tokens_seen": 64109008, + "step": 371 + }, + { + "epoch": 0.1486810551558753, + "grad_norm": 97.92030501492046, + "learning_rate": 5e-06, + "loss": 1.1957, + "num_input_tokens_seen": 64281584, + "step": 372 + }, + { + "epoch": 0.1486810551558753, + "loss": 1.3989770412445068, + "loss_ce": 0.36162352561950684, + "loss_xval": 1.0390625, + "num_input_tokens_seen": 64281584, + "step": 372 + }, + { + "epoch": 0.14908073541167066, + "grad_norm": 432.055973955712, + "learning_rate": 5e-06, + "loss": 1.6878, + "num_input_tokens_seen": 64454672, + "step": 373 + }, + { + "epoch": 0.14908073541167066, + "loss": 1.6717700958251953, + "loss_ce": 0.33412355184555054, + "loss_xval": 1.3359375, + "num_input_tokens_seen": 64454672, + "step": 373 + }, + { + "epoch": 0.14948041566746603, + "grad_norm": 75.76218741454379, + "learning_rate": 5e-06, + "loss": 1.5385, + "num_input_tokens_seen": 64627880, + "step": 374 + }, + { + "epoch": 0.14948041566746603, + "loss": 1.3063770532608032, + "loss_ce": 0.3713184893131256, + "loss_xval": 0.93359375, + "num_input_tokens_seen": 64627880, + "step": 374 + }, + { + "epoch": 0.1498800959232614, + "grad_norm": 434.32708742379594, + "learning_rate": 5e-06, + "loss": 1.9206, + "num_input_tokens_seen": 64800696, + "step": 375 + }, + { + "epoch": 0.1498800959232614, + "loss": 1.9140050411224365, + "loss_ce": 0.3580968379974365, + "loss_xval": 1.5546875, + "num_input_tokens_seen": 64800696, + "step": 375 + }, + { + "epoch": 0.15027977617905675, + "grad_norm": 231.0941828632733, + "learning_rate": 5e-06, + "loss": 1.5119, + "num_input_tokens_seen": 64973736, + "step": 376 + }, + { + "epoch": 0.15027977617905675, + "loss": 1.2229743003845215, + "loss_ce": 0.41560131311416626, + "loss_xval": 0.80859375, + "num_input_tokens_seen": 64973736, + "step": 376 + 
}, + { + "epoch": 0.15067945643485212, + "grad_norm": 308.04336921786705, + "learning_rate": 5e-06, + "loss": 1.717, + "num_input_tokens_seen": 65146824, + "step": 377 + }, + { + "epoch": 0.15067945643485212, + "loss": 1.522589087486267, + "loss_ce": 0.3834289610385895, + "loss_xval": 1.140625, + "num_input_tokens_seen": 65146824, + "step": 377 + }, + { + "epoch": 0.1510791366906475, + "grad_norm": 379.91211838660945, + "learning_rate": 5e-06, + "loss": 1.7196, + "num_input_tokens_seen": 65319824, + "step": 378 + }, + { + "epoch": 0.1510791366906475, + "loss": 1.6595503091812134, + "loss_ce": 0.37146443128585815, + "loss_xval": 1.2890625, + "num_input_tokens_seen": 65319824, + "step": 378 + }, + { + "epoch": 0.15147881694644286, + "grad_norm": 188.19279362317252, + "learning_rate": 5e-06, + "loss": 1.5226, + "num_input_tokens_seen": 65492824, + "step": 379 + }, + { + "epoch": 0.15147881694644286, + "loss": 1.2681467533111572, + "loss_ce": 0.4041330814361572, + "loss_xval": 0.86328125, + "num_input_tokens_seen": 65492824, + "step": 379 + }, + { + "epoch": 0.1518784972022382, + "grad_norm": 332.3973897757976, + "learning_rate": 5e-06, + "loss": 1.5562, + "num_input_tokens_seen": 65665920, + "step": 380 + }, + { + "epoch": 0.1518784972022382, + "loss": 1.71194326877594, + "loss_ce": 0.37405264377593994, + "loss_xval": 1.3359375, + "num_input_tokens_seen": 65665920, + "step": 380 + }, + { + "epoch": 0.15227817745803357, + "grad_norm": 71.6692089078513, + "learning_rate": 5e-06, + "loss": 1.5298, + "num_input_tokens_seen": 65839296, + "step": 381 + }, + { + "epoch": 0.15227817745803357, + "loss": 1.185120701789856, + "loss_ce": 0.37054553627967834, + "loss_xval": 0.81640625, + "num_input_tokens_seen": 65839296, + "step": 381 + }, + { + "epoch": 0.15267785771382894, + "grad_norm": 291.72669644168303, + "learning_rate": 5e-06, + "loss": 1.6519, + "num_input_tokens_seen": 66012896, + "step": 382 + }, + { + "epoch": 0.15267785771382894, + "loss": 1.4946725368499756, + "loss_ce": 0.3623483180999756, + "loss_xval": 1.1328125, + "num_input_tokens_seen": 66012896, + "step": 382 + }, + { + "epoch": 0.1530775379696243, + "grad_norm": 124.58060402399016, + "learning_rate": 5e-06, + "loss": 1.6748, + "num_input_tokens_seen": 66185720, + "step": 383 + }, + { + "epoch": 0.1530775379696243, + "loss": 1.6065537929534912, + "loss_ce": 0.3433701694011688, + "loss_xval": 1.265625, + "num_input_tokens_seen": 66185720, + "step": 383 + }, + { + "epoch": 0.15347721822541965, + "grad_norm": 303.4023928485621, + "learning_rate": 5e-06, + "loss": 1.3126, + "num_input_tokens_seen": 66358696, + "step": 384 + }, + { + "epoch": 0.15347721822541965, + "loss": 1.6531447172164917, + "loss_ce": 0.3508985936641693, + "loss_xval": 1.3046875, + "num_input_tokens_seen": 66358696, + "step": 384 + }, + { + "epoch": 0.15387689848121502, + "grad_norm": 186.7444010598904, + "learning_rate": 5e-06, + "loss": 1.4072, + "num_input_tokens_seen": 66531952, + "step": 385 + }, + { + "epoch": 0.15387689848121502, + "loss": 1.579591989517212, + "loss_ce": 0.35400599241256714, + "loss_xval": 1.2265625, + "num_input_tokens_seen": 66531952, + "step": 385 + }, + { + "epoch": 0.1542765787370104, + "grad_norm": 123.72761695126445, + "learning_rate": 5e-06, + "loss": 1.1076, + "num_input_tokens_seen": 66704848, + "step": 386 + }, + { + "epoch": 0.1542765787370104, + "loss": 1.1779499053955078, + "loss_ce": 0.3305378556251526, + "loss_xval": 0.84765625, + "num_input_tokens_seen": 66704848, + "step": 386 + }, + { + "epoch": 0.15467625899280577, + 
"grad_norm": 300.4351423438375, + "learning_rate": 5e-06, + "loss": 1.5436, + "num_input_tokens_seen": 66877552, + "step": 387 + }, + { + "epoch": 0.15467625899280577, + "loss": 1.4668437242507935, + "loss_ce": 0.30204877257347107, + "loss_xval": 1.1640625, + "num_input_tokens_seen": 66877552, + "step": 387 + }, + { + "epoch": 0.1550759392486011, + "grad_norm": 97.96015654356518, + "learning_rate": 5e-06, + "loss": 1.3898, + "num_input_tokens_seen": 67050520, + "step": 388 + }, + { + "epoch": 0.1550759392486011, + "loss": 1.3291375637054443, + "loss_ce": 0.28250670433044434, + "loss_xval": 1.046875, + "num_input_tokens_seen": 67050520, + "step": 388 + }, + { + "epoch": 0.15547561950439648, + "grad_norm": 256.48054637973087, + "learning_rate": 5e-06, + "loss": 1.0719, + "num_input_tokens_seen": 67223032, + "step": 389 + }, + { + "epoch": 0.15547561950439648, + "loss": 1.2077308893203735, + "loss_ce": 0.2790199816226959, + "loss_xval": 0.9296875, + "num_input_tokens_seen": 67223032, + "step": 389 + }, + { + "epoch": 0.15587529976019185, + "grad_norm": 83.61487152773492, + "learning_rate": 5e-06, + "loss": 1.4863, + "num_input_tokens_seen": 67396040, + "step": 390 + }, + { + "epoch": 0.15587529976019185, + "loss": 1.168054461479187, + "loss_ce": 0.236413836479187, + "loss_xval": 0.9296875, + "num_input_tokens_seen": 67396040, + "step": 390 + }, + { + "epoch": 0.15627498001598722, + "grad_norm": 216.848192893872, + "learning_rate": 5e-06, + "loss": 1.5421, + "num_input_tokens_seen": 67569016, + "step": 391 + }, + { + "epoch": 0.15627498001598722, + "loss": 1.3344125747680664, + "loss_ce": 0.2352915108203888, + "loss_xval": 1.1015625, + "num_input_tokens_seen": 67569016, + "step": 391 + }, + { + "epoch": 0.15667466027178256, + "grad_norm": 100.72676823977093, + "learning_rate": 5e-06, + "loss": 1.2574, + "num_input_tokens_seen": 67742032, + "step": 392 + }, + { + "epoch": 0.15667466027178256, + "loss": 1.4167189598083496, + "loss_ce": 0.22641140222549438, + "loss_xval": 1.1875, + "num_input_tokens_seen": 67742032, + "step": 392 + }, + { + "epoch": 0.15707434052757793, + "grad_norm": 217.73273930965593, + "learning_rate": 5e-06, + "loss": 1.1396, + "num_input_tokens_seen": 67914992, + "step": 393 + }, + { + "epoch": 0.15707434052757793, + "loss": 1.1209442615509033, + "loss_ce": 0.2234833538532257, + "loss_xval": 0.8984375, + "num_input_tokens_seen": 67914992, + "step": 393 + }, + { + "epoch": 0.1574740207833733, + "grad_norm": 273.2904839153481, + "learning_rate": 5e-06, + "loss": 1.3022, + "num_input_tokens_seen": 68087776, + "step": 394 + }, + { + "epoch": 0.1574740207833733, + "loss": 1.3209784030914307, + "loss_ce": 0.22258979082107544, + "loss_xval": 1.1015625, + "num_input_tokens_seen": 68087776, + "step": 394 + }, + { + "epoch": 0.15787370103916867, + "grad_norm": 102.6734241796819, + "learning_rate": 5e-06, + "loss": 1.3072, + "num_input_tokens_seen": 68260448, + "step": 395 + }, + { + "epoch": 0.15787370103916867, + "loss": 1.1156089305877686, + "loss_ce": 0.19959326088428497, + "loss_xval": 0.9140625, + "num_input_tokens_seen": 68260448, + "step": 395 + }, + { + "epoch": 0.15827338129496402, + "grad_norm": 373.9212850213255, + "learning_rate": 5e-06, + "loss": 0.9984, + "num_input_tokens_seen": 68433496, + "step": 396 + }, + { + "epoch": 0.15827338129496402, + "loss": 0.958846926689148, + "loss_ce": 0.17906175553798676, + "loss_xval": 0.78125, + "num_input_tokens_seen": 68433496, + "step": 396 + }, + { + "epoch": 0.1586730615507594, + "grad_norm": 212.68846306498975, + 
"learning_rate": 5e-06, + "loss": 1.2799, + "num_input_tokens_seen": 68606624, + "step": 397 + }, + { + "epoch": 0.1586730615507594, + "loss": 1.6309540271759033, + "loss_ce": 0.18429884314537048, + "loss_xval": 1.4453125, + "num_input_tokens_seen": 68606624, + "step": 397 + }, + { + "epoch": 0.15907274180655476, + "grad_norm": 201.40167739549648, + "learning_rate": 5e-06, + "loss": 1.0227, + "num_input_tokens_seen": 68779744, + "step": 398 + }, + { + "epoch": 0.15907274180655476, + "loss": 1.1671316623687744, + "loss_ce": 0.1776297688484192, + "loss_xval": 0.98828125, + "num_input_tokens_seen": 68779744, + "step": 398 + }, + { + "epoch": 0.15947242206235013, + "grad_norm": 329.1756886542891, + "learning_rate": 5e-06, + "loss": 1.1226, + "num_input_tokens_seen": 68949416, + "step": 399 + }, + { + "epoch": 0.15947242206235013, + "loss": 0.8560934662818909, + "loss_ce": 0.17335423827171326, + "loss_xval": 0.68359375, + "num_input_tokens_seen": 68949416, + "step": 399 + }, + { + "epoch": 0.15987210231814547, + "grad_norm": 243.90129646768855, + "learning_rate": 5e-06, + "loss": 1.6687, + "num_input_tokens_seen": 69122336, + "step": 400 + }, + { + "epoch": 0.15987210231814547, + "loss": 1.3468685150146484, + "loss_ce": 0.16937831044197083, + "loss_xval": 1.1796875, + "num_input_tokens_seen": 69122336, + "step": 400 + }, + { + "epoch": 0.16027178257394084, + "grad_norm": 182.98535991940025, + "learning_rate": 5e-06, + "loss": 1.9013, + "num_input_tokens_seen": 69295288, + "step": 401 + }, + { + "epoch": 0.16027178257394084, + "loss": 2.1146087646484375, + "loss_ce": 0.1907806098461151, + "loss_xval": 1.921875, + "num_input_tokens_seen": 69295288, + "step": 401 + }, + { + "epoch": 0.1606714628297362, + "grad_norm": 155.15210915585405, + "learning_rate": 5e-06, + "loss": 1.5412, + "num_input_tokens_seen": 69468232, + "step": 402 + }, + { + "epoch": 0.1606714628297362, + "loss": 1.1275564432144165, + "loss_ce": 0.18346473574638367, + "loss_xval": 0.9453125, + "num_input_tokens_seen": 69468232, + "step": 402 + }, + { + "epoch": 0.16107114308553158, + "grad_norm": 89.57021927047049, + "learning_rate": 5e-06, + "loss": 1.2666, + "num_input_tokens_seen": 69641208, + "step": 403 + }, + { + "epoch": 0.16107114308553158, + "loss": 1.3142218589782715, + "loss_ce": 0.18971017003059387, + "loss_xval": 1.125, + "num_input_tokens_seen": 69641208, + "step": 403 + }, + { + "epoch": 0.16147082334132695, + "grad_norm": 84.82452132970737, + "learning_rate": 5e-06, + "loss": 1.0387, + "num_input_tokens_seen": 69814296, + "step": 404 + }, + { + "epoch": 0.16147082334132695, + "loss": 0.96608567237854, + "loss_ce": 0.14345382153987885, + "loss_xval": 0.82421875, + "num_input_tokens_seen": 69814296, + "step": 404 + }, + { + "epoch": 0.1618705035971223, + "grad_norm": 156.2159444055507, + "learning_rate": 5e-06, + "loss": 1.807, + "num_input_tokens_seen": 69987120, + "step": 405 + }, + { + "epoch": 0.1618705035971223, + "loss": 1.702022910118103, + "loss_ce": 0.16247209906578064, + "loss_xval": 1.5390625, + "num_input_tokens_seen": 69987120, + "step": 405 + }, + { + "epoch": 0.16227018385291767, + "grad_norm": 58.20249101157415, + "learning_rate": 5e-06, + "loss": 1.1604, + "num_input_tokens_seen": 70160184, + "step": 406 + }, + { + "epoch": 0.16227018385291767, + "loss": 1.17859947681427, + "loss_ce": 0.1471053659915924, + "loss_xval": 1.03125, + "num_input_tokens_seen": 70160184, + "step": 406 + }, + { + "epoch": 0.16266986410871304, + "grad_norm": 266.83889920116616, + "learning_rate": 5e-06, + "loss": 1.0537, + 
"num_input_tokens_seen": 70333352, + "step": 407 + }, + { + "epoch": 0.16266986410871304, + "loss": 1.2187542915344238, + "loss_ce": 0.13623477518558502, + "loss_xval": 1.0859375, + "num_input_tokens_seen": 70333352, + "step": 407 + }, + { + "epoch": 0.1630695443645084, + "grad_norm": 422.83042391271886, + "learning_rate": 5e-06, + "loss": 1.3912, + "num_input_tokens_seen": 70506448, + "step": 408 + }, + { + "epoch": 0.1630695443645084, + "loss": 1.4533321857452393, + "loss_ce": 0.1320432424545288, + "loss_xval": 1.3203125, + "num_input_tokens_seen": 70506448, + "step": 408 + }, + { + "epoch": 0.16346922462030375, + "grad_norm": 390.3090358433531, + "learning_rate": 5e-06, + "loss": 1.2469, + "num_input_tokens_seen": 70679568, + "step": 409 + }, + { + "epoch": 0.16346922462030375, + "loss": 1.5814406871795654, + "loss_ce": 0.13808134198188782, + "loss_xval": 1.4453125, + "num_input_tokens_seen": 70679568, + "step": 409 + }, + { + "epoch": 0.16386890487609912, + "grad_norm": 154.26033869033284, + "learning_rate": 5e-06, + "loss": 1.1171, + "num_input_tokens_seen": 70852160, + "step": 410 + }, + { + "epoch": 0.16386890487609912, + "loss": 1.4136357307434082, + "loss_ce": 0.14141888916492462, + "loss_xval": 1.2734375, + "num_input_tokens_seen": 70852160, + "step": 410 + }, + { + "epoch": 0.1642685851318945, + "grad_norm": 174.6829188801557, + "learning_rate": 5e-06, + "loss": 1.2339, + "num_input_tokens_seen": 71024808, + "step": 411 + }, + { + "epoch": 0.1642685851318945, + "loss": 0.867920994758606, + "loss_ce": 0.12304795533418655, + "loss_xval": 0.74609375, + "num_input_tokens_seen": 71024808, + "step": 411 + }, + { + "epoch": 0.16466826538768986, + "grad_norm": 319.3999891226021, + "learning_rate": 5e-06, + "loss": 1.2699, + "num_input_tokens_seen": 71197624, + "step": 412 + }, + { + "epoch": 0.16466826538768986, + "loss": 1.155133843421936, + "loss_ce": 0.12876664102077484, + "loss_xval": 1.0234375, + "num_input_tokens_seen": 71197624, + "step": 412 + }, + { + "epoch": 0.1650679456434852, + "grad_norm": 320.57417407860714, + "learning_rate": 5e-06, + "loss": 1.322, + "num_input_tokens_seen": 71370280, + "step": 413 + }, + { + "epoch": 0.1650679456434852, + "loss": 1.4060778617858887, + "loss_ce": 0.13434943556785583, + "loss_xval": 1.2734375, + "num_input_tokens_seen": 71370280, + "step": 413 + }, + { + "epoch": 0.16546762589928057, + "grad_norm": 178.26232739789174, + "learning_rate": 5e-06, + "loss": 1.2397, + "num_input_tokens_seen": 71543160, + "step": 414 + }, + { + "epoch": 0.16546762589928057, + "loss": 0.9019365310668945, + "loss_ce": 0.12923146784305573, + "loss_xval": 0.7734375, + "num_input_tokens_seen": 71543160, + "step": 414 + }, + { + "epoch": 0.16586730615507594, + "grad_norm": 114.95281581821858, + "learning_rate": 5e-06, + "loss": 1.1288, + "num_input_tokens_seen": 71716120, + "step": 415 + }, + { + "epoch": 0.16586730615507594, + "loss": 1.384334921836853, + "loss_ce": 0.12664452195167542, + "loss_xval": 1.2578125, + "num_input_tokens_seen": 71716120, + "step": 415 + }, + { + "epoch": 0.16626698641087131, + "grad_norm": 242.53801499071906, + "learning_rate": 5e-06, + "loss": 1.287, + "num_input_tokens_seen": 71888592, + "step": 416 + }, + { + "epoch": 0.16626698641087131, + "loss": 1.0861682891845703, + "loss_ce": 0.14195440709590912, + "loss_xval": 0.9453125, + "num_input_tokens_seen": 71888592, + "step": 416 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 305.69045590617145, + "learning_rate": 5e-06, + "loss": 1.0574, + "num_input_tokens_seen": 72061352, + 
"step": 417 + }, + { + "epoch": 0.16666666666666666, + "loss": 0.9189929962158203, + "loss_ce": 0.1511707752943039, + "loss_xval": 0.76953125, + "num_input_tokens_seen": 72061352, + "step": 417 + }, + { + "epoch": 0.16706634692246203, + "grad_norm": 232.45571623460697, + "learning_rate": 5e-06, + "loss": 1.4015, + "num_input_tokens_seen": 72234232, + "step": 418 + }, + { + "epoch": 0.16706634692246203, + "loss": 0.8937399387359619, + "loss_ce": 0.15826627612113953, + "loss_xval": 0.734375, + "num_input_tokens_seen": 72234232, + "step": 418 + }, + { + "epoch": 0.1674660271782574, + "grad_norm": 104.88982706534415, + "learning_rate": 5e-06, + "loss": 1.3726, + "num_input_tokens_seen": 72406776, + "step": 419 + }, + { + "epoch": 0.1674660271782574, + "loss": 1.644016981124878, + "loss_ce": 0.1425521820783615, + "loss_xval": 1.5, + "num_input_tokens_seen": 72406776, + "step": 419 + }, + { + "epoch": 0.16786570743405277, + "grad_norm": 132.05241726760326, + "learning_rate": 5e-06, + "loss": 1.5023, + "num_input_tokens_seen": 72579768, + "step": 420 + }, + { + "epoch": 0.16786570743405277, + "loss": 1.6695343255996704, + "loss_ce": 0.14804987609386444, + "loss_xval": 1.5234375, + "num_input_tokens_seen": 72579768, + "step": 420 + }, + { + "epoch": 0.1682653876898481, + "grad_norm": 360.7322762424225, + "learning_rate": 5e-06, + "loss": 1.3444, + "num_input_tokens_seen": 72752160, + "step": 421 + }, + { + "epoch": 0.1682653876898481, + "loss": 1.3023874759674072, + "loss_ce": 0.138325035572052, + "loss_xval": 1.1640625, + "num_input_tokens_seen": 72752160, + "step": 421 + }, + { + "epoch": 0.16866506794564348, + "grad_norm": 534.9608711756771, + "learning_rate": 5e-06, + "loss": 1.3106, + "num_input_tokens_seen": 72925176, + "step": 422 + }, + { + "epoch": 0.16866506794564348, + "loss": 1.265779733657837, + "loss_ce": 0.10196132957935333, + "loss_xval": 1.1640625, + "num_input_tokens_seen": 72925176, + "step": 422 + }, + { + "epoch": 0.16906474820143885, + "grad_norm": 462.6432650379374, + "learning_rate": 5e-06, + "loss": 1.3916, + "num_input_tokens_seen": 73098032, + "step": 423 + }, + { + "epoch": 0.16906474820143885, + "loss": 1.118710994720459, + "loss_ce": 0.11578124761581421, + "loss_xval": 1.0, + "num_input_tokens_seen": 73098032, + "step": 423 + }, + { + "epoch": 0.16946442845723422, + "grad_norm": 101.64068942565726, + "learning_rate": 5e-06, + "loss": 1.0825, + "num_input_tokens_seen": 73270832, + "step": 424 + }, + { + "epoch": 0.16946442845723422, + "loss": 0.9654685854911804, + "loss_ce": 0.13453596830368042, + "loss_xval": 0.83203125, + "num_input_tokens_seen": 73270832, + "step": 424 + }, + { + "epoch": 0.16986410871302957, + "grad_norm": 201.11879256749876, + "learning_rate": 5e-06, + "loss": 1.1616, + "num_input_tokens_seen": 73443768, + "step": 425 + }, + { + "epoch": 0.16986410871302957, + "loss": 1.3148138523101807, + "loss_ce": 0.17748481035232544, + "loss_xval": 1.140625, + "num_input_tokens_seen": 73443768, + "step": 425 + }, + { + "epoch": 0.17026378896882494, + "grad_norm": 69.29811911831622, + "learning_rate": 5e-06, + "loss": 1.0085, + "num_input_tokens_seen": 73616680, + "step": 426 + }, + { + "epoch": 0.17026378896882494, + "loss": 1.0997118949890137, + "loss_ce": 0.2227586954832077, + "loss_xval": 0.875, + "num_input_tokens_seen": 73616680, + "step": 426 + }, + { + "epoch": 0.1706634692246203, + "grad_norm": 160.2507375587037, + "learning_rate": 5e-06, + "loss": 1.5514, + "num_input_tokens_seen": 73789392, + "step": 427 + }, + { + "epoch": 0.1706634692246203, + 
"loss": 2.2413222789764404, + "loss_ce": 0.21300186216831207, + "loss_xval": 2.03125, + "num_input_tokens_seen": 73789392, + "step": 427 + }, + { + "epoch": 0.17106314948041568, + "grad_norm": 58.93629582053553, + "learning_rate": 5e-06, + "loss": 1.3997, + "num_input_tokens_seen": 73962416, + "step": 428 + }, + { + "epoch": 0.17106314948041568, + "loss": 1.1142562627792358, + "loss_ce": 0.16332849860191345, + "loss_xval": 0.94921875, + "num_input_tokens_seen": 73962416, + "step": 428 + }, + { + "epoch": 0.17146282973621102, + "grad_norm": 136.2720034068254, + "learning_rate": 5e-06, + "loss": 1.2624, + "num_input_tokens_seen": 74135600, + "step": 429 + }, + { + "epoch": 0.17146282973621102, + "loss": 1.42500901222229, + "loss_ce": 0.1691497564315796, + "loss_xval": 1.2578125, + "num_input_tokens_seen": 74135600, + "step": 429 + }, + { + "epoch": 0.1718625099920064, + "grad_norm": 148.31464135041392, + "learning_rate": 5e-06, + "loss": 1.6033, + "num_input_tokens_seen": 74308904, + "step": 430 + }, + { + "epoch": 0.1718625099920064, + "loss": 1.8530278205871582, + "loss_ce": 0.15136760473251343, + "loss_xval": 1.703125, + "num_input_tokens_seen": 74308904, + "step": 430 + }, + { + "epoch": 0.17226219024780176, + "grad_norm": 90.69042515021651, + "learning_rate": 5e-06, + "loss": 1.0973, + "num_input_tokens_seen": 74482056, + "step": 431 + }, + { + "epoch": 0.17226219024780176, + "loss": 1.100707769393921, + "loss_ce": 0.16174298524856567, + "loss_xval": 0.9375, + "num_input_tokens_seen": 74482056, + "step": 431 + }, + { + "epoch": 0.17266187050359713, + "grad_norm": 76.132967101736, + "learning_rate": 5e-06, + "loss": 1.0927, + "num_input_tokens_seen": 74655264, + "step": 432 + }, + { + "epoch": 0.17266187050359713, + "loss": 0.9883812069892883, + "loss_ce": 0.13974839448928833, + "loss_xval": 0.84765625, + "num_input_tokens_seen": 74655264, + "step": 432 + }, + { + "epoch": 0.17306155075939247, + "grad_norm": 64.87876348409729, + "learning_rate": 5e-06, + "loss": 1.1213, + "num_input_tokens_seen": 74828240, + "step": 433 + }, + { + "epoch": 0.17306155075939247, + "loss": 1.2422434091567993, + "loss_ce": 0.1206614226102829, + "loss_xval": 1.125, + "num_input_tokens_seen": 74828240, + "step": 433 + }, + { + "epoch": 0.17346123101518784, + "grad_norm": 59.25309660695592, + "learning_rate": 5e-06, + "loss": 1.0091, + "num_input_tokens_seen": 75001712, + "step": 434 + }, + { + "epoch": 0.17346123101518784, + "loss": 0.9062660336494446, + "loss_ce": 0.11586074531078339, + "loss_xval": 0.7890625, + "num_input_tokens_seen": 75001712, + "step": 434 + }, + { + "epoch": 0.17386091127098321, + "grad_norm": 80.00118800339006, + "learning_rate": 5e-06, + "loss": 1.2468, + "num_input_tokens_seen": 75174536, + "step": 435 + }, + { + "epoch": 0.17386091127098321, + "loss": 1.3547441959381104, + "loss_ce": 0.09778615832328796, + "loss_xval": 1.2578125, + "num_input_tokens_seen": 75174536, + "step": 435 + }, + { + "epoch": 0.17426059152677859, + "grad_norm": 95.79254596327765, + "learning_rate": 5e-06, + "loss": 0.7757, + "num_input_tokens_seen": 75347544, + "step": 436 + }, + { + "epoch": 0.17426059152677859, + "loss": 0.7775790095329285, + "loss_ce": 0.08666104078292847, + "loss_xval": 0.69140625, + "num_input_tokens_seen": 75347544, + "step": 436 + }, + { + "epoch": 0.17466027178257393, + "grad_norm": 232.35516007250703, + "learning_rate": 5e-06, + "loss": 1.3154, + "num_input_tokens_seen": 75520632, + "step": 437 + }, + { + "epoch": 0.17466027178257393, + "loss": 1.4089746475219727, + "loss_ce": 
0.07499027997255325, + "loss_xval": 1.3359375, + "num_input_tokens_seen": 75520632, + "step": 437 + }, + { + "epoch": 0.1750599520383693, + "grad_norm": 519.4242646802579, + "learning_rate": 5e-06, + "loss": 1.8545, + "num_input_tokens_seen": 75693864, + "step": 438 + }, + { + "epoch": 0.1750599520383693, + "loss": 1.427260160446167, + "loss_ce": 0.06446726620197296, + "loss_xval": 1.359375, + "num_input_tokens_seen": 75693864, + "step": 438 + }, + { + "epoch": 0.17545963229416467, + "grad_norm": 988.9550741607192, + "learning_rate": 5e-06, + "loss": 2.3758, + "num_input_tokens_seen": 75866888, + "step": 439 + }, + { + "epoch": 0.17545963229416467, + "loss": 2.490389347076416, + "loss_ce": 0.08120955526828766, + "loss_xval": 2.40625, + "num_input_tokens_seen": 75866888, + "step": 439 + }, + { + "epoch": 0.17585931254996004, + "grad_norm": 1106.2099257737912, + "learning_rate": 5e-06, + "loss": 2.9193, + "num_input_tokens_seen": 76039728, + "step": 440 + }, + { + "epoch": 0.17585931254996004, + "loss": 2.894068717956543, + "loss_ce": 0.11086547374725342, + "loss_xval": 2.78125, + "num_input_tokens_seen": 76039728, + "step": 440 + }, + { + "epoch": 0.17625899280575538, + "grad_norm": 105.21406614004921, + "learning_rate": 5e-06, + "loss": 1.5263, + "num_input_tokens_seen": 76212632, + "step": 441 + }, + { + "epoch": 0.17625899280575538, + "loss": 1.382925271987915, + "loss_ce": 0.13243699073791504, + "loss_xval": 1.25, + "num_input_tokens_seen": 76212632, + "step": 441 + }, + { + "epoch": 0.17665867306155075, + "grad_norm": 538.4292674538996, + "learning_rate": 5e-06, + "loss": 1.7185, + "num_input_tokens_seen": 76385848, + "step": 442 + }, + { + "epoch": 0.17665867306155075, + "loss": 1.966761589050293, + "loss_ce": 0.19283580780029297, + "loss_xval": 1.7734375, + "num_input_tokens_seen": 76385848, + "step": 442 + }, + { + "epoch": 0.17705835331734612, + "grad_norm": 254.3195195821683, + "learning_rate": 5e-06, + "loss": 1.2737, + "num_input_tokens_seen": 76558752, + "step": 443 + }, + { + "epoch": 0.17705835331734612, + "loss": 1.3162972927093506, + "loss_ce": 0.23280119895935059, + "loss_xval": 1.0859375, + "num_input_tokens_seen": 76558752, + "step": 443 + }, + { + "epoch": 0.1774580335731415, + "grad_norm": 522.1919334034071, + "learning_rate": 5e-06, + "loss": 1.7626, + "num_input_tokens_seen": 76731944, + "step": 444 + }, + { + "epoch": 0.1774580335731415, + "loss": 1.9908093214035034, + "loss_ce": 0.27547723054885864, + "loss_xval": 1.71875, + "num_input_tokens_seen": 76731944, + "step": 444 + }, + { + "epoch": 0.17785771382893686, + "grad_norm": 216.82852013498507, + "learning_rate": 5e-06, + "loss": 1.7808, + "num_input_tokens_seen": 76904696, + "step": 445 + }, + { + "epoch": 0.17785771382893686, + "loss": 1.9551982879638672, + "loss_ce": 0.29369932413101196, + "loss_xval": 1.6640625, + "num_input_tokens_seen": 76904696, + "step": 445 + }, + { + "epoch": 0.1782573940847322, + "grad_norm": 281.8149994961891, + "learning_rate": 5e-06, + "loss": 1.3799, + "num_input_tokens_seen": 77077232, + "step": 446 + }, + { + "epoch": 0.1782573940847322, + "loss": 1.4842294454574585, + "loss_ce": 0.2862313687801361, + "loss_xval": 1.1953125, + "num_input_tokens_seen": 77077232, + "step": 446 + }, + { + "epoch": 0.17865707434052758, + "grad_norm": 315.72186191273005, + "learning_rate": 5e-06, + "loss": 1.7867, + "num_input_tokens_seen": 77249936, + "step": 447 + }, + { + "epoch": 0.17865707434052758, + "loss": 1.7437907457351685, + "loss_ce": 0.30873218178749084, + "loss_xval": 1.4375, + 
"num_input_tokens_seen": 77249936, + "step": 447 + }, + { + "epoch": 0.17905675459632295, + "grad_norm": 59.192440355888245, + "learning_rate": 5e-06, + "loss": 1.3851, + "num_input_tokens_seen": 77423072, + "step": 448 + }, + { + "epoch": 0.17905675459632295, + "loss": 1.6609851121902466, + "loss_ce": 0.2824450731277466, + "loss_xval": 1.375, + "num_input_tokens_seen": 77423072, + "step": 448 + }, + { + "epoch": 0.17945643485211832, + "grad_norm": 299.73718753712205, + "learning_rate": 5e-06, + "loss": 1.5403, + "num_input_tokens_seen": 77595776, + "step": 449 + }, + { + "epoch": 0.17945643485211832, + "loss": 1.7367419004440308, + "loss_ce": 0.26506221294403076, + "loss_xval": 1.46875, + "num_input_tokens_seen": 77595776, + "step": 449 + }, + { + "epoch": 0.17985611510791366, + "grad_norm": 276.48273158856335, + "learning_rate": 5e-06, + "loss": 1.5309, + "num_input_tokens_seen": 77769064, + "step": 450 + }, + { + "epoch": 0.17985611510791366, + "loss": 1.32578706741333, + "loss_ce": 0.2933163046836853, + "loss_xval": 1.03125, + "num_input_tokens_seen": 77769064, + "step": 450 + }, + { + "epoch": 0.18025579536370903, + "grad_norm": 160.94325945974563, + "learning_rate": 5e-06, + "loss": 1.4696, + "num_input_tokens_seen": 77941696, + "step": 451 + }, + { + "epoch": 0.18025579536370903, + "loss": 1.3891505002975464, + "loss_ce": 0.3073633909225464, + "loss_xval": 1.078125, + "num_input_tokens_seen": 77941696, + "step": 451 + }, + { + "epoch": 0.1806554756195044, + "grad_norm": 312.55926221947135, + "learning_rate": 5e-06, + "loss": 1.3929, + "num_input_tokens_seen": 78114856, + "step": 452 + }, + { + "epoch": 0.1806554756195044, + "loss": 1.0724852085113525, + "loss_ce": 0.2741453945636749, + "loss_xval": 0.796875, + "num_input_tokens_seen": 78114856, + "step": 452 + }, + { + "epoch": 0.18105515587529977, + "grad_norm": 99.87397360747839, + "learning_rate": 5e-06, + "loss": 1.5163, + "num_input_tokens_seen": 78287864, + "step": 453 + }, + { + "epoch": 0.18105515587529977, + "loss": 1.1197184324264526, + "loss_ce": 0.28866374492645264, + "loss_xval": 0.83203125, + "num_input_tokens_seen": 78287864, + "step": 453 + }, + { + "epoch": 0.18145483613109512, + "grad_norm": 274.6263447275612, + "learning_rate": 5e-06, + "loss": 1.2871, + "num_input_tokens_seen": 78460704, + "step": 454 + }, + { + "epoch": 0.18145483613109512, + "loss": 1.456176519393921, + "loss_ce": 0.2547605335712433, + "loss_xval": 1.203125, + "num_input_tokens_seen": 78460704, + "step": 454 + }, + { + "epoch": 0.18185451638689049, + "grad_norm": 199.6778687760442, + "learning_rate": 5e-06, + "loss": 1.2305, + "num_input_tokens_seen": 78633736, + "step": 455 + }, + { + "epoch": 0.18185451638689049, + "loss": 1.268936276435852, + "loss_ce": 0.2605133652687073, + "loss_xval": 1.0078125, + "num_input_tokens_seen": 78633736, + "step": 455 + }, + { + "epoch": 0.18225419664268586, + "grad_norm": 84.40253001428162, + "learning_rate": 5e-06, + "loss": 1.0979, + "num_input_tokens_seen": 78806336, + "step": 456 + }, + { + "epoch": 0.18225419664268586, + "loss": 0.8699742555618286, + "loss_ce": 0.237527996301651, + "loss_xval": 0.6328125, + "num_input_tokens_seen": 78806336, + "step": 456 + }, + { + "epoch": 0.18265387689848123, + "grad_norm": 285.993440607779, + "learning_rate": 5e-06, + "loss": 1.8171, + "num_input_tokens_seen": 78979056, + "step": 457 + }, + { + "epoch": 0.18265387689848123, + "loss": 1.7808828353881836, + "loss_ce": 0.22766010463237762, + "loss_xval": 1.5546875, + "num_input_tokens_seen": 78979056, + "step": 457 + }, 
+ { + "epoch": 0.18305355715427657, + "grad_norm": 90.58562385013235, + "learning_rate": 5e-06, + "loss": 0.5619, + "num_input_tokens_seen": 79148520, + "step": 458 + }, + { + "epoch": 0.18305355715427657, + "loss": 0.5432583093643188, + "loss_ce": 0.22099265456199646, + "loss_xval": 0.322265625, + "num_input_tokens_seen": 79148520, + "step": 458 + }, + { + "epoch": 0.18345323741007194, + "grad_norm": 215.2025403833186, + "learning_rate": 5e-06, + "loss": 1.2952, + "num_input_tokens_seen": 79321256, + "step": 459 + }, + { + "epoch": 0.18345323741007194, + "loss": 1.2274497747421265, + "loss_ce": 0.23355332016944885, + "loss_xval": 0.9921875, + "num_input_tokens_seen": 79321256, + "step": 459 + }, + { + "epoch": 0.1838529176658673, + "grad_norm": 143.1375353333749, + "learning_rate": 5e-06, + "loss": 1.2422, + "num_input_tokens_seen": 79494168, + "step": 460 + }, + { + "epoch": 0.1838529176658673, + "loss": 1.2271267175674438, + "loss_ce": 0.20149190723896027, + "loss_xval": 1.0234375, + "num_input_tokens_seen": 79494168, + "step": 460 + }, + { + "epoch": 0.18425259792166268, + "grad_norm": 108.0060854037686, + "learning_rate": 5e-06, + "loss": 1.5776, + "num_input_tokens_seen": 79663920, + "step": 461 + }, + { + "epoch": 0.18425259792166268, + "loss": 1.7579941749572754, + "loss_ce": 0.1915878802537918, + "loss_xval": 1.5625, + "num_input_tokens_seen": 79663920, + "step": 461 + }, + { + "epoch": 0.18465227817745802, + "grad_norm": 260.0743652424003, + "learning_rate": 5e-06, + "loss": 1.6427, + "num_input_tokens_seen": 79836992, + "step": 462 + }, + { + "epoch": 0.18465227817745802, + "loss": 1.4615955352783203, + "loss_ce": 0.1910877823829651, + "loss_xval": 1.2734375, + "num_input_tokens_seen": 79836992, + "step": 462 + }, + { + "epoch": 0.1850519584332534, + "grad_norm": 112.36519545386706, + "learning_rate": 5e-06, + "loss": 1.4147, + "num_input_tokens_seen": 80009824, + "step": 463 + }, + { + "epoch": 0.1850519584332534, + "loss": 1.2974412441253662, + "loss_ce": 0.1806199550628662, + "loss_xval": 1.1171875, + "num_input_tokens_seen": 80009824, + "step": 463 + }, + { + "epoch": 0.18545163868904876, + "grad_norm": 298.49579765479467, + "learning_rate": 5e-06, + "loss": 1.1115, + "num_input_tokens_seen": 80182856, + "step": 464 + }, + { + "epoch": 0.18545163868904876, + "loss": 1.4449951648712158, + "loss_ce": 0.1569093018770218, + "loss_xval": 1.2890625, + "num_input_tokens_seen": 80182856, + "step": 464 + }, + { + "epoch": 0.18585131894484413, + "grad_norm": 95.35891374971652, + "learning_rate": 5e-06, + "loss": 1.072, + "num_input_tokens_seen": 80355360, + "step": 465 + }, + { + "epoch": 0.18585131894484413, + "loss": 1.2684335708618164, + "loss_ce": 0.13220316171646118, + "loss_xval": 1.1328125, + "num_input_tokens_seen": 80355360, + "step": 465 + }, + { + "epoch": 0.18625099920063948, + "grad_norm": 172.78392931286916, + "learning_rate": 5e-06, + "loss": 1.2831, + "num_input_tokens_seen": 80528248, + "step": 466 + }, + { + "epoch": 0.18625099920063948, + "loss": 1.3320605754852295, + "loss_ce": 0.1433398425579071, + "loss_xval": 1.1875, + "num_input_tokens_seen": 80528248, + "step": 466 + }, + { + "epoch": 0.18665067945643485, + "grad_norm": 51.21602711363438, + "learning_rate": 5e-06, + "loss": 1.2468, + "num_input_tokens_seen": 80700880, + "step": 467 + }, + { + "epoch": 0.18665067945643485, + "loss": 1.1996004581451416, + "loss_ce": 0.13368244469165802, + "loss_xval": 1.0625, + "num_input_tokens_seen": 80700880, + "step": 467 + }, + { + "epoch": 0.18705035971223022, + 
"grad_norm": 162.93505295456467, + "learning_rate": 5e-06, + "loss": 0.9464, + "num_input_tokens_seen": 80873696, + "step": 468 + }, + { + "epoch": 0.18705035971223022, + "loss": 0.9175683259963989, + "loss_ce": 0.10885241627693176, + "loss_xval": 0.80859375, + "num_input_tokens_seen": 80873696, + "step": 468 + }, + { + "epoch": 0.1874500399680256, + "grad_norm": 87.58305241565736, + "learning_rate": 5e-06, + "loss": 1.1474, + "num_input_tokens_seen": 81046288, + "step": 469 + }, + { + "epoch": 0.1874500399680256, + "loss": 1.0319910049438477, + "loss_ce": 0.09424698352813721, + "loss_xval": 0.9375, + "num_input_tokens_seen": 81046288, + "step": 469 + }, + { + "epoch": 0.18784972022382093, + "grad_norm": 158.44691760619935, + "learning_rate": 5e-06, + "loss": 1.5434, + "num_input_tokens_seen": 81219400, + "step": 470 + }, + { + "epoch": 0.18784972022382093, + "loss": 1.065626621246338, + "loss_ce": 0.09528970718383789, + "loss_xval": 0.96875, + "num_input_tokens_seen": 81219400, + "step": 470 + }, + { + "epoch": 0.1882494004796163, + "grad_norm": 87.4090982666857, + "learning_rate": 5e-06, + "loss": 0.9658, + "num_input_tokens_seen": 81392432, + "step": 471 + }, + { + "epoch": 0.1882494004796163, + "loss": 0.8852615356445312, + "loss_ce": 0.09217070043087006, + "loss_xval": 0.79296875, + "num_input_tokens_seen": 81392432, + "step": 471 + }, + { + "epoch": 0.18864908073541167, + "grad_norm": 122.54682886858762, + "learning_rate": 5e-06, + "loss": 1.2232, + "num_input_tokens_seen": 81565384, + "step": 472 + }, + { + "epoch": 0.18864908073541167, + "loss": 1.1961512565612793, + "loss_ce": 0.08006230741739273, + "loss_xval": 1.1171875, + "num_input_tokens_seen": 81565384, + "step": 472 + }, + { + "epoch": 0.18904876099120704, + "grad_norm": 84.30592183743661, + "learning_rate": 5e-06, + "loss": 0.8992, + "num_input_tokens_seen": 81738112, + "step": 473 + }, + { + "epoch": 0.18904876099120704, + "loss": 1.047995924949646, + "loss_ce": 0.06899204850196838, + "loss_xval": 0.98046875, + "num_input_tokens_seen": 81738112, + "step": 473 + }, + { + "epoch": 0.18944844124700239, + "grad_norm": 53.15779481734767, + "learning_rate": 5e-06, + "loss": 0.8962, + "num_input_tokens_seen": 81911520, + "step": 474 + }, + { + "epoch": 0.18944844124700239, + "loss": 0.8450521230697632, + "loss_ce": 0.06404630839824677, + "loss_xval": 0.78125, + "num_input_tokens_seen": 81911520, + "step": 474 + }, + { + "epoch": 0.18984812150279776, + "grad_norm": 53.57707087954573, + "learning_rate": 5e-06, + "loss": 0.9375, + "num_input_tokens_seen": 82084208, + "step": 475 + }, + { + "epoch": 0.18984812150279776, + "loss": 0.9071247577667236, + "loss_ce": 0.06813547015190125, + "loss_xval": 0.83984375, + "num_input_tokens_seen": 82084208, + "step": 475 + }, + { + "epoch": 0.19024780175859313, + "grad_norm": 99.14610783345852, + "learning_rate": 5e-06, + "loss": 1.3425, + "num_input_tokens_seen": 82257544, + "step": 476 + }, + { + "epoch": 0.19024780175859313, + "loss": 1.4562163352966309, + "loss_ce": 0.05588666349649429, + "loss_xval": 1.3984375, + "num_input_tokens_seen": 82257544, + "step": 476 + }, + { + "epoch": 0.1906474820143885, + "grad_norm": 51.46105686314192, + "learning_rate": 5e-06, + "loss": 0.8211, + "num_input_tokens_seen": 82430560, + "step": 477 + }, + { + "epoch": 0.1906474820143885, + "loss": 0.915199339389801, + "loss_ce": 0.049842871725559235, + "loss_xval": 0.8671875, + "num_input_tokens_seen": 82430560, + "step": 477 + }, + { + "epoch": 0.19104716227018384, + "grad_norm": 91.19133630030953, + 
"learning_rate": 5e-06, + "loss": 0.6196, + "num_input_tokens_seen": 82603408, + "step": 478 + }, + { + "epoch": 0.19104716227018384, + "loss": 0.725679337978363, + "loss_ce": 0.04806704819202423, + "loss_xval": 0.67578125, + "num_input_tokens_seen": 82603408, + "step": 478 + }, + { + "epoch": 0.1914468425259792, + "grad_norm": 112.31624471773934, + "learning_rate": 5e-06, + "loss": 1.3017, + "num_input_tokens_seen": 82776032, + "step": 479 + }, + { + "epoch": 0.1914468425259792, + "loss": 1.3830339908599854, + "loss_ce": 0.04367845505475998, + "loss_xval": 1.3359375, + "num_input_tokens_seen": 82776032, + "step": 479 + }, + { + "epoch": 0.19184652278177458, + "grad_norm": 53.33397350743472, + "learning_rate": 5e-06, + "loss": 1.2791, + "num_input_tokens_seen": 82948864, + "step": 480 + }, + { + "epoch": 0.19184652278177458, + "loss": 1.2594175338745117, + "loss_ce": 0.044329725205898285, + "loss_xval": 1.21875, + "num_input_tokens_seen": 82948864, + "step": 480 + }, + { + "epoch": 0.19224620303756995, + "grad_norm": 58.189356919828406, + "learning_rate": 5e-06, + "loss": 0.7775, + "num_input_tokens_seen": 83122064, + "step": 481 + }, + { + "epoch": 0.19224620303756995, + "loss": 0.7105453014373779, + "loss_ce": 0.0419052317738533, + "loss_xval": 0.66796875, + "num_input_tokens_seen": 83122064, + "step": 481 + }, + { + "epoch": 0.1926458832933653, + "grad_norm": 157.82228179423655, + "learning_rate": 5e-06, + "loss": 1.1241, + "num_input_tokens_seen": 83295248, + "step": 482 + }, + { + "epoch": 0.1926458832933653, + "loss": 1.1308469772338867, + "loss_ce": 0.034655675292015076, + "loss_xval": 1.09375, + "num_input_tokens_seen": 83295248, + "step": 482 + }, + { + "epoch": 0.19304556354916066, + "grad_norm": 261.49086967978093, + "learning_rate": 5e-06, + "loss": 1.4113, + "num_input_tokens_seen": 83468488, + "step": 483 + }, + { + "epoch": 0.19304556354916066, + "loss": 1.4266133308410645, + "loss_ce": 0.036842815577983856, + "loss_xval": 1.390625, + "num_input_tokens_seen": 83468488, + "step": 483 + }, + { + "epoch": 0.19344524380495604, + "grad_norm": 259.4909482177068, + "learning_rate": 5e-06, + "loss": 1.1918, + "num_input_tokens_seen": 83641736, + "step": 484 + }, + { + "epoch": 0.19344524380495604, + "loss": 1.2513983249664307, + "loss_ce": 0.03948421776294708, + "loss_xval": 1.2109375, + "num_input_tokens_seen": 83641736, + "step": 484 + }, + { + "epoch": 0.1938449240607514, + "grad_norm": 209.39951992648977, + "learning_rate": 5e-06, + "loss": 0.9566, + "num_input_tokens_seen": 83815056, + "step": 485 + }, + { + "epoch": 0.1938449240607514, + "loss": 1.1501352787017822, + "loss_ce": 0.03856303542852402, + "loss_xval": 1.109375, + "num_input_tokens_seen": 83815056, + "step": 485 + }, + { + "epoch": 0.19424460431654678, + "grad_norm": 54.05007201034879, + "learning_rate": 5e-06, + "loss": 0.9943, + "num_input_tokens_seen": 83987984, + "step": 486 + }, + { + "epoch": 0.19424460431654678, + "loss": 1.3479745388031006, + "loss_ce": 0.037183478474617004, + "loss_xval": 1.3125, + "num_input_tokens_seen": 83987984, + "step": 486 + }, + { + "epoch": 0.19464428457234212, + "grad_norm": 145.86163072629424, + "learning_rate": 5e-06, + "loss": 1.1361, + "num_input_tokens_seen": 84160704, + "step": 487 + }, + { + "epoch": 0.19464428457234212, + "loss": 1.1853137016296387, + "loss_ce": 0.040294162929058075, + "loss_xval": 1.1484375, + "num_input_tokens_seen": 84160704, + "step": 487 + }, + { + "epoch": 0.1950439648281375, + "grad_norm": 305.17188970413605, + "learning_rate": 5e-06, + "loss": 
0.9527, + "num_input_tokens_seen": 84333744, + "step": 488 + }, + { + "epoch": 0.1950439648281375, + "loss": 1.0381686687469482, + "loss_ce": 0.03060019761323929, + "loss_xval": 1.0078125, + "num_input_tokens_seen": 84333744, + "step": 488 + }, + { + "epoch": 0.19544364508393286, + "grad_norm": 367.7867921107738, + "learning_rate": 5e-06, + "loss": 0.9876, + "num_input_tokens_seen": 84506240, + "step": 489 + }, + { + "epoch": 0.19544364508393286, + "loss": 1.0127192735671997, + "loss_ce": 0.033227067440748215, + "loss_xval": 0.98046875, + "num_input_tokens_seen": 84506240, + "step": 489 + }, + { + "epoch": 0.19584332533972823, + "grad_norm": 109.77618272454248, + "learning_rate": 5e-06, + "loss": 0.9396, + "num_input_tokens_seen": 84679600, + "step": 490 + }, + { + "epoch": 0.19584332533972823, + "loss": 0.48934584856033325, + "loss_ce": 0.034755997359752655, + "loss_xval": 0.455078125, + "num_input_tokens_seen": 84679600, + "step": 490 + }, + { + "epoch": 0.19624300559552357, + "grad_norm": 195.19088075428, + "learning_rate": 5e-06, + "loss": 1.0253, + "num_input_tokens_seen": 84852344, + "step": 491 + }, + { + "epoch": 0.19624300559552357, + "loss": 1.0787075757980347, + "loss_ce": 0.03622712194919586, + "loss_xval": 1.0390625, + "num_input_tokens_seen": 84852344, + "step": 491 + }, + { + "epoch": 0.19664268585131894, + "grad_norm": 168.19680026401545, + "learning_rate": 5e-06, + "loss": 1.1053, + "num_input_tokens_seen": 85025032, + "step": 492 + }, + { + "epoch": 0.19664268585131894, + "loss": 1.0922410488128662, + "loss_ce": 0.037919752299785614, + "loss_xval": 1.0546875, + "num_input_tokens_seen": 85025032, + "step": 492 + }, + { + "epoch": 0.19704236610711431, + "grad_norm": 133.09158680380528, + "learning_rate": 5e-06, + "loss": 0.891, + "num_input_tokens_seen": 85197936, + "step": 493 + }, + { + "epoch": 0.19704236610711431, + "loss": 0.952610969543457, + "loss_ce": 0.050999678671360016, + "loss_xval": 0.90234375, + "num_input_tokens_seen": 85197936, + "step": 493 + }, + { + "epoch": 0.19744204636290968, + "grad_norm": 324.1718865120786, + "learning_rate": 5e-06, + "loss": 1.3506, + "num_input_tokens_seen": 85371064, + "step": 494 + }, + { + "epoch": 0.19744204636290968, + "loss": 1.3527730703353882, + "loss_ce": 0.04369105398654938, + "loss_xval": 1.3125, + "num_input_tokens_seen": 85371064, + "step": 494 + }, + { + "epoch": 0.19784172661870503, + "grad_norm": 223.13546731515177, + "learning_rate": 5e-06, + "loss": 0.9691, + "num_input_tokens_seen": 85544192, + "step": 495 + }, + { + "epoch": 0.19784172661870503, + "loss": 0.9390429258346558, + "loss_ce": 0.03682119399309158, + "loss_xval": 0.90234375, + "num_input_tokens_seen": 85544192, + "step": 495 + }, + { + "epoch": 0.1982414068745004, + "grad_norm": 101.21144573870228, + "learning_rate": 5e-06, + "loss": 0.9112, + "num_input_tokens_seen": 85717424, + "step": 496 + }, + { + "epoch": 0.1982414068745004, + "loss": 1.0302919149398804, + "loss_ce": 0.046039044857025146, + "loss_xval": 0.984375, + "num_input_tokens_seen": 85717424, + "step": 496 + }, + { + "epoch": 0.19864108713029577, + "grad_norm": 206.07249003860602, + "learning_rate": 5e-06, + "loss": 0.8683, + "num_input_tokens_seen": 85890344, + "step": 497 + }, + { + "epoch": 0.19864108713029577, + "loss": 0.8588770627975464, + "loss_ce": 0.03905284404754639, + "loss_xval": 0.8203125, + "num_input_tokens_seen": 85890344, + "step": 497 + }, + { + "epoch": 0.19904076738609114, + "grad_norm": 94.44663670538154, + "learning_rate": 5e-06, + "loss": 0.8524, + 
"num_input_tokens_seen": 86063192, + "step": 498 + }, + { + "epoch": 0.19904076738609114, + "loss": 0.6648346185684204, + "loss_ce": 0.03678285330533981, + "loss_xval": 0.62890625, + "num_input_tokens_seen": 86063192, + "step": 498 + }, + { + "epoch": 0.19944044764188648, + "grad_norm": 335.24198957414745, + "learning_rate": 5e-06, + "loss": 1.389, + "num_input_tokens_seen": 86236056, + "step": 499 + }, + { + "epoch": 0.19944044764188648, + "loss": 1.3793278932571411, + "loss_ce": 0.04241389036178589, + "loss_xval": 1.3359375, + "num_input_tokens_seen": 86236056, + "step": 499 + }, + { + "epoch": 0.19984012789768185, + "grad_norm": 65.37214172428197, + "learning_rate": 5e-06, + "loss": 1.2575, + "num_input_tokens_seen": 86408824, + "step": 500 + }, + { + "epoch": 0.19984012789768185, + "eval_websight_new_IoU": 0.08079056814312935, + "eval_websight_new_MAE_all": 0.06199362501502037, + "eval_websight_new_MAE_h": 0.05449218116700649, + "eval_websight_new_MAE_w": 0.09156358614563942, + "eval_websight_new_MAE_x": 0.025543496012687683, + "eval_websight_new_MAE_y": 0.07637524232268333, + "eval_websight_new_NUM_probability": 0.6408629715442657, + "eval_websight_new_inside_bbox": 0.046875, + "eval_websight_new_loss": 0.6728891134262085, + "eval_websight_new_loss_ce": 0.04726765863597393, + "eval_websight_new_loss_xval": 0.6082763671875, + "eval_websight_new_runtime": 57.3968, + "eval_websight_new_samples_per_second": 0.871, + "eval_websight_new_steps_per_second": 0.035, + "num_input_tokens_seen": 86408824, + "step": 500 + }, + { + "epoch": 0.19984012789768185, + "eval_seeclick_IoU": 0.11065776646137238, + "eval_seeclick_MAE_all": 0.10889718681573868, + "eval_seeclick_MAE_h": 0.06004502810537815, + "eval_seeclick_MAE_w": 0.16941364109516144, + "eval_seeclick_MAE_x": 0.11266724020242691, + "eval_seeclick_MAE_y": 0.09346283972263336, + "eval_seeclick_NUM_probability": 0.6323218941688538, + "eval_seeclick_inside_bbox": 0.0868055559694767, + "eval_seeclick_loss": 2.2744133472442627, + "eval_seeclick_loss_ce": 0.06857346370816231, + "eval_seeclick_loss_xval": 2.0810546875, + "eval_seeclick_runtime": 82.4728, + "eval_seeclick_samples_per_second": 0.606, + "eval_seeclick_steps_per_second": 0.024, + "num_input_tokens_seen": 86408824, + "step": 500 + }, + { + "epoch": 0.19984012789768185, + "eval_icons_IoU": 0.009586355474311858, + "eval_icons_MAE_all": 0.06707138940691948, + "eval_icons_MAE_h": 0.06313476897776127, + "eval_icons_MAE_w": 0.06441785581409931, + "eval_icons_MAE_x": 0.05763854831457138, + "eval_icons_MAE_y": 0.08309439569711685, + "eval_icons_NUM_probability": 0.6739359498023987, + "eval_icons_inside_bbox": 0.0, + "eval_icons_loss": 0.5873188972473145, + "eval_icons_loss_ce": 0.0424294825643301, + "eval_icons_loss_xval": 0.53759765625, + "eval_icons_runtime": 81.7973, + "eval_icons_samples_per_second": 0.611, + "eval_icons_steps_per_second": 0.024, + "num_input_tokens_seen": 86408824, + "step": 500 + }, + { + "epoch": 0.19984012789768185, + "loss": 0.6232744455337524, + "loss_ce": 0.044661134481430054, + "loss_xval": 0.578125, + "num_input_tokens_seen": 86408824, + "step": 500 + }, + { + "epoch": 0.20023980815347722, + "grad_norm": 415.0441899230684, + "learning_rate": 5e-06, + "loss": 1.1496, + "num_input_tokens_seen": 86581832, + "step": 501 + }, + { + "epoch": 0.20023980815347722, + "loss": 1.2457921504974365, + "loss_ce": 0.04388776421546936, + "loss_xval": 1.203125, + "num_input_tokens_seen": 86581832, + "step": 501 + }, + { + "epoch": 0.2006394884092726, + "grad_norm": 202.89775372810757, 
+ "learning_rate": 5e-06, + "loss": 0.9393, + "num_input_tokens_seen": 86754704, + "step": 502 + }, + { + "epoch": 0.2006394884092726, + "loss": 0.5830790400505066, + "loss_ce": 0.045115165412425995, + "loss_xval": 0.5390625, + "num_input_tokens_seen": 86754704, + "step": 502 + }, + { + "epoch": 0.20103916866506794, + "grad_norm": 435.08701069154466, + "learning_rate": 5e-06, + "loss": 1.521, + "num_input_tokens_seen": 86927520, + "step": 503 + }, + { + "epoch": 0.20103916866506794, + "loss": 1.6471202373504639, + "loss_ce": 0.061182815581560135, + "loss_xval": 1.5859375, + "num_input_tokens_seen": 86927520, + "step": 503 + }, + { + "epoch": 0.2014388489208633, + "grad_norm": 354.9315963502709, + "learning_rate": 5e-06, + "loss": 1.4481, + "num_input_tokens_seen": 87100360, + "step": 504 + }, + { + "epoch": 0.2014388489208633, + "loss": 1.7939571142196655, + "loss_ce": 0.05225791037082672, + "loss_xval": 1.7421875, + "num_input_tokens_seen": 87100360, + "step": 504 + }, + { + "epoch": 0.20183852917665868, + "grad_norm": 277.1538917985221, + "learning_rate": 5e-06, + "loss": 0.8188, + "num_input_tokens_seen": 87273200, + "step": 505 + }, + { + "epoch": 0.20183852917665868, + "loss": 0.6681489944458008, + "loss_ce": 0.05096151679754257, + "loss_xval": 0.6171875, + "num_input_tokens_seen": 87273200, + "step": 505 + }, + { + "epoch": 0.20223820943245405, + "grad_norm": 215.04429897623527, + "learning_rate": 5e-06, + "loss": 0.896, + "num_input_tokens_seen": 87446384, + "step": 506 + }, + { + "epoch": 0.20223820943245405, + "loss": 0.977022111415863, + "loss_ce": 0.06540100276470184, + "loss_xval": 0.91015625, + "num_input_tokens_seen": 87446384, + "step": 506 + }, + { + "epoch": 0.2026378896882494, + "grad_norm": 276.63932282618924, + "learning_rate": 5e-06, + "loss": 1.2516, + "num_input_tokens_seen": 87619496, + "step": 507 + }, + { + "epoch": 0.2026378896882494, + "loss": 0.9704437255859375, + "loss_ce": 0.0629730224609375, + "loss_xval": 0.90625, + "num_input_tokens_seen": 87619496, + "step": 507 + }, + { + "epoch": 0.20303756994404476, + "grad_norm": 96.67338812468043, + "learning_rate": 5e-06, + "loss": 0.9473, + "num_input_tokens_seen": 87792584, + "step": 508 + }, + { + "epoch": 0.20303756994404476, + "loss": 0.8171831965446472, + "loss_ce": 0.08207576721906662, + "loss_xval": 0.734375, + "num_input_tokens_seen": 87792584, + "step": 508 + }, + { + "epoch": 0.20343725019984013, + "grad_norm": 363.8508621351222, + "learning_rate": 5e-06, + "loss": 1.5687, + "num_input_tokens_seen": 87965712, + "step": 509 + }, + { + "epoch": 0.20343725019984013, + "loss": 1.5893386602401733, + "loss_ce": 0.06773223727941513, + "loss_xval": 1.5234375, + "num_input_tokens_seen": 87965712, + "step": 509 + }, + { + "epoch": 0.2038369304556355, + "grad_norm": 265.10336368041925, + "learning_rate": 5e-06, + "loss": 0.9704, + "num_input_tokens_seen": 88138536, + "step": 510 + }, + { + "epoch": 0.2038369304556355, + "loss": 0.9591758847236633, + "loss_ce": 0.06952746957540512, + "loss_xval": 0.890625, + "num_input_tokens_seen": 88138536, + "step": 510 + }, + { + "epoch": 0.20423661071143084, + "grad_norm": 150.68109411527718, + "learning_rate": 5e-06, + "loss": 0.8909, + "num_input_tokens_seen": 88311912, + "step": 511 + }, + { + "epoch": 0.20423661071143084, + "loss": 1.0047615766525269, + "loss_ce": 0.07226639986038208, + "loss_xval": 0.93359375, + "num_input_tokens_seen": 88311912, + "step": 511 + }, + { + "epoch": 0.20463629096722621, + "grad_norm": 189.68353880999203, + "learning_rate": 5e-06, + "loss": 
0.709, + "num_input_tokens_seen": 88484904, + "step": 512 + }, + { + "epoch": 0.20463629096722621, + "loss": 0.7475023865699768, + "loss_ce": 0.0678148865699768, + "loss_xval": 0.6796875, + "num_input_tokens_seen": 88484904, + "step": 512 + }, + { + "epoch": 0.20503597122302158, + "grad_norm": 49.74252850484498, + "learning_rate": 5e-06, + "loss": 1.1139, + "num_input_tokens_seen": 88658368, + "step": 513 + }, + { + "epoch": 0.20503597122302158, + "loss": 1.322534203529358, + "loss_ce": 0.06484372913837433, + "loss_xval": 1.2578125, + "num_input_tokens_seen": 88658368, + "step": 513 + }, + { + "epoch": 0.20543565147881696, + "grad_norm": 62.01670257125715, + "learning_rate": 5e-06, + "loss": 1.1007, + "num_input_tokens_seen": 88831128, + "step": 514 + }, + { + "epoch": 0.20543565147881696, + "loss": 0.8021764159202576, + "loss_ce": 0.05950063467025757, + "loss_xval": 0.7421875, + "num_input_tokens_seen": 88831128, + "step": 514 + }, + { + "epoch": 0.2058353317346123, + "grad_norm": 61.10824221702106, + "learning_rate": 5e-06, + "loss": 0.7747, + "num_input_tokens_seen": 89004360, + "step": 515 + }, + { + "epoch": 0.2058353317346123, + "loss": 0.8074045181274414, + "loss_ce": 0.055451322346925735, + "loss_xval": 0.75, + "num_input_tokens_seen": 89004360, + "step": 515 + }, + { + "epoch": 0.20623501199040767, + "grad_norm": 213.54170746936944, + "learning_rate": 5e-06, + "loss": 1.4733, + "num_input_tokens_seen": 89177432, + "step": 516 + }, + { + "epoch": 0.20623501199040767, + "loss": 1.8919177055358887, + "loss_ce": 0.07111698389053345, + "loss_xval": 1.8203125, + "num_input_tokens_seen": 89177432, + "step": 516 + }, + { + "epoch": 0.20663469224620304, + "grad_norm": 188.66997417165135, + "learning_rate": 5e-06, + "loss": 1.0141, + "num_input_tokens_seen": 89350448, + "step": 517 + }, + { + "epoch": 0.20663469224620304, + "loss": 0.9915530681610107, + "loss_ce": 0.06418493390083313, + "loss_xval": 0.92578125, + "num_input_tokens_seen": 89350448, + "step": 517 + }, + { + "epoch": 0.2070343725019984, + "grad_norm": 54.79378455267396, + "learning_rate": 5e-06, + "loss": 1.0231, + "num_input_tokens_seen": 89523664, + "step": 518 + }, + { + "epoch": 0.2070343725019984, + "loss": 1.0046730041503906, + "loss_ce": 0.051059648394584656, + "loss_xval": 0.953125, + "num_input_tokens_seen": 89523664, + "step": 518 + }, + { + "epoch": 0.20743405275779375, + "grad_norm": 97.26893599654822, + "learning_rate": 5e-06, + "loss": 1.3021, + "num_input_tokens_seen": 89696800, + "step": 519 + }, + { + "epoch": 0.20743405275779375, + "loss": 1.5230860710144043, + "loss_ce": 0.044448427855968475, + "loss_xval": 1.4765625, + "num_input_tokens_seen": 89696800, + "step": 519 + }, + { + "epoch": 0.20783373301358912, + "grad_norm": 122.63155997518379, + "learning_rate": 5e-06, + "loss": 1.1363, + "num_input_tokens_seen": 89869128, + "step": 520 + }, + { + "epoch": 0.20783373301358912, + "loss": 0.7199119329452515, + "loss_ce": 0.05108872056007385, + "loss_xval": 0.66796875, + "num_input_tokens_seen": 89869128, + "step": 520 + }, + { + "epoch": 0.2082334132693845, + "grad_norm": 43.22504973101311, + "learning_rate": 5e-06, + "loss": 1.0852, + "num_input_tokens_seen": 90042088, + "step": 521 + }, + { + "epoch": 0.2082334132693845, + "loss": 0.9542793035507202, + "loss_ce": 0.04509960114955902, + "loss_xval": 0.91015625, + "num_input_tokens_seen": 90042088, + "step": 521 + }, + { + "epoch": 0.20863309352517986, + "grad_norm": 237.56955165262985, + "learning_rate": 5e-06, + "loss": 1.1241, + "num_input_tokens_seen": 
90215056, + "step": 522 + }, + { + "epoch": 0.20863309352517986, + "loss": 0.8590470552444458, + "loss_ce": 0.03726974129676819, + "loss_xval": 0.8203125, + "num_input_tokens_seen": 90215056, + "step": 522 + }, + { + "epoch": 0.2090327737809752, + "grad_norm": 263.0137742752679, + "learning_rate": 5e-06, + "loss": 1.3867, + "num_input_tokens_seen": 90387888, + "step": 523 + }, + { + "epoch": 0.2090327737809752, + "loss": 1.0127967596054077, + "loss_ce": 0.040872905403375626, + "loss_xval": 0.97265625, + "num_input_tokens_seen": 90387888, + "step": 523 + }, + { + "epoch": 0.20943245403677058, + "grad_norm": 44.95766707948507, + "learning_rate": 5e-06, + "loss": 1.0766, + "num_input_tokens_seen": 90561136, + "step": 524 + }, + { + "epoch": 0.20943245403677058, + "loss": 0.6546859741210938, + "loss_ce": 0.03603363782167435, + "loss_xval": 0.6171875, + "num_input_tokens_seen": 90561136, + "step": 524 + }, + { + "epoch": 0.20983213429256595, + "grad_norm": 112.87978212103138, + "learning_rate": 5e-06, + "loss": 0.6245, + "num_input_tokens_seen": 90734184, + "step": 525 + }, + { + "epoch": 0.20983213429256595, + "loss": 0.4749688506126404, + "loss_ce": 0.04808899015188217, + "loss_xval": 0.427734375, + "num_input_tokens_seen": 90734184, + "step": 525 + }, + { + "epoch": 0.21023181454836132, + "grad_norm": 92.99996671691841, + "learning_rate": 5e-06, + "loss": 1.0497, + "num_input_tokens_seen": 90907176, + "step": 526 + }, + { + "epoch": 0.21023181454836132, + "loss": 0.8723914623260498, + "loss_ce": 0.03266974911093712, + "loss_xval": 0.83984375, + "num_input_tokens_seen": 90907176, + "step": 526 + }, + { + "epoch": 0.2106314948041567, + "grad_norm": 132.48797337096062, + "learning_rate": 5e-06, + "loss": 1.291, + "num_input_tokens_seen": 91080520, + "step": 527 + }, + { + "epoch": 0.2106314948041567, + "loss": 1.0182774066925049, + "loss_ce": 0.03133901208639145, + "loss_xval": 0.98828125, + "num_input_tokens_seen": 91080520, + "step": 527 + }, + { + "epoch": 0.21103117505995203, + "grad_norm": 67.2411409745012, + "learning_rate": 5e-06, + "loss": 0.6511, + "num_input_tokens_seen": 91253584, + "step": 528 + }, + { + "epoch": 0.21103117505995203, + "loss": 0.7829042673110962, + "loss_ce": 0.0313173308968544, + "loss_xval": 0.75, + "num_input_tokens_seen": 91253584, + "step": 528 + }, + { + "epoch": 0.2114308553157474, + "grad_norm": 119.38443221614165, + "learning_rate": 5e-06, + "loss": 1.0069, + "num_input_tokens_seen": 91426584, + "step": 529 + }, + { + "epoch": 0.2114308553157474, + "loss": 1.0257318019866943, + "loss_ce": 0.02658626064658165, + "loss_xval": 1.0, + "num_input_tokens_seen": 91426584, + "step": 529 + }, + { + "epoch": 0.21183053557154277, + "grad_norm": 161.8946233726489, + "learning_rate": 5e-06, + "loss": 1.1522, + "num_input_tokens_seen": 91599784, + "step": 530 + }, + { + "epoch": 0.21183053557154277, + "loss": 1.1873421669006348, + "loss_ce": 0.024622494354844093, + "loss_xval": 1.1640625, + "num_input_tokens_seen": 91599784, + "step": 530 + }, + { + "epoch": 0.21223021582733814, + "grad_norm": 322.1567587058893, + "learning_rate": 5e-06, + "loss": 1.5258, + "num_input_tokens_seen": 91772792, + "step": 531 + }, + { + "epoch": 0.21223021582733814, + "loss": 1.6199225187301636, + "loss_ce": 0.03862369433045387, + "loss_xval": 1.578125, + "num_input_tokens_seen": 91772792, + "step": 531 + }, + { + "epoch": 0.21262989608313348, + "grad_norm": 162.02156030703074, + "learning_rate": 5e-06, + "loss": 0.8099, + "num_input_tokens_seen": 91945720, + "step": 532 + }, + { + "epoch": 
0.21262989608313348, + "loss": 0.7622925639152527, + "loss_ce": 0.02779550477862358, + "loss_xval": 0.734375, + "num_input_tokens_seen": 91945720, + "step": 532 + }, + { + "epoch": 0.21302957633892886, + "grad_norm": 136.96122336871258, + "learning_rate": 5e-06, + "loss": 1.1803, + "num_input_tokens_seen": 92118664, + "step": 533 + }, + { + "epoch": 0.21302957633892886, + "loss": 1.2401918172836304, + "loss_ce": 0.021441802382469177, + "loss_xval": 1.21875, + "num_input_tokens_seen": 92118664, + "step": 533 + }, + { + "epoch": 0.21342925659472423, + "grad_norm": 234.78901205861808, + "learning_rate": 5e-06, + "loss": 0.9575, + "num_input_tokens_seen": 92288392, + "step": 534 + }, + { + "epoch": 0.21342925659472423, + "loss": 0.6059136390686035, + "loss_ce": 0.023638233542442322, + "loss_xval": 0.58203125, + "num_input_tokens_seen": 92288392, + "step": 534 + }, + { + "epoch": 0.2138289368505196, + "grad_norm": 144.7989394237274, + "learning_rate": 5e-06, + "loss": 0.8869, + "num_input_tokens_seen": 92461600, + "step": 535 + }, + { + "epoch": 0.2138289368505196, + "loss": 1.0061213970184326, + "loss_ce": 0.024554094299674034, + "loss_xval": 0.98046875, + "num_input_tokens_seen": 92461600, + "step": 535 + }, + { + "epoch": 0.21422861710631494, + "grad_norm": 55.48959721021269, + "learning_rate": 5e-06, + "loss": 0.4912, + "num_input_tokens_seen": 92634216, + "step": 536 + }, + { + "epoch": 0.21422861710631494, + "loss": 0.47603410482406616, + "loss_ce": 0.027242586016654968, + "loss_xval": 0.44921875, + "num_input_tokens_seen": 92634216, + "step": 536 + }, + { + "epoch": 0.2146282973621103, + "grad_norm": 125.22804174984209, + "learning_rate": 5e-06, + "loss": 0.7744, + "num_input_tokens_seen": 92807536, + "step": 537 + }, + { + "epoch": 0.2146282973621103, + "loss": 0.5036810636520386, + "loss_ce": 0.02418886497616768, + "loss_xval": 0.48046875, + "num_input_tokens_seen": 92807536, + "step": 537 + }, + { + "epoch": 0.21502797761790568, + "grad_norm": 211.76831446430648, + "learning_rate": 5e-06, + "loss": 1.0454, + "num_input_tokens_seen": 92980600, + "step": 538 + }, + { + "epoch": 0.21502797761790568, + "loss": 0.9494002461433411, + "loss_ce": 0.03143148496747017, + "loss_xval": 0.91796875, + "num_input_tokens_seen": 92980600, + "step": 538 + }, + { + "epoch": 0.21542765787370105, + "grad_norm": 83.53737940813463, + "learning_rate": 5e-06, + "loss": 0.8112, + "num_input_tokens_seen": 93153536, + "step": 539 + }, + { + "epoch": 0.21542765787370105, + "loss": 0.7787027955055237, + "loss_ce": 0.028214523568749428, + "loss_xval": 0.75, + "num_input_tokens_seen": 93153536, + "step": 539 + }, + { + "epoch": 0.2158273381294964, + "grad_norm": 221.1772186610306, + "learning_rate": 5e-06, + "loss": 1.2162, + "num_input_tokens_seen": 93326336, + "step": 540 + }, + { + "epoch": 0.2158273381294964, + "loss": 0.7756029367446899, + "loss_ce": 0.027067817747592926, + "loss_xval": 0.75, + "num_input_tokens_seen": 93326336, + "step": 540 + }, + { + "epoch": 0.21622701838529176, + "grad_norm": 250.30628691634345, + "learning_rate": 5e-06, + "loss": 0.8389, + "num_input_tokens_seen": 93499160, + "step": 541 + }, + { + "epoch": 0.21622701838529176, + "loss": 0.6384867429733276, + "loss_ce": 0.030210375785827637, + "loss_xval": 0.609375, + "num_input_tokens_seen": 93499160, + "step": 541 + }, + { + "epoch": 0.21662669864108713, + "grad_norm": 60.81920570311037, + "learning_rate": 5e-06, + "loss": 1.1726, + "num_input_tokens_seen": 93672248, + "step": 542 + }, + { + "epoch": 0.21662669864108713, + "loss": 
1.3165156841278076, + "loss_ce": 0.03245798870921135, + "loss_xval": 1.28125, + "num_input_tokens_seen": 93672248, + "step": 542 + }, + { + "epoch": 0.2170263788968825, + "grad_norm": 245.3824525578725, + "learning_rate": 5e-06, + "loss": 1.0387, + "num_input_tokens_seen": 93845280, + "step": 543 + }, + { + "epoch": 0.2170263788968825, + "loss": 1.4534016847610474, + "loss_ce": 0.027132168412208557, + "loss_xval": 1.4296875, + "num_input_tokens_seen": 93845280, + "step": 543 + }, + { + "epoch": 0.21742605915267785, + "grad_norm": 96.9503832488851, + "learning_rate": 5e-06, + "loss": 1.6084, + "num_input_tokens_seen": 94018296, + "step": 544 + }, + { + "epoch": 0.21742605915267785, + "loss": 1.4025704860687256, + "loss_ce": 0.04173062741756439, + "loss_xval": 1.359375, + "num_input_tokens_seen": 94018296, + "step": 544 + }, + { + "epoch": 0.21782573940847322, + "grad_norm": 248.6582812449355, + "learning_rate": 5e-06, + "loss": 0.829, + "num_input_tokens_seen": 94191352, + "step": 545 + }, + { + "epoch": 0.21782573940847322, + "loss": 0.7271380424499512, + "loss_ce": 0.026942692697048187, + "loss_xval": 0.69921875, + "num_input_tokens_seen": 94191352, + "step": 545 + }, + { + "epoch": 0.2182254196642686, + "grad_norm": 139.37534365707833, + "learning_rate": 5e-06, + "loss": 0.8576, + "num_input_tokens_seen": 94364712, + "step": 546 + }, + { + "epoch": 0.2182254196642686, + "loss": 1.2029194831848145, + "loss_ce": 0.026039643213152885, + "loss_xval": 1.1796875, + "num_input_tokens_seen": 94364712, + "step": 546 + }, + { + "epoch": 0.21862509992006396, + "grad_norm": 161.00715928409278, + "learning_rate": 5e-06, + "loss": 1.2751, + "num_input_tokens_seen": 94534248, + "step": 547 + }, + { + "epoch": 0.21862509992006396, + "loss": 0.7849478125572205, + "loss_ce": 0.024938026443123817, + "loss_xval": 0.76171875, + "num_input_tokens_seen": 94534248, + "step": 547 + }, + { + "epoch": 0.2190247801758593, + "grad_norm": 120.74445345712077, + "learning_rate": 5e-06, + "loss": 0.972, + "num_input_tokens_seen": 94707432, + "step": 548 + }, + { + "epoch": 0.2190247801758593, + "loss": 1.0243523120880127, + "loss_ce": 0.04681321233510971, + "loss_xval": 0.9765625, + "num_input_tokens_seen": 94707432, + "step": 548 + }, + { + "epoch": 0.21942446043165467, + "grad_norm": 149.35126901028477, + "learning_rate": 5e-06, + "loss": 1.621, + "num_input_tokens_seen": 94880480, + "step": 549 + }, + { + "epoch": 0.21942446043165467, + "loss": 0.938992977142334, + "loss_ce": 0.02834843471646309, + "loss_xval": 0.91015625, + "num_input_tokens_seen": 94880480, + "step": 549 + }, + { + "epoch": 0.21982414068745004, + "grad_norm": 96.31882416543029, + "learning_rate": 5e-06, + "loss": 0.9228, + "num_input_tokens_seen": 95053720, + "step": 550 + }, + { + "epoch": 0.21982414068745004, + "loss": 0.6176514029502869, + "loss_ce": 0.026220720261335373, + "loss_xval": 0.58984375, + "num_input_tokens_seen": 95053720, + "step": 550 + }, + { + "epoch": 0.2202238209432454, + "grad_norm": 89.42353943704879, + "learning_rate": 5e-06, + "loss": 0.8391, + "num_input_tokens_seen": 95226696, + "step": 551 + }, + { + "epoch": 0.2202238209432454, + "loss": 1.1121536493301392, + "loss_ce": 0.025239594280719757, + "loss_xval": 1.0859375, + "num_input_tokens_seen": 95226696, + "step": 551 + }, + { + "epoch": 0.22062350119904076, + "grad_norm": 60.245937188446725, + "learning_rate": 5e-06, + "loss": 1.1905, + "num_input_tokens_seen": 95399936, + "step": 552 + }, + { + "epoch": 0.22062350119904076, + "loss": 1.1478040218353271, + "loss_ce": 
0.03208138048648834, + "loss_xval": 1.1171875, + "num_input_tokens_seen": 95399936, + "step": 552 + }, + { + "epoch": 0.22102318145483613, + "grad_norm": 250.376146897708, + "learning_rate": 5e-06, + "loss": 0.8334, + "num_input_tokens_seen": 95573048, + "step": 553 + }, + { + "epoch": 0.22102318145483613, + "loss": 0.9694182872772217, + "loss_ce": 0.029293827712535858, + "loss_xval": 0.94140625, + "num_input_tokens_seen": 95573048, + "step": 553 + }, + { + "epoch": 0.2214228617106315, + "grad_norm": 472.12030978419017, + "learning_rate": 5e-06, + "loss": 1.3447, + "num_input_tokens_seen": 95746232, + "step": 554 + }, + { + "epoch": 0.2214228617106315, + "loss": 1.390291690826416, + "loss_ce": 0.031160805374383926, + "loss_xval": 1.359375, + "num_input_tokens_seen": 95746232, + "step": 554 + }, + { + "epoch": 0.22182254196642687, + "grad_norm": 200.19161843381832, + "learning_rate": 5e-06, + "loss": 1.1299, + "num_input_tokens_seen": 95919064, + "step": 555 + }, + { + "epoch": 0.22182254196642687, + "loss": 1.2493796348571777, + "loss_ce": 0.029714081436395645, + "loss_xval": 1.21875, + "num_input_tokens_seen": 95919064, + "step": 555 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 264.8729753389226, + "learning_rate": 5e-06, + "loss": 1.3297, + "num_input_tokens_seen": 96092128, + "step": 556 + }, + { + "epoch": 0.2222222222222222, + "loss": 1.3867847919464111, + "loss_ce": 0.03546644374728203, + "loss_xval": 1.3515625, + "num_input_tokens_seen": 96092128, + "step": 556 + }, + { + "epoch": 0.22262190247801758, + "grad_norm": 292.84845504228457, + "learning_rate": 5e-06, + "loss": 1.2343, + "num_input_tokens_seen": 96261344, + "step": 557 + }, + { + "epoch": 0.22262190247801758, + "loss": 1.3330931663513184, + "loss_ce": 0.02938220463693142, + "loss_xval": 1.3046875, + "num_input_tokens_seen": 96261344, + "step": 557 + }, + { + "epoch": 0.22302158273381295, + "grad_norm": 60.53087991395975, + "learning_rate": 5e-06, + "loss": 0.841, + "num_input_tokens_seen": 96434288, + "step": 558 + }, + { + "epoch": 0.22302158273381295, + "loss": 0.9854141473770142, + "loss_ce": 0.032960571348667145, + "loss_xval": 0.953125, + "num_input_tokens_seen": 96434288, + "step": 558 + }, + { + "epoch": 0.22342126298960832, + "grad_norm": 197.24569669696305, + "learning_rate": 5e-06, + "loss": 1.209, + "num_input_tokens_seen": 96607664, + "step": 559 + }, + { + "epoch": 0.22342126298960832, + "loss": 0.886874794960022, + "loss_ce": 0.055820122361183167, + "loss_xval": 0.83203125, + "num_input_tokens_seen": 96607664, + "step": 559 + }, + { + "epoch": 0.22382094324540366, + "grad_norm": 90.62307871554165, + "learning_rate": 5e-06, + "loss": 0.8997, + "num_input_tokens_seen": 96780576, + "step": 560 + }, + { + "epoch": 0.22382094324540366, + "loss": 0.5703801512718201, + "loss_ce": 0.035590097308158875, + "loss_xval": 0.53515625, + "num_input_tokens_seen": 96780576, + "step": 560 + }, + { + "epoch": 0.22422062350119903, + "grad_norm": 210.46096711060173, + "learning_rate": 5e-06, + "loss": 0.8813, + "num_input_tokens_seen": 96953128, + "step": 561 + }, + { + "epoch": 0.22422062350119903, + "loss": 1.1930537223815918, + "loss_ce": 0.050475526601076126, + "loss_xval": 1.140625, + "num_input_tokens_seen": 96953128, + "step": 561 + }, + { + "epoch": 0.2246203037569944, + "grad_norm": 156.37219513558009, + "learning_rate": 5e-06, + "loss": 1.1836, + "num_input_tokens_seen": 97126480, + "step": 562 + }, + { + "epoch": 0.2246203037569944, + "loss": 0.6334319710731506, + "loss_ce": 0.04236753284931183, + 
"loss_xval": 0.58984375, + "num_input_tokens_seen": 97126480, + "step": 562 + }, + { + "epoch": 0.22501998401278978, + "grad_norm": 387.8888057437442, + "learning_rate": 5e-06, + "loss": 0.975, + "num_input_tokens_seen": 97299688, + "step": 563 + }, + { + "epoch": 0.22501998401278978, + "loss": 0.8660625219345093, + "loss_ce": 0.04672662168741226, + "loss_xval": 0.8203125, + "num_input_tokens_seen": 97299688, + "step": 563 + }, + { + "epoch": 0.22541966426858512, + "grad_norm": 64.648417204487, + "learning_rate": 5e-06, + "loss": 0.6787, + "num_input_tokens_seen": 97472888, + "step": 564 + }, + { + "epoch": 0.22541966426858512, + "loss": 0.90760338306427, + "loss_ce": 0.04456621780991554, + "loss_xval": 0.86328125, + "num_input_tokens_seen": 97472888, + "step": 564 + }, + { + "epoch": 0.2258193445243805, + "grad_norm": 411.13766405693985, + "learning_rate": 5e-06, + "loss": 1.0557, + "num_input_tokens_seen": 97646160, + "step": 565 + }, + { + "epoch": 0.2258193445243805, + "loss": 1.2993905544281006, + "loss_ce": 0.042066287249326706, + "loss_xval": 1.2578125, + "num_input_tokens_seen": 97646160, + "step": 565 + }, + { + "epoch": 0.22621902478017586, + "grad_norm": 217.77303028298573, + "learning_rate": 5e-06, + "loss": 0.8985, + "num_input_tokens_seen": 97818944, + "step": 566 + }, + { + "epoch": 0.22621902478017586, + "loss": 0.7498751282691956, + "loss_ce": 0.041379086673259735, + "loss_xval": 0.70703125, + "num_input_tokens_seen": 97818944, + "step": 566 + }, + { + "epoch": 0.22661870503597123, + "grad_norm": 245.13372340348388, + "learning_rate": 5e-06, + "loss": 0.8019, + "num_input_tokens_seen": 97991832, + "step": 567 + }, + { + "epoch": 0.22661870503597123, + "loss": 0.7386154532432556, + "loss_ce": 0.03329318016767502, + "loss_xval": 0.70703125, + "num_input_tokens_seen": 97991832, + "step": 567 + }, + { + "epoch": 0.2270183852917666, + "grad_norm": 257.82480324488694, + "learning_rate": 5e-06, + "loss": 1.2229, + "num_input_tokens_seen": 98164624, + "step": 568 + }, + { + "epoch": 0.2270183852917666, + "loss": 0.9997340440750122, + "loss_ce": 0.04294687137007713, + "loss_xval": 0.95703125, + "num_input_tokens_seen": 98164624, + "step": 568 + }, + { + "epoch": 0.22741806554756194, + "grad_norm": 34.59729056287196, + "learning_rate": 5e-06, + "loss": 0.9462, + "num_input_tokens_seen": 98337936, + "step": 569 + }, + { + "epoch": 0.22741806554756194, + "loss": 0.8521548509597778, + "loss_ce": 0.03757966682314873, + "loss_xval": 0.81640625, + "num_input_tokens_seen": 98337936, + "step": 569 + }, + { + "epoch": 0.2278177458033573, + "grad_norm": 111.7199151079931, + "learning_rate": 5e-06, + "loss": 0.853, + "num_input_tokens_seen": 98511288, + "step": 570 + }, + { + "epoch": 0.2278177458033573, + "loss": 0.8863071799278259, + "loss_ce": 0.03657577931880951, + "loss_xval": 0.8515625, + "num_input_tokens_seen": 98511288, + "step": 570 + }, + { + "epoch": 0.22821742605915268, + "grad_norm": 42.73190171410932, + "learning_rate": 5e-06, + "loss": 0.8986, + "num_input_tokens_seen": 98684336, + "step": 571 + }, + { + "epoch": 0.22821742605915268, + "loss": 1.1216399669647217, + "loss_ce": 0.041561778634786606, + "loss_xval": 1.078125, + "num_input_tokens_seen": 98684336, + "step": 571 + }, + { + "epoch": 0.22861710631494805, + "grad_norm": 176.58612953080763, + "learning_rate": 5e-06, + "loss": 1.2574, + "num_input_tokens_seen": 98853952, + "step": 572 + }, + { + "epoch": 0.22861710631494805, + "loss": 1.1767420768737793, + "loss_ce": 0.034652289003133774, + "loss_xval": 1.140625, + 
"num_input_tokens_seen": 98853952, + "step": 572 + }, + { + "epoch": 0.2290167865707434, + "grad_norm": 81.67430863927449, + "learning_rate": 5e-06, + "loss": 1.0762, + "num_input_tokens_seen": 99026888, + "step": 573 + }, + { + "epoch": 0.2290167865707434, + "loss": 1.4329999685287476, + "loss_ce": 0.07716501504182816, + "loss_xval": 1.359375, + "num_input_tokens_seen": 99026888, + "step": 573 + }, + { + "epoch": 0.22941646682653877, + "grad_norm": 274.37468677613424, + "learning_rate": 5e-06, + "loss": 1.2617, + "num_input_tokens_seen": 99199640, + "step": 574 + }, + { + "epoch": 0.22941646682653877, + "loss": 0.6166301965713501, + "loss_ce": 0.029716167598962784, + "loss_xval": 0.5859375, + "num_input_tokens_seen": 99199640, + "step": 574 + }, + { + "epoch": 0.22981614708233414, + "grad_norm": 26.081873724774624, + "learning_rate": 5e-06, + "loss": 0.6142, + "num_input_tokens_seen": 99372312, + "step": 575 + }, + { + "epoch": 0.22981614708233414, + "loss": 0.5156276822090149, + "loss_ce": 0.03149682283401489, + "loss_xval": 0.484375, + "num_input_tokens_seen": 99372312, + "step": 575 + }, + { + "epoch": 0.2302158273381295, + "grad_norm": 284.96481935200273, + "learning_rate": 5e-06, + "loss": 1.1397, + "num_input_tokens_seen": 99545160, + "step": 576 + }, + { + "epoch": 0.2302158273381295, + "loss": 1.3616349697113037, + "loss_ce": 0.034730590879917145, + "loss_xval": 1.328125, + "num_input_tokens_seen": 99545160, + "step": 576 + }, + { + "epoch": 0.23061550759392485, + "grad_norm": 44.124449238858176, + "learning_rate": 5e-06, + "loss": 0.7619, + "num_input_tokens_seen": 99718512, + "step": 577 + }, + { + "epoch": 0.23061550759392485, + "loss": 0.824275016784668, + "loss_ce": 0.029658352956175804, + "loss_xval": 0.79296875, + "num_input_tokens_seen": 99718512, + "step": 577 + }, + { + "epoch": 0.23101518784972022, + "grad_norm": 207.04912847362317, + "learning_rate": 5e-06, + "loss": 1.2726, + "num_input_tokens_seen": 99891440, + "step": 578 + }, + { + "epoch": 0.23101518784972022, + "loss": 1.4096300601959229, + "loss_ce": 0.02791622281074524, + "loss_xval": 1.3828125, + "num_input_tokens_seen": 99891440, + "step": 578 + }, + { + "epoch": 0.2314148681055156, + "grad_norm": 136.60800256923707, + "learning_rate": 5e-06, + "loss": 0.9458, + "num_input_tokens_seen": 100064696, + "step": 579 + }, + { + "epoch": 0.2314148681055156, + "loss": 0.8815451860427856, + "loss_ce": 0.027907539159059525, + "loss_xval": 0.85546875, + "num_input_tokens_seen": 100064696, + "step": 579 + }, + { + "epoch": 0.23181454836131096, + "grad_norm": 281.71610387045706, + "learning_rate": 5e-06, + "loss": 0.8355, + "num_input_tokens_seen": 100237384, + "step": 580 + }, + { + "epoch": 0.23181454836131096, + "loss": 0.7737770080566406, + "loss_ce": 0.025363922119140625, + "loss_xval": 0.75, + "num_input_tokens_seen": 100237384, + "step": 580 + }, + { + "epoch": 0.2322142286171063, + "grad_norm": 69.97613875254609, + "learning_rate": 5e-06, + "loss": 1.1513, + "num_input_tokens_seen": 100410504, + "step": 581 + }, + { + "epoch": 0.2322142286171063, + "loss": 1.1582691669464111, + "loss_ce": 0.026433231309056282, + "loss_xval": 1.1328125, + "num_input_tokens_seen": 100410504, + "step": 581 + }, + { + "epoch": 0.23261390887290168, + "grad_norm": 373.00175697859225, + "learning_rate": 5e-06, + "loss": 1.4461, + "num_input_tokens_seen": 100583328, + "step": 582 + }, + { + "epoch": 0.23261390887290168, + "loss": 1.4269239902496338, + "loss_ce": 0.023115256801247597, + "loss_xval": 1.40625, + "num_input_tokens_seen": 
100583328, + "step": 582 + }, + { + "epoch": 0.23301358912869705, + "grad_norm": 149.603838664552, + "learning_rate": 5e-06, + "loss": 0.6656, + "num_input_tokens_seen": 100755664, + "step": 583 + }, + { + "epoch": 0.23301358912869705, + "loss": 0.7104263305664062, + "loss_ce": 0.024635307490825653, + "loss_xval": 0.6875, + "num_input_tokens_seen": 100755664, + "step": 583 + }, + { + "epoch": 0.23341326938449242, + "grad_norm": 315.2272192197468, + "learning_rate": 5e-06, + "loss": 0.7095, + "num_input_tokens_seen": 100925176, + "step": 584 + }, + { + "epoch": 0.23341326938449242, + "loss": 0.7385757565498352, + "loss_ce": 0.03300934657454491, + "loss_xval": 0.70703125, + "num_input_tokens_seen": 100925176, + "step": 584 + }, + { + "epoch": 0.23381294964028776, + "grad_norm": 158.70840429315962, + "learning_rate": 5e-06, + "loss": 1.2368, + "num_input_tokens_seen": 101097904, + "step": 585 + }, + { + "epoch": 0.23381294964028776, + "loss": 1.1981698274612427, + "loss_ce": 0.028125843033194542, + "loss_xval": 1.171875, + "num_input_tokens_seen": 101097904, + "step": 585 + }, + { + "epoch": 0.23421262989608313, + "grad_norm": 300.90753147063134, + "learning_rate": 5e-06, + "loss": 0.9731, + "num_input_tokens_seen": 101270824, + "step": 586 + }, + { + "epoch": 0.23421262989608313, + "loss": 1.0399353504180908, + "loss_ce": 0.031390391290187836, + "loss_xval": 1.0078125, + "num_input_tokens_seen": 101270824, + "step": 586 + }, + { + "epoch": 0.2346123101518785, + "grad_norm": 150.6022752310562, + "learning_rate": 5e-06, + "loss": 1.2954, + "num_input_tokens_seen": 101443832, + "step": 587 + }, + { + "epoch": 0.2346123101518785, + "loss": 0.8707741498947144, + "loss_ce": 0.031174514442682266, + "loss_xval": 0.83984375, + "num_input_tokens_seen": 101443832, + "step": 587 + }, + { + "epoch": 0.23501199040767387, + "grad_norm": 277.43328148547425, + "learning_rate": 5e-06, + "loss": 1.0352, + "num_input_tokens_seen": 101616416, + "step": 588 + }, + { + "epoch": 0.23501199040767387, + "loss": 1.0693809986114502, + "loss_ce": 0.0334923230111599, + "loss_xval": 1.0390625, + "num_input_tokens_seen": 101616416, + "step": 588 + }, + { + "epoch": 0.2354116706634692, + "grad_norm": 78.20054429498728, + "learning_rate": 5e-06, + "loss": 0.919, + "num_input_tokens_seen": 101789472, + "step": 589 + }, + { + "epoch": 0.2354116706634692, + "loss": 0.9403672814369202, + "loss_ce": 0.02972276508808136, + "loss_xval": 0.91015625, + "num_input_tokens_seen": 101789472, + "step": 589 + }, + { + "epoch": 0.23581135091926458, + "grad_norm": 248.8776432354872, + "learning_rate": 5e-06, + "loss": 1.3982, + "num_input_tokens_seen": 101962248, + "step": 590 + }, + { + "epoch": 0.23581135091926458, + "loss": 1.470240592956543, + "loss_ce": 0.030176982283592224, + "loss_xval": 1.4375, + "num_input_tokens_seen": 101962248, + "step": 590 + }, + { + "epoch": 0.23621103117505995, + "grad_norm": 34.509014167489795, + "learning_rate": 5e-06, + "loss": 0.7077, + "num_input_tokens_seen": 102131336, + "step": 591 + }, + { + "epoch": 0.23621103117505995, + "loss": 0.4956533908843994, + "loss_ce": 0.03294587880373001, + "loss_xval": 0.462890625, + "num_input_tokens_seen": 102131336, + "step": 591 + }, + { + "epoch": 0.23661071143085532, + "grad_norm": 315.0428753524298, + "learning_rate": 5e-06, + "loss": 0.9751, + "num_input_tokens_seen": 102303824, + "step": 592 + }, + { + "epoch": 0.23661071143085532, + "loss": 1.054516077041626, + "loss_ce": 0.03449662774801254, + "loss_xval": 1.0234375, + "num_input_tokens_seen": 102303824, + 
"step": 592 + }, + { + "epoch": 0.23701039168665067, + "grad_norm": 92.7050944230278, + "learning_rate": 5e-06, + "loss": 1.0229, + "num_input_tokens_seen": 102476784, + "step": 593 + }, + { + "epoch": 0.23701039168665067, + "loss": 1.73143470287323, + "loss_ce": 0.03075111284852028, + "loss_xval": 1.703125, + "num_input_tokens_seen": 102476784, + "step": 593 + }, + { + "epoch": 0.23741007194244604, + "grad_norm": 121.2204406389868, + "learning_rate": 5e-06, + "loss": 0.808, + "num_input_tokens_seen": 102649976, + "step": 594 + }, + { + "epoch": 0.23741007194244604, + "loss": 0.783922553062439, + "loss_ce": 0.03868328034877777, + "loss_xval": 0.74609375, + "num_input_tokens_seen": 102649976, + "step": 594 + }, + { + "epoch": 0.2378097521982414, + "grad_norm": 37.963372753565885, + "learning_rate": 5e-06, + "loss": 0.8382, + "num_input_tokens_seen": 102822584, + "step": 595 + }, + { + "epoch": 0.2378097521982414, + "loss": 0.7244482040405273, + "loss_ce": 0.03298095613718033, + "loss_xval": 0.69140625, + "num_input_tokens_seen": 102822584, + "step": 595 + }, + { + "epoch": 0.23820943245403678, + "grad_norm": 169.1225094579379, + "learning_rate": 5e-06, + "loss": 1.0284, + "num_input_tokens_seen": 102995352, + "step": 596 + }, + { + "epoch": 0.23820943245403678, + "loss": 1.4873781204223633, + "loss_ce": 0.03516869992017746, + "loss_xval": 1.453125, + "num_input_tokens_seen": 102995352, + "step": 596 + }, + { + "epoch": 0.23860911270983212, + "grad_norm": 56.84926795707111, + "learning_rate": 5e-06, + "loss": 0.8466, + "num_input_tokens_seen": 103167920, + "step": 597 + }, + { + "epoch": 0.23860911270983212, + "loss": 0.8535851240158081, + "loss_ce": 0.0333947092294693, + "loss_xval": 0.8203125, + "num_input_tokens_seen": 103167920, + "step": 597 + }, + { + "epoch": 0.2390087929656275, + "grad_norm": 245.5535091122652, + "learning_rate": 5e-06, + "loss": 0.8312, + "num_input_tokens_seen": 103340488, + "step": 598 + }, + { + "epoch": 0.2390087929656275, + "loss": 1.069108009338379, + "loss_ce": 0.027115818113088608, + "loss_xval": 1.0390625, + "num_input_tokens_seen": 103340488, + "step": 598 + }, + { + "epoch": 0.23940847322142286, + "grad_norm": 158.88384407024452, + "learning_rate": 5e-06, + "loss": 0.5671, + "num_input_tokens_seen": 103513496, + "step": 599 + }, + { + "epoch": 0.23940847322142286, + "loss": 0.4917399287223816, + "loss_ce": 0.042154960334300995, + "loss_xval": 0.44921875, + "num_input_tokens_seen": 103513496, + "step": 599 + }, + { + "epoch": 0.23980815347721823, + "grad_norm": 215.62794739008515, + "learning_rate": 5e-06, + "loss": 0.8888, + "num_input_tokens_seen": 103686592, + "step": 600 + }, + { + "epoch": 0.23980815347721823, + "loss": 0.6870980262756348, + "loss_ce": 0.024866603314876556, + "loss_xval": 0.6640625, + "num_input_tokens_seen": 103686592, + "step": 600 + }, + { + "epoch": 0.24020783373301358, + "grad_norm": 233.7424673976639, + "learning_rate": 5e-06, + "loss": 1.2323, + "num_input_tokens_seen": 103859464, + "step": 601 + }, + { + "epoch": 0.24020783373301358, + "loss": 1.235499382019043, + "loss_ce": 0.024195652455091476, + "loss_xval": 1.2109375, + "num_input_tokens_seen": 103859464, + "step": 601 + }, + { + "epoch": 0.24060751398880895, + "grad_norm": 191.42755343958055, + "learning_rate": 5e-06, + "loss": 0.907, + "num_input_tokens_seen": 104032384, + "step": 602 + }, + { + "epoch": 0.24060751398880895, + "loss": 0.9421808123588562, + "loss_ce": 0.03629700094461441, + "loss_xval": 0.90625, + "num_input_tokens_seen": 104032384, + "step": 602 + }, + { 
+ "epoch": 0.24100719424460432, + "grad_norm": 308.9412242751799, + "learning_rate": 5e-06, + "loss": 1.2389, + "num_input_tokens_seen": 104205312, + "step": 603 + }, + { + "epoch": 0.24100719424460432, + "loss": 1.3211491107940674, + "loss_ce": 0.02378581464290619, + "loss_xval": 1.296875, + "num_input_tokens_seen": 104205312, + "step": 603 + }, + { + "epoch": 0.2414068745003997, + "grad_norm": 89.11609070631093, + "learning_rate": 5e-06, + "loss": 0.731, + "num_input_tokens_seen": 104377936, + "step": 604 + }, + { + "epoch": 0.2414068745003997, + "loss": 0.8285616040229797, + "loss_ce": 0.0302217286080122, + "loss_xval": 0.796875, + "num_input_tokens_seen": 104377936, + "step": 604 + }, + { + "epoch": 0.24180655475619503, + "grad_norm": 279.00947911880144, + "learning_rate": 5e-06, + "loss": 1.1599, + "num_input_tokens_seen": 104550768, + "step": 605 + }, + { + "epoch": 0.24180655475619503, + "loss": 1.0528864860534668, + "loss_ce": 0.024688273668289185, + "loss_xval": 1.03125, + "num_input_tokens_seen": 104550768, + "step": 605 + }, + { + "epoch": 0.2422062350119904, + "grad_norm": 80.1187354122152, + "learning_rate": 5e-06, + "loss": 0.8784, + "num_input_tokens_seen": 104723680, + "step": 606 + }, + { + "epoch": 0.2422062350119904, + "loss": 0.8573121428489685, + "loss_ce": 0.02589123696088791, + "loss_xval": 0.83203125, + "num_input_tokens_seen": 104723680, + "step": 606 + }, + { + "epoch": 0.24260591526778577, + "grad_norm": 255.05532627961566, + "learning_rate": 5e-06, + "loss": 1.1517, + "num_input_tokens_seen": 104896272, + "step": 607 + }, + { + "epoch": 0.24260591526778577, + "loss": 1.3371992111206055, + "loss_ce": 0.023966766893863678, + "loss_xval": 1.3125, + "num_input_tokens_seen": 104896272, + "step": 607 + }, + { + "epoch": 0.24300559552358114, + "grad_norm": 172.50234084378377, + "learning_rate": 5e-06, + "loss": 1.2108, + "num_input_tokens_seen": 105063968, + "step": 608 + }, + { + "epoch": 0.24300559552358114, + "loss": 1.5010284185409546, + "loss_ce": 0.023855600506067276, + "loss_xval": 1.4765625, + "num_input_tokens_seen": 105063968, + "step": 608 + }, + { + "epoch": 0.2434052757793765, + "grad_norm": 392.0650101886134, + "learning_rate": 5e-06, + "loss": 1.335, + "num_input_tokens_seen": 105237008, + "step": 609 + }, + { + "epoch": 0.2434052757793765, + "loss": 1.3141515254974365, + "loss_ce": 0.026797983795404434, + "loss_xval": 1.2890625, + "num_input_tokens_seen": 105237008, + "step": 609 + }, + { + "epoch": 0.24380495603517185, + "grad_norm": 61.45983002680283, + "learning_rate": 5e-06, + "loss": 0.8797, + "num_input_tokens_seen": 105409768, + "step": 610 + }, + { + "epoch": 0.24380495603517185, + "loss": 1.0218791961669922, + "loss_ce": 0.027372296899557114, + "loss_xval": 0.99609375, + "num_input_tokens_seen": 105409768, + "step": 610 + }, + { + "epoch": 0.24420463629096723, + "grad_norm": 301.269151881736, + "learning_rate": 5e-06, + "loss": 0.9493, + "num_input_tokens_seen": 105582552, + "step": 611 + }, + { + "epoch": 0.24420463629096723, + "loss": 1.1783733367919922, + "loss_ce": 0.026517830789089203, + "loss_xval": 1.1484375, + "num_input_tokens_seen": 105582552, + "step": 611 + }, + { + "epoch": 0.2446043165467626, + "grad_norm": 111.22066209378004, + "learning_rate": 5e-06, + "loss": 1.1837, + "num_input_tokens_seen": 105755840, + "step": 612 + }, + { + "epoch": 0.2446043165467626, + "loss": 1.2885253429412842, + "loss_ce": 0.032177697867155075, + "loss_xval": 1.2578125, + "num_input_tokens_seen": 105755840, + "step": 612 + }, + { + "epoch": 
0.24500399680255797, + "grad_norm": 212.8848788943127, + "learning_rate": 5e-06, + "loss": 0.9066, + "num_input_tokens_seen": 105928728, + "step": 613 + }, + { + "epoch": 0.24500399680255797, + "loss": 1.120781660079956, + "loss_ce": 0.024956412613391876, + "loss_xval": 1.09375, + "num_input_tokens_seen": 105928728, + "step": 613 + }, + { + "epoch": 0.2454036770583533, + "grad_norm": 103.49456362295976, + "learning_rate": 5e-06, + "loss": 0.8015, + "num_input_tokens_seen": 106101936, + "step": 614 + }, + { + "epoch": 0.2454036770583533, + "loss": 0.8075883984565735, + "loss_ce": 0.026216331869363785, + "loss_xval": 0.78125, + "num_input_tokens_seen": 106101936, + "step": 614 + }, + { + "epoch": 0.24580335731414868, + "grad_norm": 248.325610563766, + "learning_rate": 5e-06, + "loss": 1.5101, + "num_input_tokens_seen": 106275008, + "step": 615 + }, + { + "epoch": 0.24580335731414868, + "loss": 1.5115883350372314, + "loss_ce": 0.02550426870584488, + "loss_xval": 1.484375, + "num_input_tokens_seen": 106275008, + "step": 615 + }, + { + "epoch": 0.24620303756994405, + "grad_norm": 167.96429221700566, + "learning_rate": 5e-06, + "loss": 1.2687, + "num_input_tokens_seen": 106447848, + "step": 616 + }, + { + "epoch": 0.24620303756994405, + "loss": 1.5167852640151978, + "loss_ce": 0.03192197158932686, + "loss_xval": 1.484375, + "num_input_tokens_seen": 106447848, + "step": 616 + }, + { + "epoch": 0.24660271782573942, + "grad_norm": 298.4988658186804, + "learning_rate": 5e-06, + "loss": 1.4475, + "num_input_tokens_seen": 106620640, + "step": 617 + }, + { + "epoch": 0.24660271782573942, + "loss": 1.7282776832580566, + "loss_ce": 0.03162240982055664, + "loss_xval": 1.6953125, + "num_input_tokens_seen": 106620640, + "step": 617 + }, + { + "epoch": 0.24700239808153476, + "grad_norm": 219.23244811627757, + "learning_rate": 5e-06, + "loss": 1.3819, + "num_input_tokens_seen": 106793448, + "step": 618 + }, + { + "epoch": 0.24700239808153476, + "loss": 0.7871071696281433, + "loss_ce": 0.026853220537304878, + "loss_xval": 0.76171875, + "num_input_tokens_seen": 106793448, + "step": 618 + }, + { + "epoch": 0.24740207833733013, + "grad_norm": 303.82496611640624, + "learning_rate": 5e-06, + "loss": 1.0851, + "num_input_tokens_seen": 106966416, + "step": 619 + }, + { + "epoch": 0.24740207833733013, + "loss": 1.2948808670043945, + "loss_ce": 0.02705862559378147, + "loss_xval": 1.265625, + "num_input_tokens_seen": 106966416, + "step": 619 + }, + { + "epoch": 0.2478017585931255, + "grad_norm": 169.45479385596363, + "learning_rate": 5e-06, + "loss": 1.7907, + "num_input_tokens_seen": 107139480, + "step": 620 + }, + { + "epoch": 0.2478017585931255, + "loss": 1.1137454509735107, + "loss_ce": 0.03098176047205925, + "loss_xval": 1.0859375, + "num_input_tokens_seen": 107139480, + "step": 620 + }, + { + "epoch": 0.24820143884892087, + "grad_norm": 280.20960363263634, + "learning_rate": 5e-06, + "loss": 1.0557, + "num_input_tokens_seen": 107312464, + "step": 621 + }, + { + "epoch": 0.24820143884892087, + "loss": 1.1125739812850952, + "loss_ce": 0.031031014397740364, + "loss_xval": 1.078125, + "num_input_tokens_seen": 107312464, + "step": 621 + }, + { + "epoch": 0.24860111910471622, + "grad_norm": 52.29310753242722, + "learning_rate": 5e-06, + "loss": 0.8947, + "num_input_tokens_seen": 107485336, + "step": 622 + }, + { + "epoch": 0.24860111910471622, + "loss": 0.861524760723114, + "loss_ce": 0.03401007875800133, + "loss_xval": 0.828125, + "num_input_tokens_seen": 107485336, + "step": 622 + }, + { + "epoch": 0.2490007993605116, 
+ "grad_norm": 326.1027649366469, + "learning_rate": 5e-06, + "loss": 1.1985, + "num_input_tokens_seen": 107658200, + "step": 623 + }, + { + "epoch": 0.2490007993605116, + "loss": 1.141750693321228, + "loss_ce": 0.031765300780534744, + "loss_xval": 1.109375, + "num_input_tokens_seen": 107658200, + "step": 623 + }, + { + "epoch": 0.24940047961630696, + "grad_norm": 38.95977192063135, + "learning_rate": 5e-06, + "loss": 0.7603, + "num_input_tokens_seen": 107831376, + "step": 624 + }, + { + "epoch": 0.24940047961630696, + "loss": 0.727970540523529, + "loss_ce": 0.02875177562236786, + "loss_xval": 0.69921875, + "num_input_tokens_seen": 107831376, + "step": 624 + }, + { + "epoch": 0.24980015987210233, + "grad_norm": 252.07183773955228, + "learning_rate": 5e-06, + "loss": 1.5353, + "num_input_tokens_seen": 108004032, + "step": 625 + }, + { + "epoch": 0.24980015987210233, + "loss": 1.5708106756210327, + "loss_ce": 0.04151376336812973, + "loss_xval": 1.53125, + "num_input_tokens_seen": 108004032, + "step": 625 + }, + { + "epoch": 0.2501998401278977, + "grad_norm": 123.5509465218223, + "learning_rate": 5e-06, + "loss": 0.8309, + "num_input_tokens_seen": 108176984, + "step": 626 + }, + { + "epoch": 0.2501998401278977, + "loss": 0.5989866256713867, + "loss_ce": 0.031481776386499405, + "loss_xval": 0.56640625, + "num_input_tokens_seen": 108176984, + "step": 626 + }, + { + "epoch": 0.25059952038369304, + "grad_norm": 318.6509948375151, + "learning_rate": 5e-06, + "loss": 1.5763, + "num_input_tokens_seen": 108350144, + "step": 627 + }, + { + "epoch": 0.25059952038369304, + "loss": 2.1137542724609375, + "loss_ce": 0.03538517281413078, + "loss_xval": 2.078125, + "num_input_tokens_seen": 108350144, + "step": 627 + }, + { + "epoch": 0.2509992006394884, + "grad_norm": 61.521784410197895, + "learning_rate": 5e-06, + "loss": 0.9835, + "num_input_tokens_seen": 108523376, + "step": 628 + }, + { + "epoch": 0.2509992006394884, + "loss": 0.8479458093643188, + "loss_ce": 0.03227199614048004, + "loss_xval": 0.81640625, + "num_input_tokens_seen": 108523376, + "step": 628 + }, + { + "epoch": 0.2513988808952838, + "grad_norm": 126.59752275575777, + "learning_rate": 5e-06, + "loss": 1.0511, + "num_input_tokens_seen": 108696208, + "step": 629 + }, + { + "epoch": 0.2513988808952838, + "loss": 1.2120076417922974, + "loss_ce": 0.02963467314839363, + "loss_xval": 1.1796875, + "num_input_tokens_seen": 108696208, + "step": 629 + }, + { + "epoch": 0.2517985611510791, + "grad_norm": 83.31660909579162, + "learning_rate": 5e-06, + "loss": 0.9889, + "num_input_tokens_seen": 108869176, + "step": 630 + }, + { + "epoch": 0.2517985611510791, + "loss": 1.004129409790039, + "loss_ce": 0.03550145775079727, + "loss_xval": 0.96875, + "num_input_tokens_seen": 108869176, + "step": 630 + }, + { + "epoch": 0.2521982414068745, + "grad_norm": 114.49495819012182, + "learning_rate": 5e-06, + "loss": 0.8974, + "num_input_tokens_seen": 109042280, + "step": 631 + }, + { + "epoch": 0.2521982414068745, + "loss": 1.1485271453857422, + "loss_ce": 0.0317058339715004, + "loss_xval": 1.1171875, + "num_input_tokens_seen": 109042280, + "step": 631 + }, + { + "epoch": 0.25259792166266987, + "grad_norm": 158.50373331009686, + "learning_rate": 5e-06, + "loss": 0.9505, + "num_input_tokens_seen": 109215320, + "step": 632 + }, + { + "epoch": 0.25259792166266987, + "loss": 0.7758920192718506, + "loss_ce": 0.022229932248592377, + "loss_xval": 0.75390625, + "num_input_tokens_seen": 109215320, + "step": 632 + }, + { + "epoch": 0.2529976019184652, + "grad_norm": 
124.3150224300894, + "learning_rate": 5e-06, + "loss": 1.041, + "num_input_tokens_seen": 109388376, + "step": 633 + }, + { + "epoch": 0.2529976019184652, + "loss": 0.7999504804611206, + "loss_ce": 0.022484708577394485, + "loss_xval": 0.77734375, + "num_input_tokens_seen": 109388376, + "step": 633 + }, + { + "epoch": 0.2533972821742606, + "grad_norm": 153.8473346571185, + "learning_rate": 5e-06, + "loss": 1.0814, + "num_input_tokens_seen": 109561424, + "step": 634 + }, + { + "epoch": 0.2533972821742606, + "loss": 0.7413469552993774, + "loss_ce": 0.02229173481464386, + "loss_xval": 0.71875, + "num_input_tokens_seen": 109561424, + "step": 634 + }, + { + "epoch": 0.25379696243005595, + "grad_norm": 123.4116823206711, + "learning_rate": 5e-06, + "loss": 0.6317, + "num_input_tokens_seen": 109734584, + "step": 635 + }, + { + "epoch": 0.25379696243005595, + "loss": 0.49806663393974304, + "loss_ce": 0.024433817714452744, + "loss_xval": 0.47265625, + "num_input_tokens_seen": 109734584, + "step": 635 + }, + { + "epoch": 0.2541966426858513, + "grad_norm": 155.95322284214282, + "learning_rate": 5e-06, + "loss": 0.9948, + "num_input_tokens_seen": 109907808, + "step": 636 + }, + { + "epoch": 0.2541966426858513, + "loss": 0.8807600736618042, + "loss_ce": 0.022056490182876587, + "loss_xval": 0.859375, + "num_input_tokens_seen": 109907808, + "step": 636 + }, + { + "epoch": 0.2545963229416467, + "grad_norm": 36.34285927695085, + "learning_rate": 5e-06, + "loss": 0.7191, + "num_input_tokens_seen": 110080336, + "step": 637 + }, + { + "epoch": 0.2545963229416467, + "loss": 0.7213298678398132, + "loss_ce": 0.021866969764232635, + "loss_xval": 0.69921875, + "num_input_tokens_seen": 110080336, + "step": 637 + }, + { + "epoch": 0.25499600319744203, + "grad_norm": 40.14733221133789, + "learning_rate": 5e-06, + "loss": 0.5536, + "num_input_tokens_seen": 110253400, + "step": 638 + }, + { + "epoch": 0.25499600319744203, + "loss": 0.7994478940963745, + "loss_ce": 0.017709653824567795, + "loss_xval": 0.78125, + "num_input_tokens_seen": 110253400, + "step": 638 + }, + { + "epoch": 0.25539568345323743, + "grad_norm": 123.57240267301728, + "learning_rate": 5e-06, + "loss": 0.9722, + "num_input_tokens_seen": 110426568, + "step": 639 + }, + { + "epoch": 0.25539568345323743, + "loss": 0.8483471274375916, + "loss_ce": 0.014118612743914127, + "loss_xval": 0.8359375, + "num_input_tokens_seen": 110426568, + "step": 639 + }, + { + "epoch": 0.2557953637090328, + "grad_norm": 75.02054811289982, + "learning_rate": 5e-06, + "loss": 0.7891, + "num_input_tokens_seen": 110599520, + "step": 640 + }, + { + "epoch": 0.2557953637090328, + "loss": 0.7000423669815063, + "loss_ce": 0.023406604304909706, + "loss_xval": 0.67578125, + "num_input_tokens_seen": 110599520, + "step": 640 + }, + { + "epoch": 0.2561950439648281, + "grad_norm": 91.74212057215723, + "learning_rate": 5e-06, + "loss": 1.2231, + "num_input_tokens_seen": 110772552, + "step": 641 + }, + { + "epoch": 0.2561950439648281, + "loss": 1.4409394264221191, + "loss_ce": 0.014792068861424923, + "loss_xval": 1.4296875, + "num_input_tokens_seen": 110772552, + "step": 641 + }, + { + "epoch": 0.2565947242206235, + "grad_norm": 157.31170807411007, + "learning_rate": 5e-06, + "loss": 0.8605, + "num_input_tokens_seen": 110945136, + "step": 642 + }, + { + "epoch": 0.2565947242206235, + "loss": 0.8512501120567322, + "loss_ce": 0.015312610194087029, + "loss_xval": 0.8359375, + "num_input_tokens_seen": 110945136, + "step": 642 + }, + { + "epoch": 0.25699440447641886, + "grad_norm": 
58.06363263841326, + "learning_rate": 5e-06, + "loss": 1.0306, + "num_input_tokens_seen": 111118304, + "step": 643 + }, + { + "epoch": 0.25699440447641886, + "loss": 1.2408883571624756, + "loss_ce": 0.014325831085443497, + "loss_xval": 1.2265625, + "num_input_tokens_seen": 111118304, + "step": 643 + }, + { + "epoch": 0.2573940847322142, + "grad_norm": 83.05291570147797, + "learning_rate": 5e-06, + "loss": 0.8975, + "num_input_tokens_seen": 111291296, + "step": 644 + }, + { + "epoch": 0.2573940847322142, + "loss": 0.860167920589447, + "loss_ce": 0.012267546728253365, + "loss_xval": 0.84765625, + "num_input_tokens_seen": 111291296, + "step": 644 + }, + { + "epoch": 0.2577937649880096, + "grad_norm": 106.75725756616248, + "learning_rate": 5e-06, + "loss": 0.7986, + "num_input_tokens_seen": 111464416, + "step": 645 + }, + { + "epoch": 0.2577937649880096, + "loss": 0.8707510828971863, + "loss_ce": 0.014671968296170235, + "loss_xval": 0.85546875, + "num_input_tokens_seen": 111464416, + "step": 645 + }, + { + "epoch": 0.25819344524380494, + "grad_norm": 65.34641583085333, + "learning_rate": 5e-06, + "loss": 0.892, + "num_input_tokens_seen": 111637288, + "step": 646 + }, + { + "epoch": 0.25819344524380494, + "loss": 0.8701699376106262, + "loss_ce": 0.022757841274142265, + "loss_xval": 0.84765625, + "num_input_tokens_seen": 111637288, + "step": 646 + }, + { + "epoch": 0.25859312549960034, + "grad_norm": 69.58421071925531, + "learning_rate": 5e-06, + "loss": 0.7172, + "num_input_tokens_seen": 111810496, + "step": 647 + }, + { + "epoch": 0.25859312549960034, + "loss": 0.6108307242393494, + "loss_ce": 0.0166535172611475, + "loss_xval": 0.59375, + "num_input_tokens_seen": 111810496, + "step": 647 + }, + { + "epoch": 0.2589928057553957, + "grad_norm": 60.44211136131847, + "learning_rate": 5e-06, + "loss": 0.8919, + "num_input_tokens_seen": 111983536, + "step": 648 + }, + { + "epoch": 0.2589928057553957, + "loss": 1.001638412475586, + "loss_ce": 0.021535882726311684, + "loss_xval": 0.98046875, + "num_input_tokens_seen": 111983536, + "step": 648 + }, + { + "epoch": 0.259392486011191, + "grad_norm": 37.92595630946876, + "learning_rate": 5e-06, + "loss": 1.3082, + "num_input_tokens_seen": 112156504, + "step": 649 + }, + { + "epoch": 0.259392486011191, + "loss": 1.5613832473754883, + "loss_ce": 0.020733918994665146, + "loss_xval": 1.5390625, + "num_input_tokens_seen": 112156504, + "step": 649 + }, + { + "epoch": 0.2597921662669864, + "grad_norm": 28.670635237631853, + "learning_rate": 5e-06, + "loss": 1.019, + "num_input_tokens_seen": 112329664, + "step": 650 + }, + { + "epoch": 0.2597921662669864, + "loss": 1.3567280769348145, + "loss_ce": 0.021767208352684975, + "loss_xval": 1.3359375, + "num_input_tokens_seen": 112329664, + "step": 650 + }, + { + "epoch": 0.26019184652278177, + "grad_norm": 62.41074023562404, + "learning_rate": 5e-06, + "loss": 0.9244, + "num_input_tokens_seen": 112502960, + "step": 651 + }, + { + "epoch": 0.26019184652278177, + "loss": 0.9109457731246948, + "loss_ce": 0.013728970661759377, + "loss_xval": 0.8984375, + "num_input_tokens_seen": 112502960, + "step": 651 + }, + { + "epoch": 0.26059152677857716, + "grad_norm": 45.00813129699106, + "learning_rate": 5e-06, + "loss": 0.8637, + "num_input_tokens_seen": 112675584, + "step": 652 + }, + { + "epoch": 0.26059152677857716, + "loss": 0.6911357045173645, + "loss_ce": 0.02640179917216301, + "loss_xval": 0.6640625, + "num_input_tokens_seen": 112675584, + "step": 652 + }, + { + "epoch": 0.2609912070343725, + "grad_norm": 136.9051208047938, + 
"learning_rate": 5e-06, + "loss": 0.947, + "num_input_tokens_seen": 112848712, + "step": 653 + }, + { + "epoch": 0.2609912070343725, + "loss": 1.220213770866394, + "loss_ce": 0.015074612572789192, + "loss_xval": 1.203125, + "num_input_tokens_seen": 112848712, + "step": 653 + }, + { + "epoch": 0.26139088729016785, + "grad_norm": 191.51181392428464, + "learning_rate": 5e-06, + "loss": 1.027, + "num_input_tokens_seen": 113021672, + "step": 654 + }, + { + "epoch": 0.26139088729016785, + "loss": 0.8155316114425659, + "loss_ce": 0.02170836180448532, + "loss_xval": 0.79296875, + "num_input_tokens_seen": 113021672, + "step": 654 + }, + { + "epoch": 0.26179056754596325, + "grad_norm": 59.47364063857337, + "learning_rate": 5e-06, + "loss": 0.9311, + "num_input_tokens_seen": 113194640, + "step": 655 + }, + { + "epoch": 0.26179056754596325, + "loss": 0.7021055221557617, + "loss_ce": 0.0113096684217453, + "loss_xval": 0.69140625, + "num_input_tokens_seen": 113194640, + "step": 655 + }, + { + "epoch": 0.2621902478017586, + "grad_norm": 299.2925607489183, + "learning_rate": 5e-06, + "loss": 1.1994, + "num_input_tokens_seen": 113367520, + "step": 656 + }, + { + "epoch": 0.2621902478017586, + "loss": 1.1251329183578491, + "loss_ce": 0.012095760554075241, + "loss_xval": 1.109375, + "num_input_tokens_seen": 113367520, + "step": 656 + }, + { + "epoch": 0.26258992805755393, + "grad_norm": 152.9578503535034, + "learning_rate": 5e-06, + "loss": 0.6672, + "num_input_tokens_seen": 113540536, + "step": 657 + }, + { + "epoch": 0.26258992805755393, + "loss": 0.8924187421798706, + "loss_ce": 0.01619800738990307, + "loss_xval": 0.875, + "num_input_tokens_seen": 113540536, + "step": 657 + }, + { + "epoch": 0.26298960831334933, + "grad_norm": 174.0581314265997, + "learning_rate": 5e-06, + "loss": 1.0259, + "num_input_tokens_seen": 113713200, + "step": 658 + }, + { + "epoch": 0.26298960831334933, + "loss": 1.100234031677246, + "loss_ce": 0.011733030900359154, + "loss_xval": 1.0859375, + "num_input_tokens_seen": 113713200, + "step": 658 + }, + { + "epoch": 0.2633892885691447, + "grad_norm": 84.02652993262616, + "learning_rate": 5e-06, + "loss": 0.7989, + "num_input_tokens_seen": 113886064, + "step": 659 + }, + { + "epoch": 0.2633892885691447, + "loss": 0.5299695730209351, + "loss_ce": 0.014466674998402596, + "loss_xval": 0.515625, + "num_input_tokens_seen": 113886064, + "step": 659 + }, + { + "epoch": 0.2637889688249401, + "grad_norm": 236.5341573563948, + "learning_rate": 5e-06, + "loss": 1.0824, + "num_input_tokens_seen": 114058936, + "step": 660 + }, + { + "epoch": 0.2637889688249401, + "loss": 0.8847863674163818, + "loss_ce": 0.013448446989059448, + "loss_xval": 0.87109375, + "num_input_tokens_seen": 114058936, + "step": 660 + }, + { + "epoch": 0.2641886490807354, + "grad_norm": 169.64090644183318, + "learning_rate": 5e-06, + "loss": 1.0582, + "num_input_tokens_seen": 114231344, + "step": 661 + }, + { + "epoch": 0.2641886490807354, + "loss": 1.3163893222808838, + "loss_ce": 0.01853768527507782, + "loss_xval": 1.296875, + "num_input_tokens_seen": 114231344, + "step": 661 + }, + { + "epoch": 0.26458832933653076, + "grad_norm": 175.88382335589282, + "learning_rate": 5e-06, + "loss": 0.6119, + "num_input_tokens_seen": 114404496, + "step": 662 + }, + { + "epoch": 0.26458832933653076, + "loss": 0.6016393899917603, + "loss_ce": 0.019119868054986, + "loss_xval": 0.58203125, + "num_input_tokens_seen": 114404496, + "step": 662 + }, + { + "epoch": 0.26498800959232616, + "grad_norm": 195.8103408122979, + "learning_rate": 5e-06, + 
"loss": 1.1647, + "num_input_tokens_seen": 114577520, + "step": 663 + }, + { + "epoch": 0.26498800959232616, + "loss": 0.993999719619751, + "loss_ce": 0.018291711807250977, + "loss_xval": 0.9765625, + "num_input_tokens_seen": 114577520, + "step": 663 + }, + { + "epoch": 0.2653876898481215, + "grad_norm": 138.23767892074832, + "learning_rate": 5e-06, + "loss": 1.0253, + "num_input_tokens_seen": 114750672, + "step": 664 + }, + { + "epoch": 0.2653876898481215, + "loss": 1.1113959550857544, + "loss_ce": 0.018378403037786484, + "loss_xval": 1.09375, + "num_input_tokens_seen": 114750672, + "step": 664 + }, + { + "epoch": 0.26578737010391684, + "grad_norm": 190.91695554448776, + "learning_rate": 5e-06, + "loss": 1.0631, + "num_input_tokens_seen": 114923496, + "step": 665 + }, + { + "epoch": 0.26578737010391684, + "loss": 0.6147146821022034, + "loss_ce": 0.024016443639993668, + "loss_xval": 0.58984375, + "num_input_tokens_seen": 114923496, + "step": 665 + }, + { + "epoch": 0.26618705035971224, + "grad_norm": 102.45689102808429, + "learning_rate": 5e-06, + "loss": 0.8536, + "num_input_tokens_seen": 115096112, + "step": 666 + }, + { + "epoch": 0.26618705035971224, + "loss": 0.6601771116256714, + "loss_ce": 0.0206507109105587, + "loss_xval": 0.640625, + "num_input_tokens_seen": 115096112, + "step": 666 + }, + { + "epoch": 0.2665867306155076, + "grad_norm": 175.7561493345075, + "learning_rate": 5e-06, + "loss": 0.9328, + "num_input_tokens_seen": 115269536, + "step": 667 + }, + { + "epoch": 0.2665867306155076, + "loss": 1.2699246406555176, + "loss_ce": 0.029201963916420937, + "loss_xval": 1.2421875, + "num_input_tokens_seen": 115269536, + "step": 667 + }, + { + "epoch": 0.266986410871303, + "grad_norm": 75.22857527978371, + "learning_rate": 5e-06, + "loss": 0.661, + "num_input_tokens_seen": 115442560, + "step": 668 + }, + { + "epoch": 0.266986410871303, + "loss": 0.6588489413261414, + "loss_ce": 0.030308909714221954, + "loss_xval": 0.62890625, + "num_input_tokens_seen": 115442560, + "step": 668 + }, + { + "epoch": 0.2673860911270983, + "grad_norm": 146.67725635888027, + "learning_rate": 5e-06, + "loss": 0.968, + "num_input_tokens_seen": 115615360, + "step": 669 + }, + { + "epoch": 0.2673860911270983, + "loss": 1.12626314163208, + "loss_ce": 0.026165474206209183, + "loss_xval": 1.1015625, + "num_input_tokens_seen": 115615360, + "step": 669 + }, + { + "epoch": 0.26778577138289367, + "grad_norm": 133.73687162857127, + "learning_rate": 5e-06, + "loss": 0.8349, + "num_input_tokens_seen": 115788160, + "step": 670 + }, + { + "epoch": 0.26778577138289367, + "loss": 1.1909589767456055, + "loss_ce": 0.02347848378121853, + "loss_xval": 1.1640625, + "num_input_tokens_seen": 115788160, + "step": 670 + }, + { + "epoch": 0.26818545163868907, + "grad_norm": 270.0264431031233, + "learning_rate": 5e-06, + "loss": 1.0488, + "num_input_tokens_seen": 115960856, + "step": 671 + }, + { + "epoch": 0.26818545163868907, + "loss": 0.9442439675331116, + "loss_ce": 0.03384360671043396, + "loss_xval": 0.91015625, + "num_input_tokens_seen": 115960856, + "step": 671 + }, + { + "epoch": 0.2685851318944844, + "grad_norm": 118.67248768897453, + "learning_rate": 5e-06, + "loss": 0.9384, + "num_input_tokens_seen": 116133480, + "step": 672 + }, + { + "epoch": 0.2685851318944844, + "loss": 0.7503706216812134, + "loss_ce": 0.023441843688488007, + "loss_xval": 0.7265625, + "num_input_tokens_seen": 116133480, + "step": 672 + }, + { + "epoch": 0.26898481215027975, + "grad_norm": 410.22210699900944, + "learning_rate": 5e-06, + "loss": 0.984, + 
"num_input_tokens_seen": 116306376, + "step": 673 + }, + { + "epoch": 0.26898481215027975, + "loss": 1.0442287921905518, + "loss_ce": 0.017373330891132355, + "loss_xval": 1.0234375, + "num_input_tokens_seen": 116306376, + "step": 673 + }, + { + "epoch": 0.26938449240607515, + "grad_norm": 49.025436414803835, + "learning_rate": 5e-06, + "loss": 1.0499, + "num_input_tokens_seen": 116479536, + "step": 674 + }, + { + "epoch": 0.26938449240607515, + "loss": 1.7757625579833984, + "loss_ce": 0.01941489428281784, + "loss_xval": 1.7578125, + "num_input_tokens_seen": 116479536, + "step": 674 + }, + { + "epoch": 0.2697841726618705, + "grad_norm": 413.2558380877526, + "learning_rate": 5e-06, + "loss": 0.8974, + "num_input_tokens_seen": 116652840, + "step": 675 + }, + { + "epoch": 0.2697841726618705, + "loss": 1.0395095348358154, + "loss_ce": 0.024128668010234833, + "loss_xval": 1.015625, + "num_input_tokens_seen": 116652840, + "step": 675 + }, + { + "epoch": 0.2701838529176659, + "grad_norm": 121.24926053527112, + "learning_rate": 5e-06, + "loss": 1.2369, + "num_input_tokens_seen": 116825496, + "step": 676 + }, + { + "epoch": 0.2701838529176659, + "loss": 1.2461820840835571, + "loss_ce": 0.0210844948887825, + "loss_xval": 1.2265625, + "num_input_tokens_seen": 116825496, + "step": 676 + }, + { + "epoch": 0.27058353317346123, + "grad_norm": 268.47864300466, + "learning_rate": 5e-06, + "loss": 0.9889, + "num_input_tokens_seen": 116998824, + "step": 677 + }, + { + "epoch": 0.27058353317346123, + "loss": 0.8895606994628906, + "loss_ce": 0.021884921938180923, + "loss_xval": 0.8671875, + "num_input_tokens_seen": 116998824, + "step": 677 + }, + { + "epoch": 0.2709832134292566, + "grad_norm": 64.88427270509409, + "learning_rate": 5e-06, + "loss": 0.6857, + "num_input_tokens_seen": 117171936, + "step": 678 + }, + { + "epoch": 0.2709832134292566, + "loss": 0.778445839881897, + "loss_ce": 0.022342335432767868, + "loss_xval": 0.7578125, + "num_input_tokens_seen": 117171936, + "step": 678 + }, + { + "epoch": 0.271382893685052, + "grad_norm": 166.62187626988478, + "learning_rate": 5e-06, + "loss": 0.6886, + "num_input_tokens_seen": 117344920, + "step": 679 + }, + { + "epoch": 0.271382893685052, + "loss": 0.4085671603679657, + "loss_ce": 0.023191187530755997, + "loss_xval": 0.384765625, + "num_input_tokens_seen": 117344920, + "step": 679 + }, + { + "epoch": 0.2717825739408473, + "grad_norm": 52.26940993035468, + "learning_rate": 5e-06, + "loss": 1.0123, + "num_input_tokens_seen": 117517976, + "step": 680 + }, + { + "epoch": 0.2717825739408473, + "loss": 1.3288424015045166, + "loss_ce": 0.02269016206264496, + "loss_xval": 1.3046875, + "num_input_tokens_seen": 117517976, + "step": 680 + }, + { + "epoch": 0.27218225419664266, + "grad_norm": 142.4492173509578, + "learning_rate": 5e-06, + "loss": 0.7816, + "num_input_tokens_seen": 117690720, + "step": 681 + }, + { + "epoch": 0.27218225419664266, + "loss": 0.7112863063812256, + "loss_ce": 0.026471804827451706, + "loss_xval": 0.68359375, + "num_input_tokens_seen": 117690720, + "step": 681 + }, + { + "epoch": 0.27258193445243806, + "grad_norm": 70.93340773235381, + "learning_rate": 5e-06, + "loss": 0.8648, + "num_input_tokens_seen": 117863704, + "step": 682 + }, + { + "epoch": 0.27258193445243806, + "loss": 1.0744261741638184, + "loss_ce": 0.02364499494433403, + "loss_xval": 1.046875, + "num_input_tokens_seen": 117863704, + "step": 682 + }, + { + "epoch": 0.2729816147082334, + "grad_norm": 65.75630002146995, + "learning_rate": 5e-06, + "loss": 0.8312, + 
"num_input_tokens_seen": 118036656, + "step": 683 + }, + { + "epoch": 0.2729816147082334, + "loss": 0.5977785587310791, + "loss_ce": 0.023987047374248505, + "loss_xval": 0.57421875, + "num_input_tokens_seen": 118036656, + "step": 683 + }, + { + "epoch": 0.2733812949640288, + "grad_norm": 54.55328598333207, + "learning_rate": 5e-06, + "loss": 0.5361, + "num_input_tokens_seen": 118209616, + "step": 684 + }, + { + "epoch": 0.2733812949640288, + "loss": 0.4402380585670471, + "loss_ce": 0.01866826042532921, + "loss_xval": 0.421875, + "num_input_tokens_seen": 118209616, + "step": 684 + }, + { + "epoch": 0.27378097521982414, + "grad_norm": 72.43434827664534, + "learning_rate": 5e-06, + "loss": 0.8812, + "num_input_tokens_seen": 118382624, + "step": 685 + }, + { + "epoch": 0.27378097521982414, + "loss": 0.9849303364753723, + "loss_ce": 0.023275673389434814, + "loss_xval": 0.9609375, + "num_input_tokens_seen": 118382624, + "step": 685 + }, + { + "epoch": 0.2741806554756195, + "grad_norm": 65.98449745999218, + "learning_rate": 5e-06, + "loss": 0.9268, + "num_input_tokens_seen": 118555344, + "step": 686 + }, + { + "epoch": 0.2741806554756195, + "loss": 1.1211669445037842, + "loss_ce": 0.021313386037945747, + "loss_xval": 1.1015625, + "num_input_tokens_seen": 118555344, + "step": 686 + }, + { + "epoch": 0.2745803357314149, + "grad_norm": 63.9892841046574, + "learning_rate": 5e-06, + "loss": 0.8567, + "num_input_tokens_seen": 118728288, + "step": 687 + }, + { + "epoch": 0.2745803357314149, + "loss": 0.9254956245422363, + "loss_ce": 0.020466340705752373, + "loss_xval": 0.90625, + "num_input_tokens_seen": 118728288, + "step": 687 + }, + { + "epoch": 0.2749800159872102, + "grad_norm": 56.42397672481398, + "learning_rate": 5e-06, + "loss": 0.9393, + "num_input_tokens_seen": 118900712, + "step": 688 + }, + { + "epoch": 0.2749800159872102, + "loss": 0.8299415111541748, + "loss_ce": 0.01860113814473152, + "loss_xval": 0.8125, + "num_input_tokens_seen": 118900712, + "step": 688 + }, + { + "epoch": 0.2753796962430056, + "grad_norm": 91.65977622605617, + "learning_rate": 5e-06, + "loss": 0.8144, + "num_input_tokens_seen": 119073720, + "step": 689 + }, + { + "epoch": 0.2753796962430056, + "loss": 0.9090508818626404, + "loss_ce": 0.016350697726011276, + "loss_xval": 0.89453125, + "num_input_tokens_seen": 119073720, + "step": 689 + }, + { + "epoch": 0.27577937649880097, + "grad_norm": 39.025883096673645, + "learning_rate": 5e-06, + "loss": 0.6471, + "num_input_tokens_seen": 119246792, + "step": 690 + }, + { + "epoch": 0.27577937649880097, + "loss": 0.6969112157821655, + "loss_ce": 0.014599241316318512, + "loss_xval": 0.68359375, + "num_input_tokens_seen": 119246792, + "step": 690 + }, + { + "epoch": 0.2761790567545963, + "grad_norm": 36.73841496397389, + "learning_rate": 5e-06, + "loss": 0.6317, + "num_input_tokens_seen": 119419560, + "step": 691 + }, + { + "epoch": 0.2761790567545963, + "loss": 0.9147318005561829, + "loss_ce": 0.012082915753126144, + "loss_xval": 0.90234375, + "num_input_tokens_seen": 119419560, + "step": 691 + }, + { + "epoch": 0.2765787370103917, + "grad_norm": 168.6230690419483, + "learning_rate": 5e-06, + "loss": 0.7357, + "num_input_tokens_seen": 119592824, + "step": 692 + }, + { + "epoch": 0.2765787370103917, + "loss": 0.6877519488334656, + "loss_ce": 0.014412128366529942, + "loss_xval": 0.671875, + "num_input_tokens_seen": 119592824, + "step": 692 + }, + { + "epoch": 0.27697841726618705, + "grad_norm": 52.75849147183071, + "learning_rate": 5e-06, + "loss": 0.7668, + "num_input_tokens_seen": 
119762056, + "step": 693 + }, + { + "epoch": 0.27697841726618705, + "loss": 0.33476799726486206, + "loss_ce": 0.009755777195096016, + "loss_xval": 0.32421875, + "num_input_tokens_seen": 119762056, + "step": 693 + }, + { + "epoch": 0.2773780975219824, + "grad_norm": 131.1361500389134, + "learning_rate": 5e-06, + "loss": 0.9371, + "num_input_tokens_seen": 119935104, + "step": 694 + }, + { + "epoch": 0.2773780975219824, + "loss": 1.146599531173706, + "loss_ce": 0.009636607021093369, + "loss_xval": 1.140625, + "num_input_tokens_seen": 119935104, + "step": 694 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 47.6259432845342, + "learning_rate": 5e-06, + "loss": 1.0213, + "num_input_tokens_seen": 120107776, + "step": 695 + }, + { + "epoch": 0.2777777777777778, + "loss": 0.7354253530502319, + "loss_ce": 0.011792542412877083, + "loss_xval": 0.72265625, + "num_input_tokens_seen": 120107776, + "step": 695 + }, + { + "epoch": 0.27817745803357313, + "grad_norm": 291.1641874227151, + "learning_rate": 5e-06, + "loss": 0.7937, + "num_input_tokens_seen": 120280744, + "step": 696 + }, + { + "epoch": 0.27817745803357313, + "loss": 0.6080259084701538, + "loss_ce": 0.01232281606644392, + "loss_xval": 0.59375, + "num_input_tokens_seen": 120280744, + "step": 696 + }, + { + "epoch": 0.27857713828936853, + "grad_norm": 129.99843252720103, + "learning_rate": 5e-06, + "loss": 0.8997, + "num_input_tokens_seen": 120453824, + "step": 697 + }, + { + "epoch": 0.27857713828936853, + "loss": 0.6489090919494629, + "loss_ce": 0.008528226986527443, + "loss_xval": 0.640625, + "num_input_tokens_seen": 120453824, + "step": 697 + }, + { + "epoch": 0.2789768185451639, + "grad_norm": 136.31513410825676, + "learning_rate": 5e-06, + "loss": 0.5905, + "num_input_tokens_seen": 120626520, + "step": 698 + }, + { + "epoch": 0.2789768185451639, + "loss": 0.7713261842727661, + "loss_ce": 0.007776360027492046, + "loss_xval": 0.76171875, + "num_input_tokens_seen": 120626520, + "step": 698 + }, + { + "epoch": 0.2793764988009592, + "grad_norm": 55.383571736824486, + "learning_rate": 5e-06, + "loss": 0.9588, + "num_input_tokens_seen": 120799808, + "step": 699 + }, + { + "epoch": 0.2793764988009592, + "loss": 1.009063482284546, + "loss_ce": 0.011138629168272018, + "loss_xval": 0.99609375, + "num_input_tokens_seen": 120799808, + "step": 699 + }, + { + "epoch": 0.2797761790567546, + "grad_norm": 195.29831466108058, + "learning_rate": 5e-06, + "loss": 1.1378, + "num_input_tokens_seen": 120972440, + "step": 700 + }, + { + "epoch": 0.2797761790567546, + "loss": 1.147801399230957, + "loss_ce": 0.013279901817440987, + "loss_xval": 1.1328125, + "num_input_tokens_seen": 120972440, + "step": 700 + }, + { + "epoch": 0.28017585931254996, + "grad_norm": 117.78468354041925, + "learning_rate": 5e-06, + "loss": 0.6818, + "num_input_tokens_seen": 121145656, + "step": 701 + }, + { + "epoch": 0.28017585931254996, + "loss": 0.387192964553833, + "loss_ce": 0.012437107972800732, + "loss_xval": 0.375, + "num_input_tokens_seen": 121145656, + "step": 701 + }, + { + "epoch": 0.2805755395683453, + "grad_norm": 349.182381437043, + "learning_rate": 5e-06, + "loss": 0.9223, + "num_input_tokens_seen": 121318464, + "step": 702 + }, + { + "epoch": 0.2805755395683453, + "loss": 0.9129467606544495, + "loss_ce": 0.01377684623003006, + "loss_xval": 0.8984375, + "num_input_tokens_seen": 121318464, + "step": 702 + }, + { + "epoch": 0.2809752198241407, + "grad_norm": 78.93412862466627, + "learning_rate": 5e-06, + "loss": 1.0278, + "num_input_tokens_seen": 121491328, + "step": 
703 + }, + { + "epoch": 0.2809752198241407, + "loss": 0.9784796237945557, + "loss_ce": 0.014093691483139992, + "loss_xval": 0.96484375, + "num_input_tokens_seen": 121491328, + "step": 703 + }, + { + "epoch": 0.28137490007993604, + "grad_norm": 153.8104816648774, + "learning_rate": 5e-06, + "loss": 0.713, + "num_input_tokens_seen": 121664304, + "step": 704 + }, + { + "epoch": 0.28137490007993604, + "loss": 0.562119722366333, + "loss_ce": 0.02134818211197853, + "loss_xval": 0.5390625, + "num_input_tokens_seen": 121664304, + "step": 704 + }, + { + "epoch": 0.28177458033573144, + "grad_norm": 62.97326893902962, + "learning_rate": 5e-06, + "loss": 0.6645, + "num_input_tokens_seen": 121837640, + "step": 705 + }, + { + "epoch": 0.28177458033573144, + "loss": 0.5091035962104797, + "loss_ce": 0.014718795195221901, + "loss_xval": 0.494140625, + "num_input_tokens_seen": 121837640, + "step": 705 + }, + { + "epoch": 0.2821742605915268, + "grad_norm": 124.77914466366667, + "learning_rate": 5e-06, + "loss": 0.5219, + "num_input_tokens_seen": 122010648, + "step": 706 + }, + { + "epoch": 0.2821742605915268, + "loss": 0.6143680810928345, + "loss_ce": 0.02232704497873783, + "loss_xval": 0.59375, + "num_input_tokens_seen": 122010648, + "step": 706 + }, + { + "epoch": 0.2825739408473221, + "grad_norm": 61.81361725933705, + "learning_rate": 5e-06, + "loss": 0.7228, + "num_input_tokens_seen": 122183368, + "step": 707 + }, + { + "epoch": 0.2825739408473221, + "loss": 0.7822574973106384, + "loss_ce": 0.023407384753227234, + "loss_xval": 0.7578125, + "num_input_tokens_seen": 122183368, + "step": 707 + }, + { + "epoch": 0.2829736211031175, + "grad_norm": 49.1614349431222, + "learning_rate": 5e-06, + "loss": 0.9115, + "num_input_tokens_seen": 122356296, + "step": 708 + }, + { + "epoch": 0.2829736211031175, + "loss": 0.7862190008163452, + "loss_ce": 0.019556399434804916, + "loss_xval": 0.765625, + "num_input_tokens_seen": 122356296, + "step": 708 + }, + { + "epoch": 0.28337330135891287, + "grad_norm": 127.70627885796577, + "learning_rate": 5e-06, + "loss": 0.785, + "num_input_tokens_seen": 122529488, + "step": 709 + }, + { + "epoch": 0.28337330135891287, + "loss": 1.0741759538650513, + "loss_ce": 0.03291618824005127, + "loss_xval": 1.0390625, + "num_input_tokens_seen": 122529488, + "step": 709 + }, + { + "epoch": 0.2837729816147082, + "grad_norm": 31.053281397497233, + "learning_rate": 5e-06, + "loss": 0.8754, + "num_input_tokens_seen": 122702752, + "step": 710 + }, + { + "epoch": 0.2837729816147082, + "loss": 0.7221265435218811, + "loss_ce": 0.019550863653421402, + "loss_xval": 0.703125, + "num_input_tokens_seen": 122702752, + "step": 710 + }, + { + "epoch": 0.2841726618705036, + "grad_norm": 212.21575411655493, + "learning_rate": 5e-06, + "loss": 0.8754, + "num_input_tokens_seen": 122875696, + "step": 711 + }, + { + "epoch": 0.2841726618705036, + "loss": 0.996657133102417, + "loss_ce": 0.02216985821723938, + "loss_xval": 0.97265625, + "num_input_tokens_seen": 122875696, + "step": 711 + }, + { + "epoch": 0.28457234212629895, + "grad_norm": 40.100860368736996, + "learning_rate": 5e-06, + "loss": 0.7592, + "num_input_tokens_seen": 123048768, + "step": 712 + }, + { + "epoch": 0.28457234212629895, + "loss": 0.8356152772903442, + "loss_ce": 0.01981930062174797, + "loss_xval": 0.81640625, + "num_input_tokens_seen": 123048768, + "step": 712 + }, + { + "epoch": 0.28497202238209435, + "grad_norm": 214.2157109789503, + "learning_rate": 5e-06, + "loss": 0.732, + "num_input_tokens_seen": 123221664, + "step": 713 + }, + { + 
"epoch": 0.28497202238209435, + "loss": 0.5940902829170227, + "loss_ce": 0.019993610680103302, + "loss_xval": 0.57421875, + "num_input_tokens_seen": 123221664, + "step": 713 + }, + { + "epoch": 0.2853717026378897, + "grad_norm": 40.609562804706826, + "learning_rate": 5e-06, + "loss": 1.1444, + "num_input_tokens_seen": 123394488, + "step": 714 + }, + { + "epoch": 0.2853717026378897, + "loss": 0.7391673922538757, + "loss_ce": 0.028840193524956703, + "loss_xval": 0.7109375, + "num_input_tokens_seen": 123394488, + "step": 714 + }, + { + "epoch": 0.28577138289368503, + "grad_norm": 178.02080973482205, + "learning_rate": 5e-06, + "loss": 0.9773, + "num_input_tokens_seen": 123567376, + "step": 715 + }, + { + "epoch": 0.28577138289368503, + "loss": 0.922229528427124, + "loss_ce": 0.018909169360995293, + "loss_xval": 0.90234375, + "num_input_tokens_seen": 123567376, + "step": 715 + }, + { + "epoch": 0.28617106314948043, + "grad_norm": 123.04011045066329, + "learning_rate": 5e-06, + "loss": 1.068, + "num_input_tokens_seen": 123740088, + "step": 716 + }, + { + "epoch": 0.28617106314948043, + "loss": 1.4456578493118286, + "loss_ce": 0.016336563974618912, + "loss_xval": 1.4296875, + "num_input_tokens_seen": 123740088, + "step": 716 + }, + { + "epoch": 0.2865707434052758, + "grad_norm": 260.6443722826441, + "learning_rate": 5e-06, + "loss": 0.8816, + "num_input_tokens_seen": 123913104, + "step": 717 + }, + { + "epoch": 0.2865707434052758, + "loss": 0.5610959529876709, + "loss_ce": 0.0250852033495903, + "loss_xval": 0.53515625, + "num_input_tokens_seen": 123913104, + "step": 717 + }, + { + "epoch": 0.2869704236610711, + "grad_norm": 54.92546204084786, + "learning_rate": 5e-06, + "loss": 0.8094, + "num_input_tokens_seen": 124085896, + "step": 718 + }, + { + "epoch": 0.2869704236610711, + "loss": 0.9141414761543274, + "loss_ce": 0.017046771943569183, + "loss_xval": 0.8984375, + "num_input_tokens_seen": 124085896, + "step": 718 + }, + { + "epoch": 0.2873701039168665, + "grad_norm": 201.82756469697114, + "learning_rate": 5e-06, + "loss": 0.99, + "num_input_tokens_seen": 124259080, + "step": 719 + }, + { + "epoch": 0.2873701039168665, + "loss": 1.0651757717132568, + "loss_ce": 0.01634761318564415, + "loss_xval": 1.046875, + "num_input_tokens_seen": 124259080, + "step": 719 + }, + { + "epoch": 0.28776978417266186, + "grad_norm": 102.45815081080835, + "learning_rate": 5e-06, + "loss": 0.8853, + "num_input_tokens_seen": 124431976, + "step": 720 + }, + { + "epoch": 0.28776978417266186, + "loss": 1.0132455825805664, + "loss_ce": 0.016541466116905212, + "loss_xval": 0.99609375, + "num_input_tokens_seen": 124431976, + "step": 720 + }, + { + "epoch": 0.28816946442845726, + "grad_norm": 160.91023116493113, + "learning_rate": 5e-06, + "loss": 0.5593, + "num_input_tokens_seen": 124604832, + "step": 721 + }, + { + "epoch": 0.28816946442845726, + "loss": 0.4822537302970886, + "loss_ce": 0.01203891821205616, + "loss_xval": 0.470703125, + "num_input_tokens_seen": 124604832, + "step": 721 + }, + { + "epoch": 0.2885691446842526, + "grad_norm": 72.91741185752112, + "learning_rate": 5e-06, + "loss": 0.8368, + "num_input_tokens_seen": 124778048, + "step": 722 + }, + { + "epoch": 0.2885691446842526, + "loss": 1.0014230012893677, + "loss_ce": 0.017902549356222153, + "loss_xval": 0.984375, + "num_input_tokens_seen": 124778048, + "step": 722 + }, + { + "epoch": 0.28896882494004794, + "grad_norm": 146.8668026594306, + "learning_rate": 5e-06, + "loss": 0.8567, + "num_input_tokens_seen": 124951344, + "step": 723 + }, + { + "epoch": 
0.28896882494004794, + "loss": 0.8188395500183105, + "loss_ce": 0.015738962218165398, + "loss_xval": 0.8046875, + "num_input_tokens_seen": 124951344, + "step": 723 + }, + { + "epoch": 0.28936850519584334, + "grad_norm": 42.10566547583542, + "learning_rate": 5e-06, + "loss": 0.5991, + "num_input_tokens_seen": 125124264, + "step": 724 + }, + { + "epoch": 0.28936850519584334, + "loss": 0.6889089345932007, + "loss_ce": 0.01135767251253128, + "loss_xval": 0.67578125, + "num_input_tokens_seen": 125124264, + "step": 724 + }, + { + "epoch": 0.2897681854516387, + "grad_norm": 123.25802216263138, + "learning_rate": 5e-06, + "loss": 0.7095, + "num_input_tokens_seen": 125297072, + "step": 725 + }, + { + "epoch": 0.2897681854516387, + "loss": 0.25660455226898193, + "loss_ce": 0.010632868856191635, + "loss_xval": 0.24609375, + "num_input_tokens_seen": 125297072, + "step": 725 + }, + { + "epoch": 0.290167865707434, + "grad_norm": 68.66541377559568, + "learning_rate": 5e-06, + "loss": 1.1072, + "num_input_tokens_seen": 125469880, + "step": 726 + }, + { + "epoch": 0.290167865707434, + "loss": 1.066777229309082, + "loss_ce": 0.011845514178276062, + "loss_xval": 1.0546875, + "num_input_tokens_seen": 125469880, + "step": 726 + }, + { + "epoch": 0.2905675459632294, + "grad_norm": 118.39291047454554, + "learning_rate": 5e-06, + "loss": 1.0021, + "num_input_tokens_seen": 125642760, + "step": 727 + }, + { + "epoch": 0.2905675459632294, + "loss": 1.0062450170516968, + "loss_ce": 0.020649326965212822, + "loss_xval": 0.984375, + "num_input_tokens_seen": 125642760, + "step": 727 + }, + { + "epoch": 0.29096722621902477, + "grad_norm": 133.06920888386995, + "learning_rate": 5e-06, + "loss": 1.0436, + "num_input_tokens_seen": 125815440, + "step": 728 + }, + { + "epoch": 0.29096722621902477, + "loss": 1.2720887660980225, + "loss_ce": 0.014886559918522835, + "loss_xval": 1.2578125, + "num_input_tokens_seen": 125815440, + "step": 728 + }, + { + "epoch": 0.29136690647482016, + "grad_norm": 41.10436043549906, + "learning_rate": 5e-06, + "loss": 0.9073, + "num_input_tokens_seen": 125988424, + "step": 729 + }, + { + "epoch": 0.29136690647482016, + "loss": 1.0338069200515747, + "loss_ce": 0.01378735899925232, + "loss_xval": 1.0234375, + "num_input_tokens_seen": 125988424, + "step": 729 + }, + { + "epoch": 0.2917665867306155, + "grad_norm": 171.41761601925776, + "learning_rate": 5e-06, + "loss": 1.1713, + "num_input_tokens_seen": 126161680, + "step": 730 + }, + { + "epoch": 0.2917665867306155, + "loss": 1.4746264219284058, + "loss_ce": 0.011247565969824791, + "loss_xval": 1.4609375, + "num_input_tokens_seen": 126161680, + "step": 730 + }, + { + "epoch": 0.29216626698641085, + "grad_norm": 30.728524368886863, + "learning_rate": 5e-06, + "loss": 0.9324, + "num_input_tokens_seen": 126334240, + "step": 731 + }, + { + "epoch": 0.29216626698641085, + "loss": 1.2535715103149414, + "loss_ce": 0.01419171690940857, + "loss_xval": 1.2421875, + "num_input_tokens_seen": 126334240, + "step": 731 + }, + { + "epoch": 0.29256594724220625, + "grad_norm": 154.08953622194392, + "learning_rate": 5e-06, + "loss": 0.6463, + "num_input_tokens_seen": 126506840, + "step": 732 + }, + { + "epoch": 0.29256594724220625, + "loss": 0.7345225811004639, + "loss_ce": 0.013819379732012749, + "loss_xval": 0.71875, + "num_input_tokens_seen": 126506840, + "step": 732 + }, + { + "epoch": 0.2929656274980016, + "grad_norm": 67.98532313091332, + "learning_rate": 5e-06, + "loss": 0.5375, + "num_input_tokens_seen": 126679928, + "step": 733 + }, + { + "epoch": 
0.2929656274980016, + "loss": 0.6890181303024292, + "loss_ce": 0.013786174356937408, + "loss_xval": 0.67578125, + "num_input_tokens_seen": 126679928, + "step": 733 + }, + { + "epoch": 0.293365307753797, + "grad_norm": 169.58988174707864, + "learning_rate": 5e-06, + "loss": 0.8897, + "num_input_tokens_seen": 126852704, + "step": 734 + }, + { + "epoch": 0.293365307753797, + "loss": 0.8869085311889648, + "loss_ce": 0.011359157972037792, + "loss_xval": 0.875, + "num_input_tokens_seen": 126852704, + "step": 734 + }, + { + "epoch": 0.29376498800959233, + "grad_norm": 188.55727116539376, + "learning_rate": 5e-06, + "loss": 0.655, + "num_input_tokens_seen": 127025936, + "step": 735 + }, + { + "epoch": 0.29376498800959233, + "loss": 0.6508488655090332, + "loss_ce": 0.009491443634033203, + "loss_xval": 0.640625, + "num_input_tokens_seen": 127025936, + "step": 735 + }, + { + "epoch": 0.2941646682653877, + "grad_norm": 57.679259705456175, + "learning_rate": 5e-06, + "loss": 1.2767, + "num_input_tokens_seen": 127198680, + "step": 736 + }, + { + "epoch": 0.2941646682653877, + "loss": 1.423262596130371, + "loss_ce": 0.00944419577717781, + "loss_xval": 1.4140625, + "num_input_tokens_seen": 127198680, + "step": 736 + }, + { + "epoch": 0.2945643485211831, + "grad_norm": 145.57825597022912, + "learning_rate": 5e-06, + "loss": 0.8213, + "num_input_tokens_seen": 127371416, + "step": 737 + }, + { + "epoch": 0.2945643485211831, + "loss": 1.0838699340820312, + "loss_ce": 0.01673116721212864, + "loss_xval": 1.0703125, + "num_input_tokens_seen": 127371416, + "step": 737 + }, + { + "epoch": 0.2949640287769784, + "grad_norm": 49.478090664102595, + "learning_rate": 5e-06, + "loss": 0.893, + "num_input_tokens_seen": 127544496, + "step": 738 + }, + { + "epoch": 0.2949640287769784, + "loss": 1.1251657009124756, + "loss_ce": 0.012983132153749466, + "loss_xval": 1.109375, + "num_input_tokens_seen": 127544496, + "step": 738 + }, + { + "epoch": 0.29536370903277376, + "grad_norm": 152.29658505097626, + "learning_rate": 5e-06, + "loss": 0.6981, + "num_input_tokens_seen": 127713600, + "step": 739 + }, + { + "epoch": 0.29536370903277376, + "loss": 0.8325961828231812, + "loss_ce": 0.02095065638422966, + "loss_xval": 0.8125, + "num_input_tokens_seen": 127713600, + "step": 739 + }, + { + "epoch": 0.29576338928856916, + "grad_norm": 104.36305781820683, + "learning_rate": 5e-06, + "loss": 0.9132, + "num_input_tokens_seen": 127886552, + "step": 740 + }, + { + "epoch": 0.29576338928856916, + "loss": 0.8980221152305603, + "loss_ce": 0.019421041011810303, + "loss_xval": 0.87890625, + "num_input_tokens_seen": 127886552, + "step": 740 + }, + { + "epoch": 0.2961630695443645, + "grad_norm": 358.56742391146184, + "learning_rate": 5e-06, + "loss": 1.2376, + "num_input_tokens_seen": 128059336, + "step": 741 + }, + { + "epoch": 0.2961630695443645, + "loss": 1.2877583503723145, + "loss_ce": 0.014748061075806618, + "loss_xval": 1.2734375, + "num_input_tokens_seen": 128059336, + "step": 741 + }, + { + "epoch": 0.2965627498001599, + "grad_norm": 38.999705304706644, + "learning_rate": 5e-06, + "loss": 0.7955, + "num_input_tokens_seen": 128232384, + "step": 742 + }, + { + "epoch": 0.2965627498001599, + "loss": 0.7341563701629639, + "loss_ce": 0.022913720458745956, + "loss_xval": 0.7109375, + "num_input_tokens_seen": 128232384, + "step": 742 + }, + { + "epoch": 0.29696243005595524, + "grad_norm": 58.58273127409154, + "learning_rate": 5e-06, + "loss": 0.6488, + "num_input_tokens_seen": 128405632, + "step": 743 + }, + { + "epoch": 0.29696243005595524, + 
"loss": 0.8151225447654724, + "loss_ce": 0.016538549214601517, + "loss_xval": 0.796875, + "num_input_tokens_seen": 128405632, + "step": 743 + }, + { + "epoch": 0.2973621103117506, + "grad_norm": 34.3307729926042, + "learning_rate": 5e-06, + "loss": 0.8303, + "num_input_tokens_seen": 128578240, + "step": 744 + }, + { + "epoch": 0.2973621103117506, + "loss": 0.7622973322868347, + "loss_ce": 0.012358361855149269, + "loss_xval": 0.75, + "num_input_tokens_seen": 128578240, + "step": 744 + }, + { + "epoch": 0.297761790567546, + "grad_norm": 80.36115843481713, + "learning_rate": 5e-06, + "loss": 0.7475, + "num_input_tokens_seen": 128751168, + "step": 745 + }, + { + "epoch": 0.297761790567546, + "loss": 0.5837757587432861, + "loss_ce": 0.01822400838136673, + "loss_xval": 0.56640625, + "num_input_tokens_seen": 128751168, + "step": 745 + }, + { + "epoch": 0.2981614708233413, + "grad_norm": 92.27821846652243, + "learning_rate": 5e-06, + "loss": 1.0467, + "num_input_tokens_seen": 128923664, + "step": 746 + }, + { + "epoch": 0.2981614708233413, + "loss": 0.9637579917907715, + "loss_ce": 0.014783459715545177, + "loss_xval": 0.94921875, + "num_input_tokens_seen": 128923664, + "step": 746 + }, + { + "epoch": 0.29856115107913667, + "grad_norm": 59.953210838770005, + "learning_rate": 5e-06, + "loss": 1.1645, + "num_input_tokens_seen": 129096520, + "step": 747 + }, + { + "epoch": 0.29856115107913667, + "loss": 0.559451162815094, + "loss_ce": 0.011294430121779442, + "loss_xval": 0.546875, + "num_input_tokens_seen": 129096520, + "step": 747 + }, + { + "epoch": 0.29896083133493206, + "grad_norm": 54.717171712560024, + "learning_rate": 5e-06, + "loss": 0.6601, + "num_input_tokens_seen": 129269376, + "step": 748 + }, + { + "epoch": 0.29896083133493206, + "loss": 0.5837520956993103, + "loss_ce": 0.01343961339443922, + "loss_xval": 0.5703125, + "num_input_tokens_seen": 129269376, + "step": 748 + }, + { + "epoch": 0.2993605115907274, + "grad_norm": 40.816773316862005, + "learning_rate": 5e-06, + "loss": 0.6647, + "num_input_tokens_seen": 129441928, + "step": 749 + }, + { + "epoch": 0.2993605115907274, + "loss": 0.7464295029640198, + "loss_ce": 0.020355278626084328, + "loss_xval": 0.7265625, + "num_input_tokens_seen": 129441928, + "step": 749 + }, + { + "epoch": 0.2997601918465228, + "grad_norm": 129.2032035605195, + "learning_rate": 5e-06, + "loss": 0.694, + "num_input_tokens_seen": 129614136, + "step": 750 + }, + { + "epoch": 0.2997601918465228, + "eval_websight_new_IoU": 0.32972943782806396, + "eval_websight_new_MAE_all": 0.03321713022887707, + "eval_websight_new_MAE_h": 0.03598089702427387, + "eval_websight_new_MAE_w": 0.05507303401827812, + "eval_websight_new_MAE_x": 0.021736985072493553, + "eval_websight_new_MAE_y": 0.020077602006495, + "eval_websight_new_NUM_probability": 0.9082909226417542, + "eval_websight_new_inside_bbox": 0.5902777910232544, + "eval_websight_new_loss": 0.3363611698150635, + "eval_websight_new_loss_ce": 0.00987301068380475, + "eval_websight_new_loss_xval": 0.2787322998046875, + "eval_websight_new_runtime": 59.6643, + "eval_websight_new_samples_per_second": 0.838, + "eval_websight_new_steps_per_second": 0.034, + "num_input_tokens_seen": 129614136, + "step": 750 + }, + { + "epoch": 0.2997601918465228, + "eval_seeclick_IoU": 0.21530038118362427, + "eval_seeclick_MAE_all": 0.0899505689740181, + "eval_seeclick_MAE_h": 0.03901367634534836, + "eval_seeclick_MAE_w": 0.1384214162826538, + "eval_seeclick_MAE_x": 0.11315473914146423, + "eval_seeclick_MAE_y": 0.0692124255001545, + 
"eval_seeclick_NUM_probability": 0.8880393803119659, + "eval_seeclick_inside_bbox": 0.3229166716337204, + "eval_seeclick_loss": 2.2942774295806885, + "eval_seeclick_loss_ce": 0.026869087480008602, + "eval_seeclick_loss_xval": 2.2388916015625, + "eval_seeclick_runtime": 89.7723, + "eval_seeclick_samples_per_second": 0.557, + "eval_seeclick_steps_per_second": 0.022, + "num_input_tokens_seen": 129614136, + "step": 750 + }, + { + "epoch": 0.2997601918465228, + "eval_icons_IoU": 0.09595663845539093, + "eval_icons_MAE_all": 0.035994925536215305, + "eval_icons_MAE_h": 0.0310601107776165, + "eval_icons_MAE_w": 0.02391492947936058, + "eval_icons_MAE_x": 0.05677058733999729, + "eval_icons_MAE_y": 0.03223407082259655, + "eval_icons_NUM_probability": 0.9089525938034058, + "eval_icons_inside_bbox": 0.2048611119389534, + "eval_icons_loss": 0.2528549134731293, + "eval_icons_loss_ce": 0.012350890785455704, + "eval_icons_loss_xval": 0.222991943359375, + "eval_icons_runtime": 82.7604, + "eval_icons_samples_per_second": 0.604, + "eval_icons_steps_per_second": 0.024, + "num_input_tokens_seen": 129614136, + "step": 750 + }, + { + "epoch": 0.2997601918465228, + "loss": 0.3234509825706482, + "loss_ce": 0.014796189963817596, + "loss_xval": 0.30859375, + "num_input_tokens_seen": 129614136, + "step": 750 + }, + { + "epoch": 0.30015987210231815, + "grad_norm": 50.44016771405421, + "learning_rate": 5e-06, + "loss": 0.6259, + "num_input_tokens_seen": 129786896, + "step": 751 + }, + { + "epoch": 0.30015987210231815, + "loss": 0.6966589689254761, + "loss_ce": 0.015018315985798836, + "loss_xval": 0.6796875, + "num_input_tokens_seen": 129786896, + "step": 751 + }, + { + "epoch": 0.3005595523581135, + "grad_norm": 188.04641922934573, + "learning_rate": 5e-06, + "loss": 1.1675, + "num_input_tokens_seen": 129959752, + "step": 752 + }, + { + "epoch": 0.3005595523581135, + "loss": 1.1777859926223755, + "loss_ce": 0.016409026458859444, + "loss_xval": 1.1640625, + "num_input_tokens_seen": 129959752, + "step": 752 + }, + { + "epoch": 0.3009592326139089, + "grad_norm": 64.08122838016891, + "learning_rate": 5e-06, + "loss": 1.021, + "num_input_tokens_seen": 130132688, + "step": 753 + }, + { + "epoch": 0.3009592326139089, + "loss": 0.7875679731369019, + "loss_ce": 0.011078734882175922, + "loss_xval": 0.77734375, + "num_input_tokens_seen": 130132688, + "step": 753 + }, + { + "epoch": 0.30135891286970423, + "grad_norm": 238.25273070676948, + "learning_rate": 5e-06, + "loss": 0.9882, + "num_input_tokens_seen": 130306096, + "step": 754 + }, + { + "epoch": 0.30135891286970423, + "loss": 1.330396056175232, + "loss_ce": 0.009595339186489582, + "loss_xval": 1.3203125, + "num_input_tokens_seen": 130306096, + "step": 754 + }, + { + "epoch": 0.3017585931254996, + "grad_norm": 98.65332854764719, + "learning_rate": 5e-06, + "loss": 0.8745, + "num_input_tokens_seen": 130479152, + "step": 755 + }, + { + "epoch": 0.3017585931254996, + "loss": 0.6690840125083923, + "loss_ce": 0.009049820713698864, + "loss_xval": 0.66015625, + "num_input_tokens_seen": 130479152, + "step": 755 + }, + { + "epoch": 0.302158273381295, + "grad_norm": 279.6737307195592, + "learning_rate": 5e-06, + "loss": 0.735, + "num_input_tokens_seen": 130651792, + "step": 756 + }, + { + "epoch": 0.302158273381295, + "loss": 0.8260841965675354, + "loss_ce": 0.008701398968696594, + "loss_xval": 0.81640625, + "num_input_tokens_seen": 130651792, + "step": 756 + }, + { + "epoch": 0.3025579536370903, + "grad_norm": 168.73328107773594, + "learning_rate": 5e-06, + "loss": 0.7155, + 
"num_input_tokens_seen": 130824800, + "step": 757 + }, + { + "epoch": 0.3025579536370903, + "loss": 0.9893529415130615, + "loss_ce": 0.009982806630432606, + "loss_xval": 0.98046875, + "num_input_tokens_seen": 130824800, + "step": 757 + }, + { + "epoch": 0.3029576338928857, + "grad_norm": 229.3989099063003, + "learning_rate": 5e-06, + "loss": 0.7724, + "num_input_tokens_seen": 130997424, + "step": 758 + }, + { + "epoch": 0.3029576338928857, + "loss": 1.1186637878417969, + "loss_ce": 0.011974346823990345, + "loss_xval": 1.109375, + "num_input_tokens_seen": 130997424, + "step": 758 + }, + { + "epoch": 0.30335731414868106, + "grad_norm": 138.1632528777344, + "learning_rate": 5e-06, + "loss": 0.7744, + "num_input_tokens_seen": 131170472, + "step": 759 + }, + { + "epoch": 0.30335731414868106, + "loss": 0.7636563777923584, + "loss_ce": 0.0134122334420681, + "loss_xval": 0.75, + "num_input_tokens_seen": 131170472, + "step": 759 + }, + { + "epoch": 0.3037569944044764, + "grad_norm": 226.06896222225566, + "learning_rate": 5e-06, + "loss": 0.9072, + "num_input_tokens_seen": 131343872, + "step": 760 + }, + { + "epoch": 0.3037569944044764, + "loss": 0.6551499962806702, + "loss_ce": 0.022825779393315315, + "loss_xval": 0.6328125, + "num_input_tokens_seen": 131343872, + "step": 760 + }, + { + "epoch": 0.3041566746602718, + "grad_norm": 140.47231030423117, + "learning_rate": 5e-06, + "loss": 0.801, + "num_input_tokens_seen": 131516864, + "step": 761 + }, + { + "epoch": 0.3041566746602718, + "loss": 0.7796874642372131, + "loss_ce": 0.018823187798261642, + "loss_xval": 0.76171875, + "num_input_tokens_seen": 131516864, + "step": 761 + }, + { + "epoch": 0.30455635491606714, + "grad_norm": 191.03867527797138, + "learning_rate": 5e-06, + "loss": 0.8019, + "num_input_tokens_seen": 131686080, + "step": 762 + }, + { + "epoch": 0.30455635491606714, + "loss": 0.8312917947769165, + "loss_ce": 0.01952417567372322, + "loss_xval": 0.8125, + "num_input_tokens_seen": 131686080, + "step": 762 + }, + { + "epoch": 0.3049560351718625, + "grad_norm": 99.15375963784552, + "learning_rate": 5e-06, + "loss": 0.9744, + "num_input_tokens_seen": 131859384, + "step": 763 + }, + { + "epoch": 0.3049560351718625, + "loss": 1.0681936740875244, + "loss_ce": 0.016435783356428146, + "loss_xval": 1.0546875, + "num_input_tokens_seen": 131859384, + "step": 763 + }, + { + "epoch": 0.3053557154276579, + "grad_norm": 213.65930477816116, + "learning_rate": 5e-06, + "loss": 1.204, + "num_input_tokens_seen": 132032312, + "step": 764 + }, + { + "epoch": 0.3053557154276579, + "loss": 1.4312927722930908, + "loss_ce": 0.019061321392655373, + "loss_xval": 1.4140625, + "num_input_tokens_seen": 132032312, + "step": 764 + }, + { + "epoch": 0.3057553956834532, + "grad_norm": 108.68993738295507, + "learning_rate": 5e-06, + "loss": 0.61, + "num_input_tokens_seen": 132205336, + "step": 765 + }, + { + "epoch": 0.3057553956834532, + "loss": 0.6518961787223816, + "loss_ce": 0.026041686534881592, + "loss_xval": 0.625, + "num_input_tokens_seen": 132205336, + "step": 765 + }, + { + "epoch": 0.3061550759392486, + "grad_norm": 211.70893336363255, + "learning_rate": 5e-06, + "loss": 0.8624, + "num_input_tokens_seen": 132378208, + "step": 766 + }, + { + "epoch": 0.3061550759392486, + "loss": 0.7774863243103027, + "loss_ce": 0.019185544922947884, + "loss_xval": 0.7578125, + "num_input_tokens_seen": 132378208, + "step": 766 + }, + { + "epoch": 0.30655475619504396, + "grad_norm": 67.90632571223537, + "learning_rate": 5e-06, + "loss": 0.7338, + "num_input_tokens_seen": 
132551608, + "step": 767 + }, + { + "epoch": 0.30655475619504396, + "loss": 0.9820546507835388, + "loss_ce": 0.020140592008829117, + "loss_xval": 0.9609375, + "num_input_tokens_seen": 132551608, + "step": 767 + }, + { + "epoch": 0.3069544364508393, + "grad_norm": 333.3981316273364, + "learning_rate": 5e-06, + "loss": 0.83, + "num_input_tokens_seen": 132724568, + "step": 768 + }, + { + "epoch": 0.3069544364508393, + "loss": 0.8645689487457275, + "loss_ce": 0.020635826513171196, + "loss_xval": 0.84375, + "num_input_tokens_seen": 132724568, + "step": 768 + }, + { + "epoch": 0.3073541167066347, + "grad_norm": 83.04924236141541, + "learning_rate": 5e-06, + "loss": 0.6825, + "num_input_tokens_seen": 132897376, + "step": 769 + }, + { + "epoch": 0.3073541167066347, + "loss": 0.8747704029083252, + "loss_ce": 0.036544110625982285, + "loss_xval": 0.83984375, + "num_input_tokens_seen": 132897376, + "step": 769 + }, + { + "epoch": 0.30775379696243005, + "grad_norm": 98.58312316090154, + "learning_rate": 5e-06, + "loss": 0.933, + "num_input_tokens_seen": 133070104, + "step": 770 + }, + { + "epoch": 0.30775379696243005, + "loss": 0.47699296474456787, + "loss_ce": 0.19757401943206787, + "loss_xval": 0.279296875, + "num_input_tokens_seen": 133070104, + "step": 770 + }, + { + "epoch": 0.30815347721822545, + "grad_norm": 67.93244599542409, + "learning_rate": 5e-06, + "loss": 1.1321, + "num_input_tokens_seen": 133243488, + "step": 771 + }, + { + "epoch": 0.30815347721822545, + "loss": 1.0369318723678589, + "loss_ce": 0.13879956305027008, + "loss_xval": 0.8984375, + "num_input_tokens_seen": 133243488, + "step": 771 + }, + { + "epoch": 0.3085531574740208, + "grad_norm": 46.27524938342765, + "learning_rate": 5e-06, + "loss": 0.842, + "num_input_tokens_seen": 133412784, + "step": 772 + }, + { + "epoch": 0.3085531574740208, + "loss": 0.8269345164299011, + "loss_ce": 0.09854095429182053, + "loss_xval": 0.7265625, + "num_input_tokens_seen": 133412784, + "step": 772 + }, + { + "epoch": 0.30895283772981613, + "grad_norm": 50.41952425911126, + "learning_rate": 5e-06, + "loss": 1.1107, + "num_input_tokens_seen": 133585896, + "step": 773 + }, + { + "epoch": 0.30895283772981613, + "loss": 1.146827220916748, + "loss_ce": 0.07016701996326447, + "loss_xval": 1.078125, + "num_input_tokens_seen": 133585896, + "step": 773 + }, + { + "epoch": 0.30935251798561153, + "grad_norm": 82.74316039825493, + "learning_rate": 5e-06, + "loss": 0.6717, + "num_input_tokens_seen": 133758544, + "step": 774 + }, + { + "epoch": 0.30935251798561153, + "loss": 0.5270382165908813, + "loss_ce": 0.08367883414030075, + "loss_xval": 0.443359375, + "num_input_tokens_seen": 133758544, + "step": 774 + }, + { + "epoch": 0.3097521982414069, + "grad_norm": 66.82965857737175, + "learning_rate": 5e-06, + "loss": 0.9872, + "num_input_tokens_seen": 133931776, + "step": 775 + }, + { + "epoch": 0.3097521982414069, + "loss": 1.059539794921875, + "loss_ce": 0.07809457927942276, + "loss_xval": 0.98046875, + "num_input_tokens_seen": 133931776, + "step": 775 + }, + { + "epoch": 0.3101518784972022, + "grad_norm": 61.167284008978314, + "learning_rate": 5e-06, + "loss": 0.7877, + "num_input_tokens_seen": 134104704, + "step": 776 + }, + { + "epoch": 0.3101518784972022, + "loss": 0.8881216049194336, + "loss_ce": 0.0730580985546112, + "loss_xval": 0.81640625, + "num_input_tokens_seen": 134104704, + "step": 776 + }, + { + "epoch": 0.3105515587529976, + "grad_norm": 84.1144974470978, + "learning_rate": 5e-06, + "loss": 0.7481, + "num_input_tokens_seen": 134277464, + "step": 777 
+ }, + { + "epoch": 0.3105515587529976, + "loss": 0.6984782218933105, + "loss_ce": 0.047355152666568756, + "loss_xval": 0.65234375, + "num_input_tokens_seen": 134277464, + "step": 777 + }, + { + "epoch": 0.31095123900879296, + "grad_norm": 115.97957410951801, + "learning_rate": 5e-06, + "loss": 0.9494, + "num_input_tokens_seen": 134450816, + "step": 778 + }, + { + "epoch": 0.31095123900879296, + "loss": 1.0162138938903809, + "loss_ce": 0.046731531620025635, + "loss_xval": 0.96875, + "num_input_tokens_seen": 134450816, + "step": 778 + }, + { + "epoch": 0.31135091926458835, + "grad_norm": 103.83504655014406, + "learning_rate": 5e-06, + "loss": 0.5651, + "num_input_tokens_seen": 134624024, + "step": 779 + }, + { + "epoch": 0.31135091926458835, + "loss": 0.5172841548919678, + "loss_ce": 0.02302144654095173, + "loss_xval": 0.494140625, + "num_input_tokens_seen": 134624024, + "step": 779 + }, + { + "epoch": 0.3117505995203837, + "grad_norm": 63.90837696559783, + "learning_rate": 5e-06, + "loss": 0.7547, + "num_input_tokens_seen": 134797272, + "step": 780 + }, + { + "epoch": 0.3117505995203837, + "loss": 0.7474457025527954, + "loss_ce": 0.026986707001924515, + "loss_xval": 0.71875, + "num_input_tokens_seen": 134797272, + "step": 780 + }, + { + "epoch": 0.31215027977617904, + "grad_norm": 115.9985412220161, + "learning_rate": 5e-06, + "loss": 0.9481, + "num_input_tokens_seen": 134970320, + "step": 781 + }, + { + "epoch": 0.31215027977617904, + "loss": 0.7179272174835205, + "loss_ce": 0.01968502625823021, + "loss_xval": 0.69921875, + "num_input_tokens_seen": 134970320, + "step": 781 + }, + { + "epoch": 0.31254996003197444, + "grad_norm": 41.252406618208745, + "learning_rate": 5e-06, + "loss": 0.7365, + "num_input_tokens_seen": 135143296, + "step": 782 + }, + { + "epoch": 0.31254996003197444, + "loss": 0.676671028137207, + "loss_ce": 0.025548022240400314, + "loss_xval": 0.65234375, + "num_input_tokens_seen": 135143296, + "step": 782 + }, + { + "epoch": 0.3129496402877698, + "grad_norm": 102.07811583540216, + "learning_rate": 5e-06, + "loss": 1.3992, + "num_input_tokens_seen": 135316624, + "step": 783 + }, + { + "epoch": 0.3129496402877698, + "loss": 1.6752166748046875, + "loss_ce": 0.03459162265062332, + "loss_xval": 1.640625, + "num_input_tokens_seen": 135316624, + "step": 783 + }, + { + "epoch": 0.3133493205435651, + "grad_norm": 112.62388613696993, + "learning_rate": 5e-06, + "loss": 0.8903, + "num_input_tokens_seen": 135489416, + "step": 784 + }, + { + "epoch": 0.3133493205435651, + "loss": 0.9434410929679871, + "loss_ce": 0.029622741043567657, + "loss_xval": 0.9140625, + "num_input_tokens_seen": 135489416, + "step": 784 + }, + { + "epoch": 0.3137490007993605, + "grad_norm": 81.92869001107958, + "learning_rate": 5e-06, + "loss": 0.5826, + "num_input_tokens_seen": 135662392, + "step": 785 + }, + { + "epoch": 0.3137490007993605, + "loss": 0.8590636253356934, + "loss_ce": 0.032403476536273956, + "loss_xval": 0.828125, + "num_input_tokens_seen": 135662392, + "step": 785 + }, + { + "epoch": 0.31414868105515587, + "grad_norm": 56.93984188783562, + "learning_rate": 5e-06, + "loss": 0.6952, + "num_input_tokens_seen": 135835648, + "step": 786 + }, + { + "epoch": 0.31414868105515587, + "loss": 0.7936071157455444, + "loss_ce": 0.03201046958565712, + "loss_xval": 0.76171875, + "num_input_tokens_seen": 135835648, + "step": 786 + }, + { + "epoch": 0.31454836131095126, + "grad_norm": 123.08995481669876, + "learning_rate": 5e-06, + "loss": 1.0795, + "num_input_tokens_seen": 136008360, + "step": 787 + }, + { + 
"epoch": 0.31454836131095126, + "loss": 1.2340142726898193, + "loss_ce": 0.022832613438367844, + "loss_xval": 1.2109375, + "num_input_tokens_seen": 136008360, + "step": 787 + }, + { + "epoch": 0.3149480415667466, + "grad_norm": 192.62508663565438, + "learning_rate": 5e-06, + "loss": 0.7918, + "num_input_tokens_seen": 136180904, + "step": 788 + }, + { + "epoch": 0.3149480415667466, + "loss": 0.6366986036300659, + "loss_ce": 0.04831964522600174, + "loss_xval": 0.58984375, + "num_input_tokens_seen": 136180904, + "step": 788 + }, + { + "epoch": 0.31534772182254195, + "grad_norm": 51.10447508784947, + "learning_rate": 5e-06, + "loss": 0.8927, + "num_input_tokens_seen": 136353552, + "step": 789 + }, + { + "epoch": 0.31534772182254195, + "loss": 1.050945520401001, + "loss_ce": 0.05485168844461441, + "loss_xval": 0.99609375, + "num_input_tokens_seen": 136353552, + "step": 789 + }, + { + "epoch": 0.31574740207833735, + "grad_norm": 154.61700039241353, + "learning_rate": 5e-06, + "loss": 0.8251, + "num_input_tokens_seen": 136526584, + "step": 790 + }, + { + "epoch": 0.31574740207833735, + "loss": 0.8927372694015503, + "loss_ce": 0.0441044420003891, + "loss_xval": 0.84765625, + "num_input_tokens_seen": 136526584, + "step": 790 + }, + { + "epoch": 0.3161470823341327, + "grad_norm": 42.16296097175742, + "learning_rate": 5e-06, + "loss": 0.7295, + "num_input_tokens_seen": 136699328, + "step": 791 + }, + { + "epoch": 0.3161470823341327, + "loss": 0.9556566476821899, + "loss_ce": 0.04513419792056084, + "loss_xval": 0.91015625, + "num_input_tokens_seen": 136699328, + "step": 791 + }, + { + "epoch": 0.31654676258992803, + "grad_norm": 138.874910538935, + "learning_rate": 5e-06, + "loss": 0.6915, + "num_input_tokens_seen": 136871856, + "step": 792 + }, + { + "epoch": 0.31654676258992803, + "loss": 0.6887627840042114, + "loss_ce": 0.03727353364229202, + "loss_xval": 0.65234375, + "num_input_tokens_seen": 136871856, + "step": 792 + }, + { + "epoch": 0.31694644284572343, + "grad_norm": 65.9961884504279, + "learning_rate": 5e-06, + "loss": 0.6059, + "num_input_tokens_seen": 137045064, + "step": 793 + }, + { + "epoch": 0.31694644284572343, + "loss": 0.5695608854293823, + "loss_ce": 0.041973013430833817, + "loss_xval": 0.52734375, + "num_input_tokens_seen": 137045064, + "step": 793 + }, + { + "epoch": 0.3173461231015188, + "grad_norm": 129.30759832098593, + "learning_rate": 5e-06, + "loss": 0.8537, + "num_input_tokens_seen": 137218184, + "step": 794 + }, + { + "epoch": 0.3173461231015188, + "loss": 0.6901719570159912, + "loss_ce": 0.03502054512500763, + "loss_xval": 0.65625, + "num_input_tokens_seen": 137218184, + "step": 794 + }, + { + "epoch": 0.31774580335731417, + "grad_norm": 86.71836721194123, + "learning_rate": 5e-06, + "loss": 0.7562, + "num_input_tokens_seen": 137391216, + "step": 795 + }, + { + "epoch": 0.31774580335731417, + "loss": 0.7299758195877075, + "loss_ce": 0.03719630092382431, + "loss_xval": 0.69140625, + "num_input_tokens_seen": 137391216, + "step": 795 + }, + { + "epoch": 0.3181454836131095, + "grad_norm": 57.12461147367074, + "learning_rate": 5e-06, + "loss": 0.7832, + "num_input_tokens_seen": 137564144, + "step": 796 + }, + { + "epoch": 0.3181454836131095, + "loss": 0.5537126064300537, + "loss_ce": 0.03189250826835632, + "loss_xval": 0.5234375, + "num_input_tokens_seen": 137564144, + "step": 796 + }, + { + "epoch": 0.31854516386890486, + "grad_norm": 65.79780600817213, + "learning_rate": 5e-06, + "loss": 0.7925, + "num_input_tokens_seen": 137737192, + "step": 797 + }, + { + "epoch": 
0.31854516386890486, + "loss": 0.9935466051101685, + "loss_ce": 0.029099617153406143, + "loss_xval": 0.96484375, + "num_input_tokens_seen": 137737192, + "step": 797 + }, + { + "epoch": 0.31894484412470026, + "grad_norm": 105.16048221090652, + "learning_rate": 5e-06, + "loss": 0.5815, + "num_input_tokens_seen": 137910264, + "step": 798 + }, + { + "epoch": 0.31894484412470026, + "loss": 0.6643279790878296, + "loss_ce": 0.03017270937561989, + "loss_xval": 0.6328125, + "num_input_tokens_seen": 137910264, + "step": 798 + }, + { + "epoch": 0.3193445243804956, + "grad_norm": 72.96586018987574, + "learning_rate": 5e-06, + "loss": 1.1874, + "num_input_tokens_seen": 138083392, + "step": 799 + }, + { + "epoch": 0.3193445243804956, + "loss": 1.442209243774414, + "loss_ce": 0.024972904473543167, + "loss_xval": 1.4140625, + "num_input_tokens_seen": 138083392, + "step": 799 + }, + { + "epoch": 0.31974420463629094, + "grad_norm": 157.17421656167332, + "learning_rate": 5e-06, + "loss": 0.9201, + "num_input_tokens_seen": 138256112, + "step": 800 + }, + { + "epoch": 0.31974420463629094, + "loss": 1.115356206893921, + "loss_ce": 0.024047698825597763, + "loss_xval": 1.09375, + "num_input_tokens_seen": 138256112, + "step": 800 + }, + { + "epoch": 0.32014388489208634, + "grad_norm": 110.192018937514, + "learning_rate": 5e-06, + "loss": 1.0826, + "num_input_tokens_seen": 138428960, + "step": 801 + }, + { + "epoch": 0.32014388489208634, + "loss": 1.336624264717102, + "loss_ce": 0.06312566250562668, + "loss_xval": 1.2734375, + "num_input_tokens_seen": 138428960, + "step": 801 + }, + { + "epoch": 0.3205435651478817, + "grad_norm": 151.99977831225056, + "learning_rate": 5e-06, + "loss": 0.8894, + "num_input_tokens_seen": 138601992, + "step": 802 + }, + { + "epoch": 0.3205435651478817, + "loss": 0.6037086844444275, + "loss_ce": 0.02289813756942749, + "loss_xval": 0.58203125, + "num_input_tokens_seen": 138601992, + "step": 802 + }, + { + "epoch": 0.3209432454036771, + "grad_norm": 52.06306805397873, + "learning_rate": 5e-06, + "loss": 0.7899, + "num_input_tokens_seen": 138774584, + "step": 803 + }, + { + "epoch": 0.3209432454036771, + "loss": 0.744976282119751, + "loss_ce": 0.019451454281806946, + "loss_xval": 0.7265625, + "num_input_tokens_seen": 138774584, + "step": 803 + }, + { + "epoch": 0.3213429256594724, + "grad_norm": 39.91507915716981, + "learning_rate": 5e-06, + "loss": 0.6145, + "num_input_tokens_seen": 138947832, + "step": 804 + }, + { + "epoch": 0.3213429256594724, + "loss": 0.7825067043304443, + "loss_ce": 0.03006529062986374, + "loss_xval": 0.75390625, + "num_input_tokens_seen": 138947832, + "step": 804 + }, + { + "epoch": 0.32174260591526777, + "grad_norm": 75.5906555802562, + "learning_rate": 5e-06, + "loss": 0.7645, + "num_input_tokens_seen": 139120760, + "step": 805 + }, + { + "epoch": 0.32174260591526777, + "loss": 0.8884168267250061, + "loss_ce": 0.0256849005818367, + "loss_xval": 0.86328125, + "num_input_tokens_seen": 139120760, + "step": 805 + }, + { + "epoch": 0.32214228617106316, + "grad_norm": 146.3475272918251, + "learning_rate": 5e-06, + "loss": 0.7265, + "num_input_tokens_seen": 139293776, + "step": 806 + }, + { + "epoch": 0.32214228617106316, + "loss": 0.8622835874557495, + "loss_ce": 0.02243983931839466, + "loss_xval": 0.83984375, + "num_input_tokens_seen": 139293776, + "step": 806 + }, + { + "epoch": 0.3225419664268585, + "grad_norm": 149.26645866928035, + "learning_rate": 5e-06, + "loss": 0.6571, + "num_input_tokens_seen": 139466240, + "step": 807 + }, + { + "epoch": 0.3225419664268585, 
+ "loss": 0.9698929190635681, + "loss_ce": 0.018721019849181175, + "loss_xval": 0.953125, + "num_input_tokens_seen": 139466240, + "step": 807 + }, + { + "epoch": 0.3229416466826539, + "grad_norm": 85.13022329777696, + "learning_rate": 5e-06, + "loss": 0.7868, + "num_input_tokens_seen": 139639432, + "step": 808 + }, + { + "epoch": 0.3229416466826539, + "loss": 0.3776288628578186, + "loss_ce": 0.029606396332383156, + "loss_xval": 0.34765625, + "num_input_tokens_seen": 139639432, + "step": 808 + }, + { + "epoch": 0.32334132693844925, + "grad_norm": 124.8610805709589, + "learning_rate": 5e-06, + "loss": 0.51, + "num_input_tokens_seen": 139813008, + "step": 809 + }, + { + "epoch": 0.32334132693844925, + "loss": 0.42539799213409424, + "loss_ce": 0.028364313766360283, + "loss_xval": 0.396484375, + "num_input_tokens_seen": 139813008, + "step": 809 + }, + { + "epoch": 0.3237410071942446, + "grad_norm": 35.664202919806826, + "learning_rate": 5e-06, + "loss": 0.9645, + "num_input_tokens_seen": 139982784, + "step": 810 + }, + { + "epoch": 0.3237410071942446, + "loss": 0.6386287212371826, + "loss_ce": 0.026446137577295303, + "loss_xval": 0.61328125, + "num_input_tokens_seen": 139982784, + "step": 810 + }, + { + "epoch": 0.32414068745004, + "grad_norm": 63.87833532222325, + "learning_rate": 5e-06, + "loss": 0.6453, + "num_input_tokens_seen": 140156176, + "step": 811 + }, + { + "epoch": 0.32414068745004, + "loss": 0.4492732882499695, + "loss_ce": 0.025262057781219482, + "loss_xval": 0.423828125, + "num_input_tokens_seen": 140156176, + "step": 811 + }, + { + "epoch": 0.32454036770583533, + "grad_norm": 67.34889598567726, + "learning_rate": 5e-06, + "loss": 0.8529, + "num_input_tokens_seen": 140329472, + "step": 812 + }, + { + "epoch": 0.32454036770583533, + "loss": 0.3586152493953705, + "loss_ce": 0.028720222413539886, + "loss_xval": 0.330078125, + "num_input_tokens_seen": 140329472, + "step": 812 + }, + { + "epoch": 0.3249400479616307, + "grad_norm": 70.12852323782404, + "learning_rate": 5e-06, + "loss": 0.5924, + "num_input_tokens_seen": 140502504, + "step": 813 + }, + { + "epoch": 0.3249400479616307, + "loss": 0.7642978429794312, + "loss_ce": 0.025772511959075928, + "loss_xval": 0.73828125, + "num_input_tokens_seen": 140502504, + "step": 813 + }, + { + "epoch": 0.32533972821742607, + "grad_norm": 28.086180346559747, + "learning_rate": 5e-06, + "loss": 0.4791, + "num_input_tokens_seen": 140675400, + "step": 814 + }, + { + "epoch": 0.32533972821742607, + "loss": 0.7082566022872925, + "loss_ce": 0.015293995849788189, + "loss_xval": 0.69140625, + "num_input_tokens_seen": 140675400, + "step": 814 + }, + { + "epoch": 0.3257394084732214, + "grad_norm": 101.79711405925198, + "learning_rate": 5e-06, + "loss": 1.158, + "num_input_tokens_seen": 140848184, + "step": 815 + }, + { + "epoch": 0.3257394084732214, + "loss": 0.7473459839820862, + "loss_ce": 0.015107257291674614, + "loss_xval": 0.73046875, + "num_input_tokens_seen": 140848184, + "step": 815 + }, + { + "epoch": 0.3261390887290168, + "grad_norm": 193.45790211450287, + "learning_rate": 5e-06, + "loss": 0.8277, + "num_input_tokens_seen": 141021312, + "step": 816 + }, + { + "epoch": 0.3261390887290168, + "loss": 0.8551950454711914, + "loss_ce": 0.013276074081659317, + "loss_xval": 0.84375, + "num_input_tokens_seen": 141021312, + "step": 816 + }, + { + "epoch": 0.32653876898481216, + "grad_norm": 123.7203597173954, + "learning_rate": 5e-06, + "loss": 0.8849, + "num_input_tokens_seen": 141194280, + "step": 817 + }, + { + "epoch": 0.32653876898481216, + "loss": 
0.8089841604232788, + "loss_ce": 0.014916693791747093, + "loss_xval": 0.79296875, + "num_input_tokens_seen": 141194280, + "step": 817 + }, + { + "epoch": 0.3269384492406075, + "grad_norm": 123.03492826004478, + "learning_rate": 5e-06, + "loss": 1.1935, + "num_input_tokens_seen": 141367040, + "step": 818 + }, + { + "epoch": 0.3269384492406075, + "loss": 1.457615852355957, + "loss_ce": 0.01865091174840927, + "loss_xval": 1.4375, + "num_input_tokens_seen": 141367040, + "step": 818 + }, + { + "epoch": 0.3273381294964029, + "grad_norm": 196.81361407257646, + "learning_rate": 5e-06, + "loss": 0.8593, + "num_input_tokens_seen": 141540360, + "step": 819 + }, + { + "epoch": 0.3273381294964029, + "loss": 0.7007966041564941, + "loss_ce": 0.01708076149225235, + "loss_xval": 0.68359375, + "num_input_tokens_seen": 141540360, + "step": 819 + }, + { + "epoch": 0.32773780975219824, + "grad_norm": 58.596043671656304, + "learning_rate": 5e-06, + "loss": 0.895, + "num_input_tokens_seen": 141713328, + "step": 820 + }, + { + "epoch": 0.32773780975219824, + "loss": 0.9711037278175354, + "loss_ce": 0.01468280702829361, + "loss_xval": 0.95703125, + "num_input_tokens_seen": 141713328, + "step": 820 + }, + { + "epoch": 0.3281374900079936, + "grad_norm": 191.31795115874968, + "learning_rate": 5e-06, + "loss": 0.8616, + "num_input_tokens_seen": 141886664, + "step": 821 + }, + { + "epoch": 0.3281374900079936, + "loss": 0.8032501935958862, + "loss_ce": 0.010769794695079327, + "loss_xval": 0.79296875, + "num_input_tokens_seen": 141886664, + "step": 821 + }, + { + "epoch": 0.328537170263789, + "grad_norm": 64.81683384211863, + "learning_rate": 5e-06, + "loss": 0.5945, + "num_input_tokens_seen": 142059672, + "step": 822 + }, + { + "epoch": 0.328537170263789, + "loss": 0.43535593152046204, + "loss_ce": 0.02971627749502659, + "loss_xval": 0.40625, + "num_input_tokens_seen": 142059672, + "step": 822 + }, + { + "epoch": 0.3289368505195843, + "grad_norm": 184.53548859363346, + "learning_rate": 5e-06, + "loss": 0.7032, + "num_input_tokens_seen": 142232456, + "step": 823 + }, + { + "epoch": 0.3289368505195843, + "loss": 0.6750579476356506, + "loss_ce": 0.06104426458477974, + "loss_xval": 0.61328125, + "num_input_tokens_seen": 142232456, + "step": 823 + }, + { + "epoch": 0.3293365307753797, + "grad_norm": 118.97174765110988, + "learning_rate": 5e-06, + "loss": 0.9834, + "num_input_tokens_seen": 142405272, + "step": 824 + }, + { + "epoch": 0.3293365307753797, + "loss": 1.2425031661987305, + "loss_ce": 0.051829393953084946, + "loss_xval": 1.1875, + "num_input_tokens_seen": 142405272, + "step": 824 + }, + { + "epoch": 0.32973621103117506, + "grad_norm": 296.29048064640887, + "learning_rate": 5e-06, + "loss": 0.9834, + "num_input_tokens_seen": 142578256, + "step": 825 + }, + { + "epoch": 0.32973621103117506, + "loss": 1.2711055278778076, + "loss_ce": 0.02940632961690426, + "loss_xval": 1.2421875, + "num_input_tokens_seen": 142578256, + "step": 825 + }, + { + "epoch": 0.3301358912869704, + "grad_norm": 51.48298081505895, + "learning_rate": 5e-06, + "loss": 0.762, + "num_input_tokens_seen": 142751040, + "step": 826 + }, + { + "epoch": 0.3301358912869704, + "loss": 0.8536327481269836, + "loss_ce": 0.021448887884616852, + "loss_xval": 0.83203125, + "num_input_tokens_seen": 142751040, + "step": 826 + }, + { + "epoch": 0.3305355715427658, + "grad_norm": 318.4751539614244, + "learning_rate": 5e-06, + "loss": 1.316, + "num_input_tokens_seen": 142924288, + "step": 827 + }, + { + "epoch": 0.3305355715427658, + "loss": 0.8164308071136475, + 
"loss_ce": 0.023950327187776566, + "loss_xval": 0.79296875, + "num_input_tokens_seen": 142924288, + "step": 827 + }, + { + "epoch": 0.33093525179856115, + "grad_norm": 54.08071398162099, + "learning_rate": 5e-06, + "loss": 0.5481, + "num_input_tokens_seen": 143096912, + "step": 828 + }, + { + "epoch": 0.33093525179856115, + "loss": 0.5444949269294739, + "loss_ce": 0.028564732521772385, + "loss_xval": 0.515625, + "num_input_tokens_seen": 143096912, + "step": 828 + }, + { + "epoch": 0.3313349320543565, + "grad_norm": 244.38305682024998, + "learning_rate": 5e-06, + "loss": 0.8846, + "num_input_tokens_seen": 143270104, + "step": 829 + }, + { + "epoch": 0.3313349320543565, + "loss": 1.1831023693084717, + "loss_ce": 0.05102230980992317, + "loss_xval": 1.1328125, + "num_input_tokens_seen": 143270104, + "step": 829 + }, + { + "epoch": 0.3317346123101519, + "grad_norm": 78.02138265678809, + "learning_rate": 5e-06, + "loss": 1.0717, + "num_input_tokens_seen": 143442832, + "step": 830 + }, + { + "epoch": 0.3317346123101519, + "loss": 1.199249267578125, + "loss_ce": 0.02102671191096306, + "loss_xval": 1.1796875, + "num_input_tokens_seen": 143442832, + "step": 830 + }, + { + "epoch": 0.33213429256594723, + "grad_norm": 244.2166319233294, + "learning_rate": 5e-06, + "loss": 0.9575, + "num_input_tokens_seen": 143615664, + "step": 831 + }, + { + "epoch": 0.33213429256594723, + "loss": 0.8177416324615479, + "loss_ce": 0.02526114135980606, + "loss_xval": 0.79296875, + "num_input_tokens_seen": 143615664, + "step": 831 + }, + { + "epoch": 0.33253397282174263, + "grad_norm": 202.87305428352477, + "learning_rate": 5e-06, + "loss": 1.0996, + "num_input_tokens_seen": 143788624, + "step": 832 + }, + { + "epoch": 0.33253397282174263, + "loss": 0.7815216183662415, + "loss_ce": 0.024685688316822052, + "loss_xval": 0.7578125, + "num_input_tokens_seen": 143788624, + "step": 832 + }, + { + "epoch": 0.33293365307753797, + "grad_norm": 267.1015074921022, + "learning_rate": 5e-06, + "loss": 0.8148, + "num_input_tokens_seen": 143961728, + "step": 833 + }, + { + "epoch": 0.33293365307753797, + "loss": 1.0413076877593994, + "loss_ce": 0.030077166855335236, + "loss_xval": 1.0078125, + "num_input_tokens_seen": 143961728, + "step": 833 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 249.0188358069812, + "learning_rate": 5e-06, + "loss": 0.6692, + "num_input_tokens_seen": 144134792, + "step": 834 + }, + { + "epoch": 0.3333333333333333, + "loss": 0.577985405921936, + "loss_ce": 0.025495212525129318, + "loss_xval": 0.55078125, + "num_input_tokens_seen": 144134792, + "step": 834 + }, + { + "epoch": 0.3337330135891287, + "grad_norm": 168.98599906136002, + "learning_rate": 5e-06, + "loss": 0.7907, + "num_input_tokens_seen": 144307776, + "step": 835 + }, + { + "epoch": 0.3337330135891287, + "loss": 0.7566037774085999, + "loss_ce": 0.03101782500743866, + "loss_xval": 0.7265625, + "num_input_tokens_seen": 144307776, + "step": 835 + }, + { + "epoch": 0.33413269384492406, + "grad_norm": 223.09060693099514, + "learning_rate": 5e-06, + "loss": 0.8478, + "num_input_tokens_seen": 144480840, + "step": 836 + }, + { + "epoch": 0.33413269384492406, + "loss": 0.6522843241691589, + "loss_ce": 0.03289957344532013, + "loss_xval": 0.62109375, + "num_input_tokens_seen": 144480840, + "step": 836 + }, + { + "epoch": 0.3345323741007194, + "grad_norm": 26.914313057404094, + "learning_rate": 5e-06, + "loss": 0.3649, + "num_input_tokens_seen": 144654280, + "step": 837 + }, + { + "epoch": 0.3345323741007194, + "loss": 0.3446645140647888, + "loss_ce": 
0.02459615468978882, + "loss_xval": 0.3203125, + "num_input_tokens_seen": 144654280, + "step": 837 + }, + { + "epoch": 0.3349320543565148, + "grad_norm": 180.4789733229281, + "learning_rate": 5e-06, + "loss": 0.9611, + "num_input_tokens_seen": 144827136, + "step": 838 + }, + { + "epoch": 0.3349320543565148, + "loss": 0.5780286192893982, + "loss_ce": 0.02566044218838215, + "loss_xval": 0.55078125, + "num_input_tokens_seen": 144827136, + "step": 838 + }, + { + "epoch": 0.33533173461231014, + "grad_norm": 90.74097333134976, + "learning_rate": 5e-06, + "loss": 0.6284, + "num_input_tokens_seen": 145000568, + "step": 839 + }, + { + "epoch": 0.33533173461231014, + "loss": 0.7638094425201416, + "loss_ce": 0.0294344425201416, + "loss_xval": 0.734375, + "num_input_tokens_seen": 145000568, + "step": 839 + }, + { + "epoch": 0.33573141486810554, + "grad_norm": 112.5969382238021, + "learning_rate": 5e-06, + "loss": 0.8517, + "num_input_tokens_seen": 145173128, + "step": 840 + }, + { + "epoch": 0.33573141486810554, + "loss": 0.4753795266151428, + "loss_ce": 0.025672491639852524, + "loss_xval": 0.44921875, + "num_input_tokens_seen": 145173128, + "step": 840 + }, + { + "epoch": 0.3361310951239009, + "grad_norm": 160.69367514258622, + "learning_rate": 5e-06, + "loss": 0.9031, + "num_input_tokens_seen": 145345760, + "step": 841 + }, + { + "epoch": 0.3361310951239009, + "loss": 1.1150989532470703, + "loss_ce": 0.022447630763053894, + "loss_xval": 1.09375, + "num_input_tokens_seen": 145345760, + "step": 841 + }, + { + "epoch": 0.3365307753796962, + "grad_norm": 83.89344588868427, + "learning_rate": 5e-06, + "loss": 0.6187, + "num_input_tokens_seen": 145518800, + "step": 842 + }, + { + "epoch": 0.3365307753796962, + "loss": 0.8424967527389526, + "loss_ce": 0.020902525633573532, + "loss_xval": 0.8203125, + "num_input_tokens_seen": 145518800, + "step": 842 + }, + { + "epoch": 0.3369304556354916, + "grad_norm": 128.21717519579028, + "learning_rate": 5e-06, + "loss": 0.7996, + "num_input_tokens_seen": 145691888, + "step": 843 + }, + { + "epoch": 0.3369304556354916, + "loss": 0.5368384718894958, + "loss_ce": 0.01852792128920555, + "loss_xval": 0.51953125, + "num_input_tokens_seen": 145691888, + "step": 843 + }, + { + "epoch": 0.33733013589128696, + "grad_norm": 27.261036954593337, + "learning_rate": 5e-06, + "loss": 0.5289, + "num_input_tokens_seen": 145865024, + "step": 844 + }, + { + "epoch": 0.33733013589128696, + "loss": 0.6117612719535828, + "loss_ce": 0.02216166816651821, + "loss_xval": 0.58984375, + "num_input_tokens_seen": 145865024, + "step": 844 + }, + { + "epoch": 0.3377298161470823, + "grad_norm": 148.51182247944908, + "learning_rate": 5e-06, + "loss": 0.8651, + "num_input_tokens_seen": 146037824, + "step": 845 + }, + { + "epoch": 0.3377298161470823, + "loss": 0.6133238077163696, + "loss_ce": 0.017864754423499107, + "loss_xval": 0.59375, + "num_input_tokens_seen": 146037824, + "step": 845 + }, + { + "epoch": 0.3381294964028777, + "grad_norm": 71.55720025246332, + "learning_rate": 5e-06, + "loss": 0.5425, + "num_input_tokens_seen": 146210432, + "step": 846 + }, + { + "epoch": 0.3381294964028777, + "loss": 0.8297093510627747, + "loss_ce": 0.014645876362919807, + "loss_xval": 0.81640625, + "num_input_tokens_seen": 146210432, + "step": 846 + }, + { + "epoch": 0.33852917665867305, + "grad_norm": 97.5892415923415, + "learning_rate": 5e-06, + "loss": 0.6328, + "num_input_tokens_seen": 146383528, + "step": 847 + }, + { + "epoch": 0.33852917665867305, + "loss": 0.6266300082206726, + "loss_ce": 0.0211612731218338, 
+ "loss_xval": 0.60546875, + "num_input_tokens_seen": 146383528, + "step": 847 + }, + { + "epoch": 0.33892885691446845, + "grad_norm": 153.6287910779392, + "learning_rate": 5e-06, + "loss": 0.6496, + "num_input_tokens_seen": 146555992, + "step": 848 + }, + { + "epoch": 0.33892885691446845, + "loss": 0.628968358039856, + "loss_ce": 0.012330153957009315, + "loss_xval": 0.6171875, + "num_input_tokens_seen": 146555992, + "step": 848 + }, + { + "epoch": 0.3393285371702638, + "grad_norm": 119.93950349993199, + "learning_rate": 5e-06, + "loss": 0.47, + "num_input_tokens_seen": 146728976, + "step": 849 + }, + { + "epoch": 0.3393285371702638, + "loss": 0.5641285181045532, + "loss_ce": 0.013103111647069454, + "loss_xval": 0.55078125, + "num_input_tokens_seen": 146728976, + "step": 849 + }, + { + "epoch": 0.33972821742605913, + "grad_norm": 151.5642271469956, + "learning_rate": 5e-06, + "loss": 0.828, + "num_input_tokens_seen": 146901664, + "step": 850 + }, + { + "epoch": 0.33972821742605913, + "loss": 0.6333939433097839, + "loss_ce": 0.009858794510364532, + "loss_xval": 0.625, + "num_input_tokens_seen": 146901664, + "step": 850 + }, + { + "epoch": 0.34012789768185453, + "grad_norm": 63.2888069151697, + "learning_rate": 5e-06, + "loss": 0.5584, + "num_input_tokens_seen": 147074640, + "step": 851 + }, + { + "epoch": 0.34012789768185453, + "loss": 0.6273359656333923, + "loss_ce": 0.010331545025110245, + "loss_xval": 0.6171875, + "num_input_tokens_seen": 147074640, + "step": 851 + }, + { + "epoch": 0.3405275779376499, + "grad_norm": 133.0194702803221, + "learning_rate": 5e-06, + "loss": 0.9838, + "num_input_tokens_seen": 147247328, + "step": 852 + }, + { + "epoch": 0.3405275779376499, + "loss": 1.4191014766693115, + "loss_ce": 0.011447655037045479, + "loss_xval": 1.40625, + "num_input_tokens_seen": 147247328, + "step": 852 + }, + { + "epoch": 0.34092725819344527, + "grad_norm": 67.27535330850881, + "learning_rate": 5e-06, + "loss": 0.4894, + "num_input_tokens_seen": 147420576, + "step": 853 + }, + { + "epoch": 0.34092725819344527, + "loss": 0.5238457918167114, + "loss_ce": 0.008770117536187172, + "loss_xval": 0.515625, + "num_input_tokens_seen": 147420576, + "step": 853 + }, + { + "epoch": 0.3413269384492406, + "grad_norm": 155.66643248889466, + "learning_rate": 5e-06, + "loss": 0.7028, + "num_input_tokens_seen": 147593552, + "step": 854 + }, + { + "epoch": 0.3413269384492406, + "loss": 0.6921306252479553, + "loss_ce": 0.020133551210165024, + "loss_xval": 0.671875, + "num_input_tokens_seen": 147593552, + "step": 854 + }, + { + "epoch": 0.34172661870503596, + "grad_norm": 62.82339145784998, + "learning_rate": 5e-06, + "loss": 0.4399, + "num_input_tokens_seen": 147766336, + "step": 855 + }, + { + "epoch": 0.34172661870503596, + "loss": 0.4368290901184082, + "loss_ce": 0.008331773802638054, + "loss_xval": 0.427734375, + "num_input_tokens_seen": 147766336, + "step": 855 + }, + { + "epoch": 0.34212629896083135, + "grad_norm": 169.0372459131588, + "learning_rate": 5e-06, + "loss": 1.1117, + "num_input_tokens_seen": 147939288, + "step": 856 + }, + { + "epoch": 0.34212629896083135, + "loss": 1.1313327550888062, + "loss_ce": 0.01048314105719328, + "loss_xval": 1.1171875, + "num_input_tokens_seen": 147939288, + "step": 856 + }, + { + "epoch": 0.3425259792166267, + "grad_norm": 160.87608512503493, + "learning_rate": 5e-06, + "loss": 0.9064, + "num_input_tokens_seen": 148112320, + "step": 857 + }, + { + "epoch": 0.3425259792166267, + "loss": 1.256239891052246, + "loss_ce": 0.008559215813875198, + "loss_xval": 1.25, 
+ "num_input_tokens_seen": 148112320, + "step": 857 + }, + { + "epoch": 0.34292565947242204, + "grad_norm": 105.44623396035678, + "learning_rate": 5e-06, + "loss": 0.7052, + "num_input_tokens_seen": 148285280, + "step": 858 + }, + { + "epoch": 0.34292565947242204, + "loss": 0.8094460368156433, + "loss_ce": 0.010129651054739952, + "loss_xval": 0.80078125, + "num_input_tokens_seen": 148285280, + "step": 858 + }, + { + "epoch": 0.34332533972821744, + "grad_norm": 120.42122131819227, + "learning_rate": 5e-06, + "loss": 0.737, + "num_input_tokens_seen": 148457720, + "step": 859 + }, + { + "epoch": 0.34332533972821744, + "loss": 0.742914080619812, + "loss_ce": 0.009515605866909027, + "loss_xval": 0.734375, + "num_input_tokens_seen": 148457720, + "step": 859 + }, + { + "epoch": 0.3437250199840128, + "grad_norm": 130.49015429141713, + "learning_rate": 5e-06, + "loss": 0.7364, + "num_input_tokens_seen": 148631048, + "step": 860 + }, + { + "epoch": 0.3437250199840128, + "loss": 1.0588502883911133, + "loss_ce": 0.011364908888936043, + "loss_xval": 1.046875, + "num_input_tokens_seen": 148631048, + "step": 860 + }, + { + "epoch": 0.3441247002398082, + "grad_norm": 136.66847470521824, + "learning_rate": 5e-06, + "loss": 0.8036, + "num_input_tokens_seen": 148804048, + "step": 861 + }, + { + "epoch": 0.3441247002398082, + "loss": 0.9435184597969055, + "loss_ce": 0.010290941223502159, + "loss_xval": 0.93359375, + "num_input_tokens_seen": 148804048, + "step": 861 + }, + { + "epoch": 0.3445243804956035, + "grad_norm": 150.40577419616318, + "learning_rate": 5e-06, + "loss": 0.8159, + "num_input_tokens_seen": 148977112, + "step": 862 + }, + { + "epoch": 0.3445243804956035, + "loss": 1.0028644800186157, + "loss_ce": 0.016902528703212738, + "loss_xval": 0.984375, + "num_input_tokens_seen": 148977112, + "step": 862 + }, + { + "epoch": 0.34492406075139886, + "grad_norm": 59.347926899056496, + "learning_rate": 5e-06, + "loss": 0.6159, + "num_input_tokens_seen": 149149576, + "step": 863 + }, + { + "epoch": 0.34492406075139886, + "loss": 0.5505508184432983, + "loss_ce": 0.01075590681284666, + "loss_xval": 0.5390625, + "num_input_tokens_seen": 149149576, + "step": 863 + }, + { + "epoch": 0.34532374100719426, + "grad_norm": 19.849931423983264, + "learning_rate": 5e-06, + "loss": 0.3624, + "num_input_tokens_seen": 149322448, + "step": 864 + }, + { + "epoch": 0.34532374100719426, + "loss": 0.39450711011886597, + "loss_ce": 0.02286403253674507, + "loss_xval": 0.37109375, + "num_input_tokens_seen": 149322448, + "step": 864 + }, + { + "epoch": 0.3457234212629896, + "grad_norm": 93.95032815890265, + "learning_rate": 5e-06, + "loss": 0.7113, + "num_input_tokens_seen": 149495400, + "step": 865 + }, + { + "epoch": 0.3457234212629896, + "loss": 0.6392983794212341, + "loss_ce": 0.007950708270072937, + "loss_xval": 0.6328125, + "num_input_tokens_seen": 149495400, + "step": 865 + }, + { + "epoch": 0.34612310151878495, + "grad_norm": 47.376116576369185, + "learning_rate": 5e-06, + "loss": 0.5306, + "num_input_tokens_seen": 149668592, + "step": 866 + }, + { + "epoch": 0.34612310151878495, + "loss": 0.8008941411972046, + "loss_ce": 0.008505244739353657, + "loss_xval": 0.79296875, + "num_input_tokens_seen": 149668592, + "step": 866 + }, + { + "epoch": 0.34652278177458035, + "grad_norm": 79.5279904225274, + "learning_rate": 5e-06, + "loss": 0.7948, + "num_input_tokens_seen": 149841352, + "step": 867 + }, + { + "epoch": 0.34652278177458035, + "loss": 0.7384788990020752, + "loss_ce": 0.010390488430857658, + "loss_xval": 0.7265625, + 
"num_input_tokens_seen": 149841352, + "step": 867 + }, + { + "epoch": 0.3469224620303757, + "grad_norm": 135.31620195017132, + "learning_rate": 5e-06, + "loss": 0.9246, + "num_input_tokens_seen": 150014072, + "step": 868 + }, + { + "epoch": 0.3469224620303757, + "loss": 1.0314879417419434, + "loss_ce": 0.010796924121677876, + "loss_xval": 1.0234375, + "num_input_tokens_seen": 150014072, + "step": 868 + }, + { + "epoch": 0.3473221422861711, + "grad_norm": 127.44588322169365, + "learning_rate": 5e-06, + "loss": 0.932, + "num_input_tokens_seen": 150187008, + "step": 869 + }, + { + "epoch": 0.3473221422861711, + "loss": 0.8235166668891907, + "loss_ce": 0.01760845072567463, + "loss_xval": 0.8046875, + "num_input_tokens_seen": 150187008, + "step": 869 + }, + { + "epoch": 0.34772182254196643, + "grad_norm": 135.87113372177404, + "learning_rate": 5e-06, + "loss": 0.7667, + "num_input_tokens_seen": 150359864, + "step": 870 + }, + { + "epoch": 0.34772182254196643, + "loss": 0.5125239491462708, + "loss_ce": 0.008007319644093513, + "loss_xval": 0.50390625, + "num_input_tokens_seen": 150359864, + "step": 870 + }, + { + "epoch": 0.3481215027977618, + "grad_norm": 146.7548911832383, + "learning_rate": 5e-06, + "loss": 0.736, + "num_input_tokens_seen": 150532848, + "step": 871 + }, + { + "epoch": 0.3481215027977618, + "loss": 0.9291001558303833, + "loss_ce": 0.009239314123988152, + "loss_xval": 0.91796875, + "num_input_tokens_seen": 150532848, + "step": 871 + }, + { + "epoch": 0.34852118305355717, + "grad_norm": 133.2402111732672, + "learning_rate": 5e-06, + "loss": 0.8006, + "num_input_tokens_seen": 150705504, + "step": 872 + }, + { + "epoch": 0.34852118305355717, + "loss": 0.8471918106079102, + "loss_ce": 0.01296328753232956, + "loss_xval": 0.8359375, + "num_input_tokens_seen": 150705504, + "step": 872 + }, + { + "epoch": 0.3489208633093525, + "grad_norm": 110.66209021170407, + "learning_rate": 5e-06, + "loss": 0.7153, + "num_input_tokens_seen": 150878208, + "step": 873 + }, + { + "epoch": 0.3489208633093525, + "loss": 0.5486522316932678, + "loss_ce": 0.01294666901230812, + "loss_xval": 0.53515625, + "num_input_tokens_seen": 150878208, + "step": 873 + }, + { + "epoch": 0.34932054356514786, + "grad_norm": 188.24093809918526, + "learning_rate": 5e-06, + "loss": 0.5398, + "num_input_tokens_seen": 151051624, + "step": 874 + }, + { + "epoch": 0.34932054356514786, + "loss": 0.646808922290802, + "loss_ce": 0.008014976046979427, + "loss_xval": 0.640625, + "num_input_tokens_seen": 151051624, + "step": 874 + }, + { + "epoch": 0.34972022382094325, + "grad_norm": 64.80165189770501, + "learning_rate": 5e-06, + "loss": 0.5948, + "num_input_tokens_seen": 151224760, + "step": 875 + }, + { + "epoch": 0.34972022382094325, + "loss": 0.7177197933197021, + "loss_ce": 0.008735395967960358, + "loss_xval": 0.7109375, + "num_input_tokens_seen": 151224760, + "step": 875 + }, + { + "epoch": 0.3501199040767386, + "grad_norm": 217.1609995782766, + "learning_rate": 5e-06, + "loss": 0.8025, + "num_input_tokens_seen": 151397592, + "step": 876 + }, + { + "epoch": 0.3501199040767386, + "loss": 0.7121882438659668, + "loss_ce": 0.014678522013127804, + "loss_xval": 0.69921875, + "num_input_tokens_seen": 151397592, + "step": 876 + }, + { + "epoch": 0.350519584332534, + "grad_norm": 36.431244186683855, + "learning_rate": 5e-06, + "loss": 0.8813, + "num_input_tokens_seen": 151570816, + "step": 877 + }, + { + "epoch": 0.350519584332534, + "loss": 0.89690101146698, + "loss_ce": 0.010609478689730167, + "loss_xval": 0.88671875, + 
"num_input_tokens_seen": 151570816, + "step": 877 + }, + { + "epoch": 0.35091926458832934, + "grad_norm": 177.67298531218174, + "learning_rate": 5e-06, + "loss": 0.7987, + "num_input_tokens_seen": 151744144, + "step": 878 + }, + { + "epoch": 0.35091926458832934, + "loss": 0.9523735046386719, + "loss_ce": 0.013164570555090904, + "loss_xval": 0.9375, + "num_input_tokens_seen": 151744144, + "step": 878 + }, + { + "epoch": 0.3513189448441247, + "grad_norm": 71.39991832784374, + "learning_rate": 5e-06, + "loss": 0.5297, + "num_input_tokens_seen": 151917624, + "step": 879 + }, + { + "epoch": 0.3513189448441247, + "loss": 0.5013617277145386, + "loss_ce": 0.012103933840990067, + "loss_xval": 0.48828125, + "num_input_tokens_seen": 151917624, + "step": 879 + }, + { + "epoch": 0.3517186250999201, + "grad_norm": 70.96128356278321, + "learning_rate": 5e-06, + "loss": 0.6057, + "num_input_tokens_seen": 152090752, + "step": 880 + }, + { + "epoch": 0.3517186250999201, + "loss": 0.4656725227832794, + "loss_ce": 0.016270659863948822, + "loss_xval": 0.44921875, + "num_input_tokens_seen": 152090752, + "step": 880 + }, + { + "epoch": 0.3521183053557154, + "grad_norm": 82.47791554411735, + "learning_rate": 5e-06, + "loss": 0.6792, + "num_input_tokens_seen": 152263832, + "step": 881 + }, + { + "epoch": 0.3521183053557154, + "loss": 0.7818809747695923, + "loss_ce": 0.015645675361156464, + "loss_xval": 0.765625, + "num_input_tokens_seen": 152263832, + "step": 881 + }, + { + "epoch": 0.35251798561151076, + "grad_norm": 60.893015026895945, + "learning_rate": 5e-06, + "loss": 0.6402, + "num_input_tokens_seen": 152436992, + "step": 882 + }, + { + "epoch": 0.35251798561151076, + "loss": 0.6701950430870056, + "loss_ce": 0.012968515045940876, + "loss_xval": 0.65625, + "num_input_tokens_seen": 152436992, + "step": 882 + }, + { + "epoch": 0.35291766586730616, + "grad_norm": 104.03511210850309, + "learning_rate": 5e-06, + "loss": 0.6192, + "num_input_tokens_seen": 152609648, + "step": 883 + }, + { + "epoch": 0.35291766586730616, + "loss": 0.603535532951355, + "loss_ce": 0.01662144437432289, + "loss_xval": 0.5859375, + "num_input_tokens_seen": 152609648, + "step": 883 + }, + { + "epoch": 0.3533173461231015, + "grad_norm": 22.54287001703602, + "learning_rate": 5e-06, + "loss": 0.4427, + "num_input_tokens_seen": 152782776, + "step": 884 + }, + { + "epoch": 0.3533173461231015, + "loss": 0.3957730233669281, + "loss_ce": 0.016134345903992653, + "loss_xval": 0.37890625, + "num_input_tokens_seen": 152782776, + "step": 884 + }, + { + "epoch": 0.3537170263788969, + "grad_norm": 84.24382922528733, + "learning_rate": 5e-06, + "loss": 0.6745, + "num_input_tokens_seen": 152955672, + "step": 885 + }, + { + "epoch": 0.3537170263788969, + "loss": 0.5475041270256042, + "loss_ce": 0.012897195294499397, + "loss_xval": 0.53515625, + "num_input_tokens_seen": 152955672, + "step": 885 + }, + { + "epoch": 0.35411670663469225, + "grad_norm": 88.2327166019281, + "learning_rate": 5e-06, + "loss": 0.539, + "num_input_tokens_seen": 153128288, + "step": 886 + }, + { + "epoch": 0.35411670663469225, + "loss": 0.502876877784729, + "loss_ce": 0.01227623037993908, + "loss_xval": 0.490234375, + "num_input_tokens_seen": 153128288, + "step": 886 + }, + { + "epoch": 0.3545163868904876, + "grad_norm": 128.72303277494018, + "learning_rate": 5e-06, + "loss": 0.7006, + "num_input_tokens_seen": 153300984, + "step": 887 + }, + { + "epoch": 0.3545163868904876, + "loss": 0.760562539100647, + "loss_ce": 0.010928753763437271, + "loss_xval": 0.75, + "num_input_tokens_seen": 
153300984, + "step": 887 + }, + { + "epoch": 0.354916067146283, + "grad_norm": 54.2554350644671, + "learning_rate": 5e-06, + "loss": 0.5323, + "num_input_tokens_seen": 153474224, + "step": 888 + }, + { + "epoch": 0.354916067146283, + "loss": 0.6280389428138733, + "loss_ce": 0.010485243052244186, + "loss_xval": 0.6171875, + "num_input_tokens_seen": 153474224, + "step": 888 + }, + { + "epoch": 0.35531574740207833, + "grad_norm": 129.45656745633136, + "learning_rate": 5e-06, + "loss": 0.7172, + "num_input_tokens_seen": 153647280, + "step": 889 + }, + { + "epoch": 0.35531574740207833, + "loss": 0.9391533136367798, + "loss_ce": 0.008367151021957397, + "loss_xval": 0.9296875, + "num_input_tokens_seen": 153647280, + "step": 889 + }, + { + "epoch": 0.35571542765787373, + "grad_norm": 44.89891618018207, + "learning_rate": 5e-06, + "loss": 0.4303, + "num_input_tokens_seen": 153820864, + "step": 890 + }, + { + "epoch": 0.35571542765787373, + "loss": 0.4807822108268738, + "loss_ce": 0.012703584507107735, + "loss_xval": 0.46875, + "num_input_tokens_seen": 153820864, + "step": 890 + }, + { + "epoch": 0.35611510791366907, + "grad_norm": 116.72687188293727, + "learning_rate": 5e-06, + "loss": 0.5185, + "num_input_tokens_seen": 153994288, + "step": 891 + }, + { + "epoch": 0.35611510791366907, + "loss": 0.284774512052536, + "loss_ce": 0.010848723351955414, + "loss_xval": 0.2734375, + "num_input_tokens_seen": 153994288, + "step": 891 + }, + { + "epoch": 0.3565147881694644, + "grad_norm": 26.176387267105216, + "learning_rate": 5e-06, + "loss": 0.3183, + "num_input_tokens_seen": 154167208, + "step": 892 + }, + { + "epoch": 0.3565147881694644, + "loss": 0.1847338080406189, + "loss_ce": 0.009440846741199493, + "loss_xval": 0.17578125, + "num_input_tokens_seen": 154167208, + "step": 892 + }, + { + "epoch": 0.3569144684252598, + "grad_norm": 165.85099821106596, + "learning_rate": 5e-06, + "loss": 0.6986, + "num_input_tokens_seen": 154339944, + "step": 893 + }, + { + "epoch": 0.3569144684252598, + "loss": 0.6996178030967712, + "loss_ce": 0.008089495822787285, + "loss_xval": 0.69140625, + "num_input_tokens_seen": 154339944, + "step": 893 + }, + { + "epoch": 0.35731414868105515, + "grad_norm": 60.57304398442198, + "learning_rate": 5e-06, + "loss": 0.6213, + "num_input_tokens_seen": 154513144, + "step": 894 + }, + { + "epoch": 0.35731414868105515, + "loss": 0.6562942266464233, + "loss_ce": 0.0067581310868263245, + "loss_xval": 0.6484375, + "num_input_tokens_seen": 154513144, + "step": 894 + }, + { + "epoch": 0.3577138289368505, + "grad_norm": 88.21158150555097, + "learning_rate": 5e-06, + "loss": 0.9994, + "num_input_tokens_seen": 154686024, + "step": 895 + }, + { + "epoch": 0.3577138289368505, + "loss": 0.9512639045715332, + "loss_ce": 0.006012419238686562, + "loss_xval": 0.9453125, + "num_input_tokens_seen": 154686024, + "step": 895 + }, + { + "epoch": 0.3581135091926459, + "grad_norm": 55.98236696389787, + "learning_rate": 5e-06, + "loss": 0.8411, + "num_input_tokens_seen": 154858752, + "step": 896 + }, + { + "epoch": 0.3581135091926459, + "loss": 0.9027426242828369, + "loss_ce": 0.008821753785014153, + "loss_xval": 0.89453125, + "num_input_tokens_seen": 154858752, + "step": 896 + }, + { + "epoch": 0.35851318944844124, + "grad_norm": 86.30176471176888, + "learning_rate": 5e-06, + "loss": 0.5935, + "num_input_tokens_seen": 155031920, + "step": 897 + }, + { + "epoch": 0.35851318944844124, + "loss": 0.8349786400794983, + "loss_ce": 0.012407823465764523, + "loss_xval": 0.82421875, + "num_input_tokens_seen": 155031920, 
+ "step": 897 + }, + { + "epoch": 0.35891286970423664, + "grad_norm": 47.41906607681754, + "learning_rate": 5e-06, + "loss": 0.5413, + "num_input_tokens_seen": 155205008, + "step": 898 + }, + { + "epoch": 0.35891286970423664, + "loss": 0.5134440660476685, + "loss_ce": 0.007950928062200546, + "loss_xval": 0.50390625, + "num_input_tokens_seen": 155205008, + "step": 898 + }, + { + "epoch": 0.359312549960032, + "grad_norm": 30.61976486692005, + "learning_rate": 5e-06, + "loss": 0.6285, + "num_input_tokens_seen": 155378224, + "step": 899 + }, + { + "epoch": 0.359312549960032, + "loss": 0.762615442276001, + "loss_ce": 0.012127195484936237, + "loss_xval": 0.75, + "num_input_tokens_seen": 155378224, + "step": 899 + }, + { + "epoch": 0.3597122302158273, + "grad_norm": 43.585381340210624, + "learning_rate": 5e-06, + "loss": 0.4863, + "num_input_tokens_seen": 155551440, + "step": 900 + }, + { + "epoch": 0.3597122302158273, + "loss": 0.48361706733703613, + "loss_ce": 0.008275268599390984, + "loss_xval": 0.474609375, + "num_input_tokens_seen": 155551440, + "step": 900 + }, + { + "epoch": 0.3601119104716227, + "grad_norm": 30.9989474941419, + "learning_rate": 5e-06, + "loss": 0.4965, + "num_input_tokens_seen": 155724368, + "step": 901 + }, + { + "epoch": 0.3601119104716227, + "loss": 0.49682554602622986, + "loss_ce": 0.006957381498068571, + "loss_xval": 0.490234375, + "num_input_tokens_seen": 155724368, + "step": 901 + }, + { + "epoch": 0.36051159072741806, + "grad_norm": 113.42580772320302, + "learning_rate": 5e-06, + "loss": 0.6446, + "num_input_tokens_seen": 155897552, + "step": 902 + }, + { + "epoch": 0.36051159072741806, + "loss": 0.9667686223983765, + "loss_ce": 0.007417993154376745, + "loss_xval": 0.9609375, + "num_input_tokens_seen": 155897552, + "step": 902 + }, + { + "epoch": 0.3609112709832134, + "grad_norm": 101.77745338904332, + "learning_rate": 5e-06, + "loss": 0.4489, + "num_input_tokens_seen": 156070800, + "step": 903 + }, + { + "epoch": 0.3609112709832134, + "loss": 0.30879414081573486, + "loss_ce": 0.004655956290662289, + "loss_xval": 0.3046875, + "num_input_tokens_seen": 156070800, + "step": 903 + }, + { + "epoch": 0.3613109512390088, + "grad_norm": 98.20446936043231, + "learning_rate": 5e-06, + "loss": 0.7204, + "num_input_tokens_seen": 156243696, + "step": 904 + }, + { + "epoch": 0.3613109512390088, + "loss": 0.5402124524116516, + "loss_ce": 0.008504673838615417, + "loss_xval": 0.53125, + "num_input_tokens_seen": 156243696, + "step": 904 + }, + { + "epoch": 0.36171063149480415, + "grad_norm": 110.94970569500595, + "learning_rate": 5e-06, + "loss": 0.5071, + "num_input_tokens_seen": 156416472, + "step": 905 + }, + { + "epoch": 0.36171063149480415, + "loss": 0.33861905336380005, + "loss_ce": 0.012080967426300049, + "loss_xval": 0.326171875, + "num_input_tokens_seen": 156416472, + "step": 905 + }, + { + "epoch": 0.36211031175059955, + "grad_norm": 125.05593579343002, + "learning_rate": 5e-06, + "loss": 0.4928, + "num_input_tokens_seen": 156589520, + "step": 906 + }, + { + "epoch": 0.36211031175059955, + "loss": 0.6595361232757568, + "loss_ce": 0.008992912247776985, + "loss_xval": 0.65234375, + "num_input_tokens_seen": 156589520, + "step": 906 + }, + { + "epoch": 0.3625099920063949, + "grad_norm": 73.10342927626562, + "learning_rate": 5e-06, + "loss": 0.8154, + "num_input_tokens_seen": 156762224, + "step": 907 + }, + { + "epoch": 0.3625099920063949, + "loss": 0.6952431797981262, + "loss_ce": 0.009085968136787415, + "loss_xval": 0.6875, + "num_input_tokens_seen": 156762224, + "step": 907 
+ }, + { + "epoch": 0.36290967226219023, + "grad_norm": 126.39497282050058, + "learning_rate": 5e-06, + "loss": 1.1672, + "num_input_tokens_seen": 156935288, + "step": 908 + }, + { + "epoch": 0.36290967226219023, + "loss": 0.9857476949691772, + "loss_ce": 0.006133475806564093, + "loss_xval": 0.98046875, + "num_input_tokens_seen": 156935288, + "step": 908 + }, + { + "epoch": 0.36330935251798563, + "grad_norm": 54.826078470566614, + "learning_rate": 5e-06, + "loss": 0.504, + "num_input_tokens_seen": 157108208, + "step": 909 + }, + { + "epoch": 0.36330935251798563, + "loss": 0.29077398777008057, + "loss_ce": 0.0048853312619030476, + "loss_xval": 0.28515625, + "num_input_tokens_seen": 157108208, + "step": 909 + }, + { + "epoch": 0.36370903277378097, + "grad_norm": 71.30519374447884, + "learning_rate": 5e-06, + "loss": 0.6028, + "num_input_tokens_seen": 157281176, + "step": 910 + }, + { + "epoch": 0.36370903277378097, + "loss": 0.48019158840179443, + "loss_ce": 0.007047041319310665, + "loss_xval": 0.47265625, + "num_input_tokens_seen": 157281176, + "step": 910 + }, + { + "epoch": 0.3641087130295763, + "grad_norm": 52.76501098451776, + "learning_rate": 5e-06, + "loss": 0.6989, + "num_input_tokens_seen": 157454056, + "step": 911 + }, + { + "epoch": 0.3641087130295763, + "loss": 0.8536124229431152, + "loss_ce": 0.005712021142244339, + "loss_xval": 0.84765625, + "num_input_tokens_seen": 157454056, + "step": 911 + }, + { + "epoch": 0.3645083932853717, + "grad_norm": 70.87644756607571, + "learning_rate": 5e-06, + "loss": 0.5284, + "num_input_tokens_seen": 157626944, + "step": 912 + }, + { + "epoch": 0.3645083932853717, + "loss": 0.7316247224807739, + "loss_ce": 0.004940135404467583, + "loss_xval": 0.7265625, + "num_input_tokens_seen": 157626944, + "step": 912 + }, + { + "epoch": 0.36490807354116706, + "grad_norm": 102.12641205437491, + "learning_rate": 5e-06, + "loss": 0.7, + "num_input_tokens_seen": 157800248, + "step": 913 + }, + { + "epoch": 0.36490807354116706, + "loss": 0.8764115571975708, + "loss_ce": 0.00440229382365942, + "loss_xval": 0.87109375, + "num_input_tokens_seen": 157800248, + "step": 913 + }, + { + "epoch": 0.36530775379696245, + "grad_norm": 79.45654741039287, + "learning_rate": 5e-06, + "loss": 0.6452, + "num_input_tokens_seen": 157973304, + "step": 914 + }, + { + "epoch": 0.36530775379696245, + "loss": 0.3669845163822174, + "loss_ce": 0.009074367582798004, + "loss_xval": 0.357421875, + "num_input_tokens_seen": 157973304, + "step": 914 + }, + { + "epoch": 0.3657074340527578, + "grad_norm": 124.30904914641106, + "learning_rate": 5e-06, + "loss": 0.7281, + "num_input_tokens_seen": 158146416, + "step": 915 + }, + { + "epoch": 0.3657074340527578, + "loss": 0.82874596118927, + "loss_ce": 0.009165898896753788, + "loss_xval": 0.8203125, + "num_input_tokens_seen": 158146416, + "step": 915 + }, + { + "epoch": 0.36610711430855314, + "grad_norm": 42.101453883892475, + "learning_rate": 5e-06, + "loss": 0.6469, + "num_input_tokens_seen": 158319176, + "step": 916 + }, + { + "epoch": 0.36610711430855314, + "loss": 0.7069438695907593, + "loss_ce": 0.017673827707767487, + "loss_xval": 0.6875, + "num_input_tokens_seen": 158319176, + "step": 916 + }, + { + "epoch": 0.36650679456434854, + "grad_norm": 162.51287996615943, + "learning_rate": 5e-06, + "loss": 0.6696, + "num_input_tokens_seen": 158492232, + "step": 917 + }, + { + "epoch": 0.36650679456434854, + "loss": 0.90346360206604, + "loss_ce": 0.005209219641983509, + "loss_xval": 0.8984375, + "num_input_tokens_seen": 158492232, + "step": 917 + }, + 
{ + "epoch": 0.3669064748201439, + "grad_norm": 100.54356211089177, + "learning_rate": 5e-06, + "loss": 0.5464, + "num_input_tokens_seen": 158665144, + "step": 918 + }, + { + "epoch": 0.3669064748201439, + "loss": 0.5094872713088989, + "loss_ce": 0.011562451720237732, + "loss_xval": 0.498046875, + "num_input_tokens_seen": 158665144, + "step": 918 + }, + { + "epoch": 0.3673061550759392, + "grad_norm": 76.05379399514754, + "learning_rate": 5e-06, + "loss": 0.7202, + "num_input_tokens_seen": 158838016, + "step": 919 + }, + { + "epoch": 0.3673061550759392, + "loss": 0.7858811616897583, + "loss_ce": 0.008903573267161846, + "loss_xval": 0.77734375, + "num_input_tokens_seen": 158838016, + "step": 919 + }, + { + "epoch": 0.3677058353317346, + "grad_norm": 90.74075761256418, + "learning_rate": 5e-06, + "loss": 0.9269, + "num_input_tokens_seen": 159011192, + "step": 920 + }, + { + "epoch": 0.3677058353317346, + "loss": 0.9359513521194458, + "loss_ce": 0.00675212824717164, + "loss_xval": 0.9296875, + "num_input_tokens_seen": 159011192, + "step": 920 + }, + { + "epoch": 0.36810551558752996, + "grad_norm": 45.087533766808335, + "learning_rate": 5e-06, + "loss": 0.683, + "num_input_tokens_seen": 159184136, + "step": 921 + }, + { + "epoch": 0.36810551558752996, + "loss": 0.9866589307785034, + "loss_ce": 0.005701903253793716, + "loss_xval": 0.98046875, + "num_input_tokens_seen": 159184136, + "step": 921 + }, + { + "epoch": 0.36850519584332536, + "grad_norm": 107.55288335570066, + "learning_rate": 5e-06, + "loss": 0.676, + "num_input_tokens_seen": 159356600, + "step": 922 + }, + { + "epoch": 0.36850519584332536, + "loss": 0.4224510192871094, + "loss_ce": 0.007106784265488386, + "loss_xval": 0.416015625, + "num_input_tokens_seen": 159356600, + "step": 922 + }, + { + "epoch": 0.3689048760991207, + "grad_norm": 54.68109555714196, + "learning_rate": 5e-06, + "loss": 0.7, + "num_input_tokens_seen": 159529752, + "step": 923 + }, + { + "epoch": 0.3689048760991207, + "loss": 0.4360928535461426, + "loss_ce": 0.011532355099916458, + "loss_xval": 0.423828125, + "num_input_tokens_seen": 159529752, + "step": 923 + }, + { + "epoch": 0.36930455635491605, + "grad_norm": 42.085657682472316, + "learning_rate": 5e-06, + "loss": 0.5586, + "num_input_tokens_seen": 159702560, + "step": 924 + }, + { + "epoch": 0.36930455635491605, + "loss": 0.6730107069015503, + "loss_ce": 0.005652267951518297, + "loss_xval": 0.66796875, + "num_input_tokens_seen": 159702560, + "step": 924 + }, + { + "epoch": 0.36970423661071145, + "grad_norm": 47.95856995776537, + "learning_rate": 5e-06, + "loss": 1.0306, + "num_input_tokens_seen": 159875480, + "step": 925 + }, + { + "epoch": 0.36970423661071145, + "loss": 1.4013919830322266, + "loss_ce": 0.005395848304033279, + "loss_xval": 1.3984375, + "num_input_tokens_seen": 159875480, + "step": 925 + }, + { + "epoch": 0.3701039168665068, + "grad_norm": 63.74419610065917, + "learning_rate": 5e-06, + "loss": 0.8646, + "num_input_tokens_seen": 160048464, + "step": 926 + }, + { + "epoch": 0.3701039168665068, + "loss": 0.7054557800292969, + "loss_ce": 0.008556396700441837, + "loss_xval": 0.6953125, + "num_input_tokens_seen": 160048464, + "step": 926 + }, + { + "epoch": 0.37050359712230213, + "grad_norm": 120.90791495364564, + "learning_rate": 5e-06, + "loss": 0.6159, + "num_input_tokens_seen": 160217688, + "step": 927 + }, + { + "epoch": 0.37050359712230213, + "loss": 0.663061261177063, + "loss_ce": 0.006322955247014761, + "loss_xval": 0.65625, + "num_input_tokens_seen": 160217688, + "step": 927 + }, + { + 
"epoch": 0.37090327737809753, + "grad_norm": 30.76703590033193, + "learning_rate": 5e-06, + "loss": 0.5309, + "num_input_tokens_seen": 160390592, + "step": 928 + }, + { + "epoch": 0.37090327737809753, + "loss": 0.5543885231018066, + "loss_ce": 0.0052857049740850925, + "loss_xval": 0.55078125, + "num_input_tokens_seen": 160390592, + "step": 928 + }, + { + "epoch": 0.37130295763389287, + "grad_norm": 113.83660820851645, + "learning_rate": 5e-06, + "loss": 0.5308, + "num_input_tokens_seen": 160563232, + "step": 929 + }, + { + "epoch": 0.37130295763389287, + "loss": 0.59651780128479, + "loss_ce": 0.00856616348028183, + "loss_xval": 0.58984375, + "num_input_tokens_seen": 160563232, + "step": 929 + }, + { + "epoch": 0.37170263788968827, + "grad_norm": 47.41293456779943, + "learning_rate": 5e-06, + "loss": 0.708, + "num_input_tokens_seen": 160736496, + "step": 930 + }, + { + "epoch": 0.37170263788968827, + "loss": 0.9490618705749512, + "loss_ce": 0.00680114608258009, + "loss_xval": 0.94140625, + "num_input_tokens_seen": 160736496, + "step": 930 + }, + { + "epoch": 0.3721023181454836, + "grad_norm": 158.06844665795316, + "learning_rate": 5e-06, + "loss": 0.644, + "num_input_tokens_seen": 160909248, + "step": 931 + }, + { + "epoch": 0.3721023181454836, + "loss": 0.5230578184127808, + "loss_ce": 0.005845884792506695, + "loss_xval": 0.515625, + "num_input_tokens_seen": 160909248, + "step": 931 + }, + { + "epoch": 0.37250199840127896, + "grad_norm": 51.036944593601305, + "learning_rate": 5e-06, + "loss": 0.7556, + "num_input_tokens_seen": 161082424, + "step": 932 + }, + { + "epoch": 0.37250199840127896, + "loss": 0.789161205291748, + "loss_ce": 0.00980327744036913, + "loss_xval": 0.78125, + "num_input_tokens_seen": 161082424, + "step": 932 + }, + { + "epoch": 0.37290167865707435, + "grad_norm": 64.86944261600239, + "learning_rate": 5e-06, + "loss": 0.5996, + "num_input_tokens_seen": 161251584, + "step": 933 + }, + { + "epoch": 0.37290167865707435, + "loss": 0.6159493923187256, + "loss_ce": 0.007917143404483795, + "loss_xval": 0.609375, + "num_input_tokens_seen": 161251584, + "step": 933 + }, + { + "epoch": 0.3733013589128697, + "grad_norm": 70.73385265509664, + "learning_rate": 5e-06, + "loss": 0.5539, + "num_input_tokens_seen": 161424536, + "step": 934 + }, + { + "epoch": 0.3733013589128697, + "loss": 0.46938377618789673, + "loss_ce": 0.007774879224598408, + "loss_xval": 0.4609375, + "num_input_tokens_seen": 161424536, + "step": 934 + }, + { + "epoch": 0.3737010391686651, + "grad_norm": 91.39872145012366, + "learning_rate": 5e-06, + "loss": 0.5084, + "num_input_tokens_seen": 161597208, + "step": 935 + }, + { + "epoch": 0.3737010391686651, + "loss": 0.5436455011367798, + "loss_ce": 0.005315391346812248, + "loss_xval": 0.5390625, + "num_input_tokens_seen": 161597208, + "step": 935 + }, + { + "epoch": 0.37410071942446044, + "grad_norm": 34.91207932565985, + "learning_rate": 5e-06, + "loss": 0.6474, + "num_input_tokens_seen": 161770632, + "step": 936 + }, + { + "epoch": 0.37410071942446044, + "loss": 0.6301032304763794, + "loss_ce": 0.008216038346290588, + "loss_xval": 0.62109375, + "num_input_tokens_seen": 161770632, + "step": 936 + }, + { + "epoch": 0.3745003996802558, + "grad_norm": 64.44416456277416, + "learning_rate": 5e-06, + "loss": 0.5532, + "num_input_tokens_seen": 161943488, + "step": 937 + }, + { + "epoch": 0.3745003996802558, + "loss": 0.6335973739624023, + "loss_ce": 0.007498729042708874, + "loss_xval": 0.625, + "num_input_tokens_seen": 161943488, + "step": 937 + }, + { + "epoch": 
0.3749000799360512, + "grad_norm": 40.232602977833224, + "learning_rate": 5e-06, + "loss": 0.6844, + "num_input_tokens_seen": 162116368, + "step": 938 + }, + { + "epoch": 0.3749000799360512, + "loss": 0.6123180389404297, + "loss_ce": 0.005567554850131273, + "loss_xval": 0.60546875, + "num_input_tokens_seen": 162116368, + "step": 938 + }, + { + "epoch": 0.3752997601918465, + "grad_norm": 53.75798255647753, + "learning_rate": 5e-06, + "loss": 0.7675, + "num_input_tokens_seen": 162289448, + "step": 939 + }, + { + "epoch": 0.3752997601918465, + "loss": 0.9261175990104675, + "loss_ce": 0.014862729236483574, + "loss_xval": 0.91015625, + "num_input_tokens_seen": 162289448, + "step": 939 + }, + { + "epoch": 0.37569944044764186, + "grad_norm": 39.58720080674575, + "learning_rate": 5e-06, + "loss": 0.8083, + "num_input_tokens_seen": 162462168, + "step": 940 + }, + { + "epoch": 0.37569944044764186, + "loss": 0.8281588554382324, + "loss_ce": 0.00760222552344203, + "loss_xval": 0.8203125, + "num_input_tokens_seen": 162462168, + "step": 940 + }, + { + "epoch": 0.37609912070343726, + "grad_norm": 83.49540063053209, + "learning_rate": 5e-06, + "loss": 0.5544, + "num_input_tokens_seen": 162635232, + "step": 941 + }, + { + "epoch": 0.37609912070343726, + "loss": 0.6313626766204834, + "loss_ce": 0.008254722692072392, + "loss_xval": 0.625, + "num_input_tokens_seen": 162635232, + "step": 941 + }, + { + "epoch": 0.3764988009592326, + "grad_norm": 25.10804339460941, + "learning_rate": 5e-06, + "loss": 0.4647, + "num_input_tokens_seen": 162808504, + "step": 942 + }, + { + "epoch": 0.3764988009592326, + "loss": 0.6199823021888733, + "loss_ce": 0.006884154863655567, + "loss_xval": 0.61328125, + "num_input_tokens_seen": 162808504, + "step": 942 + }, + { + "epoch": 0.376898481215028, + "grad_norm": 117.86457604419098, + "learning_rate": 5e-06, + "loss": 0.4543, + "num_input_tokens_seen": 162981520, + "step": 943 + }, + { + "epoch": 0.376898481215028, + "loss": 0.6796769499778748, + "loss_ce": 0.008473317138850689, + "loss_xval": 0.671875, + "num_input_tokens_seen": 162981520, + "step": 943 + }, + { + "epoch": 0.37729816147082335, + "grad_norm": 86.42324771771638, + "learning_rate": 5e-06, + "loss": 0.7931, + "num_input_tokens_seen": 163154416, + "step": 944 + }, + { + "epoch": 0.37729816147082335, + "loss": 0.7076585292816162, + "loss_ce": 0.005021838005632162, + "loss_xval": 0.703125, + "num_input_tokens_seen": 163154416, + "step": 944 + }, + { + "epoch": 0.3776978417266187, + "grad_norm": 88.55505827409151, + "learning_rate": 5e-06, + "loss": 0.6935, + "num_input_tokens_seen": 163327568, + "step": 945 + }, + { + "epoch": 0.3776978417266187, + "loss": 0.789901852607727, + "loss_ce": 0.00938426237553358, + "loss_xval": 0.78125, + "num_input_tokens_seen": 163327568, + "step": 945 + }, + { + "epoch": 0.3780975219824141, + "grad_norm": 59.41303258965943, + "learning_rate": 5e-06, + "loss": 0.4882, + "num_input_tokens_seen": 163500632, + "step": 946 + }, + { + "epoch": 0.3780975219824141, + "loss": 0.64866042137146, + "loss_ce": 0.006875764578580856, + "loss_xval": 0.640625, + "num_input_tokens_seen": 163500632, + "step": 946 + }, + { + "epoch": 0.37849720223820943, + "grad_norm": 79.47629063266275, + "learning_rate": 5e-06, + "loss": 0.6103, + "num_input_tokens_seen": 163673792, + "step": 947 + }, + { + "epoch": 0.37849720223820943, + "loss": 0.34968358278274536, + "loss_ce": 0.004804443567991257, + "loss_xval": 0.345703125, + "num_input_tokens_seen": 163673792, + "step": 947 + }, + { + "epoch": 0.37889688249400477, + 
"grad_norm": 85.59131590620106, + "learning_rate": 5e-06, + "loss": 0.7057, + "num_input_tokens_seen": 163846624, + "step": 948 + }, + { + "epoch": 0.37889688249400477, + "loss": 0.6663787961006165, + "loss_ce": 0.004757732152938843, + "loss_xval": 0.66015625, + "num_input_tokens_seen": 163846624, + "step": 948 + }, + { + "epoch": 0.37929656274980017, + "grad_norm": 76.63253940661868, + "learning_rate": 5e-06, + "loss": 0.6282, + "num_input_tokens_seen": 164019440, + "step": 949 + }, + { + "epoch": 0.37929656274980017, + "loss": 0.7174029350280762, + "loss_ce": 0.006465459242463112, + "loss_xval": 0.7109375, + "num_input_tokens_seen": 164019440, + "step": 949 + }, + { + "epoch": 0.3796962430055955, + "grad_norm": 108.31628507448607, + "learning_rate": 5e-06, + "loss": 0.5647, + "num_input_tokens_seen": 164192424, + "step": 950 + }, + { + "epoch": 0.3796962430055955, + "loss": 0.6702345609664917, + "loss_ce": 0.005500704515725374, + "loss_xval": 0.6640625, + "num_input_tokens_seen": 164192424, + "step": 950 + }, + { + "epoch": 0.3800959232613909, + "grad_norm": 187.11072677320212, + "learning_rate": 5e-06, + "loss": 0.7636, + "num_input_tokens_seen": 164365360, + "step": 951 + }, + { + "epoch": 0.3800959232613909, + "loss": 0.6925037503242493, + "loss_ce": 0.007933435961604118, + "loss_xval": 0.68359375, + "num_input_tokens_seen": 164365360, + "step": 951 + }, + { + "epoch": 0.38049560351718625, + "grad_norm": 103.05122060180793, + "learning_rate": 5e-06, + "loss": 0.5753, + "num_input_tokens_seen": 164538256, + "step": 952 + }, + { + "epoch": 0.38049560351718625, + "loss": 0.4882552921772003, + "loss_ce": 0.007603436708450317, + "loss_xval": 0.48046875, + "num_input_tokens_seen": 164538256, + "step": 952 + }, + { + "epoch": 0.3808952837729816, + "grad_norm": 96.4099328188074, + "learning_rate": 5e-06, + "loss": 0.4646, + "num_input_tokens_seen": 164711880, + "step": 953 + }, + { + "epoch": 0.3808952837729816, + "loss": 0.3607138395309448, + "loss_ce": 0.008541014045476913, + "loss_xval": 0.3515625, + "num_input_tokens_seen": 164711880, + "step": 953 + }, + { + "epoch": 0.381294964028777, + "grad_norm": 67.6984785917304, + "learning_rate": 5e-06, + "loss": 0.8889, + "num_input_tokens_seen": 164881448, + "step": 954 + }, + { + "epoch": 0.381294964028777, + "loss": 0.927307665348053, + "loss_ce": 0.01062064804136753, + "loss_xval": 0.91796875, + "num_input_tokens_seen": 164881448, + "step": 954 + }, + { + "epoch": 0.38169464428457234, + "grad_norm": 153.94541262913765, + "learning_rate": 5e-06, + "loss": 0.8191, + "num_input_tokens_seen": 165054160, + "step": 955 + }, + { + "epoch": 0.38169464428457234, + "loss": 0.47236108779907227, + "loss_ce": 0.006662844214588404, + "loss_xval": 0.46484375, + "num_input_tokens_seen": 165054160, + "step": 955 + }, + { + "epoch": 0.3820943245403677, + "grad_norm": 149.72575420742487, + "learning_rate": 5e-06, + "loss": 1.0601, + "num_input_tokens_seen": 165226944, + "step": 956 + }, + { + "epoch": 0.3820943245403677, + "loss": 1.1687389612197876, + "loss_ce": 0.00907099712640047, + "loss_xval": 1.15625, + "num_input_tokens_seen": 165226944, + "step": 956 + }, + { + "epoch": 0.3824940047961631, + "grad_norm": 44.450558669277505, + "learning_rate": 5e-06, + "loss": 0.4047, + "num_input_tokens_seen": 165399800, + "step": 957 + }, + { + "epoch": 0.3824940047961631, + "loss": 0.38355177640914917, + "loss_ce": 0.006476604379713535, + "loss_xval": 0.376953125, + "num_input_tokens_seen": 165399800, + "step": 957 + }, + { + "epoch": 0.3828936850519584, + 
"grad_norm": 106.25175160747101, + "learning_rate": 5e-06, + "loss": 0.4297, + "num_input_tokens_seen": 165572760, + "step": 958 + }, + { + "epoch": 0.3828936850519584, + "loss": 0.5556713938713074, + "loss_ce": 0.011237800121307373, + "loss_xval": 0.54296875, + "num_input_tokens_seen": 165572760, + "step": 958 + }, + { + "epoch": 0.3832933653077538, + "grad_norm": 49.87171905029353, + "learning_rate": 5e-06, + "loss": 0.6636, + "num_input_tokens_seen": 165745872, + "step": 959 + }, + { + "epoch": 0.3832933653077538, + "loss": 0.41619065403938293, + "loss_ce": 0.006583709269762039, + "loss_xval": 0.41015625, + "num_input_tokens_seen": 165745872, + "step": 959 + }, + { + "epoch": 0.38369304556354916, + "grad_norm": 132.66081692696326, + "learning_rate": 5e-06, + "loss": 0.9105, + "num_input_tokens_seen": 165918744, + "step": 960 + }, + { + "epoch": 0.38369304556354916, + "loss": 0.9205524325370789, + "loss_ce": 0.011311713606119156, + "loss_xval": 0.91015625, + "num_input_tokens_seen": 165918744, + "step": 960 + }, + { + "epoch": 0.3840927258193445, + "grad_norm": 73.04408709563512, + "learning_rate": 5e-06, + "loss": 0.5809, + "num_input_tokens_seen": 166087696, + "step": 961 + }, + { + "epoch": 0.3840927258193445, + "loss": 0.701846718788147, + "loss_ce": 0.005496594589203596, + "loss_xval": 0.6953125, + "num_input_tokens_seen": 166087696, + "step": 961 + }, + { + "epoch": 0.3844924060751399, + "grad_norm": 142.34342052216527, + "learning_rate": 5e-06, + "loss": 0.8367, + "num_input_tokens_seen": 166260736, + "step": 962 + }, + { + "epoch": 0.3844924060751399, + "loss": 1.0870518684387207, + "loss_ce": 0.007217873819172382, + "loss_xval": 1.078125, + "num_input_tokens_seen": 166260736, + "step": 962 + }, + { + "epoch": 0.38489208633093525, + "grad_norm": 91.56652258008455, + "learning_rate": 5e-06, + "loss": 0.6936, + "num_input_tokens_seen": 166433480, + "step": 963 + }, + { + "epoch": 0.38489208633093525, + "loss": 0.757168710231781, + "loss_ce": 0.005947999190539122, + "loss_xval": 0.75, + "num_input_tokens_seen": 166433480, + "step": 963 + }, + { + "epoch": 0.3852917665867306, + "grad_norm": 112.22507236008505, + "learning_rate": 5e-06, + "loss": 0.4131, + "num_input_tokens_seen": 166606920, + "step": 964 + }, + { + "epoch": 0.3852917665867306, + "loss": 0.2249833643436432, + "loss_ce": 0.007179419510066509, + "loss_xval": 0.2177734375, + "num_input_tokens_seen": 166606920, + "step": 964 + }, + { + "epoch": 0.385691446842526, + "grad_norm": 75.91188836670992, + "learning_rate": 5e-06, + "loss": 0.4977, + "num_input_tokens_seen": 166779976, + "step": 965 + }, + { + "epoch": 0.385691446842526, + "loss": 0.3778020143508911, + "loss_ce": 0.007257565855979919, + "loss_xval": 0.37109375, + "num_input_tokens_seen": 166779976, + "step": 965 + }, + { + "epoch": 0.38609112709832133, + "grad_norm": 72.22940290020605, + "learning_rate": 5e-06, + "loss": 0.583, + "num_input_tokens_seen": 166953200, + "step": 966 + }, + { + "epoch": 0.38609112709832133, + "loss": 0.6213172674179077, + "loss_ce": 0.011698130518198013, + "loss_xval": 0.609375, + "num_input_tokens_seen": 166953200, + "step": 966 + }, + { + "epoch": 0.3864908073541167, + "grad_norm": 40.4774269583089, + "learning_rate": 5e-06, + "loss": 0.8808, + "num_input_tokens_seen": 167126160, + "step": 967 + }, + { + "epoch": 0.3864908073541167, + "loss": 1.3061549663543701, + "loss_ce": 0.008913781493902206, + "loss_xval": 1.296875, + "num_input_tokens_seen": 167126160, + "step": 967 + }, + { + "epoch": 0.38689048760991207, + "grad_norm": 
77.76546008195128, + "learning_rate": 5e-06, + "loss": 0.6034, + "num_input_tokens_seen": 167299016, + "step": 968 + }, + { + "epoch": 0.38689048760991207, + "loss": 0.4178801476955414, + "loss_ce": 0.005404568277299404, + "loss_xval": 0.412109375, + "num_input_tokens_seen": 167299016, + "step": 968 + }, + { + "epoch": 0.3872901678657074, + "grad_norm": 50.08422450179112, + "learning_rate": 5e-06, + "loss": 0.8172, + "num_input_tokens_seen": 167472080, + "step": 969 + }, + { + "epoch": 0.3872901678657074, + "loss": 0.6857173442840576, + "loss_ce": 0.008837435394525528, + "loss_xval": 0.67578125, + "num_input_tokens_seen": 167472080, + "step": 969 + }, + { + "epoch": 0.3876898481215028, + "grad_norm": 115.02853547549552, + "learning_rate": 5e-06, + "loss": 1.2746, + "num_input_tokens_seen": 167645184, + "step": 970 + }, + { + "epoch": 0.3876898481215028, + "loss": 0.7067731618881226, + "loss_ce": 0.007554412819445133, + "loss_xval": 0.69921875, + "num_input_tokens_seen": 167645184, + "step": 970 + }, + { + "epoch": 0.38808952837729815, + "grad_norm": 114.25307598804022, + "learning_rate": 5e-06, + "loss": 0.6847, + "num_input_tokens_seen": 167817888, + "step": 971 + }, + { + "epoch": 0.38808952837729815, + "loss": 0.8140785694122314, + "loss_ce": 0.006705489940941334, + "loss_xval": 0.80859375, + "num_input_tokens_seen": 167817888, + "step": 971 + }, + { + "epoch": 0.38848920863309355, + "grad_norm": 58.200766823999736, + "learning_rate": 5e-06, + "loss": 0.8283, + "num_input_tokens_seen": 167990920, + "step": 972 + }, + { + "epoch": 0.38848920863309355, + "loss": 0.551996111869812, + "loss_ce": 0.007928753271698952, + "loss_xval": 0.54296875, + "num_input_tokens_seen": 167990920, + "step": 972 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 58.649617728544364, + "learning_rate": 5e-06, + "loss": 0.4376, + "num_input_tokens_seen": 168163488, + "step": 973 + }, + { + "epoch": 0.3888888888888889, + "loss": 0.2681065797805786, + "loss_ce": 0.0070592425763607025, + "loss_xval": 0.26171875, + "num_input_tokens_seen": 168163488, + "step": 973 + }, + { + "epoch": 0.38928856914468424, + "grad_norm": 80.19293169274987, + "learning_rate": 5e-06, + "loss": 0.6363, + "num_input_tokens_seen": 168336336, + "step": 974 + }, + { + "epoch": 0.38928856914468424, + "loss": 0.7013700008392334, + "loss_ce": 0.006606808863580227, + "loss_xval": 0.6953125, + "num_input_tokens_seen": 168336336, + "step": 974 + }, + { + "epoch": 0.38968824940047964, + "grad_norm": 89.07859540994681, + "learning_rate": 5e-06, + "loss": 0.4365, + "num_input_tokens_seen": 168509680, + "step": 975 + }, + { + "epoch": 0.38968824940047964, + "loss": 0.40148645639419556, + "loss_ce": 0.006741571240127087, + "loss_xval": 0.39453125, + "num_input_tokens_seen": 168509680, + "step": 975 + }, + { + "epoch": 0.390087929656275, + "grad_norm": 63.38435699178039, + "learning_rate": 5e-06, + "loss": 0.6486, + "num_input_tokens_seen": 168682024, + "step": 976 + }, + { + "epoch": 0.390087929656275, + "loss": 0.48744258284568787, + "loss_ce": 0.00575310830026865, + "loss_xval": 0.482421875, + "num_input_tokens_seen": 168682024, + "step": 976 + }, + { + "epoch": 0.3904876099120703, + "grad_norm": 37.56895127097929, + "learning_rate": 5e-06, + "loss": 0.8508, + "num_input_tokens_seen": 168855064, + "step": 977 + }, + { + "epoch": 0.3904876099120703, + "loss": 0.7805444002151489, + "loss_ce": 0.010647003538906574, + "loss_xval": 0.76953125, + "num_input_tokens_seen": 168855064, + "step": 977 + }, + { + "epoch": 0.3908872901678657, + "grad_norm": 
38.66001191702328, + "learning_rate": 5e-06, + "loss": 0.6123, + "num_input_tokens_seen": 169028096, + "step": 978 + }, + { + "epoch": 0.3908872901678657, + "loss": 0.5990478992462158, + "loss_ce": 0.009570390917360783, + "loss_xval": 0.58984375, + "num_input_tokens_seen": 169028096, + "step": 978 + }, + { + "epoch": 0.39128697042366106, + "grad_norm": 43.557533168818516, + "learning_rate": 5e-06, + "loss": 0.6573, + "num_input_tokens_seen": 169201024, + "step": 979 + }, + { + "epoch": 0.39128697042366106, + "loss": 0.6643787026405334, + "loss_ce": 0.005992514081299305, + "loss_xval": 0.66015625, + "num_input_tokens_seen": 169201024, + "step": 979 + }, + { + "epoch": 0.39168665067945646, + "grad_norm": 57.037574608846434, + "learning_rate": 5e-06, + "loss": 0.553, + "num_input_tokens_seen": 169372448, + "step": 980 + }, + { + "epoch": 0.39168665067945646, + "loss": 0.7914978265762329, + "loss_ce": 0.015252649784088135, + "loss_xval": 0.77734375, + "num_input_tokens_seen": 169372448, + "step": 980 + }, + { + "epoch": 0.3920863309352518, + "grad_norm": 26.713226016677673, + "learning_rate": 5e-06, + "loss": 0.4622, + "num_input_tokens_seen": 169545336, + "step": 981 + }, + { + "epoch": 0.3920863309352518, + "loss": 0.5592265725135803, + "loss_ce": 0.005027370527386665, + "loss_xval": 0.5546875, + "num_input_tokens_seen": 169545336, + "step": 981 + }, + { + "epoch": 0.39248601119104715, + "grad_norm": 30.376410347757318, + "learning_rate": 5e-06, + "loss": 0.4451, + "num_input_tokens_seen": 169718480, + "step": 982 + }, + { + "epoch": 0.39248601119104715, + "loss": 0.326797217130661, + "loss_ce": 0.00709507055580616, + "loss_xval": 0.3203125, + "num_input_tokens_seen": 169718480, + "step": 982 + }, + { + "epoch": 0.39288569144684254, + "grad_norm": 42.70059848914429, + "learning_rate": 5e-06, + "loss": 0.65, + "num_input_tokens_seen": 169891376, + "step": 983 + }, + { + "epoch": 0.39288569144684254, + "loss": 0.6005296111106873, + "loss_ce": 0.004796041641384363, + "loss_xval": 0.59765625, + "num_input_tokens_seen": 169891376, + "step": 983 + }, + { + "epoch": 0.3932853717026379, + "grad_norm": 61.85368887460458, + "learning_rate": 5e-06, + "loss": 0.7171, + "num_input_tokens_seen": 170064280, + "step": 984 + }, + { + "epoch": 0.3932853717026379, + "loss": 0.3366258144378662, + "loss_ce": 0.0034348834306001663, + "loss_xval": 0.333984375, + "num_input_tokens_seen": 170064280, + "step": 984 + }, + { + "epoch": 0.39368505195843323, + "grad_norm": 27.359901358980956, + "learning_rate": 5e-06, + "loss": 0.5081, + "num_input_tokens_seen": 170237664, + "step": 985 + }, + { + "epoch": 0.39368505195843323, + "loss": 0.4442784786224365, + "loss_ce": 0.0073583247140049934, + "loss_xval": 0.4375, + "num_input_tokens_seen": 170237664, + "step": 985 + }, + { + "epoch": 0.39408473221422863, + "grad_norm": 161.64024654863002, + "learning_rate": 5e-06, + "loss": 0.8349, + "num_input_tokens_seen": 170411016, + "step": 986 + }, + { + "epoch": 0.39408473221422863, + "loss": 0.765992283821106, + "loss_ce": 0.005799395032227039, + "loss_xval": 0.76171875, + "num_input_tokens_seen": 170411016, + "step": 986 + }, + { + "epoch": 0.39448441247002397, + "grad_norm": 77.11201946826561, + "learning_rate": 5e-06, + "loss": 1.1603, + "num_input_tokens_seen": 170583888, + "step": 987 + }, + { + "epoch": 0.39448441247002397, + "loss": 1.365664005279541, + "loss_ce": 0.007936842739582062, + "loss_xval": 1.359375, + "num_input_tokens_seen": 170583888, + "step": 987 + }, + { + "epoch": 0.39488409272581937, + "grad_norm": 
126.12675711597637, + "learning_rate": 5e-06, + "loss": 0.9069, + "num_input_tokens_seen": 170756584, + "step": 988 + }, + { + "epoch": 0.39488409272581937, + "loss": 0.7058815956115723, + "loss_ce": 0.008860129863023758, + "loss_xval": 0.6953125, + "num_input_tokens_seen": 170756584, + "step": 988 + }, + { + "epoch": 0.3952837729816147, + "grad_norm": 176.69341122468677, + "learning_rate": 5e-06, + "loss": 0.8091, + "num_input_tokens_seen": 170929272, + "step": 989 + }, + { + "epoch": 0.3952837729816147, + "loss": 0.7472469806671143, + "loss_ce": 0.0038387635722756386, + "loss_xval": 0.7421875, + "num_input_tokens_seen": 170929272, + "step": 989 + }, + { + "epoch": 0.39568345323741005, + "grad_norm": 100.80639940703247, + "learning_rate": 5e-06, + "loss": 0.4486, + "num_input_tokens_seen": 171102064, + "step": 990 + }, + { + "epoch": 0.39568345323741005, + "loss": 0.34538111090660095, + "loss_ce": 0.00645289896056056, + "loss_xval": 0.33984375, + "num_input_tokens_seen": 171102064, + "step": 990 + }, + { + "epoch": 0.39608313349320545, + "grad_norm": 127.70680274006587, + "learning_rate": 5e-06, + "loss": 0.4639, + "num_input_tokens_seen": 171275256, + "step": 991 + }, + { + "epoch": 0.39608313349320545, + "loss": 0.6334390044212341, + "loss_ce": 0.006730004213750362, + "loss_xval": 0.625, + "num_input_tokens_seen": 171275256, + "step": 991 + }, + { + "epoch": 0.3964828137490008, + "grad_norm": 98.65398770343047, + "learning_rate": 5e-06, + "loss": 0.7308, + "num_input_tokens_seen": 171448104, + "step": 992 + }, + { + "epoch": 0.3964828137490008, + "loss": 0.7053290009498596, + "loss_ce": 0.009894400835037231, + "loss_xval": 0.6953125, + "num_input_tokens_seen": 171448104, + "step": 992 + }, + { + "epoch": 0.39688249400479614, + "grad_norm": 104.68550005705072, + "learning_rate": 5e-06, + "loss": 0.2795, + "num_input_tokens_seen": 171621304, + "step": 993 + }, + { + "epoch": 0.39688249400479614, + "loss": 0.2923963665962219, + "loss_ce": 0.005653205327689648, + "loss_xval": 0.287109375, + "num_input_tokens_seen": 171621304, + "step": 993 + }, + { + "epoch": 0.39728217426059154, + "grad_norm": 151.12207326320515, + "learning_rate": 5e-06, + "loss": 0.8025, + "num_input_tokens_seen": 171794016, + "step": 994 + }, + { + "epoch": 0.39728217426059154, + "loss": 0.7504175901412964, + "loss_ce": 0.005300438497215509, + "loss_xval": 0.74609375, + "num_input_tokens_seen": 171794016, + "step": 994 + }, + { + "epoch": 0.3976818545163869, + "grad_norm": 87.55290667068692, + "learning_rate": 5e-06, + "loss": 0.5974, + "num_input_tokens_seen": 171966768, + "step": 995 + }, + { + "epoch": 0.3976818545163869, + "loss": 0.3750014305114746, + "loss_ce": 0.005433551035821438, + "loss_xval": 0.369140625, + "num_input_tokens_seen": 171966768, + "step": 995 + }, + { + "epoch": 0.3980815347721823, + "grad_norm": 52.76431280420055, + "learning_rate": 5e-06, + "loss": 0.5791, + "num_input_tokens_seen": 172139856, + "step": 996 + }, + { + "epoch": 0.3980815347721823, + "loss": 0.5788711309432983, + "loss_ce": 0.009596217423677444, + "loss_xval": 0.5703125, + "num_input_tokens_seen": 172139856, + "step": 996 + }, + { + "epoch": 0.3984812150279776, + "grad_norm": 61.69947757143887, + "learning_rate": 5e-06, + "loss": 0.5662, + "num_input_tokens_seen": 172312544, + "step": 997 + }, + { + "epoch": 0.3984812150279776, + "loss": 0.50725257396698, + "loss_ce": 0.010182302445173264, + "loss_xval": 0.49609375, + "num_input_tokens_seen": 172312544, + "step": 997 + }, + { + "epoch": 0.39888089528377296, + "grad_norm": 
34.32354136252659, + "learning_rate": 5e-06, + "loss": 0.8179, + "num_input_tokens_seen": 172485560, + "step": 998 + }, + { + "epoch": 0.39888089528377296, + "loss": 0.9845725893974304, + "loss_ce": 0.023391013965010643, + "loss_xval": 0.9609375, + "num_input_tokens_seen": 172485560, + "step": 998 + }, + { + "epoch": 0.39928057553956836, + "grad_norm": 84.83106537615475, + "learning_rate": 5e-06, + "loss": 0.5806, + "num_input_tokens_seen": 172658848, + "step": 999 + }, + { + "epoch": 0.39928057553956836, + "loss": 0.8320725560188293, + "loss_ce": 0.005900641903281212, + "loss_xval": 0.828125, + "num_input_tokens_seen": 172658848, + "step": 999 + }, + { + "epoch": 0.3996802557953637, + "grad_norm": 80.35046570322501, + "learning_rate": 5e-06, + "loss": 0.603, + "num_input_tokens_seen": 172831616, + "step": 1000 + }, + { + "epoch": 0.3996802557953637, + "eval_websight_new_IoU": 0.3092806488275528, + "eval_websight_new_MAE_all": 0.024964885786175728, + "eval_websight_new_MAE_h": 0.009109157603234053, + "eval_websight_new_MAE_w": 0.04664035141468048, + "eval_websight_new_MAE_x": 0.025324680842459202, + "eval_websight_new_MAE_y": 0.018785354681313038, + "eval_websight_new_NUM_probability": 0.9444170296192169, + "eval_websight_new_inside_bbox": 0.6996527910232544, + "eval_websight_new_loss": 0.33494770526885986, + "eval_websight_new_loss_ce": 0.006523952353745699, + "eval_websight_new_loss_xval": 0.26861572265625, + "eval_websight_new_runtime": 56.6826, + "eval_websight_new_samples_per_second": 0.882, + "eval_websight_new_steps_per_second": 0.035, + "num_input_tokens_seen": 172831616, + "step": 1000 + }, + { + "epoch": 0.3996802557953637, + "eval_seeclick_IoU": 0.23224642127752304, + "eval_seeclick_MAE_all": 0.07489410787820816, + "eval_seeclick_MAE_h": 0.02226562239229679, + "eval_seeclick_MAE_w": 0.11477012187242508, + "eval_seeclick_MAE_x": 0.0983852706849575, + "eval_seeclick_MAE_y": 0.06415541097521782, + "eval_seeclick_NUM_probability": 0.9417648315429688, + "eval_seeclick_inside_bbox": 0.4444444477558136, + "eval_seeclick_loss": 1.5326517820358276, + "eval_seeclick_loss_ce": 0.020226879976689816, + "eval_seeclick_loss_xval": 1.391357421875, + "eval_seeclick_runtime": 84.8257, + "eval_seeclick_samples_per_second": 0.589, + "eval_seeclick_steps_per_second": 0.024, + "num_input_tokens_seen": 172831616, + "step": 1000 + }, + { + "epoch": 0.3996802557953637, + "eval_icons_IoU": 0.061911119148135185, + "eval_icons_MAE_all": 0.028313827700912952, + "eval_icons_MAE_h": 0.006960721453651786, + "eval_icons_MAE_w": 0.008420140482485294, + "eval_icons_MAE_x": 0.05678635463118553, + "eval_icons_MAE_y": 0.04108810052275658, + "eval_icons_NUM_probability": 0.9464539885520935, + "eval_icons_inside_bbox": 0.09027777798473835, + "eval_icons_loss": 0.38697123527526855, + "eval_icons_loss_ce": 0.006282810820266604, + "eval_icons_loss_xval": 0.310516357421875, + "eval_icons_runtime": 83.1499, + "eval_icons_samples_per_second": 0.601, + "eval_icons_steps_per_second": 0.024, + "num_input_tokens_seen": 172831616, + "step": 1000 + }, + { + "epoch": 0.3996802557953637, + "loss": 0.23844069242477417, + "loss_ce": 0.00641553895547986, + "loss_xval": 0.232421875, + "num_input_tokens_seen": 172831616, + "step": 1000 + }, + { + "epoch": 0.40007993605115905, + "grad_norm": 26.22909617587164, + "learning_rate": 5e-06, + "loss": 0.5293, + "num_input_tokens_seen": 173004832, + "step": 1001 + }, + { + "epoch": 0.40007993605115905, + "loss": 0.5112817287445068, + "loss_ce": 0.008443554863333702, + "loss_xval": 0.50390625, + 
"num_input_tokens_seen": 173004832, + "step": 1001 + }, + { + "epoch": 0.40047961630695444, + "grad_norm": 37.57565610535053, + "learning_rate": 5e-06, + "loss": 0.6173, + "num_input_tokens_seen": 173178376, + "step": 1002 + }, + { + "epoch": 0.40047961630695444, + "loss": 0.6873658299446106, + "loss_ce": 0.0071595776826143265, + "loss_xval": 0.6796875, + "num_input_tokens_seen": 173178376, + "step": 1002 + }, + { + "epoch": 0.4008792965627498, + "grad_norm": 52.448242126332076, + "learning_rate": 5e-06, + "loss": 0.5896, + "num_input_tokens_seen": 173351128, + "step": 1003 + }, + { + "epoch": 0.4008792965627498, + "loss": 0.6299257278442383, + "loss_ce": 0.007367200218141079, + "loss_xval": 0.62109375, + "num_input_tokens_seen": 173351128, + "step": 1003 + }, + { + "epoch": 0.4012789768185452, + "grad_norm": 59.20638229901705, + "learning_rate": 5e-06, + "loss": 0.5644, + "num_input_tokens_seen": 173524176, + "step": 1004 + }, + { + "epoch": 0.4012789768185452, + "loss": 0.7194583415985107, + "loss_ce": 0.006445643957704306, + "loss_xval": 0.71484375, + "num_input_tokens_seen": 173524176, + "step": 1004 + }, + { + "epoch": 0.40167865707434053, + "grad_norm": 36.58616370768613, + "learning_rate": 5e-06, + "loss": 0.9425, + "num_input_tokens_seen": 173696832, + "step": 1005 + }, + { + "epoch": 0.40167865707434053, + "loss": 1.347395896911621, + "loss_ce": 0.0066977087408304214, + "loss_xval": 1.34375, + "num_input_tokens_seen": 173696832, + "step": 1005 + }, + { + "epoch": 0.40207833733013587, + "grad_norm": 108.83892770229245, + "learning_rate": 5e-06, + "loss": 0.9874, + "num_input_tokens_seen": 173869728, + "step": 1006 + }, + { + "epoch": 0.40207833733013587, + "loss": 0.9585855007171631, + "loss_ce": 0.004667055793106556, + "loss_xval": 0.953125, + "num_input_tokens_seen": 173869728, + "step": 1006 + }, + { + "epoch": 0.40247801758593127, + "grad_norm": 29.57215095837664, + "learning_rate": 5e-06, + "loss": 0.6105, + "num_input_tokens_seen": 174042616, + "step": 1007 + }, + { + "epoch": 0.40247801758593127, + "loss": 0.6520742177963257, + "loss_ce": 0.006871582940220833, + "loss_xval": 0.64453125, + "num_input_tokens_seen": 174042616, + "step": 1007 + }, + { + "epoch": 0.4028776978417266, + "grad_norm": 153.84706487430793, + "learning_rate": 5e-06, + "loss": 0.657, + "num_input_tokens_seen": 174215280, + "step": 1008 + }, + { + "epoch": 0.4028776978417266, + "loss": 0.7333929538726807, + "loss_ce": 0.012934006750583649, + "loss_xval": 0.71875, + "num_input_tokens_seen": 174215280, + "step": 1008 + }, + { + "epoch": 0.403277378097522, + "grad_norm": 114.61737036027186, + "learning_rate": 5e-06, + "loss": 0.7579, + "num_input_tokens_seen": 174388264, + "step": 1009 + }, + { + "epoch": 0.403277378097522, + "loss": 0.525598406791687, + "loss_ce": 0.006250268779695034, + "loss_xval": 0.51953125, + "num_input_tokens_seen": 174388264, + "step": 1009 + }, + { + "epoch": 0.40367705835331735, + "grad_norm": 124.58554146445353, + "learning_rate": 5e-06, + "loss": 0.5623, + "num_input_tokens_seen": 174561424, + "step": 1010 + }, + { + "epoch": 0.40367705835331735, + "loss": 0.7112394571304321, + "loss_ce": 0.012814194895327091, + "loss_xval": 0.69921875, + "num_input_tokens_seen": 174561424, + "step": 1010 + }, + { + "epoch": 0.4040767386091127, + "grad_norm": 97.38895828742109, + "learning_rate": 5e-06, + "loss": 0.3995, + "num_input_tokens_seen": 174734472, + "step": 1011 + }, + { + "epoch": 0.4040767386091127, + "loss": 0.4527726471424103, + "loss_ce": 0.0054460205137729645, + "loss_xval": 
0.447265625, + "num_input_tokens_seen": 174734472, + "step": 1011 + }, + { + "epoch": 0.4044764188649081, + "grad_norm": 39.357984950176494, + "learning_rate": 5e-06, + "loss": 0.6717, + "num_input_tokens_seen": 174907320, + "step": 1012 + }, + { + "epoch": 0.4044764188649081, + "loss": 0.7403974533081055, + "loss_ce": 0.009654035791754723, + "loss_xval": 0.73046875, + "num_input_tokens_seen": 174907320, + "step": 1012 + }, + { + "epoch": 0.40487609912070344, + "grad_norm": 43.49113942292695, + "learning_rate": 5e-06, + "loss": 0.648, + "num_input_tokens_seen": 175080416, + "step": 1013 + }, + { + "epoch": 0.40487609912070344, + "loss": 0.5626762509346008, + "loss_ce": 0.009941885247826576, + "loss_xval": 0.5546875, + "num_input_tokens_seen": 175080416, + "step": 1013 + }, + { + "epoch": 0.4052757793764988, + "grad_norm": 134.41500315087782, + "learning_rate": 5e-06, + "loss": 0.9411, + "num_input_tokens_seen": 175252992, + "step": 1014 + }, + { + "epoch": 0.4052757793764988, + "loss": 0.6051626801490784, + "loss_ce": 0.004698799457401037, + "loss_xval": 0.6015625, + "num_input_tokens_seen": 175252992, + "step": 1014 + }, + { + "epoch": 0.4056754596322942, + "grad_norm": 91.34613154118188, + "learning_rate": 5e-06, + "loss": 0.4441, + "num_input_tokens_seen": 175425760, + "step": 1015 + }, + { + "epoch": 0.4056754596322942, + "loss": 0.5284594297409058, + "loss_ce": 0.006120562553405762, + "loss_xval": 0.5234375, + "num_input_tokens_seen": 175425760, + "step": 1015 + }, + { + "epoch": 0.4060751398880895, + "grad_norm": 62.797149514317915, + "learning_rate": 5e-06, + "loss": 0.9974, + "num_input_tokens_seen": 175598784, + "step": 1016 + }, + { + "epoch": 0.4060751398880895, + "loss": 0.6914917826652527, + "loss_ce": 0.0069824811071157455, + "loss_xval": 0.68359375, + "num_input_tokens_seen": 175598784, + "step": 1016 + }, + { + "epoch": 0.4064748201438849, + "grad_norm": 68.01439644089561, + "learning_rate": 5e-06, + "loss": 0.6034, + "num_input_tokens_seen": 175771752, + "step": 1017 + }, + { + "epoch": 0.4064748201438849, + "loss": 0.7401469945907593, + "loss_ce": 0.005466855131089687, + "loss_xval": 0.734375, + "num_input_tokens_seen": 175771752, + "step": 1017 + }, + { + "epoch": 0.40687450039968026, + "grad_norm": 33.194237726138894, + "learning_rate": 5e-06, + "loss": 0.4104, + "num_input_tokens_seen": 175944448, + "step": 1018 + }, + { + "epoch": 0.40687450039968026, + "loss": 0.4032331705093384, + "loss_ce": 0.004368394613265991, + "loss_xval": 0.3984375, + "num_input_tokens_seen": 175944448, + "step": 1018 + }, + { + "epoch": 0.4072741806554756, + "grad_norm": 32.31623305090731, + "learning_rate": 5e-06, + "loss": 0.5535, + "num_input_tokens_seen": 176117224, + "step": 1019 + }, + { + "epoch": 0.4072741806554756, + "loss": 0.6844021081924438, + "loss_ce": 0.004104219377040863, + "loss_xval": 0.6796875, + "num_input_tokens_seen": 176117224, + "step": 1019 + }, + { + "epoch": 0.407673860911271, + "grad_norm": 69.88987283607364, + "learning_rate": 5e-06, + "loss": 0.5382, + "num_input_tokens_seen": 176290288, + "step": 1020 + }, + { + "epoch": 0.407673860911271, + "loss": 0.6937678456306458, + "loss_ce": 0.006511982996016741, + "loss_xval": 0.6875, + "num_input_tokens_seen": 176290288, + "step": 1020 + }, + { + "epoch": 0.40807354116706634, + "grad_norm": 49.12054277220987, + "learning_rate": 5e-06, + "loss": 0.7288, + "num_input_tokens_seen": 176463360, + "step": 1021 + }, + { + "epoch": 0.40807354116706634, + "loss": 0.8968067169189453, + "loss_ce": 0.005205155350267887, + 
"loss_xval": 0.890625, + "num_input_tokens_seen": 176463360, + "step": 1021 + }, + { + "epoch": 0.4084732214228617, + "grad_norm": 96.62921131568778, + "learning_rate": 5e-06, + "loss": 0.8582, + "num_input_tokens_seen": 176636272, + "step": 1022 + }, + { + "epoch": 0.4084732214228617, + "loss": 0.6188912987709045, + "loss_ce": 0.004145242273807526, + "loss_xval": 0.61328125, + "num_input_tokens_seen": 176636272, + "step": 1022 + }, + { + "epoch": 0.4088729016786571, + "grad_norm": 104.19761497471133, + "learning_rate": 5e-06, + "loss": 0.5969, + "num_input_tokens_seen": 176809416, + "step": 1023 + }, + { + "epoch": 0.4088729016786571, + "loss": 0.7266084551811218, + "loss_ce": 0.004013280384242535, + "loss_xval": 0.72265625, + "num_input_tokens_seen": 176809416, + "step": 1023 + }, + { + "epoch": 0.40927258193445243, + "grad_norm": 25.052045025077565, + "learning_rate": 5e-06, + "loss": 0.652, + "num_input_tokens_seen": 176982512, + "step": 1024 + }, + { + "epoch": 0.40927258193445243, + "loss": 0.4747720956802368, + "loss_ce": 0.0037638223730027676, + "loss_xval": 0.470703125, + "num_input_tokens_seen": 176982512, + "step": 1024 + }, + { + "epoch": 0.4096722621902478, + "grad_norm": 55.184447426888354, + "learning_rate": 5e-06, + "loss": 0.909, + "num_input_tokens_seen": 177155544, + "step": 1025 + }, + { + "epoch": 0.4096722621902478, + "loss": 1.211665153503418, + "loss_ce": 0.008418156765401363, + "loss_xval": 1.203125, + "num_input_tokens_seen": 177155544, + "step": 1025 + }, + { + "epoch": 0.41007194244604317, + "grad_norm": 51.43456903101215, + "learning_rate": 5e-06, + "loss": 0.4022, + "num_input_tokens_seen": 177328144, + "step": 1026 + }, + { + "epoch": 0.41007194244604317, + "loss": 0.3644227683544159, + "loss_ce": 0.003643968142569065, + "loss_xval": 0.361328125, + "num_input_tokens_seen": 177328144, + "step": 1026 + }, + { + "epoch": 0.4104716227018385, + "grad_norm": 38.349924787831824, + "learning_rate": 5e-06, + "loss": 0.6099, + "num_input_tokens_seen": 177500576, + "step": 1027 + }, + { + "epoch": 0.4104716227018385, + "loss": 0.46429070830345154, + "loss_ce": 0.005428393371403217, + "loss_xval": 0.458984375, + "num_input_tokens_seen": 177500576, + "step": 1027 + }, + { + "epoch": 0.4108713029576339, + "grad_norm": 81.99938062743149, + "learning_rate": 5e-06, + "loss": 0.4963, + "num_input_tokens_seen": 177673800, + "step": 1028 + }, + { + "epoch": 0.4108713029576339, + "loss": 0.27311575412750244, + "loss_ce": 0.0070024700835347176, + "loss_xval": 0.265625, + "num_input_tokens_seen": 177673800, + "step": 1028 + }, + { + "epoch": 0.41127098321342925, + "grad_norm": 60.9113505265921, + "learning_rate": 5e-06, + "loss": 0.6987, + "num_input_tokens_seen": 177846720, + "step": 1029 + }, + { + "epoch": 0.41127098321342925, + "loss": 0.8026575446128845, + "loss_ce": 0.0034326824825257063, + "loss_xval": 0.80078125, + "num_input_tokens_seen": 177846720, + "step": 1029 + }, + { + "epoch": 0.4116706634692246, + "grad_norm": 134.37290555496102, + "learning_rate": 5e-06, + "loss": 0.7379, + "num_input_tokens_seen": 178019584, + "step": 1030 + }, + { + "epoch": 0.4116706634692246, + "loss": 0.6104703545570374, + "loss_ce": 0.006893688812851906, + "loss_xval": 0.60546875, + "num_input_tokens_seen": 178019584, + "step": 1030 + }, + { + "epoch": 0.41207034372502, + "grad_norm": 116.92587445948755, + "learning_rate": 5e-06, + "loss": 0.5363, + "num_input_tokens_seen": 178192928, + "step": 1031 + }, + { + "epoch": 0.41207034372502, + "loss": 0.3673360347747803, + "loss_ce": 
0.002956158248707652, + "loss_xval": 0.365234375, + "num_input_tokens_seen": 178192928, + "step": 1031 + }, + { + "epoch": 0.41247002398081534, + "grad_norm": 75.81443523599133, + "learning_rate": 5e-06, + "loss": 0.5552, + "num_input_tokens_seen": 178365992, + "step": 1032 + }, + { + "epoch": 0.41247002398081534, + "loss": 0.5825966596603394, + "loss_ce": 0.00911034271121025, + "loss_xval": 0.57421875, + "num_input_tokens_seen": 178365992, + "step": 1032 + }, + { + "epoch": 0.41286970423661074, + "grad_norm": 153.80297019873962, + "learning_rate": 5e-06, + "loss": 0.6214, + "num_input_tokens_seen": 178539008, + "step": 1033 + }, + { + "epoch": 0.41286970423661074, + "loss": 0.5666282773017883, + "loss_ce": 0.007546260487288237, + "loss_xval": 0.55859375, + "num_input_tokens_seen": 178539008, + "step": 1033 + }, + { + "epoch": 0.4132693844924061, + "grad_norm": 49.204697005620176, + "learning_rate": 5e-06, + "loss": 0.9595, + "num_input_tokens_seen": 178711328, + "step": 1034 + }, + { + "epoch": 0.4132693844924061, + "loss": 0.6298288106918335, + "loss_ce": 0.01019988302141428, + "loss_xval": 0.62109375, + "num_input_tokens_seen": 178711328, + "step": 1034 + }, + { + "epoch": 0.4136690647482014, + "grad_norm": 76.08084598570717, + "learning_rate": 5e-06, + "loss": 0.4532, + "num_input_tokens_seen": 178884104, + "step": 1035 + }, + { + "epoch": 0.4136690647482014, + "loss": 0.4612717032432556, + "loss_ce": 0.012480195611715317, + "loss_xval": 0.44921875, + "num_input_tokens_seen": 178884104, + "step": 1035 + }, + { + "epoch": 0.4140687450039968, + "grad_norm": 34.52904789226501, + "learning_rate": 5e-06, + "loss": 0.4644, + "num_input_tokens_seen": 179057048, + "step": 1036 + }, + { + "epoch": 0.4140687450039968, + "loss": 0.6084589958190918, + "loss_ce": 0.006713386625051498, + "loss_xval": 0.6015625, + "num_input_tokens_seen": 179057048, + "step": 1036 + }, + { + "epoch": 0.41446842525979216, + "grad_norm": 107.83231397269536, + "learning_rate": 5e-06, + "loss": 0.4973, + "num_input_tokens_seen": 179230376, + "step": 1037 + }, + { + "epoch": 0.41446842525979216, + "loss": 0.3456187844276428, + "loss_ce": 0.005652973428368568, + "loss_xval": 0.33984375, + "num_input_tokens_seen": 179230376, + "step": 1037 + }, + { + "epoch": 0.4148681055155875, + "grad_norm": 68.86144066104148, + "learning_rate": 5e-06, + "loss": 0.6048, + "num_input_tokens_seen": 179403320, + "step": 1038 + }, + { + "epoch": 0.4148681055155875, + "loss": 0.9780701398849487, + "loss_ce": 0.040020860731601715, + "loss_xval": 0.9375, + "num_input_tokens_seen": 179403320, + "step": 1038 + }, + { + "epoch": 0.4152677857713829, + "grad_norm": 29.2133368946288, + "learning_rate": 5e-06, + "loss": 0.64, + "num_input_tokens_seen": 179576152, + "step": 1039 + }, + { + "epoch": 0.4152677857713829, + "loss": 0.7152938842773438, + "loss_ce": 0.005577098578214645, + "loss_xval": 0.7109375, + "num_input_tokens_seen": 179576152, + "step": 1039 + }, + { + "epoch": 0.41566746602717825, + "grad_norm": 65.40652651781713, + "learning_rate": 5e-06, + "loss": 0.6278, + "num_input_tokens_seen": 179748936, + "step": 1040 + }, + { + "epoch": 0.41566746602717825, + "loss": 0.7276248931884766, + "loss_ce": 0.004358314909040928, + "loss_xval": 0.72265625, + "num_input_tokens_seen": 179748936, + "step": 1040 + }, + { + "epoch": 0.41606714628297364, + "grad_norm": 118.90984830416035, + "learning_rate": 5e-06, + "loss": 0.4819, + "num_input_tokens_seen": 179922080, + "step": 1041 + }, + { + "epoch": 0.41606714628297364, + "loss": 0.38193365931510925, + 
"loss_ce": 0.013891654089093208, + "loss_xval": 0.3671875, + "num_input_tokens_seen": 179922080, + "step": 1041 + }, + { + "epoch": 0.416466826538769, + "grad_norm": 56.73656107069961, + "learning_rate": 5e-06, + "loss": 0.5628, + "num_input_tokens_seen": 180094792, + "step": 1042 + }, + { + "epoch": 0.416466826538769, + "loss": 0.624241292476654, + "loss_ce": 0.005711013451218605, + "loss_xval": 0.6171875, + "num_input_tokens_seen": 180094792, + "step": 1042 + }, + { + "epoch": 0.41686650679456433, + "grad_norm": 161.1185485615042, + "learning_rate": 5e-06, + "loss": 0.5426, + "num_input_tokens_seen": 180267832, + "step": 1043 + }, + { + "epoch": 0.41686650679456433, + "loss": 0.41229158639907837, + "loss_ce": 0.003966381307691336, + "loss_xval": 0.408203125, + "num_input_tokens_seen": 180267832, + "step": 1043 + }, + { + "epoch": 0.4172661870503597, + "grad_norm": 78.0546420015716, + "learning_rate": 5e-06, + "loss": 0.4282, + "num_input_tokens_seen": 180440816, + "step": 1044 + }, + { + "epoch": 0.4172661870503597, + "loss": 0.4995594322681427, + "loss_ce": 0.020921722054481506, + "loss_xval": 0.478515625, + "num_input_tokens_seen": 180440816, + "step": 1044 + }, + { + "epoch": 0.41766586730615507, + "grad_norm": 115.85859991185649, + "learning_rate": 5e-06, + "loss": 0.7434, + "num_input_tokens_seen": 180613760, + "step": 1045 + }, + { + "epoch": 0.41766586730615507, + "loss": 0.4128772020339966, + "loss_ce": 0.005528563167899847, + "loss_xval": 0.408203125, + "num_input_tokens_seen": 180613760, + "step": 1045 + }, + { + "epoch": 0.4180655475619504, + "grad_norm": 122.32619412285479, + "learning_rate": 5e-06, + "loss": 0.7231, + "num_input_tokens_seen": 180786392, + "step": 1046 + }, + { + "epoch": 0.4180655475619504, + "loss": 0.42909038066864014, + "loss_ce": 0.00416360329836607, + "loss_xval": 0.42578125, + "num_input_tokens_seen": 180786392, + "step": 1046 + }, + { + "epoch": 0.4184652278177458, + "grad_norm": 74.33985091881948, + "learning_rate": 5e-06, + "loss": 0.6912, + "num_input_tokens_seen": 180959520, + "step": 1047 + }, + { + "epoch": 0.4184652278177458, + "loss": 0.4434017837047577, + "loss_ce": 0.006145905703306198, + "loss_xval": 0.4375, + "num_input_tokens_seen": 180959520, + "step": 1047 + }, + { + "epoch": 0.41886490807354115, + "grad_norm": 52.273984052094164, + "learning_rate": 5e-06, + "loss": 0.7557, + "num_input_tokens_seen": 181132584, + "step": 1048 + }, + { + "epoch": 0.41886490807354115, + "loss": 0.7552452087402344, + "loss_ce": 0.006557449232786894, + "loss_xval": 0.75, + "num_input_tokens_seen": 181132584, + "step": 1048 + }, + { + "epoch": 0.41926458832933655, + "grad_norm": 115.38168323853732, + "learning_rate": 5e-06, + "loss": 0.5297, + "num_input_tokens_seen": 181305200, + "step": 1049 + }, + { + "epoch": 0.41926458832933655, + "loss": 0.35102906823158264, + "loss_ce": 0.04145876318216324, + "loss_xval": 0.30859375, + "num_input_tokens_seen": 181305200, + "step": 1049 + }, + { + "epoch": 0.4196642685851319, + "grad_norm": 129.43468680045206, + "learning_rate": 5e-06, + "loss": 0.7044, + "num_input_tokens_seen": 181477832, + "step": 1050 + }, + { + "epoch": 0.4196642685851319, + "loss": 0.7682023048400879, + "loss_ce": 0.005873220041394234, + "loss_xval": 0.76171875, + "num_input_tokens_seen": 181477832, + "step": 1050 + }, + { + "epoch": 0.42006394884092724, + "grad_norm": 84.3901351931311, + "learning_rate": 5e-06, + "loss": 0.5263, + "num_input_tokens_seen": 181650856, + "step": 1051 + }, + { + "epoch": 0.42006394884092724, + "loss": 
0.20175260305404663, + "loss_ce": 0.005524573847651482, + "loss_xval": 0.1962890625, + "num_input_tokens_seen": 181650856, + "step": 1051 + }, + { + "epoch": 0.42046362909672264, + "grad_norm": 105.05979567930446, + "learning_rate": 5e-06, + "loss": 0.6799, + "num_input_tokens_seen": 181823832, + "step": 1052 + }, + { + "epoch": 0.42046362909672264, + "loss": 0.5329502820968628, + "loss_ce": 0.007681742776185274, + "loss_xval": 0.5234375, + "num_input_tokens_seen": 181823832, + "step": 1052 + }, + { + "epoch": 0.420863309352518, + "grad_norm": 141.7226731667635, + "learning_rate": 5e-06, + "loss": 0.6034, + "num_input_tokens_seen": 181997064, + "step": 1053 + }, + { + "epoch": 0.420863309352518, + "loss": 0.8590031862258911, + "loss_ce": 0.007928947918117046, + "loss_xval": 0.8515625, + "num_input_tokens_seen": 181997064, + "step": 1053 + }, + { + "epoch": 0.4212629896083134, + "grad_norm": 97.79244733216197, + "learning_rate": 5e-06, + "loss": 0.5302, + "num_input_tokens_seen": 182169840, + "step": 1054 + }, + { + "epoch": 0.4212629896083134, + "loss": 0.6665828227996826, + "loss_ce": 0.008318647742271423, + "loss_xval": 0.66015625, + "num_input_tokens_seen": 182169840, + "step": 1054 + }, + { + "epoch": 0.4216626698641087, + "grad_norm": 118.68763350662337, + "learning_rate": 5e-06, + "loss": 0.9278, + "num_input_tokens_seen": 182342800, + "step": 1055 + }, + { + "epoch": 0.4216626698641087, + "loss": 1.1125082969665527, + "loss_ce": 0.01094580627977848, + "loss_xval": 1.1015625, + "num_input_tokens_seen": 182342800, + "step": 1055 + }, + { + "epoch": 0.42206235011990406, + "grad_norm": 80.77252748394068, + "learning_rate": 5e-06, + "loss": 0.5525, + "num_input_tokens_seen": 182515448, + "step": 1056 + }, + { + "epoch": 0.42206235011990406, + "loss": 0.5583893656730652, + "loss_ce": 0.007119842804968357, + "loss_xval": 0.55078125, + "num_input_tokens_seen": 182515448, + "step": 1056 + }, + { + "epoch": 0.42246203037569946, + "grad_norm": 102.19393718382399, + "learning_rate": 5e-06, + "loss": 0.3007, + "num_input_tokens_seen": 182688632, + "step": 1057 + }, + { + "epoch": 0.42246203037569946, + "loss": 0.28939294815063477, + "loss_ce": 0.009790889918804169, + "loss_xval": 0.279296875, + "num_input_tokens_seen": 182688632, + "step": 1057 + }, + { + "epoch": 0.4228617106314948, + "grad_norm": 36.3098200607475, + "learning_rate": 5e-06, + "loss": 0.5062, + "num_input_tokens_seen": 182861544, + "step": 1058 + }, + { + "epoch": 0.4228617106314948, + "loss": 0.6533856391906738, + "loss_ce": 0.0076947640627622604, + "loss_xval": 0.64453125, + "num_input_tokens_seen": 182861544, + "step": 1058 + }, + { + "epoch": 0.42326139088729015, + "grad_norm": 172.13037659969413, + "learning_rate": 5e-06, + "loss": 0.593, + "num_input_tokens_seen": 183034432, + "step": 1059 + }, + { + "epoch": 0.42326139088729015, + "loss": 0.5093013048171997, + "loss_ce": 0.006615748163312674, + "loss_xval": 0.50390625, + "num_input_tokens_seen": 183034432, + "step": 1059 + }, + { + "epoch": 0.42366107114308554, + "grad_norm": 47.71558190089748, + "learning_rate": 5e-06, + "loss": 0.344, + "num_input_tokens_seen": 183207152, + "step": 1060 + }, + { + "epoch": 0.42366107114308554, + "loss": 0.24332204461097717, + "loss_ce": 0.0058342646807432175, + "loss_xval": 0.2373046875, + "num_input_tokens_seen": 183207152, + "step": 1060 + }, + { + "epoch": 0.4240607513988809, + "grad_norm": 225.7577370720187, + "learning_rate": 5e-06, + "loss": 0.8292, + "num_input_tokens_seen": 183380024, + "step": 1061 + }, + { + "epoch": 
0.4240607513988809, + "loss": 1.1143964529037476, + "loss_ce": 0.006730412133038044, + "loss_xval": 1.109375, + "num_input_tokens_seen": 183380024, + "step": 1061 + }, + { + "epoch": 0.4244604316546763, + "grad_norm": 37.571367032139, + "learning_rate": 5e-06, + "loss": 0.5539, + "num_input_tokens_seen": 183553232, + "step": 1062 + }, + { + "epoch": 0.4244604316546763, + "loss": 0.7506046891212463, + "loss_ce": 0.004358367994427681, + "loss_xval": 0.74609375, + "num_input_tokens_seen": 183553232, + "step": 1062 + }, + { + "epoch": 0.4248601119104716, + "grad_norm": 147.5481344304394, + "learning_rate": 5e-06, + "loss": 0.6861, + "num_input_tokens_seen": 183726032, + "step": 1063 + }, + { + "epoch": 0.4248601119104716, + "loss": 0.6540185213088989, + "loss_ce": 0.007289969827979803, + "loss_xval": 0.6484375, + "num_input_tokens_seen": 183726032, + "step": 1063 + }, + { + "epoch": 0.42525979216626697, + "grad_norm": 82.0603792449001, + "learning_rate": 5e-06, + "loss": 0.5386, + "num_input_tokens_seen": 183898888, + "step": 1064 + }, + { + "epoch": 0.42525979216626697, + "loss": 0.5563596487045288, + "loss_ce": 0.007897760719060898, + "loss_xval": 0.546875, + "num_input_tokens_seen": 183898888, + "step": 1064 + }, + { + "epoch": 0.42565947242206237, + "grad_norm": 157.6689432477122, + "learning_rate": 5e-06, + "loss": 0.765, + "num_input_tokens_seen": 184071656, + "step": 1065 + }, + { + "epoch": 0.42565947242206237, + "loss": 0.8480945825576782, + "loss_ce": 0.007274296134710312, + "loss_xval": 0.83984375, + "num_input_tokens_seen": 184071656, + "step": 1065 + }, + { + "epoch": 0.4260591526778577, + "grad_norm": 73.92092672975159, + "learning_rate": 5e-06, + "loss": 0.6531, + "num_input_tokens_seen": 184244448, + "step": 1066 + }, + { + "epoch": 0.4260591526778577, + "loss": 0.43375566601753235, + "loss_ce": 0.00882891844958067, + "loss_xval": 0.42578125, + "num_input_tokens_seen": 184244448, + "step": 1066 + }, + { + "epoch": 0.42645883293365305, + "grad_norm": 137.83919435792907, + "learning_rate": 5e-06, + "loss": 0.7066, + "num_input_tokens_seen": 184417704, + "step": 1067 + }, + { + "epoch": 0.42645883293365305, + "loss": 0.5617185235023499, + "loss_ce": 0.009838663972914219, + "loss_xval": 0.55078125, + "num_input_tokens_seen": 184417704, + "step": 1067 + }, + { + "epoch": 0.42685851318944845, + "grad_norm": 12.10364180031393, + "learning_rate": 5e-06, + "loss": 0.5337, + "num_input_tokens_seen": 184590944, + "step": 1068 + }, + { + "epoch": 0.42685851318944845, + "loss": 0.6997219920158386, + "loss_ce": 0.006026932038366795, + "loss_xval": 0.6953125, + "num_input_tokens_seen": 184590944, + "step": 1068 + }, + { + "epoch": 0.4272581934452438, + "grad_norm": 97.7581622286511, + "learning_rate": 5e-06, + "loss": 0.4564, + "num_input_tokens_seen": 184763904, + "step": 1069 + }, + { + "epoch": 0.4272581934452438, + "loss": 0.5520721673965454, + "loss_ce": 0.007485995534807444, + "loss_xval": 0.54296875, + "num_input_tokens_seen": 184763904, + "step": 1069 + }, + { + "epoch": 0.4276578737010392, + "grad_norm": 44.525008948363784, + "learning_rate": 5e-06, + "loss": 0.5811, + "num_input_tokens_seen": 184936952, + "step": 1070 + }, + { + "epoch": 0.4276578737010392, + "loss": 0.7966837882995605, + "loss_ce": 0.01567797176539898, + "loss_xval": 0.78125, + "num_input_tokens_seen": 184936952, + "step": 1070 + }, + { + "epoch": 0.42805755395683454, + "grad_norm": 61.55085629886339, + "learning_rate": 5e-06, + "loss": 0.5968, + "num_input_tokens_seen": 185109752, + "step": 1071 + }, + { + 
"epoch": 0.42805755395683454, + "loss": 0.614800214767456, + "loss_ce": 0.005913465283811092, + "loss_xval": 0.609375, + "num_input_tokens_seen": 185109752, + "step": 1071 + }, + { + "epoch": 0.4284572342126299, + "grad_norm": 127.48039438818263, + "learning_rate": 5e-06, + "loss": 0.981, + "num_input_tokens_seen": 185282904, + "step": 1072 + }, + { + "epoch": 0.4284572342126299, + "loss": 1.1177836656570435, + "loss_ce": 0.0068827904760837555, + "loss_xval": 1.109375, + "num_input_tokens_seen": 185282904, + "step": 1072 + }, + { + "epoch": 0.4288569144684253, + "grad_norm": 101.69167477228902, + "learning_rate": 5e-06, + "loss": 0.5908, + "num_input_tokens_seen": 185455960, + "step": 1073 + }, + { + "epoch": 0.4288569144684253, + "loss": 0.5060060620307922, + "loss_ce": 0.014306841418147087, + "loss_xval": 0.4921875, + "num_input_tokens_seen": 185455960, + "step": 1073 + }, + { + "epoch": 0.4292565947242206, + "grad_norm": 79.52330036933225, + "learning_rate": 5e-06, + "loss": 0.7136, + "num_input_tokens_seen": 185628528, + "step": 1074 + }, + { + "epoch": 0.4292565947242206, + "loss": 0.9773727655410767, + "loss_ce": 0.010301224887371063, + "loss_xval": 0.96875, + "num_input_tokens_seen": 185628528, + "step": 1074 + }, + { + "epoch": 0.42965627498001596, + "grad_norm": 59.63539541971929, + "learning_rate": 5e-06, + "loss": 0.659, + "num_input_tokens_seen": 185801848, + "step": 1075 + }, + { + "epoch": 0.42965627498001596, + "loss": 1.030850887298584, + "loss_ce": 0.008054335601627827, + "loss_xval": 1.0234375, + "num_input_tokens_seen": 185801848, + "step": 1075 + }, + { + "epoch": 0.43005595523581136, + "grad_norm": 98.38438903473731, + "learning_rate": 5e-06, + "loss": 0.5859, + "num_input_tokens_seen": 185974824, + "step": 1076 + }, + { + "epoch": 0.43005595523581136, + "loss": 0.6300668716430664, + "loss_ce": 0.009705590084195137, + "loss_xval": 0.62109375, + "num_input_tokens_seen": 185974824, + "step": 1076 + }, + { + "epoch": 0.4304556354916067, + "grad_norm": 83.65142565563072, + "learning_rate": 5e-06, + "loss": 0.4725, + "num_input_tokens_seen": 186147832, + "step": 1077 + }, + { + "epoch": 0.4304556354916067, + "loss": 0.40116894245147705, + "loss_ce": 0.005661151837557554, + "loss_xval": 0.39453125, + "num_input_tokens_seen": 186147832, + "step": 1077 + }, + { + "epoch": 0.4308553157474021, + "grad_norm": 118.49626901189295, + "learning_rate": 5e-06, + "loss": 0.7388, + "num_input_tokens_seen": 186321064, + "step": 1078 + }, + { + "epoch": 0.4308553157474021, + "loss": 0.876869797706604, + "loss_ce": 0.005531886592507362, + "loss_xval": 0.87109375, + "num_input_tokens_seen": 186321064, + "step": 1078 + }, + { + "epoch": 0.43125499600319744, + "grad_norm": 109.09146812364261, + "learning_rate": 5e-06, + "loss": 0.5597, + "num_input_tokens_seen": 186493784, + "step": 1079 + }, + { + "epoch": 0.43125499600319744, + "loss": 0.3077165484428406, + "loss_ce": 0.014015364460647106, + "loss_xval": 0.29296875, + "num_input_tokens_seen": 186493784, + "step": 1079 + }, + { + "epoch": 0.4316546762589928, + "grad_norm": 153.6208097109895, + "learning_rate": 5e-06, + "loss": 0.659, + "num_input_tokens_seen": 186666760, + "step": 1080 + }, + { + "epoch": 0.4316546762589928, + "loss": 0.7934675216674805, + "loss_ce": 0.0073347436264157295, + "loss_xval": 0.78515625, + "num_input_tokens_seen": 186666760, + "step": 1080 + }, + { + "epoch": 0.4320543565147882, + "grad_norm": 104.55460590207808, + "learning_rate": 5e-06, + "loss": 0.533, + "num_input_tokens_seen": 186839760, + "step": 1081 + }, + 
{ + "epoch": 0.4320543565147882, + "loss": 0.6861118078231812, + "loss_ce": 0.011856443248689175, + "loss_xval": 0.67578125, + "num_input_tokens_seen": 186839760, + "step": 1081 + }, + { + "epoch": 0.4324540367705835, + "grad_norm": 163.00494027996143, + "learning_rate": 5e-06, + "loss": 0.5373, + "num_input_tokens_seen": 187013040, + "step": 1082 + }, + { + "epoch": 0.4324540367705835, + "loss": 0.47731196880340576, + "loss_ce": 0.005754363723099232, + "loss_xval": 0.470703125, + "num_input_tokens_seen": 187013040, + "step": 1082 + }, + { + "epoch": 0.43285371702637887, + "grad_norm": 56.689099954633605, + "learning_rate": 5e-06, + "loss": 0.5203, + "num_input_tokens_seen": 187186120, + "step": 1083 + }, + { + "epoch": 0.43285371702637887, + "loss": 0.4869546592235565, + "loss_ce": 0.004288674332201481, + "loss_xval": 0.482421875, + "num_input_tokens_seen": 187186120, + "step": 1083 + }, + { + "epoch": 0.43325339728217427, + "grad_norm": 130.52361059914816, + "learning_rate": 5e-06, + "loss": 0.4672, + "num_input_tokens_seen": 187359432, + "step": 1084 + }, + { + "epoch": 0.43325339728217427, + "loss": 0.3323134183883667, + "loss_ce": 0.005409114994108677, + "loss_xval": 0.326171875, + "num_input_tokens_seen": 187359432, + "step": 1084 + }, + { + "epoch": 0.4336530775379696, + "grad_norm": 41.52331087326012, + "learning_rate": 5e-06, + "loss": 0.297, + "num_input_tokens_seen": 187532856, + "step": 1085 + }, + { + "epoch": 0.4336530775379696, + "loss": 0.14482995867729187, + "loss_ce": 0.009652344509959221, + "loss_xval": 0.134765625, + "num_input_tokens_seen": 187532856, + "step": 1085 + }, + { + "epoch": 0.434052757793765, + "grad_norm": 93.52508988364698, + "learning_rate": 5e-06, + "loss": 0.578, + "num_input_tokens_seen": 187705944, + "step": 1086 + }, + { + "epoch": 0.434052757793765, + "loss": 0.5481054186820984, + "loss_ce": 0.00513664074242115, + "loss_xval": 0.54296875, + "num_input_tokens_seen": 187705944, + "step": 1086 + }, + { + "epoch": 0.43445243804956035, + "grad_norm": 80.0157216132746, + "learning_rate": 5e-06, + "loss": 0.6773, + "num_input_tokens_seen": 187878896, + "step": 1087 + }, + { + "epoch": 0.43445243804956035, + "loss": 0.5329375267028809, + "loss_ce": 0.009744150564074516, + "loss_xval": 0.5234375, + "num_input_tokens_seen": 187878896, + "step": 1087 + }, + { + "epoch": 0.4348521183053557, + "grad_norm": 80.72326753655868, + "learning_rate": 5e-06, + "loss": 0.5067, + "num_input_tokens_seen": 188051976, + "step": 1088 + }, + { + "epoch": 0.4348521183053557, + "loss": 0.39130324125289917, + "loss_ce": 0.011908696964383125, + "loss_xval": 0.37890625, + "num_input_tokens_seen": 188051976, + "step": 1088 + }, + { + "epoch": 0.4352517985611511, + "grad_norm": 93.34175906908915, + "learning_rate": 5e-06, + "loss": 0.4928, + "num_input_tokens_seen": 188224992, + "step": 1089 + }, + { + "epoch": 0.4352517985611511, + "loss": 0.6273799538612366, + "loss_ce": 0.007751064375042915, + "loss_xval": 0.62109375, + "num_input_tokens_seen": 188224992, + "step": 1089 + }, + { + "epoch": 0.43565147881694644, + "grad_norm": 87.97360717139733, + "learning_rate": 5e-06, + "loss": 0.8327, + "num_input_tokens_seen": 188397832, + "step": 1090 + }, + { + "epoch": 0.43565147881694644, + "loss": 1.0488063097000122, + "loss_ce": 0.006386911030858755, + "loss_xval": 1.0390625, + "num_input_tokens_seen": 188397832, + "step": 1090 + }, + { + "epoch": 0.43605115907274183, + "grad_norm": 95.46249626601698, + "learning_rate": 5e-06, + "loss": 0.4634, + "num_input_tokens_seen": 188570840, + 
"step": 1091 + }, + { + "epoch": 0.43605115907274183, + "loss": 0.4953336715698242, + "loss_ce": 0.010622961446642876, + "loss_xval": 0.484375, + "num_input_tokens_seen": 188570840, + "step": 1091 + }, + { + "epoch": 0.4364508393285372, + "grad_norm": 78.63885250463996, + "learning_rate": 5e-06, + "loss": 0.938, + "num_input_tokens_seen": 188740200, + "step": 1092 + }, + { + "epoch": 0.4364508393285372, + "loss": 0.8642194271087646, + "loss_ce": 0.007529974915087223, + "loss_xval": 0.85546875, + "num_input_tokens_seen": 188740200, + "step": 1092 + }, + { + "epoch": 0.4368505195843325, + "grad_norm": 43.144393557301015, + "learning_rate": 5e-06, + "loss": 0.3254, + "num_input_tokens_seen": 188913096, + "step": 1093 + }, + { + "epoch": 0.4368505195843325, + "loss": 0.3359166979789734, + "loss_ce": 0.007486535236239433, + "loss_xval": 0.328125, + "num_input_tokens_seen": 188913096, + "step": 1093 + }, + { + "epoch": 0.4372501998401279, + "grad_norm": 40.822521318289155, + "learning_rate": 5e-06, + "loss": 0.6592, + "num_input_tokens_seen": 189085696, + "step": 1094 + }, + { + "epoch": 0.4372501998401279, + "loss": 0.490889310836792, + "loss_ce": 0.007735013496130705, + "loss_xval": 0.482421875, + "num_input_tokens_seen": 189085696, + "step": 1094 + }, + { + "epoch": 0.43764988009592326, + "grad_norm": 35.439525893578136, + "learning_rate": 5e-06, + "loss": 0.4484, + "num_input_tokens_seen": 189258752, + "step": 1095 + }, + { + "epoch": 0.43764988009592326, + "loss": 0.5019693374633789, + "loss_ce": 0.00807284377515316, + "loss_xval": 0.494140625, + "num_input_tokens_seen": 189258752, + "step": 1095 + }, + { + "epoch": 0.4380495603517186, + "grad_norm": 54.20083299822031, + "learning_rate": 5e-06, + "loss": 0.4249, + "num_input_tokens_seen": 189431568, + "step": 1096 + }, + { + "epoch": 0.4380495603517186, + "loss": 0.49263429641723633, + "loss_ce": 0.006062053143978119, + "loss_xval": 0.486328125, + "num_input_tokens_seen": 189431568, + "step": 1096 + }, + { + "epoch": 0.438449240607514, + "grad_norm": 41.375718060440676, + "learning_rate": 5e-06, + "loss": 0.4014, + "num_input_tokens_seen": 189604344, + "step": 1097 + }, + { + "epoch": 0.438449240607514, + "loss": 0.5682648420333862, + "loss_ce": 0.00735173374414444, + "loss_xval": 0.5625, + "num_input_tokens_seen": 189604344, + "step": 1097 + }, + { + "epoch": 0.43884892086330934, + "grad_norm": 68.92925320134385, + "learning_rate": 5e-06, + "loss": 0.6919, + "num_input_tokens_seen": 189777200, + "step": 1098 + }, + { + "epoch": 0.43884892086330934, + "loss": 0.9103479385375977, + "loss_ce": 0.00611207727342844, + "loss_xval": 0.90234375, + "num_input_tokens_seen": 189777200, + "step": 1098 + }, + { + "epoch": 0.43924860111910474, + "grad_norm": 87.815067117457, + "learning_rate": 5e-06, + "loss": 0.6971, + "num_input_tokens_seen": 189950504, + "step": 1099 + }, + { + "epoch": 0.43924860111910474, + "loss": 0.8111795783042908, + "loss_ce": 0.009421739727258682, + "loss_xval": 0.80078125, + "num_input_tokens_seen": 189950504, + "step": 1099 + }, + { + "epoch": 0.4396482813749001, + "grad_norm": 72.59872907713559, + "learning_rate": 5e-06, + "loss": 0.7176, + "num_input_tokens_seen": 190123456, + "step": 1100 + }, + { + "epoch": 0.4396482813749001, + "loss": 0.6115920543670654, + "loss_ce": 0.004917819052934647, + "loss_xval": 0.60546875, + "num_input_tokens_seen": 190123456, + "step": 1100 + }, + { + "epoch": 0.44004796163069543, + "grad_norm": 43.80495265395074, + "learning_rate": 5e-06, + "loss": 0.398, + "num_input_tokens_seen": 
190296440, + "step": 1101 + }, + { + "epoch": 0.44004796163069543, + "loss": 0.44945085048675537, + "loss_ce": 0.010089308023452759, + "loss_xval": 0.439453125, + "num_input_tokens_seen": 190296440, + "step": 1101 + }, + { + "epoch": 0.4404476418864908, + "grad_norm": 43.888561938956094, + "learning_rate": 5e-06, + "loss": 0.5756, + "num_input_tokens_seen": 190469448, + "step": 1102 + }, + { + "epoch": 0.4404476418864908, + "loss": 0.34963488578796387, + "loss_ce": 0.009516467340290546, + "loss_xval": 0.33984375, + "num_input_tokens_seen": 190469448, + "step": 1102 + }, + { + "epoch": 0.44084732214228617, + "grad_norm": 53.76726245558171, + "learning_rate": 5e-06, + "loss": 0.5227, + "num_input_tokens_seen": 190639032, + "step": 1103 + }, + { + "epoch": 0.44084732214228617, + "loss": 0.42856094241142273, + "loss_ce": 0.003573148977011442, + "loss_xval": 0.42578125, + "num_input_tokens_seen": 190639032, + "step": 1103 + }, + { + "epoch": 0.4412470023980815, + "grad_norm": 52.770932080369946, + "learning_rate": 5e-06, + "loss": 0.639, + "num_input_tokens_seen": 190811968, + "step": 1104 + }, + { + "epoch": 0.4412470023980815, + "loss": 0.8563051819801331, + "loss_ce": 0.00498683238402009, + "loss_xval": 0.8515625, + "num_input_tokens_seen": 190811968, + "step": 1104 + }, + { + "epoch": 0.4416466826538769, + "grad_norm": 108.72621992851067, + "learning_rate": 5e-06, + "loss": 0.6654, + "num_input_tokens_seen": 190984856, + "step": 1105 + }, + { + "epoch": 0.4416466826538769, + "loss": 0.4479440748691559, + "loss_ce": 0.012885487638413906, + "loss_xval": 0.435546875, + "num_input_tokens_seen": 190984856, + "step": 1105 + }, + { + "epoch": 0.44204636290967225, + "grad_norm": 57.656324232108965, + "learning_rate": 5e-06, + "loss": 0.3461, + "num_input_tokens_seen": 191157688, + "step": 1106 + }, + { + "epoch": 0.44204636290967225, + "loss": 0.3032693564891815, + "loss_ce": 0.004014000296592712, + "loss_xval": 0.298828125, + "num_input_tokens_seen": 191157688, + "step": 1106 + }, + { + "epoch": 0.44244604316546765, + "grad_norm": 55.89449040965285, + "learning_rate": 5e-06, + "loss": 0.5353, + "num_input_tokens_seen": 191330880, + "step": 1107 + }, + { + "epoch": 0.44244604316546765, + "loss": 0.45665621757507324, + "loss_ce": 0.010245123878121376, + "loss_xval": 0.447265625, + "num_input_tokens_seen": 191330880, + "step": 1107 + }, + { + "epoch": 0.442845723421263, + "grad_norm": 94.95423178563114, + "learning_rate": 5e-06, + "loss": 0.3389, + "num_input_tokens_seen": 191503792, + "step": 1108 + }, + { + "epoch": 0.442845723421263, + "loss": 0.40393486618995667, + "loss_ce": 0.009342581033706665, + "loss_xval": 0.39453125, + "num_input_tokens_seen": 191503792, + "step": 1108 + }, + { + "epoch": 0.44324540367705834, + "grad_norm": 78.28345803795365, + "learning_rate": 5e-06, + "loss": 0.6797, + "num_input_tokens_seen": 191676888, + "step": 1109 + }, + { + "epoch": 0.44324540367705834, + "loss": 0.7096420526504517, + "loss_ce": 0.00401464244350791, + "loss_xval": 0.70703125, + "num_input_tokens_seen": 191676888, + "step": 1109 + }, + { + "epoch": 0.44364508393285373, + "grad_norm": 61.34569052749395, + "learning_rate": 5e-06, + "loss": 0.4274, + "num_input_tokens_seen": 191850256, + "step": 1110 + }, + { + "epoch": 0.44364508393285373, + "loss": 0.44284114241600037, + "loss_ce": 0.004608696326613426, + "loss_xval": 0.4375, + "num_input_tokens_seen": 191850256, + "step": 1110 + }, + { + "epoch": 0.4440447641886491, + "grad_norm": 81.82998770177993, + "learning_rate": 5e-06, + "loss": 0.586, + 
"num_input_tokens_seen": 192023160, + "step": 1111 + }, + { + "epoch": 0.4440447641886491, + "loss": 0.41691532731056213, + "loss_ce": 0.0032190163619816303, + "loss_xval": 0.4140625, + "num_input_tokens_seen": 192023160, + "step": 1111 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 56.85534103714712, + "learning_rate": 5e-06, + "loss": 0.84, + "num_input_tokens_seen": 192196264, + "step": 1112 + }, + { + "epoch": 0.4444444444444444, + "loss": 0.5856711268424988, + "loss_ce": 0.0033957427367568016, + "loss_xval": 0.58203125, + "num_input_tokens_seen": 192196264, + "step": 1112 + }, + { + "epoch": 0.4448441247002398, + "grad_norm": 53.319073892773986, + "learning_rate": 5e-06, + "loss": 0.5192, + "num_input_tokens_seen": 192369400, + "step": 1113 + }, + { + "epoch": 0.4448441247002398, + "loss": 0.6779003143310547, + "loss_ce": 0.007734273560345173, + "loss_xval": 0.671875, + "num_input_tokens_seen": 192369400, + "step": 1113 + }, + { + "epoch": 0.44524380495603516, + "grad_norm": 53.64924867403011, + "learning_rate": 5e-06, + "loss": 0.7078, + "num_input_tokens_seen": 192542208, + "step": 1114 + }, + { + "epoch": 0.44524380495603516, + "loss": 0.7112681865692139, + "loss_ce": 0.004114857874810696, + "loss_xval": 0.70703125, + "num_input_tokens_seen": 192542208, + "step": 1114 + }, + { + "epoch": 0.44564348521183056, + "grad_norm": 94.76319006567356, + "learning_rate": 5e-06, + "loss": 0.8391, + "num_input_tokens_seen": 192714968, + "step": 1115 + }, + { + "epoch": 0.44564348521183056, + "loss": 0.9513822793960571, + "loss_ce": 0.006313872057944536, + "loss_xval": 0.9453125, + "num_input_tokens_seen": 192714968, + "step": 1115 + }, + { + "epoch": 0.4460431654676259, + "grad_norm": 32.46405483729325, + "learning_rate": 5e-06, + "loss": 0.698, + "num_input_tokens_seen": 192887824, + "step": 1116 + }, + { + "epoch": 0.4460431654676259, + "loss": 0.3914491534233093, + "loss_ce": 0.00399797223508358, + "loss_xval": 0.38671875, + "num_input_tokens_seen": 192887824, + "step": 1116 + }, + { + "epoch": 0.44644284572342124, + "grad_norm": 86.48967997737158, + "learning_rate": 5e-06, + "loss": 0.649, + "num_input_tokens_seen": 193061072, + "step": 1117 + }, + { + "epoch": 0.44644284572342124, + "loss": 0.6223208904266357, + "loss_ce": 0.003363374387845397, + "loss_xval": 0.6171875, + "num_input_tokens_seen": 193061072, + "step": 1117 + }, + { + "epoch": 0.44684252597921664, + "grad_norm": 31.61585246517527, + "learning_rate": 5e-06, + "loss": 0.8002, + "num_input_tokens_seen": 193234224, + "step": 1118 + }, + { + "epoch": 0.44684252597921664, + "loss": 0.7989094257354736, + "loss_ce": 0.004201183095574379, + "loss_xval": 0.79296875, + "num_input_tokens_seen": 193234224, + "step": 1118 + }, + { + "epoch": 0.447242206235012, + "grad_norm": 74.71859039399737, + "learning_rate": 5e-06, + "loss": 0.6386, + "num_input_tokens_seen": 193406944, + "step": 1119 + }, + { + "epoch": 0.447242206235012, + "loss": 0.7952804565429688, + "loss_ce": 0.004783664830029011, + "loss_xval": 0.7890625, + "num_input_tokens_seen": 193406944, + "step": 1119 + }, + { + "epoch": 0.44764188649080733, + "grad_norm": 67.74121508591531, + "learning_rate": 5e-06, + "loss": 0.9679, + "num_input_tokens_seen": 193579568, + "step": 1120 + }, + { + "epoch": 0.44764188649080733, + "loss": 1.1605088710784912, + "loss_ce": 0.004624995868653059, + "loss_xval": 1.15625, + "num_input_tokens_seen": 193579568, + "step": 1120 + }, + { + "epoch": 0.4480415667466027, + "grad_norm": 159.3426492178321, + "learning_rate": 5e-06, + "loss": 
0.6493, + "num_input_tokens_seen": 193752760, + "step": 1121 + }, + { + "epoch": 0.4480415667466027, + "loss": 0.7691453695297241, + "loss_ce": 0.009623829275369644, + "loss_xval": 0.7578125, + "num_input_tokens_seen": 193752760, + "step": 1121 + }, + { + "epoch": 0.44844124700239807, + "grad_norm": 19.412155259998478, + "learning_rate": 5e-06, + "loss": 0.4257, + "num_input_tokens_seen": 193925760, + "step": 1122 + }, + { + "epoch": 0.44844124700239807, + "loss": 0.35129088163375854, + "loss_ce": 0.019381720572710037, + "loss_xval": 0.33203125, + "num_input_tokens_seen": 193925760, + "step": 1122 + }, + { + "epoch": 0.44884092725819347, + "grad_norm": 132.37602488962582, + "learning_rate": 5e-06, + "loss": 0.6866, + "num_input_tokens_seen": 194098728, + "step": 1123 + }, + { + "epoch": 0.44884092725819347, + "loss": 0.681769847869873, + "loss_ce": 0.011481714434921741, + "loss_xval": 0.671875, + "num_input_tokens_seen": 194098728, + "step": 1123 + }, + { + "epoch": 0.4492406075139888, + "grad_norm": 56.93257373319191, + "learning_rate": 5e-06, + "loss": 0.4694, + "num_input_tokens_seen": 194271896, + "step": 1124 + }, + { + "epoch": 0.4492406075139888, + "loss": 0.3968222737312317, + "loss_ce": 0.008516602218151093, + "loss_xval": 0.388671875, + "num_input_tokens_seen": 194271896, + "step": 1124 + }, + { + "epoch": 0.44964028776978415, + "grad_norm": 141.95283709265078, + "learning_rate": 5e-06, + "loss": 0.4908, + "num_input_tokens_seen": 194444824, + "step": 1125 + }, + { + "epoch": 0.44964028776978415, + "loss": 0.46788841485977173, + "loss_ce": 0.005364010110497475, + "loss_xval": 0.462890625, + "num_input_tokens_seen": 194444824, + "step": 1125 + }, + { + "epoch": 0.45003996802557955, + "grad_norm": 74.83771643983036, + "learning_rate": 5e-06, + "loss": 0.6375, + "num_input_tokens_seen": 194614360, + "step": 1126 + }, + { + "epoch": 0.45003996802557955, + "loss": 0.42517510056495667, + "loss_ce": 0.006016166415065527, + "loss_xval": 0.419921875, + "num_input_tokens_seen": 194614360, + "step": 1126 + }, + { + "epoch": 0.4504396482813749, + "grad_norm": 89.4636056566743, + "learning_rate": 5e-06, + "loss": 0.7524, + "num_input_tokens_seen": 194786968, + "step": 1127 + }, + { + "epoch": 0.4504396482813749, + "loss": 0.8362482190132141, + "loss_ce": 0.0059259673580527306, + "loss_xval": 0.83203125, + "num_input_tokens_seen": 194786968, + "step": 1127 + }, + { + "epoch": 0.45083932853717024, + "grad_norm": 104.5467139927948, + "learning_rate": 5e-06, + "loss": 0.7947, + "num_input_tokens_seen": 194959904, + "step": 1128 + }, + { + "epoch": 0.45083932853717024, + "loss": 0.5858356952667236, + "loss_ce": 0.0061237625777721405, + "loss_xval": 0.578125, + "num_input_tokens_seen": 194959904, + "step": 1128 + }, + { + "epoch": 0.45123900879296563, + "grad_norm": 103.50952784722763, + "learning_rate": 5e-06, + "loss": 0.5609, + "num_input_tokens_seen": 195133032, + "step": 1129 + }, + { + "epoch": 0.45123900879296563, + "loss": 0.3132474422454834, + "loss_ce": 0.005569221451878548, + "loss_xval": 0.30859375, + "num_input_tokens_seen": 195133032, + "step": 1129 + }, + { + "epoch": 0.451638689048761, + "grad_norm": 85.79342122693706, + "learning_rate": 5e-06, + "loss": 0.636, + "num_input_tokens_seen": 195305808, + "step": 1130 + }, + { + "epoch": 0.451638689048761, + "loss": 0.5406002402305603, + "loss_ce": 0.00495569733902812, + "loss_xval": 0.53515625, + "num_input_tokens_seen": 195305808, + "step": 1130 + }, + { + "epoch": 0.4520383693045564, + "grad_norm": 57.37006766323614, + 
"learning_rate": 5e-06, + "loss": 0.516, + "num_input_tokens_seen": 195478768, + "step": 1131 + }, + { + "epoch": 0.4520383693045564, + "loss": 0.344220370054245, + "loss_ce": 0.004925938788801432, + "loss_xval": 0.33984375, + "num_input_tokens_seen": 195478768, + "step": 1131 + }, + { + "epoch": 0.4524380495603517, + "grad_norm": 120.1394915291317, + "learning_rate": 5e-06, + "loss": 0.459, + "num_input_tokens_seen": 195651456, + "step": 1132 + }, + { + "epoch": 0.4524380495603517, + "loss": 0.5401527881622314, + "loss_ce": 0.0051185921765863895, + "loss_xval": 0.53515625, + "num_input_tokens_seen": 195651456, + "step": 1132 + }, + { + "epoch": 0.45283772981614706, + "grad_norm": 54.69748517929681, + "learning_rate": 5e-06, + "loss": 0.7013, + "num_input_tokens_seen": 195824560, + "step": 1133 + }, + { + "epoch": 0.45283772981614706, + "loss": 0.8380387425422668, + "loss_ce": 0.006373690906912088, + "loss_xval": 0.83203125, + "num_input_tokens_seen": 195824560, + "step": 1133 + }, + { + "epoch": 0.45323741007194246, + "grad_norm": 95.7615162884942, + "learning_rate": 5e-06, + "loss": 0.4366, + "num_input_tokens_seen": 195997400, + "step": 1134 + }, + { + "epoch": 0.45323741007194246, + "loss": 0.31681621074676514, + "loss_ce": 0.0037669152952730656, + "loss_xval": 0.3125, + "num_input_tokens_seen": 195997400, + "step": 1134 + }, + { + "epoch": 0.4536370903277378, + "grad_norm": 51.51343251960469, + "learning_rate": 5e-06, + "loss": 0.9125, + "num_input_tokens_seen": 196170648, + "step": 1135 + }, + { + "epoch": 0.4536370903277378, + "loss": 0.5430760979652405, + "loss_ce": 0.004868092946708202, + "loss_xval": 0.5390625, + "num_input_tokens_seen": 196170648, + "step": 1135 + }, + { + "epoch": 0.4540367705835332, + "grad_norm": 41.699713092805354, + "learning_rate": 5e-06, + "loss": 0.8595, + "num_input_tokens_seen": 196343824, + "step": 1136 + }, + { + "epoch": 0.4540367705835332, + "loss": 0.8823078274726868, + "loss_ce": 0.013533404096961021, + "loss_xval": 0.8671875, + "num_input_tokens_seen": 196343824, + "step": 1136 + }, + { + "epoch": 0.45443645083932854, + "grad_norm": 47.49786214842191, + "learning_rate": 5e-06, + "loss": 0.7696, + "num_input_tokens_seen": 196517272, + "step": 1137 + }, + { + "epoch": 0.45443645083932854, + "loss": 0.6541973948478699, + "loss_ce": 0.006553375627845526, + "loss_xval": 0.6484375, + "num_input_tokens_seen": 196517272, + "step": 1137 + }, + { + "epoch": 0.4548361310951239, + "grad_norm": 38.86335658205585, + "learning_rate": 5e-06, + "loss": 0.5626, + "num_input_tokens_seen": 196690024, + "step": 1138 + }, + { + "epoch": 0.4548361310951239, + "loss": 0.49186328053474426, + "loss_ce": 0.0104790348559618, + "loss_xval": 0.48046875, + "num_input_tokens_seen": 196690024, + "step": 1138 + }, + { + "epoch": 0.4552358113509193, + "grad_norm": 74.08500005053307, + "learning_rate": 5e-06, + "loss": 0.6865, + "num_input_tokens_seen": 196862856, + "step": 1139 + }, + { + "epoch": 0.4552358113509193, + "loss": 0.9089970588684082, + "loss_ce": 0.010742646642029285, + "loss_xval": 0.8984375, + "num_input_tokens_seen": 196862856, + "step": 1139 + }, + { + "epoch": 0.4556354916067146, + "grad_norm": 62.43683214837666, + "learning_rate": 5e-06, + "loss": 0.9647, + "num_input_tokens_seen": 197035824, + "step": 1140 + }, + { + "epoch": 0.4556354916067146, + "loss": 1.02647864818573, + "loss_ce": 0.0059097823686897755, + "loss_xval": 1.0234375, + "num_input_tokens_seen": 197035824, + "step": 1140 + }, + { + "epoch": 0.45603517186250997, + "grad_norm": 44.531689617859236, 
+ "learning_rate": 5e-06, + "loss": 0.7474, + "num_input_tokens_seen": 197208824, + "step": 1141 + }, + { + "epoch": 0.45603517186250997, + "loss": 0.6352089643478394, + "loss_ce": 0.005020937416702509, + "loss_xval": 0.62890625, + "num_input_tokens_seen": 197208824, + "step": 1141 + }, + { + "epoch": 0.45643485211830537, + "grad_norm": 33.00928243102856, + "learning_rate": 5e-06, + "loss": 0.5729, + "num_input_tokens_seen": 197381696, + "step": 1142 + }, + { + "epoch": 0.45643485211830537, + "loss": 0.5751632452011108, + "loss_ce": 0.005094892345368862, + "loss_xval": 0.5703125, + "num_input_tokens_seen": 197381696, + "step": 1142 + }, + { + "epoch": 0.4568345323741007, + "grad_norm": 110.0605177033546, + "learning_rate": 5e-06, + "loss": 0.8354, + "num_input_tokens_seen": 197554728, + "step": 1143 + }, + { + "epoch": 0.4568345323741007, + "loss": 0.8544121384620667, + "loss_ce": 0.005535189062356949, + "loss_xval": 0.84765625, + "num_input_tokens_seen": 197554728, + "step": 1143 + }, + { + "epoch": 0.4572342126298961, + "grad_norm": 66.44659371930612, + "learning_rate": 5e-06, + "loss": 0.6322, + "num_input_tokens_seen": 197727824, + "step": 1144 + }, + { + "epoch": 0.4572342126298961, + "loss": 0.6607143878936768, + "loss_ce": 0.006173363886773586, + "loss_xval": 0.65625, + "num_input_tokens_seen": 197727824, + "step": 1144 + }, + { + "epoch": 0.45763389288569145, + "grad_norm": 43.536496164695684, + "learning_rate": 5e-06, + "loss": 0.6087, + "num_input_tokens_seen": 197900352, + "step": 1145 + }, + { + "epoch": 0.45763389288569145, + "loss": 0.5236250162124634, + "loss_ce": 0.0048262146301567554, + "loss_xval": 0.51953125, + "num_input_tokens_seen": 197900352, + "step": 1145 + }, + { + "epoch": 0.4580335731414868, + "grad_norm": 99.45161663771454, + "learning_rate": 5e-06, + "loss": 0.7284, + "num_input_tokens_seen": 198073304, + "step": 1146 + }, + { + "epoch": 0.4580335731414868, + "loss": 0.7672001123428345, + "loss_ce": 0.005206714384257793, + "loss_xval": 0.76171875, + "num_input_tokens_seen": 198073304, + "step": 1146 + }, + { + "epoch": 0.4584332533972822, + "grad_norm": 81.21506953379286, + "learning_rate": 5e-06, + "loss": 0.6978, + "num_input_tokens_seen": 198246440, + "step": 1147 + }, + { + "epoch": 0.4584332533972822, + "loss": 0.574648380279541, + "loss_ce": 0.012514561414718628, + "loss_xval": 0.5625, + "num_input_tokens_seen": 198246440, + "step": 1147 + }, + { + "epoch": 0.45883293365307753, + "grad_norm": 79.41751309974842, + "learning_rate": 5e-06, + "loss": 1.0044, + "num_input_tokens_seen": 198419656, + "step": 1148 + }, + { + "epoch": 0.45883293365307753, + "loss": 0.8175091743469238, + "loss_ce": 0.004032664000988007, + "loss_xval": 0.8125, + "num_input_tokens_seen": 198419656, + "step": 1148 + }, + { + "epoch": 0.4592326139088729, + "grad_norm": 69.40833245914523, + "learning_rate": 5e-06, + "loss": 0.4482, + "num_input_tokens_seen": 198592384, + "step": 1149 + }, + { + "epoch": 0.4592326139088729, + "loss": 0.37215864658355713, + "loss_ce": 0.00448285136371851, + "loss_xval": 0.3671875, + "num_input_tokens_seen": 198592384, + "step": 1149 + }, + { + "epoch": 0.4596322941646683, + "grad_norm": 122.05219702328993, + "learning_rate": 5e-06, + "loss": 0.5041, + "num_input_tokens_seen": 198765368, + "step": 1150 + }, + { + "epoch": 0.4596322941646683, + "loss": 0.2772751450538635, + "loss_ce": 0.010124286636710167, + "loss_xval": 0.267578125, + "num_input_tokens_seen": 198765368, + "step": 1150 + }, + { + "epoch": 0.4600319744204636, + "grad_norm": 
55.85383031494442, + "learning_rate": 5e-06, + "loss": 0.76, + "num_input_tokens_seen": 198938392, + "step": 1151 + }, + { + "epoch": 0.4600319744204636, + "loss": 0.8261213302612305, + "loss_ce": 0.0045881494879722595, + "loss_xval": 0.8203125, + "num_input_tokens_seen": 198938392, + "step": 1151 + }, + { + "epoch": 0.460431654676259, + "grad_norm": 53.35549520327359, + "learning_rate": 5e-06, + "loss": 0.489, + "num_input_tokens_seen": 199111568, + "step": 1152 + }, + { + "epoch": 0.460431654676259, + "loss": 0.6392042636871338, + "loss_ce": 0.007368315011262894, + "loss_xval": 0.6328125, + "num_input_tokens_seen": 199111568, + "step": 1152 + }, + { + "epoch": 0.46083133493205436, + "grad_norm": 41.969663825725945, + "learning_rate": 5e-06, + "loss": 0.5099, + "num_input_tokens_seen": 199284384, + "step": 1153 + }, + { + "epoch": 0.46083133493205436, + "loss": 0.5351383686065674, + "loss_ce": 0.004376672208309174, + "loss_xval": 0.53125, + "num_input_tokens_seen": 199284384, + "step": 1153 + }, + { + "epoch": 0.4612310151878497, + "grad_norm": 62.75425126705116, + "learning_rate": 5e-06, + "loss": 0.5918, + "num_input_tokens_seen": 199457280, + "step": 1154 + }, + { + "epoch": 0.4612310151878497, + "loss": 0.7125241756439209, + "loss_ce": 0.006286341696977615, + "loss_xval": 0.70703125, + "num_input_tokens_seen": 199457280, + "step": 1154 + }, + { + "epoch": 0.4616306954436451, + "grad_norm": 50.715666738129826, + "learning_rate": 5e-06, + "loss": 0.5002, + "num_input_tokens_seen": 199630232, + "step": 1155 + }, + { + "epoch": 0.4616306954436451, + "loss": 0.25332915782928467, + "loss_ce": 0.0032681506127119064, + "loss_xval": 0.25, + "num_input_tokens_seen": 199630232, + "step": 1155 + }, + { + "epoch": 0.46203037569944044, + "grad_norm": 29.166937844481254, + "learning_rate": 5e-06, + "loss": 0.4687, + "num_input_tokens_seen": 199803208, + "step": 1156 + }, + { + "epoch": 0.46203037569944044, + "loss": 0.3712252378463745, + "loss_ce": 0.0047701504081487656, + "loss_xval": 0.3671875, + "num_input_tokens_seen": 199803208, + "step": 1156 + }, + { + "epoch": 0.4624300559552358, + "grad_norm": 67.21113680973119, + "learning_rate": 5e-06, + "loss": 0.8676, + "num_input_tokens_seen": 199976416, + "step": 1157 + }, + { + "epoch": 0.4624300559552358, + "loss": 0.41837334632873535, + "loss_ce": 0.003608925500884652, + "loss_xval": 0.4140625, + "num_input_tokens_seen": 199976416, + "step": 1157 + }, + { + "epoch": 0.4628297362110312, + "grad_norm": 36.177291288625426, + "learning_rate": 5e-06, + "loss": 0.4889, + "num_input_tokens_seen": 200149720, + "step": 1158 + }, + { + "epoch": 0.4628297362110312, + "loss": 0.7105360627174377, + "loss_ce": 0.005763140507042408, + "loss_xval": 0.703125, + "num_input_tokens_seen": 200149720, + "step": 1158 + }, + { + "epoch": 0.4632294164668265, + "grad_norm": 144.37390819755106, + "learning_rate": 5e-06, + "loss": 0.3905, + "num_input_tokens_seen": 200322696, + "step": 1159 + }, + { + "epoch": 0.4632294164668265, + "loss": 0.48566287755966187, + "loss_ce": 0.0023254724219441414, + "loss_xval": 0.482421875, + "num_input_tokens_seen": 200322696, + "step": 1159 + }, + { + "epoch": 0.4636290967226219, + "grad_norm": 34.84183224871753, + "learning_rate": 5e-06, + "loss": 0.2845, + "num_input_tokens_seen": 200496096, + "step": 1160 + }, + { + "epoch": 0.4636290967226219, + "loss": 0.27525389194488525, + "loss_ce": 0.0022894316352903843, + "loss_xval": 0.2734375, + "num_input_tokens_seen": 200496096, + "step": 1160 + }, + { + "epoch": 0.46402877697841727, + 
"grad_norm": 52.0086165058688, + "learning_rate": 5e-06, + "loss": 0.6896, + "num_input_tokens_seen": 200669288, + "step": 1161 + }, + { + "epoch": 0.46402877697841727, + "loss": 1.063295602798462, + "loss_ce": 0.009096423164010048, + "loss_xval": 1.0546875, + "num_input_tokens_seen": 200669288, + "step": 1161 + }, + { + "epoch": 0.4644284572342126, + "grad_norm": 48.3637411249726, + "learning_rate": 5e-06, + "loss": 0.5238, + "num_input_tokens_seen": 200842184, + "step": 1162 + }, + { + "epoch": 0.4644284572342126, + "loss": 0.41702714562416077, + "loss_ce": 0.00394120067358017, + "loss_xval": 0.4140625, + "num_input_tokens_seen": 200842184, + "step": 1162 + }, + { + "epoch": 0.464828137490008, + "grad_norm": 95.71936689991736, + "learning_rate": 5e-06, + "loss": 0.5964, + "num_input_tokens_seen": 201015320, + "step": 1163 + }, + { + "epoch": 0.464828137490008, + "loss": 0.45856761932373047, + "loss_ce": 0.002146715298295021, + "loss_xval": 0.45703125, + "num_input_tokens_seen": 201015320, + "step": 1163 + }, + { + "epoch": 0.46522781774580335, + "grad_norm": 47.786200135317, + "learning_rate": 5e-06, + "loss": 0.8803, + "num_input_tokens_seen": 201187928, + "step": 1164 + }, + { + "epoch": 0.46522781774580335, + "loss": 1.0784153938293457, + "loss_ce": 0.007370521314442158, + "loss_xval": 1.0703125, + "num_input_tokens_seen": 201187928, + "step": 1164 + }, + { + "epoch": 0.4656274980015987, + "grad_norm": 70.62039457961404, + "learning_rate": 5e-06, + "loss": 0.9221, + "num_input_tokens_seen": 201360888, + "step": 1165 + }, + { + "epoch": 0.4656274980015987, + "loss": 0.8973113298416138, + "loss_ce": 0.004855302162468433, + "loss_xval": 0.890625, + "num_input_tokens_seen": 201360888, + "step": 1165 + }, + { + "epoch": 0.4660271782573941, + "grad_norm": 35.876824576405056, + "learning_rate": 5e-06, + "loss": 0.4456, + "num_input_tokens_seen": 201533736, + "step": 1166 + }, + { + "epoch": 0.4660271782573941, + "loss": 0.5968400835990906, + "loss_ce": 0.006630140822380781, + "loss_xval": 0.58984375, + "num_input_tokens_seen": 201533736, + "step": 1166 + }, + { + "epoch": 0.46642685851318944, + "grad_norm": 63.15386629040931, + "learning_rate": 5e-06, + "loss": 0.683, + "num_input_tokens_seen": 201706496, + "step": 1167 + }, + { + "epoch": 0.46642685851318944, + "loss": 0.5146918296813965, + "loss_ce": 0.0032782740890979767, + "loss_xval": 0.51171875, + "num_input_tokens_seen": 201706496, + "step": 1167 + }, + { + "epoch": 0.46682653876898483, + "grad_norm": 51.66674260877597, + "learning_rate": 5e-06, + "loss": 0.6915, + "num_input_tokens_seen": 201879464, + "step": 1168 + }, + { + "epoch": 0.46682653876898483, + "loss": 1.018492341041565, + "loss_ce": 0.00811633188277483, + "loss_xval": 1.0078125, + "num_input_tokens_seen": 201879464, + "step": 1168 + }, + { + "epoch": 0.4672262190247802, + "grad_norm": 49.273976329122995, + "learning_rate": 5e-06, + "loss": 0.376, + "num_input_tokens_seen": 202052384, + "step": 1169 + }, + { + "epoch": 0.4672262190247802, + "loss": 0.39832669496536255, + "loss_ce": 0.003398712258785963, + "loss_xval": 0.39453125, + "num_input_tokens_seen": 202052384, + "step": 1169 + }, + { + "epoch": 0.4676258992805755, + "grad_norm": 101.63398509107819, + "learning_rate": 5e-06, + "loss": 0.6936, + "num_input_tokens_seen": 202225344, + "step": 1170 + }, + { + "epoch": 0.4676258992805755, + "loss": 0.5724492073059082, + "loss_ce": 0.004242459312081337, + "loss_xval": 0.56640625, + "num_input_tokens_seen": 202225344, + "step": 1170 + }, + { + "epoch": 0.4680255795363709, 
+ "grad_norm": 21.379650294342827, + "learning_rate": 5e-06, + "loss": 0.4938, + "num_input_tokens_seen": 202398136, + "step": 1171 + }, + { + "epoch": 0.4680255795363709, + "loss": 0.41817671060562134, + "loss_ce": 0.0021305850241333246, + "loss_xval": 0.416015625, + "num_input_tokens_seen": 202398136, + "step": 1171 + }, + { + "epoch": 0.46842525979216626, + "grad_norm": 70.14687736374864, + "learning_rate": 5e-06, + "loss": 0.695, + "num_input_tokens_seen": 202571104, + "step": 1172 + }, + { + "epoch": 0.46842525979216626, + "loss": 0.5816267728805542, + "loss_ce": 0.005393843166530132, + "loss_xval": 0.578125, + "num_input_tokens_seen": 202571104, + "step": 1172 + }, + { + "epoch": 0.46882494004796166, + "grad_norm": 24.91607803294252, + "learning_rate": 5e-06, + "loss": 0.6048, + "num_input_tokens_seen": 202744160, + "step": 1173 + }, + { + "epoch": 0.46882494004796166, + "loss": 0.5543652176856995, + "loss_ce": 0.004911464173346758, + "loss_xval": 0.55078125, + "num_input_tokens_seen": 202744160, + "step": 1173 + }, + { + "epoch": 0.469224620303757, + "grad_norm": 59.37963682222382, + "learning_rate": 5e-06, + "loss": 0.4961, + "num_input_tokens_seen": 202916928, + "step": 1174 + }, + { + "epoch": 0.469224620303757, + "loss": 0.5824704170227051, + "loss_ce": 0.008221141993999481, + "loss_xval": 0.57421875, + "num_input_tokens_seen": 202916928, + "step": 1174 + }, + { + "epoch": 0.46962430055955234, + "grad_norm": 61.38563783071624, + "learning_rate": 5e-06, + "loss": 0.9222, + "num_input_tokens_seen": 203090040, + "step": 1175 + }, + { + "epoch": 0.46962430055955234, + "loss": 0.6337473392486572, + "loss_ce": 0.003986579366028309, + "loss_xval": 0.62890625, + "num_input_tokens_seen": 203090040, + "step": 1175 + }, + { + "epoch": 0.47002398081534774, + "grad_norm": 30.520568694364325, + "learning_rate": 5e-06, + "loss": 0.5868, + "num_input_tokens_seen": 203260272, + "step": 1176 + }, + { + "epoch": 0.47002398081534774, + "loss": 0.7021505236625671, + "loss_ce": 0.009065819904208183, + "loss_xval": 0.69140625, + "num_input_tokens_seen": 203260272, + "step": 1176 + }, + { + "epoch": 0.4704236610711431, + "grad_norm": 114.80561125694719, + "learning_rate": 5e-06, + "loss": 0.6283, + "num_input_tokens_seen": 203433192, + "step": 1177 + }, + { + "epoch": 0.4704236610711431, + "loss": 0.46350085735321045, + "loss_ce": 0.004699579905718565, + "loss_xval": 0.458984375, + "num_input_tokens_seen": 203433192, + "step": 1177 + }, + { + "epoch": 0.4708233413269384, + "grad_norm": 90.34627311896233, + "learning_rate": 5e-06, + "loss": 0.5641, + "num_input_tokens_seen": 203606144, + "step": 1178 + }, + { + "epoch": 0.4708233413269384, + "loss": 0.4923925995826721, + "loss_ce": 0.007163085043430328, + "loss_xval": 0.484375, + "num_input_tokens_seen": 203606144, + "step": 1178 + }, + { + "epoch": 0.4712230215827338, + "grad_norm": 71.19184014400291, + "learning_rate": 5e-06, + "loss": 0.5977, + "num_input_tokens_seen": 203779104, + "step": 1179 + }, + { + "epoch": 0.4712230215827338, + "loss": 0.6201607584953308, + "loss_ce": 0.0019356525735929608, + "loss_xval": 0.6171875, + "num_input_tokens_seen": 203779104, + "step": 1179 + }, + { + "epoch": 0.47162270183852917, + "grad_norm": 46.09540182862563, + "learning_rate": 5e-06, + "loss": 0.7048, + "num_input_tokens_seen": 203951968, + "step": 1180 + }, + { + "epoch": 0.47162270183852917, + "loss": 0.6933472752571106, + "loss_ce": 0.003711057361215353, + "loss_xval": 0.69140625, + "num_input_tokens_seen": 203951968, + "step": 1180 + }, + { + "epoch": 
0.47202238209432457, + "grad_norm": 85.31817735510187, + "learning_rate": 5e-06, + "loss": 0.4103, + "num_input_tokens_seen": 204124320, + "step": 1181 + }, + { + "epoch": 0.47202238209432457, + "loss": 0.45448797941207886, + "loss_ce": 0.0026142210699617863, + "loss_xval": 0.451171875, + "num_input_tokens_seen": 204124320, + "step": 1181 + }, + { + "epoch": 0.4724220623501199, + "grad_norm": 62.585221065215585, + "learning_rate": 5e-06, + "loss": 0.5123, + "num_input_tokens_seen": 204296704, + "step": 1182 + }, + { + "epoch": 0.4724220623501199, + "loss": 0.753317654132843, + "loss_ce": 0.004050097428262234, + "loss_xval": 0.75, + "num_input_tokens_seen": 204296704, + "step": 1182 + }, + { + "epoch": 0.47282174260591525, + "grad_norm": 90.40966754196259, + "learning_rate": 5e-06, + "loss": 0.6652, + "num_input_tokens_seen": 204469256, + "step": 1183 + }, + { + "epoch": 0.47282174260591525, + "loss": 0.42565417289733887, + "loss_ce": 0.003901238553225994, + "loss_xval": 0.421875, + "num_input_tokens_seen": 204469256, + "step": 1183 + }, + { + "epoch": 0.47322142286171065, + "grad_norm": 54.39935747092517, + "learning_rate": 5e-06, + "loss": 0.7614, + "num_input_tokens_seen": 204642296, + "step": 1184 + }, + { + "epoch": 0.47322142286171065, + "loss": 0.5638756155967712, + "loss_ce": 0.005281836725771427, + "loss_xval": 0.55859375, + "num_input_tokens_seen": 204642296, + "step": 1184 + }, + { + "epoch": 0.473621103117506, + "grad_norm": 69.45566268019499, + "learning_rate": 5e-06, + "loss": 0.7909, + "num_input_tokens_seen": 204815504, + "step": 1185 + }, + { + "epoch": 0.473621103117506, + "loss": 0.9263577461242676, + "loss_ce": 0.0036892304196953773, + "loss_xval": 0.921875, + "num_input_tokens_seen": 204815504, + "step": 1185 + }, + { + "epoch": 0.47402078337330134, + "grad_norm": 47.95557312190612, + "learning_rate": 5e-06, + "loss": 0.5052, + "num_input_tokens_seen": 204988160, + "step": 1186 + }, + { + "epoch": 0.47402078337330134, + "loss": 0.42373624444007874, + "loss_ce": 0.004943536594510078, + "loss_xval": 0.41796875, + "num_input_tokens_seen": 204988160, + "step": 1186 + }, + { + "epoch": 0.47442046362909673, + "grad_norm": 80.91810811660899, + "learning_rate": 5e-06, + "loss": 0.4477, + "num_input_tokens_seen": 205160608, + "step": 1187 + }, + { + "epoch": 0.47442046362909673, + "loss": 0.27876970171928406, + "loss_ce": 0.0027076760306954384, + "loss_xval": 0.275390625, + "num_input_tokens_seen": 205160608, + "step": 1187 + }, + { + "epoch": 0.4748201438848921, + "grad_norm": 89.19904810541342, + "learning_rate": 5e-06, + "loss": 0.8543, + "num_input_tokens_seen": 205333520, + "step": 1188 + }, + { + "epoch": 0.4748201438848921, + "loss": 1.167051076889038, + "loss_ce": 0.00945834070444107, + "loss_xval": 1.15625, + "num_input_tokens_seen": 205333520, + "step": 1188 + }, + { + "epoch": 0.4752198241406875, + "grad_norm": 129.51637700147677, + "learning_rate": 5e-06, + "loss": 0.6642, + "num_input_tokens_seen": 205505920, + "step": 1189 + }, + { + "epoch": 0.4752198241406875, + "loss": 0.6400755643844604, + "loss_ce": 0.003539936849847436, + "loss_xval": 0.63671875, + "num_input_tokens_seen": 205505920, + "step": 1189 + }, + { + "epoch": 0.4756195043964828, + "grad_norm": 147.50728598390194, + "learning_rate": 5e-06, + "loss": 0.7091, + "num_input_tokens_seen": 205679128, + "step": 1190 + }, + { + "epoch": 0.4756195043964828, + "loss": 0.9833164215087891, + "loss_ce": 0.005899389274418354, + "loss_xval": 0.9765625, + "num_input_tokens_seen": 205679128, + "step": 1190 + }, + { + 
"epoch": 0.47601918465227816, + "grad_norm": 90.94767641525118, + "learning_rate": 5e-06, + "loss": 0.7318, + "num_input_tokens_seen": 205851816, + "step": 1191 + }, + { + "epoch": 0.47601918465227816, + "loss": 0.7062619924545288, + "loss_ce": 0.006860113237053156, + "loss_xval": 0.69921875, + "num_input_tokens_seen": 205851816, + "step": 1191 + }, + { + "epoch": 0.47641886490807356, + "grad_norm": 149.49872792375933, + "learning_rate": 5e-06, + "loss": 0.5292, + "num_input_tokens_seen": 206024792, + "step": 1192 + }, + { + "epoch": 0.47641886490807356, + "loss": 0.5177109837532043, + "loss_ce": 0.005381924565881491, + "loss_xval": 0.51171875, + "num_input_tokens_seen": 206024792, + "step": 1192 + }, + { + "epoch": 0.4768185451638689, + "grad_norm": 36.85115396826063, + "learning_rate": 5e-06, + "loss": 0.4891, + "num_input_tokens_seen": 206197976, + "step": 1193 + }, + { + "epoch": 0.4768185451638689, + "loss": 0.6350235939025879, + "loss_ce": 0.006605636328458786, + "loss_xval": 0.62890625, + "num_input_tokens_seen": 206197976, + "step": 1193 + }, + { + "epoch": 0.47721822541966424, + "grad_norm": 45.87647325830105, + "learning_rate": 5e-06, + "loss": 0.3795, + "num_input_tokens_seen": 206370960, + "step": 1194 + }, + { + "epoch": 0.47721822541966424, + "loss": 0.31491148471832275, + "loss_ce": 0.004608749412000179, + "loss_xval": 0.310546875, + "num_input_tokens_seen": 206370960, + "step": 1194 + }, + { + "epoch": 0.47761790567545964, + "grad_norm": 112.11711434198487, + "learning_rate": 5e-06, + "loss": 0.7471, + "num_input_tokens_seen": 206544200, + "step": 1195 + }, + { + "epoch": 0.47761790567545964, + "loss": 0.9842346906661987, + "loss_ce": 0.004620386753231287, + "loss_xval": 0.98046875, + "num_input_tokens_seen": 206544200, + "step": 1195 + }, + { + "epoch": 0.478017585931255, + "grad_norm": 65.02709178533176, + "learning_rate": 5e-06, + "loss": 0.7812, + "num_input_tokens_seen": 206716880, + "step": 1196 + }, + { + "epoch": 0.478017585931255, + "loss": 1.0724844932556152, + "loss_ce": 0.012914232909679413, + "loss_xval": 1.0625, + "num_input_tokens_seen": 206716880, + "step": 1196 + }, + { + "epoch": 0.4784172661870504, + "grad_norm": 64.89552423131474, + "learning_rate": 5e-06, + "loss": 0.6697, + "num_input_tokens_seen": 206889208, + "step": 1197 + }, + { + "epoch": 0.4784172661870504, + "loss": 0.5000320076942444, + "loss_ce": 0.006379666738212109, + "loss_xval": 0.494140625, + "num_input_tokens_seen": 206889208, + "step": 1197 + }, + { + "epoch": 0.4788169464428457, + "grad_norm": 92.45595988265173, + "learning_rate": 5e-06, + "loss": 0.4587, + "num_input_tokens_seen": 207062608, + "step": 1198 + }, + { + "epoch": 0.4788169464428457, + "loss": 0.6541969776153564, + "loss_ce": 0.0036232522688806057, + "loss_xval": 0.65234375, + "num_input_tokens_seen": 207062608, + "step": 1198 + }, + { + "epoch": 0.47921662669864107, + "grad_norm": 82.54766047585346, + "learning_rate": 5e-06, + "loss": 0.6257, + "num_input_tokens_seen": 207235096, + "step": 1199 + }, + { + "epoch": 0.47921662669864107, + "loss": 0.689261257648468, + "loss_ce": 0.007254431024193764, + "loss_xval": 0.68359375, + "num_input_tokens_seen": 207235096, + "step": 1199 + }, + { + "epoch": 0.47961630695443647, + "grad_norm": 57.082823434112576, + "learning_rate": 5e-06, + "loss": 0.5072, + "num_input_tokens_seen": 207408280, + "step": 1200 + }, + { + "epoch": 0.47961630695443647, + "loss": 0.5904377102851868, + "loss_ce": 0.003889882005751133, + "loss_xval": 0.5859375, + "num_input_tokens_seen": 207408280, + "step": 
1200 + }, + { + "epoch": 0.4800159872102318, + "grad_norm": 37.253625537767235, + "learning_rate": 5e-06, + "loss": 0.7844, + "num_input_tokens_seen": 207581328, + "step": 1201 + }, + { + "epoch": 0.4800159872102318, + "loss": 0.5707876682281494, + "loss_ce": 0.0070670172572135925, + "loss_xval": 0.5625, + "num_input_tokens_seen": 207581328, + "step": 1201 + }, + { + "epoch": 0.48041566746602715, + "grad_norm": 126.39021086125909, + "learning_rate": 5e-06, + "loss": 0.47, + "num_input_tokens_seen": 207754272, + "step": 1202 + }, + { + "epoch": 0.48041566746602715, + "loss": 0.369029700756073, + "loss_ce": 0.005626373924314976, + "loss_xval": 0.36328125, + "num_input_tokens_seen": 207754272, + "step": 1202 + }, + { + "epoch": 0.48081534772182255, + "grad_norm": 27.399518912536287, + "learning_rate": 5e-06, + "loss": 0.4326, + "num_input_tokens_seen": 207926864, + "step": 1203 + }, + { + "epoch": 0.48081534772182255, + "loss": 0.48411285877227783, + "loss_ce": 0.0038882247172296047, + "loss_xval": 0.48046875, + "num_input_tokens_seen": 207926864, + "step": 1203 + }, + { + "epoch": 0.4812150279776179, + "grad_norm": 133.10942772969344, + "learning_rate": 5e-06, + "loss": 0.8218, + "num_input_tokens_seen": 208099896, + "step": 1204 + }, + { + "epoch": 0.4812150279776179, + "loss": 0.7755193114280701, + "loss_ce": 0.012824056670069695, + "loss_xval": 0.76171875, + "num_input_tokens_seen": 208099896, + "step": 1204 + }, + { + "epoch": 0.4816147082334133, + "grad_norm": 39.05446343546031, + "learning_rate": 5e-06, + "loss": 0.5318, + "num_input_tokens_seen": 208272824, + "step": 1205 + }, + { + "epoch": 0.4816147082334133, + "loss": 0.4078354239463806, + "loss_ce": 0.0037826700136065483, + "loss_xval": 0.404296875, + "num_input_tokens_seen": 208272824, + "step": 1205 + }, + { + "epoch": 0.48201438848920863, + "grad_norm": 141.26378845226603, + "learning_rate": 5e-06, + "loss": 0.9037, + "num_input_tokens_seen": 208445320, + "step": 1206 + }, + { + "epoch": 0.48201438848920863, + "loss": 0.4204305112361908, + "loss_ce": 0.007832853123545647, + "loss_xval": 0.412109375, + "num_input_tokens_seen": 208445320, + "step": 1206 + }, + { + "epoch": 0.482414068745004, + "grad_norm": 65.24571131237124, + "learning_rate": 5e-06, + "loss": 0.781, + "num_input_tokens_seen": 208618264, + "step": 1207 + }, + { + "epoch": 0.482414068745004, + "loss": 1.0000226497650146, + "loss_ce": 0.005759958643466234, + "loss_xval": 0.99609375, + "num_input_tokens_seen": 208618264, + "step": 1207 + }, + { + "epoch": 0.4828137490007994, + "grad_norm": 93.46614098455318, + "learning_rate": 5e-06, + "loss": 0.6181, + "num_input_tokens_seen": 208791320, + "step": 1208 + }, + { + "epoch": 0.4828137490007994, + "loss": 0.5935106873512268, + "loss_ce": 0.007390075363218784, + "loss_xval": 0.5859375, + "num_input_tokens_seen": 208791320, + "step": 1208 + }, + { + "epoch": 0.4832134292565947, + "grad_norm": 197.0479425394529, + "learning_rate": 5e-06, + "loss": 0.7052, + "num_input_tokens_seen": 208964360, + "step": 1209 + }, + { + "epoch": 0.4832134292565947, + "loss": 0.6921088695526123, + "loss_ce": 0.00509719830006361, + "loss_xval": 0.6875, + "num_input_tokens_seen": 208964360, + "step": 1209 + }, + { + "epoch": 0.48361310951239006, + "grad_norm": 128.35610143451706, + "learning_rate": 5e-06, + "loss": 0.6615, + "num_input_tokens_seen": 209137008, + "step": 1210 + }, + { + "epoch": 0.48361310951239006, + "loss": 0.6662713289260864, + "loss_ce": 0.0055047329515218735, + "loss_xval": 0.66015625, + "num_input_tokens_seen": 209137008, 
+ "step": 1210 + }, + { + "epoch": 0.48401278976818546, + "grad_norm": 97.43877540168052, + "learning_rate": 5e-06, + "loss": 0.4814, + "num_input_tokens_seen": 209309968, + "step": 1211 + }, + { + "epoch": 0.48401278976818546, + "loss": 0.5262230634689331, + "loss_ce": 0.005654177628457546, + "loss_xval": 0.51953125, + "num_input_tokens_seen": 209309968, + "step": 1211 + }, + { + "epoch": 0.4844124700239808, + "grad_norm": 72.44506988992362, + "learning_rate": 5e-06, + "loss": 0.5673, + "num_input_tokens_seen": 209483056, + "step": 1212 + }, + { + "epoch": 0.4844124700239808, + "loss": 0.6535665988922119, + "loss_ce": 0.005800464190542698, + "loss_xval": 0.6484375, + "num_input_tokens_seen": 209483056, + "step": 1212 + }, + { + "epoch": 0.4848121502797762, + "grad_norm": 82.26685181094909, + "learning_rate": 5e-06, + "loss": 0.2986, + "num_input_tokens_seen": 209655808, + "step": 1213 + }, + { + "epoch": 0.4848121502797762, + "loss": 0.24768781661987305, + "loss_ce": 0.005134111270308495, + "loss_xval": 0.2421875, + "num_input_tokens_seen": 209655808, + "step": 1213 + }, + { + "epoch": 0.48521183053557154, + "grad_norm": 2883.265707805072, + "learning_rate": 5e-06, + "loss": 3.4834, + "num_input_tokens_seen": 209828696, + "step": 1214 + }, + { + "epoch": 0.48521183053557154, + "loss": 6.084342002868652, + "loss_ce": 0.01479253824800253, + "loss_xval": 6.0625, + "num_input_tokens_seen": 209828696, + "step": 1214 + }, + { + "epoch": 0.4856115107913669, + "grad_norm": 88.50388639064461, + "learning_rate": 5e-06, + "loss": 0.7424, + "num_input_tokens_seen": 210001296, + "step": 1215 + }, + { + "epoch": 0.4856115107913669, + "loss": 1.1838319301605225, + "loss_ce": 0.00505995936691761, + "loss_xval": 1.1796875, + "num_input_tokens_seen": 210001296, + "step": 1215 + }, + { + "epoch": 0.4860111910471623, + "grad_norm": 103.36056926681852, + "learning_rate": 5e-06, + "loss": 0.9452, + "num_input_tokens_seen": 210174048, + "step": 1216 + }, + { + "epoch": 0.4860111910471623, + "loss": 0.7499319314956665, + "loss_ce": 0.10924588143825531, + "loss_xval": 0.640625, + "num_input_tokens_seen": 210174048, + "step": 1216 + }, + { + "epoch": 0.4864108713029576, + "grad_norm": 55.89331329285388, + "learning_rate": 5e-06, + "loss": 0.8338, + "num_input_tokens_seen": 210347120, + "step": 1217 + }, + { + "epoch": 0.4864108713029576, + "loss": 0.9395196437835693, + "loss_ce": 0.15698786079883575, + "loss_xval": 0.78125, + "num_input_tokens_seen": 210347120, + "step": 1217 + }, + { + "epoch": 0.486810551558753, + "grad_norm": 67.56062531813605, + "learning_rate": 5e-06, + "loss": 0.8645, + "num_input_tokens_seen": 210519800, + "step": 1218 + }, + { + "epoch": 0.486810551558753, + "loss": 0.8125213980674744, + "loss_ce": 0.18675847351551056, + "loss_xval": 0.625, + "num_input_tokens_seen": 210519800, + "step": 1218 + }, + { + "epoch": 0.48721023181454837, + "grad_norm": 71.24757754808115, + "learning_rate": 5e-06, + "loss": 0.8123, + "num_input_tokens_seen": 210692952, + "step": 1219 + }, + { + "epoch": 0.48721023181454837, + "loss": 0.9922150373458862, + "loss_ce": 0.23125924170017242, + "loss_xval": 0.76171875, + "num_input_tokens_seen": 210692952, + "step": 1219 + }, + { + "epoch": 0.4876099120703437, + "grad_norm": 114.99129622606719, + "learning_rate": 5e-06, + "loss": 0.8695, + "num_input_tokens_seen": 210862584, + "step": 1220 + }, + { + "epoch": 0.4876099120703437, + "loss": 0.7880289554595947, + "loss_ce": 0.1557047963142395, + "loss_xval": 0.6328125, + "num_input_tokens_seen": 210862584, + "step": 1220 
+ }, + { + "epoch": 0.4880095923261391, + "grad_norm": 37.624977084364716, + "learning_rate": 5e-06, + "loss": 0.8068, + "num_input_tokens_seen": 211035584, + "step": 1221 + }, + { + "epoch": 0.4880095923261391, + "loss": 0.8313114047050476, + "loss_ce": 0.13703647255897522, + "loss_xval": 0.6953125, + "num_input_tokens_seen": 211035584, + "step": 1221 + }, + { + "epoch": 0.48840927258193445, + "grad_norm": 42.83229604580505, + "learning_rate": 5e-06, + "loss": 0.6414, + "num_input_tokens_seen": 211208152, + "step": 1222 + }, + { + "epoch": 0.48840927258193445, + "loss": 0.6869634985923767, + "loss_ce": 0.12171684950590134, + "loss_xval": 0.56640625, + "num_input_tokens_seen": 211208152, + "step": 1222 + }, + { + "epoch": 0.4888089528377298, + "grad_norm": 37.91970336856216, + "learning_rate": 5e-06, + "loss": 0.8735, + "num_input_tokens_seen": 211380896, + "step": 1223 + }, + { + "epoch": 0.4888089528377298, + "loss": 0.47167566418647766, + "loss_ce": 0.15661218762397766, + "loss_xval": 0.314453125, + "num_input_tokens_seen": 211380896, + "step": 1223 + }, + { + "epoch": 0.4892086330935252, + "grad_norm": 38.52062556987781, + "learning_rate": 5e-06, + "loss": 0.8148, + "num_input_tokens_seen": 211553704, + "step": 1224 + }, + { + "epoch": 0.4892086330935252, + "loss": 0.5824769735336304, + "loss_ce": 0.11915907263755798, + "loss_xval": 0.462890625, + "num_input_tokens_seen": 211553704, + "step": 1224 + }, + { + "epoch": 0.48960831334932053, + "grad_norm": 81.00693493140086, + "learning_rate": 5e-06, + "loss": 0.7234, + "num_input_tokens_seen": 211726768, + "step": 1225 + }, + { + "epoch": 0.48960831334932053, + "loss": 0.9797881841659546, + "loss_ce": 0.1069854348897934, + "loss_xval": 0.87109375, + "num_input_tokens_seen": 211726768, + "step": 1225 + }, + { + "epoch": 0.49000799360511593, + "grad_norm": 37.58450026948904, + "learning_rate": 5e-06, + "loss": 0.6889, + "num_input_tokens_seen": 211899872, + "step": 1226 + }, + { + "epoch": 0.49000799360511593, + "loss": 0.9251545667648315, + "loss_ce": 0.10462842881679535, + "loss_xval": 0.8203125, + "num_input_tokens_seen": 211899872, + "step": 1226 + }, + { + "epoch": 0.4904076738609113, + "grad_norm": 42.74553283537623, + "learning_rate": 5e-06, + "loss": 0.917, + "num_input_tokens_seen": 212072968, + "step": 1227 + }, + { + "epoch": 0.4904076738609113, + "loss": 1.1364541053771973, + "loss_ce": 0.09052520245313644, + "loss_xval": 1.046875, + "num_input_tokens_seen": 212072968, + "step": 1227 + }, + { + "epoch": 0.4908073541167066, + "grad_norm": 39.42896536840982, + "learning_rate": 5e-06, + "loss": 0.4878, + "num_input_tokens_seen": 212246136, + "step": 1228 + }, + { + "epoch": 0.4908073541167066, + "loss": 0.45796746015548706, + "loss_ce": 0.05721063166856766, + "loss_xval": 0.400390625, + "num_input_tokens_seen": 212246136, + "step": 1228 + }, + { + "epoch": 0.491207034372502, + "grad_norm": 49.5355844201104, + "learning_rate": 5e-06, + "loss": 1.0534, + "num_input_tokens_seen": 212419256, + "step": 1229 + }, + { + "epoch": 0.491207034372502, + "loss": 1.6057854890823364, + "loss_ce": 0.05982610583305359, + "loss_xval": 1.546875, + "num_input_tokens_seen": 212419256, + "step": 1229 + }, + { + "epoch": 0.49160671462829736, + "grad_norm": 53.85482243731108, + "learning_rate": 5e-06, + "loss": 0.4841, + "num_input_tokens_seen": 212592272, + "step": 1230 + }, + { + "epoch": 0.49160671462829736, + "loss": 0.3914153575897217, + "loss_ce": 0.05328058823943138, + "loss_xval": 0.337890625, + "num_input_tokens_seen": 212592272, + "step": 1230 
+ }, + { + "epoch": 0.4920063948840927, + "grad_norm": 29.615996478413024, + "learning_rate": 5e-06, + "loss": 0.6303, + "num_input_tokens_seen": 212765424, + "step": 1231 + }, + { + "epoch": 0.4920063948840927, + "loss": 0.8910754919052124, + "loss_ce": 0.04480774700641632, + "loss_xval": 0.84765625, + "num_input_tokens_seen": 212765424, + "step": 1231 + }, + { + "epoch": 0.4924060751398881, + "grad_norm": 22.122138227281248, + "learning_rate": 5e-06, + "loss": 0.6796, + "num_input_tokens_seen": 212938128, + "step": 1232 + }, + { + "epoch": 0.4924060751398881, + "loss": 0.9208757281303406, + "loss_ce": 0.03565235063433647, + "loss_xval": 0.88671875, + "num_input_tokens_seen": 212938128, + "step": 1232 + }, + { + "epoch": 0.49280575539568344, + "grad_norm": 37.455588321599855, + "learning_rate": 5e-06, + "loss": 0.5403, + "num_input_tokens_seen": 213111032, + "step": 1233 + }, + { + "epoch": 0.49280575539568344, + "loss": 0.3303181231021881, + "loss_ce": 0.027003923431038857, + "loss_xval": 0.302734375, + "num_input_tokens_seen": 213111032, + "step": 1233 + }, + { + "epoch": 0.49320543565147884, + "grad_norm": 26.199194983743542, + "learning_rate": 5e-06, + "loss": 0.5541, + "num_input_tokens_seen": 213284168, + "step": 1234 + }, + { + "epoch": 0.49320543565147884, + "loss": 0.5545949935913086, + "loss_ce": 0.04916280135512352, + "loss_xval": 0.50390625, + "num_input_tokens_seen": 213284168, + "step": 1234 + }, + { + "epoch": 0.4936051159072742, + "grad_norm": 48.729256475162536, + "learning_rate": 5e-06, + "loss": 0.6151, + "num_input_tokens_seen": 213456952, + "step": 1235 + }, + { + "epoch": 0.4936051159072742, + "loss": 0.6374036073684692, + "loss_ce": 0.032087456434965134, + "loss_xval": 0.60546875, + "num_input_tokens_seen": 213456952, + "step": 1235 + }, + { + "epoch": 0.4940047961630695, + "grad_norm": 43.01829750124253, + "learning_rate": 5e-06, + "loss": 0.4977, + "num_input_tokens_seen": 213630232, + "step": 1236 + }, + { + "epoch": 0.4940047961630695, + "loss": 0.4653409421443939, + "loss_ce": 0.024789176881313324, + "loss_xval": 0.44140625, + "num_input_tokens_seen": 213630232, + "step": 1236 + }, + { + "epoch": 0.4944044764188649, + "grad_norm": 32.54880413499179, + "learning_rate": 5e-06, + "loss": 0.3181, + "num_input_tokens_seen": 213803368, + "step": 1237 + }, + { + "epoch": 0.4944044764188649, + "loss": 0.4874266982078552, + "loss_ce": 0.01226796768605709, + "loss_xval": 0.474609375, + "num_input_tokens_seen": 213803368, + "step": 1237 + }, + { + "epoch": 0.49480415667466027, + "grad_norm": 45.33323015772085, + "learning_rate": 5e-06, + "loss": 0.793, + "num_input_tokens_seen": 213976184, + "step": 1238 + }, + { + "epoch": 0.49480415667466027, + "loss": 0.8546011447906494, + "loss_ce": 0.0273917093873024, + "loss_xval": 0.828125, + "num_input_tokens_seen": 213976184, + "step": 1238 + }, + { + "epoch": 0.4952038369304556, + "grad_norm": 58.6602616771861, + "learning_rate": 5e-06, + "loss": 0.6672, + "num_input_tokens_seen": 214149208, + "step": 1239 + }, + { + "epoch": 0.4952038369304556, + "loss": 0.7796196341514587, + "loss_ce": 0.021196816116571426, + "loss_xval": 0.7578125, + "num_input_tokens_seen": 214149208, + "step": 1239 + }, + { + "epoch": 0.495603517186251, + "grad_norm": 41.12794193465622, + "learning_rate": 5e-06, + "loss": 0.4236, + "num_input_tokens_seen": 214322368, + "step": 1240 + }, + { + "epoch": 0.495603517186251, + "loss": 0.352802038192749, + "loss_ce": 0.019183889031410217, + "loss_xval": 0.333984375, + "num_input_tokens_seen": 214322368, + "step": 
1240 + }, + { + "epoch": 0.49600319744204635, + "grad_norm": 57.576106531202115, + "learning_rate": 5e-06, + "loss": 0.5261, + "num_input_tokens_seen": 214494992, + "step": 1241 + }, + { + "epoch": 0.49600319744204635, + "loss": 0.36257174611091614, + "loss_ce": 0.01869969069957733, + "loss_xval": 0.34375, + "num_input_tokens_seen": 214494992, + "step": 1241 + }, + { + "epoch": 0.49640287769784175, + "grad_norm": 44.020812396343615, + "learning_rate": 5e-06, + "loss": 0.5488, + "num_input_tokens_seen": 214668120, + "step": 1242 + }, + { + "epoch": 0.49640287769784175, + "loss": 0.610977828502655, + "loss_ce": 0.02009648270905018, + "loss_xval": 0.58984375, + "num_input_tokens_seen": 214668120, + "step": 1242 + }, + { + "epoch": 0.4968025579536371, + "grad_norm": 61.870825774278416, + "learning_rate": 5e-06, + "loss": 0.5893, + "num_input_tokens_seen": 214841232, + "step": 1243 + }, + { + "epoch": 0.4968025579536371, + "loss": 0.7896366715431213, + "loss_ce": 0.028238333761692047, + "loss_xval": 0.76171875, + "num_input_tokens_seen": 214841232, + "step": 1243 + }, + { + "epoch": 0.49720223820943243, + "grad_norm": 41.17697077521213, + "learning_rate": 5e-06, + "loss": 0.5542, + "num_input_tokens_seen": 215014424, + "step": 1244 + }, + { + "epoch": 0.49720223820943243, + "loss": 0.680115818977356, + "loss_ce": 0.009522556327283382, + "loss_xval": 0.671875, + "num_input_tokens_seen": 215014424, + "step": 1244 + }, + { + "epoch": 0.49760191846522783, + "grad_norm": 76.82017942798518, + "learning_rate": 5e-06, + "loss": 0.6606, + "num_input_tokens_seen": 215187240, + "step": 1245 + }, + { + "epoch": 0.49760191846522783, + "loss": 0.577049732208252, + "loss_ce": 0.02297259122133255, + "loss_xval": 0.5546875, + "num_input_tokens_seen": 215187240, + "step": 1245 + }, + { + "epoch": 0.4980015987210232, + "grad_norm": 49.35990869457503, + "learning_rate": 5e-06, + "loss": 0.4436, + "num_input_tokens_seen": 215356856, + "step": 1246 + }, + { + "epoch": 0.4980015987210232, + "loss": 0.52464359998703, + "loss_ce": 0.01289433240890503, + "loss_xval": 0.51171875, + "num_input_tokens_seen": 215356856, + "step": 1246 + }, + { + "epoch": 0.4984012789768185, + "grad_norm": 30.431804132343178, + "learning_rate": 5e-06, + "loss": 0.4152, + "num_input_tokens_seen": 215529512, + "step": 1247 + }, + { + "epoch": 0.4984012789768185, + "loss": 0.39520263671875, + "loss_ce": 0.004180910065770149, + "loss_xval": 0.390625, + "num_input_tokens_seen": 215529512, + "step": 1247 + }, + { + "epoch": 0.4988009592326139, + "grad_norm": 79.43813542549815, + "learning_rate": 5e-06, + "loss": 0.7606, + "num_input_tokens_seen": 215702560, + "step": 1248 + }, + { + "epoch": 0.4988009592326139, + "loss": 0.8635843396186829, + "loss_ce": 0.006284565664827824, + "loss_xval": 0.85546875, + "num_input_tokens_seen": 215702560, + "step": 1248 + }, + { + "epoch": 0.49920063948840926, + "grad_norm": 46.9042618380455, + "learning_rate": 5e-06, + "loss": 0.5143, + "num_input_tokens_seen": 215875760, + "step": 1249 + }, + { + "epoch": 0.49920063948840926, + "loss": 0.5450088381767273, + "loss_ce": 0.0034438944421708584, + "loss_xval": 0.54296875, + "num_input_tokens_seen": 215875760, + "step": 1249 + }, + { + "epoch": 0.49960031974420466, + "grad_norm": 44.95576452572528, + "learning_rate": 5e-06, + "loss": 0.8945, + "num_input_tokens_seen": 216048592, + "step": 1250 + }, + { + "epoch": 0.49960031974420466, + "eval_websight_new_IoU": 0.46755318343639374, + "eval_websight_new_MAE_all": 0.01638866774737835, + "eval_websight_new_MAE_h": 
0.004519310197792947, + "eval_websight_new_MAE_w": 0.030832246877253056, + "eval_websight_new_MAE_x": 0.014988915994763374, + "eval_websight_new_MAE_y": 0.015214197337627411, + "eval_websight_new_NUM_probability": 0.9706818461418152, + "eval_websight_new_inside_bbox": 0.7760416567325592, + "eval_websight_new_loss": 0.11890730261802673, + "eval_websight_new_loss_ce": 0.0031176727497950196, + "eval_websight_new_loss_xval": 0.097412109375, + "eval_websight_new_runtime": 56.3167, + "eval_websight_new_samples_per_second": 0.888, + "eval_websight_new_steps_per_second": 0.036, + "num_input_tokens_seen": 216048592, + "step": 1250 + }, + { + "epoch": 0.49960031974420466, + "eval_seeclick_IoU": 0.2279941290616989, + "eval_seeclick_MAE_all": 0.07300104945898056, + "eval_seeclick_MAE_h": 0.023876951076090336, + "eval_seeclick_MAE_w": 0.0947648361325264, + "eval_seeclick_MAE_x": 0.10035844147205353, + "eval_seeclick_MAE_y": 0.0730039793998003, + "eval_seeclick_NUM_probability": 0.9678144454956055, + "eval_seeclick_inside_bbox": 0.4288194477558136, + "eval_seeclick_loss": 1.8150830268859863, + "eval_seeclick_loss_ce": 0.013575777411460876, + "eval_seeclick_loss_xval": 1.76678466796875, + "eval_seeclick_runtime": 81.5951, + "eval_seeclick_samples_per_second": 0.613, + "eval_seeclick_steps_per_second": 0.025, + "num_input_tokens_seen": 216048592, + "step": 1250 + }, + { + "epoch": 0.49960031974420466, + "eval_icons_IoU": 0.1491006501019001, + "eval_icons_MAE_all": 0.023428103420883417, + "eval_icons_MAE_h": 0.009299044031649828, + "eval_icons_MAE_w": 0.006808809470385313, + "eval_icons_MAE_x": 0.0501435212790966, + "eval_icons_MAE_y": 0.0274610361084342, + "eval_icons_NUM_probability": 0.9710031449794769, + "eval_icons_inside_bbox": 0.2708333358168602, + "eval_icons_loss": 0.2193080484867096, + "eval_icons_loss_ce": 0.0031963232904672623, + "eval_icons_loss_xval": 0.18771743774414062, + "eval_icons_runtime": 86.547, + "eval_icons_samples_per_second": 0.578, + "eval_icons_steps_per_second": 0.023, + "num_input_tokens_seen": 216048592, + "step": 1250 + }, + { + "epoch": 0.49960031974420466, + "loss": 0.33800265192985535, + "loss_ce": 0.003209566930308938, + "loss_xval": 0.333984375, + "num_input_tokens_seen": 216048592, + "step": 1250 + }, + { + "epoch": 0.5, + "grad_norm": 56.2359235987798, + "learning_rate": 5e-06, + "loss": 0.9024, + "num_input_tokens_seen": 216221720, + "step": 1251 + }, + { + "epoch": 0.5, + "loss": 0.8478096723556519, + "loss_ce": 0.007721788249909878, + "loss_xval": 0.83984375, + "num_input_tokens_seen": 216221720, + "step": 1251 + }, + { + "epoch": 0.5003996802557954, + "grad_norm": 25.41910553147172, + "learning_rate": 5e-06, + "loss": 0.5755, + "num_input_tokens_seen": 216394976, + "step": 1252 + }, + { + "epoch": 0.5003996802557954, + "loss": 0.5523375272750854, + "loss_ce": 0.006713734474033117, + "loss_xval": 0.546875, + "num_input_tokens_seen": 216394976, + "step": 1252 + }, + { + "epoch": 0.5007993605115907, + "grad_norm": 32.41052618484067, + "learning_rate": 5e-06, + "loss": 0.7387, + "num_input_tokens_seen": 216567976, + "step": 1253 + }, + { + "epoch": 0.5007993605115907, + "loss": 0.8942447900772095, + "loss_ce": 0.009235035628080368, + "loss_xval": 0.88671875, + "num_input_tokens_seen": 216567976, + "step": 1253 + }, + { + "epoch": 0.5011990407673861, + "grad_norm": 47.47149033729788, + "learning_rate": 5e-06, + "loss": 0.5451, + "num_input_tokens_seen": 216740976, + "step": 1254 + }, + { + "epoch": 0.5011990407673861, + "loss": 0.34523525834083557, + "loss_ce": 
0.006917405407875776, + "loss_xval": 0.337890625, + "num_input_tokens_seen": 216740976, + "step": 1254 + }, + { + "epoch": 0.5015987210231815, + "grad_norm": 33.76916038849005, + "learning_rate": 5e-06, + "loss": 0.462, + "num_input_tokens_seen": 216913768, + "step": 1255 + }, + { + "epoch": 0.5015987210231815, + "loss": 0.37124156951904297, + "loss_ce": 0.002772345207631588, + "loss_xval": 0.369140625, + "num_input_tokens_seen": 216913768, + "step": 1255 + }, + { + "epoch": 0.5019984012789768, + "grad_norm": 38.361290403427155, + "learning_rate": 5e-06, + "loss": 0.4189, + "num_input_tokens_seen": 217086576, + "step": 1256 + }, + { + "epoch": 0.5019984012789768, + "loss": 0.36219191551208496, + "loss_ce": 0.00684523768723011, + "loss_xval": 0.35546875, + "num_input_tokens_seen": 217086576, + "step": 1256 + }, + { + "epoch": 0.5023980815347722, + "grad_norm": 39.2253317331318, + "learning_rate": 5e-06, + "loss": 0.4246, + "num_input_tokens_seen": 217259736, + "step": 1257 + }, + { + "epoch": 0.5023980815347722, + "loss": 0.5401076078414917, + "loss_ce": 0.010536082088947296, + "loss_xval": 0.53125, + "num_input_tokens_seen": 217259736, + "step": 1257 + }, + { + "epoch": 0.5027977617905676, + "grad_norm": 32.08576453335854, + "learning_rate": 5e-06, + "loss": 0.6911, + "num_input_tokens_seen": 217432720, + "step": 1258 + }, + { + "epoch": 0.5027977617905676, + "loss": 0.46980804204940796, + "loss_ce": 0.004964273888617754, + "loss_xval": 0.46484375, + "num_input_tokens_seen": 217432720, + "step": 1258 + }, + { + "epoch": 0.503197442046363, + "grad_norm": 52.40906144418351, + "learning_rate": 5e-06, + "loss": 0.4735, + "num_input_tokens_seen": 217605680, + "step": 1259 + }, + { + "epoch": 0.503197442046363, + "loss": 0.3017037510871887, + "loss_ce": 0.009772591292858124, + "loss_xval": 0.291015625, + "num_input_tokens_seen": 217605680, + "step": 1259 + }, + { + "epoch": 0.5035971223021583, + "grad_norm": 27.32132146628816, + "learning_rate": 5e-06, + "loss": 0.5435, + "num_input_tokens_seen": 217778576, + "step": 1260 + }, + { + "epoch": 0.5035971223021583, + "loss": 0.30970054864883423, + "loss_ce": 0.009041349403560162, + "loss_xval": 0.30078125, + "num_input_tokens_seen": 217778576, + "step": 1260 + }, + { + "epoch": 0.5039968025579536, + "grad_norm": 56.69177348549417, + "learning_rate": 5e-06, + "loss": 0.4164, + "num_input_tokens_seen": 217951296, + "step": 1261 + }, + { + "epoch": 0.5039968025579536, + "loss": 0.32917362451553345, + "loss_ce": 0.0054126461036503315, + "loss_xval": 0.32421875, + "num_input_tokens_seen": 217951296, + "step": 1261 + }, + { + "epoch": 0.504396482813749, + "grad_norm": 33.190618492693645, + "learning_rate": 5e-06, + "loss": 0.5033, + "num_input_tokens_seen": 218124328, + "step": 1262 + }, + { + "epoch": 0.504396482813749, + "loss": 0.45435625314712524, + "loss_ce": 0.0076399631798267365, + "loss_xval": 0.447265625, + "num_input_tokens_seen": 218124328, + "step": 1262 + }, + { + "epoch": 0.5047961630695443, + "grad_norm": 108.08898692175731, + "learning_rate": 5e-06, + "loss": 0.4897, + "num_input_tokens_seen": 218296984, + "step": 1263 + }, + { + "epoch": 0.5047961630695443, + "loss": 0.45390087366104126, + "loss_ce": 0.00871045421808958, + "loss_xval": 0.4453125, + "num_input_tokens_seen": 218296984, + "step": 1263 + }, + { + "epoch": 0.5051958433253397, + "grad_norm": 21.718718812429533, + "learning_rate": 5e-06, + "loss": 0.2889, + "num_input_tokens_seen": 218469976, + "step": 1264 + }, + { + "epoch": 0.5051958433253397, + "loss": 0.2596575617790222, + 
"loss_ce": 0.009230328723788261, + "loss_xval": 0.25, + "num_input_tokens_seen": 218469976, + "step": 1264 + }, + { + "epoch": 0.5055955235811351, + "grad_norm": 37.47124945247608, + "learning_rate": 5e-06, + "loss": 0.6292, + "num_input_tokens_seen": 218639192, + "step": 1265 + }, + { + "epoch": 0.5055955235811351, + "loss": 0.6127042770385742, + "loss_ce": 0.003237716155126691, + "loss_xval": 0.609375, + "num_input_tokens_seen": 218639192, + "step": 1265 + }, + { + "epoch": 0.5059952038369304, + "grad_norm": 61.16539980854701, + "learning_rate": 5e-06, + "loss": 0.7032, + "num_input_tokens_seen": 218812232, + "step": 1266 + }, + { + "epoch": 0.5059952038369304, + "loss": 0.6988984942436218, + "loss_ce": 0.00425736466422677, + "loss_xval": 0.6953125, + "num_input_tokens_seen": 218812232, + "step": 1266 + }, + { + "epoch": 0.5063948840927258, + "grad_norm": 78.5806516668246, + "learning_rate": 5e-06, + "loss": 0.3494, + "num_input_tokens_seen": 218985632, + "step": 1267 + }, + { + "epoch": 0.5063948840927258, + "loss": 0.35586732625961304, + "loss_ce": 0.007844888605177402, + "loss_xval": 0.34765625, + "num_input_tokens_seen": 218985632, + "step": 1267 + }, + { + "epoch": 0.5067945643485212, + "grad_norm": 59.790638905805274, + "learning_rate": 5e-06, + "loss": 0.6626, + "num_input_tokens_seen": 219159072, + "step": 1268 + }, + { + "epoch": 0.5067945643485212, + "loss": 0.6986931562423706, + "loss_ce": 0.0025871843099594116, + "loss_xval": 0.6953125, + "num_input_tokens_seen": 219159072, + "step": 1268 + }, + { + "epoch": 0.5071942446043165, + "grad_norm": 62.29636011421407, + "learning_rate": 5e-06, + "loss": 0.487, + "num_input_tokens_seen": 219332328, + "step": 1269 + }, + { + "epoch": 0.5071942446043165, + "loss": 0.49609100818634033, + "loss_ce": 0.0033542001619935036, + "loss_xval": 0.4921875, + "num_input_tokens_seen": 219332328, + "step": 1269 + }, + { + "epoch": 0.5075939248601119, + "grad_norm": 69.1067299073295, + "learning_rate": 5e-06, + "loss": 0.4005, + "num_input_tokens_seen": 219505336, + "step": 1270 + }, + { + "epoch": 0.5075939248601119, + "loss": 0.40173617005348206, + "loss_ce": 0.006960791535675526, + "loss_xval": 0.39453125, + "num_input_tokens_seen": 219505336, + "step": 1270 + }, + { + "epoch": 0.5079936051159073, + "grad_norm": 46.18987949039075, + "learning_rate": 5e-06, + "loss": 0.6363, + "num_input_tokens_seen": 219678016, + "step": 1271 + }, + { + "epoch": 0.5079936051159073, + "loss": 0.8009523749351501, + "loss_ce": 0.003787450725212693, + "loss_xval": 0.796875, + "num_input_tokens_seen": 219678016, + "step": 1271 + }, + { + "epoch": 0.5083932853717026, + "grad_norm": 80.61756373894711, + "learning_rate": 5e-06, + "loss": 0.5925, + "num_input_tokens_seen": 219851080, + "step": 1272 + }, + { + "epoch": 0.5083932853717026, + "loss": 0.4208008050918579, + "loss_ce": 0.00719609297811985, + "loss_xval": 0.4140625, + "num_input_tokens_seen": 219851080, + "step": 1272 + }, + { + "epoch": 0.508792965627498, + "grad_norm": 24.109669016327093, + "learning_rate": 5e-06, + "loss": 0.3739, + "num_input_tokens_seen": 220024640, + "step": 1273 + }, + { + "epoch": 0.508792965627498, + "loss": 0.387967050075531, + "loss_ce": 0.005032491870224476, + "loss_xval": 0.3828125, + "num_input_tokens_seen": 220024640, + "step": 1273 + }, + { + "epoch": 0.5091926458832934, + "grad_norm": 45.963819514467495, + "learning_rate": 5e-06, + "loss": 0.4557, + "num_input_tokens_seen": 220197848, + "step": 1274 + }, + { + "epoch": 0.5091926458832934, + "loss": 0.29403769969940186, + 
"loss_ce": 0.008789882063865662, + "loss_xval": 0.28515625, + "num_input_tokens_seen": 220197848, + "step": 1274 + }, + { + "epoch": 0.5095923261390888, + "grad_norm": 54.94754414475741, + "learning_rate": 5e-06, + "loss": 0.6258, + "num_input_tokens_seen": 220370784, + "step": 1275 + }, + { + "epoch": 0.5095923261390888, + "loss": 0.6867777109146118, + "loss_ce": 0.009470607154071331, + "loss_xval": 0.67578125, + "num_input_tokens_seen": 220370784, + "step": 1275 + }, + { + "epoch": 0.5099920063948841, + "grad_norm": 37.56208223998764, + "learning_rate": 5e-06, + "loss": 0.7402, + "num_input_tokens_seen": 220543944, + "step": 1276 + }, + { + "epoch": 0.5099920063948841, + "loss": 0.28449493646621704, + "loss_ce": 0.0019937213510274887, + "loss_xval": 0.283203125, + "num_input_tokens_seen": 220543944, + "step": 1276 + }, + { + "epoch": 0.5103916866506795, + "grad_norm": 86.38962127574528, + "learning_rate": 5e-06, + "loss": 0.2525, + "num_input_tokens_seen": 220716584, + "step": 1277 + }, + { + "epoch": 0.5103916866506795, + "loss": 0.36348748207092285, + "loss_ce": 0.004051441326737404, + "loss_xval": 0.359375, + "num_input_tokens_seen": 220716584, + "step": 1277 + }, + { + "epoch": 0.5107913669064749, + "grad_norm": 41.557195679205826, + "learning_rate": 5e-06, + "loss": 0.4988, + "num_input_tokens_seen": 220889464, + "step": 1278 + }, + { + "epoch": 0.5107913669064749, + "loss": 0.4272310435771942, + "loss_ce": 0.0021822056733071804, + "loss_xval": 0.42578125, + "num_input_tokens_seen": 220889464, + "step": 1278 + }, + { + "epoch": 0.5111910471622702, + "grad_norm": 72.04043333354636, + "learning_rate": 5e-06, + "loss": 0.5884, + "num_input_tokens_seen": 221061952, + "step": 1279 + }, + { + "epoch": 0.5111910471622702, + "loss": 0.7456511855125427, + "loss_ce": 0.004440242424607277, + "loss_xval": 0.7421875, + "num_input_tokens_seen": 221061952, + "step": 1279 + }, + { + "epoch": 0.5115907274180655, + "grad_norm": 54.40269586756929, + "learning_rate": 5e-06, + "loss": 0.5129, + "num_input_tokens_seen": 221234880, + "step": 1280 + }, + { + "epoch": 0.5115907274180655, + "loss": 0.6264010071754456, + "loss_ce": 0.0085421372205019, + "loss_xval": 0.6171875, + "num_input_tokens_seen": 221234880, + "step": 1280 + }, + { + "epoch": 0.511990407673861, + "grad_norm": 60.383568725505285, + "learning_rate": 5e-06, + "loss": 0.9545, + "num_input_tokens_seen": 221408088, + "step": 1281 + }, + { + "epoch": 0.511990407673861, + "loss": 0.9138003587722778, + "loss_ce": 0.009747644886374474, + "loss_xval": 0.90234375, + "num_input_tokens_seen": 221408088, + "step": 1281 + }, + { + "epoch": 0.5123900879296562, + "grad_norm": 24.245343297674463, + "learning_rate": 5e-06, + "loss": 0.4102, + "num_input_tokens_seen": 221581192, + "step": 1282 + }, + { + "epoch": 0.5123900879296562, + "loss": 0.2816739082336426, + "loss_ce": 0.0051236217841506, + "loss_xval": 0.27734375, + "num_input_tokens_seen": 221581192, + "step": 1282 + }, + { + "epoch": 0.5127897681854516, + "grad_norm": 36.034373770787454, + "learning_rate": 5e-06, + "loss": 0.4183, + "num_input_tokens_seen": 221753984, + "step": 1283 + }, + { + "epoch": 0.5127897681854516, + "loss": 0.28008684515953064, + "loss_ce": 0.006466236896812916, + "loss_xval": 0.2734375, + "num_input_tokens_seen": 221753984, + "step": 1283 + }, + { + "epoch": 0.513189448441247, + "grad_norm": 33.97082506216034, + "learning_rate": 5e-06, + "loss": 0.7656, + "num_input_tokens_seen": 221927128, + "step": 1284 + }, + { + "epoch": 0.513189448441247, + "loss": 0.9152973890304565, 
+ "loss_ce": 0.005415809340775013, + "loss_xval": 0.91015625, + "num_input_tokens_seen": 221927128, + "step": 1284 + }, + { + "epoch": 0.5135891286970423, + "grad_norm": 38.15398027342218, + "learning_rate": 5e-06, + "loss": 0.6912, + "num_input_tokens_seen": 222100064, + "step": 1285 + }, + { + "epoch": 0.5135891286970423, + "loss": 0.43050551414489746, + "loss_ce": 0.0032594138756394386, + "loss_xval": 0.427734375, + "num_input_tokens_seen": 222100064, + "step": 1285 + }, + { + "epoch": 0.5139888089528377, + "grad_norm": 17.77718576528173, + "learning_rate": 5e-06, + "loss": 0.4619, + "num_input_tokens_seen": 222273144, + "step": 1286 + }, + { + "epoch": 0.5139888089528377, + "loss": 0.502855122089386, + "loss_ce": 0.007493783254176378, + "loss_xval": 0.49609375, + "num_input_tokens_seen": 222273144, + "step": 1286 + }, + { + "epoch": 0.5143884892086331, + "grad_norm": 49.07971279565783, + "learning_rate": 5e-06, + "loss": 0.5318, + "num_input_tokens_seen": 222446248, + "step": 1287 + }, + { + "epoch": 0.5143884892086331, + "loss": 0.3896329402923584, + "loss_ce": 0.01567053608596325, + "loss_xval": 0.373046875, + "num_input_tokens_seen": 222446248, + "step": 1287 + }, + { + "epoch": 0.5147881694644284, + "grad_norm": 21.411674481901443, + "learning_rate": 5e-06, + "loss": 0.4787, + "num_input_tokens_seen": 222619208, + "step": 1288 + }, + { + "epoch": 0.5147881694644284, + "loss": 0.6500849723815918, + "loss_ce": 0.01007033046334982, + "loss_xval": 0.640625, + "num_input_tokens_seen": 222619208, + "step": 1288 + }, + { + "epoch": 0.5151878497202238, + "grad_norm": 69.27709990958668, + "learning_rate": 5e-06, + "loss": 0.6783, + "num_input_tokens_seen": 222792208, + "step": 1289 + }, + { + "epoch": 0.5151878497202238, + "loss": 0.657429575920105, + "loss_ce": 0.0020950797479599714, + "loss_xval": 0.65625, + "num_input_tokens_seen": 222792208, + "step": 1289 + }, + { + "epoch": 0.5155875299760192, + "grad_norm": 35.61428154048361, + "learning_rate": 5e-06, + "loss": 0.4603, + "num_input_tokens_seen": 222965128, + "step": 1290 + }, + { + "epoch": 0.5155875299760192, + "loss": 0.27936655282974243, + "loss_ce": 0.0025415923446416855, + "loss_xval": 0.27734375, + "num_input_tokens_seen": 222965128, + "step": 1290 + }, + { + "epoch": 0.5159872102318146, + "grad_norm": 74.58818630856017, + "learning_rate": 5e-06, + "loss": 0.5112, + "num_input_tokens_seen": 223137904, + "step": 1291 + }, + { + "epoch": 0.5159872102318146, + "loss": 0.5828518867492676, + "loss_ce": 0.003628222271800041, + "loss_xval": 0.578125, + "num_input_tokens_seen": 223137904, + "step": 1291 + }, + { + "epoch": 0.5163868904876099, + "grad_norm": 80.74060896190802, + "learning_rate": 5e-06, + "loss": 0.4543, + "num_input_tokens_seen": 223310688, + "step": 1292 + }, + { + "epoch": 0.5163868904876099, + "loss": 0.4423733055591583, + "loss_ce": 0.004201945383101702, + "loss_xval": 0.4375, + "num_input_tokens_seen": 223310688, + "step": 1292 + }, + { + "epoch": 0.5167865707434053, + "grad_norm": 84.13859190094642, + "learning_rate": 5e-06, + "loss": 0.3072, + "num_input_tokens_seen": 223483768, + "step": 1293 + }, + { + "epoch": 0.5167865707434053, + "loss": 0.27045226097106934, + "loss_ce": 0.007879025302827358, + "loss_xval": 0.26171875, + "num_input_tokens_seen": 223483768, + "step": 1293 + }, + { + "epoch": 0.5171862509992007, + "grad_norm": 67.13775547647433, + "learning_rate": 5e-06, + "loss": 0.4929, + "num_input_tokens_seen": 223656944, + "step": 1294 + }, + { + "epoch": 0.5171862509992007, + "loss": 0.6396459341049194, 
+ "loss_ce": 0.004178367555141449, + "loss_xval": 0.63671875, + "num_input_tokens_seen": 223656944, + "step": 1294 + }, + { + "epoch": 0.517585931254996, + "grad_norm": 122.1235623959299, + "learning_rate": 5e-06, + "loss": 0.6337, + "num_input_tokens_seen": 223829944, + "step": 1295 + }, + { + "epoch": 0.517585931254996, + "loss": 0.5107072591781616, + "loss_ce": 0.006068557035177946, + "loss_xval": 0.50390625, + "num_input_tokens_seen": 223829944, + "step": 1295 + }, + { + "epoch": 0.5179856115107914, + "grad_norm": 23.06635214962515, + "learning_rate": 5e-06, + "loss": 0.451, + "num_input_tokens_seen": 224002944, + "step": 1296 + }, + { + "epoch": 0.5179856115107914, + "loss": 0.3327631950378418, + "loss_ce": 0.0022272877395153046, + "loss_xval": 0.330078125, + "num_input_tokens_seen": 224002944, + "step": 1296 + }, + { + "epoch": 0.5183852917665868, + "grad_norm": 72.82118466464544, + "learning_rate": 5e-06, + "loss": 0.3968, + "num_input_tokens_seen": 224176144, + "step": 1297 + }, + { + "epoch": 0.5183852917665868, + "loss": 0.18834558129310608, + "loss_ce": 0.003409042488783598, + "loss_xval": 0.1845703125, + "num_input_tokens_seen": 224176144, + "step": 1297 + }, + { + "epoch": 0.518784972022382, + "grad_norm": 22.046556325441585, + "learning_rate": 5e-06, + "loss": 0.6103, + "num_input_tokens_seen": 224349112, + "step": 1298 + }, + { + "epoch": 0.518784972022382, + "loss": 0.6086263656616211, + "loss_ce": 0.003767983755096793, + "loss_xval": 0.60546875, + "num_input_tokens_seen": 224349112, + "step": 1298 + }, + { + "epoch": 0.5191846522781774, + "grad_norm": 46.80144712333731, + "learning_rate": 5e-06, + "loss": 0.5122, + "num_input_tokens_seen": 224522008, + "step": 1299 + }, + { + "epoch": 0.5191846522781774, + "loss": 0.6673205494880676, + "loss_ce": 0.00789671204984188, + "loss_xval": 0.66015625, + "num_input_tokens_seen": 224522008, + "step": 1299 + }, + { + "epoch": 0.5195843325339728, + "grad_norm": 58.87556869060338, + "learning_rate": 5e-06, + "loss": 0.6155, + "num_input_tokens_seen": 224694608, + "step": 1300 + }, + { + "epoch": 0.5195843325339728, + "loss": 0.45236122608184814, + "loss_ce": 0.004302144981920719, + "loss_xval": 0.447265625, + "num_input_tokens_seen": 224694608, + "step": 1300 + }, + { + "epoch": 0.5199840127897681, + "grad_norm": 59.57025192167172, + "learning_rate": 5e-06, + "loss": 0.6062, + "num_input_tokens_seen": 224867824, + "step": 1301 + }, + { + "epoch": 0.5199840127897681, + "loss": 0.5515105128288269, + "loss_ce": 0.007199006155133247, + "loss_xval": 0.54296875, + "num_input_tokens_seen": 224867824, + "step": 1301 + }, + { + "epoch": 0.5203836930455635, + "grad_norm": 46.648119620488075, + "learning_rate": 5e-06, + "loss": 0.5472, + "num_input_tokens_seen": 225040896, + "step": 1302 + }, + { + "epoch": 0.5203836930455635, + "loss": 0.3536621034145355, + "loss_ce": 0.0044799624010920525, + "loss_xval": 0.349609375, + "num_input_tokens_seen": 225040896, + "step": 1302 + }, + { + "epoch": 0.5207833733013589, + "grad_norm": 21.096710126232633, + "learning_rate": 5e-06, + "loss": 0.3661, + "num_input_tokens_seen": 225213776, + "step": 1303 + }, + { + "epoch": 0.5207833733013589, + "loss": 0.47925370931625366, + "loss_ce": 0.004583288915455341, + "loss_xval": 0.474609375, + "num_input_tokens_seen": 225213776, + "step": 1303 + }, + { + "epoch": 0.5211830535571543, + "grad_norm": 45.00401125931439, + "learning_rate": 5e-06, + "loss": 0.4723, + "num_input_tokens_seen": 225386776, + "step": 1304 + }, + { + "epoch": 0.5211830535571543, + "loss": 
0.3989310562610626, + "loss_ce": 0.005376371555030346, + "loss_xval": 0.39453125, + "num_input_tokens_seen": 225386776, + "step": 1304 + }, + { + "epoch": 0.5215827338129496, + "grad_norm": 47.83089787844274, + "learning_rate": 5e-06, + "loss": 0.4133, + "num_input_tokens_seen": 225559840, + "step": 1305 + }, + { + "epoch": 0.5215827338129496, + "loss": 0.2589433789253235, + "loss_ce": 0.0025346819311380386, + "loss_xval": 0.255859375, + "num_input_tokens_seen": 225559840, + "step": 1305 + }, + { + "epoch": 0.521982414068745, + "grad_norm": 76.45941526870152, + "learning_rate": 5e-06, + "loss": 0.4183, + "num_input_tokens_seen": 225732832, + "step": 1306 + }, + { + "epoch": 0.521982414068745, + "loss": 0.35908687114715576, + "loss_ce": 0.005754332058131695, + "loss_xval": 0.353515625, + "num_input_tokens_seen": 225732832, + "step": 1306 + }, + { + "epoch": 0.5223820943245404, + "grad_norm": 28.346923450799792, + "learning_rate": 5e-06, + "loss": 0.2043, + "num_input_tokens_seen": 225905584, + "step": 1307 + }, + { + "epoch": 0.5223820943245404, + "loss": 0.30641642212867737, + "loss_ce": 0.004780675284564495, + "loss_xval": 0.30078125, + "num_input_tokens_seen": 225905584, + "step": 1307 + }, + { + "epoch": 0.5227817745803357, + "grad_norm": 52.38544353874319, + "learning_rate": 5e-06, + "loss": 0.3302, + "num_input_tokens_seen": 226075200, + "step": 1308 + }, + { + "epoch": 0.5227817745803357, + "loss": 0.39421164989471436, + "loss_ce": 0.005997546017169952, + "loss_xval": 0.388671875, + "num_input_tokens_seen": 226075200, + "step": 1308 + }, + { + "epoch": 0.5231814548361311, + "grad_norm": 30.047970923730745, + "learning_rate": 5e-06, + "loss": 0.362, + "num_input_tokens_seen": 226248032, + "step": 1309 + }, + { + "epoch": 0.5231814548361311, + "loss": 0.17206686735153198, + "loss_ce": 0.002251807600259781, + "loss_xval": 0.169921875, + "num_input_tokens_seen": 226248032, + "step": 1309 + }, + { + "epoch": 0.5235811350919265, + "grad_norm": 21.06650577505968, + "learning_rate": 5e-06, + "loss": 0.3305, + "num_input_tokens_seen": 226420696, + "step": 1310 + }, + { + "epoch": 0.5235811350919265, + "loss": 0.42764365673065186, + "loss_ce": 0.01628197729587555, + "loss_xval": 0.412109375, + "num_input_tokens_seen": 226420696, + "step": 1310 + }, + { + "epoch": 0.5239808153477218, + "grad_norm": 54.162401748623466, + "learning_rate": 5e-06, + "loss": 0.3938, + "num_input_tokens_seen": 226593672, + "step": 1311 + }, + { + "epoch": 0.5239808153477218, + "loss": 0.16174665093421936, + "loss_ce": 0.0044895680621266365, + "loss_xval": 0.1572265625, + "num_input_tokens_seen": 226593672, + "step": 1311 + }, + { + "epoch": 0.5243804956035172, + "grad_norm": 25.92578935552127, + "learning_rate": 5e-06, + "loss": 0.7577, + "num_input_tokens_seen": 226766864, + "step": 1312 + }, + { + "epoch": 0.5243804956035172, + "loss": 0.7232966423034668, + "loss_ce": 0.0022577994968742132, + "loss_xval": 0.72265625, + "num_input_tokens_seen": 226766864, + "step": 1312 + }, + { + "epoch": 0.5247801758593126, + "grad_norm": 30.005024485668603, + "learning_rate": 5e-06, + "loss": 0.497, + "num_input_tokens_seen": 226939656, + "step": 1313 + }, + { + "epoch": 0.5247801758593126, + "loss": 0.4305632710456848, + "loss_ce": 0.0025236960500478745, + "loss_xval": 0.427734375, + "num_input_tokens_seen": 226939656, + "step": 1313 + }, + { + "epoch": 0.5251798561151079, + "grad_norm": 42.69744397870593, + "learning_rate": 5e-06, + "loss": 0.4521, + "num_input_tokens_seen": 227112800, + "step": 1314 + }, + { + "epoch": 
0.5251798561151079, + "loss": 0.30210599303245544, + "loss_ce": 0.0036135392729192972, + "loss_xval": 0.298828125, + "num_input_tokens_seen": 227112800, + "step": 1314 + }, + { + "epoch": 0.5255795363709033, + "grad_norm": 15.197513540024648, + "learning_rate": 5e-06, + "loss": 0.3364, + "num_input_tokens_seen": 227285992, + "step": 1315 + }, + { + "epoch": 0.5255795363709033, + "loss": 0.37036991119384766, + "loss_ce": 0.005036352667957544, + "loss_xval": 0.365234375, + "num_input_tokens_seen": 227285992, + "step": 1315 + }, + { + "epoch": 0.5259792166266987, + "grad_norm": 59.42411346211391, + "learning_rate": 5e-06, + "loss": 0.5199, + "num_input_tokens_seen": 227458960, + "step": 1316 + }, + { + "epoch": 0.5259792166266987, + "loss": 0.28569337725639343, + "loss_ce": 0.004321303218603134, + "loss_xval": 0.28125, + "num_input_tokens_seen": 227458960, + "step": 1316 + }, + { + "epoch": 0.526378896882494, + "grad_norm": 69.21561612485712, + "learning_rate": 5e-06, + "loss": 0.491, + "num_input_tokens_seen": 227632200, + "step": 1317 + }, + { + "epoch": 0.526378896882494, + "loss": 0.2640664279460907, + "loss_ce": 0.003949846141040325, + "loss_xval": 0.259765625, + "num_input_tokens_seen": 227632200, + "step": 1317 + }, + { + "epoch": 0.5267785771382894, + "grad_norm": 13.132135435903631, + "learning_rate": 5e-06, + "loss": 0.2884, + "num_input_tokens_seen": 227805296, + "step": 1318 + }, + { + "epoch": 0.5267785771382894, + "loss": 0.28863468766212463, + "loss_ce": 0.006469154264777899, + "loss_xval": 0.28125, + "num_input_tokens_seen": 227805296, + "step": 1318 + }, + { + "epoch": 0.5271782573940847, + "grad_norm": 57.694864530698325, + "learning_rate": 5e-06, + "loss": 0.5443, + "num_input_tokens_seen": 227978120, + "step": 1319 + }, + { + "epoch": 0.5271782573940847, + "loss": 0.5344159007072449, + "loss_ce": 0.006644911132752895, + "loss_xval": 0.52734375, + "num_input_tokens_seen": 227978120, + "step": 1319 + }, + { + "epoch": 0.5275779376498801, + "grad_norm": 37.97773385737496, + "learning_rate": 5e-06, + "loss": 0.353, + "num_input_tokens_seen": 228151056, + "step": 1320 + }, + { + "epoch": 0.5275779376498801, + "loss": 0.3334610164165497, + "loss_ce": 0.006907662842422724, + "loss_xval": 0.326171875, + "num_input_tokens_seen": 228151056, + "step": 1320 + }, + { + "epoch": 0.5279776179056754, + "grad_norm": 23.410095734672666, + "learning_rate": 5e-06, + "loss": 0.6552, + "num_input_tokens_seen": 228323968, + "step": 1321 + }, + { + "epoch": 0.5279776179056754, + "loss": 0.579754114151001, + "loss_ce": 0.005108083598315716, + "loss_xval": 0.57421875, + "num_input_tokens_seen": 228323968, + "step": 1321 + }, + { + "epoch": 0.5283772981614708, + "grad_norm": 45.88258781654939, + "learning_rate": 5e-06, + "loss": 0.7877, + "num_input_tokens_seen": 228497032, + "step": 1322 + }, + { + "epoch": 0.5283772981614708, + "loss": 0.7026103734970093, + "loss_ce": 0.005100608803331852, + "loss_xval": 0.69921875, + "num_input_tokens_seen": 228497032, + "step": 1322 + }, + { + "epoch": 0.5287769784172662, + "grad_norm": 20.4423042983933, + "learning_rate": 5e-06, + "loss": 0.4286, + "num_input_tokens_seen": 228670088, + "step": 1323 + }, + { + "epoch": 0.5287769784172662, + "loss": 0.3816264271736145, + "loss_ce": 0.014408385381102562, + "loss_xval": 0.3671875, + "num_input_tokens_seen": 228670088, + "step": 1323 + }, + { + "epoch": 0.5291766586730615, + "grad_norm": 51.63250028305346, + "learning_rate": 5e-06, + "loss": 0.2495, + "num_input_tokens_seen": 228843096, + "step": 1324 + }, + { + 
"epoch": 0.5291766586730615, + "loss": 0.3274524211883545, + "loss_ce": 0.005583534948527813, + "loss_xval": 0.322265625, + "num_input_tokens_seen": 228843096, + "step": 1324 + }, + { + "epoch": 0.5295763389288569, + "grad_norm": 24.608922162922916, + "learning_rate": 5e-06, + "loss": 0.3274, + "num_input_tokens_seen": 229016144, + "step": 1325 + }, + { + "epoch": 0.5295763389288569, + "loss": 0.486013799905777, + "loss_ce": 0.0058578504249453545, + "loss_xval": 0.48046875, + "num_input_tokens_seen": 229016144, + "step": 1325 + }, + { + "epoch": 0.5299760191846523, + "grad_norm": 61.57937595911081, + "learning_rate": 5e-06, + "loss": 0.312, + "num_input_tokens_seen": 229188832, + "step": 1326 + }, + { + "epoch": 0.5299760191846523, + "loss": 0.3464367985725403, + "loss_ce": 0.007767999544739723, + "loss_xval": 0.337890625, + "num_input_tokens_seen": 229188832, + "step": 1326 + }, + { + "epoch": 0.5303756994404476, + "grad_norm": 44.936253935063796, + "learning_rate": 5e-06, + "loss": 0.8079, + "num_input_tokens_seen": 229361504, + "step": 1327 + }, + { + "epoch": 0.5303756994404476, + "loss": 1.1617729663848877, + "loss_ce": 0.0061028143391013145, + "loss_xval": 1.15625, + "num_input_tokens_seen": 229361504, + "step": 1327 + }, + { + "epoch": 0.530775379696243, + "grad_norm": 57.45812813973755, + "learning_rate": 5e-06, + "loss": 0.447, + "num_input_tokens_seen": 229534336, + "step": 1328 + }, + { + "epoch": 0.530775379696243, + "loss": 0.42143499851226807, + "loss_ce": 0.005541440099477768, + "loss_xval": 0.416015625, + "num_input_tokens_seen": 229534336, + "step": 1328 + }, + { + "epoch": 0.5311750599520384, + "grad_norm": 95.69310855679585, + "learning_rate": 5e-06, + "loss": 0.6078, + "num_input_tokens_seen": 229707488, + "step": 1329 + }, + { + "epoch": 0.5311750599520384, + "loss": 0.42394664883613586, + "loss_ce": 0.0022547650150954723, + "loss_xval": 0.421875, + "num_input_tokens_seen": 229707488, + "step": 1329 + }, + { + "epoch": 0.5315747402078337, + "grad_norm": 55.23809125724533, + "learning_rate": 5e-06, + "loss": 0.9022, + "num_input_tokens_seen": 229880592, + "step": 1330 + }, + { + "epoch": 0.5315747402078337, + "loss": 1.1072825193405151, + "loss_ce": 0.010541743598878384, + "loss_xval": 1.09375, + "num_input_tokens_seen": 229880592, + "step": 1330 + }, + { + "epoch": 0.5319744204636291, + "grad_norm": 107.65601955216079, + "learning_rate": 5e-06, + "loss": 0.5116, + "num_input_tokens_seen": 230053800, + "step": 1331 + }, + { + "epoch": 0.5319744204636291, + "loss": 0.4177815020084381, + "loss_ce": 0.0035969249438494444, + "loss_xval": 0.4140625, + "num_input_tokens_seen": 230053800, + "step": 1331 + }, + { + "epoch": 0.5323741007194245, + "grad_norm": 47.508790422209, + "learning_rate": 5e-06, + "loss": 0.4268, + "num_input_tokens_seen": 230226896, + "step": 1332 + }, + { + "epoch": 0.5323741007194245, + "loss": 0.4874696731567383, + "loss_ce": 0.003766059409826994, + "loss_xval": 0.484375, + "num_input_tokens_seen": 230226896, + "step": 1332 + }, + { + "epoch": 0.5327737809752199, + "grad_norm": 44.833401746884746, + "learning_rate": 5e-06, + "loss": 0.2839, + "num_input_tokens_seen": 230399952, + "step": 1333 + }, + { + "epoch": 0.5327737809752199, + "loss": 0.21195606887340546, + "loss_ce": 0.003459974192082882, + "loss_xval": 0.208984375, + "num_input_tokens_seen": 230399952, + "step": 1333 + }, + { + "epoch": 0.5331734612310152, + "grad_norm": 22.979639420354616, + "learning_rate": 5e-06, + "loss": 0.5363, + "num_input_tokens_seen": 230573048, + "step": 1334 + }, + 
{ + "epoch": 0.5331734612310152, + "loss": 0.5578432083129883, + "loss_ce": 0.015240712091326714, + "loss_xval": 0.54296875, + "num_input_tokens_seen": 230573048, + "step": 1334 + }, + { + "epoch": 0.5335731414868106, + "grad_norm": 38.390194365787885, + "learning_rate": 5e-06, + "loss": 0.4866, + "num_input_tokens_seen": 230746344, + "step": 1335 + }, + { + "epoch": 0.5335731414868106, + "loss": 0.5222955942153931, + "loss_ce": 0.003130522556602955, + "loss_xval": 0.51953125, + "num_input_tokens_seen": 230746344, + "step": 1335 + }, + { + "epoch": 0.533972821742606, + "grad_norm": 62.630756568579976, + "learning_rate": 5e-06, + "loss": 0.4862, + "num_input_tokens_seen": 230919112, + "step": 1336 + }, + { + "epoch": 0.533972821742606, + "loss": 0.20648841559886932, + "loss_ce": 0.00534704327583313, + "loss_xval": 0.201171875, + "num_input_tokens_seen": 230919112, + "step": 1336 + }, + { + "epoch": 0.5343725019984013, + "grad_norm": 54.98815999554934, + "learning_rate": 5e-06, + "loss": 0.357, + "num_input_tokens_seen": 231092360, + "step": 1337 + }, + { + "epoch": 0.5343725019984013, + "loss": 0.24189867079257965, + "loss_ce": 0.0023204381577670574, + "loss_xval": 0.2392578125, + "num_input_tokens_seen": 231092360, + "step": 1337 + }, + { + "epoch": 0.5347721822541966, + "grad_norm": 61.52109754634168, + "learning_rate": 5e-06, + "loss": 0.5032, + "num_input_tokens_seen": 231265440, + "step": 1338 + }, + { + "epoch": 0.5347721822541966, + "loss": 0.5029772520065308, + "loss_ce": 0.0066393520683050156, + "loss_xval": 0.49609375, + "num_input_tokens_seen": 231265440, + "step": 1338 + }, + { + "epoch": 0.535171862509992, + "grad_norm": 36.05634221729635, + "learning_rate": 5e-06, + "loss": 0.6178, + "num_input_tokens_seen": 231438576, + "step": 1339 + }, + { + "epoch": 0.535171862509992, + "loss": 0.7768712639808655, + "loss_ce": 0.002121466211974621, + "loss_xval": 0.7734375, + "num_input_tokens_seen": 231438576, + "step": 1339 + }, + { + "epoch": 0.5355715427657873, + "grad_norm": 64.96571340372034, + "learning_rate": 5e-06, + "loss": 0.9467, + "num_input_tokens_seen": 231611296, + "step": 1340 + }, + { + "epoch": 0.5355715427657873, + "loss": 1.1156829595565796, + "loss_ce": 0.00386651698499918, + "loss_xval": 1.109375, + "num_input_tokens_seen": 231611296, + "step": 1340 + }, + { + "epoch": 0.5359712230215827, + "grad_norm": 84.07508123457501, + "learning_rate": 5e-06, + "loss": 0.4938, + "num_input_tokens_seen": 231783904, + "step": 1341 + }, + { + "epoch": 0.5359712230215827, + "loss": 0.5198108553886414, + "loss_ce": 0.0021717222407460213, + "loss_xval": 0.51953125, + "num_input_tokens_seen": 231783904, + "step": 1341 + }, + { + "epoch": 0.5363709032773781, + "grad_norm": 57.64533848978117, + "learning_rate": 5e-06, + "loss": 0.3062, + "num_input_tokens_seen": 231956376, + "step": 1342 + }, + { + "epoch": 0.5363709032773781, + "loss": 0.36183974146842957, + "loss_ce": 0.00447891466319561, + "loss_xval": 0.357421875, + "num_input_tokens_seen": 231956376, + "step": 1342 + }, + { + "epoch": 0.5367705835331734, + "grad_norm": 90.55074914704535, + "learning_rate": 5e-06, + "loss": 0.5969, + "num_input_tokens_seen": 232125528, + "step": 1343 + }, + { + "epoch": 0.5367705835331734, + "loss": 0.3362892270088196, + "loss_ce": 0.0024879206903278828, + "loss_xval": 0.333984375, + "num_input_tokens_seen": 232125528, + "step": 1343 + }, + { + "epoch": 0.5371702637889688, + "grad_norm": 12.81961083789221, + "learning_rate": 5e-06, + "loss": 0.2753, + "num_input_tokens_seen": 232298760, + "step": 
1344 + }, + { + "epoch": 0.5371702637889688, + "loss": 0.2364044338464737, + "loss_ce": 0.006698611192405224, + "loss_xval": 0.2294921875, + "num_input_tokens_seen": 232298760, + "step": 1344 + }, + { + "epoch": 0.5375699440447642, + "grad_norm": 45.502407796394216, + "learning_rate": 5e-06, + "loss": 0.4094, + "num_input_tokens_seen": 232468344, + "step": 1345 + }, + { + "epoch": 0.5375699440447642, + "loss": 0.4168083667755127, + "loss_ce": 0.002806881908327341, + "loss_xval": 0.4140625, + "num_input_tokens_seen": 232468344, + "step": 1345 + }, + { + "epoch": 0.5379696243005595, + "grad_norm": 51.143260733380274, + "learning_rate": 5e-06, + "loss": 0.5264, + "num_input_tokens_seen": 232641464, + "step": 1346 + }, + { + "epoch": 0.5379696243005595, + "loss": 0.5115495324134827, + "loss_ce": 0.002791010309010744, + "loss_xval": 0.5078125, + "num_input_tokens_seen": 232641464, + "step": 1346 + }, + { + "epoch": 0.5383693045563549, + "grad_norm": 40.13135649023107, + "learning_rate": 5e-06, + "loss": 0.5977, + "num_input_tokens_seen": 232814664, + "step": 1347 + }, + { + "epoch": 0.5383693045563549, + "loss": 0.27177077531814575, + "loss_ce": 0.006420427467674017, + "loss_xval": 0.265625, + "num_input_tokens_seen": 232814664, + "step": 1347 + }, + { + "epoch": 0.5387689848121503, + "grad_norm": 96.30534320206739, + "learning_rate": 5e-06, + "loss": 0.6539, + "num_input_tokens_seen": 232987248, + "step": 1348 + }, + { + "epoch": 0.5387689848121503, + "loss": 0.4249582290649414, + "loss_ce": 0.0066537680104374886, + "loss_xval": 0.41796875, + "num_input_tokens_seen": 232987248, + "step": 1348 + }, + { + "epoch": 0.5391686650679457, + "grad_norm": 77.13815649979884, + "learning_rate": 5e-06, + "loss": 0.6195, + "num_input_tokens_seen": 233160320, + "step": 1349 + }, + { + "epoch": 0.5391686650679457, + "loss": 0.7565589547157288, + "loss_ce": 0.0020728609524667263, + "loss_xval": 0.75390625, + "num_input_tokens_seen": 233160320, + "step": 1349 + }, + { + "epoch": 0.539568345323741, + "grad_norm": 98.78007103206556, + "learning_rate": 5e-06, + "loss": 0.4941, + "num_input_tokens_seen": 233328096, + "step": 1350 + }, + { + "epoch": 0.539568345323741, + "loss": 0.6630009412765503, + "loss_ce": 0.007361266296356916, + "loss_xval": 0.65625, + "num_input_tokens_seen": 233328096, + "step": 1350 + }, + { + "epoch": 0.5399680255795364, + "grad_norm": 36.141697477602044, + "learning_rate": 5e-06, + "loss": 0.4463, + "num_input_tokens_seen": 233501160, + "step": 1351 + }, + { + "epoch": 0.5399680255795364, + "loss": 0.2609432339668274, + "loss_ce": 0.0051449015736579895, + "loss_xval": 0.255859375, + "num_input_tokens_seen": 233501160, + "step": 1351 + }, + { + "epoch": 0.5403677058353318, + "grad_norm": 116.14707098226158, + "learning_rate": 5e-06, + "loss": 0.7684, + "num_input_tokens_seen": 233673680, + "step": 1352 + }, + { + "epoch": 0.5403677058353318, + "loss": 0.9591152667999268, + "loss_ce": 0.0037320067640393972, + "loss_xval": 0.95703125, + "num_input_tokens_seen": 233673680, + "step": 1352 + }, + { + "epoch": 0.5407673860911271, + "grad_norm": 32.51684246973528, + "learning_rate": 5e-06, + "loss": 0.3522, + "num_input_tokens_seen": 233846656, + "step": 1353 + }, + { + "epoch": 0.5407673860911271, + "loss": 0.17604267597198486, + "loss_ce": 0.00724994670599699, + "loss_xval": 0.1689453125, + "num_input_tokens_seen": 233846656, + "step": 1353 + }, + { + "epoch": 0.5411670663469225, + "grad_norm": 95.46125369800038, + "learning_rate": 5e-06, + "loss": 0.4154, + "num_input_tokens_seen": 234019816, 
+ "step": 1354 + }, + { + "epoch": 0.5411670663469225, + "loss": 0.5023761987686157, + "loss_ce": 0.003169646020978689, + "loss_xval": 0.5, + "num_input_tokens_seen": 234019816, + "step": 1354 + }, + { + "epoch": 0.5415667466027179, + "grad_norm": 39.52923561929836, + "learning_rate": 5e-06, + "loss": 0.3683, + "num_input_tokens_seen": 234192592, + "step": 1355 + }, + { + "epoch": 0.5415667466027179, + "loss": 0.23313309252262115, + "loss_ce": 0.0046327258460223675, + "loss_xval": 0.228515625, + "num_input_tokens_seen": 234192592, + "step": 1355 + }, + { + "epoch": 0.5419664268585132, + "grad_norm": 101.44654311999328, + "learning_rate": 5e-06, + "loss": 0.7023, + "num_input_tokens_seen": 234365312, + "step": 1356 + }, + { + "epoch": 0.5419664268585132, + "loss": 0.6109759211540222, + "loss_ce": 0.0038592463824898005, + "loss_xval": 0.60546875, + "num_input_tokens_seen": 234365312, + "step": 1356 + }, + { + "epoch": 0.5423661071143085, + "grad_norm": 68.28777793206135, + "learning_rate": 5e-06, + "loss": 0.4509, + "num_input_tokens_seen": 234538280, + "step": 1357 + }, + { + "epoch": 0.5423661071143085, + "loss": 0.4432651400566101, + "loss_ce": 0.004788582678884268, + "loss_xval": 0.4375, + "num_input_tokens_seen": 234538280, + "step": 1357 + }, + { + "epoch": 0.542765787370104, + "grad_norm": 80.01066832816397, + "learning_rate": 5e-06, + "loss": 0.5568, + "num_input_tokens_seen": 234711248, + "step": 1358 + }, + { + "epoch": 0.542765787370104, + "loss": 0.4560088813304901, + "loss_ce": 0.008193954825401306, + "loss_xval": 0.447265625, + "num_input_tokens_seen": 234711248, + "step": 1358 + }, + { + "epoch": 0.5431654676258992, + "grad_norm": 72.31855145193121, + "learning_rate": 5e-06, + "loss": 0.8891, + "num_input_tokens_seen": 234884400, + "step": 1359 + }, + { + "epoch": 0.5431654676258992, + "loss": 0.7763038873672485, + "loss_ce": 0.0035377484746277332, + "loss_xval": 0.7734375, + "num_input_tokens_seen": 234884400, + "step": 1359 + }, + { + "epoch": 0.5435651478816946, + "grad_norm": 43.434452462882426, + "learning_rate": 5e-06, + "loss": 0.4146, + "num_input_tokens_seen": 235057136, + "step": 1360 + }, + { + "epoch": 0.5435651478816946, + "loss": 0.5203279256820679, + "loss_ce": 0.002963472157716751, + "loss_xval": 0.515625, + "num_input_tokens_seen": 235057136, + "step": 1360 + }, + { + "epoch": 0.54396482813749, + "grad_norm": 118.75692026928306, + "learning_rate": 5e-06, + "loss": 0.4007, + "num_input_tokens_seen": 235230312, + "step": 1361 + }, + { + "epoch": 0.54396482813749, + "loss": 0.4342734217643738, + "loss_ce": 0.0037924526259303093, + "loss_xval": 0.4296875, + "num_input_tokens_seen": 235230312, + "step": 1361 + }, + { + "epoch": 0.5443645083932853, + "grad_norm": 49.47107440249342, + "learning_rate": 5e-06, + "loss": 0.7464, + "num_input_tokens_seen": 235403296, + "step": 1362 + }, + { + "epoch": 0.5443645083932853, + "loss": 0.6611525416374207, + "loss_ce": 0.003864938160404563, + "loss_xval": 0.65625, + "num_input_tokens_seen": 235403296, + "step": 1362 + }, + { + "epoch": 0.5447641886490807, + "grad_norm": 70.75775745041976, + "learning_rate": 5e-06, + "loss": 0.6533, + "num_input_tokens_seen": 235576152, + "step": 1363 + }, + { + "epoch": 0.5447641886490807, + "loss": 0.7079252600669861, + "loss_ce": 0.003945786505937576, + "loss_xval": 0.703125, + "num_input_tokens_seen": 235576152, + "step": 1363 + }, + { + "epoch": 0.5451638689048761, + "grad_norm": 70.38526419899421, + "learning_rate": 5e-06, + "loss": 0.4057, + "num_input_tokens_seen": 235748992, + "step": 
1364 + }, + { + "epoch": 0.5451638689048761, + "loss": 0.6269418001174927, + "loss_ce": 0.003559216158464551, + "loss_xval": 0.625, + "num_input_tokens_seen": 235748992, + "step": 1364 + }, + { + "epoch": 0.5455635491606715, + "grad_norm": 31.366011027189284, + "learning_rate": 5e-06, + "loss": 0.3862, + "num_input_tokens_seen": 235921912, + "step": 1365 + }, + { + "epoch": 0.5455635491606715, + "loss": 0.4212551712989807, + "loss_ce": 0.007284234277904034, + "loss_xval": 0.4140625, + "num_input_tokens_seen": 235921912, + "step": 1365 + }, + { + "epoch": 0.5459632294164668, + "grad_norm": 105.28341921676295, + "learning_rate": 5e-06, + "loss": 1.079, + "num_input_tokens_seen": 236094904, + "step": 1366 + }, + { + "epoch": 0.5459632294164668, + "loss": 0.4033566415309906, + "loss_ce": 0.0030270516872406006, + "loss_xval": 0.400390625, + "num_input_tokens_seen": 236094904, + "step": 1366 + }, + { + "epoch": 0.5463629096722622, + "grad_norm": 42.70774593040797, + "learning_rate": 5e-06, + "loss": 0.663, + "num_input_tokens_seen": 236268016, + "step": 1367 + }, + { + "epoch": 0.5463629096722622, + "loss": 0.1966470330953598, + "loss_ce": 0.004935611039400101, + "loss_xval": 0.19140625, + "num_input_tokens_seen": 236268016, + "step": 1367 + }, + { + "epoch": 0.5467625899280576, + "grad_norm": 112.81975987822089, + "learning_rate": 5e-06, + "loss": 0.5069, + "num_input_tokens_seen": 236440808, + "step": 1368 + }, + { + "epoch": 0.5467625899280576, + "loss": 0.7222706079483032, + "loss_ce": 0.005046539939939976, + "loss_xval": 0.71875, + "num_input_tokens_seen": 236440808, + "step": 1368 + }, + { + "epoch": 0.5471622701838529, + "grad_norm": 50.88949130940376, + "learning_rate": 5e-06, + "loss": 0.4146, + "num_input_tokens_seen": 236613648, + "step": 1369 + }, + { + "epoch": 0.5471622701838529, + "loss": 0.4492402970790863, + "loss_ce": 0.004294017795473337, + "loss_xval": 0.4453125, + "num_input_tokens_seen": 236613648, + "step": 1369 + }, + { + "epoch": 0.5475619504396483, + "grad_norm": 104.13474925279456, + "learning_rate": 5e-06, + "loss": 0.9189, + "num_input_tokens_seen": 236786680, + "step": 1370 + }, + { + "epoch": 0.5475619504396483, + "loss": 0.8582189679145813, + "loss_ce": 0.0034826004412025213, + "loss_xval": 0.85546875, + "num_input_tokens_seen": 236786680, + "step": 1370 + }, + { + "epoch": 0.5479616306954437, + "grad_norm": 32.81711209453196, + "learning_rate": 5e-06, + "loss": 0.3459, + "num_input_tokens_seen": 236959600, + "step": 1371 + }, + { + "epoch": 0.5479616306954437, + "loss": 0.272049218416214, + "loss_ce": 0.008255256339907646, + "loss_xval": 0.263671875, + "num_input_tokens_seen": 236959600, + "step": 1371 + }, + { + "epoch": 0.548361310951239, + "grad_norm": 105.1381007340414, + "learning_rate": 5e-06, + "loss": 0.592, + "num_input_tokens_seen": 237132680, + "step": 1372 + }, + { + "epoch": 0.548361310951239, + "loss": 0.6921520233154297, + "loss_ce": 0.005262310616672039, + "loss_xval": 0.6875, + "num_input_tokens_seen": 237132680, + "step": 1372 + }, + { + "epoch": 0.5487609912070344, + "grad_norm": 66.93404166002924, + "learning_rate": 5e-06, + "loss": 0.515, + "num_input_tokens_seen": 237305536, + "step": 1373 + }, + { + "epoch": 0.5487609912070344, + "loss": 0.4059692621231079, + "loss_ce": 0.0069824280217289925, + "loss_xval": 0.3984375, + "num_input_tokens_seen": 237305536, + "step": 1373 + }, + { + "epoch": 0.5491606714628298, + "grad_norm": 73.26212946796912, + "learning_rate": 5e-06, + "loss": 0.6015, + "num_input_tokens_seen": 237478648, + "step": 1374 + 
}, + { + "epoch": 0.5491606714628298, + "loss": 0.5105670690536499, + "loss_ce": 0.004585604183375835, + "loss_xval": 0.5078125, + "num_input_tokens_seen": 237478648, + "step": 1374 + }, + { + "epoch": 0.549560351718625, + "grad_norm": 107.19992432502981, + "learning_rate": 5e-06, + "loss": 0.8017, + "num_input_tokens_seen": 237651648, + "step": 1375 + }, + { + "epoch": 0.549560351718625, + "loss": 0.6293633580207825, + "loss_ce": 0.004180216696113348, + "loss_xval": 0.625, + "num_input_tokens_seen": 237651648, + "step": 1375 + }, + { + "epoch": 0.5499600319744204, + "grad_norm": 102.85773361268112, + "learning_rate": 5e-06, + "loss": 0.6284, + "num_input_tokens_seen": 237824552, + "step": 1376 + }, + { + "epoch": 0.5499600319744204, + "loss": 0.6715470552444458, + "loss_ce": 0.007576065603643656, + "loss_xval": 0.6640625, + "num_input_tokens_seen": 237824552, + "step": 1376 + }, + { + "epoch": 0.5503597122302158, + "grad_norm": 51.34013198873488, + "learning_rate": 5e-06, + "loss": 0.5408, + "num_input_tokens_seen": 237997088, + "step": 1377 + }, + { + "epoch": 0.5503597122302158, + "loss": 0.5498093366622925, + "loss_ce": 0.0029343212954699993, + "loss_xval": 0.546875, + "num_input_tokens_seen": 237997088, + "step": 1377 + }, + { + "epoch": 0.5507593924860112, + "grad_norm": 45.16480729836272, + "learning_rate": 5e-06, + "loss": 0.5291, + "num_input_tokens_seen": 238170104, + "step": 1378 + }, + { + "epoch": 0.5507593924860112, + "loss": 0.6173465847969055, + "loss_ce": 0.006995024159550667, + "loss_xval": 0.609375, + "num_input_tokens_seen": 238170104, + "step": 1378 + }, + { + "epoch": 0.5511590727418065, + "grad_norm": 90.67189419837551, + "learning_rate": 5e-06, + "loss": 0.341, + "num_input_tokens_seen": 238342840, + "step": 1379 + }, + { + "epoch": 0.5511590727418065, + "loss": 0.33946144580841064, + "loss_ce": 0.0049888077192008495, + "loss_xval": 0.333984375, + "num_input_tokens_seen": 238342840, + "step": 1379 + }, + { + "epoch": 0.5515587529976019, + "grad_norm": 52.043973144528145, + "learning_rate": 5e-06, + "loss": 0.4572, + "num_input_tokens_seen": 238515808, + "step": 1380 + }, + { + "epoch": 0.5515587529976019, + "loss": 0.5051401257514954, + "loss_ce": 0.011243650689721107, + "loss_xval": 0.494140625, + "num_input_tokens_seen": 238515808, + "step": 1380 + }, + { + "epoch": 0.5519584332533973, + "grad_norm": 56.17714897064993, + "learning_rate": 5e-06, + "loss": 0.3739, + "num_input_tokens_seen": 238688664, + "step": 1381 + }, + { + "epoch": 0.5519584332533973, + "loss": 0.5631752610206604, + "loss_ce": 0.00677874032407999, + "loss_xval": 0.5546875, + "num_input_tokens_seen": 238688664, + "step": 1381 + }, + { + "epoch": 0.5523581135091926, + "grad_norm": 40.5381866842805, + "learning_rate": 5e-06, + "loss": 0.3333, + "num_input_tokens_seen": 238861488, + "step": 1382 + }, + { + "epoch": 0.5523581135091926, + "loss": 0.37881040573120117, + "loss_ce": 0.00214719888754189, + "loss_xval": 0.376953125, + "num_input_tokens_seen": 238861488, + "step": 1382 + }, + { + "epoch": 0.552757793764988, + "grad_norm": 51.78581653084787, + "learning_rate": 5e-06, + "loss": 0.5934, + "num_input_tokens_seen": 239034336, + "step": 1383 + }, + { + "epoch": 0.552757793764988, + "loss": 0.6673434972763062, + "loss_ce": 0.0035251579247415066, + "loss_xval": 0.6640625, + "num_input_tokens_seen": 239034336, + "step": 1383 + }, + { + "epoch": 0.5531574740207834, + "grad_norm": 65.06524952164565, + "learning_rate": 5e-06, + "loss": 0.3498, + "num_input_tokens_seen": 239207232, + "step": 1384 + }, + 
{ + "epoch": 0.5531574740207834, + "loss": 0.5492129325866699, + "loss_ce": 0.008563470095396042, + "loss_xval": 0.5390625, + "num_input_tokens_seen": 239207232, + "step": 1384 + }, + { + "epoch": 0.5535571542765787, + "grad_norm": 24.785324730924025, + "learning_rate": 5e-06, + "loss": 0.4747, + "num_input_tokens_seen": 239380368, + "step": 1385 + }, + { + "epoch": 0.5535571542765787, + "loss": 0.6038126349449158, + "loss_ce": 0.006461561657488346, + "loss_xval": 0.59765625, + "num_input_tokens_seen": 239380368, + "step": 1385 + }, + { + "epoch": 0.5539568345323741, + "grad_norm": 22.3044022119237, + "learning_rate": 5e-06, + "loss": 0.309, + "num_input_tokens_seen": 239553512, + "step": 1386 + }, + { + "epoch": 0.5539568345323741, + "loss": 0.23669841885566711, + "loss_ce": 0.0036051569040864706, + "loss_xval": 0.2333984375, + "num_input_tokens_seen": 239553512, + "step": 1386 + }, + { + "epoch": 0.5543565147881695, + "grad_norm": 32.16457873391064, + "learning_rate": 5e-06, + "loss": 0.3397, + "num_input_tokens_seen": 239726688, + "step": 1387 + }, + { + "epoch": 0.5543565147881695, + "loss": 0.39722371101379395, + "loss_ce": 0.006537655834108591, + "loss_xval": 0.390625, + "num_input_tokens_seen": 239726688, + "step": 1387 + }, + { + "epoch": 0.5547561950439648, + "grad_norm": 40.175745398782205, + "learning_rate": 5e-06, + "loss": 0.3402, + "num_input_tokens_seen": 239899592, + "step": 1388 + }, + { + "epoch": 0.5547561950439648, + "loss": 0.26957184076309204, + "loss_ce": 0.00232940586283803, + "loss_xval": 0.267578125, + "num_input_tokens_seen": 239899592, + "step": 1388 + }, + { + "epoch": 0.5551558752997602, + "grad_norm": 18.426941380820455, + "learning_rate": 5e-06, + "loss": 0.4403, + "num_input_tokens_seen": 240072512, + "step": 1389 + }, + { + "epoch": 0.5551558752997602, + "loss": 0.46690550446510315, + "loss_ce": 0.0025653140619397163, + "loss_xval": 0.46484375, + "num_input_tokens_seen": 240072512, + "step": 1389 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 45.49727718405183, + "learning_rate": 5e-06, + "loss": 0.5213, + "num_input_tokens_seen": 240245672, + "step": 1390 + }, + { + "epoch": 0.5555555555555556, + "loss": 0.593445897102356, + "loss_ce": 0.005082281306385994, + "loss_xval": 0.58984375, + "num_input_tokens_seen": 240245672, + "step": 1390 + }, + { + "epoch": 0.5559552358113509, + "grad_norm": 53.104840422529556, + "learning_rate": 5e-06, + "loss": 0.5958, + "num_input_tokens_seen": 240418408, + "step": 1391 + }, + { + "epoch": 0.5559552358113509, + "loss": 0.519351601600647, + "loss_ce": 0.005221944767981768, + "loss_xval": 0.515625, + "num_input_tokens_seen": 240418408, + "step": 1391 + }, + { + "epoch": 0.5563549160671463, + "grad_norm": 89.13366947751527, + "learning_rate": 5e-06, + "loss": 0.7433, + "num_input_tokens_seen": 240591376, + "step": 1392 + }, + { + "epoch": 0.5563549160671463, + "loss": 0.9261295795440674, + "loss_ce": 0.010220762342214584, + "loss_xval": 0.9140625, + "num_input_tokens_seen": 240591376, + "step": 1392 + }, + { + "epoch": 0.5567545963229417, + "grad_norm": 35.338435295261455, + "learning_rate": 5e-06, + "loss": 0.4817, + "num_input_tokens_seen": 240764424, + "step": 1393 + }, + { + "epoch": 0.5567545963229417, + "loss": 0.23436526954174042, + "loss_ce": 0.005239281803369522, + "loss_xval": 0.2294921875, + "num_input_tokens_seen": 240764424, + "step": 1393 + }, + { + "epoch": 0.5571542765787371, + "grad_norm": 64.32086168686824, + "learning_rate": 5e-06, + "loss": 0.5221, + "num_input_tokens_seen": 240937200, + "step": 
1394 + }, + { + "epoch": 0.5571542765787371, + "loss": 0.38562482595443726, + "loss_ce": 0.003971992991864681, + "loss_xval": 0.380859375, + "num_input_tokens_seen": 240937200, + "step": 1394 + }, + { + "epoch": 0.5575539568345323, + "grad_norm": 21.415086573261146, + "learning_rate": 5e-06, + "loss": 0.2177, + "num_input_tokens_seen": 241110088, + "step": 1395 + }, + { + "epoch": 0.5575539568345323, + "loss": 0.2166837602853775, + "loss_ce": 0.0027860510163009167, + "loss_xval": 0.2138671875, + "num_input_tokens_seen": 241110088, + "step": 1395 + }, + { + "epoch": 0.5579536370903277, + "grad_norm": 62.42230199682289, + "learning_rate": 5e-06, + "loss": 0.4826, + "num_input_tokens_seen": 241282992, + "step": 1396 + }, + { + "epoch": 0.5579536370903277, + "loss": 0.5386009216308594, + "loss_ce": 0.002742763375863433, + "loss_xval": 0.53515625, + "num_input_tokens_seen": 241282992, + "step": 1396 + }, + { + "epoch": 0.5583533173461231, + "grad_norm": 29.710934144805904, + "learning_rate": 5e-06, + "loss": 0.4846, + "num_input_tokens_seen": 241455856, + "step": 1397 + }, + { + "epoch": 0.5583533173461231, + "loss": 0.6360405683517456, + "loss_ce": 0.004456415772438049, + "loss_xval": 0.6328125, + "num_input_tokens_seen": 241455856, + "step": 1397 + }, + { + "epoch": 0.5587529976019184, + "grad_norm": 30.202754655810416, + "learning_rate": 5e-06, + "loss": 0.4387, + "num_input_tokens_seen": 241628576, + "step": 1398 + }, + { + "epoch": 0.5587529976019184, + "loss": 0.41940563917160034, + "loss_ce": 0.005099033936858177, + "loss_xval": 0.4140625, + "num_input_tokens_seen": 241628576, + "step": 1398 + }, + { + "epoch": 0.5591526778577138, + "grad_norm": 34.58672133188951, + "learning_rate": 5e-06, + "loss": 0.5131, + "num_input_tokens_seen": 241801464, + "step": 1399 + }, + { + "epoch": 0.5591526778577138, + "loss": 0.8912590742111206, + "loss_ce": 0.004296140745282173, + "loss_xval": 0.88671875, + "num_input_tokens_seen": 241801464, + "step": 1399 + }, + { + "epoch": 0.5595523581135092, + "grad_norm": 61.9029431455141, + "learning_rate": 5e-06, + "loss": 0.3072, + "num_input_tokens_seen": 241974584, + "step": 1400 + }, + { + "epoch": 0.5595523581135092, + "loss": 0.3766733407974243, + "loss_ce": 0.004877682775259018, + "loss_xval": 0.37109375, + "num_input_tokens_seen": 241974584, + "step": 1400 + }, + { + "epoch": 0.5599520383693045, + "grad_norm": 58.06942738123757, + "learning_rate": 5e-06, + "loss": 0.4248, + "num_input_tokens_seen": 242147592, + "step": 1401 + }, + { + "epoch": 0.5599520383693045, + "loss": 0.3648153245449066, + "loss_ce": 0.0030446944292634726, + "loss_xval": 0.361328125, + "num_input_tokens_seen": 242147592, + "step": 1401 + }, + { + "epoch": 0.5603517186250999, + "grad_norm": 61.23774543866974, + "learning_rate": 5e-06, + "loss": 0.4822, + "num_input_tokens_seen": 242317160, + "step": 1402 + }, + { + "epoch": 0.5603517186250999, + "loss": 0.24395032227039337, + "loss_ce": 0.003014039946720004, + "loss_xval": 0.2412109375, + "num_input_tokens_seen": 242317160, + "step": 1402 + }, + { + "epoch": 0.5607513988808953, + "grad_norm": 35.92228501713754, + "learning_rate": 5e-06, + "loss": 0.4341, + "num_input_tokens_seen": 242486528, + "step": 1403 + }, + { + "epoch": 0.5607513988808953, + "loss": 0.39893341064453125, + "loss_ce": 0.0020217944402247667, + "loss_xval": 0.396484375, + "num_input_tokens_seen": 242486528, + "step": 1403 + }, + { + "epoch": 0.5611510791366906, + "grad_norm": 91.09500736565018, + "learning_rate": 5e-06, + "loss": 0.5419, + "num_input_tokens_seen": 
242659688, + "step": 1404 + }, + { + "epoch": 0.5611510791366906, + "loss": 0.6004331111907959, + "loss_ce": 0.0020444332621991634, + "loss_xval": 0.59765625, + "num_input_tokens_seen": 242659688, + "step": 1404 + }, + { + "epoch": 0.561550759392486, + "grad_norm": 55.4921140642313, + "learning_rate": 5e-06, + "loss": 0.5621, + "num_input_tokens_seen": 242832520, + "step": 1405 + }, + { + "epoch": 0.561550759392486, + "loss": 0.6540317535400391, + "loss_ce": 0.0017490473110228777, + "loss_xval": 0.65234375, + "num_input_tokens_seen": 242832520, + "step": 1405 + }, + { + "epoch": 0.5619504396482814, + "grad_norm": 51.074231439851395, + "learning_rate": 5e-06, + "loss": 0.3966, + "num_input_tokens_seen": 243005600, + "step": 1406 + }, + { + "epoch": 0.5619504396482814, + "loss": 0.48184454441070557, + "loss_ce": 0.006624825298786163, + "loss_xval": 0.474609375, + "num_input_tokens_seen": 243005600, + "step": 1406 + }, + { + "epoch": 0.5623501199040767, + "grad_norm": 90.03472614066028, + "learning_rate": 5e-06, + "loss": 0.4644, + "num_input_tokens_seen": 243178984, + "step": 1407 + }, + { + "epoch": 0.5623501199040767, + "loss": 0.6506166458129883, + "loss_ce": 0.006390602793544531, + "loss_xval": 0.64453125, + "num_input_tokens_seen": 243178984, + "step": 1407 + }, + { + "epoch": 0.5627498001598721, + "grad_norm": 57.773260923011286, + "learning_rate": 5e-06, + "loss": 0.8877, + "num_input_tokens_seen": 243352152, + "step": 1408 + }, + { + "epoch": 0.5627498001598721, + "loss": 0.8280885219573975, + "loss_ce": 0.00386980758048594, + "loss_xval": 0.82421875, + "num_input_tokens_seen": 243352152, + "step": 1408 + }, + { + "epoch": 0.5631494804156675, + "grad_norm": 116.82277290671593, + "learning_rate": 5e-06, + "loss": 0.4323, + "num_input_tokens_seen": 243525104, + "step": 1409 + }, + { + "epoch": 0.5631494804156675, + "loss": 0.4614856243133545, + "loss_ce": 0.006285438779741526, + "loss_xval": 0.455078125, + "num_input_tokens_seen": 243525104, + "step": 1409 + }, + { + "epoch": 0.5635491606714629, + "grad_norm": 61.1230282236721, + "learning_rate": 5e-06, + "loss": 0.35, + "num_input_tokens_seen": 243697976, + "step": 1410 + }, + { + "epoch": 0.5635491606714629, + "loss": 0.38546931743621826, + "loss_ce": 0.0026262898463755846, + "loss_xval": 0.3828125, + "num_input_tokens_seen": 243697976, + "step": 1410 + }, + { + "epoch": 0.5639488409272582, + "grad_norm": 111.85099191601515, + "learning_rate": 5e-06, + "loss": 0.3607, + "num_input_tokens_seen": 243870560, + "step": 1411 + }, + { + "epoch": 0.5639488409272582, + "loss": 0.37697547674179077, + "loss_ce": 0.00398966483771801, + "loss_xval": 0.373046875, + "num_input_tokens_seen": 243870560, + "step": 1411 + }, + { + "epoch": 0.5643485211830536, + "grad_norm": 88.33100579942653, + "learning_rate": 5e-06, + "loss": 0.3984, + "num_input_tokens_seen": 244043272, + "step": 1412 + }, + { + "epoch": 0.5643485211830536, + "loss": 0.432317852973938, + "loss_ce": 0.004156234674155712, + "loss_xval": 0.427734375, + "num_input_tokens_seen": 244043272, + "step": 1412 + }, + { + "epoch": 0.564748201438849, + "grad_norm": 81.4102017754153, + "learning_rate": 5e-06, + "loss": 0.6458, + "num_input_tokens_seen": 244216144, + "step": 1413 + }, + { + "epoch": 0.564748201438849, + "loss": 0.6670348048210144, + "loss_ce": 0.0025145430117845535, + "loss_xval": 0.6640625, + "num_input_tokens_seen": 244216144, + "step": 1413 + }, + { + "epoch": 0.5651478816946442, + "grad_norm": 46.391307098818544, + "learning_rate": 5e-06, + "loss": 0.4551, + 
"num_input_tokens_seen": 244389072, + "step": 1414 + }, + { + "epoch": 0.5651478816946442, + "loss": 0.3752412796020508, + "loss_ce": 0.005246136337518692, + "loss_xval": 0.369140625, + "num_input_tokens_seen": 244389072, + "step": 1414 + }, + { + "epoch": 0.5655475619504396, + "grad_norm": 129.86301928613088, + "learning_rate": 5e-06, + "loss": 0.4808, + "num_input_tokens_seen": 244561792, + "step": 1415 + }, + { + "epoch": 0.5655475619504396, + "loss": 0.30836910009384155, + "loss_ce": 0.007160608656704426, + "loss_xval": 0.30078125, + "num_input_tokens_seen": 244561792, + "step": 1415 + }, + { + "epoch": 0.565947242206235, + "grad_norm": 40.89946165935015, + "learning_rate": 5e-06, + "loss": 0.4916, + "num_input_tokens_seen": 244735040, + "step": 1416 + }, + { + "epoch": 0.565947242206235, + "loss": 0.36573469638824463, + "loss_ce": 0.002453463850542903, + "loss_xval": 0.36328125, + "num_input_tokens_seen": 244735040, + "step": 1416 + }, + { + "epoch": 0.5663469224620303, + "grad_norm": 57.37518618513914, + "learning_rate": 5e-06, + "loss": 0.7517, + "num_input_tokens_seen": 244907832, + "step": 1417 + }, + { + "epoch": 0.5663469224620303, + "loss": 1.0576549768447876, + "loss_ce": 0.007972333580255508, + "loss_xval": 1.046875, + "num_input_tokens_seen": 244907832, + "step": 1417 + }, + { + "epoch": 0.5667466027178257, + "grad_norm": 50.498261369439454, + "learning_rate": 5e-06, + "loss": 0.7163, + "num_input_tokens_seen": 245080504, + "step": 1418 + }, + { + "epoch": 0.5667466027178257, + "loss": 0.40888845920562744, + "loss_ce": 0.011854803189635277, + "loss_xval": 0.396484375, + "num_input_tokens_seen": 245080504, + "step": 1418 + }, + { + "epoch": 0.5671462829736211, + "grad_norm": 87.47288430083069, + "learning_rate": 5e-06, + "loss": 0.4905, + "num_input_tokens_seen": 245253712, + "step": 1419 + }, + { + "epoch": 0.5671462829736211, + "loss": 0.5902141332626343, + "loss_ce": 0.01254691369831562, + "loss_xval": 0.578125, + "num_input_tokens_seen": 245253712, + "step": 1419 + }, + { + "epoch": 0.5675459632294164, + "grad_norm": 49.32901995535315, + "learning_rate": 5e-06, + "loss": 0.4311, + "num_input_tokens_seen": 245426592, + "step": 1420 + }, + { + "epoch": 0.5675459632294164, + "loss": 0.39141321182250977, + "loss_ce": 0.006342416163533926, + "loss_xval": 0.384765625, + "num_input_tokens_seen": 245426592, + "step": 1420 + }, + { + "epoch": 0.5679456434852118, + "grad_norm": 53.82129861028939, + "learning_rate": 5e-06, + "loss": 0.5077, + "num_input_tokens_seen": 245599872, + "step": 1421 + }, + { + "epoch": 0.5679456434852118, + "loss": 0.3029336631298065, + "loss_ce": 0.002427075756713748, + "loss_xval": 0.30078125, + "num_input_tokens_seen": 245599872, + "step": 1421 + }, + { + "epoch": 0.5683453237410072, + "grad_norm": 20.8997393508194, + "learning_rate": 5e-06, + "loss": 0.4295, + "num_input_tokens_seen": 245772848, + "step": 1422 + }, + { + "epoch": 0.5683453237410072, + "loss": 0.4302918314933777, + "loss_ce": 0.015069630928337574, + "loss_xval": 0.416015625, + "num_input_tokens_seen": 245772848, + "step": 1422 + }, + { + "epoch": 0.5687450039968026, + "grad_norm": 34.823028475917404, + "learning_rate": 5e-06, + "loss": 0.5763, + "num_input_tokens_seen": 245945280, + "step": 1423 + }, + { + "epoch": 0.5687450039968026, + "loss": 0.46396341919898987, + "loss_ce": 0.003575220936909318, + "loss_xval": 0.4609375, + "num_input_tokens_seen": 245945280, + "step": 1423 + }, + { + "epoch": 0.5691446842525979, + "grad_norm": 78.37392797245467, + "learning_rate": 5e-06, + "loss": 
0.5372, + "num_input_tokens_seen": 246118456, + "step": 1424 + }, + { + "epoch": 0.5691446842525979, + "loss": 0.7276915311813354, + "loss_ce": 0.004821660462766886, + "loss_xval": 0.72265625, + "num_input_tokens_seen": 246118456, + "step": 1424 + }, + { + "epoch": 0.5695443645083933, + "grad_norm": 35.365197351786534, + "learning_rate": 5e-06, + "loss": 0.659, + "num_input_tokens_seen": 246291224, + "step": 1425 + }, + { + "epoch": 0.5695443645083933, + "loss": 0.3385479152202606, + "loss_ce": 0.005723202601075172, + "loss_xval": 0.33203125, + "num_input_tokens_seen": 246291224, + "step": 1425 + }, + { + "epoch": 0.5699440447641887, + "grad_norm": 38.38998430273591, + "learning_rate": 5e-06, + "loss": 0.6328, + "num_input_tokens_seen": 246464600, + "step": 1426 + }, + { + "epoch": 0.5699440447641887, + "loss": 0.5840468406677246, + "loss_ce": 0.007112047169357538, + "loss_xval": 0.578125, + "num_input_tokens_seen": 246464600, + "step": 1426 + }, + { + "epoch": 0.570343725019984, + "grad_norm": 26.75343703719278, + "learning_rate": 5e-06, + "loss": 0.7358, + "num_input_tokens_seen": 246637424, + "step": 1427 + }, + { + "epoch": 0.570343725019984, + "loss": 0.7768107652664185, + "loss_ce": 0.005021188408136368, + "loss_xval": 0.7734375, + "num_input_tokens_seen": 246637424, + "step": 1427 + }, + { + "epoch": 0.5707434052757794, + "grad_norm": 48.88082237036071, + "learning_rate": 5e-06, + "loss": 0.4387, + "num_input_tokens_seen": 246810640, + "step": 1428 + }, + { + "epoch": 0.5707434052757794, + "loss": 0.5554983019828796, + "loss_ce": 0.0043356032110750675, + "loss_xval": 0.55078125, + "num_input_tokens_seen": 246810640, + "step": 1428 + }, + { + "epoch": 0.5711430855315748, + "grad_norm": 34.52783585537693, + "learning_rate": 5e-06, + "loss": 0.4248, + "num_input_tokens_seen": 246983392, + "step": 1429 + }, + { + "epoch": 0.5711430855315748, + "loss": 0.5638842582702637, + "loss_ce": 0.00471069710329175, + "loss_xval": 0.55859375, + "num_input_tokens_seen": 246983392, + "step": 1429 + }, + { + "epoch": 0.5715427657873701, + "grad_norm": 25.030027317245604, + "learning_rate": 5e-06, + "loss": 0.3461, + "num_input_tokens_seen": 247156080, + "step": 1430 + }, + { + "epoch": 0.5715427657873701, + "loss": 0.4537726938724518, + "loss_ce": 0.009070548228919506, + "loss_xval": 0.4453125, + "num_input_tokens_seen": 247156080, + "step": 1430 + }, + { + "epoch": 0.5719424460431655, + "grad_norm": 37.87817514853326, + "learning_rate": 5e-06, + "loss": 0.5784, + "num_input_tokens_seen": 247328944, + "step": 1431 + }, + { + "epoch": 0.5719424460431655, + "loss": 0.7571723461151123, + "loss_ce": 0.0038001316133886576, + "loss_xval": 0.75390625, + "num_input_tokens_seen": 247328944, + "step": 1431 + }, + { + "epoch": 0.5723421262989609, + "grad_norm": 54.5491901614114, + "learning_rate": 5e-06, + "loss": 0.4931, + "num_input_tokens_seen": 247502144, + "step": 1432 + }, + { + "epoch": 0.5723421262989609, + "loss": 0.5219075083732605, + "loss_ce": 0.003871629014611244, + "loss_xval": 0.51953125, + "num_input_tokens_seen": 247502144, + "step": 1432 + }, + { + "epoch": 0.5727418065547561, + "grad_norm": 56.706910138506075, + "learning_rate": 5e-06, + "loss": 0.5465, + "num_input_tokens_seen": 247674936, + "step": 1433 + }, + { + "epoch": 0.5727418065547561, + "loss": 0.6499233245849609, + "loss_ce": 0.002157290233299136, + "loss_xval": 0.6484375, + "num_input_tokens_seen": 247674936, + "step": 1433 + }, + { + "epoch": 0.5731414868105515, + "grad_norm": 16.90318365693236, + "learning_rate": 5e-06, + "loss": 
0.5191, + "num_input_tokens_seen": 247848032, + "step": 1434 + }, + { + "epoch": 0.5731414868105515, + "loss": 0.534213662147522, + "loss_ce": 0.014712927863001823, + "loss_xval": 0.51953125, + "num_input_tokens_seen": 247848032, + "step": 1434 + }, + { + "epoch": 0.573541167066347, + "grad_norm": 30.931323218606245, + "learning_rate": 5e-06, + "loss": 0.557, + "num_input_tokens_seen": 248020944, + "step": 1435 + }, + { + "epoch": 0.573541167066347, + "loss": 0.4585926830768585, + "loss_ce": 0.005650795064866543, + "loss_xval": 0.453125, + "num_input_tokens_seen": 248020944, + "step": 1435 + }, + { + "epoch": 0.5739408473221422, + "grad_norm": 52.81318892972862, + "learning_rate": 5e-06, + "loss": 0.4016, + "num_input_tokens_seen": 248193744, + "step": 1436 + }, + { + "epoch": 0.5739408473221422, + "loss": 0.22793583571910858, + "loss_ce": 0.006530814804136753, + "loss_xval": 0.2216796875, + "num_input_tokens_seen": 248193744, + "step": 1436 + }, + { + "epoch": 0.5743405275779376, + "grad_norm": 41.45713075341328, + "learning_rate": 5e-06, + "loss": 0.3829, + "num_input_tokens_seen": 248366568, + "step": 1437 + }, + { + "epoch": 0.5743405275779376, + "loss": 0.15585559606552124, + "loss_ce": 0.002306394511833787, + "loss_xval": 0.1533203125, + "num_input_tokens_seen": 248366568, + "step": 1437 + }, + { + "epoch": 0.574740207833733, + "grad_norm": 42.899656566869155, + "learning_rate": 5e-06, + "loss": 0.3808, + "num_input_tokens_seen": 248539664, + "step": 1438 + }, + { + "epoch": 0.574740207833733, + "loss": 0.40062421560287476, + "loss_ce": 0.0016373979160562158, + "loss_xval": 0.3984375, + "num_input_tokens_seen": 248539664, + "step": 1438 + }, + { + "epoch": 0.5751398880895284, + "grad_norm": 28.503400213541788, + "learning_rate": 5e-06, + "loss": 0.5283, + "num_input_tokens_seen": 248712512, + "step": 1439 + }, + { + "epoch": 0.5751398880895284, + "loss": 0.5747106075286865, + "loss_ce": 0.001498923171311617, + "loss_xval": 0.57421875, + "num_input_tokens_seen": 248712512, + "step": 1439 + }, + { + "epoch": 0.5755395683453237, + "grad_norm": 28.16745339282557, + "learning_rate": 5e-06, + "loss": 0.5161, + "num_input_tokens_seen": 248882088, + "step": 1440 + }, + { + "epoch": 0.5755395683453237, + "loss": 0.2827589511871338, + "loss_ce": 0.001676824176684022, + "loss_xval": 0.28125, + "num_input_tokens_seen": 248882088, + "step": 1440 + }, + { + "epoch": 0.5759392486011191, + "grad_norm": 77.82657940371696, + "learning_rate": 5e-06, + "loss": 0.5886, + "num_input_tokens_seen": 249054720, + "step": 1441 + }, + { + "epoch": 0.5759392486011191, + "loss": 0.40001291036605835, + "loss_ce": 0.0038947416469454765, + "loss_xval": 0.396484375, + "num_input_tokens_seen": 249054720, + "step": 1441 + }, + { + "epoch": 0.5763389288569145, + "grad_norm": 20.49828614983781, + "learning_rate": 5e-06, + "loss": 0.4377, + "num_input_tokens_seen": 249227984, + "step": 1442 + }, + { + "epoch": 0.5763389288569145, + "loss": 0.39074262976646423, + "loss_ce": 0.0035661240108311176, + "loss_xval": 0.38671875, + "num_input_tokens_seen": 249227984, + "step": 1442 + }, + { + "epoch": 0.5767386091127098, + "grad_norm": 25.0536969800906, + "learning_rate": 5e-06, + "loss": 0.435, + "num_input_tokens_seen": 249400992, + "step": 1443 + }, + { + "epoch": 0.5767386091127098, + "loss": 0.5896605849266052, + "loss_ce": 0.006286558695137501, + "loss_xval": 0.58203125, + "num_input_tokens_seen": 249400992, + "step": 1443 + }, + { + "epoch": 0.5771382893685052, + "grad_norm": 40.73073999908245, + "learning_rate": 5e-06, + 
"loss": 0.2902, + "num_input_tokens_seen": 249574272, + "step": 1444 + }, + { + "epoch": 0.5771382893685052, + "loss": 0.1345619261264801, + "loss_ce": 0.003610992804169655, + "loss_xval": 0.130859375, + "num_input_tokens_seen": 249574272, + "step": 1444 + }, + { + "epoch": 0.5775379696243006, + "grad_norm": 24.77426937534188, + "learning_rate": 5e-06, + "loss": 0.5771, + "num_input_tokens_seen": 249747576, + "step": 1445 + }, + { + "epoch": 0.5775379696243006, + "loss": 0.5811995267868042, + "loss_ce": 0.005882172379642725, + "loss_xval": 0.57421875, + "num_input_tokens_seen": 249747576, + "step": 1445 + }, + { + "epoch": 0.5779376498800959, + "grad_norm": 40.53175188685322, + "learning_rate": 5e-06, + "loss": 0.5273, + "num_input_tokens_seen": 249920024, + "step": 1446 + }, + { + "epoch": 0.5779376498800959, + "loss": 0.6215522885322571, + "loss_ce": 0.002655788091942668, + "loss_xval": 0.6171875, + "num_input_tokens_seen": 249920024, + "step": 1446 + }, + { + "epoch": 0.5783373301358913, + "grad_norm": 17.535297794439494, + "learning_rate": 5e-06, + "loss": 0.2833, + "num_input_tokens_seen": 250092904, + "step": 1447 + }, + { + "epoch": 0.5783373301358913, + "loss": 0.3107568621635437, + "loss_ce": 0.0020715624559670687, + "loss_xval": 0.30859375, + "num_input_tokens_seen": 250092904, + "step": 1447 + }, + { + "epoch": 0.5787370103916867, + "grad_norm": 59.851621485656516, + "learning_rate": 5e-06, + "loss": 0.3855, + "num_input_tokens_seen": 250266032, + "step": 1448 + }, + { + "epoch": 0.5787370103916867, + "loss": 0.254830002784729, + "loss_ce": 0.008156410418450832, + "loss_xval": 0.2470703125, + "num_input_tokens_seen": 250266032, + "step": 1448 + }, + { + "epoch": 0.579136690647482, + "grad_norm": 56.40854683309498, + "learning_rate": 5e-06, + "loss": 0.6026, + "num_input_tokens_seen": 250438480, + "step": 1449 + }, + { + "epoch": 0.579136690647482, + "loss": 0.7701046466827393, + "loss_ce": 0.004235545638948679, + "loss_xval": 0.765625, + "num_input_tokens_seen": 250438480, + "step": 1449 + }, + { + "epoch": 0.5795363709032774, + "grad_norm": 17.60230541103847, + "learning_rate": 5e-06, + "loss": 0.4824, + "num_input_tokens_seen": 250611272, + "step": 1450 + }, + { + "epoch": 0.5795363709032774, + "loss": 0.6768176555633545, + "loss_ce": 0.0039049754850566387, + "loss_xval": 0.671875, + "num_input_tokens_seen": 250611272, + "step": 1450 + }, + { + "epoch": 0.5799360511590728, + "grad_norm": 46.39964305071812, + "learning_rate": 5e-06, + "loss": 0.5641, + "num_input_tokens_seen": 250784048, + "step": 1451 + }, + { + "epoch": 0.5799360511590728, + "loss": 0.11980067938566208, + "loss_ce": 0.012394066900014877, + "loss_xval": 0.107421875, + "num_input_tokens_seen": 250784048, + "step": 1451 + }, + { + "epoch": 0.580335731414868, + "grad_norm": 31.255334541299135, + "learning_rate": 5e-06, + "loss": 0.2784, + "num_input_tokens_seen": 250957296, + "step": 1452 + }, + { + "epoch": 0.580335731414868, + "loss": 0.31082266569137573, + "loss_ce": 0.0031139145139604807, + "loss_xval": 0.30859375, + "num_input_tokens_seen": 250957296, + "step": 1452 + }, + { + "epoch": 0.5807354116706634, + "grad_norm": 77.28904921192597, + "learning_rate": 5e-06, + "loss": 0.3999, + "num_input_tokens_seen": 251130488, + "step": 1453 + }, + { + "epoch": 0.5807354116706634, + "loss": 0.2975703477859497, + "loss_ce": 0.0013362220488488674, + "loss_xval": 0.296875, + "num_input_tokens_seen": 251130488, + "step": 1453 + }, + { + "epoch": 0.5811350919264588, + "grad_norm": 19.237682772485503, + "learning_rate": 
5e-06, + "loss": 0.5526, + "num_input_tokens_seen": 251303224, + "step": 1454 + }, + { + "epoch": 0.5811350919264588, + "loss": 0.4594392478466034, + "loss_ce": 0.00637528020888567, + "loss_xval": 0.453125, + "num_input_tokens_seen": 251303224, + "step": 1454 + }, + { + "epoch": 0.5815347721822542, + "grad_norm": 86.7880986943712, + "learning_rate": 5e-06, + "loss": 0.5105, + "num_input_tokens_seen": 251476032, + "step": 1455 + }, + { + "epoch": 0.5815347721822542, + "loss": 0.6088451147079468, + "loss_ce": 0.01717032864689827, + "loss_xval": 0.58984375, + "num_input_tokens_seen": 251476032, + "step": 1455 + }, + { + "epoch": 0.5819344524380495, + "grad_norm": 47.39222562187567, + "learning_rate": 5e-06, + "loss": 0.5009, + "num_input_tokens_seen": 251649088, + "step": 1456 + }, + { + "epoch": 0.5819344524380495, + "loss": 0.3581160306930542, + "loss_ce": 0.0064924792386591434, + "loss_xval": 0.3515625, + "num_input_tokens_seen": 251649088, + "step": 1456 + }, + { + "epoch": 0.5823341326938449, + "grad_norm": 60.79443614481706, + "learning_rate": 5e-06, + "loss": 0.419, + "num_input_tokens_seen": 251822128, + "step": 1457 + }, + { + "epoch": 0.5823341326938449, + "loss": 0.4279427230358124, + "loss_ce": 0.002863375935703516, + "loss_xval": 0.42578125, + "num_input_tokens_seen": 251822128, + "step": 1457 + }, + { + "epoch": 0.5827338129496403, + "grad_norm": 68.2522669779495, + "learning_rate": 5e-06, + "loss": 0.3529, + "num_input_tokens_seen": 251994680, + "step": 1458 + }, + { + "epoch": 0.5827338129496403, + "loss": 0.11519064009189606, + "loss_ce": 0.006273405160754919, + "loss_xval": 0.10888671875, + "num_input_tokens_seen": 251994680, + "step": 1458 + }, + { + "epoch": 0.5831334932054356, + "grad_norm": 23.09936882583252, + "learning_rate": 5e-06, + "loss": 0.4876, + "num_input_tokens_seen": 252167616, + "step": 1459 + }, + { + "epoch": 0.5831334932054356, + "loss": 0.35358744859695435, + "loss_ce": 0.002971008885651827, + "loss_xval": 0.3515625, + "num_input_tokens_seen": 252167616, + "step": 1459 + }, + { + "epoch": 0.583533173461231, + "grad_norm": 101.93791119936537, + "learning_rate": 5e-06, + "loss": 0.4939, + "num_input_tokens_seen": 252340704, + "step": 1460 + }, + { + "epoch": 0.583533173461231, + "loss": 0.5895069241523743, + "loss_ce": 0.0047748456709086895, + "loss_xval": 0.5859375, + "num_input_tokens_seen": 252340704, + "step": 1460 + }, + { + "epoch": 0.5839328537170264, + "grad_norm": 50.2395324965491, + "learning_rate": 5e-06, + "loss": 0.4909, + "num_input_tokens_seen": 252513632, + "step": 1461 + }, + { + "epoch": 0.5839328537170264, + "loss": 0.3211362957954407, + "loss_ce": 0.0036619282327592373, + "loss_xval": 0.318359375, + "num_input_tokens_seen": 252513632, + "step": 1461 + }, + { + "epoch": 0.5843325339728217, + "grad_norm": 71.7569870488973, + "learning_rate": 5e-06, + "loss": 0.5136, + "num_input_tokens_seen": 252686728, + "step": 1462 + }, + { + "epoch": 0.5843325339728217, + "loss": 0.6344008445739746, + "loss_ce": 0.003266814863309264, + "loss_xval": 0.6328125, + "num_input_tokens_seen": 252686728, + "step": 1462 + }, + { + "epoch": 0.5847322142286171, + "grad_norm": 39.298202628131506, + "learning_rate": 5e-06, + "loss": 0.4078, + "num_input_tokens_seen": 252859736, + "step": 1463 + }, + { + "epoch": 0.5847322142286171, + "loss": 0.33668771386146545, + "loss_ce": 0.00300851883366704, + "loss_xval": 0.333984375, + "num_input_tokens_seen": 252859736, + "step": 1463 + }, + { + "epoch": 0.5851318944844125, + "grad_norm": 94.10269985619064, + 
"learning_rate": 5e-06, + "loss": 0.5931, + "num_input_tokens_seen": 253032736, + "step": 1464 + }, + { + "epoch": 0.5851318944844125, + "loss": 0.6349042654037476, + "loss_ce": 0.004136495292186737, + "loss_xval": 0.62890625, + "num_input_tokens_seen": 253032736, + "step": 1464 + }, + { + "epoch": 0.5855315747402078, + "grad_norm": 88.35693572031727, + "learning_rate": 5e-06, + "loss": 0.3551, + "num_input_tokens_seen": 253205312, + "step": 1465 + }, + { + "epoch": 0.5855315747402078, + "loss": 0.28992077708244324, + "loss_ce": 0.0015906940679997206, + "loss_xval": 0.2890625, + "num_input_tokens_seen": 253205312, + "step": 1465 + }, + { + "epoch": 0.5859312549960032, + "grad_norm": 89.76469496150095, + "learning_rate": 5e-06, + "loss": 0.2787, + "num_input_tokens_seen": 253378672, + "step": 1466 + }, + { + "epoch": 0.5859312549960032, + "loss": 0.3085545301437378, + "loss_ce": 0.005087736062705517, + "loss_xval": 0.302734375, + "num_input_tokens_seen": 253378672, + "step": 1466 + }, + { + "epoch": 0.5863309352517986, + "grad_norm": 41.82595179045434, + "learning_rate": 5e-06, + "loss": 0.393, + "num_input_tokens_seen": 253551688, + "step": 1467 + }, + { + "epoch": 0.5863309352517986, + "loss": 0.3362312614917755, + "loss_ce": 0.00755694042891264, + "loss_xval": 0.328125, + "num_input_tokens_seen": 253551688, + "step": 1467 + }, + { + "epoch": 0.586730615507594, + "grad_norm": 79.00064925393347, + "learning_rate": 5e-06, + "loss": 0.6256, + "num_input_tokens_seen": 253725080, + "step": 1468 + }, + { + "epoch": 0.586730615507594, + "loss": 0.5980717539787292, + "loss_ce": 0.004138659685850143, + "loss_xval": 0.59375, + "num_input_tokens_seen": 253725080, + "step": 1468 + }, + { + "epoch": 0.5871302957633893, + "grad_norm": 84.44247148635479, + "learning_rate": 5e-06, + "loss": 0.504, + "num_input_tokens_seen": 253898112, + "step": 1469 + }, + { + "epoch": 0.5871302957633893, + "loss": 0.7288265228271484, + "loss_ce": 0.0033321240916848183, + "loss_xval": 0.7265625, + "num_input_tokens_seen": 253898112, + "step": 1469 + }, + { + "epoch": 0.5875299760191847, + "grad_norm": 26.04118376928227, + "learning_rate": 5e-06, + "loss": 0.6305, + "num_input_tokens_seen": 254071208, + "step": 1470 + }, + { + "epoch": 0.5875299760191847, + "loss": 0.39779388904571533, + "loss_ce": 0.011380329728126526, + "loss_xval": 0.38671875, + "num_input_tokens_seen": 254071208, + "step": 1470 + }, + { + "epoch": 0.5879296562749801, + "grad_norm": 36.18494670134758, + "learning_rate": 5e-06, + "loss": 0.3893, + "num_input_tokens_seen": 254244272, + "step": 1471 + }, + { + "epoch": 0.5879296562749801, + "loss": 0.29755258560180664, + "loss_ce": 0.0026612617075443268, + "loss_xval": 0.294921875, + "num_input_tokens_seen": 254244272, + "step": 1471 + }, + { + "epoch": 0.5883293365307753, + "grad_norm": 57.43211670376443, + "learning_rate": 5e-06, + "loss": 0.5984, + "num_input_tokens_seen": 254416960, + "step": 1472 + }, + { + "epoch": 0.5883293365307753, + "loss": 0.510471522808075, + "loss_ce": 0.006870460696518421, + "loss_xval": 0.50390625, + "num_input_tokens_seen": 254416960, + "step": 1472 + }, + { + "epoch": 0.5887290167865707, + "grad_norm": 39.33826844537321, + "learning_rate": 5e-06, + "loss": 0.6617, + "num_input_tokens_seen": 254589784, + "step": 1473 + }, + { + "epoch": 0.5887290167865707, + "loss": 0.5638649463653564, + "loss_ce": 0.004691338166594505, + "loss_xval": 0.55859375, + "num_input_tokens_seen": 254589784, + "step": 1473 + }, + { + "epoch": 0.5891286970423661, + "grad_norm": 53.51753517382941, + 
"learning_rate": 5e-06, + "loss": 0.4919, + "num_input_tokens_seen": 254762816, + "step": 1474 + }, + { + "epoch": 0.5891286970423661, + "loss": 0.4380595088005066, + "loss_ce": 0.0016886851517483592, + "loss_xval": 0.435546875, + "num_input_tokens_seen": 254762816, + "step": 1474 + }, + { + "epoch": 0.5895283772981614, + "grad_norm": 47.60334876034079, + "learning_rate": 5e-06, + "loss": 0.7327, + "num_input_tokens_seen": 254936072, + "step": 1475 + }, + { + "epoch": 0.5895283772981614, + "loss": 0.9281734228134155, + "loss_ce": 0.00651207473129034, + "loss_xval": 0.921875, + "num_input_tokens_seen": 254936072, + "step": 1475 + }, + { + "epoch": 0.5899280575539568, + "grad_norm": 42.283179440632075, + "learning_rate": 5e-06, + "loss": 0.3088, + "num_input_tokens_seen": 255108728, + "step": 1476 + }, + { + "epoch": 0.5899280575539568, + "loss": 0.17988061904907227, + "loss_ce": 0.0028633992187678814, + "loss_xval": 0.1767578125, + "num_input_tokens_seen": 255108728, + "step": 1476 + }, + { + "epoch": 0.5903277378097522, + "grad_norm": 56.89112241926728, + "learning_rate": 5e-06, + "loss": 0.4516, + "num_input_tokens_seen": 255281672, + "step": 1477 + }, + { + "epoch": 0.5903277378097522, + "loss": 0.535220742225647, + "loss_ce": 0.0025211526080965996, + "loss_xval": 0.53125, + "num_input_tokens_seen": 255281672, + "step": 1477 + }, + { + "epoch": 0.5907274180655475, + "grad_norm": 26.05614758054613, + "learning_rate": 5e-06, + "loss": 0.675, + "num_input_tokens_seen": 255454832, + "step": 1478 + }, + { + "epoch": 0.5907274180655475, + "loss": 0.8795278072357178, + "loss_ce": 0.0042531476356089115, + "loss_xval": 0.875, + "num_input_tokens_seen": 255454832, + "step": 1478 + }, + { + "epoch": 0.5911270983213429, + "grad_norm": 82.8948565903926, + "learning_rate": 5e-06, + "loss": 0.4564, + "num_input_tokens_seen": 255627888, + "step": 1479 + }, + { + "epoch": 0.5911270983213429, + "loss": 0.3429642617702484, + "loss_ce": 0.0021439511328935623, + "loss_xval": 0.33984375, + "num_input_tokens_seen": 255627888, + "step": 1479 + }, + { + "epoch": 0.5915267785771383, + "grad_norm": 74.3533568254218, + "learning_rate": 5e-06, + "loss": 0.436, + "num_input_tokens_seen": 255800960, + "step": 1480 + }, + { + "epoch": 0.5915267785771383, + "loss": 0.27249640226364136, + "loss_ce": 0.005421818234026432, + "loss_xval": 0.267578125, + "num_input_tokens_seen": 255800960, + "step": 1480 + }, + { + "epoch": 0.5919264588329336, + "grad_norm": 100.93013388274387, + "learning_rate": 5e-06, + "loss": 0.911, + "num_input_tokens_seen": 255974224, + "step": 1481 + }, + { + "epoch": 0.5919264588329336, + "loss": 1.2054524421691895, + "loss_ce": 0.005745388101786375, + "loss_xval": 1.203125, + "num_input_tokens_seen": 255974224, + "step": 1481 + }, + { + "epoch": 0.592326139088729, + "grad_norm": 30.404189839395876, + "learning_rate": 5e-06, + "loss": 0.4284, + "num_input_tokens_seen": 256146968, + "step": 1482 + }, + { + "epoch": 0.592326139088729, + "loss": 0.45736631751060486, + "loss_ce": 0.011336689814925194, + "loss_xval": 0.4453125, + "num_input_tokens_seen": 256146968, + "step": 1482 + }, + { + "epoch": 0.5927258193445244, + "grad_norm": 86.86243262803453, + "learning_rate": 5e-06, + "loss": 0.5621, + "num_input_tokens_seen": 256319912, + "step": 1483 + }, + { + "epoch": 0.5927258193445244, + "loss": 0.49696576595306396, + "loss_ce": 0.005571682937443256, + "loss_xval": 0.4921875, + "num_input_tokens_seen": 256319912, + "step": 1483 + }, + { + "epoch": 0.5931254996003198, + "grad_norm": 36.87791867548973, + 
"learning_rate": 5e-06, + "loss": 0.6044, + "num_input_tokens_seen": 256492800, + "step": 1484 + }, + { + "epoch": 0.5931254996003198, + "loss": 0.8356503844261169, + "loss_ce": 0.004473670851439238, + "loss_xval": 0.83203125, + "num_input_tokens_seen": 256492800, + "step": 1484 + }, + { + "epoch": 0.5935251798561151, + "grad_norm": 56.240908798030695, + "learning_rate": 5e-06, + "loss": 0.4727, + "num_input_tokens_seen": 256665936, + "step": 1485 + }, + { + "epoch": 0.5935251798561151, + "loss": 0.4955936670303345, + "loss_ce": 0.005145644303411245, + "loss_xval": 0.490234375, + "num_input_tokens_seen": 256665936, + "step": 1485 + }, + { + "epoch": 0.5939248601119105, + "grad_norm": 47.051400925900325, + "learning_rate": 5e-06, + "loss": 0.3654, + "num_input_tokens_seen": 256838816, + "step": 1486 + }, + { + "epoch": 0.5939248601119105, + "loss": 0.33166587352752686, + "loss_ce": 0.002838968764990568, + "loss_xval": 0.328125, + "num_input_tokens_seen": 256838816, + "step": 1486 + }, + { + "epoch": 0.5943245403677059, + "grad_norm": 76.5452662635696, + "learning_rate": 5e-06, + "loss": 0.3362, + "num_input_tokens_seen": 257011728, + "step": 1487 + }, + { + "epoch": 0.5943245403677059, + "loss": 0.16850775480270386, + "loss_ce": 0.002980403369292617, + "loss_xval": 0.166015625, + "num_input_tokens_seen": 257011728, + "step": 1487 + }, + { + "epoch": 0.5947242206235012, + "grad_norm": 35.09571761153484, + "learning_rate": 5e-06, + "loss": 0.462, + "num_input_tokens_seen": 257184728, + "step": 1488 + }, + { + "epoch": 0.5947242206235012, + "loss": 0.5621877312660217, + "loss_ce": 0.0029989101458340883, + "loss_xval": 0.55859375, + "num_input_tokens_seen": 257184728, + "step": 1488 + }, + { + "epoch": 0.5951239008792966, + "grad_norm": 63.285571785786225, + "learning_rate": 5e-06, + "loss": 0.5579, + "num_input_tokens_seen": 257357600, + "step": 1489 + }, + { + "epoch": 0.5951239008792966, + "loss": 0.8287309408187866, + "loss_ce": 0.0077470894902944565, + "loss_xval": 0.8203125, + "num_input_tokens_seen": 257357600, + "step": 1489 + }, + { + "epoch": 0.595523581135092, + "grad_norm": 33.80174599093451, + "learning_rate": 5e-06, + "loss": 0.3899, + "num_input_tokens_seen": 257530392, + "step": 1490 + }, + { + "epoch": 0.595523581135092, + "loss": 0.15278327465057373, + "loss_ce": 0.003247134620323777, + "loss_xval": 0.1494140625, + "num_input_tokens_seen": 257530392, + "step": 1490 + }, + { + "epoch": 0.5959232613908872, + "grad_norm": 38.413696017178964, + "learning_rate": 5e-06, + "loss": 0.3459, + "num_input_tokens_seen": 257703408, + "step": 1491 + }, + { + "epoch": 0.5959232613908872, + "loss": 0.3699941635131836, + "loss_ce": 0.004729264881461859, + "loss_xval": 0.365234375, + "num_input_tokens_seen": 257703408, + "step": 1491 + }, + { + "epoch": 0.5963229416466826, + "grad_norm": 40.28456049137768, + "learning_rate": 5e-06, + "loss": 0.314, + "num_input_tokens_seen": 257876240, + "step": 1492 + }, + { + "epoch": 0.5963229416466826, + "loss": 0.2647836208343506, + "loss_ce": 0.005323182325810194, + "loss_xval": 0.259765625, + "num_input_tokens_seen": 257876240, + "step": 1492 + }, + { + "epoch": 0.596722621902478, + "grad_norm": 50.27059541024263, + "learning_rate": 5e-06, + "loss": 0.4964, + "num_input_tokens_seen": 258049304, + "step": 1493 + }, + { + "epoch": 0.596722621902478, + "loss": 0.45188143849372864, + "loss_ce": 0.004188567399978638, + "loss_xval": 0.447265625, + "num_input_tokens_seen": 258049304, + "step": 1493 + }, + { + "epoch": 0.5971223021582733, + "grad_norm": 
20.64882540231887, + "learning_rate": 5e-06, + "loss": 0.5905, + "num_input_tokens_seen": 258222536, + "step": 1494 + }, + { + "epoch": 0.5971223021582733, + "loss": 0.8253822326660156, + "loss_ce": 0.0019264371367171407, + "loss_xval": 0.82421875, + "num_input_tokens_seen": 258222536, + "step": 1494 + }, + { + "epoch": 0.5975219824140687, + "grad_norm": 28.749252671557343, + "learning_rate": 5e-06, + "loss": 0.4295, + "num_input_tokens_seen": 258395280, + "step": 1495 + }, + { + "epoch": 0.5975219824140687, + "loss": 0.3758857250213623, + "loss_ce": 0.002136970404535532, + "loss_xval": 0.373046875, + "num_input_tokens_seen": 258395280, + "step": 1495 + }, + { + "epoch": 0.5979216626698641, + "grad_norm": 28.63136497084886, + "learning_rate": 5e-06, + "loss": 0.333, + "num_input_tokens_seen": 258568496, + "step": 1496 + }, + { + "epoch": 0.5979216626698641, + "loss": 0.31825536489486694, + "loss_ce": 0.0036496452521532774, + "loss_xval": 0.314453125, + "num_input_tokens_seen": 258568496, + "step": 1496 + }, + { + "epoch": 0.5983213429256595, + "grad_norm": 43.51054271549945, + "learning_rate": 5e-06, + "loss": 0.32, + "num_input_tokens_seen": 258741448, + "step": 1497 + }, + { + "epoch": 0.5983213429256595, + "loss": 0.26728811860084534, + "loss_ce": 0.008239655755460262, + "loss_xval": 0.259765625, + "num_input_tokens_seen": 258741448, + "step": 1497 + }, + { + "epoch": 0.5987210231814548, + "grad_norm": 53.32489405720729, + "learning_rate": 5e-06, + "loss": 0.3977, + "num_input_tokens_seen": 258914352, + "step": 1498 + }, + { + "epoch": 0.5987210231814548, + "loss": 0.23198693990707397, + "loss_ce": 0.004524178337305784, + "loss_xval": 0.2275390625, + "num_input_tokens_seen": 258914352, + "step": 1498 + }, + { + "epoch": 0.5991207034372502, + "grad_norm": 19.333266417797198, + "learning_rate": 5e-06, + "loss": 0.3824, + "num_input_tokens_seen": 259087304, + "step": 1499 + }, + { + "epoch": 0.5991207034372502, + "loss": 0.5503689050674438, + "loss_ce": 0.01039084792137146, + "loss_xval": 0.5390625, + "num_input_tokens_seen": 259087304, + "step": 1499 + }, + { + "epoch": 0.5995203836930456, + "grad_norm": 88.26003375881355, + "learning_rate": 5e-06, + "loss": 0.5661, + "num_input_tokens_seen": 259260416, + "step": 1500 + }, + { + "epoch": 0.5995203836930456, + "eval_websight_new_IoU": 0.3115440905094147, + "eval_websight_new_MAE_all": 0.020920580253005028, + "eval_websight_new_MAE_h": 0.017279735766351223, + "eval_websight_new_MAE_w": 0.03733859211206436, + "eval_websight_new_MAE_x": 0.012794057838618755, + "eval_websight_new_MAE_y": 0.016269936691969633, + "eval_websight_new_NUM_probability": 0.9875062704086304, + "eval_websight_new_inside_bbox": 0.6961805522441864, + "eval_websight_new_loss": 0.07518891245126724, + "eval_websight_new_loss_ce": 0.001353644474875182, + "eval_websight_new_loss_xval": 0.070343017578125, + "eval_websight_new_runtime": 58.5335, + "eval_websight_new_samples_per_second": 0.854, + "eval_websight_new_steps_per_second": 0.034, + "num_input_tokens_seen": 259260416, + "step": 1500 + }, + { + "epoch": 0.5995203836930456, + "eval_seeclick_IoU": 0.27675844728946686, + "eval_seeclick_MAE_all": 0.0651068165898323, + "eval_seeclick_MAE_h": 0.027694360353052616, + "eval_seeclick_MAE_w": 0.08417735807597637, + "eval_seeclick_MAE_x": 0.08383799344301224, + "eval_seeclick_MAE_y": 0.06471756100654602, + "eval_seeclick_NUM_probability": 0.986900806427002, + "eval_seeclick_inside_bbox": 0.4756944477558136, + "eval_seeclick_loss": 1.40971040725708, + "eval_seeclick_loss_ce": 
0.013312608003616333, + "eval_seeclick_loss_xval": 1.2222900390625, + "eval_seeclick_runtime": 81.445, + "eval_seeclick_samples_per_second": 0.614, + "eval_seeclick_steps_per_second": 0.025, + "num_input_tokens_seen": 259260416, + "step": 1500 + }, + { + "epoch": 0.5995203836930456, + "eval_icons_IoU": 0.10929679498076439, + "eval_icons_MAE_all": 0.01885663205757737, + "eval_icons_MAE_h": 0.012082248460501432, + "eval_icons_MAE_w": 0.009315322153270245, + "eval_icons_MAE_x": 0.026283184066414833, + "eval_icons_MAE_y": 0.027745775878429413, + "eval_icons_NUM_probability": 0.9871878027915955, + "eval_icons_inside_bbox": 0.2795138955116272, + "eval_icons_loss": 0.10837095975875854, + "eval_icons_loss_ce": 0.001372582628391683, + "eval_icons_loss_xval": 0.08797454833984375, + "eval_icons_runtime": 89.9202, + "eval_icons_samples_per_second": 0.556, + "eval_icons_steps_per_second": 0.022, + "num_input_tokens_seen": 259260416, + "step": 1500 + } + ], + "logging_steps": 1.0, + "max_steps": 7506, + "num_input_tokens_seen": 259260416, + "num_train_epochs": 3, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1622802757648384.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}
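
The checkpoint above is plain Trainer state: a `log_history` array of per-step training entries (those carrying `grad_norm`/`learning_rate`) interleaved with per-benchmark `eval_*` entries, plus top-level run metadata (`max_steps`, `num_input_tokens_seen`, `total_flos`, and so on). A minimal sketch of how one might read it back and summarize the final metrics is below; the file path and the choice of which fields to print are assumptions for illustration, not part of the checkpoint itself.

```python
# Illustrative sketch: load this trainer_state.json and summarize log_history.
# Assumes the JSON has been saved locally as "trainer_state.json" (hypothetical path).
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training-step entries carry "grad_norm"; eval entries carry "eval_*" keys.
train_entries = [e for e in state["log_history"] if "grad_norm" in e]
eval_entries = [e for e in state["log_history"] if any(k.startswith("eval_") for k in e)]

print(f"logged training steps: {len(train_entries)} / max_steps: {state['max_steps']}")
print(f"final training loss:   {train_entries[-1]['loss']:.4f}")
print(f"tokens seen:           {state['num_input_tokens_seen']:,}")

# Latest eval entry per benchmark (websight_new, seeclick, icons at step 1500).
for entry in eval_entries[-3:]:
    iou_keys = [k for k in entry if k.endswith("_IoU")]
    if iou_keys:
        k = iou_keys[0]
        print(f"{k}: {entry[k]:.3f} (step {entry['step']})")
```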