diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,90997 @@ +{ + "best_metric": 0.5200754404067993, + "best_model_checkpoint": "./models/T-lite-it_7B_lora_thinking/checkpoint-129408", + "epoch": 2.8803418803418803, + "eval_steps": 2696, + "global_step": 129408, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00022257834757834758, + "grad_norm": 0.28544881939888, + "learning_rate": 0.00025, + "loss": 0.9123, + "step": 10 + }, + { + "epoch": 0.00044515669515669517, + "grad_norm": 0.3727859556674957, + "learning_rate": 0.0003999999991305469, + "loss": 0.8314, + "step": 20 + }, + { + "epoch": 0.0006677350427350427, + "grad_norm": 0.28690457344055176, + "learning_rate": 0.0003999999893491996, + "loss": 0.6179, + "step": 30 + }, + { + "epoch": 0.0008903133903133903, + "grad_norm": 0.3062066435813904, + "learning_rate": 0.00039999996869968915, + "loss": 0.6463, + "step": 40 + }, + { + "epoch": 0.001112891737891738, + "grad_norm": 0.4153619110584259, + "learning_rate": 0.0003999999371820167, + "loss": 0.648, + "step": 50 + }, + { + "epoch": 0.0013354700854700855, + "grad_norm": 0.3052282929420471, + "learning_rate": 0.0003999998947961839, + "loss": 0.7065, + "step": 60 + }, + { + "epoch": 0.001558048433048433, + "grad_norm": 0.2599051296710968, + "learning_rate": 0.00039999984154219303, + "loss": 0.5866, + "step": 70 + }, + { + "epoch": 0.0017806267806267807, + "grad_norm": 0.3215443193912506, + "learning_rate": 0.0003999997774200471, + "loss": 0.6875, + "step": 80 + }, + { + "epoch": 0.002003205128205128, + "grad_norm": 0.2554830014705658, + "learning_rate": 0.0003999997024297496, + "loss": 0.6784, + "step": 90 + }, + { + "epoch": 0.002225783475783476, + "grad_norm": 0.46018001437187195, + "learning_rate": 0.0003999996165713045, + "loss": 0.8871, + "step": 100 + }, + { + "epoch": 0.002448361823361823, + "grad_norm": 0.42150408029556274, + "learning_rate": 0.0003999995198447165, + "loss": 0.575, + "step": 110 + }, + { + "epoch": 0.002670940170940171, + "grad_norm": 0.38404232263565063, + "learning_rate": 0.0003999994122499908, + "loss": 0.5757, + "step": 120 + }, + { + "epoch": 0.0028935185185185184, + "grad_norm": 0.3114868402481079, + "learning_rate": 0.00039999929378713346, + "loss": 0.572, + "step": 130 + }, + { + "epoch": 0.003116096866096866, + "grad_norm": 0.25811660289764404, + "learning_rate": 0.00039999916445615063, + "loss": 0.8068, + "step": 140 + }, + { + "epoch": 0.0033386752136752135, + "grad_norm": 0.3242238461971283, + "learning_rate": 0.00039999902425704955, + "loss": 0.7357, + "step": 150 + }, + { + "epoch": 0.0035612535612535613, + "grad_norm": 0.24886858463287354, + "learning_rate": 0.00039999887318983775, + "loss": 0.5593, + "step": 160 + }, + { + "epoch": 0.0037838319088319087, + "grad_norm": 0.45792001485824585, + "learning_rate": 0.0003999987112545234, + "loss": 0.6795, + "step": 170 + }, + { + "epoch": 0.004006410256410256, + "grad_norm": 0.4137823283672333, + "learning_rate": 0.00039999853845111544, + "loss": 0.6331, + "step": 180 + }, + { + "epoch": 0.004228988603988604, + "grad_norm": 0.3585183322429657, + "learning_rate": 0.0003999983547796231, + "loss": 0.5823, + "step": 190 + }, + { + "epoch": 0.004451566951566952, + "grad_norm": 0.4130493998527527, + "learning_rate": 0.00039999816024005645, + "loss": 0.6371, + "step": 200 + }, + { + "epoch": 0.004674145299145299, + "grad_norm": 0.5912800431251526, + "learning_rate": 0.0003999979548324261, + "loss": 0.797, + "step": 210 + }, + { + "epoch": 0.004896723646723646, + "grad_norm": 0.32389238476753235, + "learning_rate": 0.0003999977385567431, + "loss": 0.6167, + "step": 220 + }, + { + "epoch": 0.005119301994301995, + "grad_norm": 0.3468959331512451, + "learning_rate": 0.0003999975114130193, + "loss": 0.6044, + "step": 230 + }, + { + "epoch": 0.005341880341880342, + "grad_norm": 0.4953303933143616, + "learning_rate": 0.000399997273401267, + "loss": 0.6288, + "step": 240 + }, + { + "epoch": 0.005564458689458689, + "grad_norm": 0.3117259442806244, + "learning_rate": 0.0003999970245214991, + "loss": 0.5408, + "step": 250 + }, + { + "epoch": 0.005787037037037037, + "grad_norm": 0.39363420009613037, + "learning_rate": 0.0003999967647737292, + "loss": 0.6291, + "step": 260 + }, + { + "epoch": 0.006009615384615385, + "grad_norm": 0.44677507877349854, + "learning_rate": 0.0003999964941579714, + "loss": 0.6248, + "step": 270 + }, + { + "epoch": 0.006232193732193732, + "grad_norm": 0.33922237157821655, + "learning_rate": 0.00039999621267424037, + "loss": 0.654, + "step": 280 + }, + { + "epoch": 0.00645477207977208, + "grad_norm": 0.33551132678985596, + "learning_rate": 0.00039999592032255134, + "loss": 0.5206, + "step": 290 + }, + { + "epoch": 0.006677350427350427, + "grad_norm": 0.3641911745071411, + "learning_rate": 0.00039999561710292035, + "loss": 0.656, + "step": 300 + }, + { + "epoch": 0.006899928774928775, + "grad_norm": 0.5253650546073914, + "learning_rate": 0.00039999530301536377, + "loss": 0.7078, + "step": 310 + }, + { + "epoch": 0.007122507122507123, + "grad_norm": 0.41651082038879395, + "learning_rate": 0.00039999497805989873, + "loss": 0.8011, + "step": 320 + }, + { + "epoch": 0.00734508547008547, + "grad_norm": 0.20987141132354736, + "learning_rate": 0.00039999464223654284, + "loss": 0.724, + "step": 330 + }, + { + "epoch": 0.007567663817663817, + "grad_norm": 0.5447407364845276, + "learning_rate": 0.0003999942955453144, + "loss": 0.6052, + "step": 340 + }, + { + "epoch": 0.007790242165242166, + "grad_norm": 0.37690338492393494, + "learning_rate": 0.00039999393798623216, + "loss": 0.7012, + "step": 350 + }, + { + "epoch": 0.008012820512820512, + "grad_norm": 0.23951241374015808, + "learning_rate": 0.00039999356955931564, + "loss": 0.7919, + "step": 360 + }, + { + "epoch": 0.008235398860398861, + "grad_norm": 0.45175644755363464, + "learning_rate": 0.0003999931902645848, + "loss": 0.6225, + "step": 370 + }, + { + "epoch": 0.008457977207977209, + "grad_norm": 0.4582245647907257, + "learning_rate": 0.0003999928001020603, + "loss": 0.7225, + "step": 380 + }, + { + "epoch": 0.008680555555555556, + "grad_norm": 0.1780846118927002, + "learning_rate": 0.0003999923990717633, + "loss": 0.5654, + "step": 390 + }, + { + "epoch": 0.008903133903133903, + "grad_norm": 0.5246115326881409, + "learning_rate": 0.00039999198717371564, + "loss": 0.6064, + "step": 400 + }, + { + "epoch": 0.00912571225071225, + "grad_norm": 0.5305804014205933, + "learning_rate": 0.0003999915644079397, + "loss": 0.5286, + "step": 410 + }, + { + "epoch": 0.009348290598290598, + "grad_norm": 0.45081108808517456, + "learning_rate": 0.00039999113077445847, + "loss": 0.7647, + "step": 420 + }, + { + "epoch": 0.009570868945868945, + "grad_norm": 0.2678021788597107, + "learning_rate": 0.0003999906862732954, + "loss": 0.7105, + "step": 430 + }, + { + "epoch": 0.009793447293447293, + "grad_norm": 0.4171365797519684, + "learning_rate": 0.0003999902309044747, + "loss": 0.5313, + "step": 440 + }, + { + "epoch": 0.010016025641025642, + "grad_norm": 0.3753603994846344, + "learning_rate": 0.0003999897646680212, + "loss": 0.6744, + "step": 450 + }, + { + "epoch": 0.01023860398860399, + "grad_norm": 0.533179759979248, + "learning_rate": 0.0003999892875639601, + "loss": 0.528, + "step": 460 + }, + { + "epoch": 0.010461182336182337, + "grad_norm": 0.3925873041152954, + "learning_rate": 0.0003999887995923174, + "loss": 0.6925, + "step": 470 + }, + { + "epoch": 0.010683760683760684, + "grad_norm": 0.3100382089614868, + "learning_rate": 0.0003999883007531196, + "loss": 0.58, + "step": 480 + }, + { + "epoch": 0.010906339031339031, + "grad_norm": 0.3338608145713806, + "learning_rate": 0.0003999877910463939, + "loss": 0.533, + "step": 490 + }, + { + "epoch": 0.011128917378917379, + "grad_norm": 0.2950505018234253, + "learning_rate": 0.00039998727047216786, + "loss": 0.6492, + "step": 500 + }, + { + "epoch": 0.011351495726495726, + "grad_norm": 0.4604972004890442, + "learning_rate": 0.0003999867390304698, + "loss": 0.5994, + "step": 510 + }, + { + "epoch": 0.011574074074074073, + "grad_norm": 0.46998804807662964, + "learning_rate": 0.00039998619672132864, + "loss": 0.6596, + "step": 520 + }, + { + "epoch": 0.01179665242165242, + "grad_norm": 0.3429502248764038, + "learning_rate": 0.0003999856435447739, + "loss": 0.6498, + "step": 530 + }, + { + "epoch": 0.01201923076923077, + "grad_norm": 0.45733514428138733, + "learning_rate": 0.00039998507950083555, + "loss": 0.7259, + "step": 540 + }, + { + "epoch": 0.012241809116809117, + "grad_norm": 0.4332731068134308, + "learning_rate": 0.0003999845045895442, + "loss": 0.7316, + "step": 550 + }, + { + "epoch": 0.012464387464387465, + "grad_norm": 0.32235032320022583, + "learning_rate": 0.0003999839188109312, + "loss": 0.6579, + "step": 560 + }, + { + "epoch": 0.012686965811965812, + "grad_norm": 0.4269620180130005, + "learning_rate": 0.00039998332216502835, + "loss": 0.5983, + "step": 570 + }, + { + "epoch": 0.01290954415954416, + "grad_norm": 0.3818014860153198, + "learning_rate": 0.000399982714651868, + "loss": 0.7129, + "step": 580 + }, + { + "epoch": 0.013132122507122507, + "grad_norm": 0.31318408250808716, + "learning_rate": 0.0003999820962714833, + "loss": 0.6421, + "step": 590 + }, + { + "epoch": 0.013354700854700854, + "grad_norm": 0.5945875644683838, + "learning_rate": 0.00039998146702390776, + "loss": 0.7313, + "step": 600 + }, + { + "epoch": 0.013577279202279201, + "grad_norm": 0.3725283443927765, + "learning_rate": 0.00039998082690917555, + "loss": 0.6931, + "step": 610 + }, + { + "epoch": 0.01379985754985755, + "grad_norm": 0.415237694978714, + "learning_rate": 0.0003999801759273215, + "loss": 0.5633, + "step": 620 + }, + { + "epoch": 0.014022435897435898, + "grad_norm": 0.4613287150859833, + "learning_rate": 0.00039997951407838106, + "loss": 0.6357, + "step": 630 + }, + { + "epoch": 0.014245014245014245, + "grad_norm": 0.41351374983787537, + "learning_rate": 0.00039997884136238997, + "loss": 0.6294, + "step": 640 + }, + { + "epoch": 0.014467592592592593, + "grad_norm": 0.7268733382225037, + "learning_rate": 0.00039997815777938504, + "loss": 0.7595, + "step": 650 + }, + { + "epoch": 0.01469017094017094, + "grad_norm": 0.355999618768692, + "learning_rate": 0.0003999774633294033, + "loss": 0.5714, + "step": 660 + }, + { + "epoch": 0.014912749287749287, + "grad_norm": 0.8212071657180786, + "learning_rate": 0.00039997675801248245, + "loss": 0.6492, + "step": 670 + }, + { + "epoch": 0.015135327635327635, + "grad_norm": 0.47924473881721497, + "learning_rate": 0.00039997604182866084, + "loss": 0.5294, + "step": 680 + }, + { + "epoch": 0.015357905982905982, + "grad_norm": 0.4293022155761719, + "learning_rate": 0.00039997531477797745, + "loss": 0.5905, + "step": 690 + }, + { + "epoch": 0.015580484330484331, + "grad_norm": 0.526131808757782, + "learning_rate": 0.00039997457686047176, + "loss": 0.6094, + "step": 700 + }, + { + "epoch": 0.01580306267806268, + "grad_norm": 0.5501940250396729, + "learning_rate": 0.0003999738280761839, + "loss": 0.6805, + "step": 710 + }, + { + "epoch": 0.016025641025641024, + "grad_norm": 0.7406270503997803, + "learning_rate": 0.0003999730684251544, + "loss": 0.6279, + "step": 720 + }, + { + "epoch": 0.016248219373219373, + "grad_norm": 0.4423774778842926, + "learning_rate": 0.0003999722979074247, + "loss": 0.6017, + "step": 730 + }, + { + "epoch": 0.016470797720797722, + "grad_norm": 0.5101915001869202, + "learning_rate": 0.0003999715165230367, + "loss": 0.6441, + "step": 740 + }, + { + "epoch": 0.016693376068376068, + "grad_norm": 0.24923740327358246, + "learning_rate": 0.0003999707242720327, + "loss": 0.6781, + "step": 750 + }, + { + "epoch": 0.016915954415954417, + "grad_norm": 0.6661295294761658, + "learning_rate": 0.00039996992115445585, + "loss": 0.5248, + "step": 760 + }, + { + "epoch": 0.017138532763532763, + "grad_norm": 0.3601098954677582, + "learning_rate": 0.00039996910717034976, + "loss": 0.6236, + "step": 770 + }, + { + "epoch": 0.017361111111111112, + "grad_norm": 0.43522101640701294, + "learning_rate": 0.0003999682823197588, + "loss": 0.6272, + "step": 780 + }, + { + "epoch": 0.017583689458689458, + "grad_norm": 0.6393892168998718, + "learning_rate": 0.0003999674466027276, + "loss": 0.6327, + "step": 790 + }, + { + "epoch": 0.017806267806267807, + "grad_norm": 0.5061392188072205, + "learning_rate": 0.0003999666000193016, + "loss": 0.611, + "step": 800 + }, + { + "epoch": 0.018028846153846152, + "grad_norm": 0.3880053460597992, + "learning_rate": 0.00039996574256952693, + "loss": 0.6651, + "step": 810 + }, + { + "epoch": 0.0182514245014245, + "grad_norm": 0.45905396342277527, + "learning_rate": 0.00039996487425345006, + "loss": 0.6382, + "step": 820 + }, + { + "epoch": 0.01847400284900285, + "grad_norm": 0.5195857882499695, + "learning_rate": 0.0003999639950711183, + "loss": 0.682, + "step": 830 + }, + { + "epoch": 0.018696581196581196, + "grad_norm": 0.4754457473754883, + "learning_rate": 0.00039996310502257926, + "loss": 0.6563, + "step": 840 + }, + { + "epoch": 0.018919159544159545, + "grad_norm": 0.4215100407600403, + "learning_rate": 0.0003999622041078815, + "loss": 0.5943, + "step": 850 + }, + { + "epoch": 0.01914173789173789, + "grad_norm": 0.6068809032440186, + "learning_rate": 0.00039996129232707374, + "loss": 0.7153, + "step": 860 + }, + { + "epoch": 0.01936431623931624, + "grad_norm": 0.41532132029533386, + "learning_rate": 0.00039996036968020576, + "loss": 0.593, + "step": 870 + }, + { + "epoch": 0.019586894586894586, + "grad_norm": 0.5850262641906738, + "learning_rate": 0.00039995943616732754, + "loss": 0.5694, + "step": 880 + }, + { + "epoch": 0.019809472934472935, + "grad_norm": 0.4964942932128906, + "learning_rate": 0.0003999584917884899, + "loss": 0.6062, + "step": 890 + }, + { + "epoch": 0.020032051282051284, + "grad_norm": 0.6453270316123962, + "learning_rate": 0.0003999575365437441, + "loss": 0.627, + "step": 900 + }, + { + "epoch": 0.02025462962962963, + "grad_norm": 0.4876425266265869, + "learning_rate": 0.00039995657043314205, + "loss": 0.6849, + "step": 910 + }, + { + "epoch": 0.02047720797720798, + "grad_norm": 0.5182598829269409, + "learning_rate": 0.0003999555934567363, + "loss": 0.7098, + "step": 920 + }, + { + "epoch": 0.020699786324786324, + "grad_norm": 0.3465155065059662, + "learning_rate": 0.0003999546056145799, + "loss": 0.6095, + "step": 930 + }, + { + "epoch": 0.020922364672364673, + "grad_norm": 0.6155537366867065, + "learning_rate": 0.00039995360690672657, + "loss": 0.5103, + "step": 940 + }, + { + "epoch": 0.02114494301994302, + "grad_norm": 0.6999111771583557, + "learning_rate": 0.0003999525973332306, + "loss": 0.7678, + "step": 950 + }, + { + "epoch": 0.021367521367521368, + "grad_norm": 0.2875516414642334, + "learning_rate": 0.00039995157689414666, + "loss": 0.6324, + "step": 960 + }, + { + "epoch": 0.021590099715099714, + "grad_norm": 0.6682087182998657, + "learning_rate": 0.00039995054558953047, + "loss": 0.6292, + "step": 970 + }, + { + "epoch": 0.021812678062678063, + "grad_norm": 0.63261479139328, + "learning_rate": 0.00039994950341943784, + "loss": 0.6469, + "step": 980 + }, + { + "epoch": 0.022035256410256412, + "grad_norm": 0.5575029253959656, + "learning_rate": 0.0003999484503839256, + "loss": 0.5626, + "step": 990 + }, + { + "epoch": 0.022257834757834757, + "grad_norm": 0.5150830149650574, + "learning_rate": 0.00039994738648305086, + "loss": 0.5915, + "step": 1000 + }, + { + "epoch": 0.022480413105413107, + "grad_norm": 0.5310041904449463, + "learning_rate": 0.0003999463117168714, + "loss": 0.5757, + "step": 1010 + }, + { + "epoch": 0.022702991452991452, + "grad_norm": 0.31499940156936646, + "learning_rate": 0.0003999452260854457, + "loss": 0.7014, + "step": 1020 + }, + { + "epoch": 0.0229255698005698, + "grad_norm": 0.5305114984512329, + "learning_rate": 0.0003999441295888328, + "loss": 0.6615, + "step": 1030 + }, + { + "epoch": 0.023148148148148147, + "grad_norm": 0.47061678767204285, + "learning_rate": 0.0003999430222270921, + "loss": 0.6299, + "step": 1040 + }, + { + "epoch": 0.023370726495726496, + "grad_norm": 0.49521180987358093, + "learning_rate": 0.000399941904000284, + "loss": 0.6524, + "step": 1050 + }, + { + "epoch": 0.02359330484330484, + "grad_norm": 0.5753868222236633, + "learning_rate": 0.0003999407749084691, + "loss": 0.666, + "step": 1060 + }, + { + "epoch": 0.02381588319088319, + "grad_norm": 0.3605792224407196, + "learning_rate": 0.0003999396349517088, + "loss": 0.4916, + "step": 1070 + }, + { + "epoch": 0.02403846153846154, + "grad_norm": 0.47851887345314026, + "learning_rate": 0.0003999384841300651, + "loss": 0.6972, + "step": 1080 + }, + { + "epoch": 0.024261039886039885, + "grad_norm": 0.4846053719520569, + "learning_rate": 0.00039993732244360047, + "loss": 0.5415, + "step": 1090 + }, + { + "epoch": 0.024483618233618235, + "grad_norm": 0.5065937638282776, + "learning_rate": 0.0003999361498923781, + "loss": 0.6008, + "step": 1100 + }, + { + "epoch": 0.02470619658119658, + "grad_norm": 0.6912859678268433, + "learning_rate": 0.00039993496647646164, + "loss": 0.657, + "step": 1110 + }, + { + "epoch": 0.02492877492877493, + "grad_norm": 0.48633942008018494, + "learning_rate": 0.0003999337721959154, + "loss": 0.7885, + "step": 1120 + }, + { + "epoch": 0.025151353276353275, + "grad_norm": 0.5203970670700073, + "learning_rate": 0.00039993256705080435, + "loss": 0.524, + "step": 1130 + }, + { + "epoch": 0.025373931623931624, + "grad_norm": 0.3965185284614563, + "learning_rate": 0.0003999313510411939, + "loss": 0.5524, + "step": 1140 + }, + { + "epoch": 0.025596509971509973, + "grad_norm": 0.5027601718902588, + "learning_rate": 0.00039993012416715014, + "loss": 0.6776, + "step": 1150 + }, + { + "epoch": 0.02581908831908832, + "grad_norm": 0.5505548119544983, + "learning_rate": 0.00039992888642873984, + "loss": 0.5468, + "step": 1160 + }, + { + "epoch": 0.026041666666666668, + "grad_norm": 0.7258923649787903, + "learning_rate": 0.0003999276378260302, + "loss": 0.6938, + "step": 1170 + }, + { + "epoch": 0.026264245014245013, + "grad_norm": 0.40920066833496094, + "learning_rate": 0.00039992637835908895, + "loss": 0.6654, + "step": 1180 + }, + { + "epoch": 0.026486823361823363, + "grad_norm": 0.4127655327320099, + "learning_rate": 0.00039992510802798465, + "loss": 0.5783, + "step": 1190 + }, + { + "epoch": 0.026709401709401708, + "grad_norm": 0.49130943417549133, + "learning_rate": 0.0003999238268327863, + "loss": 0.6508, + "step": 1200 + }, + { + "epoch": 0.026931980056980057, + "grad_norm": 0.782516360282898, + "learning_rate": 0.00039992253477356363, + "loss": 0.6979, + "step": 1210 + }, + { + "epoch": 0.027154558404558403, + "grad_norm": 0.4363095760345459, + "learning_rate": 0.0003999212318503867, + "loss": 0.5841, + "step": 1220 + }, + { + "epoch": 0.027377136752136752, + "grad_norm": 0.4464375972747803, + "learning_rate": 0.00039991991806332636, + "loss": 0.6695, + "step": 1230 + }, + { + "epoch": 0.0275997150997151, + "grad_norm": 0.6378709673881531, + "learning_rate": 0.000399918593412454, + "loss": 0.7352, + "step": 1240 + }, + { + "epoch": 0.027822293447293447, + "grad_norm": 0.7491675615310669, + "learning_rate": 0.00039991725789784166, + "loss": 0.7471, + "step": 1250 + }, + { + "epoch": 0.028044871794871796, + "grad_norm": 0.54839026927948, + "learning_rate": 0.00039991591151956185, + "loss": 0.6001, + "step": 1260 + }, + { + "epoch": 0.02826745014245014, + "grad_norm": 0.5543126463890076, + "learning_rate": 0.00039991455427768773, + "loss": 0.5391, + "step": 1270 + }, + { + "epoch": 0.02849002849002849, + "grad_norm": 0.41751816868782043, + "learning_rate": 0.0003999131861722931, + "loss": 0.7174, + "step": 1280 + }, + { + "epoch": 0.028712606837606836, + "grad_norm": 0.5156871676445007, + "learning_rate": 0.0003999118072034522, + "loss": 0.6834, + "step": 1290 + }, + { + "epoch": 0.028935185185185185, + "grad_norm": 0.7073959112167358, + "learning_rate": 0.0003999104173712401, + "loss": 0.5147, + "step": 1300 + }, + { + "epoch": 0.029157763532763534, + "grad_norm": 0.8129472136497498, + "learning_rate": 0.0003999090166757323, + "loss": 0.6713, + "step": 1310 + }, + { + "epoch": 0.02938034188034188, + "grad_norm": 0.7459490299224854, + "learning_rate": 0.00039990760511700486, + "loss": 0.7992, + "step": 1320 + }, + { + "epoch": 0.02960292022792023, + "grad_norm": 0.5266855955123901, + "learning_rate": 0.00039990618269513447, + "loss": 0.7064, + "step": 1330 + }, + { + "epoch": 0.029825498575498575, + "grad_norm": 0.8832513689994812, + "learning_rate": 0.0003999047494101985, + "loss": 0.526, + "step": 1340 + }, + { + "epoch": 0.030048076923076924, + "grad_norm": 0.5843579769134521, + "learning_rate": 0.0003999033052622748, + "loss": 0.6702, + "step": 1350 + }, + { + "epoch": 0.03027065527065527, + "grad_norm": 0.5423453450202942, + "learning_rate": 0.0003999018502514418, + "loss": 0.6079, + "step": 1360 + }, + { + "epoch": 0.03049323361823362, + "grad_norm": 0.43239280581474304, + "learning_rate": 0.0003999003843777786, + "loss": 0.7386, + "step": 1370 + }, + { + "epoch": 0.030715811965811964, + "grad_norm": 0.43782392144203186, + "learning_rate": 0.00039989890764136494, + "loss": 0.6689, + "step": 1380 + }, + { + "epoch": 0.030938390313390313, + "grad_norm": 0.44703444838523865, + "learning_rate": 0.000399897420042281, + "loss": 0.7488, + "step": 1390 + }, + { + "epoch": 0.031160968660968662, + "grad_norm": 0.6166260242462158, + "learning_rate": 0.0003998959215806075, + "loss": 0.607, + "step": 1400 + }, + { + "epoch": 0.03138354700854701, + "grad_norm": 0.6796806454658508, + "learning_rate": 0.0003998944122564261, + "loss": 0.5879, + "step": 1410 + }, + { + "epoch": 0.03160612535612536, + "grad_norm": 0.6388904452323914, + "learning_rate": 0.00039989289206981857, + "loss": 0.7348, + "step": 1420 + }, + { + "epoch": 0.031828703703703706, + "grad_norm": 0.501086413860321, + "learning_rate": 0.00039989136102086775, + "loss": 0.7234, + "step": 1430 + }, + { + "epoch": 0.03205128205128205, + "grad_norm": 0.7662519216537476, + "learning_rate": 0.0003998898191096567, + "loss": 0.7404, + "step": 1440 + }, + { + "epoch": 0.0322738603988604, + "grad_norm": 0.6648521423339844, + "learning_rate": 0.0003998882663362692, + "loss": 0.5212, + "step": 1450 + }, + { + "epoch": 0.03249643874643875, + "grad_norm": 0.3986724019050598, + "learning_rate": 0.0003998867027007897, + "loss": 0.5945, + "step": 1460 + }, + { + "epoch": 0.032719017094017096, + "grad_norm": 0.5667985677719116, + "learning_rate": 0.00039988512820330317, + "loss": 0.4911, + "step": 1470 + }, + { + "epoch": 0.032941595441595445, + "grad_norm": 0.3411963880062103, + "learning_rate": 0.0003998835428438952, + "loss": 0.5614, + "step": 1480 + }, + { + "epoch": 0.03316417378917379, + "grad_norm": 0.8257679343223572, + "learning_rate": 0.00039988194662265173, + "loss": 0.5847, + "step": 1490 + }, + { + "epoch": 0.033386752136752136, + "grad_norm": 0.6138916015625, + "learning_rate": 0.00039988033953965976, + "loss": 0.791, + "step": 1500 + }, + { + "epoch": 0.033609330484330485, + "grad_norm": 0.7213571071624756, + "learning_rate": 0.00039987872159500645, + "loss": 0.6312, + "step": 1510 + }, + { + "epoch": 0.033831908831908834, + "grad_norm": 0.6440274715423584, + "learning_rate": 0.00039987709278877984, + "loss": 0.5499, + "step": 1520 + }, + { + "epoch": 0.034054487179487176, + "grad_norm": 0.4683380722999573, + "learning_rate": 0.0003998754531210683, + "loss": 0.7107, + "step": 1530 + }, + { + "epoch": 0.034277065527065526, + "grad_norm": 0.44701075553894043, + "learning_rate": 0.0003998738025919611, + "loss": 0.6155, + "step": 1540 + }, + { + "epoch": 0.034499643874643875, + "grad_norm": 0.5694136619567871, + "learning_rate": 0.00039987214120154775, + "loss": 0.6355, + "step": 1550 + }, + { + "epoch": 0.034722222222222224, + "grad_norm": 0.4150397777557373, + "learning_rate": 0.0003998704689499187, + "loss": 0.5468, + "step": 1560 + }, + { + "epoch": 0.03494480056980057, + "grad_norm": 0.45780056715011597, + "learning_rate": 0.0003998687858371647, + "loss": 0.5747, + "step": 1570 + }, + { + "epoch": 0.035167378917378915, + "grad_norm": 0.5170963406562805, + "learning_rate": 0.00039986709186337724, + "loss": 0.4998, + "step": 1580 + }, + { + "epoch": 0.035389957264957264, + "grad_norm": 0.7482688426971436, + "learning_rate": 0.0003998653870286485, + "loss": 0.8305, + "step": 1590 + }, + { + "epoch": 0.03561253561253561, + "grad_norm": 0.585242509841919, + "learning_rate": 0.00039986367133307087, + "loss": 0.6235, + "step": 1600 + }, + { + "epoch": 0.03583511396011396, + "grad_norm": 0.5389675498008728, + "learning_rate": 0.0003998619447767378, + "loss": 0.5928, + "step": 1610 + }, + { + "epoch": 0.036057692307692304, + "grad_norm": 0.6164636611938477, + "learning_rate": 0.0003998602073597431, + "loss": 0.7214, + "step": 1620 + }, + { + "epoch": 0.036280270655270654, + "grad_norm": 0.466166228055954, + "learning_rate": 0.000399858459082181, + "loss": 0.5967, + "step": 1630 + }, + { + "epoch": 0.036502849002849, + "grad_norm": 0.6401296854019165, + "learning_rate": 0.00039985669994414667, + "loss": 0.691, + "step": 1640 + }, + { + "epoch": 0.03672542735042735, + "grad_norm": 0.5248224139213562, + "learning_rate": 0.00039985492994573565, + "loss": 0.6946, + "step": 1650 + }, + { + "epoch": 0.0369480056980057, + "grad_norm": 0.4127870500087738, + "learning_rate": 0.0003998531490870441, + "loss": 0.4733, + "step": 1660 + }, + { + "epoch": 0.03717058404558404, + "grad_norm": 0.47520193457603455, + "learning_rate": 0.0003998513573681689, + "loss": 0.6858, + "step": 1670 + }, + { + "epoch": 0.03739316239316239, + "grad_norm": 0.4704365134239197, + "learning_rate": 0.0003998495547892072, + "loss": 0.5678, + "step": 1680 + }, + { + "epoch": 0.03761574074074074, + "grad_norm": 0.36997130513191223, + "learning_rate": 0.0003998477413502572, + "loss": 0.6287, + "step": 1690 + }, + { + "epoch": 0.03783831908831909, + "grad_norm": 0.7199309468269348, + "learning_rate": 0.0003998459170514173, + "loss": 0.6482, + "step": 1700 + }, + { + "epoch": 0.03806089743589743, + "grad_norm": 0.41155070066452026, + "learning_rate": 0.0003998440818927867, + "loss": 0.6537, + "step": 1710 + }, + { + "epoch": 0.03828347578347578, + "grad_norm": 0.647976815700531, + "learning_rate": 0.00039984223587446506, + "loss": 0.622, + "step": 1720 + }, + { + "epoch": 0.03850605413105413, + "grad_norm": 0.5233225226402283, + "learning_rate": 0.0003998403789965528, + "loss": 0.5125, + "step": 1730 + }, + { + "epoch": 0.03872863247863248, + "grad_norm": 0.7623016834259033, + "learning_rate": 0.0003998385112591506, + "loss": 0.6881, + "step": 1740 + }, + { + "epoch": 0.03895121082621083, + "grad_norm": 0.45694059133529663, + "learning_rate": 0.0003998366326623602, + "loss": 0.6947, + "step": 1750 + }, + { + "epoch": 0.03917378917378917, + "grad_norm": 0.5509689450263977, + "learning_rate": 0.0003998347432062835, + "loss": 0.6702, + "step": 1760 + }, + { + "epoch": 0.03939636752136752, + "grad_norm": 0.5311200022697449, + "learning_rate": 0.00039983284289102334, + "loss": 0.6444, + "step": 1770 + }, + { + "epoch": 0.03961894586894587, + "grad_norm": 0.48745280504226685, + "learning_rate": 0.0003998309317166829, + "loss": 0.5315, + "step": 1780 + }, + { + "epoch": 0.03984152421652422, + "grad_norm": 0.558219313621521, + "learning_rate": 0.000399829009683366, + "loss": 0.7597, + "step": 1790 + }, + { + "epoch": 0.04006410256410257, + "grad_norm": 0.617832362651825, + "learning_rate": 0.0003998270767911772, + "loss": 0.6786, + "step": 1800 + }, + { + "epoch": 0.04028668091168091, + "grad_norm": 0.48355668783187866, + "learning_rate": 0.00039982513304022145, + "loss": 0.6901, + "step": 1810 + }, + { + "epoch": 0.04050925925925926, + "grad_norm": 0.4814782440662384, + "learning_rate": 0.00039982317843060433, + "loss": 0.6997, + "step": 1820 + }, + { + "epoch": 0.04073183760683761, + "grad_norm": 0.5412116646766663, + "learning_rate": 0.00039982121296243216, + "loss": 0.6539, + "step": 1830 + }, + { + "epoch": 0.04095441595441596, + "grad_norm": 0.9898232221603394, + "learning_rate": 0.0003998192366358117, + "loss": 0.6633, + "step": 1840 + }, + { + "epoch": 0.0411769943019943, + "grad_norm": 0.5413815975189209, + "learning_rate": 0.0003998172494508503, + "loss": 0.6422, + "step": 1850 + }, + { + "epoch": 0.04139957264957265, + "grad_norm": 0.6026504635810852, + "learning_rate": 0.000399815251407656, + "loss": 0.6522, + "step": 1860 + }, + { + "epoch": 0.041622150997151, + "grad_norm": 0.490631103515625, + "learning_rate": 0.00039981324250633733, + "loss": 0.5702, + "step": 1870 + }, + { + "epoch": 0.041844729344729346, + "grad_norm": 0.7394275665283203, + "learning_rate": 0.0003998112227470036, + "loss": 0.6785, + "step": 1880 + }, + { + "epoch": 0.042067307692307696, + "grad_norm": 0.689113438129425, + "learning_rate": 0.00039980919212976435, + "loss": 0.6489, + "step": 1890 + }, + { + "epoch": 0.04228988603988604, + "grad_norm": 0.5154189467430115, + "learning_rate": 0.00039980715065473, + "loss": 0.6737, + "step": 1900 + }, + { + "epoch": 0.04251246438746439, + "grad_norm": 0.5986299514770508, + "learning_rate": 0.00039980509832201165, + "loss": 0.7127, + "step": 1910 + }, + { + "epoch": 0.042735042735042736, + "grad_norm": 0.5043966770172119, + "learning_rate": 0.00039980303513172057, + "loss": 0.6557, + "step": 1920 + }, + { + "epoch": 0.042957621082621085, + "grad_norm": 0.5125043988227844, + "learning_rate": 0.000399800961083969, + "loss": 0.6017, + "step": 1930 + }, + { + "epoch": 0.04318019943019943, + "grad_norm": 0.4287809729576111, + "learning_rate": 0.0003997988761788697, + "loss": 0.6154, + "step": 1940 + }, + { + "epoch": 0.043402777777777776, + "grad_norm": 0.4034154415130615, + "learning_rate": 0.00039979678041653587, + "loss": 0.5579, + "step": 1950 + }, + { + "epoch": 0.043625356125356125, + "grad_norm": 0.6922469735145569, + "learning_rate": 0.0003997946737970814, + "loss": 0.7429, + "step": 1960 + }, + { + "epoch": 0.043847934472934474, + "grad_norm": 0.670454204082489, + "learning_rate": 0.00039979255632062086, + "loss": 0.804, + "step": 1970 + }, + { + "epoch": 0.044070512820512824, + "grad_norm": 0.4805451035499573, + "learning_rate": 0.00039979042798726923, + "loss": 0.5674, + "step": 1980 + }, + { + "epoch": 0.044293091168091166, + "grad_norm": 0.3706532418727875, + "learning_rate": 0.00039978828879714217, + "loss": 0.7832, + "step": 1990 + }, + { + "epoch": 0.044515669515669515, + "grad_norm": 0.5441576242446899, + "learning_rate": 0.00039978613875035594, + "loss": 0.6144, + "step": 2000 + }, + { + "epoch": 0.044738247863247864, + "grad_norm": 0.5556431412696838, + "learning_rate": 0.00039978397784702743, + "loss": 0.623, + "step": 2010 + }, + { + "epoch": 0.04496082621082621, + "grad_norm": 0.4374752342700958, + "learning_rate": 0.00039978180608727396, + "loss": 0.601, + "step": 2020 + }, + { + "epoch": 0.045183404558404555, + "grad_norm": 0.5470070242881775, + "learning_rate": 0.00039977962347121363, + "loss": 0.6754, + "step": 2030 + }, + { + "epoch": 0.045405982905982904, + "grad_norm": 0.5512667298316956, + "learning_rate": 0.000399777429998965, + "loss": 0.6927, + "step": 2040 + }, + { + "epoch": 0.04562856125356125, + "grad_norm": 0.491028755903244, + "learning_rate": 0.00039977522567064726, + "loss": 0.5932, + "step": 2050 + }, + { + "epoch": 0.0458511396011396, + "grad_norm": 0.8657106161117554, + "learning_rate": 0.00039977301048638023, + "loss": 0.7108, + "step": 2060 + }, + { + "epoch": 0.04607371794871795, + "grad_norm": 0.5009307265281677, + "learning_rate": 0.00039977078444628427, + "loss": 0.4469, + "step": 2070 + }, + { + "epoch": 0.046296296296296294, + "grad_norm": 0.5697231292724609, + "learning_rate": 0.0003997685475504803, + "loss": 0.731, + "step": 2080 + }, + { + "epoch": 0.04651887464387464, + "grad_norm": 0.47939106822013855, + "learning_rate": 0.00039976629979909, + "loss": 0.66, + "step": 2090 + }, + { + "epoch": 0.04674145299145299, + "grad_norm": 0.6738607287406921, + "learning_rate": 0.0003997640411922354, + "loss": 0.6655, + "step": 2100 + }, + { + "epoch": 0.04696403133903134, + "grad_norm": 0.43843522667884827, + "learning_rate": 0.00039976177173003924, + "loss": 0.6354, + "step": 2110 + }, + { + "epoch": 0.04718660968660968, + "grad_norm": 0.7421457171440125, + "learning_rate": 0.0003997594914126249, + "loss": 0.7271, + "step": 2120 + }, + { + "epoch": 0.04740918803418803, + "grad_norm": 0.6225273609161377, + "learning_rate": 0.0003997572002401163, + "loss": 0.6037, + "step": 2130 + }, + { + "epoch": 0.04763176638176638, + "grad_norm": 0.6136428117752075, + "learning_rate": 0.00039975489821263783, + "loss": 0.5636, + "step": 2140 + }, + { + "epoch": 0.04785434472934473, + "grad_norm": 0.5567286610603333, + "learning_rate": 0.0003997525853303147, + "loss": 0.6963, + "step": 2150 + }, + { + "epoch": 0.04807692307692308, + "grad_norm": 0.9734333753585815, + "learning_rate": 0.00039975026159327253, + "loss": 0.7507, + "step": 2160 + }, + { + "epoch": 0.04829950142450142, + "grad_norm": 0.3505744934082031, + "learning_rate": 0.00039974792700163766, + "loss": 0.5056, + "step": 2170 + }, + { + "epoch": 0.04852207977207977, + "grad_norm": 0.6284974813461304, + "learning_rate": 0.0003997455815555369, + "loss": 0.6345, + "step": 2180 + }, + { + "epoch": 0.04874465811965812, + "grad_norm": 0.43681466579437256, + "learning_rate": 0.00039974322525509776, + "loss": 0.5093, + "step": 2190 + }, + { + "epoch": 0.04896723646723647, + "grad_norm": 0.8628547787666321, + "learning_rate": 0.0003997408581004482, + "loss": 0.6074, + "step": 2200 + }, + { + "epoch": 0.04918981481481482, + "grad_norm": 0.37960782647132874, + "learning_rate": 0.0003997384800917169, + "loss": 0.6353, + "step": 2210 + }, + { + "epoch": 0.04941239316239316, + "grad_norm": 0.5450817942619324, + "learning_rate": 0.0003997360912290331, + "loss": 0.7205, + "step": 2220 + }, + { + "epoch": 0.04963497150997151, + "grad_norm": 0.6776572465896606, + "learning_rate": 0.00039973369151252654, + "loss": 0.7616, + "step": 2230 + }, + { + "epoch": 0.04985754985754986, + "grad_norm": 0.5885903239250183, + "learning_rate": 0.0003997312809423277, + "loss": 0.6358, + "step": 2240 + }, + { + "epoch": 0.05008012820512821, + "grad_norm": 0.6875993013381958, + "learning_rate": 0.00039972885951856756, + "loss": 0.575, + "step": 2250 + }, + { + "epoch": 0.05030270655270655, + "grad_norm": 0.7251737117767334, + "learning_rate": 0.0003997264272413777, + "loss": 0.7485, + "step": 2260 + }, + { + "epoch": 0.0505252849002849, + "grad_norm": 0.6607152819633484, + "learning_rate": 0.0003997239841108902, + "loss": 0.72, + "step": 2270 + }, + { + "epoch": 0.05074786324786325, + "grad_norm": 0.45974501967430115, + "learning_rate": 0.000399721530127238, + "loss": 0.7453, + "step": 2280 + }, + { + "epoch": 0.0509704415954416, + "grad_norm": 0.46049365401268005, + "learning_rate": 0.0003997190652905543, + "loss": 0.7315, + "step": 2290 + }, + { + "epoch": 0.051193019943019946, + "grad_norm": 0.5258949398994446, + "learning_rate": 0.0003997165896009731, + "loss": 0.7081, + "step": 2300 + }, + { + "epoch": 0.05141559829059829, + "grad_norm": 0.4143979251384735, + "learning_rate": 0.000399714103058629, + "loss": 0.6437, + "step": 2310 + }, + { + "epoch": 0.05163817663817664, + "grad_norm": 0.7126597166061401, + "learning_rate": 0.00039971160566365695, + "loss": 0.6343, + "step": 2320 + }, + { + "epoch": 0.05186075498575499, + "grad_norm": 0.384854257106781, + "learning_rate": 0.0003997090974161928, + "loss": 0.5415, + "step": 2330 + }, + { + "epoch": 0.052083333333333336, + "grad_norm": 0.45989155769348145, + "learning_rate": 0.0003997065783163728, + "loss": 0.6082, + "step": 2340 + }, + { + "epoch": 0.05230591168091168, + "grad_norm": 0.5884913206100464, + "learning_rate": 0.0003997040483643338, + "loss": 0.6769, + "step": 2350 + }, + { + "epoch": 0.05252849002849003, + "grad_norm": 0.5443273782730103, + "learning_rate": 0.0003997015075602134, + "loss": 0.5862, + "step": 2360 + }, + { + "epoch": 0.052751068376068376, + "grad_norm": 0.6262222528457642, + "learning_rate": 0.00039969895590414954, + "loss": 0.5576, + "step": 2370 + }, + { + "epoch": 0.052973646723646725, + "grad_norm": 0.37384849786758423, + "learning_rate": 0.00039969639339628094, + "loss": 0.6212, + "step": 2380 + }, + { + "epoch": 0.053196225071225074, + "grad_norm": 1.0342721939086914, + "learning_rate": 0.00039969382003674685, + "loss": 0.7919, + "step": 2390 + }, + { + "epoch": 0.053418803418803416, + "grad_norm": 0.49225690960884094, + "learning_rate": 0.00039969123582568714, + "loss": 0.6046, + "step": 2400 + }, + { + "epoch": 0.053641381766381765, + "grad_norm": 0.6797170042991638, + "learning_rate": 0.0003996886407632422, + "loss": 0.7064, + "step": 2410 + }, + { + "epoch": 0.053863960113960115, + "grad_norm": 0.3157079815864563, + "learning_rate": 0.00039968603484955305, + "loss": 0.5789, + "step": 2420 + }, + { + "epoch": 0.054086538461538464, + "grad_norm": 0.6280565857887268, + "learning_rate": 0.0003996834180847612, + "loss": 0.6395, + "step": 2430 + }, + { + "epoch": 0.054309116809116806, + "grad_norm": 0.714435338973999, + "learning_rate": 0.00039968079046900906, + "loss": 0.653, + "step": 2440 + }, + { + "epoch": 0.054531695156695155, + "grad_norm": 0.8974350094795227, + "learning_rate": 0.0003996781520024392, + "loss": 0.7271, + "step": 2450 + }, + { + "epoch": 0.054754273504273504, + "grad_norm": 0.7227585911750793, + "learning_rate": 0.00039967550268519517, + "loss": 0.7686, + "step": 2460 + }, + { + "epoch": 0.05497685185185185, + "grad_norm": 0.8472549319267273, + "learning_rate": 0.00039967284251742085, + "loss": 0.7441, + "step": 2470 + }, + { + "epoch": 0.0551994301994302, + "grad_norm": 0.8443915247917175, + "learning_rate": 0.00039967017149926084, + "loss": 0.5644, + "step": 2480 + }, + { + "epoch": 0.055422008547008544, + "grad_norm": 0.5027754306793213, + "learning_rate": 0.0003996674896308602, + "loss": 0.6362, + "step": 2490 + }, + { + "epoch": 0.055644586894586893, + "grad_norm": 0.4711776375770569, + "learning_rate": 0.0003996647969123647, + "loss": 0.6692, + "step": 2500 + }, + { + "epoch": 0.05586716524216524, + "grad_norm": 0.5613887906074524, + "learning_rate": 0.0003996620933439207, + "loss": 0.6728, + "step": 2510 + }, + { + "epoch": 0.05608974358974359, + "grad_norm": 0.4277225732803345, + "learning_rate": 0.00039965937892567514, + "loss": 0.6065, + "step": 2520 + }, + { + "epoch": 0.056312321937321934, + "grad_norm": 0.5663828253746033, + "learning_rate": 0.00039965665365777545, + "loss": 0.5488, + "step": 2530 + }, + { + "epoch": 0.05653490028490028, + "grad_norm": 0.49894288182258606, + "learning_rate": 0.0003996539175403697, + "loss": 0.5605, + "step": 2540 + }, + { + "epoch": 0.05675747863247863, + "grad_norm": 0.6952654123306274, + "learning_rate": 0.0003996511705736067, + "loss": 0.5986, + "step": 2550 + }, + { + "epoch": 0.05698005698005698, + "grad_norm": 0.49367037415504456, + "learning_rate": 0.00039964841275763564, + "loss": 0.6781, + "step": 2560 + }, + { + "epoch": 0.05720263532763533, + "grad_norm": 0.554221510887146, + "learning_rate": 0.0003996456440926064, + "loss": 0.7196, + "step": 2570 + }, + { + "epoch": 0.05742521367521367, + "grad_norm": 0.7069229483604431, + "learning_rate": 0.00039964286457866937, + "loss": 0.653, + "step": 2580 + }, + { + "epoch": 0.05764779202279202, + "grad_norm": 0.4243583679199219, + "learning_rate": 0.0003996400742159757, + "loss": 0.6262, + "step": 2590 + }, + { + "epoch": 0.05787037037037037, + "grad_norm": 0.5111228227615356, + "learning_rate": 0.0003996372730046769, + "loss": 0.6631, + "step": 2600 + }, + { + "epoch": 0.05809294871794872, + "grad_norm": 0.5107941627502441, + "learning_rate": 0.0003996344609449253, + "loss": 0.8428, + "step": 2610 + }, + { + "epoch": 0.05831552706552707, + "grad_norm": 0.7400026321411133, + "learning_rate": 0.00039963163803687367, + "loss": 0.6988, + "step": 2620 + }, + { + "epoch": 0.05853810541310541, + "grad_norm": 0.67812180519104, + "learning_rate": 0.0003996288042806754, + "loss": 0.6731, + "step": 2630 + }, + { + "epoch": 0.05876068376068376, + "grad_norm": 0.6009535789489746, + "learning_rate": 0.00039962595967648446, + "loss": 0.7021, + "step": 2640 + }, + { + "epoch": 0.05898326210826211, + "grad_norm": 3.271467447280884, + "learning_rate": 0.00039962310422445545, + "loss": 0.755, + "step": 2650 + }, + { + "epoch": 0.05920584045584046, + "grad_norm": 0.6115080118179321, + "learning_rate": 0.00039962023792474355, + "loss": 0.7505, + "step": 2660 + }, + { + "epoch": 0.0594284188034188, + "grad_norm": 0.5685977339744568, + "learning_rate": 0.00039961736077750456, + "loss": 0.706, + "step": 2670 + }, + { + "epoch": 0.05965099715099715, + "grad_norm": 0.6168237328529358, + "learning_rate": 0.00039961447278289466, + "loss": 0.6097, + "step": 2680 + }, + { + "epoch": 0.0598735754985755, + "grad_norm": 0.4329089820384979, + "learning_rate": 0.00039961157394107096, + "loss": 0.6782, + "step": 2690 + }, + { + "epoch": 0.06000712250712251, + "eval_loss": 0.6390817165374756, + "eval_runtime": 337.4282, + "eval_samples_per_second": 7.009, + "eval_steps_per_second": 7.009, + "step": 2696 + }, + { + "epoch": 0.06009615384615385, + "grad_norm": 0.5909023284912109, + "learning_rate": 0.00039960866425219093, + "loss": 0.6413, + "step": 2700 + }, + { + "epoch": 0.0603187321937322, + "grad_norm": 0.4124837815761566, + "learning_rate": 0.00039960574371641265, + "loss": 0.6268, + "step": 2710 + }, + { + "epoch": 0.06054131054131054, + "grad_norm": 0.8504196405410767, + "learning_rate": 0.0003996028123338949, + "loss": 0.6622, + "step": 2720 + }, + { + "epoch": 0.06076388888888889, + "grad_norm": 0.4204083979129791, + "learning_rate": 0.00039959987010479685, + "loss": 0.554, + "step": 2730 + }, + { + "epoch": 0.06098646723646724, + "grad_norm": 0.6747197508811951, + "learning_rate": 0.0003995969170292785, + "loss": 0.7341, + "step": 2740 + }, + { + "epoch": 0.061209045584045586, + "grad_norm": 0.41700077056884766, + "learning_rate": 0.00039959395310750027, + "loss": 0.6148, + "step": 2750 + }, + { + "epoch": 0.06143162393162393, + "grad_norm": 0.41387853026390076, + "learning_rate": 0.00039959097833962325, + "loss": 0.7127, + "step": 2760 + }, + { + "epoch": 0.06165420227920228, + "grad_norm": 0.9684638381004333, + "learning_rate": 0.000399587992725809, + "loss": 0.6922, + "step": 2770 + }, + { + "epoch": 0.06187678062678063, + "grad_norm": 0.5317614078521729, + "learning_rate": 0.00039958499626622, + "loss": 0.553, + "step": 2780 + }, + { + "epoch": 0.062099358974358976, + "grad_norm": 0.5378207564353943, + "learning_rate": 0.00039958198896101874, + "loss": 0.5355, + "step": 2790 + }, + { + "epoch": 0.062321937321937325, + "grad_norm": 0.5899708271026611, + "learning_rate": 0.0003995789708103689, + "loss": 0.5899, + "step": 2800 + }, + { + "epoch": 0.06254451566951567, + "grad_norm": 0.5884451270103455, + "learning_rate": 0.0003995759418144344, + "loss": 0.7328, + "step": 2810 + }, + { + "epoch": 0.06276709401709402, + "grad_norm": 0.6845195293426514, + "learning_rate": 0.0003995729019733799, + "loss": 0.6292, + "step": 2820 + }, + { + "epoch": 0.06298967236467236, + "grad_norm": 0.623423159122467, + "learning_rate": 0.0003995698512873704, + "loss": 0.7828, + "step": 2830 + }, + { + "epoch": 0.06321225071225071, + "grad_norm": 0.5040481090545654, + "learning_rate": 0.0003995667897565719, + "loss": 0.5945, + "step": 2840 + }, + { + "epoch": 0.06343482905982906, + "grad_norm": 0.7745060324668884, + "learning_rate": 0.0003995637173811506, + "loss": 0.6323, + "step": 2850 + }, + { + "epoch": 0.06365740740740741, + "grad_norm": 0.6161078810691833, + "learning_rate": 0.0003995606341612736, + "loss": 0.6442, + "step": 2860 + }, + { + "epoch": 0.06387998575498575, + "grad_norm": 0.41616639494895935, + "learning_rate": 0.0003995575400971083, + "loss": 0.5293, + "step": 2870 + }, + { + "epoch": 0.0641025641025641, + "grad_norm": 0.6070893406867981, + "learning_rate": 0.00039955443518882296, + "loss": 0.6674, + "step": 2880 + }, + { + "epoch": 0.06432514245014245, + "grad_norm": 0.5885868668556213, + "learning_rate": 0.00039955131943658623, + "loss": 0.6609, + "step": 2890 + }, + { + "epoch": 0.0645477207977208, + "grad_norm": 0.662845253944397, + "learning_rate": 0.00039954819284056747, + "loss": 0.5523, + "step": 2900 + }, + { + "epoch": 0.06477029914529915, + "grad_norm": 0.6544046401977539, + "learning_rate": 0.00039954505540093645, + "loss": 0.6903, + "step": 2910 + }, + { + "epoch": 0.0649928774928775, + "grad_norm": 0.4174458682537079, + "learning_rate": 0.0003995419071178639, + "loss": 0.6201, + "step": 2920 + }, + { + "epoch": 0.06521545584045584, + "grad_norm": 0.6580033898353577, + "learning_rate": 0.00039953874799152073, + "loss": 0.6212, + "step": 2930 + }, + { + "epoch": 0.06543803418803419, + "grad_norm": 0.6909990310668945, + "learning_rate": 0.00039953557802207857, + "loss": 0.7442, + "step": 2940 + }, + { + "epoch": 0.06566061253561253, + "grad_norm": 0.7643675208091736, + "learning_rate": 0.0003995323972097098, + "loss": 0.7135, + "step": 2950 + }, + { + "epoch": 0.06588319088319089, + "grad_norm": 0.5393293499946594, + "learning_rate": 0.00039952920555458727, + "loss": 0.6405, + "step": 2960 + }, + { + "epoch": 0.06610576923076923, + "grad_norm": 0.9071075916290283, + "learning_rate": 0.00039952600305688426, + "loss": 0.6748, + "step": 2970 + }, + { + "epoch": 0.06632834757834757, + "grad_norm": 0.3836714029312134, + "learning_rate": 0.00039952278971677497, + "loss": 0.6589, + "step": 2980 + }, + { + "epoch": 0.06655092592592593, + "grad_norm": 0.5095319151878357, + "learning_rate": 0.000399519565534434, + "loss": 0.7009, + "step": 2990 + }, + { + "epoch": 0.06677350427350427, + "grad_norm": 0.58671635389328, + "learning_rate": 0.00039951633051003643, + "loss": 0.6462, + "step": 3000 + }, + { + "epoch": 0.06699608262108261, + "grad_norm": 0.5948638319969177, + "learning_rate": 0.00039951308464375814, + "loss": 0.6507, + "step": 3010 + }, + { + "epoch": 0.06721866096866097, + "grad_norm": 0.8463034629821777, + "learning_rate": 0.00039950982793577553, + "loss": 0.6059, + "step": 3020 + }, + { + "epoch": 0.06744123931623931, + "grad_norm": 0.4733259677886963, + "learning_rate": 0.00039950656038626554, + "loss": 0.68, + "step": 3030 + }, + { + "epoch": 0.06766381766381767, + "grad_norm": 0.7535138726234436, + "learning_rate": 0.0003995032819954057, + "loss": 0.5747, + "step": 3040 + }, + { + "epoch": 0.06788639601139601, + "grad_norm": 0.48301541805267334, + "learning_rate": 0.00039949999276337427, + "loss": 0.7331, + "step": 3050 + }, + { + "epoch": 0.06810897435897435, + "grad_norm": 0.7251609563827515, + "learning_rate": 0.0003994966926903498, + "loss": 0.6975, + "step": 3060 + }, + { + "epoch": 0.06833155270655271, + "grad_norm": 0.4815773665904999, + "learning_rate": 0.00039949338177651183, + "loss": 0.7074, + "step": 3070 + }, + { + "epoch": 0.06855413105413105, + "grad_norm": 0.5007272362709045, + "learning_rate": 0.0003994900600220401, + "loss": 0.6067, + "step": 3080 + }, + { + "epoch": 0.06877670940170941, + "grad_norm": 0.6377426385879517, + "learning_rate": 0.0003994867274271153, + "loss": 0.7899, + "step": 3090 + }, + { + "epoch": 0.06899928774928775, + "grad_norm": 1.270365595817566, + "learning_rate": 0.0003994833839919183, + "loss": 0.7652, + "step": 3100 + }, + { + "epoch": 0.06922186609686609, + "grad_norm": 0.5779575109481812, + "learning_rate": 0.00039948002971663103, + "loss": 0.6203, + "step": 3110 + }, + { + "epoch": 0.06944444444444445, + "grad_norm": 0.5712846517562866, + "learning_rate": 0.0003994766646014355, + "loss": 0.7494, + "step": 3120 + }, + { + "epoch": 0.06966702279202279, + "grad_norm": 0.5811907649040222, + "learning_rate": 0.00039947328864651485, + "loss": 0.6286, + "step": 3130 + }, + { + "epoch": 0.06988960113960115, + "grad_norm": 0.7779800891876221, + "learning_rate": 0.00039946990185205235, + "loss": 0.6996, + "step": 3140 + }, + { + "epoch": 0.07011217948717949, + "grad_norm": 0.5738331079483032, + "learning_rate": 0.0003994665042182321, + "loss": 0.6631, + "step": 3150 + }, + { + "epoch": 0.07033475783475783, + "grad_norm": 1.0784659385681152, + "learning_rate": 0.00039946309574523874, + "loss": 0.6092, + "step": 3160 + }, + { + "epoch": 0.07055733618233619, + "grad_norm": 1.0656852722167969, + "learning_rate": 0.00039945967643325746, + "loss": 0.6525, + "step": 3170 + }, + { + "epoch": 0.07077991452991453, + "grad_norm": 0.6276305913925171, + "learning_rate": 0.0003994562462824741, + "loss": 0.5683, + "step": 3180 + }, + { + "epoch": 0.07100249287749288, + "grad_norm": 0.6950351595878601, + "learning_rate": 0.00039945280529307496, + "loss": 0.5425, + "step": 3190 + }, + { + "epoch": 0.07122507122507123, + "grad_norm": 0.6023120880126953, + "learning_rate": 0.0003994493534652471, + "loss": 0.614, + "step": 3200 + }, + { + "epoch": 0.07144764957264957, + "grad_norm": 0.6610495448112488, + "learning_rate": 0.00039944589079917814, + "loss": 0.5214, + "step": 3210 + }, + { + "epoch": 0.07167022792022792, + "grad_norm": 0.6308820843696594, + "learning_rate": 0.0003994424172950562, + "loss": 0.6614, + "step": 3220 + }, + { + "epoch": 0.07189280626780627, + "grad_norm": 0.5825803875923157, + "learning_rate": 0.00039943893295307, + "loss": 0.6141, + "step": 3230 + }, + { + "epoch": 0.07211538461538461, + "grad_norm": 0.7101708650588989, + "learning_rate": 0.00039943543777340895, + "loss": 0.6021, + "step": 3240 + }, + { + "epoch": 0.07233796296296297, + "grad_norm": 0.3456685245037079, + "learning_rate": 0.00039943193175626297, + "loss": 0.6149, + "step": 3250 + }, + { + "epoch": 0.07256054131054131, + "grad_norm": 0.8595088124275208, + "learning_rate": 0.0003994284149018225, + "loss": 0.6951, + "step": 3260 + }, + { + "epoch": 0.07278311965811966, + "grad_norm": 0.5759161114692688, + "learning_rate": 0.0003994248872102787, + "loss": 0.5981, + "step": 3270 + }, + { + "epoch": 0.073005698005698, + "grad_norm": 0.7946069836616516, + "learning_rate": 0.00039942134868182333, + "loss": 0.6799, + "step": 3280 + }, + { + "epoch": 0.07322827635327635, + "grad_norm": 0.5611863136291504, + "learning_rate": 0.00039941779931664857, + "loss": 0.8381, + "step": 3290 + }, + { + "epoch": 0.0734508547008547, + "grad_norm": 0.6077750325202942, + "learning_rate": 0.0003994142391149474, + "loss": 0.7151, + "step": 3300 + }, + { + "epoch": 0.07367343304843305, + "grad_norm": 0.6016742587089539, + "learning_rate": 0.0003994106680769131, + "loss": 0.5861, + "step": 3310 + }, + { + "epoch": 0.0738960113960114, + "grad_norm": 0.8998081684112549, + "learning_rate": 0.00039940708620273994, + "loss": 0.7188, + "step": 3320 + }, + { + "epoch": 0.07411858974358974, + "grad_norm": 0.607742965221405, + "learning_rate": 0.0003994034934926226, + "loss": 0.5308, + "step": 3330 + }, + { + "epoch": 0.07434116809116809, + "grad_norm": 0.3822357654571533, + "learning_rate": 0.000399399889946756, + "loss": 0.5146, + "step": 3340 + }, + { + "epoch": 0.07456374643874644, + "grad_norm": 0.5996620655059814, + "learning_rate": 0.0003993962755653362, + "loss": 0.582, + "step": 3350 + }, + { + "epoch": 0.07478632478632478, + "grad_norm": 0.6241055727005005, + "learning_rate": 0.00039939265034855955, + "loss": 0.6331, + "step": 3360 + }, + { + "epoch": 0.07500890313390314, + "grad_norm": 0.41144707798957825, + "learning_rate": 0.00039938901429662307, + "loss": 0.5581, + "step": 3370 + }, + { + "epoch": 0.07523148148148148, + "grad_norm": 0.7115545868873596, + "learning_rate": 0.00039938536740972427, + "loss": 0.7581, + "step": 3380 + }, + { + "epoch": 0.07545405982905982, + "grad_norm": 0.4126880466938019, + "learning_rate": 0.00039938170968806144, + "loss": 0.6505, + "step": 3390 + }, + { + "epoch": 0.07567663817663818, + "grad_norm": 0.6458644866943359, + "learning_rate": 0.0003993780411318333, + "loss": 0.7225, + "step": 3400 + }, + { + "epoch": 0.07589921652421652, + "grad_norm": 0.7713171243667603, + "learning_rate": 0.0003993743617412391, + "loss": 0.7107, + "step": 3410 + }, + { + "epoch": 0.07612179487179487, + "grad_norm": 0.5597316026687622, + "learning_rate": 0.00039937067151647894, + "loss": 0.7864, + "step": 3420 + }, + { + "epoch": 0.07634437321937322, + "grad_norm": 0.5204119682312012, + "learning_rate": 0.0003993669704577533, + "loss": 0.6962, + "step": 3430 + }, + { + "epoch": 0.07656695156695156, + "grad_norm": 0.5534259676933289, + "learning_rate": 0.00039936325856526324, + "loss": 0.7243, + "step": 3440 + }, + { + "epoch": 0.07678952991452992, + "grad_norm": 0.6227967739105225, + "learning_rate": 0.00039935953583921047, + "loss": 0.6225, + "step": 3450 + }, + { + "epoch": 0.07701210826210826, + "grad_norm": 0.5725488662719727, + "learning_rate": 0.00039935580227979734, + "loss": 0.6252, + "step": 3460 + }, + { + "epoch": 0.0772346866096866, + "grad_norm": 0.6526398658752441, + "learning_rate": 0.0003993520578872267, + "loss": 0.6411, + "step": 3470 + }, + { + "epoch": 0.07745726495726496, + "grad_norm": 0.9423260688781738, + "learning_rate": 0.000399348302661702, + "loss": 0.6321, + "step": 3480 + }, + { + "epoch": 0.0776798433048433, + "grad_norm": 0.5385962724685669, + "learning_rate": 0.0003993445366034275, + "loss": 0.6431, + "step": 3490 + }, + { + "epoch": 0.07790242165242166, + "grad_norm": 1.0496457815170288, + "learning_rate": 0.00039934075971260753, + "loss": 0.709, + "step": 3500 + }, + { + "epoch": 0.078125, + "grad_norm": 0.9465022087097168, + "learning_rate": 0.0003993369719894475, + "loss": 0.5711, + "step": 3510 + }, + { + "epoch": 0.07834757834757834, + "grad_norm": 0.6062886714935303, + "learning_rate": 0.0003993331734341533, + "loss": 0.6693, + "step": 3520 + }, + { + "epoch": 0.0785701566951567, + "grad_norm": 0.6801285743713379, + "learning_rate": 0.0003993293640469313, + "loss": 0.5972, + "step": 3530 + }, + { + "epoch": 0.07879273504273504, + "grad_norm": 0.5197761654853821, + "learning_rate": 0.0003993255438279884, + "loss": 0.5398, + "step": 3540 + }, + { + "epoch": 0.0790153133903134, + "grad_norm": 0.6844536662101746, + "learning_rate": 0.0003993217127775323, + "loss": 0.7739, + "step": 3550 + }, + { + "epoch": 0.07923789173789174, + "grad_norm": 0.5622774958610535, + "learning_rate": 0.00039931787089577113, + "loss": 0.5835, + "step": 3560 + }, + { + "epoch": 0.07946047008547008, + "grad_norm": 0.6373755931854248, + "learning_rate": 0.00039931401818291373, + "loss": 0.588, + "step": 3570 + }, + { + "epoch": 0.07968304843304844, + "grad_norm": 0.6422734260559082, + "learning_rate": 0.0003993101546391694, + "loss": 0.6587, + "step": 3580 + }, + { + "epoch": 0.07990562678062678, + "grad_norm": 0.3758021593093872, + "learning_rate": 0.0003993062802647481, + "loss": 0.5819, + "step": 3590 + }, + { + "epoch": 0.08012820512820513, + "grad_norm": 0.41243478655815125, + "learning_rate": 0.00039930239505986035, + "loss": 0.6124, + "step": 3600 + }, + { + "epoch": 0.08035078347578348, + "grad_norm": 0.33799731731414795, + "learning_rate": 0.0003992984990247173, + "loss": 0.5755, + "step": 3610 + }, + { + "epoch": 0.08057336182336182, + "grad_norm": 0.6254426836967468, + "learning_rate": 0.0003992945921595307, + "loss": 0.5911, + "step": 3620 + }, + { + "epoch": 0.08079594017094018, + "grad_norm": 0.7614779472351074, + "learning_rate": 0.0003992906744645128, + "loss": 0.6772, + "step": 3630 + }, + { + "epoch": 0.08101851851851852, + "grad_norm": 0.5825209617614746, + "learning_rate": 0.0003992867459398765, + "loss": 0.7136, + "step": 3640 + }, + { + "epoch": 0.08124109686609686, + "grad_norm": 0.5053777098655701, + "learning_rate": 0.0003992828065858352, + "loss": 0.5367, + "step": 3650 + }, + { + "epoch": 0.08146367521367522, + "grad_norm": 0.9008991718292236, + "learning_rate": 0.00039927885640260317, + "loss": 0.7203, + "step": 3660 + }, + { + "epoch": 0.08168625356125356, + "grad_norm": 0.6658133268356323, + "learning_rate": 0.0003992748953903949, + "loss": 0.5676, + "step": 3670 + }, + { + "epoch": 0.08190883190883191, + "grad_norm": 0.4479519724845886, + "learning_rate": 0.0003992709235494257, + "loss": 0.6296, + "step": 3680 + }, + { + "epoch": 0.08213141025641026, + "grad_norm": 0.7825643420219421, + "learning_rate": 0.0003992669408799113, + "loss": 0.6228, + "step": 3690 + }, + { + "epoch": 0.0823539886039886, + "grad_norm": 0.7348558902740479, + "learning_rate": 0.0003992629473820683, + "loss": 0.6685, + "step": 3700 + }, + { + "epoch": 0.08257656695156695, + "grad_norm": 0.6211950182914734, + "learning_rate": 0.0003992589430561136, + "loss": 0.593, + "step": 3710 + }, + { + "epoch": 0.0827991452991453, + "grad_norm": 0.49289315938949585, + "learning_rate": 0.00039925492790226477, + "loss": 0.7275, + "step": 3720 + }, + { + "epoch": 0.08302172364672365, + "grad_norm": 0.513803243637085, + "learning_rate": 0.00039925090192074005, + "loss": 0.6036, + "step": 3730 + }, + { + "epoch": 0.083244301994302, + "grad_norm": 0.5003082752227783, + "learning_rate": 0.00039924686511175824, + "loss": 0.6403, + "step": 3740 + }, + { + "epoch": 0.08346688034188034, + "grad_norm": 0.7302523255348206, + "learning_rate": 0.00039924281747553866, + "loss": 0.6492, + "step": 3750 + }, + { + "epoch": 0.08368945868945869, + "grad_norm": 0.6277854442596436, + "learning_rate": 0.00039923875901230125, + "loss": 0.5726, + "step": 3760 + }, + { + "epoch": 0.08391203703703703, + "grad_norm": 0.9266096353530884, + "learning_rate": 0.00039923468972226654, + "loss": 0.8142, + "step": 3770 + }, + { + "epoch": 0.08413461538461539, + "grad_norm": 0.5914018750190735, + "learning_rate": 0.00039923060960565576, + "loss": 0.628, + "step": 3780 + }, + { + "epoch": 0.08435719373219373, + "grad_norm": 0.7605827450752258, + "learning_rate": 0.0003992265186626905, + "loss": 0.7179, + "step": 3790 + }, + { + "epoch": 0.08457977207977208, + "grad_norm": 0.7958018779754639, + "learning_rate": 0.0003992224168935932, + "loss": 0.6565, + "step": 3800 + }, + { + "epoch": 0.08480235042735043, + "grad_norm": 0.49368587136268616, + "learning_rate": 0.0003992183042985866, + "loss": 0.6812, + "step": 3810 + }, + { + "epoch": 0.08502492877492877, + "grad_norm": 0.6145605444908142, + "learning_rate": 0.00039921418087789426, + "loss": 0.6702, + "step": 3820 + }, + { + "epoch": 0.08524750712250712, + "grad_norm": 0.5280147790908813, + "learning_rate": 0.0003992100466317403, + "loss": 0.694, + "step": 3830 + }, + { + "epoch": 0.08547008547008547, + "grad_norm": 0.771976113319397, + "learning_rate": 0.0003992059015603493, + "loss": 0.6219, + "step": 3840 + }, + { + "epoch": 0.08569266381766381, + "grad_norm": 0.888991117477417, + "learning_rate": 0.00039920174566394646, + "loss": 0.6727, + "step": 3850 + }, + { + "epoch": 0.08591524216524217, + "grad_norm": 0.8724148869514465, + "learning_rate": 0.0003991975789427578, + "loss": 0.6773, + "step": 3860 + }, + { + "epoch": 0.08613782051282051, + "grad_norm": 0.5815834999084473, + "learning_rate": 0.0003991934013970096, + "loss": 0.6245, + "step": 3870 + }, + { + "epoch": 0.08636039886039885, + "grad_norm": 0.8174399733543396, + "learning_rate": 0.0003991892130269288, + "loss": 0.6157, + "step": 3880 + }, + { + "epoch": 0.08658297720797721, + "grad_norm": 0.7425127029418945, + "learning_rate": 0.0003991850138327432, + "loss": 0.7522, + "step": 3890 + }, + { + "epoch": 0.08680555555555555, + "grad_norm": 0.46512556076049805, + "learning_rate": 0.00039918080381468095, + "loss": 0.5833, + "step": 3900 + }, + { + "epoch": 0.08702813390313391, + "grad_norm": 0.6372483968734741, + "learning_rate": 0.0003991765829729706, + "loss": 0.5622, + "step": 3910 + }, + { + "epoch": 0.08725071225071225, + "grad_norm": 0.627477765083313, + "learning_rate": 0.00039917235130784175, + "loss": 0.708, + "step": 3920 + }, + { + "epoch": 0.08747329059829059, + "grad_norm": 0.7006207704544067, + "learning_rate": 0.0003991681088195243, + "loss": 0.7236, + "step": 3930 + }, + { + "epoch": 0.08769586894586895, + "grad_norm": 0.5845404863357544, + "learning_rate": 0.0003991638555082488, + "loss": 0.6517, + "step": 3940 + }, + { + "epoch": 0.08791844729344729, + "grad_norm": 0.4940961003303528, + "learning_rate": 0.0003991595913742463, + "loss": 0.68, + "step": 3950 + }, + { + "epoch": 0.08814102564102565, + "grad_norm": 0.6172021627426147, + "learning_rate": 0.00039915531641774855, + "loss": 0.7886, + "step": 3960 + }, + { + "epoch": 0.08836360398860399, + "grad_norm": 0.671653687953949, + "learning_rate": 0.00039915103063898786, + "loss": 0.7868, + "step": 3970 + }, + { + "epoch": 0.08858618233618233, + "grad_norm": 0.6825588941574097, + "learning_rate": 0.0003991467340381972, + "loss": 0.5949, + "step": 3980 + }, + { + "epoch": 0.08880876068376069, + "grad_norm": 0.6282208561897278, + "learning_rate": 0.0003991424266156099, + "loss": 0.6068, + "step": 3990 + }, + { + "epoch": 0.08903133903133903, + "grad_norm": 0.8855395913124084, + "learning_rate": 0.00039913810837146024, + "loss": 0.6163, + "step": 4000 + }, + { + "epoch": 0.08925391737891739, + "grad_norm": 0.566017210483551, + "learning_rate": 0.0003991337793059827, + "loss": 0.6877, + "step": 4010 + }, + { + "epoch": 0.08947649572649573, + "grad_norm": 0.5654011368751526, + "learning_rate": 0.0003991294394194125, + "loss": 0.6206, + "step": 4020 + }, + { + "epoch": 0.08969907407407407, + "grad_norm": 0.7026615738868713, + "learning_rate": 0.0003991250887119856, + "loss": 0.6946, + "step": 4030 + }, + { + "epoch": 0.08992165242165243, + "grad_norm": 0.8429543375968933, + "learning_rate": 0.0003991207271839383, + "loss": 0.6198, + "step": 4040 + }, + { + "epoch": 0.09014423076923077, + "grad_norm": 0.5337466597557068, + "learning_rate": 0.0003991163548355078, + "loss": 0.7019, + "step": 4050 + }, + { + "epoch": 0.09036680911680911, + "grad_norm": 0.6086145639419556, + "learning_rate": 0.00039911197166693144, + "loss": 0.5869, + "step": 4060 + }, + { + "epoch": 0.09058938746438747, + "grad_norm": 0.578947901725769, + "learning_rate": 0.00039910757767844767, + "loss": 0.6893, + "step": 4070 + }, + { + "epoch": 0.09081196581196581, + "grad_norm": 0.6418620944023132, + "learning_rate": 0.00039910317287029505, + "loss": 0.695, + "step": 4080 + }, + { + "epoch": 0.09103454415954416, + "grad_norm": 0.587532639503479, + "learning_rate": 0.00039909875724271305, + "loss": 0.5836, + "step": 4090 + }, + { + "epoch": 0.0912571225071225, + "grad_norm": 0.46247434616088867, + "learning_rate": 0.0003990943307959416, + "loss": 0.5812, + "step": 4100 + }, + { + "epoch": 0.09147970085470085, + "grad_norm": 0.5754249691963196, + "learning_rate": 0.0003990898935302212, + "loss": 0.6688, + "step": 4110 + }, + { + "epoch": 0.0917022792022792, + "grad_norm": 0.6887472867965698, + "learning_rate": 0.000399085445445793, + "loss": 0.6571, + "step": 4120 + }, + { + "epoch": 0.09192485754985755, + "grad_norm": 0.9536170959472656, + "learning_rate": 0.00039908098654289876, + "loss": 0.7645, + "step": 4130 + }, + { + "epoch": 0.0921474358974359, + "grad_norm": 0.5921887159347534, + "learning_rate": 0.0003990765168217807, + "loss": 0.6758, + "step": 4140 + }, + { + "epoch": 0.09237001424501425, + "grad_norm": 0.8859508633613586, + "learning_rate": 0.0003990720362826817, + "loss": 0.57, + "step": 4150 + }, + { + "epoch": 0.09259259259259259, + "grad_norm": 0.76180499792099, + "learning_rate": 0.00039906754492584535, + "loss": 0.6223, + "step": 4160 + }, + { + "epoch": 0.09281517094017094, + "grad_norm": 0.43025851249694824, + "learning_rate": 0.0003990630427515156, + "loss": 0.6777, + "step": 4170 + }, + { + "epoch": 0.09303774928774929, + "grad_norm": 0.3659840524196625, + "learning_rate": 0.00039905852975993724, + "loss": 0.5963, + "step": 4180 + }, + { + "epoch": 0.09326032763532764, + "grad_norm": 0.8982321619987488, + "learning_rate": 0.0003990540059513554, + "loss": 0.6307, + "step": 4190 + }, + { + "epoch": 0.09348290598290598, + "grad_norm": 0.6464897990226746, + "learning_rate": 0.0003990494713260158, + "loss": 0.6593, + "step": 4200 + }, + { + "epoch": 0.09370548433048433, + "grad_norm": 0.7186951637268066, + "learning_rate": 0.00039904492588416506, + "loss": 0.7252, + "step": 4210 + }, + { + "epoch": 0.09392806267806268, + "grad_norm": 0.5287630558013916, + "learning_rate": 0.00039904036962605006, + "loss": 0.7238, + "step": 4220 + }, + { + "epoch": 0.09415064102564102, + "grad_norm": 0.963729739189148, + "learning_rate": 0.0003990358025519185, + "loss": 0.7215, + "step": 4230 + }, + { + "epoch": 0.09437321937321937, + "grad_norm": 0.355486124753952, + "learning_rate": 0.0003990312246620184, + "loss": 0.6176, + "step": 4240 + }, + { + "epoch": 0.09459579772079772, + "grad_norm": 0.671535849571228, + "learning_rate": 0.0003990266359565987, + "loss": 0.6039, + "step": 4250 + }, + { + "epoch": 0.09481837606837606, + "grad_norm": 0.7099301218986511, + "learning_rate": 0.0003990220364359087, + "loss": 0.6712, + "step": 4260 + }, + { + "epoch": 0.09504095441595442, + "grad_norm": 0.5276811718940735, + "learning_rate": 0.00039901742610019825, + "loss": 0.6352, + "step": 4270 + }, + { + "epoch": 0.09526353276353276, + "grad_norm": 0.547616720199585, + "learning_rate": 0.00039901280494971796, + "loss": 0.6072, + "step": 4280 + }, + { + "epoch": 0.0954861111111111, + "grad_norm": 0.570436954498291, + "learning_rate": 0.0003990081729847189, + "loss": 0.678, + "step": 4290 + }, + { + "epoch": 0.09570868945868946, + "grad_norm": 0.8578181862831116, + "learning_rate": 0.0003990035302054528, + "loss": 0.5163, + "step": 4300 + }, + { + "epoch": 0.0959312678062678, + "grad_norm": 0.6730733513832092, + "learning_rate": 0.00039899887661217203, + "loss": 0.6553, + "step": 4310 + }, + { + "epoch": 0.09615384615384616, + "grad_norm": 0.8366342186927795, + "learning_rate": 0.0003989942122051293, + "loss": 0.6638, + "step": 4320 + }, + { + "epoch": 0.0963764245014245, + "grad_norm": 0.6078762412071228, + "learning_rate": 0.00039898953698457826, + "loss": 0.5934, + "step": 4330 + }, + { + "epoch": 0.09659900284900284, + "grad_norm": 0.7725120782852173, + "learning_rate": 0.0003989848509507728, + "loss": 0.6387, + "step": 4340 + }, + { + "epoch": 0.0968215811965812, + "grad_norm": 0.87664794921875, + "learning_rate": 0.0003989801541039677, + "loss": 0.675, + "step": 4350 + }, + { + "epoch": 0.09704415954415954, + "grad_norm": 0.43659526109695435, + "learning_rate": 0.00039897544644441814, + "loss": 0.6767, + "step": 4360 + }, + { + "epoch": 0.0972667378917379, + "grad_norm": 0.615048348903656, + "learning_rate": 0.0003989707279723799, + "loss": 0.8962, + "step": 4370 + }, + { + "epoch": 0.09748931623931624, + "grad_norm": 0.6210547089576721, + "learning_rate": 0.0003989659986881094, + "loss": 0.553, + "step": 4380 + }, + { + "epoch": 0.09771189458689458, + "grad_norm": 0.8046955466270447, + "learning_rate": 0.0003989612585918637, + "loss": 0.5356, + "step": 4390 + }, + { + "epoch": 0.09793447293447294, + "grad_norm": 0.5782980918884277, + "learning_rate": 0.0003989565076839003, + "loss": 0.726, + "step": 4400 + }, + { + "epoch": 0.09815705128205128, + "grad_norm": 0.9094864726066589, + "learning_rate": 0.0003989517459644774, + "loss": 0.6938, + "step": 4410 + }, + { + "epoch": 0.09837962962962964, + "grad_norm": 0.49428123235702515, + "learning_rate": 0.00039894697343385377, + "loss": 0.5804, + "step": 4420 + }, + { + "epoch": 0.09860220797720798, + "grad_norm": 0.6479130387306213, + "learning_rate": 0.00039894219009228876, + "loss": 0.6167, + "step": 4430 + }, + { + "epoch": 0.09882478632478632, + "grad_norm": 0.784542977809906, + "learning_rate": 0.0003989373959400422, + "loss": 0.6455, + "step": 4440 + }, + { + "epoch": 0.09904736467236468, + "grad_norm": 0.6367858052253723, + "learning_rate": 0.00039893259097737474, + "loss": 0.6466, + "step": 4450 + }, + { + "epoch": 0.09926994301994302, + "grad_norm": 2.0138823986053467, + "learning_rate": 0.00039892777520454746, + "loss": 0.6594, + "step": 4460 + }, + { + "epoch": 0.09949252136752136, + "grad_norm": 0.5950619578361511, + "learning_rate": 0.00039892294862182195, + "loss": 0.727, + "step": 4470 + }, + { + "epoch": 0.09971509971509972, + "grad_norm": 0.7700321078300476, + "learning_rate": 0.0003989181112294606, + "loss": 0.6564, + "step": 4480 + }, + { + "epoch": 0.09993767806267806, + "grad_norm": 0.7688900232315063, + "learning_rate": 0.0003989132630277263, + "loss": 0.5971, + "step": 4490 + }, + { + "epoch": 0.10016025641025642, + "grad_norm": 0.8038826584815979, + "learning_rate": 0.0003989084040168824, + "loss": 0.5495, + "step": 4500 + }, + { + "epoch": 0.10038283475783476, + "grad_norm": 0.7080345749855042, + "learning_rate": 0.000398903534197193, + "loss": 0.6569, + "step": 4510 + }, + { + "epoch": 0.1006054131054131, + "grad_norm": 0.5524005889892578, + "learning_rate": 0.0003988986535689227, + "loss": 0.5265, + "step": 4520 + }, + { + "epoch": 0.10082799145299146, + "grad_norm": 1.074703335762024, + "learning_rate": 0.0003988937621323368, + "loss": 0.5926, + "step": 4530 + }, + { + "epoch": 0.1010505698005698, + "grad_norm": 0.5182725787162781, + "learning_rate": 0.000398888859887701, + "loss": 0.5199, + "step": 4540 + }, + { + "epoch": 0.10127314814814815, + "grad_norm": 0.6152669787406921, + "learning_rate": 0.00039888394683528167, + "loss": 0.7744, + "step": 4550 + }, + { + "epoch": 0.1014957264957265, + "grad_norm": 0.7104138731956482, + "learning_rate": 0.00039887902297534595, + "loss": 0.6647, + "step": 4560 + }, + { + "epoch": 0.10171830484330484, + "grad_norm": 0.8174471855163574, + "learning_rate": 0.0003988740883081613, + "loss": 0.7876, + "step": 4570 + }, + { + "epoch": 0.1019408831908832, + "grad_norm": 0.743220329284668, + "learning_rate": 0.00039886914283399587, + "loss": 0.8068, + "step": 4580 + }, + { + "epoch": 0.10216346153846154, + "grad_norm": 0.3491692543029785, + "learning_rate": 0.0003988641865531184, + "loss": 0.5628, + "step": 4590 + }, + { + "epoch": 0.10238603988603989, + "grad_norm": 0.9163690805435181, + "learning_rate": 0.0003988592194657982, + "loss": 0.6775, + "step": 4600 + }, + { + "epoch": 0.10260861823361823, + "grad_norm": 0.27448055148124695, + "learning_rate": 0.0003988542415723053, + "loss": 0.5396, + "step": 4610 + }, + { + "epoch": 0.10283119658119658, + "grad_norm": 0.5256446003913879, + "learning_rate": 0.0003988492528729101, + "loss": 0.5133, + "step": 4620 + }, + { + "epoch": 0.10305377492877493, + "grad_norm": 0.5656632781028748, + "learning_rate": 0.00039884425336788367, + "loss": 0.5857, + "step": 4630 + }, + { + "epoch": 0.10327635327635327, + "grad_norm": 0.6648633480072021, + "learning_rate": 0.0003988392430574978, + "loss": 0.6029, + "step": 4640 + }, + { + "epoch": 0.10349893162393162, + "grad_norm": 0.49245837330818176, + "learning_rate": 0.00039883422194202464, + "loss": 0.6558, + "step": 4650 + }, + { + "epoch": 0.10372150997150997, + "grad_norm": 0.6503485441207886, + "learning_rate": 0.00039882919002173713, + "loss": 0.6207, + "step": 4660 + }, + { + "epoch": 0.10394408831908832, + "grad_norm": 0.5436996817588806, + "learning_rate": 0.0003988241472969086, + "loss": 0.7351, + "step": 4670 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 0.8330632448196411, + "learning_rate": 0.0003988190937678132, + "loss": 0.6843, + "step": 4680 + }, + { + "epoch": 0.10438924501424501, + "grad_norm": 0.7099012136459351, + "learning_rate": 0.00039881402943472543, + "loss": 0.624, + "step": 4690 + }, + { + "epoch": 0.10461182336182336, + "grad_norm": 0.47862574458122253, + "learning_rate": 0.0003988089542979206, + "loss": 0.6694, + "step": 4700 + }, + { + "epoch": 0.10483440170940171, + "grad_norm": 0.5711633563041687, + "learning_rate": 0.0003988038683576744, + "loss": 0.5787, + "step": 4710 + }, + { + "epoch": 0.10505698005698005, + "grad_norm": 0.8468604683876038, + "learning_rate": 0.0003987987716142632, + "loss": 0.5867, + "step": 4720 + }, + { + "epoch": 0.10527955840455841, + "grad_norm": 0.45642709732055664, + "learning_rate": 0.0003987936640679641, + "loss": 0.5533, + "step": 4730 + }, + { + "epoch": 0.10550213675213675, + "grad_norm": 0.847876250743866, + "learning_rate": 0.00039878854571905454, + "loss": 0.732, + "step": 4740 + }, + { + "epoch": 0.1057247150997151, + "grad_norm": 0.690815269947052, + "learning_rate": 0.0003987834165678126, + "loss": 0.6412, + "step": 4750 + }, + { + "epoch": 0.10594729344729345, + "grad_norm": 0.6536726355552673, + "learning_rate": 0.0003987782766145172, + "loss": 0.5655, + "step": 4760 + }, + { + "epoch": 0.10616987179487179, + "grad_norm": 0.660818874835968, + "learning_rate": 0.00039877312585944743, + "loss": 0.5416, + "step": 4770 + }, + { + "epoch": 0.10639245014245015, + "grad_norm": 0.4305530786514282, + "learning_rate": 0.0003987679643028832, + "loss": 0.6333, + "step": 4780 + }, + { + "epoch": 0.10661502849002849, + "grad_norm": 0.7705379128456116, + "learning_rate": 0.00039876279194510524, + "loss": 0.6503, + "step": 4790 + }, + { + "epoch": 0.10683760683760683, + "grad_norm": 0.5306333303451538, + "learning_rate": 0.00039875760878639436, + "loss": 0.5507, + "step": 4800 + }, + { + "epoch": 0.10706018518518519, + "grad_norm": 0.7447746396064758, + "learning_rate": 0.0003987524148270323, + "loss": 0.6574, + "step": 4810 + }, + { + "epoch": 0.10728276353276353, + "grad_norm": 0.652462363243103, + "learning_rate": 0.0003987472100673013, + "loss": 0.6528, + "step": 4820 + }, + { + "epoch": 0.10750534188034189, + "grad_norm": 0.6787484288215637, + "learning_rate": 0.00039874199450748427, + "loss": 0.5111, + "step": 4830 + }, + { + "epoch": 0.10772792022792023, + "grad_norm": 0.3653728663921356, + "learning_rate": 0.0003987367681478645, + "loss": 0.6208, + "step": 4840 + }, + { + "epoch": 0.10795049857549857, + "grad_norm": 0.5167881846427917, + "learning_rate": 0.00039873153098872607, + "loss": 0.725, + "step": 4850 + }, + { + "epoch": 0.10817307692307693, + "grad_norm": 0.45200806856155396, + "learning_rate": 0.00039872628303035357, + "loss": 0.6057, + "step": 4860 + }, + { + "epoch": 0.10839565527065527, + "grad_norm": 0.7915605306625366, + "learning_rate": 0.00039872102427303214, + "loss": 0.5791, + "step": 4870 + }, + { + "epoch": 0.10861823361823361, + "grad_norm": 0.7434691190719604, + "learning_rate": 0.0003987157547170476, + "loss": 0.7281, + "step": 4880 + }, + { + "epoch": 0.10884081196581197, + "grad_norm": 0.6410295963287354, + "learning_rate": 0.00039871047436268627, + "loss": 0.6529, + "step": 4890 + }, + { + "epoch": 0.10906339031339031, + "grad_norm": 0.655968964099884, + "learning_rate": 0.0003987051832102351, + "loss": 0.6572, + "step": 4900 + }, + { + "epoch": 0.10928596866096867, + "grad_norm": 0.5038496851921082, + "learning_rate": 0.0003986998812599816, + "loss": 0.5462, + "step": 4910 + }, + { + "epoch": 0.10950854700854701, + "grad_norm": 0.7149390578269958, + "learning_rate": 0.00039869456851221387, + "loss": 0.6658, + "step": 4920 + }, + { + "epoch": 0.10973112535612535, + "grad_norm": 0.5790989398956299, + "learning_rate": 0.00039868924496722064, + "loss": 0.6599, + "step": 4930 + }, + { + "epoch": 0.1099537037037037, + "grad_norm": 0.6562433838844299, + "learning_rate": 0.0003986839106252912, + "loss": 0.6651, + "step": 4940 + }, + { + "epoch": 0.11017628205128205, + "grad_norm": 0.6387081146240234, + "learning_rate": 0.00039867856548671536, + "loss": 0.7696, + "step": 4950 + }, + { + "epoch": 0.1103988603988604, + "grad_norm": 0.7654621005058289, + "learning_rate": 0.00039867320955178364, + "loss": 0.7165, + "step": 4960 + }, + { + "epoch": 0.11062143874643875, + "grad_norm": 0.49957525730133057, + "learning_rate": 0.00039866784282078713, + "loss": 0.6508, + "step": 4970 + }, + { + "epoch": 0.11084401709401709, + "grad_norm": 0.6308586001396179, + "learning_rate": 0.00039866246529401733, + "loss": 0.5636, + "step": 4980 + }, + { + "epoch": 0.11106659544159544, + "grad_norm": 0.6661836504936218, + "learning_rate": 0.0003986570769717666, + "loss": 0.5475, + "step": 4990 + }, + { + "epoch": 0.11128917378917379, + "grad_norm": 0.5627828240394592, + "learning_rate": 0.0003986516778543276, + "loss": 0.6255, + "step": 5000 + }, + { + "epoch": 0.11151175213675214, + "grad_norm": 0.7126817107200623, + "learning_rate": 0.00039864626794199385, + "loss": 0.5624, + "step": 5010 + }, + { + "epoch": 0.11173433048433049, + "grad_norm": 0.7044990658760071, + "learning_rate": 0.00039864084723505925, + "loss": 0.6114, + "step": 5020 + }, + { + "epoch": 0.11195690883190883, + "grad_norm": 0.7257621884346008, + "learning_rate": 0.00039863541573381846, + "loss": 0.6342, + "step": 5030 + }, + { + "epoch": 0.11217948717948718, + "grad_norm": 0.6506055593490601, + "learning_rate": 0.0003986299734385665, + "loss": 0.6685, + "step": 5040 + }, + { + "epoch": 0.11240206552706553, + "grad_norm": 0.7877110838890076, + "learning_rate": 0.0003986245203495992, + "loss": 0.5746, + "step": 5050 + }, + { + "epoch": 0.11262464387464387, + "grad_norm": 0.6759909391403198, + "learning_rate": 0.0003986190564672129, + "loss": 0.6528, + "step": 5060 + }, + { + "epoch": 0.11284722222222222, + "grad_norm": 0.5384511351585388, + "learning_rate": 0.00039861358179170447, + "loss": 0.5897, + "step": 5070 + }, + { + "epoch": 0.11306980056980057, + "grad_norm": 0.6556115746498108, + "learning_rate": 0.0003986080963233714, + "loss": 0.636, + "step": 5080 + }, + { + "epoch": 0.11329237891737892, + "grad_norm": 0.5106657147407532, + "learning_rate": 0.00039860260006251174, + "loss": 0.5044, + "step": 5090 + }, + { + "epoch": 0.11351495726495726, + "grad_norm": 0.6946271657943726, + "learning_rate": 0.0003985970930094242, + "loss": 0.496, + "step": 5100 + }, + { + "epoch": 0.1137375356125356, + "grad_norm": 0.7798992395401001, + "learning_rate": 0.00039859157516440813, + "loss": 0.6639, + "step": 5110 + }, + { + "epoch": 0.11396011396011396, + "grad_norm": 0.5560083389282227, + "learning_rate": 0.00039858604652776323, + "loss": 0.644, + "step": 5120 + }, + { + "epoch": 0.1141826923076923, + "grad_norm": 0.8663046360015869, + "learning_rate": 0.00039858050709979, + "loss": 0.6779, + "step": 5130 + }, + { + "epoch": 0.11440527065527066, + "grad_norm": 0.6664817333221436, + "learning_rate": 0.00039857495688078946, + "loss": 0.6587, + "step": 5140 + }, + { + "epoch": 0.114627849002849, + "grad_norm": 0.6127937436103821, + "learning_rate": 0.00039856939587106324, + "loss": 0.5973, + "step": 5150 + }, + { + "epoch": 0.11485042735042734, + "grad_norm": 0.4794783294200897, + "learning_rate": 0.00039856382407091345, + "loss": 0.5779, + "step": 5160 + }, + { + "epoch": 0.1150730056980057, + "grad_norm": 0.418834924697876, + "learning_rate": 0.0003985582414806429, + "loss": 0.5309, + "step": 5170 + }, + { + "epoch": 0.11529558404558404, + "grad_norm": 0.6403540968894958, + "learning_rate": 0.00039855264810055493, + "loss": 0.6882, + "step": 5180 + }, + { + "epoch": 0.1155181623931624, + "grad_norm": 0.9398502707481384, + "learning_rate": 0.00039854704393095357, + "loss": 0.5824, + "step": 5190 + }, + { + "epoch": 0.11574074074074074, + "grad_norm": 0.6912689208984375, + "learning_rate": 0.0003985414289721433, + "loss": 0.6691, + "step": 5200 + }, + { + "epoch": 0.11596331908831908, + "grad_norm": 0.33099091053009033, + "learning_rate": 0.00039853580322442923, + "loss": 0.5808, + "step": 5210 + }, + { + "epoch": 0.11618589743589744, + "grad_norm": 0.48186391592025757, + "learning_rate": 0.00039853016668811716, + "loss": 0.8117, + "step": 5220 + }, + { + "epoch": 0.11640847578347578, + "grad_norm": 0.46495237946510315, + "learning_rate": 0.0003985245193635132, + "loss": 0.5786, + "step": 5230 + }, + { + "epoch": 0.11663105413105414, + "grad_norm": 0.7267534136772156, + "learning_rate": 0.0003985188612509244, + "loss": 0.5912, + "step": 5240 + }, + { + "epoch": 0.11685363247863248, + "grad_norm": 0.4775184988975525, + "learning_rate": 0.00039851319235065816, + "loss": 0.6497, + "step": 5250 + }, + { + "epoch": 0.11707621082621082, + "grad_norm": 0.9448670148849487, + "learning_rate": 0.00039850751266302253, + "loss": 0.6227, + "step": 5260 + }, + { + "epoch": 0.11729878917378918, + "grad_norm": 0.785839855670929, + "learning_rate": 0.00039850182218832615, + "loss": 0.6165, + "step": 5270 + }, + { + "epoch": 0.11752136752136752, + "grad_norm": 0.4065621495246887, + "learning_rate": 0.00039849612092687824, + "loss": 0.744, + "step": 5280 + }, + { + "epoch": 0.11774394586894586, + "grad_norm": 0.9685630798339844, + "learning_rate": 0.00039849040887898865, + "loss": 0.5659, + "step": 5290 + }, + { + "epoch": 0.11796652421652422, + "grad_norm": 0.5907897353172302, + "learning_rate": 0.0003984846860449677, + "loss": 0.6881, + "step": 5300 + }, + { + "epoch": 0.11818910256410256, + "grad_norm": 0.727581799030304, + "learning_rate": 0.0003984789524251265, + "loss": 0.6058, + "step": 5310 + }, + { + "epoch": 0.11841168091168092, + "grad_norm": 0.6126496195793152, + "learning_rate": 0.00039847320801977647, + "loss": 0.7483, + "step": 5320 + }, + { + "epoch": 0.11863425925925926, + "grad_norm": 1.0143053531646729, + "learning_rate": 0.0003984674528292299, + "loss": 0.53, + "step": 5330 + }, + { + "epoch": 0.1188568376068376, + "grad_norm": 0.6108867526054382, + "learning_rate": 0.00039846168685379944, + "loss": 0.6549, + "step": 5340 + }, + { + "epoch": 0.11907941595441596, + "grad_norm": 0.5987547636032104, + "learning_rate": 0.0003984559100937984, + "loss": 0.5832, + "step": 5350 + }, + { + "epoch": 0.1193019943019943, + "grad_norm": 0.7370131015777588, + "learning_rate": 0.00039845012254954084, + "loss": 0.5923, + "step": 5360 + }, + { + "epoch": 0.11952457264957266, + "grad_norm": 0.6291268467903137, + "learning_rate": 0.0003984443242213411, + "loss": 0.6125, + "step": 5370 + }, + { + "epoch": 0.119747150997151, + "grad_norm": 0.5933393239974976, + "learning_rate": 0.0003984385151095143, + "loss": 0.8159, + "step": 5380 + }, + { + "epoch": 0.11996972934472934, + "grad_norm": 0.833824098110199, + "learning_rate": 0.0003984326952143762, + "loss": 0.6775, + "step": 5390 + }, + { + "epoch": 0.12001424501424501, + "eval_loss": 0.6372599005699158, + "eval_runtime": 337.3844, + "eval_samples_per_second": 7.01, + "eval_steps_per_second": 7.01, + "step": 5392 + }, + { + "epoch": 0.1201923076923077, + "grad_norm": 0.7204414010047913, + "learning_rate": 0.00039842686453624295, + "loss": 0.5655, + "step": 5400 + }, + { + "epoch": 0.12041488603988604, + "grad_norm": 0.7585781812667847, + "learning_rate": 0.0003984210230754315, + "loss": 0.6104, + "step": 5410 + }, + { + "epoch": 0.1206374643874644, + "grad_norm": 0.5115826725959778, + "learning_rate": 0.00039841517083225915, + "loss": 0.663, + "step": 5420 + }, + { + "epoch": 0.12086004273504274, + "grad_norm": 0.38228660821914673, + "learning_rate": 0.0003984093078070441, + "loss": 0.5706, + "step": 5430 + }, + { + "epoch": 0.12108262108262108, + "grad_norm": 0.38406258821487427, + "learning_rate": 0.00039840343400010476, + "loss": 0.5985, + "step": 5440 + }, + { + "epoch": 0.12130519943019943, + "grad_norm": 0.6619420647621155, + "learning_rate": 0.0003983975494117604, + "loss": 0.6852, + "step": 5450 + }, + { + "epoch": 0.12152777777777778, + "grad_norm": 0.5927149653434753, + "learning_rate": 0.00039839165404233077, + "loss": 0.5684, + "step": 5460 + }, + { + "epoch": 0.12175035612535613, + "grad_norm": 0.7376818656921387, + "learning_rate": 0.00039838574789213626, + "loss": 0.5955, + "step": 5470 + }, + { + "epoch": 0.12197293447293447, + "grad_norm": 0.9106239080429077, + "learning_rate": 0.00039837983096149783, + "loss": 0.7286, + "step": 5480 + }, + { + "epoch": 0.12219551282051282, + "grad_norm": 0.6335828304290771, + "learning_rate": 0.00039837390325073694, + "loss": 0.5584, + "step": 5490 + }, + { + "epoch": 0.12241809116809117, + "grad_norm": 0.5210475921630859, + "learning_rate": 0.0003983679647601758, + "loss": 0.6321, + "step": 5500 + }, + { + "epoch": 0.12264066951566951, + "grad_norm": 0.6025746464729309, + "learning_rate": 0.00039836201549013704, + "loss": 0.5565, + "step": 5510 + }, + { + "epoch": 0.12286324786324786, + "grad_norm": 0.570562481880188, + "learning_rate": 0.00039835605544094393, + "loss": 0.7184, + "step": 5520 + }, + { + "epoch": 0.12308582621082621, + "grad_norm": 0.4416629374027252, + "learning_rate": 0.00039835008461292046, + "loss": 0.6752, + "step": 5530 + }, + { + "epoch": 0.12330840455840456, + "grad_norm": 0.7752945423126221, + "learning_rate": 0.000398344103006391, + "loss": 0.6763, + "step": 5540 + }, + { + "epoch": 0.12353098290598291, + "grad_norm": 0.5153147578239441, + "learning_rate": 0.0003983381106216805, + "loss": 0.7373, + "step": 5550 + }, + { + "epoch": 0.12375356125356125, + "grad_norm": 0.44418540596961975, + "learning_rate": 0.00039833210745911484, + "loss": 0.6644, + "step": 5560 + }, + { + "epoch": 0.1239761396011396, + "grad_norm": 0.5131638646125793, + "learning_rate": 0.00039832609351902006, + "loss": 0.5635, + "step": 5570 + }, + { + "epoch": 0.12419871794871795, + "grad_norm": 0.49686259031295776, + "learning_rate": 0.000398320068801723, + "loss": 0.6164, + "step": 5580 + }, + { + "epoch": 0.1244212962962963, + "grad_norm": 0.9833499789237976, + "learning_rate": 0.00039831403330755103, + "loss": 0.5118, + "step": 5590 + }, + { + "epoch": 0.12464387464387465, + "grad_norm": 0.7739784717559814, + "learning_rate": 0.0003983079870368322, + "loss": 0.7191, + "step": 5600 + }, + { + "epoch": 0.12486645299145299, + "grad_norm": 0.5591177344322205, + "learning_rate": 0.00039830192998989493, + "loss": 0.5886, + "step": 5610 + }, + { + "epoch": 0.12508903133903135, + "grad_norm": 0.7678048610687256, + "learning_rate": 0.0003982958621670685, + "loss": 0.7345, + "step": 5620 + }, + { + "epoch": 0.12531160968660968, + "grad_norm": 0.6148435473442078, + "learning_rate": 0.0003982897835686825, + "loss": 0.803, + "step": 5630 + }, + { + "epoch": 0.12553418803418803, + "grad_norm": 0.5740734338760376, + "learning_rate": 0.00039828369419506746, + "loss": 0.6542, + "step": 5640 + }, + { + "epoch": 0.1257567663817664, + "grad_norm": 0.5886821746826172, + "learning_rate": 0.000398277594046554, + "loss": 0.6559, + "step": 5650 + }, + { + "epoch": 0.12597934472934472, + "grad_norm": 0.5730891823768616, + "learning_rate": 0.00039827148312347396, + "loss": 0.8286, + "step": 5660 + }, + { + "epoch": 0.12620192307692307, + "grad_norm": 0.5327824950218201, + "learning_rate": 0.0003982653614261591, + "loss": 0.5672, + "step": 5670 + }, + { + "epoch": 0.12642450142450143, + "grad_norm": 0.6959152221679688, + "learning_rate": 0.0003982592289549422, + "loss": 0.4987, + "step": 5680 + }, + { + "epoch": 0.12664707977207978, + "grad_norm": 0.6996831893920898, + "learning_rate": 0.00039825308571015647, + "loss": 0.6099, + "step": 5690 + }, + { + "epoch": 0.1268696581196581, + "grad_norm": 0.6751108169555664, + "learning_rate": 0.0003982469316921358, + "loss": 0.749, + "step": 5700 + }, + { + "epoch": 0.12709223646723647, + "grad_norm": 0.742348849773407, + "learning_rate": 0.0003982407669012146, + "loss": 0.6497, + "step": 5710 + }, + { + "epoch": 0.12731481481481483, + "grad_norm": 0.5918314456939697, + "learning_rate": 0.0003982345913377278, + "loss": 0.5652, + "step": 5720 + }, + { + "epoch": 0.12753739316239315, + "grad_norm": 0.5717595815658569, + "learning_rate": 0.000398228405002011, + "loss": 0.6369, + "step": 5730 + }, + { + "epoch": 0.1277599715099715, + "grad_norm": 1.0393725633621216, + "learning_rate": 0.0003982222078944005, + "loss": 0.6695, + "step": 5740 + }, + { + "epoch": 0.12798254985754987, + "grad_norm": 0.9899468421936035, + "learning_rate": 0.00039821600001523283, + "loss": 0.7561, + "step": 5750 + }, + { + "epoch": 0.1282051282051282, + "grad_norm": 0.43348416686058044, + "learning_rate": 0.0003982097813648455, + "loss": 0.6485, + "step": 5760 + }, + { + "epoch": 0.12842770655270655, + "grad_norm": 0.5063757300376892, + "learning_rate": 0.00039820355194357637, + "loss": 0.5846, + "step": 5770 + }, + { + "epoch": 0.1286502849002849, + "grad_norm": 0.6151053309440613, + "learning_rate": 0.00039819731175176403, + "loss": 0.6263, + "step": 5780 + }, + { + "epoch": 0.12887286324786323, + "grad_norm": 0.5029515027999878, + "learning_rate": 0.00039819106078974747, + "loss": 0.5724, + "step": 5790 + }, + { + "epoch": 0.1290954415954416, + "grad_norm": 0.6326857805252075, + "learning_rate": 0.00039818479905786636, + "loss": 0.7097, + "step": 5800 + }, + { + "epoch": 0.12931801994301995, + "grad_norm": 0.5461398959159851, + "learning_rate": 0.00039817852655646115, + "loss": 0.6849, + "step": 5810 + }, + { + "epoch": 0.1295405982905983, + "grad_norm": 0.579310417175293, + "learning_rate": 0.0003981722432858725, + "loss": 0.6139, + "step": 5820 + }, + { + "epoch": 0.12976317663817663, + "grad_norm": 0.8785983324050903, + "learning_rate": 0.00039816594924644194, + "loss": 0.6478, + "step": 5830 + }, + { + "epoch": 0.129985754985755, + "grad_norm": 0.8515345454216003, + "learning_rate": 0.00039815964443851143, + "loss": 0.5224, + "step": 5840 + }, + { + "epoch": 0.13020833333333334, + "grad_norm": 0.6978330612182617, + "learning_rate": 0.00039815332886242367, + "loss": 0.6621, + "step": 5850 + }, + { + "epoch": 0.13043091168091167, + "grad_norm": 0.9195356965065002, + "learning_rate": 0.0003981470025185218, + "loss": 0.6799, + "step": 5860 + }, + { + "epoch": 0.13065349002849003, + "grad_norm": 0.7251855134963989, + "learning_rate": 0.0003981406654071496, + "loss": 0.6915, + "step": 5870 + }, + { + "epoch": 0.13087606837606838, + "grad_norm": 0.73192298412323, + "learning_rate": 0.00039813431752865145, + "loss": 0.625, + "step": 5880 + }, + { + "epoch": 0.1310986467236467, + "grad_norm": 0.8073905110359192, + "learning_rate": 0.00039812795888337225, + "loss": 0.5896, + "step": 5890 + }, + { + "epoch": 0.13132122507122507, + "grad_norm": 0.46170732378959656, + "learning_rate": 0.00039812158947165755, + "loss": 0.6374, + "step": 5900 + }, + { + "epoch": 0.13154380341880342, + "grad_norm": 0.790496826171875, + "learning_rate": 0.0003981152092938535, + "loss": 0.6458, + "step": 5910 + }, + { + "epoch": 0.13176638176638178, + "grad_norm": 0.9606544971466064, + "learning_rate": 0.0003981088183503069, + "loss": 0.824, + "step": 5920 + }, + { + "epoch": 0.1319889601139601, + "grad_norm": 0.5605319738388062, + "learning_rate": 0.0003981024166413648, + "loss": 0.7279, + "step": 5930 + }, + { + "epoch": 0.13221153846153846, + "grad_norm": 0.7565925717353821, + "learning_rate": 0.00039809600416737523, + "loss": 0.5449, + "step": 5940 + }, + { + "epoch": 0.13243411680911682, + "grad_norm": 0.5933641791343689, + "learning_rate": 0.00039808958092868663, + "loss": 0.6217, + "step": 5950 + }, + { + "epoch": 0.13265669515669515, + "grad_norm": 0.6032646894454956, + "learning_rate": 0.00039808314692564806, + "loss": 0.4533, + "step": 5960 + }, + { + "epoch": 0.1328792735042735, + "grad_norm": 0.6120531558990479, + "learning_rate": 0.00039807670215860917, + "loss": 0.6702, + "step": 5970 + }, + { + "epoch": 0.13310185185185186, + "grad_norm": 0.79580157995224, + "learning_rate": 0.00039807024662792, + "loss": 0.7245, + "step": 5980 + }, + { + "epoch": 0.1333244301994302, + "grad_norm": 0.6910545229911804, + "learning_rate": 0.00039806378033393157, + "loss": 0.5987, + "step": 5990 + }, + { + "epoch": 0.13354700854700854, + "grad_norm": 0.38598570227622986, + "learning_rate": 0.0003980573032769952, + "loss": 0.7247, + "step": 6000 + }, + { + "epoch": 0.1337695868945869, + "grad_norm": 0.6384770274162292, + "learning_rate": 0.0003980508154574628, + "loss": 0.6279, + "step": 6010 + }, + { + "epoch": 0.13399216524216523, + "grad_norm": 0.666942298412323, + "learning_rate": 0.00039804431687568694, + "loss": 0.6807, + "step": 6020 + }, + { + "epoch": 0.13421474358974358, + "grad_norm": 0.4904678463935852, + "learning_rate": 0.0003980378075320208, + "loss": 0.6168, + "step": 6030 + }, + { + "epoch": 0.13443732193732194, + "grad_norm": 0.6417827606201172, + "learning_rate": 0.00039803128742681805, + "loss": 0.5758, + "step": 6040 + }, + { + "epoch": 0.1346599002849003, + "grad_norm": 0.7031834125518799, + "learning_rate": 0.00039802475656043303, + "loss": 0.7149, + "step": 6050 + }, + { + "epoch": 0.13488247863247863, + "grad_norm": 0.6842778325080872, + "learning_rate": 0.00039801821493322067, + "loss": 0.6906, + "step": 6060 + }, + { + "epoch": 0.13510505698005698, + "grad_norm": 0.8279538750648499, + "learning_rate": 0.0003980116625455364, + "loss": 0.6726, + "step": 6070 + }, + { + "epoch": 0.13532763532763534, + "grad_norm": 0.5008348226547241, + "learning_rate": 0.00039800509939773624, + "loss": 0.5404, + "step": 6080 + }, + { + "epoch": 0.13555021367521367, + "grad_norm": 0.831046462059021, + "learning_rate": 0.00039799852549017686, + "loss": 0.728, + "step": 6090 + }, + { + "epoch": 0.13577279202279202, + "grad_norm": 0.5071601867675781, + "learning_rate": 0.00039799194082321555, + "loss": 0.5437, + "step": 6100 + }, + { + "epoch": 0.13599537037037038, + "grad_norm": 0.522179365158081, + "learning_rate": 0.00039798534539721013, + "loss": 0.678, + "step": 6110 + }, + { + "epoch": 0.1362179487179487, + "grad_norm": 0.5688888430595398, + "learning_rate": 0.00039797873921251895, + "loss": 0.4982, + "step": 6120 + }, + { + "epoch": 0.13644052706552706, + "grad_norm": 0.4896734654903412, + "learning_rate": 0.00039797212226950097, + "loss": 0.6654, + "step": 6130 + }, + { + "epoch": 0.13666310541310542, + "grad_norm": 0.6670453548431396, + "learning_rate": 0.0003979654945685158, + "loss": 0.7016, + "step": 6140 + }, + { + "epoch": 0.13688568376068377, + "grad_norm": 0.4609311819076538, + "learning_rate": 0.00039795885610992364, + "loss": 0.6884, + "step": 6150 + }, + { + "epoch": 0.1371082621082621, + "grad_norm": 0.6140702366828918, + "learning_rate": 0.00039795220689408517, + "loss": 0.7774, + "step": 6160 + }, + { + "epoch": 0.13733084045584046, + "grad_norm": 0.613925576210022, + "learning_rate": 0.00039794554692136174, + "loss": 0.4291, + "step": 6170 + }, + { + "epoch": 0.13755341880341881, + "grad_norm": 0.5260471105575562, + "learning_rate": 0.00039793887619211525, + "loss": 0.569, + "step": 6180 + }, + { + "epoch": 0.13777599715099714, + "grad_norm": 0.6670699119567871, + "learning_rate": 0.0003979321947067081, + "loss": 0.5147, + "step": 6190 + }, + { + "epoch": 0.1379985754985755, + "grad_norm": 0.6594804525375366, + "learning_rate": 0.00039792550246550354, + "loss": 0.5451, + "step": 6200 + }, + { + "epoch": 0.13822115384615385, + "grad_norm": 0.8946701884269714, + "learning_rate": 0.0003979187994688651, + "loss": 0.7319, + "step": 6210 + }, + { + "epoch": 0.13844373219373218, + "grad_norm": 0.5910085439682007, + "learning_rate": 0.00039791208571715705, + "loss": 0.6866, + "step": 6220 + }, + { + "epoch": 0.13866631054131054, + "grad_norm": 0.6339130401611328, + "learning_rate": 0.00039790536121074436, + "loss": 0.622, + "step": 6230 + }, + { + "epoch": 0.1388888888888889, + "grad_norm": 0.8916711807250977, + "learning_rate": 0.0003978986259499922, + "loss": 0.6254, + "step": 6240 + }, + { + "epoch": 0.13911146723646722, + "grad_norm": 0.4093226492404938, + "learning_rate": 0.0003978918799352668, + "loss": 0.572, + "step": 6250 + }, + { + "epoch": 0.13933404558404558, + "grad_norm": 0.6929894089698792, + "learning_rate": 0.0003978851231669346, + "loss": 0.5564, + "step": 6260 + }, + { + "epoch": 0.13955662393162394, + "grad_norm": 0.6963203549385071, + "learning_rate": 0.00039787835564536277, + "loss": 0.6477, + "step": 6270 + }, + { + "epoch": 0.1397792022792023, + "grad_norm": 0.5985345840454102, + "learning_rate": 0.00039787157737091914, + "loss": 0.6064, + "step": 6280 + }, + { + "epoch": 0.14000178062678062, + "grad_norm": 0.5991149544715881, + "learning_rate": 0.000397864788343972, + "loss": 0.6235, + "step": 6290 + }, + { + "epoch": 0.14022435897435898, + "grad_norm": 0.6351196765899658, + "learning_rate": 0.00039785798856489026, + "loss": 0.6082, + "step": 6300 + }, + { + "epoch": 0.14044693732193733, + "grad_norm": 0.4941553473472595, + "learning_rate": 0.0003978511780340435, + "loss": 0.7004, + "step": 6310 + }, + { + "epoch": 0.14066951566951566, + "grad_norm": 0.5571572780609131, + "learning_rate": 0.0003978443567518017, + "loss": 0.708, + "step": 6320 + }, + { + "epoch": 0.14089209401709402, + "grad_norm": 0.6778060793876648, + "learning_rate": 0.00039783752471853566, + "loss": 0.6762, + "step": 6330 + }, + { + "epoch": 0.14111467236467237, + "grad_norm": 0.7342366576194763, + "learning_rate": 0.00039783068193461653, + "loss": 0.6335, + "step": 6340 + }, + { + "epoch": 0.1413372507122507, + "grad_norm": 0.7785418629646301, + "learning_rate": 0.0003978238284004162, + "loss": 0.6961, + "step": 6350 + }, + { + "epoch": 0.14155982905982906, + "grad_norm": 0.5320806503295898, + "learning_rate": 0.00039781696411630714, + "loss": 0.5599, + "step": 6360 + }, + { + "epoch": 0.1417824074074074, + "grad_norm": 0.6191650032997131, + "learning_rate": 0.0003978100890826622, + "loss": 0.694, + "step": 6370 + }, + { + "epoch": 0.14200498575498577, + "grad_norm": 0.6701613068580627, + "learning_rate": 0.00039780320329985515, + "loss": 0.6344, + "step": 6380 + }, + { + "epoch": 0.1422275641025641, + "grad_norm": 0.529349684715271, + "learning_rate": 0.0003977963067682601, + "loss": 0.586, + "step": 6390 + }, + { + "epoch": 0.14245014245014245, + "grad_norm": 0.7802019119262695, + "learning_rate": 0.00039778939948825184, + "loss": 0.7415, + "step": 6400 + }, + { + "epoch": 0.1426727207977208, + "grad_norm": 0.6360093355178833, + "learning_rate": 0.00039778248146020564, + "loss": 0.5822, + "step": 6410 + }, + { + "epoch": 0.14289529914529914, + "grad_norm": 0.829375684261322, + "learning_rate": 0.0003977755526844975, + "loss": 0.6785, + "step": 6420 + }, + { + "epoch": 0.1431178774928775, + "grad_norm": 0.664879560470581, + "learning_rate": 0.00039776861316150394, + "loss": 0.6984, + "step": 6430 + }, + { + "epoch": 0.14334045584045585, + "grad_norm": 0.47659850120544434, + "learning_rate": 0.000397761662891602, + "loss": 0.5464, + "step": 6440 + }, + { + "epoch": 0.14356303418803418, + "grad_norm": 0.7773748636245728, + "learning_rate": 0.0003977547018751695, + "loss": 0.6879, + "step": 6450 + }, + { + "epoch": 0.14378561253561253, + "grad_norm": 0.6674002408981323, + "learning_rate": 0.0003977477301125845, + "loss": 0.6609, + "step": 6460 + }, + { + "epoch": 0.1440081908831909, + "grad_norm": 0.7954365611076355, + "learning_rate": 0.000397740747604226, + "loss": 0.6521, + "step": 6470 + }, + { + "epoch": 0.14423076923076922, + "grad_norm": 0.929842472076416, + "learning_rate": 0.0003977337543504734, + "loss": 0.6339, + "step": 6480 + }, + { + "epoch": 0.14445334757834757, + "grad_norm": 0.5389747023582458, + "learning_rate": 0.0003977267503517067, + "loss": 0.6441, + "step": 6490 + }, + { + "epoch": 0.14467592592592593, + "grad_norm": 0.5743294358253479, + "learning_rate": 0.00039771973560830657, + "loss": 0.656, + "step": 6500 + }, + { + "epoch": 0.14489850427350429, + "grad_norm": 0.7883462905883789, + "learning_rate": 0.00039771271012065416, + "loss": 0.6832, + "step": 6510 + }, + { + "epoch": 0.14512108262108261, + "grad_norm": 0.8884845972061157, + "learning_rate": 0.0003977056738891311, + "loss": 0.5839, + "step": 6520 + }, + { + "epoch": 0.14534366096866097, + "grad_norm": 0.6745997667312622, + "learning_rate": 0.00039769862691412, + "loss": 0.7437, + "step": 6530 + }, + { + "epoch": 0.14556623931623933, + "grad_norm": 0.4044835865497589, + "learning_rate": 0.00039769156919600363, + "loss": 0.6029, + "step": 6540 + }, + { + "epoch": 0.14578881766381765, + "grad_norm": 0.6320715546607971, + "learning_rate": 0.00039768450073516555, + "loss": 0.645, + "step": 6550 + }, + { + "epoch": 0.146011396011396, + "grad_norm": 0.52195805311203, + "learning_rate": 0.00039767742153198985, + "loss": 0.6134, + "step": 6560 + }, + { + "epoch": 0.14623397435897437, + "grad_norm": 0.6805070638656616, + "learning_rate": 0.00039767033158686125, + "loss": 0.5271, + "step": 6570 + }, + { + "epoch": 0.1464565527065527, + "grad_norm": 0.9781261086463928, + "learning_rate": 0.00039766323090016496, + "loss": 0.617, + "step": 6580 + }, + { + "epoch": 0.14667913105413105, + "grad_norm": 0.7117429971694946, + "learning_rate": 0.00039765611947228696, + "loss": 0.6566, + "step": 6590 + }, + { + "epoch": 0.1469017094017094, + "grad_norm": 0.5962918400764465, + "learning_rate": 0.0003976489973036136, + "loss": 0.7107, + "step": 6600 + }, + { + "epoch": 0.14712428774928774, + "grad_norm": 0.6786366701126099, + "learning_rate": 0.00039764186439453193, + "loss": 0.5947, + "step": 6610 + }, + { + "epoch": 0.1473468660968661, + "grad_norm": 0.5934000611305237, + "learning_rate": 0.00039763472074542955, + "loss": 0.6663, + "step": 6620 + }, + { + "epoch": 0.14756944444444445, + "grad_norm": 0.5991849303245544, + "learning_rate": 0.0003976275663566947, + "loss": 0.5502, + "step": 6630 + }, + { + "epoch": 0.1477920227920228, + "grad_norm": 0.6965344548225403, + "learning_rate": 0.000397620401228716, + "loss": 0.7819, + "step": 6640 + }, + { + "epoch": 0.14801460113960113, + "grad_norm": 0.5270944237709045, + "learning_rate": 0.00039761322536188297, + "loss": 0.5685, + "step": 6650 + }, + { + "epoch": 0.1482371794871795, + "grad_norm": 0.7651955485343933, + "learning_rate": 0.0003976060387565855, + "loss": 0.6355, + "step": 6660 + }, + { + "epoch": 0.14845975783475784, + "grad_norm": 0.969373881816864, + "learning_rate": 0.00039759884141321415, + "loss": 0.7148, + "step": 6670 + }, + { + "epoch": 0.14868233618233617, + "grad_norm": 0.5654862523078918, + "learning_rate": 0.00039759163333215997, + "loss": 0.6992, + "step": 6680 + }, + { + "epoch": 0.14890491452991453, + "grad_norm": 0.4438091218471527, + "learning_rate": 0.00039758441451381464, + "loss": 0.5312, + "step": 6690 + }, + { + "epoch": 0.14912749287749288, + "grad_norm": 0.4933086633682251, + "learning_rate": 0.0003975771849585705, + "loss": 0.6724, + "step": 6700 + }, + { + "epoch": 0.1493500712250712, + "grad_norm": 0.5377151966094971, + "learning_rate": 0.0003975699446668204, + "loss": 0.51, + "step": 6710 + }, + { + "epoch": 0.14957264957264957, + "grad_norm": 0.6050658822059631, + "learning_rate": 0.00039756269363895775, + "loss": 0.6447, + "step": 6720 + }, + { + "epoch": 0.14979522792022792, + "grad_norm": 0.46284544467926025, + "learning_rate": 0.00039755543187537667, + "loss": 0.5348, + "step": 6730 + }, + { + "epoch": 0.15001780626780628, + "grad_norm": 0.5097401142120361, + "learning_rate": 0.0003975481593764716, + "loss": 0.5738, + "step": 6740 + }, + { + "epoch": 0.1502403846153846, + "grad_norm": 0.3497709631919861, + "learning_rate": 0.00039754087614263787, + "loss": 0.6685, + "step": 6750 + }, + { + "epoch": 0.15046296296296297, + "grad_norm": 0.7363093495368958, + "learning_rate": 0.00039753358217427124, + "loss": 0.5343, + "step": 6760 + }, + { + "epoch": 0.15068554131054132, + "grad_norm": 0.7607445120811462, + "learning_rate": 0.000397526277471768, + "loss": 0.6038, + "step": 6770 + }, + { + "epoch": 0.15090811965811965, + "grad_norm": 0.6368987560272217, + "learning_rate": 0.0003975189620355251, + "loss": 0.5928, + "step": 6780 + }, + { + "epoch": 0.151130698005698, + "grad_norm": 0.7133484482765198, + "learning_rate": 0.00039751163586594017, + "loss": 0.6399, + "step": 6790 + }, + { + "epoch": 0.15135327635327636, + "grad_norm": 0.6672192215919495, + "learning_rate": 0.0003975042989634113, + "loss": 0.5147, + "step": 6800 + }, + { + "epoch": 0.1515758547008547, + "grad_norm": 0.597719669342041, + "learning_rate": 0.000397496951328337, + "loss": 0.6102, + "step": 6810 + }, + { + "epoch": 0.15179843304843305, + "grad_norm": 0.4774816334247589, + "learning_rate": 0.00039748959296111684, + "loss": 0.5469, + "step": 6820 + }, + { + "epoch": 0.1520210113960114, + "grad_norm": 0.789928674697876, + "learning_rate": 0.00039748222386215044, + "loss": 0.574, + "step": 6830 + }, + { + "epoch": 0.15224358974358973, + "grad_norm": 0.6176126003265381, + "learning_rate": 0.0003974748440318384, + "loss": 0.6669, + "step": 6840 + }, + { + "epoch": 0.15246616809116809, + "grad_norm": 0.5427154302597046, + "learning_rate": 0.0003974674534705816, + "loss": 0.5147, + "step": 6850 + }, + { + "epoch": 0.15268874643874644, + "grad_norm": 0.6813560724258423, + "learning_rate": 0.00039746005217878173, + "loss": 0.5202, + "step": 6860 + }, + { + "epoch": 0.1529113247863248, + "grad_norm": 0.46288907527923584, + "learning_rate": 0.00039745264015684096, + "loss": 0.6455, + "step": 6870 + }, + { + "epoch": 0.15313390313390313, + "grad_norm": 0.5715744495391846, + "learning_rate": 0.00039744521740516214, + "loss": 0.7415, + "step": 6880 + }, + { + "epoch": 0.15335648148148148, + "grad_norm": 0.7895148396492004, + "learning_rate": 0.0003974377839241486, + "loss": 0.6318, + "step": 6890 + }, + { + "epoch": 0.15357905982905984, + "grad_norm": 0.6893559098243713, + "learning_rate": 0.00039743033971420414, + "loss": 0.7765, + "step": 6900 + }, + { + "epoch": 0.15380163817663817, + "grad_norm": 0.5792664289474487, + "learning_rate": 0.0003974228847757335, + "loss": 0.7086, + "step": 6910 + }, + { + "epoch": 0.15402421652421652, + "grad_norm": 0.7981087565422058, + "learning_rate": 0.0003974154191091416, + "loss": 0.59, + "step": 6920 + }, + { + "epoch": 0.15424679487179488, + "grad_norm": 0.5684230923652649, + "learning_rate": 0.0003974079427148342, + "loss": 0.5112, + "step": 6930 + }, + { + "epoch": 0.1544693732193732, + "grad_norm": 0.6198676824569702, + "learning_rate": 0.0003974004555932177, + "loss": 0.599, + "step": 6940 + }, + { + "epoch": 0.15469195156695156, + "grad_norm": 0.8237385749816895, + "learning_rate": 0.00039739295774469875, + "loss": 0.6301, + "step": 6950 + }, + { + "epoch": 0.15491452991452992, + "grad_norm": 0.8568017482757568, + "learning_rate": 0.00039738544916968494, + "loss": 0.8313, + "step": 6960 + }, + { + "epoch": 0.15513710826210828, + "grad_norm": 0.5193782448768616, + "learning_rate": 0.0003973779298685842, + "loss": 0.6072, + "step": 6970 + }, + { + "epoch": 0.1553596866096866, + "grad_norm": 0.7648767828941345, + "learning_rate": 0.0003973703998418052, + "loss": 0.6464, + "step": 6980 + }, + { + "epoch": 0.15558226495726496, + "grad_norm": 0.49430105090141296, + "learning_rate": 0.0003973628590897571, + "loss": 0.6868, + "step": 6990 + }, + { + "epoch": 0.15580484330484332, + "grad_norm": 0.38553017377853394, + "learning_rate": 0.0003973553076128496, + "loss": 0.6342, + "step": 7000 + }, + { + "epoch": 0.15602742165242164, + "grad_norm": 0.6819826364517212, + "learning_rate": 0.00039734774541149315, + "loss": 0.6189, + "step": 7010 + }, + { + "epoch": 0.15625, + "grad_norm": 1.0453999042510986, + "learning_rate": 0.0003973401724860987, + "loss": 0.6177, + "step": 7020 + }, + { + "epoch": 0.15647257834757836, + "grad_norm": 0.6561777591705322, + "learning_rate": 0.0003973325888370777, + "loss": 0.7275, + "step": 7030 + }, + { + "epoch": 0.15669515669515668, + "grad_norm": 0.574906051158905, + "learning_rate": 0.0003973249944648423, + "loss": 0.5917, + "step": 7040 + }, + { + "epoch": 0.15691773504273504, + "grad_norm": 0.961545467376709, + "learning_rate": 0.0003973173893698051, + "loss": 0.6971, + "step": 7050 + }, + { + "epoch": 0.1571403133903134, + "grad_norm": 0.48292088508605957, + "learning_rate": 0.00039730977355237953, + "loss": 0.7646, + "step": 7060 + }, + { + "epoch": 0.15736289173789172, + "grad_norm": 0.8326444029808044, + "learning_rate": 0.00039730214701297925, + "loss": 0.6865, + "step": 7070 + }, + { + "epoch": 0.15758547008547008, + "grad_norm": 0.6857150197029114, + "learning_rate": 0.0003972945097520188, + "loss": 0.7559, + "step": 7080 + }, + { + "epoch": 0.15780804843304844, + "grad_norm": 0.6443191766738892, + "learning_rate": 0.0003972868617699132, + "loss": 0.5099, + "step": 7090 + }, + { + "epoch": 0.1580306267806268, + "grad_norm": 0.7539528012275696, + "learning_rate": 0.000397279203067078, + "loss": 0.7146, + "step": 7100 + }, + { + "epoch": 0.15825320512820512, + "grad_norm": 0.548685610294342, + "learning_rate": 0.00039727153364392943, + "loss": 0.6138, + "step": 7110 + }, + { + "epoch": 0.15847578347578348, + "grad_norm": 0.7192095518112183, + "learning_rate": 0.0003972638535008842, + "loss": 0.727, + "step": 7120 + }, + { + "epoch": 0.15869836182336183, + "grad_norm": 0.7837377190589905, + "learning_rate": 0.0003972561626383597, + "loss": 0.6666, + "step": 7130 + }, + { + "epoch": 0.15892094017094016, + "grad_norm": 0.6404113173484802, + "learning_rate": 0.00039724846105677387, + "loss": 0.823, + "step": 7140 + }, + { + "epoch": 0.15914351851851852, + "grad_norm": 0.9070250988006592, + "learning_rate": 0.0003972407487565452, + "loss": 0.7189, + "step": 7150 + }, + { + "epoch": 0.15936609686609687, + "grad_norm": 0.5076274871826172, + "learning_rate": 0.0003972330257380927, + "loss": 0.64, + "step": 7160 + }, + { + "epoch": 0.1595886752136752, + "grad_norm": 0.628984272480011, + "learning_rate": 0.00039722529200183614, + "loss": 0.6955, + "step": 7170 + }, + { + "epoch": 0.15981125356125356, + "grad_norm": 0.6998558640480042, + "learning_rate": 0.0003972175475481958, + "loss": 0.6029, + "step": 7180 + }, + { + "epoch": 0.16003383190883191, + "grad_norm": 0.6283873319625854, + "learning_rate": 0.0003972097923775924, + "loss": 0.6558, + "step": 7190 + }, + { + "epoch": 0.16025641025641027, + "grad_norm": 0.4948321282863617, + "learning_rate": 0.0003972020264904475, + "loss": 0.6605, + "step": 7200 + }, + { + "epoch": 0.1604789886039886, + "grad_norm": 0.6738045811653137, + "learning_rate": 0.00039719424988718307, + "loss": 0.6408, + "step": 7210 + }, + { + "epoch": 0.16070156695156695, + "grad_norm": 0.6499155163764954, + "learning_rate": 0.00039718646256822163, + "loss": 0.5679, + "step": 7220 + }, + { + "epoch": 0.1609241452991453, + "grad_norm": 0.8065938949584961, + "learning_rate": 0.0003971786645339864, + "loss": 0.6205, + "step": 7230 + }, + { + "epoch": 0.16114672364672364, + "grad_norm": 0.5457149744033813, + "learning_rate": 0.00039717085578490114, + "loss": 0.5933, + "step": 7240 + }, + { + "epoch": 0.161369301994302, + "grad_norm": 0.6167128682136536, + "learning_rate": 0.0003971630363213901, + "loss": 0.6124, + "step": 7250 + }, + { + "epoch": 0.16159188034188035, + "grad_norm": 1.0074851512908936, + "learning_rate": 0.00039715520614387834, + "loss": 0.5991, + "step": 7260 + }, + { + "epoch": 0.16181445868945868, + "grad_norm": 0.575910747051239, + "learning_rate": 0.0003971473652527912, + "loss": 0.7633, + "step": 7270 + }, + { + "epoch": 0.16203703703703703, + "grad_norm": 0.6274125576019287, + "learning_rate": 0.00039713951364855486, + "loss": 0.5563, + "step": 7280 + }, + { + "epoch": 0.1622596153846154, + "grad_norm": 0.7009009122848511, + "learning_rate": 0.000397131651331596, + "loss": 0.6355, + "step": 7290 + }, + { + "epoch": 0.16248219373219372, + "grad_norm": 0.532028317451477, + "learning_rate": 0.00039712377830234183, + "loss": 0.6231, + "step": 7300 + }, + { + "epoch": 0.16270477207977208, + "grad_norm": 0.7437495589256287, + "learning_rate": 0.0003971158945612201, + "loss": 0.8354, + "step": 7310 + }, + { + "epoch": 0.16292735042735043, + "grad_norm": 0.7639107704162598, + "learning_rate": 0.00039710800010865936, + "loss": 0.6896, + "step": 7320 + }, + { + "epoch": 0.1631499287749288, + "grad_norm": 0.7520061731338501, + "learning_rate": 0.0003971000949450885, + "loss": 0.6361, + "step": 7330 + }, + { + "epoch": 0.16337250712250712, + "grad_norm": 0.9273715615272522, + "learning_rate": 0.00039709217907093715, + "loss": 0.6059, + "step": 7340 + }, + { + "epoch": 0.16359508547008547, + "grad_norm": 0.8598505854606628, + "learning_rate": 0.00039708425248663546, + "loss": 0.6117, + "step": 7350 + }, + { + "epoch": 0.16381766381766383, + "grad_norm": 0.616790235042572, + "learning_rate": 0.00039707631519261415, + "loss": 0.6402, + "step": 7360 + }, + { + "epoch": 0.16404024216524216, + "grad_norm": 0.5139318704605103, + "learning_rate": 0.0003970683671893045, + "loss": 0.5693, + "step": 7370 + }, + { + "epoch": 0.1642628205128205, + "grad_norm": 0.563727617263794, + "learning_rate": 0.0003970604084771385, + "loss": 0.5927, + "step": 7380 + }, + { + "epoch": 0.16448539886039887, + "grad_norm": 0.7112362384796143, + "learning_rate": 0.0003970524390565485, + "loss": 0.5696, + "step": 7390 + }, + { + "epoch": 0.1647079772079772, + "grad_norm": 0.4131312668323517, + "learning_rate": 0.0003970444589279677, + "loss": 0.7855, + "step": 7400 + }, + { + "epoch": 0.16493055555555555, + "grad_norm": 0.8097583055496216, + "learning_rate": 0.0003970364680918297, + "loss": 0.6333, + "step": 7410 + }, + { + "epoch": 0.1651531339031339, + "grad_norm": 0.632159411907196, + "learning_rate": 0.0003970284665485688, + "loss": 0.5816, + "step": 7420 + }, + { + "epoch": 0.16537571225071226, + "grad_norm": 0.5271141529083252, + "learning_rate": 0.00039702045429861955, + "loss": 0.6219, + "step": 7430 + }, + { + "epoch": 0.1655982905982906, + "grad_norm": 0.36876511573791504, + "learning_rate": 0.00039701243134241765, + "loss": 0.6601, + "step": 7440 + }, + { + "epoch": 0.16582086894586895, + "grad_norm": 0.755054771900177, + "learning_rate": 0.0003970043976803989, + "loss": 0.6177, + "step": 7450 + }, + { + "epoch": 0.1660434472934473, + "grad_norm": 0.7070847153663635, + "learning_rate": 0.00039699635331299994, + "loss": 0.6584, + "step": 7460 + }, + { + "epoch": 0.16626602564102563, + "grad_norm": 0.523898720741272, + "learning_rate": 0.00039698829824065784, + "loss": 0.6504, + "step": 7470 + }, + { + "epoch": 0.166488603988604, + "grad_norm": 0.6656180620193481, + "learning_rate": 0.00039698023246381036, + "loss": 0.7437, + "step": 7480 + }, + { + "epoch": 0.16671118233618235, + "grad_norm": 0.8116356730461121, + "learning_rate": 0.0003969721559828958, + "loss": 0.6025, + "step": 7490 + }, + { + "epoch": 0.16693376068376067, + "grad_norm": 0.5214651823043823, + "learning_rate": 0.00039696406879835306, + "loss": 0.5921, + "step": 7500 + }, + { + "epoch": 0.16715633903133903, + "grad_norm": 0.5218111276626587, + "learning_rate": 0.00039695597091062154, + "loss": 0.5214, + "step": 7510 + }, + { + "epoch": 0.16737891737891739, + "grad_norm": 0.838103711605072, + "learning_rate": 0.0003969478623201413, + "loss": 0.6568, + "step": 7520 + }, + { + "epoch": 0.16760149572649571, + "grad_norm": 0.6541166305541992, + "learning_rate": 0.00039693974302735304, + "loss": 0.7354, + "step": 7530 + }, + { + "epoch": 0.16782407407407407, + "grad_norm": 0.6170065999031067, + "learning_rate": 0.0003969316130326979, + "loss": 0.6438, + "step": 7540 + }, + { + "epoch": 0.16804665242165243, + "grad_norm": 0.7294967770576477, + "learning_rate": 0.0003969234723366177, + "loss": 0.687, + "step": 7550 + }, + { + "epoch": 0.16826923076923078, + "grad_norm": 0.8346336483955383, + "learning_rate": 0.00039691532093955484, + "loss": 0.6582, + "step": 7560 + }, + { + "epoch": 0.1684918091168091, + "grad_norm": 0.5497300028800964, + "learning_rate": 0.00039690715884195223, + "loss": 0.7647, + "step": 7570 + }, + { + "epoch": 0.16871438746438747, + "grad_norm": 0.7623947858810425, + "learning_rate": 0.0003968989860442534, + "loss": 0.6478, + "step": 7580 + }, + { + "epoch": 0.16893696581196582, + "grad_norm": 0.8158397078514099, + "learning_rate": 0.0003968908025469024, + "loss": 0.6687, + "step": 7590 + }, + { + "epoch": 0.16915954415954415, + "grad_norm": 0.369629830121994, + "learning_rate": 0.0003968826083503441, + "loss": 0.6755, + "step": 7600 + }, + { + "epoch": 0.1693821225071225, + "grad_norm": 0.5609103441238403, + "learning_rate": 0.00039687440345502364, + "loss": 0.5001, + "step": 7610 + }, + { + "epoch": 0.16960470085470086, + "grad_norm": 0.5148101449012756, + "learning_rate": 0.000396866187861387, + "loss": 0.5391, + "step": 7620 + }, + { + "epoch": 0.1698272792022792, + "grad_norm": 0.47444501519203186, + "learning_rate": 0.0003968579615698805, + "loss": 0.5694, + "step": 7630 + }, + { + "epoch": 0.17004985754985755, + "grad_norm": 0.6512032151222229, + "learning_rate": 0.0003968497245809512, + "loss": 0.5598, + "step": 7640 + }, + { + "epoch": 0.1702724358974359, + "grad_norm": 0.7191357016563416, + "learning_rate": 0.0003968414768950467, + "loss": 0.5568, + "step": 7650 + }, + { + "epoch": 0.17049501424501423, + "grad_norm": 0.8455227017402649, + "learning_rate": 0.00039683321851261526, + "loss": 0.7559, + "step": 7660 + }, + { + "epoch": 0.1707175925925926, + "grad_norm": 0.5608453154563904, + "learning_rate": 0.00039682494943410555, + "loss": 0.7258, + "step": 7670 + }, + { + "epoch": 0.17094017094017094, + "grad_norm": 0.5924692153930664, + "learning_rate": 0.000396816669659967, + "loss": 0.5982, + "step": 7680 + }, + { + "epoch": 0.1711627492877493, + "grad_norm": 0.5104554891586304, + "learning_rate": 0.00039680837919064943, + "loss": 0.646, + "step": 7690 + }, + { + "epoch": 0.17138532763532763, + "grad_norm": 0.6683109998703003, + "learning_rate": 0.0003968000780266035, + "loss": 0.6309, + "step": 7700 + }, + { + "epoch": 0.17160790598290598, + "grad_norm": 0.9296523928642273, + "learning_rate": 0.0003967917661682802, + "loss": 0.5818, + "step": 7710 + }, + { + "epoch": 0.17183048433048434, + "grad_norm": 0.6392726302146912, + "learning_rate": 0.00039678344361613113, + "loss": 0.6847, + "step": 7720 + }, + { + "epoch": 0.17205306267806267, + "grad_norm": 0.35833972692489624, + "learning_rate": 0.0003967751103706088, + "loss": 0.5908, + "step": 7730 + }, + { + "epoch": 0.17227564102564102, + "grad_norm": 0.8805040121078491, + "learning_rate": 0.0003967667664321658, + "loss": 0.7011, + "step": 7740 + }, + { + "epoch": 0.17249821937321938, + "grad_norm": 0.6751018166542053, + "learning_rate": 0.00039675841180125557, + "loss": 0.4629, + "step": 7750 + }, + { + "epoch": 0.1727207977207977, + "grad_norm": 0.5945695638656616, + "learning_rate": 0.00039675004647833227, + "loss": 0.6547, + "step": 7760 + }, + { + "epoch": 0.17294337606837606, + "grad_norm": 1.1874905824661255, + "learning_rate": 0.00039674167046385033, + "loss": 0.6882, + "step": 7770 + }, + { + "epoch": 0.17316595441595442, + "grad_norm": 0.6665182709693909, + "learning_rate": 0.000396733283758265, + "loss": 0.4854, + "step": 7780 + }, + { + "epoch": 0.17338853276353278, + "grad_norm": 0.45658349990844727, + "learning_rate": 0.0003967248863620319, + "loss": 0.6115, + "step": 7790 + }, + { + "epoch": 0.1736111111111111, + "grad_norm": 0.5834914445877075, + "learning_rate": 0.00039671647827560746, + "loss": 0.536, + "step": 7800 + }, + { + "epoch": 0.17383368945868946, + "grad_norm": 0.3922024369239807, + "learning_rate": 0.0003967080594994486, + "loss": 0.4614, + "step": 7810 + }, + { + "epoch": 0.17405626780626782, + "grad_norm": 0.6845150589942932, + "learning_rate": 0.00039669963003401273, + "loss": 0.705, + "step": 7820 + }, + { + "epoch": 0.17427884615384615, + "grad_norm": 0.7234890460968018, + "learning_rate": 0.00039669118987975793, + "loss": 0.6177, + "step": 7830 + }, + { + "epoch": 0.1745014245014245, + "grad_norm": 0.9000455141067505, + "learning_rate": 0.0003966827390371428, + "loss": 0.689, + "step": 7840 + }, + { + "epoch": 0.17472400284900286, + "grad_norm": 0.5849109292030334, + "learning_rate": 0.00039667427750662674, + "loss": 0.6172, + "step": 7850 + }, + { + "epoch": 0.17494658119658119, + "grad_norm": 0.6037909388542175, + "learning_rate": 0.00039666580528866934, + "loss": 0.6353, + "step": 7860 + }, + { + "epoch": 0.17516915954415954, + "grad_norm": 0.4271775782108307, + "learning_rate": 0.0003966573223837311, + "loss": 0.6561, + "step": 7870 + }, + { + "epoch": 0.1753917378917379, + "grad_norm": 0.6348351836204529, + "learning_rate": 0.00039664882879227297, + "loss": 0.5983, + "step": 7880 + }, + { + "epoch": 0.17561431623931623, + "grad_norm": 0.7074285745620728, + "learning_rate": 0.0003966403245147565, + "loss": 0.7175, + "step": 7890 + }, + { + "epoch": 0.17583689458689458, + "grad_norm": 0.7125088572502136, + "learning_rate": 0.00039663180955164387, + "loss": 0.6112, + "step": 7900 + }, + { + "epoch": 0.17605947293447294, + "grad_norm": 0.5250646471977234, + "learning_rate": 0.00039662328390339767, + "loss": 0.6212, + "step": 7910 + }, + { + "epoch": 0.1762820512820513, + "grad_norm": 0.5152850151062012, + "learning_rate": 0.0003966147475704813, + "loss": 0.6286, + "step": 7920 + }, + { + "epoch": 0.17650462962962962, + "grad_norm": 0.674994945526123, + "learning_rate": 0.0003966062005533585, + "loss": 0.5661, + "step": 7930 + }, + { + "epoch": 0.17672720797720798, + "grad_norm": 0.8784959316253662, + "learning_rate": 0.00039659764285249395, + "loss": 0.6632, + "step": 7940 + }, + { + "epoch": 0.17694978632478633, + "grad_norm": 0.6024604439735413, + "learning_rate": 0.00039658907446835247, + "loss": 0.5838, + "step": 7950 + }, + { + "epoch": 0.17717236467236466, + "grad_norm": 0.6517950892448425, + "learning_rate": 0.00039658049540139975, + "loss": 0.6594, + "step": 7960 + }, + { + "epoch": 0.17739494301994302, + "grad_norm": 0.9225330948829651, + "learning_rate": 0.000396571905652102, + "loss": 0.6816, + "step": 7970 + }, + { + "epoch": 0.17761752136752137, + "grad_norm": 0.5469350218772888, + "learning_rate": 0.00039656330522092596, + "loss": 0.5391, + "step": 7980 + }, + { + "epoch": 0.1778400997150997, + "grad_norm": 0.7431405782699585, + "learning_rate": 0.000396554694108339, + "loss": 0.7307, + "step": 7990 + }, + { + "epoch": 0.17806267806267806, + "grad_norm": 0.643224835395813, + "learning_rate": 0.00039654607231480904, + "loss": 0.6326, + "step": 8000 + }, + { + "epoch": 0.17828525641025642, + "grad_norm": 0.6889805197715759, + "learning_rate": 0.0003965374398408047, + "loss": 0.5998, + "step": 8010 + }, + { + "epoch": 0.17850783475783477, + "grad_norm": 0.6159041523933411, + "learning_rate": 0.00039652879668679487, + "loss": 0.6374, + "step": 8020 + }, + { + "epoch": 0.1787304131054131, + "grad_norm": 0.63005131483078, + "learning_rate": 0.0003965201428532494, + "loss": 0.6403, + "step": 8030 + }, + { + "epoch": 0.17895299145299146, + "grad_norm": 0.4250248372554779, + "learning_rate": 0.00039651147834063853, + "loss": 0.5081, + "step": 8040 + }, + { + "epoch": 0.1791755698005698, + "grad_norm": 0.757688045501709, + "learning_rate": 0.00039650280314943294, + "loss": 0.6617, + "step": 8050 + }, + { + "epoch": 0.17939814814814814, + "grad_norm": 0.5579215884208679, + "learning_rate": 0.00039649411728010425, + "loss": 0.5452, + "step": 8060 + }, + { + "epoch": 0.1796207264957265, + "grad_norm": 0.5789703726768494, + "learning_rate": 0.00039648542073312436, + "loss": 0.7948, + "step": 8070 + }, + { + "epoch": 0.17984330484330485, + "grad_norm": 0.5581598281860352, + "learning_rate": 0.0003964767135089658, + "loss": 0.7904, + "step": 8080 + }, + { + "epoch": 0.18002136752136752, + "eval_loss": 0.6378280520439148, + "eval_runtime": 337.3975, + "eval_samples_per_second": 7.01, + "eval_steps_per_second": 7.01, + "step": 8088 + }, + { + "epoch": 0.18006588319088318, + "grad_norm": 0.855660080909729, + "learning_rate": 0.00039646799560810183, + "loss": 0.7713, + "step": 8090 + }, + { + "epoch": 0.18028846153846154, + "grad_norm": 1.1552504301071167, + "learning_rate": 0.00039645926703100613, + "loss": 0.6661, + "step": 8100 + }, + { + "epoch": 0.1805110398860399, + "grad_norm": 0.6898704767227173, + "learning_rate": 0.000396450527778153, + "loss": 0.7366, + "step": 8110 + }, + { + "epoch": 0.18073361823361822, + "grad_norm": 0.9909326434135437, + "learning_rate": 0.0003964417778500175, + "loss": 0.6548, + "step": 8120 + }, + { + "epoch": 0.18095619658119658, + "grad_norm": 0.8328597545623779, + "learning_rate": 0.0003964330172470748, + "loss": 0.537, + "step": 8130 + }, + { + "epoch": 0.18117877492877493, + "grad_norm": 0.696701169013977, + "learning_rate": 0.00039642424596980126, + "loss": 0.8127, + "step": 8140 + }, + { + "epoch": 0.1814013532763533, + "grad_norm": 0.6511324644088745, + "learning_rate": 0.00039641546401867337, + "loss": 0.4931, + "step": 8150 + }, + { + "epoch": 0.18162393162393162, + "grad_norm": 0.5209097266197205, + "learning_rate": 0.00039640667139416837, + "loss": 0.5291, + "step": 8160 + }, + { + "epoch": 0.18184650997150997, + "grad_norm": 0.6797854900360107, + "learning_rate": 0.00039639786809676406, + "loss": 0.584, + "step": 8170 + }, + { + "epoch": 0.18206908831908833, + "grad_norm": 0.5029028058052063, + "learning_rate": 0.0003963890541269388, + "loss": 0.6508, + "step": 8180 + }, + { + "epoch": 0.18229166666666666, + "grad_norm": 0.6294994354248047, + "learning_rate": 0.00039638022948517153, + "loss": 0.5387, + "step": 8190 + }, + { + "epoch": 0.182514245014245, + "grad_norm": 0.7849052548408508, + "learning_rate": 0.0003963713941719419, + "loss": 0.6858, + "step": 8200 + }, + { + "epoch": 0.18273682336182337, + "grad_norm": 0.5021756291389465, + "learning_rate": 0.0003963625481877299, + "loss": 0.6363, + "step": 8210 + }, + { + "epoch": 0.1829594017094017, + "grad_norm": 0.7355058193206787, + "learning_rate": 0.00039635369153301635, + "loss": 0.7881, + "step": 8220 + }, + { + "epoch": 0.18318198005698005, + "grad_norm": 0.6309876441955566, + "learning_rate": 0.0003963448242082824, + "loss": 0.6424, + "step": 8230 + }, + { + "epoch": 0.1834045584045584, + "grad_norm": 0.4791712164878845, + "learning_rate": 0.0003963359462140099, + "loss": 0.5954, + "step": 8240 + }, + { + "epoch": 0.18362713675213677, + "grad_norm": 0.4504138231277466, + "learning_rate": 0.0003963270575506815, + "loss": 0.5041, + "step": 8250 + }, + { + "epoch": 0.1838497150997151, + "grad_norm": 0.6858550310134888, + "learning_rate": 0.00039631815821878, + "loss": 0.6295, + "step": 8260 + }, + { + "epoch": 0.18407229344729345, + "grad_norm": 0.5094566941261292, + "learning_rate": 0.00039630924821878907, + "loss": 0.6105, + "step": 8270 + }, + { + "epoch": 0.1842948717948718, + "grad_norm": 0.827899158000946, + "learning_rate": 0.0003963003275511929, + "loss": 0.5796, + "step": 8280 + }, + { + "epoch": 0.18451745014245013, + "grad_norm": 0.6932994723320007, + "learning_rate": 0.00039629139621647625, + "loss": 0.6434, + "step": 8290 + }, + { + "epoch": 0.1847400284900285, + "grad_norm": 0.5510367155075073, + "learning_rate": 0.00039628245421512436, + "loss": 0.6752, + "step": 8300 + }, + { + "epoch": 0.18496260683760685, + "grad_norm": 0.6791307330131531, + "learning_rate": 0.0003962735015476233, + "loss": 0.7159, + "step": 8310 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 0.5468825101852417, + "learning_rate": 0.00039626453821445945, + "loss": 0.5856, + "step": 8320 + }, + { + "epoch": 0.18540776353276353, + "grad_norm": 0.7370385527610779, + "learning_rate": 0.00039625556421611993, + "loss": 0.645, + "step": 8330 + }, + { + "epoch": 0.1856303418803419, + "grad_norm": 0.5625198483467102, + "learning_rate": 0.00039624657955309237, + "loss": 0.5382, + "step": 8340 + }, + { + "epoch": 0.18585292022792022, + "grad_norm": 0.7717461585998535, + "learning_rate": 0.00039623758422586514, + "loss": 0.5829, + "step": 8350 + }, + { + "epoch": 0.18607549857549857, + "grad_norm": 0.6957966685295105, + "learning_rate": 0.0003962285782349268, + "loss": 0.6079, + "step": 8360 + }, + { + "epoch": 0.18629807692307693, + "grad_norm": 0.5371886491775513, + "learning_rate": 0.0003962195615807669, + "loss": 0.6639, + "step": 8370 + }, + { + "epoch": 0.18652065527065528, + "grad_norm": 0.575091540813446, + "learning_rate": 0.0003962105342638754, + "loss": 0.5967, + "step": 8380 + }, + { + "epoch": 0.1867432336182336, + "grad_norm": 0.5292919278144836, + "learning_rate": 0.00039620149628474284, + "loss": 0.6593, + "step": 8390 + }, + { + "epoch": 0.18696581196581197, + "grad_norm": 0.5691158771514893, + "learning_rate": 0.0003961924476438604, + "loss": 0.572, + "step": 8400 + }, + { + "epoch": 0.18718839031339032, + "grad_norm": 0.5576357245445251, + "learning_rate": 0.0003961833883417197, + "loss": 0.5306, + "step": 8410 + }, + { + "epoch": 0.18741096866096865, + "grad_norm": 0.7963212132453918, + "learning_rate": 0.00039617431837881306, + "loss": 0.6785, + "step": 8420 + }, + { + "epoch": 0.187633547008547, + "grad_norm": 0.648432195186615, + "learning_rate": 0.00039616523775563346, + "loss": 0.5836, + "step": 8430 + }, + { + "epoch": 0.18785612535612536, + "grad_norm": 0.7052241563796997, + "learning_rate": 0.0003961561464726742, + "loss": 0.6708, + "step": 8440 + }, + { + "epoch": 0.1880787037037037, + "grad_norm": 0.6538907885551453, + "learning_rate": 0.0003961470445304293, + "loss": 0.5733, + "step": 8450 + }, + { + "epoch": 0.18830128205128205, + "grad_norm": 0.5793173909187317, + "learning_rate": 0.0003961379319293935, + "loss": 0.6527, + "step": 8460 + }, + { + "epoch": 0.1885238603988604, + "grad_norm": 0.9065718650817871, + "learning_rate": 0.0003961288086700619, + "loss": 0.6676, + "step": 8470 + }, + { + "epoch": 0.18874643874643873, + "grad_norm": 0.6593164205551147, + "learning_rate": 0.00039611967475293024, + "loss": 0.7057, + "step": 8480 + }, + { + "epoch": 0.1889690170940171, + "grad_norm": 0.6331801414489746, + "learning_rate": 0.0003961105301784949, + "loss": 0.668, + "step": 8490 + }, + { + "epoch": 0.18919159544159544, + "grad_norm": 0.8595057725906372, + "learning_rate": 0.0003961013749472529, + "loss": 0.8011, + "step": 8500 + }, + { + "epoch": 0.1894141737891738, + "grad_norm": 0.8133804798126221, + "learning_rate": 0.00039609220905970153, + "loss": 0.6667, + "step": 8510 + }, + { + "epoch": 0.18963675213675213, + "grad_norm": 0.7005951404571533, + "learning_rate": 0.00039608303251633905, + "loss": 0.5827, + "step": 8520 + }, + { + "epoch": 0.18985933048433049, + "grad_norm": 0.6705909371376038, + "learning_rate": 0.00039607384531766405, + "loss": 0.6357, + "step": 8530 + }, + { + "epoch": 0.19008190883190884, + "grad_norm": 0.3930394649505615, + "learning_rate": 0.00039606464746417576, + "loss": 0.6741, + "step": 8540 + }, + { + "epoch": 0.19030448717948717, + "grad_norm": 0.5143939852714539, + "learning_rate": 0.00039605543895637405, + "loss": 0.6344, + "step": 8550 + }, + { + "epoch": 0.19052706552706553, + "grad_norm": 1.4165090322494507, + "learning_rate": 0.0003960462197947593, + "loss": 0.6876, + "step": 8560 + }, + { + "epoch": 0.19074964387464388, + "grad_norm": 0.5349679589271545, + "learning_rate": 0.00039603698997983243, + "loss": 0.6644, + "step": 8570 + }, + { + "epoch": 0.1909722222222222, + "grad_norm": 0.8081477284431458, + "learning_rate": 0.0003960277495120951, + "loss": 0.6176, + "step": 8580 + }, + { + "epoch": 0.19119480056980057, + "grad_norm": 0.6244815587997437, + "learning_rate": 0.00039601849839204935, + "loss": 0.7712, + "step": 8590 + }, + { + "epoch": 0.19141737891737892, + "grad_norm": 1.218777060508728, + "learning_rate": 0.00039600923662019795, + "loss": 0.692, + "step": 8600 + }, + { + "epoch": 0.19163995726495728, + "grad_norm": 0.6282643675804138, + "learning_rate": 0.0003959999641970441, + "loss": 0.5798, + "step": 8610 + }, + { + "epoch": 0.1918625356125356, + "grad_norm": 0.9417448043823242, + "learning_rate": 0.00039599068112309183, + "loss": 0.6914, + "step": 8620 + }, + { + "epoch": 0.19208511396011396, + "grad_norm": 0.7199434041976929, + "learning_rate": 0.0003959813873988455, + "loss": 0.5549, + "step": 8630 + }, + { + "epoch": 0.19230769230769232, + "grad_norm": 0.46725979447364807, + "learning_rate": 0.0003959720830248101, + "loss": 0.808, + "step": 8640 + }, + { + "epoch": 0.19253027065527065, + "grad_norm": 0.6076691746711731, + "learning_rate": 0.0003959627680014913, + "loss": 0.5883, + "step": 8650 + }, + { + "epoch": 0.192752849002849, + "grad_norm": 0.4056580662727356, + "learning_rate": 0.0003959534423293953, + "loss": 0.5899, + "step": 8660 + }, + { + "epoch": 0.19297542735042736, + "grad_norm": 0.7676776647567749, + "learning_rate": 0.0003959441060090288, + "loss": 0.562, + "step": 8670 + }, + { + "epoch": 0.1931980056980057, + "grad_norm": 0.43628600239753723, + "learning_rate": 0.0003959347590408991, + "loss": 0.6806, + "step": 8680 + }, + { + "epoch": 0.19342058404558404, + "grad_norm": 0.8665661215782166, + "learning_rate": 0.0003959254014255143, + "loss": 0.679, + "step": 8690 + }, + { + "epoch": 0.1936431623931624, + "grad_norm": 0.8895514011383057, + "learning_rate": 0.0003959160331633827, + "loss": 0.7006, + "step": 8700 + }, + { + "epoch": 0.19386574074074073, + "grad_norm": 0.6683569550514221, + "learning_rate": 0.0003959066542550135, + "loss": 0.6949, + "step": 8710 + }, + { + "epoch": 0.19408831908831908, + "grad_norm": 0.8206831812858582, + "learning_rate": 0.0003958972647009164, + "loss": 0.642, + "step": 8720 + }, + { + "epoch": 0.19431089743589744, + "grad_norm": 0.5844119191169739, + "learning_rate": 0.0003958878645016015, + "loss": 0.574, + "step": 8730 + }, + { + "epoch": 0.1945334757834758, + "grad_norm": 0.609693169593811, + "learning_rate": 0.0003958784536575797, + "loss": 0.6547, + "step": 8740 + }, + { + "epoch": 0.19475605413105412, + "grad_norm": 0.4209142029285431, + "learning_rate": 0.00039586903216936236, + "loss": 0.6194, + "step": 8750 + }, + { + "epoch": 0.19497863247863248, + "grad_norm": 0.7345901727676392, + "learning_rate": 0.0003958596000374615, + "loss": 0.676, + "step": 8760 + }, + { + "epoch": 0.19520121082621084, + "grad_norm": 0.6274478435516357, + "learning_rate": 0.00039585015726238963, + "loss": 0.7814, + "step": 8770 + }, + { + "epoch": 0.19542378917378916, + "grad_norm": 0.6954050064086914, + "learning_rate": 0.0003958407038446598, + "loss": 0.7683, + "step": 8780 + }, + { + "epoch": 0.19564636752136752, + "grad_norm": 0.9183118939399719, + "learning_rate": 0.0003958312397847859, + "loss": 0.6729, + "step": 8790 + }, + { + "epoch": 0.19586894586894588, + "grad_norm": 0.7694580554962158, + "learning_rate": 0.0003958217650832821, + "loss": 0.6632, + "step": 8800 + }, + { + "epoch": 0.1960915242165242, + "grad_norm": 0.6874392032623291, + "learning_rate": 0.0003958122797406633, + "loss": 0.6299, + "step": 8810 + }, + { + "epoch": 0.19631410256410256, + "grad_norm": 0.5900786519050598, + "learning_rate": 0.00039580278375744485, + "loss": 0.6253, + "step": 8820 + }, + { + "epoch": 0.19653668091168092, + "grad_norm": 0.42310723662376404, + "learning_rate": 0.00039579327713414286, + "loss": 0.456, + "step": 8830 + }, + { + "epoch": 0.19675925925925927, + "grad_norm": 0.7175334692001343, + "learning_rate": 0.0003957837598712739, + "loss": 0.4972, + "step": 8840 + }, + { + "epoch": 0.1969818376068376, + "grad_norm": 0.6608636379241943, + "learning_rate": 0.0003957742319693552, + "loss": 0.6856, + "step": 8850 + }, + { + "epoch": 0.19720441595441596, + "grad_norm": 0.7150349020957947, + "learning_rate": 0.0003957646934289044, + "loss": 0.7014, + "step": 8860 + }, + { + "epoch": 0.1974269943019943, + "grad_norm": 0.8787618279457092, + "learning_rate": 0.00039575514425043996, + "loss": 0.7043, + "step": 8870 + }, + { + "epoch": 0.19764957264957264, + "grad_norm": 0.8096165657043457, + "learning_rate": 0.0003957455844344807, + "loss": 0.7042, + "step": 8880 + }, + { + "epoch": 0.197872150997151, + "grad_norm": 0.5648137927055359, + "learning_rate": 0.00039573601398154617, + "loss": 0.6565, + "step": 8890 + }, + { + "epoch": 0.19809472934472935, + "grad_norm": 0.7525642514228821, + "learning_rate": 0.00039572643289215636, + "loss": 0.6618, + "step": 8900 + }, + { + "epoch": 0.19831730769230768, + "grad_norm": 0.7236807346343994, + "learning_rate": 0.00039571684116683194, + "loss": 0.6229, + "step": 8910 + }, + { + "epoch": 0.19853988603988604, + "grad_norm": 0.690436601638794, + "learning_rate": 0.0003957072388060942, + "loss": 0.5987, + "step": 8920 + }, + { + "epoch": 0.1987624643874644, + "grad_norm": 0.38092443346977234, + "learning_rate": 0.0003956976258104649, + "loss": 0.5136, + "step": 8930 + }, + { + "epoch": 0.19898504273504272, + "grad_norm": 0.6596299409866333, + "learning_rate": 0.0003956880021804664, + "loss": 0.7893, + "step": 8940 + }, + { + "epoch": 0.19920762108262108, + "grad_norm": 0.5776147842407227, + "learning_rate": 0.0003956783679166216, + "loss": 0.5902, + "step": 8950 + }, + { + "epoch": 0.19943019943019943, + "grad_norm": 0.7063926458358765, + "learning_rate": 0.00039566872301945416, + "loss": 0.6769, + "step": 8960 + }, + { + "epoch": 0.1996527777777778, + "grad_norm": 0.7202835083007812, + "learning_rate": 0.0003956590674894881, + "loss": 0.5673, + "step": 8970 + }, + { + "epoch": 0.19987535612535612, + "grad_norm": 0.721526563167572, + "learning_rate": 0.00039564940132724816, + "loss": 0.5911, + "step": 8980 + }, + { + "epoch": 0.20009793447293447, + "grad_norm": 0.6836729049682617, + "learning_rate": 0.00039563972453325954, + "loss": 0.5803, + "step": 8990 + }, + { + "epoch": 0.20032051282051283, + "grad_norm": 0.6123881936073303, + "learning_rate": 0.0003956300371080482, + "loss": 0.6177, + "step": 9000 + }, + { + "epoch": 0.20054309116809116, + "grad_norm": 0.48289263248443604, + "learning_rate": 0.0003956203390521405, + "loss": 0.6179, + "step": 9010 + }, + { + "epoch": 0.20076566951566951, + "grad_norm": 0.4708038568496704, + "learning_rate": 0.0003956106303660634, + "loss": 0.4972, + "step": 9020 + }, + { + "epoch": 0.20098824786324787, + "grad_norm": 0.4728592336177826, + "learning_rate": 0.00039560091105034445, + "loss": 0.652, + "step": 9030 + }, + { + "epoch": 0.2012108262108262, + "grad_norm": 1.943331241607666, + "learning_rate": 0.000395591181105512, + "loss": 0.7249, + "step": 9040 + }, + { + "epoch": 0.20143340455840456, + "grad_norm": 0.9894505739212036, + "learning_rate": 0.0003955814405320945, + "loss": 0.7012, + "step": 9050 + }, + { + "epoch": 0.2016559829059829, + "grad_norm": 0.6454615592956543, + "learning_rate": 0.0003955716893306215, + "loss": 0.6747, + "step": 9060 + }, + { + "epoch": 0.20187856125356127, + "grad_norm": 0.5542591214179993, + "learning_rate": 0.00039556192750162276, + "loss": 0.6098, + "step": 9070 + }, + { + "epoch": 0.2021011396011396, + "grad_norm": 0.5976565480232239, + "learning_rate": 0.0003955521550456288, + "loss": 0.7787, + "step": 9080 + }, + { + "epoch": 0.20232371794871795, + "grad_norm": 1.246936559677124, + "learning_rate": 0.0003955423719631707, + "loss": 0.6445, + "step": 9090 + }, + { + "epoch": 0.2025462962962963, + "grad_norm": 0.6775862574577332, + "learning_rate": 0.0003955325782547799, + "loss": 0.5769, + "step": 9100 + }, + { + "epoch": 0.20276887464387464, + "grad_norm": 0.4151378273963928, + "learning_rate": 0.0003955227739209889, + "loss": 0.6429, + "step": 9110 + }, + { + "epoch": 0.202991452991453, + "grad_norm": 0.6238030791282654, + "learning_rate": 0.00039551295896233016, + "loss": 0.6775, + "step": 9120 + }, + { + "epoch": 0.20321403133903135, + "grad_norm": 0.6367015838623047, + "learning_rate": 0.00039550313337933726, + "loss": 0.5909, + "step": 9130 + }, + { + "epoch": 0.20343660968660968, + "grad_norm": 0.605278491973877, + "learning_rate": 0.000395493297172544, + "loss": 0.6829, + "step": 9140 + }, + { + "epoch": 0.20365918803418803, + "grad_norm": 0.5138265490531921, + "learning_rate": 0.00039548345034248495, + "loss": 0.56, + "step": 9150 + }, + { + "epoch": 0.2038817663817664, + "grad_norm": 0.703345775604248, + "learning_rate": 0.0003954735928896952, + "loss": 0.6245, + "step": 9160 + }, + { + "epoch": 0.20410434472934472, + "grad_norm": 0.5316170454025269, + "learning_rate": 0.0003954637248147104, + "loss": 0.7017, + "step": 9170 + }, + { + "epoch": 0.20432692307692307, + "grad_norm": 0.8845970034599304, + "learning_rate": 0.00039545384611806676, + "loss": 0.6894, + "step": 9180 + }, + { + "epoch": 0.20454950142450143, + "grad_norm": 0.34213122725486755, + "learning_rate": 0.0003954439568003011, + "loss": 0.659, + "step": 9190 + }, + { + "epoch": 0.20477207977207978, + "grad_norm": 0.6910355091094971, + "learning_rate": 0.0003954340568619508, + "loss": 0.7405, + "step": 9200 + }, + { + "epoch": 0.2049946581196581, + "grad_norm": 0.8337056040763855, + "learning_rate": 0.0003954241463035539, + "loss": 0.6928, + "step": 9210 + }, + { + "epoch": 0.20521723646723647, + "grad_norm": 0.6273629069328308, + "learning_rate": 0.0003954142251256489, + "loss": 0.52, + "step": 9220 + }, + { + "epoch": 0.20543981481481483, + "grad_norm": 0.4294908046722412, + "learning_rate": 0.0003954042933287749, + "loss": 0.6373, + "step": 9230 + }, + { + "epoch": 0.20566239316239315, + "grad_norm": 0.6154212355613708, + "learning_rate": 0.00039539435091347176, + "loss": 0.827, + "step": 9240 + }, + { + "epoch": 0.2058849715099715, + "grad_norm": 1.015303134918213, + "learning_rate": 0.0003953843978802795, + "loss": 0.6347, + "step": 9250 + }, + { + "epoch": 0.20610754985754987, + "grad_norm": 0.9026708006858826, + "learning_rate": 0.0003953744342297391, + "loss": 0.6995, + "step": 9260 + }, + { + "epoch": 0.2063301282051282, + "grad_norm": 0.3805409073829651, + "learning_rate": 0.0003953644599623921, + "loss": 0.5911, + "step": 9270 + }, + { + "epoch": 0.20655270655270655, + "grad_norm": 0.8477494120597839, + "learning_rate": 0.00039535447507878035, + "loss": 0.5985, + "step": 9280 + }, + { + "epoch": 0.2067752849002849, + "grad_norm": 0.5137654542922974, + "learning_rate": 0.0003953444795794465, + "loss": 0.7368, + "step": 9290 + }, + { + "epoch": 0.20699786324786323, + "grad_norm": 0.8575098514556885, + "learning_rate": 0.0003953344734649338, + "loss": 0.591, + "step": 9300 + }, + { + "epoch": 0.2072204415954416, + "grad_norm": 0.9351524114608765, + "learning_rate": 0.00039532445673578587, + "loss": 0.4845, + "step": 9310 + }, + { + "epoch": 0.20744301994301995, + "grad_norm": 0.4439866542816162, + "learning_rate": 0.000395314429392547, + "loss": 0.6406, + "step": 9320 + }, + { + "epoch": 0.2076655982905983, + "grad_norm": 0.8442595601081848, + "learning_rate": 0.0003953043914357622, + "loss": 0.7016, + "step": 9330 + }, + { + "epoch": 0.20788817663817663, + "grad_norm": 0.6058248281478882, + "learning_rate": 0.0003952943428659768, + "loss": 0.5101, + "step": 9340 + }, + { + "epoch": 0.208110754985755, + "grad_norm": 0.6372517347335815, + "learning_rate": 0.00039528428368373696, + "loss": 0.5624, + "step": 9350 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.6820414662361145, + "learning_rate": 0.0003952742138895894, + "loss": 0.5839, + "step": 9360 + }, + { + "epoch": 0.20855591168091167, + "grad_norm": 0.7529656291007996, + "learning_rate": 0.0003952641334840811, + "loss": 0.651, + "step": 9370 + }, + { + "epoch": 0.20877849002849003, + "grad_norm": 0.943543553352356, + "learning_rate": 0.0003952540424677599, + "loss": 0.5813, + "step": 9380 + }, + { + "epoch": 0.20900106837606838, + "grad_norm": 0.9541203379631042, + "learning_rate": 0.00039524394084117427, + "loss": 0.6527, + "step": 9390 + }, + { + "epoch": 0.2092236467236467, + "grad_norm": 0.7158280611038208, + "learning_rate": 0.000395233828604873, + "loss": 0.712, + "step": 9400 + }, + { + "epoch": 0.20944622507122507, + "grad_norm": 0.5238418579101562, + "learning_rate": 0.0003952237057594057, + "loss": 0.6042, + "step": 9410 + }, + { + "epoch": 0.20966880341880342, + "grad_norm": 0.855786919593811, + "learning_rate": 0.0003952135723053224, + "loss": 0.6428, + "step": 9420 + }, + { + "epoch": 0.20989138176638178, + "grad_norm": 0.6584997773170471, + "learning_rate": 0.0003952034282431738, + "loss": 0.6112, + "step": 9430 + }, + { + "epoch": 0.2101139601139601, + "grad_norm": 1.005631446838379, + "learning_rate": 0.000395193273573511, + "loss": 0.7763, + "step": 9440 + }, + { + "epoch": 0.21033653846153846, + "grad_norm": 0.8165929317474365, + "learning_rate": 0.00039518310829688596, + "loss": 0.6888, + "step": 9450 + }, + { + "epoch": 0.21055911680911682, + "grad_norm": 0.44331878423690796, + "learning_rate": 0.0003951729324138511, + "loss": 0.6588, + "step": 9460 + }, + { + "epoch": 0.21078169515669515, + "grad_norm": 0.7836384177207947, + "learning_rate": 0.0003951627459249593, + "loss": 0.5745, + "step": 9470 + }, + { + "epoch": 0.2110042735042735, + "grad_norm": 0.7953740954399109, + "learning_rate": 0.0003951525488307641, + "loss": 0.5136, + "step": 9480 + }, + { + "epoch": 0.21122685185185186, + "grad_norm": 0.5712646842002869, + "learning_rate": 0.0003951423411318197, + "loss": 0.7217, + "step": 9490 + }, + { + "epoch": 0.2114494301994302, + "grad_norm": 0.667172908782959, + "learning_rate": 0.00039513212282868063, + "loss": 0.6767, + "step": 9500 + }, + { + "epoch": 0.21167200854700854, + "grad_norm": 0.747340738773346, + "learning_rate": 0.0003951218939219023, + "loss": 0.7564, + "step": 9510 + }, + { + "epoch": 0.2118945868945869, + "grad_norm": 1.0341793298721313, + "learning_rate": 0.0003951116544120405, + "loss": 0.7363, + "step": 9520 + }, + { + "epoch": 0.21211716524216523, + "grad_norm": 0.5947072505950928, + "learning_rate": 0.0003951014042996517, + "loss": 0.7156, + "step": 9530 + }, + { + "epoch": 0.21233974358974358, + "grad_norm": 0.535180926322937, + "learning_rate": 0.0003950911435852929, + "loss": 0.594, + "step": 9540 + }, + { + "epoch": 0.21256232193732194, + "grad_norm": 0.9859617352485657, + "learning_rate": 0.0003950808722695216, + "loss": 0.638, + "step": 9550 + }, + { + "epoch": 0.2127849002849003, + "grad_norm": 0.5788066983222961, + "learning_rate": 0.00039507059035289604, + "loss": 0.6786, + "step": 9560 + }, + { + "epoch": 0.21300747863247863, + "grad_norm": 0.5944311618804932, + "learning_rate": 0.0003950602978359749, + "loss": 0.5964, + "step": 9570 + }, + { + "epoch": 0.21323005698005698, + "grad_norm": 0.5792214274406433, + "learning_rate": 0.00039504999471931746, + "loss": 0.553, + "step": 9580 + }, + { + "epoch": 0.21345263532763534, + "grad_norm": 0.6323608756065369, + "learning_rate": 0.0003950396810034836, + "loss": 0.5525, + "step": 9590 + }, + { + "epoch": 0.21367521367521367, + "grad_norm": 0.7955515384674072, + "learning_rate": 0.00039502935668903386, + "loss": 0.6462, + "step": 9600 + }, + { + "epoch": 0.21389779202279202, + "grad_norm": 0.7051757574081421, + "learning_rate": 0.00039501902177652924, + "loss": 0.5955, + "step": 9610 + }, + { + "epoch": 0.21412037037037038, + "grad_norm": 0.458967387676239, + "learning_rate": 0.0003950086762665313, + "loss": 0.6787, + "step": 9620 + }, + { + "epoch": 0.2143429487179487, + "grad_norm": 0.7646269798278809, + "learning_rate": 0.00039499832015960225, + "loss": 0.5951, + "step": 9630 + }, + { + "epoch": 0.21456552706552706, + "grad_norm": 0.7068632245063782, + "learning_rate": 0.00039498795345630487, + "loss": 0.6497, + "step": 9640 + }, + { + "epoch": 0.21478810541310542, + "grad_norm": 0.7031964659690857, + "learning_rate": 0.00039497757615720243, + "loss": 0.795, + "step": 9650 + }, + { + "epoch": 0.21501068376068377, + "grad_norm": 0.7918108701705933, + "learning_rate": 0.00039496718826285894, + "loss": 0.6667, + "step": 9660 + }, + { + "epoch": 0.2152332621082621, + "grad_norm": 0.4682900011539459, + "learning_rate": 0.0003949567897738388, + "loss": 0.6158, + "step": 9670 + }, + { + "epoch": 0.21545584045584046, + "grad_norm": 0.47049540281295776, + "learning_rate": 0.0003949463806907071, + "loss": 0.6403, + "step": 9680 + }, + { + "epoch": 0.21567841880341881, + "grad_norm": 0.6707554459571838, + "learning_rate": 0.00039493596101402954, + "loss": 0.6894, + "step": 9690 + }, + { + "epoch": 0.21590099715099714, + "grad_norm": 0.6850556135177612, + "learning_rate": 0.00039492553074437224, + "loss": 0.5975, + "step": 9700 + }, + { + "epoch": 0.2161235754985755, + "grad_norm": 0.5969346165657043, + "learning_rate": 0.000394915089882302, + "loss": 0.8084, + "step": 9710 + }, + { + "epoch": 0.21634615384615385, + "grad_norm": 0.8701211214065552, + "learning_rate": 0.0003949046384283862, + "loss": 0.8172, + "step": 9720 + }, + { + "epoch": 0.21656873219373218, + "grad_norm": 0.7247389554977417, + "learning_rate": 0.0003948941763831928, + "loss": 0.7495, + "step": 9730 + }, + { + "epoch": 0.21679131054131054, + "grad_norm": 0.4859636723995209, + "learning_rate": 0.0003948837037472903, + "loss": 0.6212, + "step": 9740 + }, + { + "epoch": 0.2170138888888889, + "grad_norm": 0.4931568205356598, + "learning_rate": 0.00039487322052124787, + "loss": 0.6867, + "step": 9750 + }, + { + "epoch": 0.21723646723646722, + "grad_norm": 0.8555331826210022, + "learning_rate": 0.00039486272670563507, + "loss": 0.6811, + "step": 9760 + }, + { + "epoch": 0.21745904558404558, + "grad_norm": 0.7685542702674866, + "learning_rate": 0.00039485222230102216, + "loss": 0.5231, + "step": 9770 + }, + { + "epoch": 0.21768162393162394, + "grad_norm": 0.7093376517295837, + "learning_rate": 0.00039484170730798, + "loss": 0.7635, + "step": 9780 + }, + { + "epoch": 0.2179042022792023, + "grad_norm": 0.5934985876083374, + "learning_rate": 0.0003948311817270799, + "loss": 0.6767, + "step": 9790 + }, + { + "epoch": 0.21812678062678062, + "grad_norm": 0.638926088809967, + "learning_rate": 0.000394820645558894, + "loss": 0.7192, + "step": 9800 + }, + { + "epoch": 0.21834935897435898, + "grad_norm": 0.5633494257926941, + "learning_rate": 0.0003948100988039946, + "loss": 0.6495, + "step": 9810 + }, + { + "epoch": 0.21857193732193733, + "grad_norm": 1.0099177360534668, + "learning_rate": 0.0003947995414629551, + "loss": 0.5245, + "step": 9820 + }, + { + "epoch": 0.21879451566951566, + "grad_norm": 0.7500694394111633, + "learning_rate": 0.00039478897353634895, + "loss": 0.8152, + "step": 9830 + }, + { + "epoch": 0.21901709401709402, + "grad_norm": 0.6685538291931152, + "learning_rate": 0.0003947783950247505, + "loss": 0.682, + "step": 9840 + }, + { + "epoch": 0.21923967236467237, + "grad_norm": 0.5551173090934753, + "learning_rate": 0.00039476780592873463, + "loss": 0.468, + "step": 9850 + }, + { + "epoch": 0.2194622507122507, + "grad_norm": 0.5178750157356262, + "learning_rate": 0.0003947572062488768, + "loss": 0.621, + "step": 9860 + }, + { + "epoch": 0.21968482905982906, + "grad_norm": 0.6670868396759033, + "learning_rate": 0.0003947465959857529, + "loss": 0.7889, + "step": 9870 + }, + { + "epoch": 0.2199074074074074, + "grad_norm": 0.4501799941062927, + "learning_rate": 0.0003947359751399395, + "loss": 0.6168, + "step": 9880 + }, + { + "epoch": 0.22012998575498577, + "grad_norm": 0.6546468138694763, + "learning_rate": 0.00039472534371201383, + "loss": 0.5885, + "step": 9890 + }, + { + "epoch": 0.2203525641025641, + "grad_norm": 0.6405220031738281, + "learning_rate": 0.0003947147017025536, + "loss": 0.5922, + "step": 9900 + }, + { + "epoch": 0.22057514245014245, + "grad_norm": 0.4797993302345276, + "learning_rate": 0.00039470404911213705, + "loss": 0.7047, + "step": 9910 + }, + { + "epoch": 0.2207977207977208, + "grad_norm": 0.721960186958313, + "learning_rate": 0.0003946933859413431, + "loss": 0.6127, + "step": 9920 + }, + { + "epoch": 0.22102029914529914, + "grad_norm": 0.4744751453399658, + "learning_rate": 0.0003946827121907512, + "loss": 0.5057, + "step": 9930 + }, + { + "epoch": 0.2212428774928775, + "grad_norm": 0.6154679656028748, + "learning_rate": 0.0003946720278609413, + "loss": 0.6269, + "step": 9940 + }, + { + "epoch": 0.22146545584045585, + "grad_norm": 0.7666019797325134, + "learning_rate": 0.00039466133295249406, + "loss": 0.6895, + "step": 9950 + }, + { + "epoch": 0.22168803418803418, + "grad_norm": 0.6650170683860779, + "learning_rate": 0.0003946506274659906, + "loss": 0.7485, + "step": 9960 + }, + { + "epoch": 0.22191061253561253, + "grad_norm": 0.8512768745422363, + "learning_rate": 0.00039463991140201274, + "loss": 0.6962, + "step": 9970 + }, + { + "epoch": 0.2221331908831909, + "grad_norm": 0.7195132970809937, + "learning_rate": 0.00039462918476114277, + "loss": 0.6445, + "step": 9980 + }, + { + "epoch": 0.22235576923076922, + "grad_norm": 0.6763859391212463, + "learning_rate": 0.0003946184475439635, + "loss": 0.7474, + "step": 9990 + }, + { + "epoch": 0.22257834757834757, + "grad_norm": 0.6375609636306763, + "learning_rate": 0.00039460769975105853, + "loss": 0.575, + "step": 10000 + }, + { + "epoch": 0.22280092592592593, + "grad_norm": 0.4558027684688568, + "learning_rate": 0.0003945969413830118, + "loss": 0.6552, + "step": 10010 + }, + { + "epoch": 0.22302350427350429, + "grad_norm": 0.6802572011947632, + "learning_rate": 0.000394586172440408, + "loss": 0.6885, + "step": 10020 + }, + { + "epoch": 0.22324608262108261, + "grad_norm": 0.77765291929245, + "learning_rate": 0.00039457539292383223, + "loss": 0.7531, + "step": 10030 + }, + { + "epoch": 0.22346866096866097, + "grad_norm": 0.8640170693397522, + "learning_rate": 0.0003945646028338704, + "loss": 0.7519, + "step": 10040 + }, + { + "epoch": 0.22369123931623933, + "grad_norm": 0.464138925075531, + "learning_rate": 0.00039455380217110874, + "loss": 0.7144, + "step": 10050 + }, + { + "epoch": 0.22391381766381765, + "grad_norm": 0.5078659057617188, + "learning_rate": 0.0003945429909361342, + "loss": 0.602, + "step": 10060 + }, + { + "epoch": 0.224136396011396, + "grad_norm": 0.6784718632698059, + "learning_rate": 0.0003945321691295343, + "loss": 0.5502, + "step": 10070 + }, + { + "epoch": 0.22435897435897437, + "grad_norm": 0.6645569801330566, + "learning_rate": 0.000394521336751897, + "loss": 0.5954, + "step": 10080 + }, + { + "epoch": 0.2245815527065527, + "grad_norm": 0.6368417143821716, + "learning_rate": 0.0003945104938038111, + "loss": 0.6147, + "step": 10090 + }, + { + "epoch": 0.22480413105413105, + "grad_norm": 1.4571202993392944, + "learning_rate": 0.0003944996402858657, + "loss": 0.6064, + "step": 10100 + }, + { + "epoch": 0.2250267094017094, + "grad_norm": 0.7869674563407898, + "learning_rate": 0.0003944887761986506, + "loss": 0.5489, + "step": 10110 + }, + { + "epoch": 0.22524928774928774, + "grad_norm": 0.7581375241279602, + "learning_rate": 0.0003944779015427562, + "loss": 0.6868, + "step": 10120 + }, + { + "epoch": 0.2254718660968661, + "grad_norm": 0.7421298623085022, + "learning_rate": 0.0003944670163187735, + "loss": 0.5624, + "step": 10130 + }, + { + "epoch": 0.22569444444444445, + "grad_norm": 0.6129317283630371, + "learning_rate": 0.0003944561205272939, + "loss": 0.7149, + "step": 10140 + }, + { + "epoch": 0.2259170227920228, + "grad_norm": 0.7636057138442993, + "learning_rate": 0.00039444521416890944, + "loss": 0.584, + "step": 10150 + }, + { + "epoch": 0.22613960113960113, + "grad_norm": 0.7134559154510498, + "learning_rate": 0.0003944342972442129, + "loss": 0.7097, + "step": 10160 + }, + { + "epoch": 0.2263621794871795, + "grad_norm": 0.6366649866104126, + "learning_rate": 0.0003944233697537975, + "loss": 0.5996, + "step": 10170 + }, + { + "epoch": 0.22658475783475784, + "grad_norm": 0.6678193807601929, + "learning_rate": 0.000394412431698257, + "loss": 0.6114, + "step": 10180 + }, + { + "epoch": 0.22680733618233617, + "grad_norm": 0.6763032674789429, + "learning_rate": 0.0003944014830781858, + "loss": 0.5864, + "step": 10190 + }, + { + "epoch": 0.22702991452991453, + "grad_norm": 0.6134129166603088, + "learning_rate": 0.0003943905238941789, + "loss": 0.6545, + "step": 10200 + }, + { + "epoch": 0.22725249287749288, + "grad_norm": 0.8235966563224792, + "learning_rate": 0.0003943795541468318, + "loss": 0.5798, + "step": 10210 + }, + { + "epoch": 0.2274750712250712, + "grad_norm": 0.6266351342201233, + "learning_rate": 0.0003943685738367406, + "loss": 0.5682, + "step": 10220 + }, + { + "epoch": 0.22769764957264957, + "grad_norm": 0.5528531074523926, + "learning_rate": 0.00039435758296450194, + "loss": 0.4977, + "step": 10230 + }, + { + "epoch": 0.22792022792022792, + "grad_norm": 0.7468633055686951, + "learning_rate": 0.00039434658153071313, + "loss": 0.557, + "step": 10240 + }, + { + "epoch": 0.22814280626780628, + "grad_norm": 0.4979506731033325, + "learning_rate": 0.00039433556953597204, + "loss": 0.582, + "step": 10250 + }, + { + "epoch": 0.2283653846153846, + "grad_norm": 0.7964197397232056, + "learning_rate": 0.000394324546980877, + "loss": 0.7436, + "step": 10260 + }, + { + "epoch": 0.22858796296296297, + "grad_norm": 0.8756190538406372, + "learning_rate": 0.0003943135138660269, + "loss": 0.7267, + "step": 10270 + }, + { + "epoch": 0.22881054131054132, + "grad_norm": 0.6306865215301514, + "learning_rate": 0.00039430247019202146, + "loss": 0.5717, + "step": 10280 + }, + { + "epoch": 0.22903311965811965, + "grad_norm": 0.8970215320587158, + "learning_rate": 0.00039429141595946073, + "loss": 0.6467, + "step": 10290 + }, + { + "epoch": 0.229255698005698, + "grad_norm": 0.9413365721702576, + "learning_rate": 0.0003942803511689453, + "loss": 0.7759, + "step": 10300 + }, + { + "epoch": 0.22947827635327636, + "grad_norm": 0.684522807598114, + "learning_rate": 0.00039426927582107663, + "loss": 0.8251, + "step": 10310 + }, + { + "epoch": 0.2297008547008547, + "grad_norm": 0.7772836685180664, + "learning_rate": 0.0003942581899164565, + "loss": 0.5982, + "step": 10320 + }, + { + "epoch": 0.22992343304843305, + "grad_norm": 0.8296146988868713, + "learning_rate": 0.0003942470934556873, + "loss": 0.6668, + "step": 10330 + }, + { + "epoch": 0.2301460113960114, + "grad_norm": 0.6865838170051575, + "learning_rate": 0.00039423598643937197, + "loss": 0.5596, + "step": 10340 + }, + { + "epoch": 0.23036858974358973, + "grad_norm": 0.8723232746124268, + "learning_rate": 0.00039422486886811416, + "loss": 0.6831, + "step": 10350 + }, + { + "epoch": 0.23059116809116809, + "grad_norm": 0.7553172707557678, + "learning_rate": 0.00039421374074251797, + "loss": 0.6949, + "step": 10360 + }, + { + "epoch": 0.23081374643874644, + "grad_norm": 0.7564347982406616, + "learning_rate": 0.00039420260206318806, + "loss": 0.7041, + "step": 10370 + }, + { + "epoch": 0.2310363247863248, + "grad_norm": 0.6427896618843079, + "learning_rate": 0.00039419145283072984, + "loss": 0.5599, + "step": 10380 + }, + { + "epoch": 0.23125890313390313, + "grad_norm": 0.7506496906280518, + "learning_rate": 0.00039418029304574907, + "loss": 0.5654, + "step": 10390 + }, + { + "epoch": 0.23148148148148148, + "grad_norm": 0.7676074504852295, + "learning_rate": 0.0003941691227088523, + "loss": 0.7259, + "step": 10400 + }, + { + "epoch": 0.23170405982905984, + "grad_norm": 1.0894349813461304, + "learning_rate": 0.00039415794182064633, + "loss": 0.7539, + "step": 10410 + }, + { + "epoch": 0.23192663817663817, + "grad_norm": 0.4351309835910797, + "learning_rate": 0.0003941467503817389, + "loss": 0.6425, + "step": 10420 + }, + { + "epoch": 0.23214921652421652, + "grad_norm": 0.5593180656433105, + "learning_rate": 0.00039413554839273817, + "loss": 0.6471, + "step": 10430 + }, + { + "epoch": 0.23237179487179488, + "grad_norm": 0.41081494092941284, + "learning_rate": 0.00039412433585425276, + "loss": 0.6515, + "step": 10440 + }, + { + "epoch": 0.2325943732193732, + "grad_norm": 0.4768132269382477, + "learning_rate": 0.000394113112766892, + "loss": 0.582, + "step": 10450 + }, + { + "epoch": 0.23281695156695156, + "grad_norm": 0.6515503525733948, + "learning_rate": 0.0003941018791312658, + "loss": 0.6886, + "step": 10460 + }, + { + "epoch": 0.23303952991452992, + "grad_norm": 0.4547460377216339, + "learning_rate": 0.00039409063494798464, + "loss": 0.6271, + "step": 10470 + }, + { + "epoch": 0.23326210826210828, + "grad_norm": 0.7479694485664368, + "learning_rate": 0.0003940793802176594, + "loss": 0.5905, + "step": 10480 + }, + { + "epoch": 0.2334846866096866, + "grad_norm": 0.5531708598136902, + "learning_rate": 0.0003940681149409018, + "loss": 0.5962, + "step": 10490 + }, + { + "epoch": 0.23370726495726496, + "grad_norm": 0.8904018402099609, + "learning_rate": 0.000394056839118324, + "loss": 0.5928, + "step": 10500 + }, + { + "epoch": 0.23392984330484332, + "grad_norm": 0.8517378568649292, + "learning_rate": 0.0003940455527505387, + "loss": 0.6815, + "step": 10510 + }, + { + "epoch": 0.23415242165242164, + "grad_norm": 0.6255640983581543, + "learning_rate": 0.0003940342558381591, + "loss": 0.5768, + "step": 10520 + }, + { + "epoch": 0.234375, + "grad_norm": 0.8824275135993958, + "learning_rate": 0.00039402294838179937, + "loss": 0.553, + "step": 10530 + }, + { + "epoch": 0.23459757834757836, + "grad_norm": 0.8169212341308594, + "learning_rate": 0.00039401163038207363, + "loss": 0.5748, + "step": 10540 + }, + { + "epoch": 0.23482015669515668, + "grad_norm": 0.8070210218429565, + "learning_rate": 0.0003940003018395971, + "loss": 0.721, + "step": 10550 + }, + { + "epoch": 0.23504273504273504, + "grad_norm": 0.7101810574531555, + "learning_rate": 0.0003939889627549854, + "loss": 0.7535, + "step": 10560 + }, + { + "epoch": 0.2352653133903134, + "grad_norm": 0.6106790900230408, + "learning_rate": 0.00039397761312885465, + "loss": 0.7275, + "step": 10570 + }, + { + "epoch": 0.23548789173789172, + "grad_norm": 0.8647605180740356, + "learning_rate": 0.0003939662529618216, + "loss": 0.6286, + "step": 10580 + }, + { + "epoch": 0.23571047008547008, + "grad_norm": 0.481120228767395, + "learning_rate": 0.00039395488225450363, + "loss": 0.6296, + "step": 10590 + }, + { + "epoch": 0.23593304843304844, + "grad_norm": 0.8005710244178772, + "learning_rate": 0.00039394350100751846, + "loss": 0.734, + "step": 10600 + }, + { + "epoch": 0.2361556267806268, + "grad_norm": 0.5818419456481934, + "learning_rate": 0.00039393210922148477, + "loss": 0.7026, + "step": 10610 + }, + { + "epoch": 0.23637820512820512, + "grad_norm": 0.5003513693809509, + "learning_rate": 0.0003939207068970214, + "loss": 0.5966, + "step": 10620 + }, + { + "epoch": 0.23660078347578348, + "grad_norm": 0.661308765411377, + "learning_rate": 0.00039390929403474817, + "loss": 0.5786, + "step": 10630 + }, + { + "epoch": 0.23682336182336183, + "grad_norm": 0.9554111957550049, + "learning_rate": 0.00039389787063528515, + "loss": 0.5686, + "step": 10640 + }, + { + "epoch": 0.23704594017094016, + "grad_norm": 0.6321176290512085, + "learning_rate": 0.00039388643669925307, + "loss": 0.5561, + "step": 10650 + }, + { + "epoch": 0.23726851851851852, + "grad_norm": 0.4389246106147766, + "learning_rate": 0.00039387499222727333, + "loss": 0.654, + "step": 10660 + }, + { + "epoch": 0.23749109686609687, + "grad_norm": 0.5420764088630676, + "learning_rate": 0.0003938635372199678, + "loss": 0.4846, + "step": 10670 + }, + { + "epoch": 0.2377136752136752, + "grad_norm": 0.7103323340415955, + "learning_rate": 0.0003938520716779589, + "loss": 0.5769, + "step": 10680 + }, + { + "epoch": 0.23793625356125356, + "grad_norm": 0.7802491188049316, + "learning_rate": 0.00039384059560186975, + "loss": 0.6831, + "step": 10690 + }, + { + "epoch": 0.23815883190883191, + "grad_norm": 0.7568184733390808, + "learning_rate": 0.000393829108992324, + "loss": 0.7352, + "step": 10700 + }, + { + "epoch": 0.23838141025641027, + "grad_norm": 0.5993239879608154, + "learning_rate": 0.0003938176118499458, + "loss": 0.684, + "step": 10710 + }, + { + "epoch": 0.2386039886039886, + "grad_norm": 0.6817304491996765, + "learning_rate": 0.0003938061041753598, + "loss": 0.733, + "step": 10720 + }, + { + "epoch": 0.23882656695156695, + "grad_norm": 0.7760605216026306, + "learning_rate": 0.0003937945859691915, + "loss": 0.6041, + "step": 10730 + }, + { + "epoch": 0.2390491452991453, + "grad_norm": 0.9699644446372986, + "learning_rate": 0.0003937830572320668, + "loss": 0.6125, + "step": 10740 + }, + { + "epoch": 0.23927172364672364, + "grad_norm": 0.808280348777771, + "learning_rate": 0.00039377151796461213, + "loss": 0.6503, + "step": 10750 + }, + { + "epoch": 0.239494301994302, + "grad_norm": 0.5646578073501587, + "learning_rate": 0.00039375996816745445, + "loss": 0.5865, + "step": 10760 + }, + { + "epoch": 0.23971688034188035, + "grad_norm": 0.811384916305542, + "learning_rate": 0.0003937484078412215, + "loss": 0.6442, + "step": 10770 + }, + { + "epoch": 0.23993945868945868, + "grad_norm": 0.7848852872848511, + "learning_rate": 0.0003937368369865415, + "loss": 0.646, + "step": 10780 + }, + { + "epoch": 0.24002849002849003, + "eval_loss": 0.6369755864143372, + "eval_runtime": 337.4494, + "eval_samples_per_second": 7.008, + "eval_steps_per_second": 7.008, + "step": 10784 + }, + { + "epoch": 0.24016203703703703, + "grad_norm": 0.6187881827354431, + "learning_rate": 0.0003937252556040432, + "loss": 0.6877, + "step": 10790 + }, + { + "epoch": 0.2403846153846154, + "grad_norm": 0.6488537192344666, + "learning_rate": 0.0003937136636943559, + "loss": 0.7301, + "step": 10800 + }, + { + "epoch": 0.24060719373219372, + "grad_norm": 0.8597158789634705, + "learning_rate": 0.0003937020612581095, + "loss": 0.6421, + "step": 10810 + }, + { + "epoch": 0.24082977207977208, + "grad_norm": 0.6430831551551819, + "learning_rate": 0.0003936904482959346, + "loss": 0.6035, + "step": 10820 + }, + { + "epoch": 0.24105235042735043, + "grad_norm": 0.690912127494812, + "learning_rate": 0.00039367882480846204, + "loss": 0.5751, + "step": 10830 + }, + { + "epoch": 0.2412749287749288, + "grad_norm": 0.664210319519043, + "learning_rate": 0.00039366719079632367, + "loss": 0.5842, + "step": 10840 + }, + { + "epoch": 0.24149750712250712, + "grad_norm": 0.6491913199424744, + "learning_rate": 0.00039365554626015167, + "loss": 0.7105, + "step": 10850 + }, + { + "epoch": 0.24172008547008547, + "grad_norm": 0.7318790555000305, + "learning_rate": 0.00039364389120057866, + "loss": 0.5448, + "step": 10860 + }, + { + "epoch": 0.24194266381766383, + "grad_norm": 0.780798614025116, + "learning_rate": 0.0003936322256182381, + "loss": 0.6862, + "step": 10870 + }, + { + "epoch": 0.24216524216524216, + "grad_norm": 0.8476507067680359, + "learning_rate": 0.0003936205495137639, + "loss": 0.699, + "step": 10880 + }, + { + "epoch": 0.2423878205128205, + "grad_norm": 0.6828676462173462, + "learning_rate": 0.0003936088628877905, + "loss": 0.6713, + "step": 10890 + }, + { + "epoch": 0.24261039886039887, + "grad_norm": 0.8574651479721069, + "learning_rate": 0.00039359716574095306, + "loss": 0.6122, + "step": 10900 + }, + { + "epoch": 0.2428329772079772, + "grad_norm": 0.7367123961448669, + "learning_rate": 0.0003935854580738871, + "loss": 0.6888, + "step": 10910 + }, + { + "epoch": 0.24305555555555555, + "grad_norm": 0.40965545177459717, + "learning_rate": 0.0003935737398872289, + "loss": 0.5856, + "step": 10920 + }, + { + "epoch": 0.2432781339031339, + "grad_norm": 0.5977027416229248, + "learning_rate": 0.0003935620111816151, + "loss": 0.6046, + "step": 10930 + }, + { + "epoch": 0.24350071225071226, + "grad_norm": 0.8471280932426453, + "learning_rate": 0.0003935502719576833, + "loss": 0.7611, + "step": 10940 + }, + { + "epoch": 0.2437232905982906, + "grad_norm": 0.8705294728279114, + "learning_rate": 0.00039353852221607125, + "loss": 0.5893, + "step": 10950 + }, + { + "epoch": 0.24394586894586895, + "grad_norm": 0.43957415223121643, + "learning_rate": 0.0003935267619574174, + "loss": 0.6643, + "step": 10960 + }, + { + "epoch": 0.2441684472934473, + "grad_norm": 0.7031153440475464, + "learning_rate": 0.0003935149911823609, + "loss": 0.7179, + "step": 10970 + }, + { + "epoch": 0.24439102564102563, + "grad_norm": 0.7570757269859314, + "learning_rate": 0.00039350320989154134, + "loss": 0.5547, + "step": 10980 + }, + { + "epoch": 0.244613603988604, + "grad_norm": 0.69059157371521, + "learning_rate": 0.000393491418085599, + "loss": 0.5241, + "step": 10990 + }, + { + "epoch": 0.24483618233618235, + "grad_norm": 0.6872458457946777, + "learning_rate": 0.00039347961576517455, + "loss": 0.6495, + "step": 11000 + }, + { + "epoch": 0.24505876068376067, + "grad_norm": 0.5548238158226013, + "learning_rate": 0.00039346780293090947, + "loss": 0.6809, + "step": 11010 + }, + { + "epoch": 0.24528133903133903, + "grad_norm": 0.7153156399726868, + "learning_rate": 0.0003934559795834455, + "loss": 0.5655, + "step": 11020 + }, + { + "epoch": 0.24550391737891739, + "grad_norm": 0.8266432285308838, + "learning_rate": 0.0003934441457234253, + "loss": 0.7458, + "step": 11030 + }, + { + "epoch": 0.24572649572649571, + "grad_norm": 0.4854261875152588, + "learning_rate": 0.0003934323013514918, + "loss": 0.758, + "step": 11040 + }, + { + "epoch": 0.24594907407407407, + "grad_norm": 0.7041235566139221, + "learning_rate": 0.00039342044646828873, + "loss": 0.6934, + "step": 11050 + }, + { + "epoch": 0.24617165242165243, + "grad_norm": 0.4012024402618408, + "learning_rate": 0.0003934085810744603, + "loss": 0.5695, + "step": 11060 + }, + { + "epoch": 0.24639423076923078, + "grad_norm": 0.6638675332069397, + "learning_rate": 0.00039339670517065116, + "loss": 0.7782, + "step": 11070 + }, + { + "epoch": 0.2466168091168091, + "grad_norm": 0.7643360495567322, + "learning_rate": 0.00039338481875750677, + "loss": 0.582, + "step": 11080 + }, + { + "epoch": 0.24683938746438747, + "grad_norm": 0.5708128213882446, + "learning_rate": 0.0003933729218356731, + "loss": 0.57, + "step": 11090 + }, + { + "epoch": 0.24706196581196582, + "grad_norm": 0.5449902415275574, + "learning_rate": 0.00039336101440579644, + "loss": 0.6559, + "step": 11100 + }, + { + "epoch": 0.24728454415954415, + "grad_norm": 0.8565914630889893, + "learning_rate": 0.00039334909646852396, + "loss": 0.5984, + "step": 11110 + }, + { + "epoch": 0.2475071225071225, + "grad_norm": 0.7144999504089355, + "learning_rate": 0.00039333716802450333, + "loss": 0.816, + "step": 11120 + }, + { + "epoch": 0.24772970085470086, + "grad_norm": 0.5774109363555908, + "learning_rate": 0.00039332522907438276, + "loss": 0.4983, + "step": 11130 + }, + { + "epoch": 0.2479522792022792, + "grad_norm": 0.4489073157310486, + "learning_rate": 0.00039331327961881097, + "loss": 0.7048, + "step": 11140 + }, + { + "epoch": 0.24817485754985755, + "grad_norm": 0.572086751461029, + "learning_rate": 0.0003933013196584373, + "loss": 0.6814, + "step": 11150 + }, + { + "epoch": 0.2483974358974359, + "grad_norm": 0.9465423822402954, + "learning_rate": 0.00039328934919391164, + "loss": 0.6921, + "step": 11160 + }, + { + "epoch": 0.24862001424501423, + "grad_norm": 0.7562800645828247, + "learning_rate": 0.0003932773682258845, + "loss": 0.5815, + "step": 11170 + }, + { + "epoch": 0.2488425925925926, + "grad_norm": 0.7506551146507263, + "learning_rate": 0.00039326537675500703, + "loss": 0.567, + "step": 11180 + }, + { + "epoch": 0.24906517094017094, + "grad_norm": 0.5940730571746826, + "learning_rate": 0.0003932533747819306, + "loss": 0.6497, + "step": 11190 + }, + { + "epoch": 0.2492877492877493, + "grad_norm": 0.6203614473342896, + "learning_rate": 0.00039324136230730776, + "loss": 0.6522, + "step": 11200 + }, + { + "epoch": 0.24951032763532763, + "grad_norm": 0.9097982048988342, + "learning_rate": 0.00039322933933179106, + "loss": 0.6082, + "step": 11210 + }, + { + "epoch": 0.24973290598290598, + "grad_norm": 0.9215477705001831, + "learning_rate": 0.00039321730585603387, + "loss": 0.6086, + "step": 11220 + }, + { + "epoch": 0.24995548433048434, + "grad_norm": 0.3814356327056885, + "learning_rate": 0.0003932052618806901, + "loss": 0.6829, + "step": 11230 + }, + { + "epoch": 0.2501780626780627, + "grad_norm": 0.8164243698120117, + "learning_rate": 0.00039319320740641416, + "loss": 0.6757, + "step": 11240 + }, + { + "epoch": 0.250400641025641, + "grad_norm": 0.6056912541389465, + "learning_rate": 0.00039318114243386124, + "loss": 0.5695, + "step": 11250 + }, + { + "epoch": 0.25062321937321935, + "grad_norm": 0.7039971351623535, + "learning_rate": 0.00039316906696368694, + "loss": 0.6198, + "step": 11260 + }, + { + "epoch": 0.25084579772079774, + "grad_norm": 0.7455176115036011, + "learning_rate": 0.0003931569809965473, + "loss": 0.5613, + "step": 11270 + }, + { + "epoch": 0.25106837606837606, + "grad_norm": 0.5015754103660583, + "learning_rate": 0.0003931448845330993, + "loss": 0.61, + "step": 11280 + }, + { + "epoch": 0.2512909544159544, + "grad_norm": 1.0012961626052856, + "learning_rate": 0.00039313277757400006, + "loss": 0.6641, + "step": 11290 + }, + { + "epoch": 0.2515135327635328, + "grad_norm": 0.6197919845581055, + "learning_rate": 0.00039312066011990763, + "loss": 0.6363, + "step": 11300 + }, + { + "epoch": 0.2517361111111111, + "grad_norm": 0.5845314264297485, + "learning_rate": 0.00039310853217148047, + "loss": 0.6148, + "step": 11310 + }, + { + "epoch": 0.25195868945868943, + "grad_norm": 0.5891401171684265, + "learning_rate": 0.00039309639372937756, + "loss": 0.6119, + "step": 11320 + }, + { + "epoch": 0.2521812678062678, + "grad_norm": 0.5159587264060974, + "learning_rate": 0.00039308424479425846, + "loss": 0.725, + "step": 11330 + }, + { + "epoch": 0.25240384615384615, + "grad_norm": 0.6200169324874878, + "learning_rate": 0.00039307208536678353, + "loss": 0.5924, + "step": 11340 + }, + { + "epoch": 0.25262642450142453, + "grad_norm": 0.8009082674980164, + "learning_rate": 0.00039305991544761335, + "loss": 0.6477, + "step": 11350 + }, + { + "epoch": 0.25284900284900286, + "grad_norm": 0.5633246302604675, + "learning_rate": 0.00039304773503740935, + "loss": 0.6603, + "step": 11360 + }, + { + "epoch": 0.2530715811965812, + "grad_norm": 0.7888376116752625, + "learning_rate": 0.00039303554413683343, + "loss": 0.6561, + "step": 11370 + }, + { + "epoch": 0.25329415954415957, + "grad_norm": 0.8472663760185242, + "learning_rate": 0.00039302334274654793, + "loss": 0.5612, + "step": 11380 + }, + { + "epoch": 0.2535167378917379, + "grad_norm": 0.7532154321670532, + "learning_rate": 0.00039301113086721607, + "loss": 0.534, + "step": 11390 + }, + { + "epoch": 0.2537393162393162, + "grad_norm": 0.7994014024734497, + "learning_rate": 0.0003929989084995013, + "loss": 0.6819, + "step": 11400 + }, + { + "epoch": 0.2539618945868946, + "grad_norm": 0.7608009576797485, + "learning_rate": 0.0003929866756440679, + "loss": 0.6416, + "step": 11410 + }, + { + "epoch": 0.25418447293447294, + "grad_norm": 0.620042085647583, + "learning_rate": 0.0003929744323015805, + "loss": 0.6459, + "step": 11420 + }, + { + "epoch": 0.25440705128205127, + "grad_norm": 0.8690176010131836, + "learning_rate": 0.00039296217847270445, + "loss": 0.5823, + "step": 11430 + }, + { + "epoch": 0.25462962962962965, + "grad_norm": 1.0111145973205566, + "learning_rate": 0.00039294991415810574, + "loss": 0.6488, + "step": 11440 + }, + { + "epoch": 0.254852207977208, + "grad_norm": 0.5568084120750427, + "learning_rate": 0.0003929376393584506, + "loss": 0.6298, + "step": 11450 + }, + { + "epoch": 0.2550747863247863, + "grad_norm": 0.5776306986808777, + "learning_rate": 0.0003929253540744063, + "loss": 0.7216, + "step": 11460 + }, + { + "epoch": 0.2552973646723647, + "grad_norm": 0.7077421545982361, + "learning_rate": 0.00039291305830664033, + "loss": 0.6168, + "step": 11470 + }, + { + "epoch": 0.255519943019943, + "grad_norm": 0.6776533126831055, + "learning_rate": 0.0003929007520558208, + "loss": 0.598, + "step": 11480 + }, + { + "epoch": 0.25574252136752135, + "grad_norm": 0.6140464544296265, + "learning_rate": 0.0003928884353226165, + "loss": 0.7212, + "step": 11490 + }, + { + "epoch": 0.25596509971509973, + "grad_norm": 0.8022708296775818, + "learning_rate": 0.00039287610810769674, + "loss": 0.6462, + "step": 11500 + }, + { + "epoch": 0.25618767806267806, + "grad_norm": 0.6690821647644043, + "learning_rate": 0.00039286377041173134, + "loss": 0.6317, + "step": 11510 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.8081620931625366, + "learning_rate": 0.0003928514222353908, + "loss": 0.7468, + "step": 11520 + }, + { + "epoch": 0.25663283475783477, + "grad_norm": 0.39370104670524597, + "learning_rate": 0.0003928390635793461, + "loss": 0.672, + "step": 11530 + }, + { + "epoch": 0.2568554131054131, + "grad_norm": 0.7194493412971497, + "learning_rate": 0.0003928266944442688, + "loss": 0.7313, + "step": 11540 + }, + { + "epoch": 0.25707799145299143, + "grad_norm": 0.8301408886909485, + "learning_rate": 0.000392814314830831, + "loss": 0.546, + "step": 11550 + }, + { + "epoch": 0.2573005698005698, + "grad_norm": 0.5671295523643494, + "learning_rate": 0.00039280192473970557, + "loss": 0.7288, + "step": 11560 + }, + { + "epoch": 0.25752314814814814, + "grad_norm": 0.942840576171875, + "learning_rate": 0.00039278952417156574, + "loss": 0.6642, + "step": 11570 + }, + { + "epoch": 0.25774572649572647, + "grad_norm": 0.6493533253669739, + "learning_rate": 0.0003927771131270853, + "loss": 0.6634, + "step": 11580 + }, + { + "epoch": 0.25796830484330485, + "grad_norm": 0.7302709221839905, + "learning_rate": 0.0003927646916069387, + "loss": 0.5299, + "step": 11590 + }, + { + "epoch": 0.2581908831908832, + "grad_norm": 0.7260543704032898, + "learning_rate": 0.000392752259611801, + "loss": 0.8042, + "step": 11600 + }, + { + "epoch": 0.25841346153846156, + "grad_norm": 0.6935957670211792, + "learning_rate": 0.0003927398171423477, + "loss": 0.5939, + "step": 11610 + }, + { + "epoch": 0.2586360398860399, + "grad_norm": 1.0461655855178833, + "learning_rate": 0.00039272736419925495, + "loss": 0.6719, + "step": 11620 + }, + { + "epoch": 0.2588586182336182, + "grad_norm": 0.7319514155387878, + "learning_rate": 0.00039271490078319945, + "loss": 0.6549, + "step": 11630 + }, + { + "epoch": 0.2590811965811966, + "grad_norm": 0.5853979587554932, + "learning_rate": 0.0003927024268948585, + "loss": 0.6174, + "step": 11640 + }, + { + "epoch": 0.25930377492877493, + "grad_norm": 0.4176386594772339, + "learning_rate": 0.00039268994253490987, + "loss": 0.5942, + "step": 11650 + }, + { + "epoch": 0.25952635327635326, + "grad_norm": 0.6766052842140198, + "learning_rate": 0.000392677447704032, + "loss": 0.6723, + "step": 11660 + }, + { + "epoch": 0.25974893162393164, + "grad_norm": 0.6937527060508728, + "learning_rate": 0.000392664942402904, + "loss": 0.7434, + "step": 11670 + }, + { + "epoch": 0.25997150997151, + "grad_norm": 0.32973021268844604, + "learning_rate": 0.0003926524266322052, + "loss": 0.5382, + "step": 11680 + }, + { + "epoch": 0.2601940883190883, + "grad_norm": 0.8505682349205017, + "learning_rate": 0.0003926399003926159, + "loss": 0.6913, + "step": 11690 + }, + { + "epoch": 0.2604166666666667, + "grad_norm": 0.7429829835891724, + "learning_rate": 0.00039262736368481663, + "loss": 0.5692, + "step": 11700 + }, + { + "epoch": 0.260639245014245, + "grad_norm": 0.8143709897994995, + "learning_rate": 0.0003926148165094888, + "loss": 0.6598, + "step": 11710 + }, + { + "epoch": 0.26086182336182334, + "grad_norm": 0.7106712460517883, + "learning_rate": 0.0003926022588673142, + "loss": 0.6684, + "step": 11720 + }, + { + "epoch": 0.2610844017094017, + "grad_norm": 1.1324150562286377, + "learning_rate": 0.0003925896907589751, + "loss": 0.647, + "step": 11730 + }, + { + "epoch": 0.26130698005698005, + "grad_norm": 0.43803641200065613, + "learning_rate": 0.0003925771121851545, + "loss": 0.6768, + "step": 11740 + }, + { + "epoch": 0.2615295584045584, + "grad_norm": 0.8513333201408386, + "learning_rate": 0.00039256452314653605, + "loss": 0.6525, + "step": 11750 + }, + { + "epoch": 0.26175213675213677, + "grad_norm": 0.8602158427238464, + "learning_rate": 0.0003925519236438038, + "loss": 0.6689, + "step": 11760 + }, + { + "epoch": 0.2619747150997151, + "grad_norm": 0.41731908917427063, + "learning_rate": 0.0003925393136776423, + "loss": 0.5316, + "step": 11770 + }, + { + "epoch": 0.2621972934472934, + "grad_norm": 0.863296389579773, + "learning_rate": 0.0003925266932487369, + "loss": 0.7248, + "step": 11780 + }, + { + "epoch": 0.2624198717948718, + "grad_norm": 0.6863886117935181, + "learning_rate": 0.00039251406235777346, + "loss": 0.5437, + "step": 11790 + }, + { + "epoch": 0.26264245014245013, + "grad_norm": 0.6773598194122314, + "learning_rate": 0.00039250142100543815, + "loss": 0.6482, + "step": 11800 + }, + { + "epoch": 0.26286502849002846, + "grad_norm": 0.6856159567832947, + "learning_rate": 0.0003924887691924181, + "loss": 0.7223, + "step": 11810 + }, + { + "epoch": 0.26308760683760685, + "grad_norm": 0.770422637462616, + "learning_rate": 0.00039247610691940074, + "loss": 0.652, + "step": 11820 + }, + { + "epoch": 0.2633101851851852, + "grad_norm": 0.5794972777366638, + "learning_rate": 0.00039246343418707417, + "loss": 0.6338, + "step": 11830 + }, + { + "epoch": 0.26353276353276356, + "grad_norm": 0.6820332407951355, + "learning_rate": 0.000392450750996127, + "loss": 0.7889, + "step": 11840 + }, + { + "epoch": 0.2637553418803419, + "grad_norm": 0.7752724885940552, + "learning_rate": 0.0003924380573472485, + "loss": 0.7358, + "step": 11850 + }, + { + "epoch": 0.2639779202279202, + "grad_norm": 0.606903076171875, + "learning_rate": 0.0003924253532411284, + "loss": 0.6442, + "step": 11860 + }, + { + "epoch": 0.2642004985754986, + "grad_norm": 0.4325622320175171, + "learning_rate": 0.00039241263867845715, + "loss": 0.6324, + "step": 11870 + }, + { + "epoch": 0.2644230769230769, + "grad_norm": 0.7513414025306702, + "learning_rate": 0.00039239991365992553, + "loss": 0.5955, + "step": 11880 + }, + { + "epoch": 0.26464565527065526, + "grad_norm": 0.9659456610679626, + "learning_rate": 0.0003923871781862251, + "loss": 0.8048, + "step": 11890 + }, + { + "epoch": 0.26486823361823364, + "grad_norm": 0.733268141746521, + "learning_rate": 0.00039237443225804795, + "loss": 0.6318, + "step": 11900 + }, + { + "epoch": 0.26509081196581197, + "grad_norm": 0.7908543348312378, + "learning_rate": 0.0003923616758760867, + "loss": 0.627, + "step": 11910 + }, + { + "epoch": 0.2653133903133903, + "grad_norm": 0.7832129597663879, + "learning_rate": 0.00039234890904103444, + "loss": 0.7449, + "step": 11920 + }, + { + "epoch": 0.2655359686609687, + "grad_norm": 0.6216659545898438, + "learning_rate": 0.000392336131753585, + "loss": 0.579, + "step": 11930 + }, + { + "epoch": 0.265758547008547, + "grad_norm": 0.5633769631385803, + "learning_rate": 0.0003923233440144327, + "loss": 0.6811, + "step": 11940 + }, + { + "epoch": 0.26598112535612534, + "grad_norm": 0.8467236757278442, + "learning_rate": 0.00039231054582427243, + "loss": 0.6787, + "step": 11950 + }, + { + "epoch": 0.2662037037037037, + "grad_norm": 0.5862749218940735, + "learning_rate": 0.00039229773718379974, + "loss": 0.5676, + "step": 11960 + }, + { + "epoch": 0.26642628205128205, + "grad_norm": 0.7872330546379089, + "learning_rate": 0.00039228491809371055, + "loss": 0.6813, + "step": 11970 + }, + { + "epoch": 0.2666488603988604, + "grad_norm": 0.535454273223877, + "learning_rate": 0.0003922720885547015, + "loss": 0.5713, + "step": 11980 + }, + { + "epoch": 0.26687143874643876, + "grad_norm": 0.6369079351425171, + "learning_rate": 0.0003922592485674697, + "loss": 0.6077, + "step": 11990 + }, + { + "epoch": 0.2670940170940171, + "grad_norm": 0.4691181778907776, + "learning_rate": 0.00039224639813271306, + "loss": 0.5685, + "step": 12000 + }, + { + "epoch": 0.2673165954415954, + "grad_norm": 0.8555895686149597, + "learning_rate": 0.0003922335372511297, + "loss": 0.6186, + "step": 12010 + }, + { + "epoch": 0.2675391737891738, + "grad_norm": 0.7896236777305603, + "learning_rate": 0.00039222066592341855, + "loss": 0.6436, + "step": 12020 + }, + { + "epoch": 0.26776175213675213, + "grad_norm": 0.651438295841217, + "learning_rate": 0.000392207784150279, + "loss": 0.6381, + "step": 12030 + }, + { + "epoch": 0.26798433048433046, + "grad_norm": 0.7535800933837891, + "learning_rate": 0.00039219489193241124, + "loss": 0.6344, + "step": 12040 + }, + { + "epoch": 0.26820690883190884, + "grad_norm": 0.7681666612625122, + "learning_rate": 0.0003921819892705156, + "loss": 0.6896, + "step": 12050 + }, + { + "epoch": 0.26842948717948717, + "grad_norm": 0.8771412968635559, + "learning_rate": 0.00039216907616529336, + "loss": 0.4947, + "step": 12060 + }, + { + "epoch": 0.26865206552706555, + "grad_norm": 0.7458842396736145, + "learning_rate": 0.00039215615261744625, + "loss": 0.5895, + "step": 12070 + }, + { + "epoch": 0.2688746438746439, + "grad_norm": 0.7990144491195679, + "learning_rate": 0.0003921432186276765, + "loss": 0.5073, + "step": 12080 + }, + { + "epoch": 0.2690972222222222, + "grad_norm": 0.8297378420829773, + "learning_rate": 0.0003921302741966869, + "loss": 0.7029, + "step": 12090 + }, + { + "epoch": 0.2693198005698006, + "grad_norm": 0.7059113383293152, + "learning_rate": 0.000392117319325181, + "loss": 0.8696, + "step": 12100 + }, + { + "epoch": 0.2695423789173789, + "grad_norm": 0.514255702495575, + "learning_rate": 0.0003921043540138626, + "loss": 0.6366, + "step": 12110 + }, + { + "epoch": 0.26976495726495725, + "grad_norm": 0.6856957674026489, + "learning_rate": 0.00039209137826343637, + "loss": 0.5064, + "step": 12120 + }, + { + "epoch": 0.26998753561253563, + "grad_norm": 0.6941282749176025, + "learning_rate": 0.00039207839207460745, + "loss": 0.522, + "step": 12130 + }, + { + "epoch": 0.27021011396011396, + "grad_norm": 0.8638391494750977, + "learning_rate": 0.0003920653954480813, + "loss": 0.7316, + "step": 12140 + }, + { + "epoch": 0.2704326923076923, + "grad_norm": 0.792180597782135, + "learning_rate": 0.0003920523883845645, + "loss": 0.6044, + "step": 12150 + }, + { + "epoch": 0.2706552706552707, + "grad_norm": 0.9648000597953796, + "learning_rate": 0.00039203937088476366, + "loss": 0.6755, + "step": 12160 + }, + { + "epoch": 0.270877849002849, + "grad_norm": 0.6021059155464172, + "learning_rate": 0.00039202634294938614, + "loss": 0.6504, + "step": 12170 + }, + { + "epoch": 0.27110042735042733, + "grad_norm": 0.8093569278717041, + "learning_rate": 0.00039201330457914, + "loss": 0.8009, + "step": 12180 + }, + { + "epoch": 0.2713230056980057, + "grad_norm": 0.5667636394500732, + "learning_rate": 0.0003920002557747337, + "loss": 0.5784, + "step": 12190 + }, + { + "epoch": 0.27154558404558404, + "grad_norm": 0.672580361366272, + "learning_rate": 0.00039198719653687624, + "loss": 0.6791, + "step": 12200 + }, + { + "epoch": 0.27176816239316237, + "grad_norm": 0.520078718662262, + "learning_rate": 0.0003919741268662774, + "loss": 0.5996, + "step": 12210 + }, + { + "epoch": 0.27199074074074076, + "grad_norm": 0.5578593015670776, + "learning_rate": 0.0003919610467636474, + "loss": 0.6789, + "step": 12220 + }, + { + "epoch": 0.2722133190883191, + "grad_norm": 1.0336942672729492, + "learning_rate": 0.0003919479562296969, + "loss": 0.7435, + "step": 12230 + }, + { + "epoch": 0.2724358974358974, + "grad_norm": 0.7458341717720032, + "learning_rate": 0.00039193485526513734, + "loss": 0.623, + "step": 12240 + }, + { + "epoch": 0.2726584757834758, + "grad_norm": 0.44324880838394165, + "learning_rate": 0.0003919217438706807, + "loss": 0.5499, + "step": 12250 + }, + { + "epoch": 0.2728810541310541, + "grad_norm": 0.5924796462059021, + "learning_rate": 0.00039190862204703926, + "loss": 0.5998, + "step": 12260 + }, + { + "epoch": 0.27310363247863245, + "grad_norm": 0.6144111752510071, + "learning_rate": 0.00039189548979492626, + "loss": 0.5662, + "step": 12270 + }, + { + "epoch": 0.27332621082621084, + "grad_norm": 0.5802494287490845, + "learning_rate": 0.0003918823471150552, + "loss": 0.5966, + "step": 12280 + }, + { + "epoch": 0.27354878917378916, + "grad_norm": 0.6217107772827148, + "learning_rate": 0.0003918691940081404, + "loss": 0.6996, + "step": 12290 + }, + { + "epoch": 0.27377136752136755, + "grad_norm": 0.8809908032417297, + "learning_rate": 0.0003918560304748965, + "loss": 0.6406, + "step": 12300 + }, + { + "epoch": 0.2739939458689459, + "grad_norm": 3.5845541954040527, + "learning_rate": 0.0003918428565160388, + "loss": 0.6467, + "step": 12310 + }, + { + "epoch": 0.2742165242165242, + "grad_norm": 0.366034597158432, + "learning_rate": 0.00039182967213228327, + "loss": 0.5746, + "step": 12320 + }, + { + "epoch": 0.2744391025641026, + "grad_norm": 0.6246192455291748, + "learning_rate": 0.0003918164773243463, + "loss": 0.6412, + "step": 12330 + }, + { + "epoch": 0.2746616809116809, + "grad_norm": 1.0219444036483765, + "learning_rate": 0.0003918032720929449, + "loss": 0.5756, + "step": 12340 + }, + { + "epoch": 0.27488425925925924, + "grad_norm": 0.9017455577850342, + "learning_rate": 0.0003917900564387967, + "loss": 0.66, + "step": 12350 + }, + { + "epoch": 0.27510683760683763, + "grad_norm": 0.8596403002738953, + "learning_rate": 0.00039177683036261985, + "loss": 0.712, + "step": 12360 + }, + { + "epoch": 0.27532941595441596, + "grad_norm": 0.7327362895011902, + "learning_rate": 0.000391763593865133, + "loss": 0.7019, + "step": 12370 + }, + { + "epoch": 0.2755519943019943, + "grad_norm": 0.42158403992652893, + "learning_rate": 0.0003917503469470555, + "loss": 0.6587, + "step": 12380 + }, + { + "epoch": 0.27577457264957267, + "grad_norm": 0.9592531323432922, + "learning_rate": 0.00039173708960910716, + "loss": 0.6499, + "step": 12390 + }, + { + "epoch": 0.275997150997151, + "grad_norm": 1.0942060947418213, + "learning_rate": 0.0003917238218520084, + "loss": 0.692, + "step": 12400 + }, + { + "epoch": 0.2762197293447293, + "grad_norm": 0.8639572858810425, + "learning_rate": 0.0003917105436764803, + "loss": 0.6718, + "step": 12410 + }, + { + "epoch": 0.2764423076923077, + "grad_norm": 0.682167112827301, + "learning_rate": 0.0003916972550832442, + "loss": 0.6555, + "step": 12420 + }, + { + "epoch": 0.27666488603988604, + "grad_norm": 0.6073299646377563, + "learning_rate": 0.0003916839560730224, + "loss": 0.5755, + "step": 12430 + }, + { + "epoch": 0.27688746438746437, + "grad_norm": 0.8207989931106567, + "learning_rate": 0.0003916706466465375, + "loss": 0.6832, + "step": 12440 + }, + { + "epoch": 0.27711004273504275, + "grad_norm": 0.5853878259658813, + "learning_rate": 0.00039165732680451266, + "loss": 0.6955, + "step": 12450 + }, + { + "epoch": 0.2773326210826211, + "grad_norm": 0.6994444131851196, + "learning_rate": 0.0003916439965476718, + "loss": 0.591, + "step": 12460 + }, + { + "epoch": 0.2775551994301994, + "grad_norm": 0.5923715233802795, + "learning_rate": 0.0003916306558767394, + "loss": 0.5649, + "step": 12470 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.698859691619873, + "learning_rate": 0.00039161730479244023, + "loss": 0.7315, + "step": 12480 + }, + { + "epoch": 0.2780003561253561, + "grad_norm": 0.554038405418396, + "learning_rate": 0.0003916039432954998, + "loss": 0.5093, + "step": 12490 + }, + { + "epoch": 0.27822293447293445, + "grad_norm": 0.8297765851020813, + "learning_rate": 0.00039159057138664425, + "loss": 0.5514, + "step": 12500 + }, + { + "epoch": 0.27844551282051283, + "grad_norm": 0.6924365758895874, + "learning_rate": 0.00039157718906660026, + "loss": 0.5219, + "step": 12510 + }, + { + "epoch": 0.27866809116809116, + "grad_norm": 0.5998541116714478, + "learning_rate": 0.00039156379633609493, + "loss": 0.6241, + "step": 12520 + }, + { + "epoch": 0.27889066951566954, + "grad_norm": 0.4820443391799927, + "learning_rate": 0.00039155039319585614, + "loss": 0.7489, + "step": 12530 + }, + { + "epoch": 0.27911324786324787, + "grad_norm": 0.48114636540412903, + "learning_rate": 0.0003915369796466121, + "loss": 0.4851, + "step": 12540 + }, + { + "epoch": 0.2793358262108262, + "grad_norm": 0.8057467937469482, + "learning_rate": 0.00039152355568909187, + "loss": 0.6562, + "step": 12550 + }, + { + "epoch": 0.2795584045584046, + "grad_norm": 0.6877124905586243, + "learning_rate": 0.0003915101213240248, + "loss": 0.6969, + "step": 12560 + }, + { + "epoch": 0.2797809829059829, + "grad_norm": 0.775690495967865, + "learning_rate": 0.00039149667655214094, + "loss": 0.642, + "step": 12570 + }, + { + "epoch": 0.28000356125356124, + "grad_norm": 0.80083829164505, + "learning_rate": 0.0003914832213741709, + "loss": 0.6972, + "step": 12580 + }, + { + "epoch": 0.2802261396011396, + "grad_norm": 1.017709493637085, + "learning_rate": 0.00039146975579084584, + "loss": 0.6516, + "step": 12590 + }, + { + "epoch": 0.28044871794871795, + "grad_norm": 0.7244583964347839, + "learning_rate": 0.0003914562798028976, + "loss": 0.6617, + "step": 12600 + }, + { + "epoch": 0.2806712962962963, + "grad_norm": 0.58905029296875, + "learning_rate": 0.0003914427934110583, + "loss": 0.6822, + "step": 12610 + }, + { + "epoch": 0.28089387464387466, + "grad_norm": 0.7043224573135376, + "learning_rate": 0.0003914292966160609, + "loss": 0.5562, + "step": 12620 + }, + { + "epoch": 0.281116452991453, + "grad_norm": 0.8032645583152771, + "learning_rate": 0.00039141578941863885, + "loss": 0.5138, + "step": 12630 + }, + { + "epoch": 0.2813390313390313, + "grad_norm": 0.8150473833084106, + "learning_rate": 0.0003914022718195261, + "loss": 0.6488, + "step": 12640 + }, + { + "epoch": 0.2815616096866097, + "grad_norm": 0.8619401454925537, + "learning_rate": 0.00039138874381945715, + "loss": 0.5323, + "step": 12650 + }, + { + "epoch": 0.28178418803418803, + "grad_norm": 0.5432770848274231, + "learning_rate": 0.00039137520541916726, + "loss": 0.6153, + "step": 12660 + }, + { + "epoch": 0.28200676638176636, + "grad_norm": 0.9435534477233887, + "learning_rate": 0.00039136165661939195, + "loss": 0.7069, + "step": 12670 + }, + { + "epoch": 0.28222934472934474, + "grad_norm": 0.5666217803955078, + "learning_rate": 0.0003913480974208676, + "loss": 0.5967, + "step": 12680 + }, + { + "epoch": 0.2824519230769231, + "grad_norm": 1.044399619102478, + "learning_rate": 0.00039133452782433097, + "loss": 0.6681, + "step": 12690 + }, + { + "epoch": 0.2826745014245014, + "grad_norm": 0.5483806133270264, + "learning_rate": 0.0003913209478305196, + "loss": 0.6281, + "step": 12700 + }, + { + "epoch": 0.2828970797720798, + "grad_norm": 0.8065449595451355, + "learning_rate": 0.00039130735744017113, + "loss": 0.5793, + "step": 12710 + }, + { + "epoch": 0.2831196581196581, + "grad_norm": 0.5208731889724731, + "learning_rate": 0.00039129375665402434, + "loss": 0.7103, + "step": 12720 + }, + { + "epoch": 0.28334223646723644, + "grad_norm": 0.821640133857727, + "learning_rate": 0.0003912801454728181, + "loss": 0.7682, + "step": 12730 + }, + { + "epoch": 0.2835648148148148, + "grad_norm": 0.6433324813842773, + "learning_rate": 0.0003912665238972923, + "loss": 0.6647, + "step": 12740 + }, + { + "epoch": 0.28378739316239315, + "grad_norm": 0.7271362543106079, + "learning_rate": 0.000391252891928187, + "loss": 0.5601, + "step": 12750 + }, + { + "epoch": 0.28400997150997154, + "grad_norm": 0.5222648978233337, + "learning_rate": 0.00039123924956624293, + "loss": 0.6467, + "step": 12760 + }, + { + "epoch": 0.28423254985754987, + "grad_norm": 0.7300889492034912, + "learning_rate": 0.00039122559681220153, + "loss": 0.5843, + "step": 12770 + }, + { + "epoch": 0.2844551282051282, + "grad_norm": 0.6423002481460571, + "learning_rate": 0.0003912119336668046, + "loss": 0.6337, + "step": 12780 + }, + { + "epoch": 0.2846777065527066, + "grad_norm": 0.568601667881012, + "learning_rate": 0.00039119826013079466, + "loss": 0.6153, + "step": 12790 + }, + { + "epoch": 0.2849002849002849, + "grad_norm": 0.9605054259300232, + "learning_rate": 0.0003911845762049148, + "loss": 0.6467, + "step": 12800 + }, + { + "epoch": 0.28512286324786323, + "grad_norm": 0.6418153643608093, + "learning_rate": 0.0003911708818899086, + "loss": 0.5672, + "step": 12810 + }, + { + "epoch": 0.2853454415954416, + "grad_norm": 0.5108728408813477, + "learning_rate": 0.00039115717718652006, + "loss": 0.7055, + "step": 12820 + }, + { + "epoch": 0.28556801994301995, + "grad_norm": 0.8929889798164368, + "learning_rate": 0.0003911434620954941, + "loss": 0.6672, + "step": 12830 + }, + { + "epoch": 0.2857905982905983, + "grad_norm": 0.8994865417480469, + "learning_rate": 0.00039112973661757584, + "loss": 0.7828, + "step": 12840 + }, + { + "epoch": 0.28601317663817666, + "grad_norm": 0.5077641010284424, + "learning_rate": 0.00039111600075351135, + "loss": 0.7327, + "step": 12850 + }, + { + "epoch": 0.286235754985755, + "grad_norm": 0.6523764729499817, + "learning_rate": 0.00039110225450404685, + "loss": 0.7155, + "step": 12860 + }, + { + "epoch": 0.2864583333333333, + "grad_norm": 0.5647468566894531, + "learning_rate": 0.0003910884978699294, + "loss": 0.5385, + "step": 12870 + }, + { + "epoch": 0.2866809116809117, + "grad_norm": 0.6217672228813171, + "learning_rate": 0.00039107473085190654, + "loss": 0.4422, + "step": 12880 + }, + { + "epoch": 0.28690349002849, + "grad_norm": 0.7185326814651489, + "learning_rate": 0.00039106095345072645, + "loss": 0.7382, + "step": 12890 + }, + { + "epoch": 0.28712606837606836, + "grad_norm": 0.6368201971054077, + "learning_rate": 0.0003910471656671376, + "loss": 0.6403, + "step": 12900 + }, + { + "epoch": 0.28734864672364674, + "grad_norm": 0.6725485324859619, + "learning_rate": 0.00039103336750188954, + "loss": 0.4992, + "step": 12910 + }, + { + "epoch": 0.28757122507122507, + "grad_norm": 0.7353759407997131, + "learning_rate": 0.0003910195589557318, + "loss": 0.7156, + "step": 12920 + }, + { + "epoch": 0.2877938034188034, + "grad_norm": 0.4989571273326874, + "learning_rate": 0.0003910057400294149, + "loss": 0.6063, + "step": 12930 + }, + { + "epoch": 0.2880163817663818, + "grad_norm": 0.48165708780288696, + "learning_rate": 0.00039099191072368964, + "loss": 0.6651, + "step": 12940 + }, + { + "epoch": 0.2882389601139601, + "grad_norm": 0.8041085004806519, + "learning_rate": 0.0003909780710393077, + "loss": 0.6656, + "step": 12950 + }, + { + "epoch": 0.28846153846153844, + "grad_norm": 0.6328619122505188, + "learning_rate": 0.00039096422097702096, + "loss": 0.5947, + "step": 12960 + }, + { + "epoch": 0.2886841168091168, + "grad_norm": 0.9309022426605225, + "learning_rate": 0.00039095036053758215, + "loss": 0.6813, + "step": 12970 + }, + { + "epoch": 0.28890669515669515, + "grad_norm": 0.703187108039856, + "learning_rate": 0.0003909364897217445, + "loss": 0.5864, + "step": 12980 + }, + { + "epoch": 0.28912927350427353, + "grad_norm": 0.8175052404403687, + "learning_rate": 0.0003909226085302616, + "loss": 0.6581, + "step": 12990 + }, + { + "epoch": 0.28935185185185186, + "grad_norm": 1.1442813873291016, + "learning_rate": 0.00039090871696388787, + "loss": 0.7457, + "step": 13000 + }, + { + "epoch": 0.2895744301994302, + "grad_norm": 0.6309537887573242, + "learning_rate": 0.0003908948150233782, + "loss": 0.7326, + "step": 13010 + }, + { + "epoch": 0.28979700854700857, + "grad_norm": 0.7221547365188599, + "learning_rate": 0.0003908809027094879, + "loss": 0.5485, + "step": 13020 + }, + { + "epoch": 0.2900195868945869, + "grad_norm": 0.5818193554878235, + "learning_rate": 0.0003908669800229732, + "loss": 0.6009, + "step": 13030 + }, + { + "epoch": 0.29024216524216523, + "grad_norm": 0.5316341519355774, + "learning_rate": 0.0003908530469645905, + "loss": 0.5764, + "step": 13040 + }, + { + "epoch": 0.2904647435897436, + "grad_norm": 0.6840851902961731, + "learning_rate": 0.00039083910353509703, + "loss": 0.6708, + "step": 13050 + }, + { + "epoch": 0.29068732193732194, + "grad_norm": 0.9945989847183228, + "learning_rate": 0.0003908251497352505, + "loss": 0.6217, + "step": 13060 + }, + { + "epoch": 0.29090990028490027, + "grad_norm": 0.6313449144363403, + "learning_rate": 0.000390811185565809, + "loss": 0.7558, + "step": 13070 + }, + { + "epoch": 0.29113247863247865, + "grad_norm": 0.8493428230285645, + "learning_rate": 0.0003907972110275315, + "loss": 0.6076, + "step": 13080 + }, + { + "epoch": 0.291355056980057, + "grad_norm": 0.5349470973014832, + "learning_rate": 0.0003907832261211774, + "loss": 0.7087, + "step": 13090 + }, + { + "epoch": 0.2915776353276353, + "grad_norm": 0.5637439489364624, + "learning_rate": 0.0003907692308475066, + "loss": 0.6684, + "step": 13100 + }, + { + "epoch": 0.2918002136752137, + "grad_norm": 0.7880241274833679, + "learning_rate": 0.0003907552252072796, + "loss": 0.7463, + "step": 13110 + }, + { + "epoch": 0.292022792022792, + "grad_norm": 0.8356769680976868, + "learning_rate": 0.00039074120920125756, + "loss": 0.5763, + "step": 13120 + }, + { + "epoch": 0.29224537037037035, + "grad_norm": 0.9101738333702087, + "learning_rate": 0.000390727182830202, + "loss": 0.6954, + "step": 13130 + }, + { + "epoch": 0.29246794871794873, + "grad_norm": 0.49127423763275146, + "learning_rate": 0.0003907131460948752, + "loss": 0.6114, + "step": 13140 + }, + { + "epoch": 0.29269052706552706, + "grad_norm": 0.4214502274990082, + "learning_rate": 0.00039069909899604, + "loss": 0.5386, + "step": 13150 + }, + { + "epoch": 0.2929131054131054, + "grad_norm": 0.9073315262794495, + "learning_rate": 0.0003906850415344595, + "loss": 0.6927, + "step": 13160 + }, + { + "epoch": 0.2931356837606838, + "grad_norm": 0.35063672065734863, + "learning_rate": 0.0003906709737108978, + "loss": 0.6315, + "step": 13170 + }, + { + "epoch": 0.2933582621082621, + "grad_norm": 0.4961087107658386, + "learning_rate": 0.0003906568955261193, + "loss": 0.6075, + "step": 13180 + }, + { + "epoch": 0.29358084045584043, + "grad_norm": 0.7128926515579224, + "learning_rate": 0.00039064280698088903, + "loss": 0.6542, + "step": 13190 + }, + { + "epoch": 0.2938034188034188, + "grad_norm": 0.7185338735580444, + "learning_rate": 0.0003906287080759726, + "loss": 0.6805, + "step": 13200 + }, + { + "epoch": 0.29402599715099714, + "grad_norm": 0.7085132002830505, + "learning_rate": 0.00039061459881213606, + "loss": 0.6612, + "step": 13210 + }, + { + "epoch": 0.29424857549857547, + "grad_norm": 0.8242709040641785, + "learning_rate": 0.00039060047919014623, + "loss": 0.5647, + "step": 13220 + }, + { + "epoch": 0.29447115384615385, + "grad_norm": 0.8153027296066284, + "learning_rate": 0.0003905863492107702, + "loss": 0.6909, + "step": 13230 + }, + { + "epoch": 0.2946937321937322, + "grad_norm": 0.6219608783721924, + "learning_rate": 0.00039057220887477615, + "loss": 0.6192, + "step": 13240 + }, + { + "epoch": 0.29491631054131057, + "grad_norm": 0.9387757182121277, + "learning_rate": 0.00039055805818293205, + "loss": 0.6474, + "step": 13250 + }, + { + "epoch": 0.2951388888888889, + "grad_norm": 0.538312554359436, + "learning_rate": 0.00039054389713600717, + "loss": 0.5752, + "step": 13260 + }, + { + "epoch": 0.2953614672364672, + "grad_norm": 0.6803849935531616, + "learning_rate": 0.00039052972573477097, + "loss": 0.4585, + "step": 13270 + }, + { + "epoch": 0.2955840455840456, + "grad_norm": 0.5901275873184204, + "learning_rate": 0.0003905155439799934, + "loss": 0.6493, + "step": 13280 + }, + { + "epoch": 0.29580662393162394, + "grad_norm": 0.49050992727279663, + "learning_rate": 0.00039050135187244526, + "loss": 0.6126, + "step": 13290 + }, + { + "epoch": 0.29602920227920226, + "grad_norm": 0.4540439546108246, + "learning_rate": 0.0003904871494128977, + "loss": 0.4577, + "step": 13300 + }, + { + "epoch": 0.29625178062678065, + "grad_norm": 0.7715014219284058, + "learning_rate": 0.0003904729366021225, + "loss": 0.5782, + "step": 13310 + }, + { + "epoch": 0.296474358974359, + "grad_norm": 0.39345112442970276, + "learning_rate": 0.000390458713440892, + "loss": 0.5754, + "step": 13320 + }, + { + "epoch": 0.2966969373219373, + "grad_norm": 0.7827655076980591, + "learning_rate": 0.0003904444799299791, + "loss": 0.7669, + "step": 13330 + }, + { + "epoch": 0.2969195156695157, + "grad_norm": 0.7510057091712952, + "learning_rate": 0.0003904302360701572, + "loss": 0.4807, + "step": 13340 + }, + { + "epoch": 0.297142094017094, + "grad_norm": 0.7219814658164978, + "learning_rate": 0.0003904159818622005, + "loss": 0.581, + "step": 13350 + }, + { + "epoch": 0.29736467236467234, + "grad_norm": 0.6957118511199951, + "learning_rate": 0.0003904017173068834, + "loss": 0.5917, + "step": 13360 + }, + { + "epoch": 0.29758725071225073, + "grad_norm": 0.4547037184238434, + "learning_rate": 0.00039038744240498105, + "loss": 0.5798, + "step": 13370 + }, + { + "epoch": 0.29780982905982906, + "grad_norm": 0.6696369647979736, + "learning_rate": 0.0003903731571572693, + "loss": 0.6329, + "step": 13380 + }, + { + "epoch": 0.2980324074074074, + "grad_norm": 0.6257852911949158, + "learning_rate": 0.00039035886156452436, + "loss": 0.6221, + "step": 13390 + }, + { + "epoch": 0.29825498575498577, + "grad_norm": 0.8077847957611084, + "learning_rate": 0.000390344555627523, + "loss": 0.6277, + "step": 13400 + }, + { + "epoch": 0.2984775641025641, + "grad_norm": 0.8844859600067139, + "learning_rate": 0.0003903302393470426, + "loss": 0.6486, + "step": 13410 + }, + { + "epoch": 0.2987001424501424, + "grad_norm": 0.8643153309822083, + "learning_rate": 0.0003903159127238613, + "loss": 0.6019, + "step": 13420 + }, + { + "epoch": 0.2989227207977208, + "grad_norm": 0.7332974672317505, + "learning_rate": 0.0003903015757587574, + "loss": 0.5546, + "step": 13430 + }, + { + "epoch": 0.29914529914529914, + "grad_norm": 0.7666581869125366, + "learning_rate": 0.0003902872284525102, + "loss": 0.5489, + "step": 13440 + }, + { + "epoch": 0.29936787749287747, + "grad_norm": 0.5811508893966675, + "learning_rate": 0.0003902728708058991, + "loss": 0.6611, + "step": 13450 + }, + { + "epoch": 0.29959045584045585, + "grad_norm": 0.9579280018806458, + "learning_rate": 0.00039025850281970454, + "loss": 0.6002, + "step": 13460 + }, + { + "epoch": 0.2998130341880342, + "grad_norm": 0.5538046956062317, + "learning_rate": 0.00039024412449470717, + "loss": 0.7622, + "step": 13470 + }, + { + "epoch": 0.30003561253561256, + "grad_norm": 0.609813928604126, + "learning_rate": 0.0003902297358316883, + "loss": 0.6494, + "step": 13480 + }, + { + "epoch": 0.30003561253561256, + "eval_loss": 0.6374564170837402, + "eval_runtime": 337.424, + "eval_samples_per_second": 7.009, + "eval_steps_per_second": 7.009, + "step": 13480 + }, + { + "epoch": 0.3002581908831909, + "grad_norm": 0.7406212687492371, + "learning_rate": 0.00039021533683142984, + "loss": 0.6261, + "step": 13490 + }, + { + "epoch": 0.3004807692307692, + "grad_norm": 0.8207390308380127, + "learning_rate": 0.00039020092749471433, + "loss": 0.6493, + "step": 13500 + }, + { + "epoch": 0.3007033475783476, + "grad_norm": 0.5859150886535645, + "learning_rate": 0.00039018650782232466, + "loss": 0.5998, + "step": 13510 + }, + { + "epoch": 0.30092592592592593, + "grad_norm": 0.5716903805732727, + "learning_rate": 0.0003901720778150445, + "loss": 0.6489, + "step": 13520 + }, + { + "epoch": 0.30114850427350426, + "grad_norm": 0.7157092094421387, + "learning_rate": 0.0003901576374736579, + "loss": 0.7296, + "step": 13530 + }, + { + "epoch": 0.30137108262108264, + "grad_norm": 0.8354113101959229, + "learning_rate": 0.0003901431867989496, + "loss": 0.6373, + "step": 13540 + }, + { + "epoch": 0.30159366096866097, + "grad_norm": 0.7195545434951782, + "learning_rate": 0.00039012872579170495, + "loss": 0.5963, + "step": 13550 + }, + { + "epoch": 0.3018162393162393, + "grad_norm": 0.5032168030738831, + "learning_rate": 0.00039011425445270966, + "loss": 0.5855, + "step": 13560 + }, + { + "epoch": 0.3020388176638177, + "grad_norm": 0.6268942356109619, + "learning_rate": 0.0003900997727827501, + "loss": 0.5637, + "step": 13570 + }, + { + "epoch": 0.302261396011396, + "grad_norm": 0.7669327259063721, + "learning_rate": 0.0003900852807826133, + "loss": 0.5935, + "step": 13580 + }, + { + "epoch": 0.30248397435897434, + "grad_norm": 0.6793097853660583, + "learning_rate": 0.0003900707784530867, + "loss": 0.696, + "step": 13590 + }, + { + "epoch": 0.3027065527065527, + "grad_norm": 0.552746593952179, + "learning_rate": 0.0003900562657949585, + "loss": 0.6304, + "step": 13600 + }, + { + "epoch": 0.30292913105413105, + "grad_norm": 0.6738058924674988, + "learning_rate": 0.00039004174280901714, + "loss": 0.6347, + "step": 13610 + }, + { + "epoch": 0.3031517094017094, + "grad_norm": 0.8077411651611328, + "learning_rate": 0.0003900272094960519, + "loss": 0.7001, + "step": 13620 + }, + { + "epoch": 0.30337428774928776, + "grad_norm": 0.806163489818573, + "learning_rate": 0.00039001266585685253, + "loss": 0.5889, + "step": 13630 + }, + { + "epoch": 0.3035968660968661, + "grad_norm": 0.68730628490448, + "learning_rate": 0.0003899981118922094, + "loss": 0.6084, + "step": 13640 + }, + { + "epoch": 0.3038194444444444, + "grad_norm": 0.7665461301803589, + "learning_rate": 0.0003899835476029133, + "loss": 0.5428, + "step": 13650 + }, + { + "epoch": 0.3040420227920228, + "grad_norm": 0.9742176532745361, + "learning_rate": 0.0003899689729897557, + "loss": 0.6455, + "step": 13660 + }, + { + "epoch": 0.30426460113960113, + "grad_norm": 0.6864128112792969, + "learning_rate": 0.0003899543880535286, + "loss": 0.6755, + "step": 13670 + }, + { + "epoch": 0.30448717948717946, + "grad_norm": 0.8203412890434265, + "learning_rate": 0.0003899397927950245, + "loss": 0.6731, + "step": 13680 + }, + { + "epoch": 0.30470975783475784, + "grad_norm": 0.6704636216163635, + "learning_rate": 0.0003899251872150366, + "loss": 0.5118, + "step": 13690 + }, + { + "epoch": 0.30493233618233617, + "grad_norm": 0.6445683836936951, + "learning_rate": 0.0003899105713143586, + "loss": 0.5113, + "step": 13700 + }, + { + "epoch": 0.30515491452991456, + "grad_norm": 0.7139549851417542, + "learning_rate": 0.0003898959450937846, + "loss": 0.5594, + "step": 13710 + }, + { + "epoch": 0.3053774928774929, + "grad_norm": 0.9933525323867798, + "learning_rate": 0.00038988130855410947, + "loss": 0.6033, + "step": 13720 + }, + { + "epoch": 0.3056000712250712, + "grad_norm": 0.6525797843933105, + "learning_rate": 0.0003898666616961287, + "loss": 0.6571, + "step": 13730 + }, + { + "epoch": 0.3058226495726496, + "grad_norm": 0.6067349314689636, + "learning_rate": 0.00038985200452063804, + "loss": 0.7637, + "step": 13740 + }, + { + "epoch": 0.3060452279202279, + "grad_norm": 0.7824680209159851, + "learning_rate": 0.00038983733702843405, + "loss": 0.5977, + "step": 13750 + }, + { + "epoch": 0.30626780626780625, + "grad_norm": 0.3695496618747711, + "learning_rate": 0.00038982265922031376, + "loss": 0.4772, + "step": 13760 + }, + { + "epoch": 0.30649038461538464, + "grad_norm": 0.5000861883163452, + "learning_rate": 0.00038980797109707476, + "loss": 0.6247, + "step": 13770 + }, + { + "epoch": 0.30671296296296297, + "grad_norm": 0.9075541496276855, + "learning_rate": 0.0003897932726595152, + "loss": 0.6185, + "step": 13780 + }, + { + "epoch": 0.3069355413105413, + "grad_norm": 0.8326929807662964, + "learning_rate": 0.00038977856390843386, + "loss": 0.6476, + "step": 13790 + }, + { + "epoch": 0.3071581196581197, + "grad_norm": 0.47030866146087646, + "learning_rate": 0.00038976384484462997, + "loss": 0.6748, + "step": 13800 + }, + { + "epoch": 0.307380698005698, + "grad_norm": 0.5289137363433838, + "learning_rate": 0.0003897491154689034, + "loss": 0.5827, + "step": 13810 + }, + { + "epoch": 0.30760327635327633, + "grad_norm": 0.6581617593765259, + "learning_rate": 0.00038973437578205465, + "loss": 0.6515, + "step": 13820 + }, + { + "epoch": 0.3078258547008547, + "grad_norm": 0.4768698513507843, + "learning_rate": 0.0003897196257848845, + "loss": 0.6406, + "step": 13830 + }, + { + "epoch": 0.30804843304843305, + "grad_norm": 0.5576051473617554, + "learning_rate": 0.0003897048654781946, + "loss": 0.5606, + "step": 13840 + }, + { + "epoch": 0.3082710113960114, + "grad_norm": 0.6507335901260376, + "learning_rate": 0.00038969009486278705, + "loss": 0.6384, + "step": 13850 + }, + { + "epoch": 0.30849358974358976, + "grad_norm": 1.2971117496490479, + "learning_rate": 0.0003896753139394644, + "loss": 0.6221, + "step": 13860 + }, + { + "epoch": 0.3087161680911681, + "grad_norm": 0.5396764874458313, + "learning_rate": 0.00038966052270902993, + "loss": 0.6386, + "step": 13870 + }, + { + "epoch": 0.3089387464387464, + "grad_norm": 0.5200977325439453, + "learning_rate": 0.0003896457211722874, + "loss": 0.6257, + "step": 13880 + }, + { + "epoch": 0.3091613247863248, + "grad_norm": 0.4826856851577759, + "learning_rate": 0.00038963090933004114, + "loss": 0.6103, + "step": 13890 + }, + { + "epoch": 0.3093839031339031, + "grad_norm": 0.6851069927215576, + "learning_rate": 0.000389616087183096, + "loss": 0.5623, + "step": 13900 + }, + { + "epoch": 0.30960648148148145, + "grad_norm": 0.5373729467391968, + "learning_rate": 0.0003896012547322575, + "loss": 0.6298, + "step": 13910 + }, + { + "epoch": 0.30982905982905984, + "grad_norm": 0.4789588153362274, + "learning_rate": 0.00038958641197833153, + "loss": 0.4662, + "step": 13920 + }, + { + "epoch": 0.31005163817663817, + "grad_norm": 0.6719768047332764, + "learning_rate": 0.0003895715589221248, + "loss": 0.7211, + "step": 13930 + }, + { + "epoch": 0.31027421652421655, + "grad_norm": 0.9529721736907959, + "learning_rate": 0.00038955669556444436, + "loss": 0.655, + "step": 13940 + }, + { + "epoch": 0.3104967948717949, + "grad_norm": 0.6303074359893799, + "learning_rate": 0.00038954182190609784, + "loss": 0.5846, + "step": 13950 + }, + { + "epoch": 0.3107193732193732, + "grad_norm": 0.8553338050842285, + "learning_rate": 0.0003895269379478936, + "loss": 0.7021, + "step": 13960 + }, + { + "epoch": 0.3109419515669516, + "grad_norm": 0.8133220672607422, + "learning_rate": 0.00038951204369064047, + "loss": 0.6132, + "step": 13970 + }, + { + "epoch": 0.3111645299145299, + "grad_norm": 0.6394578218460083, + "learning_rate": 0.00038949713913514767, + "loss": 0.6557, + "step": 13980 + }, + { + "epoch": 0.31138710826210825, + "grad_norm": 0.5929552912712097, + "learning_rate": 0.0003894822242822252, + "loss": 0.5875, + "step": 13990 + }, + { + "epoch": 0.31160968660968663, + "grad_norm": 0.5577292442321777, + "learning_rate": 0.0003894672991326835, + "loss": 0.5826, + "step": 14000 + }, + { + "epoch": 0.31183226495726496, + "grad_norm": 0.6399706602096558, + "learning_rate": 0.00038945236368733376, + "loss": 0.7175, + "step": 14010 + }, + { + "epoch": 0.3120548433048433, + "grad_norm": 0.5782634019851685, + "learning_rate": 0.0003894374179469874, + "loss": 0.6909, + "step": 14020 + }, + { + "epoch": 0.31227742165242167, + "grad_norm": 0.49260640144348145, + "learning_rate": 0.00038942246191245674, + "loss": 0.6483, + "step": 14030 + }, + { + "epoch": 0.3125, + "grad_norm": 0.8788143396377563, + "learning_rate": 0.0003894074955845544, + "loss": 0.7238, + "step": 14040 + }, + { + "epoch": 0.31272257834757833, + "grad_norm": 0.7116497755050659, + "learning_rate": 0.0003893925189640936, + "loss": 0.6048, + "step": 14050 + }, + { + "epoch": 0.3129451566951567, + "grad_norm": 0.6584917306900024, + "learning_rate": 0.00038937753205188844, + "loss": 0.5464, + "step": 14060 + }, + { + "epoch": 0.31316773504273504, + "grad_norm": 0.7588227391242981, + "learning_rate": 0.00038936253484875304, + "loss": 0.6912, + "step": 14070 + }, + { + "epoch": 0.31339031339031337, + "grad_norm": 0.8676776885986328, + "learning_rate": 0.00038934752735550254, + "loss": 0.7545, + "step": 14080 + }, + { + "epoch": 0.31361289173789175, + "grad_norm": 0.4870927631855011, + "learning_rate": 0.0003893325095729524, + "loss": 0.8075, + "step": 14090 + }, + { + "epoch": 0.3138354700854701, + "grad_norm": 0.5967193841934204, + "learning_rate": 0.0003893174815019186, + "loss": 0.6192, + "step": 14100 + }, + { + "epoch": 0.3140580484330484, + "grad_norm": 0.5258004665374756, + "learning_rate": 0.00038930244314321793, + "loss": 0.611, + "step": 14110 + }, + { + "epoch": 0.3142806267806268, + "grad_norm": 0.7101635932922363, + "learning_rate": 0.00038928739449766754, + "loss": 0.7051, + "step": 14120 + }, + { + "epoch": 0.3145032051282051, + "grad_norm": 0.6969490647315979, + "learning_rate": 0.0003892723355660852, + "loss": 0.5837, + "step": 14130 + }, + { + "epoch": 0.31472578347578345, + "grad_norm": 0.8672996759414673, + "learning_rate": 0.0003892572663492892, + "loss": 0.7978, + "step": 14140 + }, + { + "epoch": 0.31494836182336183, + "grad_norm": 0.6507262587547302, + "learning_rate": 0.0003892421868480984, + "loss": 0.4929, + "step": 14150 + }, + { + "epoch": 0.31517094017094016, + "grad_norm": 0.6931272745132446, + "learning_rate": 0.0003892270970633322, + "loss": 0.7297, + "step": 14160 + }, + { + "epoch": 0.31539351851851855, + "grad_norm": 0.6623298525810242, + "learning_rate": 0.0003892119969958107, + "loss": 0.5653, + "step": 14170 + }, + { + "epoch": 0.3156160968660969, + "grad_norm": 0.7796519994735718, + "learning_rate": 0.00038919688664635434, + "loss": 0.5297, + "step": 14180 + }, + { + "epoch": 0.3158386752136752, + "grad_norm": 0.8253905177116394, + "learning_rate": 0.0003891817660157843, + "loss": 0.6523, + "step": 14190 + }, + { + "epoch": 0.3160612535612536, + "grad_norm": 0.7216755747795105, + "learning_rate": 0.0003891666351049222, + "loss": 0.6081, + "step": 14200 + }, + { + "epoch": 0.3162838319088319, + "grad_norm": 0.788808286190033, + "learning_rate": 0.00038915149391459034, + "loss": 0.7106, + "step": 14210 + }, + { + "epoch": 0.31650641025641024, + "grad_norm": 0.9707726240158081, + "learning_rate": 0.0003891363424456114, + "loss": 0.6154, + "step": 14220 + }, + { + "epoch": 0.3167289886039886, + "grad_norm": 0.6529638767242432, + "learning_rate": 0.0003891211806988088, + "loss": 0.6526, + "step": 14230 + }, + { + "epoch": 0.31695156695156695, + "grad_norm": 0.6194156408309937, + "learning_rate": 0.0003891060086750064, + "loss": 0.6328, + "step": 14240 + }, + { + "epoch": 0.3171741452991453, + "grad_norm": 0.4574219882488251, + "learning_rate": 0.0003890908263750287, + "loss": 0.4735, + "step": 14250 + }, + { + "epoch": 0.31739672364672367, + "grad_norm": 0.4415189325809479, + "learning_rate": 0.0003890756337997007, + "loss": 0.6351, + "step": 14260 + }, + { + "epoch": 0.317619301994302, + "grad_norm": 0.8414084911346436, + "learning_rate": 0.00038906043094984796, + "loss": 0.7368, + "step": 14270 + }, + { + "epoch": 0.3178418803418803, + "grad_norm": 0.8305835127830505, + "learning_rate": 0.00038904521782629663, + "loss": 0.5888, + "step": 14280 + }, + { + "epoch": 0.3180644586894587, + "grad_norm": 0.43939080834388733, + "learning_rate": 0.0003890299944298734, + "loss": 0.6886, + "step": 14290 + }, + { + "epoch": 0.31828703703703703, + "grad_norm": 0.6629655361175537, + "learning_rate": 0.0003890147607614056, + "loss": 0.6991, + "step": 14300 + }, + { + "epoch": 0.31850961538461536, + "grad_norm": 0.549863874912262, + "learning_rate": 0.00038899951682172084, + "loss": 0.6437, + "step": 14310 + }, + { + "epoch": 0.31873219373219375, + "grad_norm": 0.6703042387962341, + "learning_rate": 0.0003889842626116477, + "loss": 0.6647, + "step": 14320 + }, + { + "epoch": 0.3189547720797721, + "grad_norm": 0.7996009588241577, + "learning_rate": 0.000388968998132015, + "loss": 0.478, + "step": 14330 + }, + { + "epoch": 0.3191773504273504, + "grad_norm": 0.6925052404403687, + "learning_rate": 0.0003889537233836523, + "loss": 0.777, + "step": 14340 + }, + { + "epoch": 0.3193999287749288, + "grad_norm": 0.7804181575775146, + "learning_rate": 0.00038893843836738945, + "loss": 0.6204, + "step": 14350 + }, + { + "epoch": 0.3196225071225071, + "grad_norm": 0.677384078502655, + "learning_rate": 0.0003889231430840573, + "loss": 0.5158, + "step": 14360 + }, + { + "epoch": 0.31984508547008544, + "grad_norm": 0.7306846976280212, + "learning_rate": 0.00038890783753448683, + "loss": 0.5729, + "step": 14370 + }, + { + "epoch": 0.32006766381766383, + "grad_norm": 0.7445614337921143, + "learning_rate": 0.0003888925217195099, + "loss": 0.7354, + "step": 14380 + }, + { + "epoch": 0.32029024216524216, + "grad_norm": 0.6261416077613831, + "learning_rate": 0.0003888771956399586, + "loss": 0.6795, + "step": 14390 + }, + { + "epoch": 0.32051282051282054, + "grad_norm": 0.8659251928329468, + "learning_rate": 0.00038886185929666594, + "loss": 0.6217, + "step": 14400 + }, + { + "epoch": 0.32073539886039887, + "grad_norm": 0.5013568997383118, + "learning_rate": 0.00038884651269046526, + "loss": 0.5297, + "step": 14410 + }, + { + "epoch": 0.3209579772079772, + "grad_norm": 0.5654299855232239, + "learning_rate": 0.00038883115582219046, + "loss": 0.6419, + "step": 14420 + }, + { + "epoch": 0.3211805555555556, + "grad_norm": 0.9625751376152039, + "learning_rate": 0.000388815788692676, + "loss": 0.497, + "step": 14430 + }, + { + "epoch": 0.3214031339031339, + "grad_norm": 0.6212838292121887, + "learning_rate": 0.00038880041130275706, + "loss": 0.6888, + "step": 14440 + }, + { + "epoch": 0.32162571225071224, + "grad_norm": 0.7968057990074158, + "learning_rate": 0.00038878502365326923, + "loss": 0.6761, + "step": 14450 + }, + { + "epoch": 0.3218482905982906, + "grad_norm": 0.7091825008392334, + "learning_rate": 0.0003887696257450486, + "loss": 0.5637, + "step": 14460 + }, + { + "epoch": 0.32207086894586895, + "grad_norm": 0.9699985384941101, + "learning_rate": 0.00038875421757893203, + "loss": 0.6297, + "step": 14470 + }, + { + "epoch": 0.3222934472934473, + "grad_norm": 0.7332914471626282, + "learning_rate": 0.0003887387991557568, + "loss": 0.6152, + "step": 14480 + }, + { + "epoch": 0.32251602564102566, + "grad_norm": 0.5954024791717529, + "learning_rate": 0.0003887233704763606, + "loss": 0.5747, + "step": 14490 + }, + { + "epoch": 0.322738603988604, + "grad_norm": 0.906563937664032, + "learning_rate": 0.00038870793154158206, + "loss": 0.5849, + "step": 14500 + }, + { + "epoch": 0.3229611823361823, + "grad_norm": 0.5665345788002014, + "learning_rate": 0.00038869248235226, + "loss": 0.5829, + "step": 14510 + }, + { + "epoch": 0.3231837606837607, + "grad_norm": 0.722339928150177, + "learning_rate": 0.00038867702290923395, + "loss": 0.5569, + "step": 14520 + }, + { + "epoch": 0.32340633903133903, + "grad_norm": 0.6280966997146606, + "learning_rate": 0.00038866155321334406, + "loss": 0.5243, + "step": 14530 + }, + { + "epoch": 0.32362891737891736, + "grad_norm": 0.5326039791107178, + "learning_rate": 0.00038864607326543086, + "loss": 0.7024, + "step": 14540 + }, + { + "epoch": 0.32385149572649574, + "grad_norm": 0.748881995677948, + "learning_rate": 0.00038863058306633566, + "loss": 0.6266, + "step": 14550 + }, + { + "epoch": 0.32407407407407407, + "grad_norm": 0.7223565578460693, + "learning_rate": 0.00038861508261690017, + "loss": 0.6603, + "step": 14560 + }, + { + "epoch": 0.3242966524216524, + "grad_norm": 0.7084729671478271, + "learning_rate": 0.00038859957191796665, + "loss": 0.7027, + "step": 14570 + }, + { + "epoch": 0.3245192307692308, + "grad_norm": 0.6114988923072815, + "learning_rate": 0.00038858405097037796, + "loss": 0.5891, + "step": 14580 + }, + { + "epoch": 0.3247418091168091, + "grad_norm": 0.9163961410522461, + "learning_rate": 0.0003885685197749776, + "loss": 0.6098, + "step": 14590 + }, + { + "epoch": 0.32496438746438744, + "grad_norm": 1.123960256576538, + "learning_rate": 0.00038855297833260955, + "loss": 0.6258, + "step": 14600 + }, + { + "epoch": 0.3251869658119658, + "grad_norm": 0.7056410908699036, + "learning_rate": 0.00038853742664411825, + "loss": 0.7209, + "step": 14610 + }, + { + "epoch": 0.32540954415954415, + "grad_norm": 0.49438974261283875, + "learning_rate": 0.00038852186471034886, + "loss": 0.5346, + "step": 14620 + }, + { + "epoch": 0.32563212250712253, + "grad_norm": 0.4864642918109894, + "learning_rate": 0.000388506292532147, + "loss": 0.6908, + "step": 14630 + }, + { + "epoch": 0.32585470085470086, + "grad_norm": 0.7527804374694824, + "learning_rate": 0.0003884907101103589, + "loss": 0.5082, + "step": 14640 + }, + { + "epoch": 0.3260772792022792, + "grad_norm": 0.8456656336784363, + "learning_rate": 0.00038847511744583127, + "loss": 0.7523, + "step": 14650 + }, + { + "epoch": 0.3262998575498576, + "grad_norm": 0.5107564330101013, + "learning_rate": 0.0003884595145394115, + "loss": 0.47, + "step": 14660 + }, + { + "epoch": 0.3265224358974359, + "grad_norm": 0.5012245774269104, + "learning_rate": 0.0003884439013919474, + "loss": 0.6565, + "step": 14670 + }, + { + "epoch": 0.32674501424501423, + "grad_norm": 0.7485440373420715, + "learning_rate": 0.00038842827800428747, + "loss": 0.67, + "step": 14680 + }, + { + "epoch": 0.3269675925925926, + "grad_norm": 0.7245430946350098, + "learning_rate": 0.00038841264437728057, + "loss": 0.6425, + "step": 14690 + }, + { + "epoch": 0.32719017094017094, + "grad_norm": 0.4616965353488922, + "learning_rate": 0.00038839700051177645, + "loss": 0.6578, + "step": 14700 + }, + { + "epoch": 0.32741274928774927, + "grad_norm": 0.8992801308631897, + "learning_rate": 0.00038838134640862507, + "loss": 0.6097, + "step": 14710 + }, + { + "epoch": 0.32763532763532766, + "grad_norm": 0.7011298537254333, + "learning_rate": 0.00038836568206867704, + "loss": 0.6629, + "step": 14720 + }, + { + "epoch": 0.327857905982906, + "grad_norm": 0.5743442177772522, + "learning_rate": 0.0003883500074927837, + "loss": 0.6509, + "step": 14730 + }, + { + "epoch": 0.3280804843304843, + "grad_norm": 0.6825052499771118, + "learning_rate": 0.0003883343226817967, + "loss": 0.5321, + "step": 14740 + }, + { + "epoch": 0.3283030626780627, + "grad_norm": 0.8856199979782104, + "learning_rate": 0.00038831862763656855, + "loss": 0.7376, + "step": 14750 + }, + { + "epoch": 0.328525641025641, + "grad_norm": 0.6327480673789978, + "learning_rate": 0.00038830292235795193, + "loss": 0.6739, + "step": 14760 + }, + { + "epoch": 0.32874821937321935, + "grad_norm": 0.6097829341888428, + "learning_rate": 0.00038828720684680034, + "loss": 0.6656, + "step": 14770 + }, + { + "epoch": 0.32897079772079774, + "grad_norm": 0.686923623085022, + "learning_rate": 0.00038827148110396785, + "loss": 0.6197, + "step": 14780 + }, + { + "epoch": 0.32919337606837606, + "grad_norm": 0.5756046175956726, + "learning_rate": 0.00038825574513030886, + "loss": 0.5645, + "step": 14790 + }, + { + "epoch": 0.3294159544159544, + "grad_norm": 0.5095319747924805, + "learning_rate": 0.00038823999892667865, + "loss": 0.6822, + "step": 14800 + }, + { + "epoch": 0.3296385327635328, + "grad_norm": 0.9366177916526794, + "learning_rate": 0.0003882242424939327, + "loss": 0.5301, + "step": 14810 + }, + { + "epoch": 0.3298611111111111, + "grad_norm": 0.7820391654968262, + "learning_rate": 0.00038820847583292744, + "loss": 0.6568, + "step": 14820 + }, + { + "epoch": 0.33008368945868943, + "grad_norm": 0.6345818638801575, + "learning_rate": 0.0003881926989445195, + "loss": 0.6711, + "step": 14830 + }, + { + "epoch": 0.3303062678062678, + "grad_norm": 0.6514973044395447, + "learning_rate": 0.00038817691182956623, + "loss": 0.8656, + "step": 14840 + }, + { + "epoch": 0.33052884615384615, + "grad_norm": 0.7673830389976501, + "learning_rate": 0.0003881611144889255, + "loss": 0.558, + "step": 14850 + }, + { + "epoch": 0.33075142450142453, + "grad_norm": 0.5530751943588257, + "learning_rate": 0.0003881453069234558, + "loss": 0.6342, + "step": 14860 + }, + { + "epoch": 0.33097400284900286, + "grad_norm": 0.7152727842330933, + "learning_rate": 0.00038812948913401603, + "loss": 0.5806, + "step": 14870 + }, + { + "epoch": 0.3311965811965812, + "grad_norm": 0.7458562254905701, + "learning_rate": 0.00038811366112146594, + "loss": 0.6891, + "step": 14880 + }, + { + "epoch": 0.33141915954415957, + "grad_norm": 0.6589481830596924, + "learning_rate": 0.00038809782288666534, + "loss": 0.5577, + "step": 14890 + }, + { + "epoch": 0.3316417378917379, + "grad_norm": 0.6615011692047119, + "learning_rate": 0.0003880819744304752, + "loss": 0.7042, + "step": 14900 + }, + { + "epoch": 0.3318643162393162, + "grad_norm": 0.4032817482948303, + "learning_rate": 0.0003880661157537565, + "loss": 0.5792, + "step": 14910 + }, + { + "epoch": 0.3320868945868946, + "grad_norm": 0.9323031306266785, + "learning_rate": 0.0003880502468573712, + "loss": 0.7316, + "step": 14920 + }, + { + "epoch": 0.33230947293447294, + "grad_norm": 0.5589344501495361, + "learning_rate": 0.00038803436774218153, + "loss": 0.6257, + "step": 14930 + }, + { + "epoch": 0.33253205128205127, + "grad_norm": 0.5097547173500061, + "learning_rate": 0.00038801847840905034, + "loss": 0.5746, + "step": 14940 + }, + { + "epoch": 0.33275462962962965, + "grad_norm": 0.6569631695747375, + "learning_rate": 0.00038800257885884115, + "loss": 0.607, + "step": 14950 + }, + { + "epoch": 0.332977207977208, + "grad_norm": 0.6447013020515442, + "learning_rate": 0.0003879866690924179, + "loss": 0.7258, + "step": 14960 + }, + { + "epoch": 0.3331997863247863, + "grad_norm": 0.9661784768104553, + "learning_rate": 0.00038797074911064517, + "loss": 0.62, + "step": 14970 + }, + { + "epoch": 0.3334223646723647, + "grad_norm": 0.7204642295837402, + "learning_rate": 0.000387954818914388, + "loss": 0.7651, + "step": 14980 + }, + { + "epoch": 0.333644943019943, + "grad_norm": 1.1423128843307495, + "learning_rate": 0.0003879388785045122, + "loss": 0.6571, + "step": 14990 + }, + { + "epoch": 0.33386752136752135, + "grad_norm": 1.5268319845199585, + "learning_rate": 0.0003879229278818838, + "loss": 0.508, + "step": 15000 + }, + { + "epoch": 0.33409009971509973, + "grad_norm": 0.7197328805923462, + "learning_rate": 0.0003879069670473697, + "loss": 0.618, + "step": 15010 + }, + { + "epoch": 0.33431267806267806, + "grad_norm": 0.5955047607421875, + "learning_rate": 0.00038789099600183716, + "loss": 0.651, + "step": 15020 + }, + { + "epoch": 0.3345352564102564, + "grad_norm": 0.7561123967170715, + "learning_rate": 0.0003878750147461541, + "loss": 0.5922, + "step": 15030 + }, + { + "epoch": 0.33475783475783477, + "grad_norm": 0.5777478218078613, + "learning_rate": 0.00038785902328118894, + "loss": 0.7357, + "step": 15040 + }, + { + "epoch": 0.3349804131054131, + "grad_norm": 1.1352577209472656, + "learning_rate": 0.00038784302160781067, + "loss": 0.6401, + "step": 15050 + }, + { + "epoch": 0.33520299145299143, + "grad_norm": 0.7301923036575317, + "learning_rate": 0.0003878270097268888, + "loss": 0.7025, + "step": 15060 + }, + { + "epoch": 0.3354255698005698, + "grad_norm": 1.0378355979919434, + "learning_rate": 0.0003878109876392935, + "loss": 0.5704, + "step": 15070 + }, + { + "epoch": 0.33564814814814814, + "grad_norm": 0.5009276866912842, + "learning_rate": 0.00038779495534589534, + "loss": 0.5682, + "step": 15080 + }, + { + "epoch": 0.33587072649572647, + "grad_norm": 0.636855959892273, + "learning_rate": 0.0003877789128475656, + "loss": 0.6932, + "step": 15090 + }, + { + "epoch": 0.33609330484330485, + "grad_norm": 0.9682590365409851, + "learning_rate": 0.000387762860145176, + "loss": 0.6237, + "step": 15100 + }, + { + "epoch": 0.3363158831908832, + "grad_norm": 1.0007511377334595, + "learning_rate": 0.0003877467972395989, + "loss": 0.5892, + "step": 15110 + }, + { + "epoch": 0.33653846153846156, + "grad_norm": 0.6202926635742188, + "learning_rate": 0.00038773072413170716, + "loss": 0.5913, + "step": 15120 + }, + { + "epoch": 0.3367610398860399, + "grad_norm": 0.56829434633255, + "learning_rate": 0.0003877146408223741, + "loss": 0.6632, + "step": 15130 + }, + { + "epoch": 0.3369836182336182, + "grad_norm": 0.8759254813194275, + "learning_rate": 0.00038769854731247384, + "loss": 0.6321, + "step": 15140 + }, + { + "epoch": 0.3372061965811966, + "grad_norm": 0.6968265175819397, + "learning_rate": 0.00038768244360288086, + "loss": 0.6905, + "step": 15150 + }, + { + "epoch": 0.33742877492877493, + "grad_norm": 0.8931856751441956, + "learning_rate": 0.00038766632969447024, + "loss": 0.6507, + "step": 15160 + }, + { + "epoch": 0.33765135327635326, + "grad_norm": 0.6409062743186951, + "learning_rate": 0.0003876502055881177, + "loss": 0.6041, + "step": 15170 + }, + { + "epoch": 0.33787393162393164, + "grad_norm": 0.6757847666740417, + "learning_rate": 0.0003876340712846992, + "loss": 0.5633, + "step": 15180 + }, + { + "epoch": 0.33809650997151, + "grad_norm": 0.6828376650810242, + "learning_rate": 0.0003876179267850918, + "loss": 0.6538, + "step": 15190 + }, + { + "epoch": 0.3383190883190883, + "grad_norm": 1.2060045003890991, + "learning_rate": 0.00038760177209017267, + "loss": 0.6757, + "step": 15200 + }, + { + "epoch": 0.3385416666666667, + "grad_norm": 0.654515266418457, + "learning_rate": 0.0003875856072008196, + "loss": 0.6166, + "step": 15210 + }, + { + "epoch": 0.338764245014245, + "grad_norm": 0.6422186493873596, + "learning_rate": 0.0003875694321179111, + "loss": 0.7331, + "step": 15220 + }, + { + "epoch": 0.33898682336182334, + "grad_norm": 0.8152149319648743, + "learning_rate": 0.00038755324684232616, + "loss": 0.6558, + "step": 15230 + }, + { + "epoch": 0.3392094017094017, + "grad_norm": 0.8730194568634033, + "learning_rate": 0.00038753705137494415, + "loss": 0.6754, + "step": 15240 + }, + { + "epoch": 0.33943198005698005, + "grad_norm": 0.6081664562225342, + "learning_rate": 0.0003875208457166453, + "loss": 0.659, + "step": 15250 + }, + { + "epoch": 0.3396545584045584, + "grad_norm": 0.4139869213104248, + "learning_rate": 0.0003875046298683102, + "loss": 0.5921, + "step": 15260 + }, + { + "epoch": 0.33987713675213677, + "grad_norm": 0.621178925037384, + "learning_rate": 0.00038748840383082004, + "loss": 0.522, + "step": 15270 + }, + { + "epoch": 0.3400997150997151, + "grad_norm": 0.8623523712158203, + "learning_rate": 0.00038747216760505644, + "loss": 0.7395, + "step": 15280 + }, + { + "epoch": 0.3403222934472934, + "grad_norm": 0.557036280632019, + "learning_rate": 0.0003874559211919018, + "loss": 0.608, + "step": 15290 + }, + { + "epoch": 0.3405448717948718, + "grad_norm": 0.3644825220108032, + "learning_rate": 0.00038743966459223894, + "loss": 0.5917, + "step": 15300 + }, + { + "epoch": 0.34076745014245013, + "grad_norm": 0.678953230381012, + "learning_rate": 0.0003874233978069513, + "loss": 0.6087, + "step": 15310 + }, + { + "epoch": 0.34099002849002846, + "grad_norm": 0.5321779251098633, + "learning_rate": 0.0003874071208369228, + "loss": 0.569, + "step": 15320 + }, + { + "epoch": 0.34121260683760685, + "grad_norm": 0.6926671862602234, + "learning_rate": 0.0003873908336830379, + "loss": 0.6858, + "step": 15330 + }, + { + "epoch": 0.3414351851851852, + "grad_norm": 0.8424744606018066, + "learning_rate": 0.0003873745363461817, + "loss": 0.6813, + "step": 15340 + }, + { + "epoch": 0.34165776353276356, + "grad_norm": 0.6209324598312378, + "learning_rate": 0.0003873582288272398, + "loss": 0.7667, + "step": 15350 + }, + { + "epoch": 0.3418803418803419, + "grad_norm": 0.5841759443283081, + "learning_rate": 0.0003873419111270984, + "loss": 0.4966, + "step": 15360 + }, + { + "epoch": 0.3421029202279202, + "grad_norm": 0.6942004561424255, + "learning_rate": 0.0003873255832466442, + "loss": 0.5913, + "step": 15370 + }, + { + "epoch": 0.3423254985754986, + "grad_norm": 0.8919575810432434, + "learning_rate": 0.00038730924518676435, + "loss": 0.6681, + "step": 15380 + }, + { + "epoch": 0.3425480769230769, + "grad_norm": 0.6405223608016968, + "learning_rate": 0.0003872928969483469, + "loss": 0.5544, + "step": 15390 + }, + { + "epoch": 0.34277065527065526, + "grad_norm": 0.63236004114151, + "learning_rate": 0.00038727653853228, + "loss": 0.6711, + "step": 15400 + }, + { + "epoch": 0.34299323361823364, + "grad_norm": 0.6226939558982849, + "learning_rate": 0.00038726016993945276, + "loss": 0.6142, + "step": 15410 + }, + { + "epoch": 0.34321581196581197, + "grad_norm": 0.7648651599884033, + "learning_rate": 0.00038724379117075457, + "loss": 0.7771, + "step": 15420 + }, + { + "epoch": 0.3434383903133903, + "grad_norm": 0.7330575585365295, + "learning_rate": 0.00038722740222707546, + "loss": 0.704, + "step": 15430 + }, + { + "epoch": 0.3436609686609687, + "grad_norm": 0.6973746418952942, + "learning_rate": 0.00038721100310930604, + "loss": 0.7464, + "step": 15440 + }, + { + "epoch": 0.343883547008547, + "grad_norm": 0.851994514465332, + "learning_rate": 0.0003871945938183375, + "loss": 0.5979, + "step": 15450 + }, + { + "epoch": 0.34410612535612534, + "grad_norm": 0.5129497051239014, + "learning_rate": 0.00038717817435506143, + "loss": 0.5974, + "step": 15460 + }, + { + "epoch": 0.3443287037037037, + "grad_norm": 0.4729333817958832, + "learning_rate": 0.00038716174472037013, + "loss": 0.6519, + "step": 15470 + }, + { + "epoch": 0.34455128205128205, + "grad_norm": 0.6375283598899841, + "learning_rate": 0.00038714530491515647, + "loss": 0.6194, + "step": 15480 + }, + { + "epoch": 0.3447738603988604, + "grad_norm": 0.753584623336792, + "learning_rate": 0.0003871288549403137, + "loss": 0.6828, + "step": 15490 + }, + { + "epoch": 0.34499643874643876, + "grad_norm": 0.5335111618041992, + "learning_rate": 0.0003871123947967357, + "loss": 0.5159, + "step": 15500 + }, + { + "epoch": 0.3452190170940171, + "grad_norm": 0.9678267240524292, + "learning_rate": 0.0003870959244853171, + "loss": 0.6309, + "step": 15510 + }, + { + "epoch": 0.3454415954415954, + "grad_norm": 0.875391960144043, + "learning_rate": 0.0003870794440069527, + "loss": 0.5351, + "step": 15520 + }, + { + "epoch": 0.3456641737891738, + "grad_norm": 0.7199859619140625, + "learning_rate": 0.00038706295336253825, + "loss": 0.6512, + "step": 15530 + }, + { + "epoch": 0.34588675213675213, + "grad_norm": 0.48389071226119995, + "learning_rate": 0.00038704645255296976, + "loss": 0.7339, + "step": 15540 + }, + { + "epoch": 0.34610933048433046, + "grad_norm": 0.5464368462562561, + "learning_rate": 0.0003870299415791439, + "loss": 0.5017, + "step": 15550 + }, + { + "epoch": 0.34633190883190884, + "grad_norm": 1.0332673788070679, + "learning_rate": 0.00038701342044195785, + "loss": 0.6115, + "step": 15560 + }, + { + "epoch": 0.34655448717948717, + "grad_norm": 0.6519765257835388, + "learning_rate": 0.0003869968891423095, + "loss": 0.6015, + "step": 15570 + }, + { + "epoch": 0.34677706552706555, + "grad_norm": 0.49065589904785156, + "learning_rate": 0.0003869803476810971, + "loss": 0.6006, + "step": 15580 + }, + { + "epoch": 0.3469996438746439, + "grad_norm": 0.7999041080474854, + "learning_rate": 0.0003869637960592195, + "loss": 0.7736, + "step": 15590 + }, + { + "epoch": 0.3472222222222222, + "grad_norm": 0.7751318216323853, + "learning_rate": 0.00038694723427757624, + "loss": 0.5886, + "step": 15600 + }, + { + "epoch": 0.3474448005698006, + "grad_norm": 0.7226809859275818, + "learning_rate": 0.00038693066233706715, + "loss": 0.5635, + "step": 15610 + }, + { + "epoch": 0.3476673789173789, + "grad_norm": 0.5385544896125793, + "learning_rate": 0.0003869140802385929, + "loss": 0.6578, + "step": 15620 + }, + { + "epoch": 0.34788995726495725, + "grad_norm": 0.739736795425415, + "learning_rate": 0.0003868974879830545, + "loss": 0.7629, + "step": 15630 + }, + { + "epoch": 0.34811253561253563, + "grad_norm": 0.8486205339431763, + "learning_rate": 0.00038688088557135364, + "loss": 0.6528, + "step": 15640 + }, + { + "epoch": 0.34833511396011396, + "grad_norm": 0.9158767461776733, + "learning_rate": 0.00038686427300439237, + "loss": 0.5216, + "step": 15650 + }, + { + "epoch": 0.3485576923076923, + "grad_norm": 0.6496191620826721, + "learning_rate": 0.0003868476502830736, + "loss": 0.6958, + "step": 15660 + }, + { + "epoch": 0.3487802706552707, + "grad_norm": 0.6712784171104431, + "learning_rate": 0.0003868310174083005, + "loss": 0.6468, + "step": 15670 + }, + { + "epoch": 0.349002849002849, + "grad_norm": 0.4141985774040222, + "learning_rate": 0.00038681437438097704, + "loss": 0.6772, + "step": 15680 + }, + { + "epoch": 0.34922542735042733, + "grad_norm": 0.8436933159828186, + "learning_rate": 0.00038679772120200754, + "loss": 0.6587, + "step": 15690 + }, + { + "epoch": 0.3494480056980057, + "grad_norm": 0.6606414914131165, + "learning_rate": 0.0003867810578722969, + "loss": 0.6682, + "step": 15700 + }, + { + "epoch": 0.34967058404558404, + "grad_norm": 0.4795196056365967, + "learning_rate": 0.0003867643843927507, + "loss": 0.6216, + "step": 15710 + }, + { + "epoch": 0.34989316239316237, + "grad_norm": 0.5964969992637634, + "learning_rate": 0.0003867477007642749, + "loss": 0.6396, + "step": 15720 + }, + { + "epoch": 0.35011574074074076, + "grad_norm": 0.9071320295333862, + "learning_rate": 0.0003867310069877762, + "loss": 0.6362, + "step": 15730 + }, + { + "epoch": 0.3503383190883191, + "grad_norm": 0.9060355424880981, + "learning_rate": 0.00038671430306416174, + "loss": 0.5166, + "step": 15740 + }, + { + "epoch": 0.3505608974358974, + "grad_norm": 0.7958703637123108, + "learning_rate": 0.0003866975889943392, + "loss": 0.5176, + "step": 15750 + }, + { + "epoch": 0.3507834757834758, + "grad_norm": 0.5803673267364502, + "learning_rate": 0.00038668086477921685, + "loss": 0.5258, + "step": 15760 + }, + { + "epoch": 0.3510060541310541, + "grad_norm": 0.8940984606742859, + "learning_rate": 0.00038666413041970346, + "loss": 0.601, + "step": 15770 + }, + { + "epoch": 0.35122863247863245, + "grad_norm": 0.7736290693283081, + "learning_rate": 0.00038664738591670837, + "loss": 0.6799, + "step": 15780 + }, + { + "epoch": 0.35145121082621084, + "grad_norm": 0.3179571330547333, + "learning_rate": 0.0003866306312711416, + "loss": 0.6422, + "step": 15790 + }, + { + "epoch": 0.35167378917378916, + "grad_norm": 0.6942645311355591, + "learning_rate": 0.0003866138664839135, + "loss": 0.6703, + "step": 15800 + }, + { + "epoch": 0.35189636752136755, + "grad_norm": 0.63601154088974, + "learning_rate": 0.0003865970915559351, + "loss": 0.6482, + "step": 15810 + }, + { + "epoch": 0.3521189458689459, + "grad_norm": 0.981825590133667, + "learning_rate": 0.000386580306488118, + "loss": 0.499, + "step": 15820 + }, + { + "epoch": 0.3523415242165242, + "grad_norm": 0.836377739906311, + "learning_rate": 0.00038656351128137437, + "loss": 0.5932, + "step": 15830 + }, + { + "epoch": 0.3525641025641026, + "grad_norm": 0.9686137437820435, + "learning_rate": 0.0003865467059366168, + "loss": 0.7086, + "step": 15840 + }, + { + "epoch": 0.3527866809116809, + "grad_norm": 0.937958836555481, + "learning_rate": 0.00038652989045475847, + "loss": 0.7537, + "step": 15850 + }, + { + "epoch": 0.35300925925925924, + "grad_norm": 0.46349942684173584, + "learning_rate": 0.00038651306483671326, + "loss": 0.6199, + "step": 15860 + }, + { + "epoch": 0.35323183760683763, + "grad_norm": 0.7645864486694336, + "learning_rate": 0.0003864962290833953, + "loss": 0.6083, + "step": 15870 + }, + { + "epoch": 0.35345441595441596, + "grad_norm": 0.4245271384716034, + "learning_rate": 0.00038647938319571963, + "loss": 0.4999, + "step": 15880 + }, + { + "epoch": 0.3536769943019943, + "grad_norm": 0.6851386427879333, + "learning_rate": 0.0003864625271746017, + "loss": 0.6637, + "step": 15890 + }, + { + "epoch": 0.35389957264957267, + "grad_norm": 0.863572895526886, + "learning_rate": 0.0003864456610209573, + "loss": 0.6681, + "step": 15900 + }, + { + "epoch": 0.354122150997151, + "grad_norm": 0.5621462464332581, + "learning_rate": 0.0003864287847357031, + "loss": 0.5403, + "step": 15910 + }, + { + "epoch": 0.3543447293447293, + "grad_norm": 0.781972348690033, + "learning_rate": 0.00038641189831975606, + "loss": 0.6362, + "step": 15920 + }, + { + "epoch": 0.3545673076923077, + "grad_norm": 0.9945448040962219, + "learning_rate": 0.0003863950017740339, + "loss": 0.6942, + "step": 15930 + }, + { + "epoch": 0.35478988603988604, + "grad_norm": 0.8110677599906921, + "learning_rate": 0.0003863780950994548, + "loss": 0.5756, + "step": 15940 + }, + { + "epoch": 0.35501246438746437, + "grad_norm": 0.9088088274002075, + "learning_rate": 0.0003863611782969374, + "loss": 0.5669, + "step": 15950 + }, + { + "epoch": 0.35523504273504275, + "grad_norm": 0.9252178072929382, + "learning_rate": 0.00038634425136740096, + "loss": 0.6743, + "step": 15960 + }, + { + "epoch": 0.3554576210826211, + "grad_norm": 0.5928763747215271, + "learning_rate": 0.0003863273143117654, + "loss": 0.6595, + "step": 15970 + }, + { + "epoch": 0.3556801994301994, + "grad_norm": 0.5745807886123657, + "learning_rate": 0.0003863103671309511, + "loss": 0.5165, + "step": 15980 + }, + { + "epoch": 0.3559027777777778, + "grad_norm": 0.6771084070205688, + "learning_rate": 0.0003862934098258788, + "loss": 0.66, + "step": 15990 + }, + { + "epoch": 0.3561253561253561, + "grad_norm": 0.6055546402931213, + "learning_rate": 0.00038627644239747023, + "loss": 0.6146, + "step": 16000 + }, + { + "epoch": 0.35634793447293445, + "grad_norm": 0.7007344961166382, + "learning_rate": 0.0003862594648466472, + "loss": 0.658, + "step": 16010 + }, + { + "epoch": 0.35657051282051283, + "grad_norm": 0.6773852109909058, + "learning_rate": 0.0003862424771743324, + "loss": 0.716, + "step": 16020 + }, + { + "epoch": 0.35679309116809116, + "grad_norm": 0.45812416076660156, + "learning_rate": 0.0003862254793814489, + "loss": 0.5845, + "step": 16030 + }, + { + "epoch": 0.35701566951566954, + "grad_norm": 0.7533524632453918, + "learning_rate": 0.0003862084714689204, + "loss": 0.6995, + "step": 16040 + }, + { + "epoch": 0.35723824786324787, + "grad_norm": 0.5962110757827759, + "learning_rate": 0.0003861914534376712, + "loss": 0.5721, + "step": 16050 + }, + { + "epoch": 0.3574608262108262, + "grad_norm": 0.9739892482757568, + "learning_rate": 0.00038617442528862596, + "loss": 0.6319, + "step": 16060 + }, + { + "epoch": 0.3576834045584046, + "grad_norm": 0.9599641561508179, + "learning_rate": 0.00038615738702271003, + "loss": 0.6161, + "step": 16070 + }, + { + "epoch": 0.3579059829059829, + "grad_norm": 0.49321597814559937, + "learning_rate": 0.0003861403386408493, + "loss": 0.647, + "step": 16080 + }, + { + "epoch": 0.35812856125356124, + "grad_norm": 0.5280551910400391, + "learning_rate": 0.0003861232801439702, + "loss": 0.8232, + "step": 16090 + }, + { + "epoch": 0.3583511396011396, + "grad_norm": 0.6274698376655579, + "learning_rate": 0.0003861062115329996, + "loss": 0.8408, + "step": 16100 + }, + { + "epoch": 0.35857371794871795, + "grad_norm": 0.4998033046722412, + "learning_rate": 0.0003860891328088652, + "loss": 0.5723, + "step": 16110 + }, + { + "epoch": 0.3587962962962963, + "grad_norm": 0.791354775428772, + "learning_rate": 0.00038607204397249497, + "loss": 0.6366, + "step": 16120 + }, + { + "epoch": 0.35901887464387466, + "grad_norm": 0.6643574237823486, + "learning_rate": 0.0003860549450248175, + "loss": 0.5278, + "step": 16130 + }, + { + "epoch": 0.359241452991453, + "grad_norm": 0.7511951923370361, + "learning_rate": 0.000386037835966762, + "loss": 0.6013, + "step": 16140 + }, + { + "epoch": 0.3594640313390313, + "grad_norm": 0.589197039604187, + "learning_rate": 0.0003860207167992583, + "loss": 0.6842, + "step": 16150 + }, + { + "epoch": 0.3596866096866097, + "grad_norm": 0.5355151295661926, + "learning_rate": 0.0003860035875232365, + "loss": 0.5977, + "step": 16160 + }, + { + "epoch": 0.35990918803418803, + "grad_norm": 0.5727527737617493, + "learning_rate": 0.0003859864481396275, + "loss": 0.5406, + "step": 16170 + }, + { + "epoch": 0.36004273504273504, + "eval_loss": 0.6386870741844177, + "eval_runtime": 337.1238, + "eval_samples_per_second": 7.015, + "eval_steps_per_second": 7.015, + "step": 16176 + }, + { + "epoch": 0.36013176638176636, + "grad_norm": 0.5183542966842651, + "learning_rate": 0.0003859692986493626, + "loss": 0.4373, + "step": 16180 + }, + { + "epoch": 0.36035434472934474, + "grad_norm": 0.5630384683609009, + "learning_rate": 0.0003859521390533738, + "loss": 0.4529, + "step": 16190 + }, + { + "epoch": 0.3605769230769231, + "grad_norm": 0.8775503039360046, + "learning_rate": 0.0003859349693525935, + "loss": 0.6765, + "step": 16200 + }, + { + "epoch": 0.3607995014245014, + "grad_norm": 0.80485999584198, + "learning_rate": 0.0003859177895479549, + "loss": 0.5268, + "step": 16210 + }, + { + "epoch": 0.3610220797720798, + "grad_norm": 0.582216739654541, + "learning_rate": 0.00038590059964039127, + "loss": 0.6238, + "step": 16220 + }, + { + "epoch": 0.3612446581196581, + "grad_norm": 0.7229297757148743, + "learning_rate": 0.0003858833996308369, + "loss": 0.5674, + "step": 16230 + }, + { + "epoch": 0.36146723646723644, + "grad_norm": 0.5581183433532715, + "learning_rate": 0.00038586618952022645, + "loss": 0.72, + "step": 16240 + }, + { + "epoch": 0.3616898148148148, + "grad_norm": 0.8541936278343201, + "learning_rate": 0.0003858489693094951, + "loss": 0.6385, + "step": 16250 + }, + { + "epoch": 0.36191239316239315, + "grad_norm": 0.5558866262435913, + "learning_rate": 0.0003858317389995786, + "loss": 0.6244, + "step": 16260 + }, + { + "epoch": 0.36213497150997154, + "grad_norm": 0.44892260432243347, + "learning_rate": 0.0003858144985914133, + "loss": 0.6247, + "step": 16270 + }, + { + "epoch": 0.36235754985754987, + "grad_norm": 0.4403764009475708, + "learning_rate": 0.00038579724808593597, + "loss": 0.5703, + "step": 16280 + }, + { + "epoch": 0.3625801282051282, + "grad_norm": 0.7626928091049194, + "learning_rate": 0.0003857799874840842, + "loss": 0.4869, + "step": 16290 + }, + { + "epoch": 0.3628027065527066, + "grad_norm": 0.5833191871643066, + "learning_rate": 0.0003857627167867957, + "loss": 0.5903, + "step": 16300 + }, + { + "epoch": 0.3630252849002849, + "grad_norm": 0.9135043025016785, + "learning_rate": 0.00038574543599500914, + "loss": 0.6778, + "step": 16310 + }, + { + "epoch": 0.36324786324786323, + "grad_norm": 0.633033275604248, + "learning_rate": 0.00038572814510966355, + "loss": 0.659, + "step": 16320 + }, + { + "epoch": 0.3634704415954416, + "grad_norm": 0.5691318511962891, + "learning_rate": 0.00038571084413169845, + "loss": 0.5765, + "step": 16330 + }, + { + "epoch": 0.36369301994301995, + "grad_norm": 0.7053967118263245, + "learning_rate": 0.0003856935330620541, + "loss": 0.6832, + "step": 16340 + }, + { + "epoch": 0.3639155982905983, + "grad_norm": 1.0150195360183716, + "learning_rate": 0.0003856762119016711, + "loss": 0.7993, + "step": 16350 + }, + { + "epoch": 0.36413817663817666, + "grad_norm": 0.5497720837593079, + "learning_rate": 0.00038565888065149084, + "loss": 0.6516, + "step": 16360 + }, + { + "epoch": 0.364360754985755, + "grad_norm": 0.974551260471344, + "learning_rate": 0.00038564153931245493, + "loss": 0.6516, + "step": 16370 + }, + { + "epoch": 0.3645833333333333, + "grad_norm": 0.7135359644889832, + "learning_rate": 0.00038562418788550587, + "loss": 0.623, + "step": 16380 + }, + { + "epoch": 0.3648059116809117, + "grad_norm": 0.7332806587219238, + "learning_rate": 0.00038560682637158643, + "loss": 0.6314, + "step": 16390 + }, + { + "epoch": 0.36502849002849, + "grad_norm": 0.8631349802017212, + "learning_rate": 0.0003855894547716401, + "loss": 0.5178, + "step": 16400 + }, + { + "epoch": 0.36525106837606836, + "grad_norm": 0.52054363489151, + "learning_rate": 0.0003855720730866109, + "loss": 0.4755, + "step": 16410 + }, + { + "epoch": 0.36547364672364674, + "grad_norm": 0.552855908870697, + "learning_rate": 0.0003855546813174433, + "loss": 0.6811, + "step": 16420 + }, + { + "epoch": 0.36569622507122507, + "grad_norm": 0.5592019557952881, + "learning_rate": 0.00038553727946508246, + "loss": 0.6854, + "step": 16430 + }, + { + "epoch": 0.3659188034188034, + "grad_norm": 0.7347126007080078, + "learning_rate": 0.0003855198675304739, + "loss": 0.584, + "step": 16440 + }, + { + "epoch": 0.3661413817663818, + "grad_norm": 0.7042830586433411, + "learning_rate": 0.0003855024455145639, + "loss": 0.6909, + "step": 16450 + }, + { + "epoch": 0.3663639601139601, + "grad_norm": 0.6266321539878845, + "learning_rate": 0.0003854850134182991, + "loss": 0.7062, + "step": 16460 + }, + { + "epoch": 0.36658653846153844, + "grad_norm": 0.9455707669258118, + "learning_rate": 0.0003854675712426269, + "loss": 0.5297, + "step": 16470 + }, + { + "epoch": 0.3668091168091168, + "grad_norm": 0.6796744465827942, + "learning_rate": 0.000385450118988495, + "loss": 0.8696, + "step": 16480 + }, + { + "epoch": 0.36703169515669515, + "grad_norm": 0.5304951071739197, + "learning_rate": 0.0003854326566568519, + "loss": 0.6025, + "step": 16490 + }, + { + "epoch": 0.36725427350427353, + "grad_norm": 0.5772337913513184, + "learning_rate": 0.00038541518424864635, + "loss": 0.6433, + "step": 16500 + }, + { + "epoch": 0.36747685185185186, + "grad_norm": 0.7764016389846802, + "learning_rate": 0.0003853977017648279, + "loss": 0.6938, + "step": 16510 + }, + { + "epoch": 0.3676994301994302, + "grad_norm": 0.8687075972557068, + "learning_rate": 0.00038538020920634664, + "loss": 0.6411, + "step": 16520 + }, + { + "epoch": 0.36792200854700857, + "grad_norm": 0.6664961576461792, + "learning_rate": 0.00038536270657415296, + "loss": 0.7044, + "step": 16530 + }, + { + "epoch": 0.3681445868945869, + "grad_norm": 0.659508228302002, + "learning_rate": 0.0003853451938691981, + "loss": 0.5969, + "step": 16540 + }, + { + "epoch": 0.36836716524216523, + "grad_norm": 0.6755292415618896, + "learning_rate": 0.00038532767109243366, + "loss": 0.7851, + "step": 16550 + }, + { + "epoch": 0.3685897435897436, + "grad_norm": 0.5175288319587708, + "learning_rate": 0.0003853101382448119, + "loss": 0.6716, + "step": 16560 + }, + { + "epoch": 0.36881232193732194, + "grad_norm": 0.8062911033630371, + "learning_rate": 0.00038529259532728543, + "loss": 0.5692, + "step": 16570 + }, + { + "epoch": 0.36903490028490027, + "grad_norm": 0.9816310405731201, + "learning_rate": 0.00038527504234080775, + "loss": 0.7529, + "step": 16580 + }, + { + "epoch": 0.36925747863247865, + "grad_norm": 0.5312542915344238, + "learning_rate": 0.00038525747928633253, + "loss": 0.5502, + "step": 16590 + }, + { + "epoch": 0.369480056980057, + "grad_norm": 0.8313422203063965, + "learning_rate": 0.0003852399061648143, + "loss": 0.597, + "step": 16600 + }, + { + "epoch": 0.3697026353276353, + "grad_norm": 0.5209640860557556, + "learning_rate": 0.00038522232297720786, + "loss": 0.5609, + "step": 16610 + }, + { + "epoch": 0.3699252136752137, + "grad_norm": 0.6579368710517883, + "learning_rate": 0.0003852047297244687, + "loss": 0.6973, + "step": 16620 + }, + { + "epoch": 0.370147792022792, + "grad_norm": 0.7361441850662231, + "learning_rate": 0.00038518712640755304, + "loss": 0.6594, + "step": 16630 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.674850344657898, + "learning_rate": 0.00038516951302741735, + "loss": 0.6678, + "step": 16640 + }, + { + "epoch": 0.37059294871794873, + "grad_norm": 0.552830159664154, + "learning_rate": 0.0003851518895850186, + "loss": 0.5816, + "step": 16650 + }, + { + "epoch": 0.37081552706552706, + "grad_norm": 0.8027279376983643, + "learning_rate": 0.00038513425608131466, + "loss": 0.6624, + "step": 16660 + }, + { + "epoch": 0.3710381054131054, + "grad_norm": 0.8535835146903992, + "learning_rate": 0.0003851166125172637, + "loss": 0.5926, + "step": 16670 + }, + { + "epoch": 0.3712606837606838, + "grad_norm": 0.7689616084098816, + "learning_rate": 0.00038509895889382443, + "loss": 0.5824, + "step": 16680 + }, + { + "epoch": 0.3714832621082621, + "grad_norm": 0.7670106887817383, + "learning_rate": 0.00038508129521195623, + "loss": 0.7243, + "step": 16690 + }, + { + "epoch": 0.37170584045584043, + "grad_norm": 0.7338453531265259, + "learning_rate": 0.00038506362147261897, + "loss": 0.6473, + "step": 16700 + }, + { + "epoch": 0.3719284188034188, + "grad_norm": 0.7035108804702759, + "learning_rate": 0.000385045937676773, + "loss": 0.5963, + "step": 16710 + }, + { + "epoch": 0.37215099715099714, + "grad_norm": 0.5865857601165771, + "learning_rate": 0.00038502824382537925, + "loss": 0.618, + "step": 16720 + }, + { + "epoch": 0.37237357549857547, + "grad_norm": 0.7001277208328247, + "learning_rate": 0.00038501053991939926, + "loss": 0.6251, + "step": 16730 + }, + { + "epoch": 0.37259615384615385, + "grad_norm": 0.692415714263916, + "learning_rate": 0.00038499282595979515, + "loss": 0.5677, + "step": 16740 + }, + { + "epoch": 0.3728187321937322, + "grad_norm": 0.6563718318939209, + "learning_rate": 0.0003849751019475294, + "loss": 0.6252, + "step": 16750 + }, + { + "epoch": 0.37304131054131057, + "grad_norm": 0.7072324752807617, + "learning_rate": 0.00038495736788356514, + "loss": 0.6283, + "step": 16760 + }, + { + "epoch": 0.3732638888888889, + "grad_norm": 0.5536751747131348, + "learning_rate": 0.00038493962376886614, + "loss": 0.6762, + "step": 16770 + }, + { + "epoch": 0.3734864672364672, + "grad_norm": 0.707798421382904, + "learning_rate": 0.00038492186960439656, + "loss": 0.6905, + "step": 16780 + }, + { + "epoch": 0.3737090455840456, + "grad_norm": 1.0452958345413208, + "learning_rate": 0.0003849041053911212, + "loss": 0.7101, + "step": 16790 + }, + { + "epoch": 0.37393162393162394, + "grad_norm": 0.7269765138626099, + "learning_rate": 0.0003848863311300054, + "loss": 0.7, + "step": 16800 + }, + { + "epoch": 0.37415420227920226, + "grad_norm": 0.8381827473640442, + "learning_rate": 0.000384868546822015, + "loss": 0.719, + "step": 16810 + }, + { + "epoch": 0.37437678062678065, + "grad_norm": 0.590775191783905, + "learning_rate": 0.0003848507524681164, + "loss": 0.5596, + "step": 16820 + }, + { + "epoch": 0.374599358974359, + "grad_norm": 0.7995975017547607, + "learning_rate": 0.0003848329480692766, + "loss": 0.6334, + "step": 16830 + }, + { + "epoch": 0.3748219373219373, + "grad_norm": 0.8189190626144409, + "learning_rate": 0.00038481513362646313, + "loss": 0.5925, + "step": 16840 + }, + { + "epoch": 0.3750445156695157, + "grad_norm": 0.6057611107826233, + "learning_rate": 0.000384797309140644, + "loss": 0.6012, + "step": 16850 + }, + { + "epoch": 0.375267094017094, + "grad_norm": 0.7356190085411072, + "learning_rate": 0.0003847794746127878, + "loss": 0.6582, + "step": 16860 + }, + { + "epoch": 0.37548967236467234, + "grad_norm": 0.8917863368988037, + "learning_rate": 0.0003847616300438636, + "loss": 0.6011, + "step": 16870 + }, + { + "epoch": 0.37571225071225073, + "grad_norm": 0.6896722316741943, + "learning_rate": 0.0003847437754348413, + "loss": 0.6149, + "step": 16880 + }, + { + "epoch": 0.37593482905982906, + "grad_norm": 0.5577037930488586, + "learning_rate": 0.00038472591078669095, + "loss": 0.669, + "step": 16890 + }, + { + "epoch": 0.3761574074074074, + "grad_norm": 0.8162705898284912, + "learning_rate": 0.00038470803610038336, + "loss": 0.8111, + "step": 16900 + }, + { + "epoch": 0.37637998575498577, + "grad_norm": 0.8367506861686707, + "learning_rate": 0.0003846901513768899, + "loss": 0.7359, + "step": 16910 + }, + { + "epoch": 0.3766025641025641, + "grad_norm": 0.8381311893463135, + "learning_rate": 0.0003846722566171824, + "loss": 0.8893, + "step": 16920 + }, + { + "epoch": 0.3768251424501424, + "grad_norm": 0.8151772618293762, + "learning_rate": 0.00038465435182223335, + "loss": 0.6444, + "step": 16930 + }, + { + "epoch": 0.3770477207977208, + "grad_norm": 0.5317022204399109, + "learning_rate": 0.00038463643699301566, + "loss": 0.5619, + "step": 16940 + }, + { + "epoch": 0.37727029914529914, + "grad_norm": 0.39144349098205566, + "learning_rate": 0.00038461851213050276, + "loss": 0.5957, + "step": 16950 + }, + { + "epoch": 0.37749287749287747, + "grad_norm": 0.6841157078742981, + "learning_rate": 0.0003846005772356688, + "loss": 0.458, + "step": 16960 + }, + { + "epoch": 0.37771545584045585, + "grad_norm": 1.2825673818588257, + "learning_rate": 0.0003845826323094883, + "loss": 0.6741, + "step": 16970 + }, + { + "epoch": 0.3779380341880342, + "grad_norm": 0.7954182028770447, + "learning_rate": 0.00038456467735293654, + "loss": 0.751, + "step": 16980 + }, + { + "epoch": 0.37816061253561256, + "grad_norm": 0.6667285561561584, + "learning_rate": 0.00038454671236698917, + "loss": 0.6689, + "step": 16990 + }, + { + "epoch": 0.3783831908831909, + "grad_norm": 0.8058348894119263, + "learning_rate": 0.00038452873735262224, + "loss": 0.6378, + "step": 17000 + }, + { + "epoch": 0.3786057692307692, + "grad_norm": 0.79270339012146, + "learning_rate": 0.00038451075231081273, + "loss": 0.5072, + "step": 17010 + }, + { + "epoch": 0.3788283475783476, + "grad_norm": 0.931929886341095, + "learning_rate": 0.0003844927572425379, + "loss": 0.6612, + "step": 17020 + }, + { + "epoch": 0.37905092592592593, + "grad_norm": 0.7652475833892822, + "learning_rate": 0.0003844747521487756, + "loss": 0.5512, + "step": 17030 + }, + { + "epoch": 0.37927350427350426, + "grad_norm": 1.1219477653503418, + "learning_rate": 0.00038445673703050426, + "loss": 0.6397, + "step": 17040 + }, + { + "epoch": 0.37949608262108264, + "grad_norm": 0.5693344473838806, + "learning_rate": 0.0003844387118887028, + "loss": 0.679, + "step": 17050 + }, + { + "epoch": 0.37971866096866097, + "grad_norm": 0.4774753153324127, + "learning_rate": 0.0003844206767243507, + "loss": 0.4744, + "step": 17060 + }, + { + "epoch": 0.3799412393162393, + "grad_norm": 0.6411837935447693, + "learning_rate": 0.0003844026315384281, + "loss": 0.7865, + "step": 17070 + }, + { + "epoch": 0.3801638176638177, + "grad_norm": 0.768381655216217, + "learning_rate": 0.00038438457633191555, + "loss": 0.5511, + "step": 17080 + }, + { + "epoch": 0.380386396011396, + "grad_norm": 0.7788591980934143, + "learning_rate": 0.0003843665111057942, + "loss": 0.6633, + "step": 17090 + }, + { + "epoch": 0.38060897435897434, + "grad_norm": 0.5127415657043457, + "learning_rate": 0.0003843484358610457, + "loss": 0.5877, + "step": 17100 + }, + { + "epoch": 0.3808315527065527, + "grad_norm": 0.8092899322509766, + "learning_rate": 0.00038433035059865227, + "loss": 0.5595, + "step": 17110 + }, + { + "epoch": 0.38105413105413105, + "grad_norm": 0.716167688369751, + "learning_rate": 0.00038431225531959667, + "loss": 0.7413, + "step": 17120 + }, + { + "epoch": 0.3812767094017094, + "grad_norm": 1.0542246103286743, + "learning_rate": 0.00038429415002486225, + "loss": 0.6816, + "step": 17130 + }, + { + "epoch": 0.38149928774928776, + "grad_norm": 0.6427927017211914, + "learning_rate": 0.0003842760347154328, + "loss": 0.578, + "step": 17140 + }, + { + "epoch": 0.3817218660968661, + "grad_norm": 0.5981531143188477, + "learning_rate": 0.00038425790939229285, + "loss": 0.4801, + "step": 17150 + }, + { + "epoch": 0.3819444444444444, + "grad_norm": 0.9075816869735718, + "learning_rate": 0.0003842397740564272, + "loss": 0.6257, + "step": 17160 + }, + { + "epoch": 0.3821670227920228, + "grad_norm": 0.9044390916824341, + "learning_rate": 0.00038422162870882146, + "loss": 0.6812, + "step": 17170 + }, + { + "epoch": 0.38238960113960113, + "grad_norm": 0.7249968647956848, + "learning_rate": 0.00038420347335046154, + "loss": 0.6322, + "step": 17180 + }, + { + "epoch": 0.38261217948717946, + "grad_norm": 0.58587646484375, + "learning_rate": 0.00038418530798233413, + "loss": 0.5745, + "step": 17190 + }, + { + "epoch": 0.38283475783475784, + "grad_norm": 0.6931901574134827, + "learning_rate": 0.0003841671326054263, + "loss": 0.7244, + "step": 17200 + }, + { + "epoch": 0.38305733618233617, + "grad_norm": 0.6910154223442078, + "learning_rate": 0.0003841489472207257, + "loss": 0.7252, + "step": 17210 + }, + { + "epoch": 0.38327991452991456, + "grad_norm": 0.6198472380638123, + "learning_rate": 0.0003841307518292205, + "loss": 0.7574, + "step": 17220 + }, + { + "epoch": 0.3835024928774929, + "grad_norm": 0.6532488465309143, + "learning_rate": 0.0003841125464318996, + "loss": 0.6653, + "step": 17230 + }, + { + "epoch": 0.3837250712250712, + "grad_norm": 0.8199801445007324, + "learning_rate": 0.00038409433102975225, + "loss": 0.5744, + "step": 17240 + }, + { + "epoch": 0.3839476495726496, + "grad_norm": 0.47872394323349, + "learning_rate": 0.0003840761056237681, + "loss": 0.6205, + "step": 17250 + }, + { + "epoch": 0.3841702279202279, + "grad_norm": 0.6670446395874023, + "learning_rate": 0.0003840578702149378, + "loss": 0.6703, + "step": 17260 + }, + { + "epoch": 0.38439280626780625, + "grad_norm": 0.6099238991737366, + "learning_rate": 0.00038403962480425204, + "loss": 0.6519, + "step": 17270 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 0.862278163433075, + "learning_rate": 0.0003840213693927025, + "loss": 0.5906, + "step": 17280 + }, + { + "epoch": 0.38483796296296297, + "grad_norm": 0.6451935172080994, + "learning_rate": 0.00038400310398128105, + "loss": 0.7502, + "step": 17290 + }, + { + "epoch": 0.3850605413105413, + "grad_norm": 0.7635634541511536, + "learning_rate": 0.00038398482857098036, + "loss": 0.6136, + "step": 17300 + }, + { + "epoch": 0.3852831196581197, + "grad_norm": 0.7795054912567139, + "learning_rate": 0.0003839665431627934, + "loss": 0.5953, + "step": 17310 + }, + { + "epoch": 0.385505698005698, + "grad_norm": 0.7930306196212769, + "learning_rate": 0.0003839482477577139, + "loss": 0.7062, + "step": 17320 + }, + { + "epoch": 0.38572827635327633, + "grad_norm": 0.5813744068145752, + "learning_rate": 0.000383929942356736, + "loss": 0.5969, + "step": 17330 + }, + { + "epoch": 0.3859508547008547, + "grad_norm": 0.8584675192832947, + "learning_rate": 0.00038391162696085456, + "loss": 0.6734, + "step": 17340 + }, + { + "epoch": 0.38617343304843305, + "grad_norm": 0.6163985729217529, + "learning_rate": 0.00038389330157106473, + "loss": 0.5826, + "step": 17350 + }, + { + "epoch": 0.3863960113960114, + "grad_norm": 0.6514464020729065, + "learning_rate": 0.00038387496618836226, + "loss": 0.5307, + "step": 17360 + }, + { + "epoch": 0.38661858974358976, + "grad_norm": 0.9078758358955383, + "learning_rate": 0.00038385662081374364, + "loss": 0.7306, + "step": 17370 + }, + { + "epoch": 0.3868411680911681, + "grad_norm": 0.8621419668197632, + "learning_rate": 0.0003838382654482058, + "loss": 0.7171, + "step": 17380 + }, + { + "epoch": 0.3870637464387464, + "grad_norm": 0.5458530187606812, + "learning_rate": 0.00038381990009274603, + "loss": 0.6731, + "step": 17390 + }, + { + "epoch": 0.3872863247863248, + "grad_norm": 0.4690433442592621, + "learning_rate": 0.00038380152474836246, + "loss": 0.7042, + "step": 17400 + }, + { + "epoch": 0.3875089031339031, + "grad_norm": 0.4992372393608093, + "learning_rate": 0.0003837831394160535, + "loss": 0.5502, + "step": 17410 + }, + { + "epoch": 0.38773148148148145, + "grad_norm": 0.601311981678009, + "learning_rate": 0.0003837647440968184, + "loss": 0.583, + "step": 17420 + }, + { + "epoch": 0.38795405982905984, + "grad_norm": 0.7417802214622498, + "learning_rate": 0.00038374633879165664, + "loss": 0.6168, + "step": 17430 + }, + { + "epoch": 0.38817663817663817, + "grad_norm": 0.8696532845497131, + "learning_rate": 0.00038372792350156834, + "loss": 0.6895, + "step": 17440 + }, + { + "epoch": 0.38839921652421655, + "grad_norm": 0.660205066204071, + "learning_rate": 0.00038370949822755436, + "loss": 0.7395, + "step": 17450 + }, + { + "epoch": 0.3886217948717949, + "grad_norm": 0.506757915019989, + "learning_rate": 0.0003836910629706158, + "loss": 0.6585, + "step": 17460 + }, + { + "epoch": 0.3888443732193732, + "grad_norm": 0.793761134147644, + "learning_rate": 0.00038367261773175447, + "loss": 0.5542, + "step": 17470 + }, + { + "epoch": 0.3890669515669516, + "grad_norm": 0.7295845746994019, + "learning_rate": 0.00038365416251197283, + "loss": 0.577, + "step": 17480 + }, + { + "epoch": 0.3892895299145299, + "grad_norm": 1.1868880987167358, + "learning_rate": 0.0003836356973122735, + "loss": 0.7305, + "step": 17490 + }, + { + "epoch": 0.38951210826210825, + "grad_norm": 0.6900010704994202, + "learning_rate": 0.0003836172221336602, + "loss": 0.6991, + "step": 17500 + }, + { + "epoch": 0.38973468660968663, + "grad_norm": 0.7464649677276611, + "learning_rate": 0.0003835987369771367, + "loss": 0.607, + "step": 17510 + }, + { + "epoch": 0.38995726495726496, + "grad_norm": 0.6252322196960449, + "learning_rate": 0.00038358024184370745, + "loss": 0.563, + "step": 17520 + }, + { + "epoch": 0.3901798433048433, + "grad_norm": 0.6333285570144653, + "learning_rate": 0.0003835617367343776, + "loss": 0.6089, + "step": 17530 + }, + { + "epoch": 0.39040242165242167, + "grad_norm": 0.7202562093734741, + "learning_rate": 0.0003835432216501528, + "loss": 0.7287, + "step": 17540 + }, + { + "epoch": 0.390625, + "grad_norm": 0.6413396000862122, + "learning_rate": 0.000383524696592039, + "loss": 0.6301, + "step": 17550 + }, + { + "epoch": 0.39084757834757833, + "grad_norm": 0.521155834197998, + "learning_rate": 0.0003835061615610429, + "loss": 0.5874, + "step": 17560 + }, + { + "epoch": 0.3910701566951567, + "grad_norm": 0.4937155544757843, + "learning_rate": 0.0003834876165581719, + "loss": 0.4613, + "step": 17570 + }, + { + "epoch": 0.39129273504273504, + "grad_norm": 0.9195260405540466, + "learning_rate": 0.0003834690615844335, + "loss": 0.5578, + "step": 17580 + }, + { + "epoch": 0.39151531339031337, + "grad_norm": 1.136011004447937, + "learning_rate": 0.0003834504966408361, + "loss": 0.6035, + "step": 17590 + }, + { + "epoch": 0.39173789173789175, + "grad_norm": 0.5977762937545776, + "learning_rate": 0.00038343192172838854, + "loss": 0.6696, + "step": 17600 + }, + { + "epoch": 0.3919604700854701, + "grad_norm": 0.5439203381538391, + "learning_rate": 0.0003834133368481002, + "loss": 0.6116, + "step": 17610 + }, + { + "epoch": 0.3921830484330484, + "grad_norm": 0.40402865409851074, + "learning_rate": 0.000383394742000981, + "loss": 0.6585, + "step": 17620 + }, + { + "epoch": 0.3924056267806268, + "grad_norm": 1.3911811113357544, + "learning_rate": 0.00038337613718804136, + "loss": 0.7167, + "step": 17630 + }, + { + "epoch": 0.3926282051282051, + "grad_norm": 0.8521585464477539, + "learning_rate": 0.00038335752241029235, + "loss": 0.6441, + "step": 17640 + }, + { + "epoch": 0.39285078347578345, + "grad_norm": 0.6077307462692261, + "learning_rate": 0.0003833388976687454, + "loss": 0.591, + "step": 17650 + }, + { + "epoch": 0.39307336182336183, + "grad_norm": 0.7540651559829712, + "learning_rate": 0.0003833202629644127, + "loss": 0.6238, + "step": 17660 + }, + { + "epoch": 0.39329594017094016, + "grad_norm": 1.261109709739685, + "learning_rate": 0.0003833016182983069, + "loss": 0.6148, + "step": 17670 + }, + { + "epoch": 0.39351851851851855, + "grad_norm": 0.6413763761520386, + "learning_rate": 0.00038328296367144097, + "loss": 0.6449, + "step": 17680 + }, + { + "epoch": 0.3937410968660969, + "grad_norm": 1.002094030380249, + "learning_rate": 0.00038326429908482887, + "loss": 0.5893, + "step": 17690 + }, + { + "epoch": 0.3939636752136752, + "grad_norm": 0.6313454508781433, + "learning_rate": 0.00038324562453948463, + "loss": 0.7269, + "step": 17700 + }, + { + "epoch": 0.3941862535612536, + "grad_norm": 0.49584224820137024, + "learning_rate": 0.00038322694003642323, + "loss": 0.5761, + "step": 17710 + }, + { + "epoch": 0.3944088319088319, + "grad_norm": 0.8321060538291931, + "learning_rate": 0.00038320824557665987, + "loss": 0.5659, + "step": 17720 + }, + { + "epoch": 0.39463141025641024, + "grad_norm": 0.5422239899635315, + "learning_rate": 0.0003831895411612105, + "loss": 0.5458, + "step": 17730 + }, + { + "epoch": 0.3948539886039886, + "grad_norm": 0.5938933491706848, + "learning_rate": 0.00038317082679109143, + "loss": 0.6163, + "step": 17740 + }, + { + "epoch": 0.39507656695156695, + "grad_norm": 0.8462273478507996, + "learning_rate": 0.0003831521024673197, + "loss": 0.6927, + "step": 17750 + }, + { + "epoch": 0.3952991452991453, + "grad_norm": 0.7590085864067078, + "learning_rate": 0.00038313336819091284, + "loss": 0.7876, + "step": 17760 + }, + { + "epoch": 0.39552172364672367, + "grad_norm": 0.6377508044242859, + "learning_rate": 0.0003831146239628888, + "loss": 0.6783, + "step": 17770 + }, + { + "epoch": 0.395744301994302, + "grad_norm": 0.5582488775253296, + "learning_rate": 0.00038309586978426617, + "loss": 0.7001, + "step": 17780 + }, + { + "epoch": 0.3959668803418803, + "grad_norm": 0.9236611127853394, + "learning_rate": 0.00038307710565606414, + "loss": 0.7399, + "step": 17790 + }, + { + "epoch": 0.3961894586894587, + "grad_norm": 0.5856517553329468, + "learning_rate": 0.0003830583315793023, + "loss": 0.5419, + "step": 17800 + }, + { + "epoch": 0.39641203703703703, + "grad_norm": 0.7868704199790955, + "learning_rate": 0.0003830395475550008, + "loss": 0.6998, + "step": 17810 + }, + { + "epoch": 0.39663461538461536, + "grad_norm": 0.745160698890686, + "learning_rate": 0.0003830207535841805, + "loss": 0.6117, + "step": 17820 + }, + { + "epoch": 0.39685719373219375, + "grad_norm": 0.966376006603241, + "learning_rate": 0.00038300194966786263, + "loss": 0.5653, + "step": 17830 + }, + { + "epoch": 0.3970797720797721, + "grad_norm": 1.0567193031311035, + "learning_rate": 0.000382983135807069, + "loss": 0.6592, + "step": 17840 + }, + { + "epoch": 0.3973023504273504, + "grad_norm": 0.6462680697441101, + "learning_rate": 0.0003829643120028219, + "loss": 0.6337, + "step": 17850 + }, + { + "epoch": 0.3975249287749288, + "grad_norm": 0.7606768012046814, + "learning_rate": 0.0003829454782561444, + "loss": 0.6381, + "step": 17860 + }, + { + "epoch": 0.3977475071225071, + "grad_norm": 0.697704553604126, + "learning_rate": 0.0003829266345680598, + "loss": 0.64, + "step": 17870 + }, + { + "epoch": 0.39797008547008544, + "grad_norm": 0.7255660891532898, + "learning_rate": 0.0003829077809395921, + "loss": 0.7084, + "step": 17880 + }, + { + "epoch": 0.39819266381766383, + "grad_norm": 0.7107759714126587, + "learning_rate": 0.0003828889173717659, + "loss": 0.6351, + "step": 17890 + }, + { + "epoch": 0.39841524216524216, + "grad_norm": 0.635511577129364, + "learning_rate": 0.0003828700438656062, + "loss": 0.5843, + "step": 17900 + }, + { + "epoch": 0.39863782051282054, + "grad_norm": 0.7312158942222595, + "learning_rate": 0.0003828511604221386, + "loss": 0.6226, + "step": 17910 + }, + { + "epoch": 0.39886039886039887, + "grad_norm": 0.6904292702674866, + "learning_rate": 0.0003828322670423893, + "loss": 0.6916, + "step": 17920 + }, + { + "epoch": 0.3990829772079772, + "grad_norm": 0.5614930987358093, + "learning_rate": 0.0003828133637273848, + "loss": 0.6149, + "step": 17930 + }, + { + "epoch": 0.3993055555555556, + "grad_norm": 1.0300472974777222, + "learning_rate": 0.00038279445047815255, + "loss": 0.6267, + "step": 17940 + }, + { + "epoch": 0.3995281339031339, + "grad_norm": 0.535834014415741, + "learning_rate": 0.00038277552729572024, + "loss": 0.6585, + "step": 17950 + }, + { + "epoch": 0.39975071225071224, + "grad_norm": 0.4721270203590393, + "learning_rate": 0.00038275659418111614, + "loss": 0.5913, + "step": 17960 + }, + { + "epoch": 0.3999732905982906, + "grad_norm": 0.9660285711288452, + "learning_rate": 0.00038273765113536906, + "loss": 0.652, + "step": 17970 + }, + { + "epoch": 0.40019586894586895, + "grad_norm": 0.5322182178497314, + "learning_rate": 0.0003827186981595085, + "loss": 0.6456, + "step": 17980 + }, + { + "epoch": 0.4004184472934473, + "grad_norm": 0.4535355567932129, + "learning_rate": 0.0003826997352545642, + "loss": 0.8067, + "step": 17990 + }, + { + "epoch": 0.40064102564102566, + "grad_norm": 0.7890868186950684, + "learning_rate": 0.00038268076242156684, + "loss": 0.4926, + "step": 18000 + }, + { + "epoch": 0.400863603988604, + "grad_norm": 0.8056808114051819, + "learning_rate": 0.0003826617796615472, + "loss": 0.628, + "step": 18010 + }, + { + "epoch": 0.4010861823361823, + "grad_norm": 0.6489529609680176, + "learning_rate": 0.00038264278697553697, + "loss": 0.7349, + "step": 18020 + }, + { + "epoch": 0.4013087606837607, + "grad_norm": 0.7820817232131958, + "learning_rate": 0.00038262378436456815, + "loss": 0.6355, + "step": 18030 + }, + { + "epoch": 0.40153133903133903, + "grad_norm": 0.5969810485839844, + "learning_rate": 0.0003826047718296734, + "loss": 0.6453, + "step": 18040 + }, + { + "epoch": 0.40175391737891736, + "grad_norm": 0.7493919134140015, + "learning_rate": 0.0003825857493718858, + "loss": 0.5864, + "step": 18050 + }, + { + "epoch": 0.40197649572649574, + "grad_norm": 0.6539332270622253, + "learning_rate": 0.0003825667169922392, + "loss": 0.7754, + "step": 18060 + }, + { + "epoch": 0.40219907407407407, + "grad_norm": 0.6059889793395996, + "learning_rate": 0.0003825476746917677, + "loss": 0.6061, + "step": 18070 + }, + { + "epoch": 0.4024216524216524, + "grad_norm": 0.616847574710846, + "learning_rate": 0.0003825286224715061, + "loss": 0.5818, + "step": 18080 + }, + { + "epoch": 0.4026442307692308, + "grad_norm": 0.728132963180542, + "learning_rate": 0.0003825095603324898, + "loss": 0.5179, + "step": 18090 + }, + { + "epoch": 0.4028668091168091, + "grad_norm": 0.45363709330558777, + "learning_rate": 0.0003824904882757545, + "loss": 0.7275, + "step": 18100 + }, + { + "epoch": 0.40308938746438744, + "grad_norm": 1.0843173265457153, + "learning_rate": 0.0003824714063023367, + "loss": 0.5816, + "step": 18110 + }, + { + "epoch": 0.4033119658119658, + "grad_norm": 0.5212698578834534, + "learning_rate": 0.00038245231441327333, + "loss": 0.5885, + "step": 18120 + }, + { + "epoch": 0.40353454415954415, + "grad_norm": 0.7329869866371155, + "learning_rate": 0.00038243321260960186, + "loss": 0.6696, + "step": 18130 + }, + { + "epoch": 0.40375712250712253, + "grad_norm": 0.6578690409660339, + "learning_rate": 0.00038241410089236014, + "loss": 0.5821, + "step": 18140 + }, + { + "epoch": 0.40397970085470086, + "grad_norm": 0.5347065329551697, + "learning_rate": 0.00038239497926258697, + "loss": 0.6141, + "step": 18150 + }, + { + "epoch": 0.4042022792022792, + "grad_norm": 0.6627506017684937, + "learning_rate": 0.00038237584772132126, + "loss": 0.646, + "step": 18160 + }, + { + "epoch": 0.4044248575498576, + "grad_norm": 0.6520497798919678, + "learning_rate": 0.0003823567062696027, + "loss": 0.5735, + "step": 18170 + }, + { + "epoch": 0.4046474358974359, + "grad_norm": 1.2275110483169556, + "learning_rate": 0.00038233755490847145, + "loss": 0.5811, + "step": 18180 + }, + { + "epoch": 0.40487001424501423, + "grad_norm": 0.7590609788894653, + "learning_rate": 0.0003823183936389682, + "loss": 0.594, + "step": 18190 + }, + { + "epoch": 0.4050925925925926, + "grad_norm": 0.7115891575813293, + "learning_rate": 0.00038229922246213417, + "loss": 0.5999, + "step": 18200 + }, + { + "epoch": 0.40531517094017094, + "grad_norm": 0.4889591634273529, + "learning_rate": 0.00038228004137901114, + "loss": 0.6459, + "step": 18210 + }, + { + "epoch": 0.40553774928774927, + "grad_norm": 0.7974612712860107, + "learning_rate": 0.0003822608503906414, + "loss": 0.7665, + "step": 18220 + }, + { + "epoch": 0.40576032763532766, + "grad_norm": 0.4833396077156067, + "learning_rate": 0.0003822416494980679, + "loss": 0.5332, + "step": 18230 + }, + { + "epoch": 0.405982905982906, + "grad_norm": 0.7940080165863037, + "learning_rate": 0.000382222438702334, + "loss": 0.6702, + "step": 18240 + }, + { + "epoch": 0.4062054843304843, + "grad_norm": 0.6132873892784119, + "learning_rate": 0.00038220321800448356, + "loss": 0.5453, + "step": 18250 + }, + { + "epoch": 0.4064280626780627, + "grad_norm": 0.7880875468254089, + "learning_rate": 0.00038218398740556115, + "loss": 0.7812, + "step": 18260 + }, + { + "epoch": 0.406650641025641, + "grad_norm": 0.6013901233673096, + "learning_rate": 0.0003821647469066117, + "loss": 0.705, + "step": 18270 + }, + { + "epoch": 0.40687321937321935, + "grad_norm": 0.6151648759841919, + "learning_rate": 0.0003821454965086807, + "loss": 0.6537, + "step": 18280 + }, + { + "epoch": 0.40709579772079774, + "grad_norm": 0.6463825702667236, + "learning_rate": 0.0003821262362128144, + "loss": 0.6388, + "step": 18290 + }, + { + "epoch": 0.40731837606837606, + "grad_norm": 0.7021704316139221, + "learning_rate": 0.0003821069660200593, + "loss": 0.6958, + "step": 18300 + }, + { + "epoch": 0.4075409544159544, + "grad_norm": 0.9794663786888123, + "learning_rate": 0.0003820876859314626, + "loss": 0.6377, + "step": 18310 + }, + { + "epoch": 0.4077635327635328, + "grad_norm": 0.8478348255157471, + "learning_rate": 0.00038206839594807197, + "loss": 0.6647, + "step": 18320 + }, + { + "epoch": 0.4079861111111111, + "grad_norm": 0.7850647568702698, + "learning_rate": 0.00038204909607093563, + "loss": 0.4885, + "step": 18330 + }, + { + "epoch": 0.40820868945868943, + "grad_norm": 0.9271420836448669, + "learning_rate": 0.00038202978630110245, + "loss": 0.716, + "step": 18340 + }, + { + "epoch": 0.4084312678062678, + "grad_norm": 0.8825610280036926, + "learning_rate": 0.0003820104666396216, + "loss": 0.5358, + "step": 18350 + }, + { + "epoch": 0.40865384615384615, + "grad_norm": 0.5441417694091797, + "learning_rate": 0.000381991137087543, + "loss": 0.6127, + "step": 18360 + }, + { + "epoch": 0.40887642450142453, + "grad_norm": 0.8587644696235657, + "learning_rate": 0.00038197179764591703, + "loss": 0.5525, + "step": 18370 + }, + { + "epoch": 0.40909900284900286, + "grad_norm": 0.6348844170570374, + "learning_rate": 0.0003819524483157946, + "loss": 0.6251, + "step": 18380 + }, + { + "epoch": 0.4093215811965812, + "grad_norm": 0.5927433967590332, + "learning_rate": 0.0003819330890982272, + "loss": 0.4646, + "step": 18390 + }, + { + "epoch": 0.40954415954415957, + "grad_norm": 0.6170734763145447, + "learning_rate": 0.0003819137199942668, + "loss": 0.6167, + "step": 18400 + }, + { + "epoch": 0.4097667378917379, + "grad_norm": 0.49462154507637024, + "learning_rate": 0.00038189434100496594, + "loss": 0.6901, + "step": 18410 + }, + { + "epoch": 0.4099893162393162, + "grad_norm": 0.5856154561042786, + "learning_rate": 0.0003818749521313777, + "loss": 0.7309, + "step": 18420 + }, + { + "epoch": 0.4102118945868946, + "grad_norm": 0.7598608732223511, + "learning_rate": 0.0003818555533745556, + "loss": 0.6551, + "step": 18430 + }, + { + "epoch": 0.41043447293447294, + "grad_norm": 0.5119199156761169, + "learning_rate": 0.00038183614473555387, + "loss": 0.7321, + "step": 18440 + }, + { + "epoch": 0.41065705128205127, + "grad_norm": 0.6292290687561035, + "learning_rate": 0.0003818167262154272, + "loss": 0.6233, + "step": 18450 + }, + { + "epoch": 0.41087962962962965, + "grad_norm": 0.47254326939582825, + "learning_rate": 0.0003817972978152308, + "loss": 0.6084, + "step": 18460 + }, + { + "epoch": 0.411102207977208, + "grad_norm": 0.7951480746269226, + "learning_rate": 0.00038177785953602035, + "loss": 0.6765, + "step": 18470 + }, + { + "epoch": 0.4113247863247863, + "grad_norm": 0.7218686938285828, + "learning_rate": 0.00038175841137885226, + "loss": 0.5814, + "step": 18480 + }, + { + "epoch": 0.4115473646723647, + "grad_norm": 0.6350971460342407, + "learning_rate": 0.00038173895334478333, + "loss": 0.557, + "step": 18490 + }, + { + "epoch": 0.411769943019943, + "grad_norm": 0.43875062465667725, + "learning_rate": 0.0003817194854348709, + "loss": 0.4481, + "step": 18500 + }, + { + "epoch": 0.41199252136752135, + "grad_norm": 0.49206212162971497, + "learning_rate": 0.0003817000076501728, + "loss": 0.5852, + "step": 18510 + }, + { + "epoch": 0.41221509971509973, + "grad_norm": 0.4355153441429138, + "learning_rate": 0.00038168051999174754, + "loss": 0.5811, + "step": 18520 + }, + { + "epoch": 0.41243767806267806, + "grad_norm": 0.5758615136146545, + "learning_rate": 0.00038166102246065415, + "loss": 0.7024, + "step": 18530 + }, + { + "epoch": 0.4126602564102564, + "grad_norm": 0.4741193354129791, + "learning_rate": 0.000381641515057952, + "loss": 0.7598, + "step": 18540 + }, + { + "epoch": 0.41288283475783477, + "grad_norm": 1.0277267694473267, + "learning_rate": 0.00038162199778470134, + "loss": 0.6103, + "step": 18550 + }, + { + "epoch": 0.4131054131054131, + "grad_norm": 0.8170305490493774, + "learning_rate": 0.00038160247064196256, + "loss": 0.683, + "step": 18560 + }, + { + "epoch": 0.41332799145299143, + "grad_norm": 0.5887982249259949, + "learning_rate": 0.00038158293363079685, + "loss": 0.7479, + "step": 18570 + }, + { + "epoch": 0.4135505698005698, + "grad_norm": 0.7957956194877625, + "learning_rate": 0.0003815633867522659, + "loss": 0.6387, + "step": 18580 + }, + { + "epoch": 0.41377314814814814, + "grad_norm": 0.5423303842544556, + "learning_rate": 0.0003815438300074319, + "loss": 0.5784, + "step": 18590 + }, + { + "epoch": 0.41399572649572647, + "grad_norm": 0.6572567820549011, + "learning_rate": 0.00038152426339735753, + "loss": 0.5361, + "step": 18600 + }, + { + "epoch": 0.41421830484330485, + "grad_norm": 1.020389199256897, + "learning_rate": 0.0003815046869231061, + "loss": 0.7019, + "step": 18610 + }, + { + "epoch": 0.4144408831908832, + "grad_norm": 0.6280043721199036, + "learning_rate": 0.0003814851005857413, + "loss": 0.5532, + "step": 18620 + }, + { + "epoch": 0.41466346153846156, + "grad_norm": 1.0602699518203735, + "learning_rate": 0.0003814655043863277, + "loss": 0.6681, + "step": 18630 + }, + { + "epoch": 0.4148860398860399, + "grad_norm": 1.3655202388763428, + "learning_rate": 0.00038144589832593003, + "loss": 0.5886, + "step": 18640 + }, + { + "epoch": 0.4151086182336182, + "grad_norm": 1.4455960988998413, + "learning_rate": 0.0003814262824056137, + "loss": 0.6894, + "step": 18650 + }, + { + "epoch": 0.4153311965811966, + "grad_norm": 0.6867120862007141, + "learning_rate": 0.00038140665662644456, + "loss": 0.7084, + "step": 18660 + }, + { + "epoch": 0.41555377492877493, + "grad_norm": 0.6166002154350281, + "learning_rate": 0.00038138702098948924, + "loss": 0.638, + "step": 18670 + }, + { + "epoch": 0.41577635327635326, + "grad_norm": 0.7494068741798401, + "learning_rate": 0.00038136737549581475, + "loss": 0.6502, + "step": 18680 + }, + { + "epoch": 0.41599893162393164, + "grad_norm": 0.9189199209213257, + "learning_rate": 0.00038134772014648854, + "loss": 0.6337, + "step": 18690 + }, + { + "epoch": 0.41622150997151, + "grad_norm": 0.44938209652900696, + "learning_rate": 0.0003813280549425788, + "loss": 0.6499, + "step": 18700 + }, + { + "epoch": 0.4164440883190883, + "grad_norm": 0.4579305648803711, + "learning_rate": 0.0003813083798851541, + "loss": 0.6309, + "step": 18710 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.5728359818458557, + "learning_rate": 0.0003812886949752837, + "loss": 0.6228, + "step": 18720 + }, + { + "epoch": 0.416889245014245, + "grad_norm": 0.652080237865448, + "learning_rate": 0.00038126900021403707, + "loss": 0.6655, + "step": 18730 + }, + { + "epoch": 0.41711182336182334, + "grad_norm": 0.6501042246818542, + "learning_rate": 0.00038124929560248466, + "loss": 0.6089, + "step": 18740 + }, + { + "epoch": 0.4173344017094017, + "grad_norm": 0.7336424589157104, + "learning_rate": 0.00038122958114169707, + "loss": 0.5065, + "step": 18750 + }, + { + "epoch": 0.41755698005698005, + "grad_norm": 0.5579254031181335, + "learning_rate": 0.0003812098568327458, + "loss": 0.5542, + "step": 18760 + }, + { + "epoch": 0.4177795584045584, + "grad_norm": 0.585763156414032, + "learning_rate": 0.00038119012267670246, + "loss": 0.7124, + "step": 18770 + }, + { + "epoch": 0.41800213675213677, + "grad_norm": 0.8041399121284485, + "learning_rate": 0.00038117037867463956, + "loss": 0.5581, + "step": 18780 + }, + { + "epoch": 0.4182247150997151, + "grad_norm": 0.6856745481491089, + "learning_rate": 0.00038115062482763, + "loss": 0.5774, + "step": 18790 + }, + { + "epoch": 0.4184472934472934, + "grad_norm": 0.6076633930206299, + "learning_rate": 0.0003811308611367471, + "loss": 0.5197, + "step": 18800 + }, + { + "epoch": 0.4186698717948718, + "grad_norm": 0.8954238891601562, + "learning_rate": 0.000381111087603065, + "loss": 0.6354, + "step": 18810 + }, + { + "epoch": 0.41889245014245013, + "grad_norm": 0.6402801275253296, + "learning_rate": 0.0003810913042276581, + "loss": 0.5346, + "step": 18820 + }, + { + "epoch": 0.41911502849002846, + "grad_norm": 0.8094208240509033, + "learning_rate": 0.00038107151101160155, + "loss": 0.6664, + "step": 18830 + }, + { + "epoch": 0.41933760683760685, + "grad_norm": 0.8701035976409912, + "learning_rate": 0.0003810517079559708, + "loss": 0.5255, + "step": 18840 + }, + { + "epoch": 0.4195601851851852, + "grad_norm": 0.5222055315971375, + "learning_rate": 0.000381031895061842, + "loss": 0.5832, + "step": 18850 + }, + { + "epoch": 0.41978276353276356, + "grad_norm": 0.5464537739753723, + "learning_rate": 0.00038101207233029184, + "loss": 0.638, + "step": 18860 + }, + { + "epoch": 0.4200053418803419, + "grad_norm": 0.666346549987793, + "learning_rate": 0.0003809922397623975, + "loss": 0.5112, + "step": 18870 + }, + { + "epoch": 0.4200498575498576, + "eval_loss": 0.6332426071166992, + "eval_runtime": 337.3455, + "eval_samples_per_second": 7.011, + "eval_steps_per_second": 7.011, + "step": 18872 + }, + { + "epoch": 0.4202279202279202, + "grad_norm": 0.4587787091732025, + "learning_rate": 0.00038097239735923675, + "loss": 0.633, + "step": 18880 + }, + { + "epoch": 0.4204504985754986, + "grad_norm": 0.9687144160270691, + "learning_rate": 0.0003809525451218877, + "loss": 0.8232, + "step": 18890 + }, + { + "epoch": 0.4206730769230769, + "grad_norm": 1.0683296918869019, + "learning_rate": 0.0003809326830514292, + "loss": 0.5876, + "step": 18900 + }, + { + "epoch": 0.42089565527065526, + "grad_norm": 0.6031848192214966, + "learning_rate": 0.0003809128111489406, + "loss": 0.6018, + "step": 18910 + }, + { + "epoch": 0.42111823361823364, + "grad_norm": 0.961243212223053, + "learning_rate": 0.0003808929294155018, + "loss": 0.6699, + "step": 18920 + }, + { + "epoch": 0.42134081196581197, + "grad_norm": 0.9062433838844299, + "learning_rate": 0.00038087303785219306, + "loss": 0.7075, + "step": 18930 + }, + { + "epoch": 0.4215633903133903, + "grad_norm": 0.5354229807853699, + "learning_rate": 0.0003808531364600954, + "loss": 0.7262, + "step": 18940 + }, + { + "epoch": 0.4217859686609687, + "grad_norm": 0.7969614863395691, + "learning_rate": 0.00038083322524029025, + "loss": 0.6425, + "step": 18950 + }, + { + "epoch": 0.422008547008547, + "grad_norm": 0.6732149720191956, + "learning_rate": 0.0003808133041938596, + "loss": 0.6986, + "step": 18960 + }, + { + "epoch": 0.42223112535612534, + "grad_norm": 0.6219040751457214, + "learning_rate": 0.0003807933733218859, + "loss": 0.6399, + "step": 18970 + }, + { + "epoch": 0.4224537037037037, + "grad_norm": 0.44824638962745667, + "learning_rate": 0.0003807734326254524, + "loss": 0.5337, + "step": 18980 + }, + { + "epoch": 0.42267628205128205, + "grad_norm": 0.5918938517570496, + "learning_rate": 0.0003807534821056426, + "loss": 0.5343, + "step": 18990 + }, + { + "epoch": 0.4228988603988604, + "grad_norm": 0.41495221853256226, + "learning_rate": 0.00038073352176354054, + "loss": 0.6466, + "step": 19000 + }, + { + "epoch": 0.42312143874643876, + "grad_norm": 0.643989086151123, + "learning_rate": 0.00038071355160023096, + "loss": 0.6864, + "step": 19010 + }, + { + "epoch": 0.4233440170940171, + "grad_norm": 0.7840922474861145, + "learning_rate": 0.00038069357161679907, + "loss": 0.7135, + "step": 19020 + }, + { + "epoch": 0.4235665954415954, + "grad_norm": 0.8468214273452759, + "learning_rate": 0.00038067358181433054, + "loss": 0.514, + "step": 19030 + }, + { + "epoch": 0.4237891737891738, + "grad_norm": 0.7911776304244995, + "learning_rate": 0.00038065358219391174, + "loss": 0.6465, + "step": 19040 + }, + { + "epoch": 0.42401175213675213, + "grad_norm": 0.5380447506904602, + "learning_rate": 0.00038063357275662936, + "loss": 0.6041, + "step": 19050 + }, + { + "epoch": 0.42423433048433046, + "grad_norm": 0.5977974534034729, + "learning_rate": 0.0003806135535035707, + "loss": 0.6742, + "step": 19060 + }, + { + "epoch": 0.42445690883190884, + "grad_norm": 0.636170506477356, + "learning_rate": 0.00038059352443582374, + "loss": 0.6234, + "step": 19070 + }, + { + "epoch": 0.42467948717948717, + "grad_norm": 0.4726337492465973, + "learning_rate": 0.0003805734855544768, + "loss": 0.5494, + "step": 19080 + }, + { + "epoch": 0.42490206552706555, + "grad_norm": 0.7931132912635803, + "learning_rate": 0.0003805534368606189, + "loss": 0.6485, + "step": 19090 + }, + { + "epoch": 0.4251246438746439, + "grad_norm": 0.766657292842865, + "learning_rate": 0.00038053337835533937, + "loss": 0.5549, + "step": 19100 + }, + { + "epoch": 0.4253472222222222, + "grad_norm": 0.796826183795929, + "learning_rate": 0.0003805133100397283, + "loss": 0.7259, + "step": 19110 + }, + { + "epoch": 0.4255698005698006, + "grad_norm": 0.7064245343208313, + "learning_rate": 0.0003804932319148761, + "loss": 0.6653, + "step": 19120 + }, + { + "epoch": 0.4257923789173789, + "grad_norm": 0.7597333788871765, + "learning_rate": 0.000380473143981874, + "loss": 0.6861, + "step": 19130 + }, + { + "epoch": 0.42601495726495725, + "grad_norm": 0.7162373661994934, + "learning_rate": 0.00038045304624181354, + "loss": 0.8093, + "step": 19140 + }, + { + "epoch": 0.42623753561253563, + "grad_norm": 0.4276905059814453, + "learning_rate": 0.0003804329386957868, + "loss": 0.5909, + "step": 19150 + }, + { + "epoch": 0.42646011396011396, + "grad_norm": 1.56265389919281, + "learning_rate": 0.0003804128213448864, + "loss": 0.7254, + "step": 19160 + }, + { + "epoch": 0.4266826923076923, + "grad_norm": 0.6898297071456909, + "learning_rate": 0.00038039269419020566, + "loss": 0.6241, + "step": 19170 + }, + { + "epoch": 0.4269052706552707, + "grad_norm": 0.5293618440628052, + "learning_rate": 0.00038037255723283824, + "loss": 0.6753, + "step": 19180 + }, + { + "epoch": 0.427127849002849, + "grad_norm": 0.7283686995506287, + "learning_rate": 0.00038035241047387834, + "loss": 0.6608, + "step": 19190 + }, + { + "epoch": 0.42735042735042733, + "grad_norm": 0.6621398329734802, + "learning_rate": 0.00038033225391442084, + "loss": 0.6593, + "step": 19200 + }, + { + "epoch": 0.4275730056980057, + "grad_norm": 0.6056594252586365, + "learning_rate": 0.00038031208755556105, + "loss": 0.5824, + "step": 19210 + }, + { + "epoch": 0.42779558404558404, + "grad_norm": 0.818027138710022, + "learning_rate": 0.0003802919113983948, + "loss": 0.5348, + "step": 19220 + }, + { + "epoch": 0.42801816239316237, + "grad_norm": 0.6298645734786987, + "learning_rate": 0.0003802717254440185, + "loss": 0.6155, + "step": 19230 + }, + { + "epoch": 0.42824074074074076, + "grad_norm": 0.48016032576560974, + "learning_rate": 0.00038025152969352907, + "loss": 0.593, + "step": 19240 + }, + { + "epoch": 0.4284633190883191, + "grad_norm": 0.3382346034049988, + "learning_rate": 0.00038023132414802393, + "loss": 0.4929, + "step": 19250 + }, + { + "epoch": 0.4286858974358974, + "grad_norm": 0.8397310972213745, + "learning_rate": 0.00038021110880860116, + "loss": 0.5998, + "step": 19260 + }, + { + "epoch": 0.4289084757834758, + "grad_norm": 0.813310980796814, + "learning_rate": 0.0003801908836763591, + "loss": 0.7211, + "step": 19270 + }, + { + "epoch": 0.4291310541310541, + "grad_norm": 0.7902941107749939, + "learning_rate": 0.000380170648752397, + "loss": 0.6227, + "step": 19280 + }, + { + "epoch": 0.42935363247863245, + "grad_norm": 0.5741865634918213, + "learning_rate": 0.0003801504040378143, + "loss": 0.4576, + "step": 19290 + }, + { + "epoch": 0.42957621082621084, + "grad_norm": 0.4125952124595642, + "learning_rate": 0.00038013014953371127, + "loss": 0.4642, + "step": 19300 + }, + { + "epoch": 0.42979878917378916, + "grad_norm": 0.8440093398094177, + "learning_rate": 0.0003801098852411883, + "loss": 0.8402, + "step": 19310 + }, + { + "epoch": 0.43002136752136755, + "grad_norm": 0.7689343094825745, + "learning_rate": 0.0003800896111613468, + "loss": 0.6535, + "step": 19320 + }, + { + "epoch": 0.4302439458689459, + "grad_norm": 0.5488240718841553, + "learning_rate": 0.0003800693272952884, + "loss": 0.6912, + "step": 19330 + }, + { + "epoch": 0.4304665242165242, + "grad_norm": 0.6208325624465942, + "learning_rate": 0.0003800490336441153, + "loss": 0.6589, + "step": 19340 + }, + { + "epoch": 0.4306891025641026, + "grad_norm": 0.5974116921424866, + "learning_rate": 0.0003800287302089304, + "loss": 0.5887, + "step": 19350 + }, + { + "epoch": 0.4309116809116809, + "grad_norm": 0.7017622590065002, + "learning_rate": 0.00038000841699083686, + "loss": 0.5926, + "step": 19360 + }, + { + "epoch": 0.43113425925925924, + "grad_norm": 0.7657302021980286, + "learning_rate": 0.0003799880939909386, + "loss": 0.6634, + "step": 19370 + }, + { + "epoch": 0.43135683760683763, + "grad_norm": 0.7428944706916809, + "learning_rate": 0.00037996776121034, + "loss": 0.5295, + "step": 19380 + }, + { + "epoch": 0.43157941595441596, + "grad_norm": 0.7173558473587036, + "learning_rate": 0.00037994741865014585, + "loss": 0.6091, + "step": 19390 + }, + { + "epoch": 0.4318019943019943, + "grad_norm": 0.42774316668510437, + "learning_rate": 0.00037992706631146165, + "loss": 0.4986, + "step": 19400 + }, + { + "epoch": 0.43202457264957267, + "grad_norm": 0.7183822989463806, + "learning_rate": 0.00037990670419539346, + "loss": 0.6067, + "step": 19410 + }, + { + "epoch": 0.432247150997151, + "grad_norm": 0.8368187546730042, + "learning_rate": 0.0003798863323030476, + "loss": 0.8275, + "step": 19420 + }, + { + "epoch": 0.4324697293447293, + "grad_norm": 1.3504329919815063, + "learning_rate": 0.0003798659506355313, + "loss": 0.6697, + "step": 19430 + }, + { + "epoch": 0.4326923076923077, + "grad_norm": 0.4535563588142395, + "learning_rate": 0.0003798455591939519, + "loss": 0.5502, + "step": 19440 + }, + { + "epoch": 0.43291488603988604, + "grad_norm": 0.5445149540901184, + "learning_rate": 0.0003798251579794176, + "loss": 0.5602, + "step": 19450 + }, + { + "epoch": 0.43313746438746437, + "grad_norm": 0.43985071778297424, + "learning_rate": 0.000379804746993037, + "loss": 0.5629, + "step": 19460 + }, + { + "epoch": 0.43336004273504275, + "grad_norm": 0.6011999845504761, + "learning_rate": 0.0003797843262359193, + "loss": 0.655, + "step": 19470 + }, + { + "epoch": 0.4335826210826211, + "grad_norm": 0.5765155553817749, + "learning_rate": 0.00037976389570917407, + "loss": 0.6234, + "step": 19480 + }, + { + "epoch": 0.4338051994301994, + "grad_norm": 0.4562526047229767, + "learning_rate": 0.0003797434554139116, + "loss": 0.64, + "step": 19490 + }, + { + "epoch": 0.4340277777777778, + "grad_norm": 0.846602201461792, + "learning_rate": 0.00037972300535124267, + "loss": 0.7789, + "step": 19500 + }, + { + "epoch": 0.4342503561253561, + "grad_norm": 0.47665026783943176, + "learning_rate": 0.00037970254552227844, + "loss": 0.5548, + "step": 19510 + }, + { + "epoch": 0.43447293447293445, + "grad_norm": 0.7600558996200562, + "learning_rate": 0.0003796820759281308, + "loss": 0.7801, + "step": 19520 + }, + { + "epoch": 0.43469551282051283, + "grad_norm": 0.6859473586082458, + "learning_rate": 0.0003796615965699121, + "loss": 0.5721, + "step": 19530 + }, + { + "epoch": 0.43491809116809116, + "grad_norm": 0.8362597823143005, + "learning_rate": 0.0003796411074487351, + "loss": 0.6686, + "step": 19540 + }, + { + "epoch": 0.43514066951566954, + "grad_norm": 0.47653642296791077, + "learning_rate": 0.0003796206085657133, + "loss": 0.7665, + "step": 19550 + }, + { + "epoch": 0.43536324786324787, + "grad_norm": 0.7798507213592529, + "learning_rate": 0.00037960009992196053, + "loss": 0.6039, + "step": 19560 + }, + { + "epoch": 0.4355858262108262, + "grad_norm": 0.8148535490036011, + "learning_rate": 0.00037957958151859137, + "loss": 0.7193, + "step": 19570 + }, + { + "epoch": 0.4358084045584046, + "grad_norm": 0.842745304107666, + "learning_rate": 0.0003795590533567207, + "loss": 0.5693, + "step": 19580 + }, + { + "epoch": 0.4360309829059829, + "grad_norm": 0.8762781023979187, + "learning_rate": 0.0003795385154374641, + "loss": 0.5878, + "step": 19590 + }, + { + "epoch": 0.43625356125356124, + "grad_norm": 0.4569462239742279, + "learning_rate": 0.0003795179677619376, + "loss": 0.5811, + "step": 19600 + }, + { + "epoch": 0.4364761396011396, + "grad_norm": 1.1307685375213623, + "learning_rate": 0.0003794974103312577, + "loss": 0.5775, + "step": 19610 + }, + { + "epoch": 0.43669871794871795, + "grad_norm": 1.057785153388977, + "learning_rate": 0.00037947684314654164, + "loss": 0.7388, + "step": 19620 + }, + { + "epoch": 0.4369212962962963, + "grad_norm": 0.6690455079078674, + "learning_rate": 0.000379456266208907, + "loss": 0.5863, + "step": 19630 + }, + { + "epoch": 0.43714387464387466, + "grad_norm": 0.9382486939430237, + "learning_rate": 0.00037943567951947196, + "loss": 0.5357, + "step": 19640 + }, + { + "epoch": 0.437366452991453, + "grad_norm": 0.6697343587875366, + "learning_rate": 0.00037941508307935516, + "loss": 0.5759, + "step": 19650 + }, + { + "epoch": 0.4375890313390313, + "grad_norm": 0.6135135889053345, + "learning_rate": 0.0003793944768896759, + "loss": 0.6192, + "step": 19660 + }, + { + "epoch": 0.4378116096866097, + "grad_norm": 0.7405604124069214, + "learning_rate": 0.0003793738609515539, + "loss": 0.6352, + "step": 19670 + }, + { + "epoch": 0.43803418803418803, + "grad_norm": 0.7242349982261658, + "learning_rate": 0.0003793532352661094, + "loss": 0.6406, + "step": 19680 + }, + { + "epoch": 0.43825676638176636, + "grad_norm": 0.5112965106964111, + "learning_rate": 0.0003793325998344633, + "loss": 0.5229, + "step": 19690 + }, + { + "epoch": 0.43847934472934474, + "grad_norm": 0.573747992515564, + "learning_rate": 0.000379311954657737, + "loss": 0.6026, + "step": 19700 + }, + { + "epoch": 0.4387019230769231, + "grad_norm": 0.6106606721878052, + "learning_rate": 0.00037929129973705215, + "loss": 0.5305, + "step": 19710 + }, + { + "epoch": 0.4389245014245014, + "grad_norm": 0.6479077935218811, + "learning_rate": 0.0003792706350735314, + "loss": 0.5524, + "step": 19720 + }, + { + "epoch": 0.4391470797720798, + "grad_norm": 0.43118560314178467, + "learning_rate": 0.00037924996066829753, + "loss": 0.5813, + "step": 19730 + }, + { + "epoch": 0.4393696581196581, + "grad_norm": 1.0099085569381714, + "learning_rate": 0.0003792292765224741, + "loss": 0.6683, + "step": 19740 + }, + { + "epoch": 0.43959223646723644, + "grad_norm": 0.7920699119567871, + "learning_rate": 0.00037920858263718504, + "loss": 0.6348, + "step": 19750 + }, + { + "epoch": 0.4398148148148148, + "grad_norm": 0.7661017179489136, + "learning_rate": 0.0003791878790135549, + "loss": 0.5387, + "step": 19760 + }, + { + "epoch": 0.44003739316239315, + "grad_norm": 0.7241034507751465, + "learning_rate": 0.0003791671656527087, + "loss": 0.5588, + "step": 19770 + }, + { + "epoch": 0.44025997150997154, + "grad_norm": 0.8879605531692505, + "learning_rate": 0.0003791464425557721, + "loss": 0.7116, + "step": 19780 + }, + { + "epoch": 0.44048254985754987, + "grad_norm": 0.7457120418548584, + "learning_rate": 0.00037912570972387116, + "loss": 0.6061, + "step": 19790 + }, + { + "epoch": 0.4407051282051282, + "grad_norm": 0.49138614535331726, + "learning_rate": 0.0003791049671581324, + "loss": 0.4982, + "step": 19800 + }, + { + "epoch": 0.4409277065527066, + "grad_norm": 0.669255256652832, + "learning_rate": 0.0003790842148596832, + "loss": 0.5541, + "step": 19810 + }, + { + "epoch": 0.4411502849002849, + "grad_norm": 0.554853081703186, + "learning_rate": 0.0003790634528296511, + "loss": 0.6385, + "step": 19820 + }, + { + "epoch": 0.44137286324786323, + "grad_norm": 0.49231138825416565, + "learning_rate": 0.00037904268106916445, + "loss": 0.5486, + "step": 19830 + }, + { + "epoch": 0.4415954415954416, + "grad_norm": 0.8338886499404907, + "learning_rate": 0.00037902189957935193, + "loss": 0.6802, + "step": 19840 + }, + { + "epoch": 0.44181801994301995, + "grad_norm": 0.6095072031021118, + "learning_rate": 0.0003790011083613428, + "loss": 0.5594, + "step": 19850 + }, + { + "epoch": 0.4420405982905983, + "grad_norm": 0.4810936450958252, + "learning_rate": 0.00037898030741626693, + "loss": 0.6117, + "step": 19860 + }, + { + "epoch": 0.44226317663817666, + "grad_norm": 0.42109841108322144, + "learning_rate": 0.0003789594967452546, + "loss": 0.5319, + "step": 19870 + }, + { + "epoch": 0.442485754985755, + "grad_norm": 0.9655678272247314, + "learning_rate": 0.00037893867634943674, + "loss": 0.6568, + "step": 19880 + }, + { + "epoch": 0.4427083333333333, + "grad_norm": 0.5279555916786194, + "learning_rate": 0.0003789178462299447, + "loss": 0.5831, + "step": 19890 + }, + { + "epoch": 0.4429309116809117, + "grad_norm": 0.8120352625846863, + "learning_rate": 0.0003788970063879105, + "loss": 0.7757, + "step": 19900 + }, + { + "epoch": 0.44315349002849, + "grad_norm": 0.9156867861747742, + "learning_rate": 0.0003788761568244664, + "loss": 0.6598, + "step": 19910 + }, + { + "epoch": 0.44337606837606836, + "grad_norm": 0.3847975432872772, + "learning_rate": 0.0003788552975407456, + "loss": 0.5608, + "step": 19920 + }, + { + "epoch": 0.44359864672364674, + "grad_norm": 0.8960539102554321, + "learning_rate": 0.0003788344285378815, + "loss": 0.677, + "step": 19930 + }, + { + "epoch": 0.44382122507122507, + "grad_norm": 0.9364832043647766, + "learning_rate": 0.0003788135498170081, + "loss": 0.8056, + "step": 19940 + }, + { + "epoch": 0.4440438034188034, + "grad_norm": 0.7676774859428406, + "learning_rate": 0.00037879266137926003, + "loss": 0.6854, + "step": 19950 + }, + { + "epoch": 0.4442663817663818, + "grad_norm": 0.7598232626914978, + "learning_rate": 0.00037877176322577243, + "loss": 0.6803, + "step": 19960 + }, + { + "epoch": 0.4444889601139601, + "grad_norm": 1.0017249584197998, + "learning_rate": 0.00037875085535768086, + "loss": 0.6141, + "step": 19970 + }, + { + "epoch": 0.44471153846153844, + "grad_norm": 0.8074401021003723, + "learning_rate": 0.00037872993777612147, + "loss": 0.6689, + "step": 19980 + }, + { + "epoch": 0.4449341168091168, + "grad_norm": 0.5603516697883606, + "learning_rate": 0.0003787090104822309, + "loss": 0.4914, + "step": 19990 + }, + { + "epoch": 0.44515669515669515, + "grad_norm": 0.6445973515510559, + "learning_rate": 0.0003786880734771464, + "loss": 0.6741, + "step": 20000 + }, + { + "epoch": 0.44537927350427353, + "grad_norm": 0.7388302087783813, + "learning_rate": 0.00037866712676200574, + "loss": 0.6186, + "step": 20010 + }, + { + "epoch": 0.44560185185185186, + "grad_norm": 0.6099076867103577, + "learning_rate": 0.00037864617033794715, + "loss": 0.475, + "step": 20020 + }, + { + "epoch": 0.4458244301994302, + "grad_norm": 0.7826319336891174, + "learning_rate": 0.00037862520420610943, + "loss": 0.6261, + "step": 20030 + }, + { + "epoch": 0.44604700854700857, + "grad_norm": 0.5059393048286438, + "learning_rate": 0.0003786042283676319, + "loss": 0.677, + "step": 20040 + }, + { + "epoch": 0.4462695868945869, + "grad_norm": 0.6359015703201294, + "learning_rate": 0.00037858324282365435, + "loss": 0.6867, + "step": 20050 + }, + { + "epoch": 0.44649216524216523, + "grad_norm": 0.5743014812469482, + "learning_rate": 0.00037856224757531717, + "loss": 0.567, + "step": 20060 + }, + { + "epoch": 0.4467147435897436, + "grad_norm": 0.5873332023620605, + "learning_rate": 0.00037854124262376134, + "loss": 0.5609, + "step": 20070 + }, + { + "epoch": 0.44693732193732194, + "grad_norm": 0.6396929025650024, + "learning_rate": 0.0003785202279701282, + "loss": 0.5618, + "step": 20080 + }, + { + "epoch": 0.44715990028490027, + "grad_norm": 0.8452794551849365, + "learning_rate": 0.00037849920361555966, + "loss": 0.5362, + "step": 20090 + }, + { + "epoch": 0.44738247863247865, + "grad_norm": 1.1678051948547363, + "learning_rate": 0.0003784781695611983, + "loss": 0.6013, + "step": 20100 + }, + { + "epoch": 0.447605056980057, + "grad_norm": 0.6501947045326233, + "learning_rate": 0.0003784571258081871, + "loss": 0.5056, + "step": 20110 + }, + { + "epoch": 0.4478276353276353, + "grad_norm": 0.7473871111869812, + "learning_rate": 0.00037843607235766967, + "loss": 0.6463, + "step": 20120 + }, + { + "epoch": 0.4480502136752137, + "grad_norm": 0.4127056300640106, + "learning_rate": 0.00037841500921078996, + "loss": 0.6582, + "step": 20130 + }, + { + "epoch": 0.448272792022792, + "grad_norm": 0.7116015553474426, + "learning_rate": 0.0003783939363686925, + "loss": 0.6459, + "step": 20140 + }, + { + "epoch": 0.44849537037037035, + "grad_norm": 0.6783828735351562, + "learning_rate": 0.0003783728538325226, + "loss": 0.6952, + "step": 20150 + }, + { + "epoch": 0.44871794871794873, + "grad_norm": 0.46691304445266724, + "learning_rate": 0.0003783517616034258, + "loss": 0.7028, + "step": 20160 + }, + { + "epoch": 0.44894052706552706, + "grad_norm": 0.6711305379867554, + "learning_rate": 0.00037833065968254824, + "loss": 0.5297, + "step": 20170 + }, + { + "epoch": 0.4491631054131054, + "grad_norm": 0.856510579586029, + "learning_rate": 0.00037830954807103665, + "loss": 0.7389, + "step": 20180 + }, + { + "epoch": 0.4493856837606838, + "grad_norm": 0.6568981409072876, + "learning_rate": 0.0003782884267700382, + "loss": 0.7781, + "step": 20190 + }, + { + "epoch": 0.4496082621082621, + "grad_norm": 1.019824743270874, + "learning_rate": 0.00037826729578070077, + "loss": 0.585, + "step": 20200 + }, + { + "epoch": 0.44983084045584043, + "grad_norm": 0.9498888254165649, + "learning_rate": 0.0003782461551041725, + "loss": 0.7491, + "step": 20210 + }, + { + "epoch": 0.4500534188034188, + "grad_norm": 1.014466643333435, + "learning_rate": 0.0003782250047416023, + "loss": 0.588, + "step": 20220 + }, + { + "epoch": 0.45027599715099714, + "grad_norm": 0.5617772340774536, + "learning_rate": 0.00037820384469413937, + "loss": 0.5928, + "step": 20230 + }, + { + "epoch": 0.45049857549857547, + "grad_norm": 0.5865727066993713, + "learning_rate": 0.0003781826749629336, + "loss": 0.7415, + "step": 20240 + }, + { + "epoch": 0.45072115384615385, + "grad_norm": 0.8064214587211609, + "learning_rate": 0.0003781614955491355, + "loss": 0.5553, + "step": 20250 + }, + { + "epoch": 0.4509437321937322, + "grad_norm": 0.5541604161262512, + "learning_rate": 0.00037814030645389585, + "loss": 0.5791, + "step": 20260 + }, + { + "epoch": 0.45116631054131057, + "grad_norm": 0.4761522710323334, + "learning_rate": 0.00037811910767836606, + "loss": 0.5813, + "step": 20270 + }, + { + "epoch": 0.4513888888888889, + "grad_norm": 0.5106942057609558, + "learning_rate": 0.0003780978992236982, + "loss": 0.5916, + "step": 20280 + }, + { + "epoch": 0.4516114672364672, + "grad_norm": 0.5374470949172974, + "learning_rate": 0.0003780766810910447, + "loss": 0.686, + "step": 20290 + }, + { + "epoch": 0.4518340455840456, + "grad_norm": 0.6303367018699646, + "learning_rate": 0.0003780554532815586, + "loss": 0.6568, + "step": 20300 + }, + { + "epoch": 0.45205662393162394, + "grad_norm": 0.7026957273483276, + "learning_rate": 0.0003780342157963933, + "loss": 0.6484, + "step": 20310 + }, + { + "epoch": 0.45227920227920226, + "grad_norm": 0.6824941635131836, + "learning_rate": 0.00037801296863670307, + "loss": 0.5139, + "step": 20320 + }, + { + "epoch": 0.45250178062678065, + "grad_norm": 0.8238844275474548, + "learning_rate": 0.00037799171180364233, + "loss": 0.7948, + "step": 20330 + }, + { + "epoch": 0.452724358974359, + "grad_norm": 0.5989433526992798, + "learning_rate": 0.0003779704452983663, + "loss": 0.6106, + "step": 20340 + }, + { + "epoch": 0.4529469373219373, + "grad_norm": 0.7117564678192139, + "learning_rate": 0.00037794916912203054, + "loss": 0.6437, + "step": 20350 + }, + { + "epoch": 0.4531695156695157, + "grad_norm": 1.0147078037261963, + "learning_rate": 0.00037792788327579134, + "loss": 0.7429, + "step": 20360 + }, + { + "epoch": 0.453392094017094, + "grad_norm": 0.8484946489334106, + "learning_rate": 0.0003779065877608052, + "loss": 0.6691, + "step": 20370 + }, + { + "epoch": 0.45361467236467234, + "grad_norm": 0.8865606188774109, + "learning_rate": 0.0003778852825782295, + "loss": 0.6478, + "step": 20380 + }, + { + "epoch": 0.45383725071225073, + "grad_norm": 0.6782509088516235, + "learning_rate": 0.0003778639677292219, + "loss": 0.5474, + "step": 20390 + }, + { + "epoch": 0.45405982905982906, + "grad_norm": 0.82992023229599, + "learning_rate": 0.00037784264321494065, + "loss": 0.6493, + "step": 20400 + }, + { + "epoch": 0.4542824074074074, + "grad_norm": 0.6378486752510071, + "learning_rate": 0.00037782130903654465, + "loss": 0.5903, + "step": 20410 + }, + { + "epoch": 0.45450498575498577, + "grad_norm": 0.561734676361084, + "learning_rate": 0.00037779996519519314, + "loss": 1.501, + "step": 20420 + }, + { + "epoch": 0.4547275641025641, + "grad_norm": 0.5857129693031311, + "learning_rate": 0.0003777786116920459, + "loss": 0.756, + "step": 20430 + }, + { + "epoch": 0.4549501424501424, + "grad_norm": 0.8134432435035706, + "learning_rate": 0.00037775724852826345, + "loss": 0.6615, + "step": 20440 + }, + { + "epoch": 0.4551727207977208, + "grad_norm": 0.6562219858169556, + "learning_rate": 0.00037773587570500653, + "loss": 0.6501, + "step": 20450 + }, + { + "epoch": 0.45539529914529914, + "grad_norm": 0.7692068219184875, + "learning_rate": 0.00037771449322343667, + "loss": 0.6392, + "step": 20460 + }, + { + "epoch": 0.45561787749287747, + "grad_norm": 0.7109014391899109, + "learning_rate": 0.00037769310108471576, + "loss": 0.6475, + "step": 20470 + }, + { + "epoch": 0.45584045584045585, + "grad_norm": 0.961484432220459, + "learning_rate": 0.0003776716992900062, + "loss": 0.6645, + "step": 20480 + }, + { + "epoch": 0.4560630341880342, + "grad_norm": 0.5773506164550781, + "learning_rate": 0.0003776502878404712, + "loss": 0.6764, + "step": 20490 + }, + { + "epoch": 0.45628561253561256, + "grad_norm": 0.8350430727005005, + "learning_rate": 0.00037762886673727394, + "loss": 0.5623, + "step": 20500 + }, + { + "epoch": 0.4565081908831909, + "grad_norm": 0.6372119188308716, + "learning_rate": 0.00037760743598157877, + "loss": 0.7378, + "step": 20510 + }, + { + "epoch": 0.4567307692307692, + "grad_norm": 0.31066614389419556, + "learning_rate": 0.0003775859955745501, + "loss": 0.5992, + "step": 20520 + }, + { + "epoch": 0.4569533475783476, + "grad_norm": 0.5898124575614929, + "learning_rate": 0.00037756454551735307, + "loss": 0.7565, + "step": 20530 + }, + { + "epoch": 0.45717592592592593, + "grad_norm": 0.5064905881881714, + "learning_rate": 0.00037754308581115325, + "loss": 0.7678, + "step": 20540 + }, + { + "epoch": 0.45739850427350426, + "grad_norm": 2.300577402114868, + "learning_rate": 0.00037752161645711676, + "loss": 0.6805, + "step": 20550 + }, + { + "epoch": 0.45762108262108264, + "grad_norm": 1.0232763290405273, + "learning_rate": 0.0003775001374564104, + "loss": 0.6008, + "step": 20560 + }, + { + "epoch": 0.45784366096866097, + "grad_norm": 0.8136579394340515, + "learning_rate": 0.0003774786488102012, + "loss": 0.7021, + "step": 20570 + }, + { + "epoch": 0.4580662393162393, + "grad_norm": 0.9268751740455627, + "learning_rate": 0.000377457150519657, + "loss": 0.7578, + "step": 20580 + }, + { + "epoch": 0.4582888176638177, + "grad_norm": 1.078420639038086, + "learning_rate": 0.0003774356425859459, + "loss": 0.5974, + "step": 20590 + }, + { + "epoch": 0.458511396011396, + "grad_norm": 0.8404714465141296, + "learning_rate": 0.00037741412501023677, + "loss": 0.6292, + "step": 20600 + }, + { + "epoch": 0.45873397435897434, + "grad_norm": 0.708867073059082, + "learning_rate": 0.00037739259779369876, + "loss": 0.7109, + "step": 20610 + }, + { + "epoch": 0.4589565527065527, + "grad_norm": 0.5947948098182678, + "learning_rate": 0.0003773710609375019, + "loss": 0.6681, + "step": 20620 + }, + { + "epoch": 0.45917913105413105, + "grad_norm": 0.6741048693656921, + "learning_rate": 0.0003773495144428163, + "loss": 0.6094, + "step": 20630 + }, + { + "epoch": 0.4594017094017094, + "grad_norm": 1.0347967147827148, + "learning_rate": 0.000377327958310813, + "loss": 0.6386, + "step": 20640 + }, + { + "epoch": 0.45962428774928776, + "grad_norm": 1.3338631391525269, + "learning_rate": 0.00037730639254266314, + "loss": 0.7083, + "step": 20650 + }, + { + "epoch": 0.4598468660968661, + "grad_norm": 0.883495032787323, + "learning_rate": 0.0003772848171395388, + "loss": 0.7422, + "step": 20660 + }, + { + "epoch": 0.4600694444444444, + "grad_norm": 0.734618604183197, + "learning_rate": 0.0003772632321026124, + "loss": 0.5737, + "step": 20670 + }, + { + "epoch": 0.4602920227920228, + "grad_norm": 0.8266713619232178, + "learning_rate": 0.0003772416374330568, + "loss": 0.8087, + "step": 20680 + }, + { + "epoch": 0.46051460113960113, + "grad_norm": 0.7156257033348083, + "learning_rate": 0.00037722003313204555, + "loss": 0.7197, + "step": 20690 + }, + { + "epoch": 0.46073717948717946, + "grad_norm": 0.5872949957847595, + "learning_rate": 0.00037719841920075265, + "loss": 0.5454, + "step": 20700 + }, + { + "epoch": 0.46095975783475784, + "grad_norm": 0.6012685894966125, + "learning_rate": 0.0003771767956403526, + "loss": 0.7516, + "step": 20710 + }, + { + "epoch": 0.46118233618233617, + "grad_norm": 0.5311578512191772, + "learning_rate": 0.00037715516245202037, + "loss": 0.5725, + "step": 20720 + }, + { + "epoch": 0.46140491452991456, + "grad_norm": 0.6649594306945801, + "learning_rate": 0.0003771335196369316, + "loss": 0.7018, + "step": 20730 + }, + { + "epoch": 0.4616274928774929, + "grad_norm": 0.8509739637374878, + "learning_rate": 0.0003771118671962624, + "loss": 0.7174, + "step": 20740 + }, + { + "epoch": 0.4618500712250712, + "grad_norm": 0.6986957788467407, + "learning_rate": 0.00037709020513118933, + "loss": 0.6416, + "step": 20750 + }, + { + "epoch": 0.4620726495726496, + "grad_norm": 0.48745501041412354, + "learning_rate": 0.00037706853344288957, + "loss": 0.6395, + "step": 20760 + }, + { + "epoch": 0.4622952279202279, + "grad_norm": 0.7349006533622742, + "learning_rate": 0.0003770468521325407, + "loss": 0.6701, + "step": 20770 + }, + { + "epoch": 0.46251780626780625, + "grad_norm": 0.9265018105506897, + "learning_rate": 0.0003770251612013209, + "loss": 0.7558, + "step": 20780 + }, + { + "epoch": 0.46274038461538464, + "grad_norm": 0.7990279197692871, + "learning_rate": 0.00037700346065040903, + "loss": 0.7709, + "step": 20790 + }, + { + "epoch": 0.46296296296296297, + "grad_norm": 0.7279727458953857, + "learning_rate": 0.0003769817504809842, + "loss": 0.7482, + "step": 20800 + }, + { + "epoch": 0.4631855413105413, + "grad_norm": 0.5113296508789062, + "learning_rate": 0.0003769600306942261, + "loss": 0.6341, + "step": 20810 + }, + { + "epoch": 0.4634081196581197, + "grad_norm": 0.6591224074363708, + "learning_rate": 0.0003769383012913151, + "loss": 0.5927, + "step": 20820 + }, + { + "epoch": 0.463630698005698, + "grad_norm": 0.9548632502555847, + "learning_rate": 0.00037691656227343195, + "loss": 0.6021, + "step": 20830 + }, + { + "epoch": 0.46385327635327633, + "grad_norm": 0.8181374669075012, + "learning_rate": 0.000376894813641758, + "loss": 0.6352, + "step": 20840 + }, + { + "epoch": 0.4640758547008547, + "grad_norm": 0.7255536317825317, + "learning_rate": 0.00037687305539747497, + "loss": 0.5023, + "step": 20850 + }, + { + "epoch": 0.46429843304843305, + "grad_norm": 0.8241360783576965, + "learning_rate": 0.00037685128754176545, + "loss": 0.588, + "step": 20860 + }, + { + "epoch": 0.4645210113960114, + "grad_norm": 0.7465112805366516, + "learning_rate": 0.00037682951007581207, + "loss": 0.698, + "step": 20870 + }, + { + "epoch": 0.46474358974358976, + "grad_norm": 0.5946525931358337, + "learning_rate": 0.00037680772300079844, + "loss": 0.5726, + "step": 20880 + }, + { + "epoch": 0.4649661680911681, + "grad_norm": 0.7747005820274353, + "learning_rate": 0.00037678592631790837, + "loss": 0.706, + "step": 20890 + }, + { + "epoch": 0.4651887464387464, + "grad_norm": 1.039262056350708, + "learning_rate": 0.00037676412002832633, + "loss": 0.6516, + "step": 20900 + }, + { + "epoch": 0.4654113247863248, + "grad_norm": 0.7734355330467224, + "learning_rate": 0.0003767423041332373, + "loss": 0.581, + "step": 20910 + }, + { + "epoch": 0.4656339031339031, + "grad_norm": 0.8960153460502625, + "learning_rate": 0.0003767204786338268, + "loss": 0.6354, + "step": 20920 + }, + { + "epoch": 0.46585648148148145, + "grad_norm": 0.6276922225952148, + "learning_rate": 0.0003766986435312808, + "loss": 0.6693, + "step": 20930 + }, + { + "epoch": 0.46607905982905984, + "grad_norm": 0.7944068908691406, + "learning_rate": 0.00037667679882678586, + "loss": 0.6442, + "step": 20940 + }, + { + "epoch": 0.46630163817663817, + "grad_norm": 0.7230113744735718, + "learning_rate": 0.000376654944521529, + "loss": 0.6328, + "step": 20950 + }, + { + "epoch": 0.46652421652421655, + "grad_norm": 0.5475065112113953, + "learning_rate": 0.0003766330806166979, + "loss": 0.6933, + "step": 20960 + }, + { + "epoch": 0.4667467948717949, + "grad_norm": 0.6445038318634033, + "learning_rate": 0.00037661120711348056, + "loss": 0.6437, + "step": 20970 + }, + { + "epoch": 0.4669693732193732, + "grad_norm": 0.6793280839920044, + "learning_rate": 0.0003765893240130657, + "loss": 0.6109, + "step": 20980 + }, + { + "epoch": 0.4671919515669516, + "grad_norm": 0.7854118347167969, + "learning_rate": 0.00037656743131664236, + "loss": 0.576, + "step": 20990 + }, + { + "epoch": 0.4674145299145299, + "grad_norm": 0.6892343759536743, + "learning_rate": 0.00037654552902540025, + "loss": 0.7005, + "step": 21000 + }, + { + "epoch": 0.46763710826210825, + "grad_norm": 0.7508336901664734, + "learning_rate": 0.0003765236171405296, + "loss": 0.5504, + "step": 21010 + }, + { + "epoch": 0.46785968660968663, + "grad_norm": 0.4559895098209381, + "learning_rate": 0.0003765016956632211, + "loss": 0.4633, + "step": 21020 + }, + { + "epoch": 0.46808226495726496, + "grad_norm": 0.7084289789199829, + "learning_rate": 0.00037647976459466594, + "loss": 0.7449, + "step": 21030 + }, + { + "epoch": 0.4683048433048433, + "grad_norm": 0.720272958278656, + "learning_rate": 0.0003764578239360559, + "loss": 0.5805, + "step": 21040 + }, + { + "epoch": 0.46852742165242167, + "grad_norm": 0.8112165331840515, + "learning_rate": 0.00037643587368858323, + "loss": 0.5505, + "step": 21050 + }, + { + "epoch": 0.46875, + "grad_norm": 0.4571852385997772, + "learning_rate": 0.00037641391385344076, + "loss": 0.6441, + "step": 21060 + }, + { + "epoch": 0.46897257834757833, + "grad_norm": 0.8824451565742493, + "learning_rate": 0.0003763919444318218, + "loss": 0.615, + "step": 21070 + }, + { + "epoch": 0.4691951566951567, + "grad_norm": 1.367150068283081, + "learning_rate": 0.0003763699654249202, + "loss": 0.5973, + "step": 21080 + }, + { + "epoch": 0.46941773504273504, + "grad_norm": 0.9916679263114929, + "learning_rate": 0.0003763479768339303, + "loss": 0.7755, + "step": 21090 + }, + { + "epoch": 0.46964031339031337, + "grad_norm": 0.6311511397361755, + "learning_rate": 0.0003763259786600469, + "loss": 0.6311, + "step": 21100 + }, + { + "epoch": 0.46986289173789175, + "grad_norm": 1.2279185056686401, + "learning_rate": 0.0003763039709044655, + "loss": 0.6385, + "step": 21110 + }, + { + "epoch": 0.4700854700854701, + "grad_norm": 0.7499618530273438, + "learning_rate": 0.00037628195356838204, + "loss": 0.5764, + "step": 21120 + }, + { + "epoch": 0.4703080484330484, + "grad_norm": 0.7156887054443359, + "learning_rate": 0.0003762599266529929, + "loss": 0.56, + "step": 21130 + }, + { + "epoch": 0.4705306267806268, + "grad_norm": 0.7908844351768494, + "learning_rate": 0.000376237890159495, + "loss": 0.7162, + "step": 21140 + }, + { + "epoch": 0.4707532051282051, + "grad_norm": 0.8906026482582092, + "learning_rate": 0.00037621584408908596, + "loss": 0.6661, + "step": 21150 + }, + { + "epoch": 0.47097578347578345, + "grad_norm": 0.7481372356414795, + "learning_rate": 0.0003761937884429636, + "loss": 0.6073, + "step": 21160 + }, + { + "epoch": 0.47119836182336183, + "grad_norm": 0.6544264554977417, + "learning_rate": 0.0003761717232223266, + "loss": 0.6206, + "step": 21170 + }, + { + "epoch": 0.47142094017094016, + "grad_norm": 1.5991239547729492, + "learning_rate": 0.0003761496484283739, + "loss": 0.6163, + "step": 21180 + }, + { + "epoch": 0.47164351851851855, + "grad_norm": 0.9703873991966248, + "learning_rate": 0.00037612756406230514, + "loss": 0.6135, + "step": 21190 + }, + { + "epoch": 0.4718660968660969, + "grad_norm": 0.8297857046127319, + "learning_rate": 0.0003761054701253204, + "loss": 0.558, + "step": 21200 + }, + { + "epoch": 0.4720886752136752, + "grad_norm": 0.471038281917572, + "learning_rate": 0.00037608336661862016, + "loss": 0.5461, + "step": 21210 + }, + { + "epoch": 0.4723112535612536, + "grad_norm": 0.362981915473938, + "learning_rate": 0.00037606125354340563, + "loss": 0.543, + "step": 21220 + }, + { + "epoch": 0.4725338319088319, + "grad_norm": 0.7584346532821655, + "learning_rate": 0.0003760391309008785, + "loss": 0.6135, + "step": 21230 + }, + { + "epoch": 0.47275641025641024, + "grad_norm": 0.6571023464202881, + "learning_rate": 0.0003760169986922409, + "loss": 0.6293, + "step": 21240 + }, + { + "epoch": 0.4729789886039886, + "grad_norm": 0.5312460660934448, + "learning_rate": 0.00037599485691869544, + "loss": 0.665, + "step": 21250 + }, + { + "epoch": 0.47320156695156695, + "grad_norm": 0.767343282699585, + "learning_rate": 0.00037597270558144545, + "loss": 0.5404, + "step": 21260 + }, + { + "epoch": 0.4734241452991453, + "grad_norm": 1.1337848901748657, + "learning_rate": 0.00037595054468169455, + "loss": 0.615, + "step": 21270 + }, + { + "epoch": 0.47364672364672367, + "grad_norm": 0.6291069388389587, + "learning_rate": 0.00037592837422064697, + "loss": 0.6336, + "step": 21280 + }, + { + "epoch": 0.473869301994302, + "grad_norm": 0.9171074032783508, + "learning_rate": 0.0003759061941995075, + "loss": 0.614, + "step": 21290 + }, + { + "epoch": 0.4740918803418803, + "grad_norm": 0.6373035907745361, + "learning_rate": 0.0003758840046194815, + "loss": 0.678, + "step": 21300 + }, + { + "epoch": 0.4743144586894587, + "grad_norm": 0.6766063570976257, + "learning_rate": 0.00037586180548177466, + "loss": 0.7444, + "step": 21310 + }, + { + "epoch": 0.47453703703703703, + "grad_norm": 0.6747170090675354, + "learning_rate": 0.00037583959678759335, + "loss": 0.706, + "step": 21320 + }, + { + "epoch": 0.47475961538461536, + "grad_norm": 0.6572831273078918, + "learning_rate": 0.0003758173785381445, + "loss": 0.6419, + "step": 21330 + }, + { + "epoch": 0.47498219373219375, + "grad_norm": 0.773108720779419, + "learning_rate": 0.0003757951507346352, + "loss": 0.6609, + "step": 21340 + }, + { + "epoch": 0.4752047720797721, + "grad_norm": 0.5571287274360657, + "learning_rate": 0.0003757729133782736, + "loss": 0.4853, + "step": 21350 + }, + { + "epoch": 0.4754273504273504, + "grad_norm": 0.6754597425460815, + "learning_rate": 0.000375750666470268, + "loss": 0.6991, + "step": 21360 + }, + { + "epoch": 0.4756499287749288, + "grad_norm": 0.7020809650421143, + "learning_rate": 0.00037572841001182726, + "loss": 0.8095, + "step": 21370 + }, + { + "epoch": 0.4758725071225071, + "grad_norm": 0.7230724692344666, + "learning_rate": 0.0003757061440041609, + "loss": 0.7263, + "step": 21380 + }, + { + "epoch": 0.47609508547008544, + "grad_norm": 0.7014229893684387, + "learning_rate": 0.00037568386844847885, + "loss": 0.6969, + "step": 21390 + }, + { + "epoch": 0.47631766381766383, + "grad_norm": 0.5396464467048645, + "learning_rate": 0.0003756615833459915, + "loss": 0.6536, + "step": 21400 + }, + { + "epoch": 0.47654024216524216, + "grad_norm": 0.8703533411026001, + "learning_rate": 0.0003756392886979099, + "loss": 0.656, + "step": 21410 + }, + { + "epoch": 0.47676282051282054, + "grad_norm": 0.8160148859024048, + "learning_rate": 0.00037561698450544565, + "loss": 0.7003, + "step": 21420 + }, + { + "epoch": 0.47698539886039887, + "grad_norm": 0.8830581307411194, + "learning_rate": 0.0003755946707698106, + "loss": 0.656, + "step": 21430 + }, + { + "epoch": 0.4772079772079772, + "grad_norm": 0.7755318880081177, + "learning_rate": 0.0003755723474922175, + "loss": 0.5179, + "step": 21440 + }, + { + "epoch": 0.4774305555555556, + "grad_norm": 0.8805164694786072, + "learning_rate": 0.00037555001467387924, + "loss": 0.6699, + "step": 21450 + }, + { + "epoch": 0.4776531339031339, + "grad_norm": 0.4396213889122009, + "learning_rate": 0.0003755276723160095, + "loss": 0.4509, + "step": 21460 + }, + { + "epoch": 0.47787571225071224, + "grad_norm": 0.5609117746353149, + "learning_rate": 0.00037550532041982234, + "loss": 0.5504, + "step": 21470 + }, + { + "epoch": 0.4780982905982906, + "grad_norm": 0.6986370086669922, + "learning_rate": 0.0003754829589865324, + "loss": 0.5165, + "step": 21480 + }, + { + "epoch": 0.47832086894586895, + "grad_norm": 0.6412321925163269, + "learning_rate": 0.0003754605880173548, + "loss": 0.6239, + "step": 21490 + }, + { + "epoch": 0.4785434472934473, + "grad_norm": 0.6213876008987427, + "learning_rate": 0.0003754382075135052, + "loss": 0.4858, + "step": 21500 + }, + { + "epoch": 0.47876602564102566, + "grad_norm": 0.400880366563797, + "learning_rate": 0.0003754158174761998, + "loss": 0.5655, + "step": 21510 + }, + { + "epoch": 0.478988603988604, + "grad_norm": 0.707527756690979, + "learning_rate": 0.0003753934179066552, + "loss": 0.7367, + "step": 21520 + }, + { + "epoch": 0.4792111823361823, + "grad_norm": 0.5736559629440308, + "learning_rate": 0.00037537100880608883, + "loss": 0.6368, + "step": 21530 + }, + { + "epoch": 0.4794337606837607, + "grad_norm": 0.7527413368225098, + "learning_rate": 0.00037534859017571815, + "loss": 0.6444, + "step": 21540 + }, + { + "epoch": 0.47965633903133903, + "grad_norm": 0.739428699016571, + "learning_rate": 0.00037532616201676165, + "loss": 0.6678, + "step": 21550 + }, + { + "epoch": 0.47987891737891736, + "grad_norm": 0.4493646025657654, + "learning_rate": 0.00037530372433043787, + "loss": 0.5731, + "step": 21560 + }, + { + "epoch": 0.48005698005698005, + "eval_loss": 0.6317010521888733, + "eval_runtime": 337.4466, + "eval_samples_per_second": 7.009, + "eval_steps_per_second": 7.009, + "step": 21568 + }, + { + "epoch": 0.48010149572649574, + "grad_norm": 0.8192841410636902, + "learning_rate": 0.00037528127711796626, + "loss": 0.627, + "step": 21570 + }, + { + "epoch": 0.48032407407407407, + "grad_norm": 0.7585240006446838, + "learning_rate": 0.00037525882038056654, + "loss": 0.6677, + "step": 21580 + }, + { + "epoch": 0.4805466524216524, + "grad_norm": 0.7832114696502686, + "learning_rate": 0.00037523635411945905, + "loss": 0.5631, + "step": 21590 + }, + { + "epoch": 0.4807692307692308, + "grad_norm": 0.7623310685157776, + "learning_rate": 0.0003752138783358646, + "loss": 0.5473, + "step": 21600 + }, + { + "epoch": 0.4809918091168091, + "grad_norm": 0.7951676249504089, + "learning_rate": 0.0003751913930310046, + "loss": 0.6501, + "step": 21610 + }, + { + "epoch": 0.48121438746438744, + "grad_norm": 0.6903308033943176, + "learning_rate": 0.00037516889820610086, + "loss": 0.7276, + "step": 21620 + }, + { + "epoch": 0.4814369658119658, + "grad_norm": 0.815481960773468, + "learning_rate": 0.00037514639386237585, + "loss": 0.6222, + "step": 21630 + }, + { + "epoch": 0.48165954415954415, + "grad_norm": 0.6189022064208984, + "learning_rate": 0.0003751238800010523, + "loss": 0.6608, + "step": 21640 + }, + { + "epoch": 0.48188212250712253, + "grad_norm": 0.8595947027206421, + "learning_rate": 0.0003751013566233538, + "loss": 0.6706, + "step": 21650 + }, + { + "epoch": 0.48210470085470086, + "grad_norm": 0.47164031863212585, + "learning_rate": 0.0003750788237305043, + "loss": 0.6553, + "step": 21660 + }, + { + "epoch": 0.4823272792022792, + "grad_norm": 0.5663688778877258, + "learning_rate": 0.00037505628132372817, + "loss": 0.4919, + "step": 21670 + }, + { + "epoch": 0.4825498575498576, + "grad_norm": 0.840472936630249, + "learning_rate": 0.00037503372940425036, + "loss": 0.6202, + "step": 21680 + }, + { + "epoch": 0.4827724358974359, + "grad_norm": 0.8134217262268066, + "learning_rate": 0.00037501116797329637, + "loss": 0.7261, + "step": 21690 + }, + { + "epoch": 0.48299501424501423, + "grad_norm": 0.6884470582008362, + "learning_rate": 0.0003749885970320923, + "loss": 0.5365, + "step": 21700 + }, + { + "epoch": 0.4832175925925926, + "grad_norm": 0.7177421450614929, + "learning_rate": 0.00037496601658186464, + "loss": 0.6229, + "step": 21710 + }, + { + "epoch": 0.48344017094017094, + "grad_norm": 0.7939632534980774, + "learning_rate": 0.0003749434266238404, + "loss": 0.5793, + "step": 21720 + }, + { + "epoch": 0.48366274928774927, + "grad_norm": 0.5567938089370728, + "learning_rate": 0.00037492082715924707, + "loss": 0.5883, + "step": 21730 + }, + { + "epoch": 0.48388532763532766, + "grad_norm": 0.47863462567329407, + "learning_rate": 0.0003748982181893129, + "loss": 0.5821, + "step": 21740 + }, + { + "epoch": 0.484107905982906, + "grad_norm": 0.6115350723266602, + "learning_rate": 0.0003748755997152663, + "loss": 0.5837, + "step": 21750 + }, + { + "epoch": 0.4843304843304843, + "grad_norm": 0.915977418422699, + "learning_rate": 0.0003748529717383365, + "loss": 0.5941, + "step": 21760 + }, + { + "epoch": 0.4845530626780627, + "grad_norm": 0.7119535803794861, + "learning_rate": 0.000374830334259753, + "loss": 0.5648, + "step": 21770 + }, + { + "epoch": 0.484775641025641, + "grad_norm": 1.0399959087371826, + "learning_rate": 0.00037480768728074605, + "loss": 0.6491, + "step": 21780 + }, + { + "epoch": 0.48499821937321935, + "grad_norm": 0.5946542620658875, + "learning_rate": 0.00037478503080254626, + "loss": 0.5447, + "step": 21790 + }, + { + "epoch": 0.48522079772079774, + "grad_norm": 0.6002269387245178, + "learning_rate": 0.00037476236482638487, + "loss": 0.5436, + "step": 21800 + }, + { + "epoch": 0.48544337606837606, + "grad_norm": 0.6677395701408386, + "learning_rate": 0.0003747396893534934, + "loss": 0.5574, + "step": 21810 + }, + { + "epoch": 0.4856659544159544, + "grad_norm": 0.7505663633346558, + "learning_rate": 0.00037471700438510426, + "loss": 0.5188, + "step": 21820 + }, + { + "epoch": 0.4858885327635328, + "grad_norm": 0.860384464263916, + "learning_rate": 0.0003746943099224501, + "loss": 1.6389, + "step": 21830 + }, + { + "epoch": 0.4861111111111111, + "grad_norm": 0.6575486660003662, + "learning_rate": 0.000374671605966764, + "loss": 0.6875, + "step": 21840 + }, + { + "epoch": 0.48633368945868943, + "grad_norm": 0.48352324962615967, + "learning_rate": 0.00037464889251927994, + "loss": 0.5916, + "step": 21850 + }, + { + "epoch": 0.4865562678062678, + "grad_norm": 1.2798899412155151, + "learning_rate": 0.00037462616958123206, + "loss": 0.6379, + "step": 21860 + }, + { + "epoch": 0.48677884615384615, + "grad_norm": 0.8500372767448425, + "learning_rate": 0.0003746034371538551, + "loss": 0.6326, + "step": 21870 + }, + { + "epoch": 0.48700142450142453, + "grad_norm": 0.6251464486122131, + "learning_rate": 0.0003745806952383845, + "loss": 0.5766, + "step": 21880 + }, + { + "epoch": 0.48722400284900286, + "grad_norm": 0.6444466710090637, + "learning_rate": 0.00037455794383605605, + "loss": 0.7005, + "step": 21890 + }, + { + "epoch": 0.4874465811965812, + "grad_norm": 0.8004457950592041, + "learning_rate": 0.00037453518294810594, + "loss": 0.6974, + "step": 21900 + }, + { + "epoch": 0.48766915954415957, + "grad_norm": 0.5639207363128662, + "learning_rate": 0.00037451241257577115, + "loss": 0.6421, + "step": 21910 + }, + { + "epoch": 0.4878917378917379, + "grad_norm": 0.6112554669380188, + "learning_rate": 0.00037448963272028896, + "loss": 0.5886, + "step": 21920 + }, + { + "epoch": 0.4881143162393162, + "grad_norm": 0.5895763635635376, + "learning_rate": 0.0003744668433828974, + "loss": 0.6137, + "step": 21930 + }, + { + "epoch": 0.4883368945868946, + "grad_norm": 0.8514500260353088, + "learning_rate": 0.0003744440445648346, + "loss": 0.5455, + "step": 21940 + }, + { + "epoch": 0.48855947293447294, + "grad_norm": 0.7488554120063782, + "learning_rate": 0.0003744212362673396, + "loss": 0.6121, + "step": 21950 + }, + { + "epoch": 0.48878205128205127, + "grad_norm": 0.49900588393211365, + "learning_rate": 0.000374398418491652, + "loss": 0.6145, + "step": 21960 + }, + { + "epoch": 0.48900462962962965, + "grad_norm": 0.7089706659317017, + "learning_rate": 0.00037437559123901145, + "loss": 0.5018, + "step": 21970 + }, + { + "epoch": 0.489227207977208, + "grad_norm": 0.6289607882499695, + "learning_rate": 0.00037435275451065854, + "loss": 0.6118, + "step": 21980 + }, + { + "epoch": 0.4894497863247863, + "grad_norm": 0.8315375447273254, + "learning_rate": 0.00037432990830783423, + "loss": 0.7574, + "step": 21990 + }, + { + "epoch": 0.4896723646723647, + "grad_norm": 0.8315839171409607, + "learning_rate": 0.00037430705263177995, + "loss": 0.4651, + "step": 22000 + }, + { + "epoch": 0.489894943019943, + "grad_norm": 0.6891462802886963, + "learning_rate": 0.0003742841874837378, + "loss": 0.6585, + "step": 22010 + }, + { + "epoch": 0.49011752136752135, + "grad_norm": 1.2439353466033936, + "learning_rate": 0.0003742613128649502, + "loss": 0.7067, + "step": 22020 + }, + { + "epoch": 0.49034009971509973, + "grad_norm": 0.7287011742591858, + "learning_rate": 0.00037423842877666016, + "loss": 0.6761, + "step": 22030 + }, + { + "epoch": 0.49056267806267806, + "grad_norm": 0.6492077708244324, + "learning_rate": 0.00037421553522011135, + "loss": 0.5644, + "step": 22040 + }, + { + "epoch": 0.4907852564102564, + "grad_norm": 0.6765930652618408, + "learning_rate": 0.00037419263219654763, + "loss": 0.5853, + "step": 22050 + }, + { + "epoch": 0.49100783475783477, + "grad_norm": 0.5932199358940125, + "learning_rate": 0.0003741697197072138, + "loss": 0.5416, + "step": 22060 + }, + { + "epoch": 0.4912304131054131, + "grad_norm": 0.9352262020111084, + "learning_rate": 0.0003741467977533547, + "loss": 0.771, + "step": 22070 + }, + { + "epoch": 0.49145299145299143, + "grad_norm": 0.56147700548172, + "learning_rate": 0.0003741238663362161, + "loss": 0.6866, + "step": 22080 + }, + { + "epoch": 0.4916755698005698, + "grad_norm": 0.7923839092254639, + "learning_rate": 0.00037410092545704405, + "loss": 0.746, + "step": 22090 + }, + { + "epoch": 0.49189814814814814, + "grad_norm": 1.0098870992660522, + "learning_rate": 0.00037407797511708517, + "loss": 0.5563, + "step": 22100 + }, + { + "epoch": 0.49212072649572647, + "grad_norm": 0.8181535005569458, + "learning_rate": 0.00037405501531758665, + "loss": 0.6467, + "step": 22110 + }, + { + "epoch": 0.49234330484330485, + "grad_norm": 0.8707475066184998, + "learning_rate": 0.0003740320460597961, + "loss": 0.676, + "step": 22120 + }, + { + "epoch": 0.4925658831908832, + "grad_norm": 0.7622846961021423, + "learning_rate": 0.0003740090673449617, + "loss": 0.5734, + "step": 22130 + }, + { + "epoch": 0.49278846153846156, + "grad_norm": 0.6305636763572693, + "learning_rate": 0.0003739860791743321, + "loss": 0.5678, + "step": 22140 + }, + { + "epoch": 0.4930110398860399, + "grad_norm": 0.5063610672950745, + "learning_rate": 0.0003739630815491566, + "loss": 0.419, + "step": 22150 + }, + { + "epoch": 0.4932336182336182, + "grad_norm": 0.8419811725616455, + "learning_rate": 0.0003739400744706848, + "loss": 0.7332, + "step": 22160 + }, + { + "epoch": 0.4934561965811966, + "grad_norm": 0.6084222197532654, + "learning_rate": 0.0003739170579401669, + "loss": 0.5931, + "step": 22170 + }, + { + "epoch": 0.49367877492877493, + "grad_norm": 0.8849558234214783, + "learning_rate": 0.00037389403195885374, + "loss": 0.6365, + "step": 22180 + }, + { + "epoch": 0.49390135327635326, + "grad_norm": 0.8881486058235168, + "learning_rate": 0.00037387099652799657, + "loss": 0.7062, + "step": 22190 + }, + { + "epoch": 0.49412393162393164, + "grad_norm": 0.7176637649536133, + "learning_rate": 0.0003738479516488471, + "loss": 0.6893, + "step": 22200 + }, + { + "epoch": 0.49434650997151, + "grad_norm": 0.6873642206192017, + "learning_rate": 0.00037382489732265756, + "loss": 0.7328, + "step": 22210 + }, + { + "epoch": 0.4945690883190883, + "grad_norm": 0.5525507926940918, + "learning_rate": 0.00037380183355068084, + "loss": 0.5907, + "step": 22220 + }, + { + "epoch": 0.4947916666666667, + "grad_norm": 0.7779015898704529, + "learning_rate": 0.00037377876033417015, + "loss": 0.6603, + "step": 22230 + }, + { + "epoch": 0.495014245014245, + "grad_norm": 0.9474874138832092, + "learning_rate": 0.0003737556776743794, + "loss": 0.7767, + "step": 22240 + }, + { + "epoch": 0.49523682336182334, + "grad_norm": 0.9156774878501892, + "learning_rate": 0.0003737325855725629, + "loss": 0.5673, + "step": 22250 + }, + { + "epoch": 0.4954594017094017, + "grad_norm": 0.9191159605979919, + "learning_rate": 0.00037370948402997545, + "loss": 0.7124, + "step": 22260 + }, + { + "epoch": 0.49568198005698005, + "grad_norm": 0.9254758358001709, + "learning_rate": 0.00037368637304787246, + "loss": 0.635, + "step": 22270 + }, + { + "epoch": 0.4959045584045584, + "grad_norm": 0.8832774758338928, + "learning_rate": 0.0003736632526275098, + "loss": 0.5771, + "step": 22280 + }, + { + "epoch": 0.49612713675213677, + "grad_norm": 0.8039012551307678, + "learning_rate": 0.0003736401227701437, + "loss": 0.657, + "step": 22290 + }, + { + "epoch": 0.4963497150997151, + "grad_norm": 0.8206833600997925, + "learning_rate": 0.00037361698347703127, + "loss": 0.5793, + "step": 22300 + }, + { + "epoch": 0.4965722934472934, + "grad_norm": 0.7622146010398865, + "learning_rate": 0.0003735938347494298, + "loss": 0.5886, + "step": 22310 + }, + { + "epoch": 0.4967948717948718, + "grad_norm": 0.3747854232788086, + "learning_rate": 0.0003735706765885973, + "loss": 0.7687, + "step": 22320 + }, + { + "epoch": 0.49701745014245013, + "grad_norm": 0.8601908087730408, + "learning_rate": 0.00037354750899579214, + "loss": 0.7588, + "step": 22330 + }, + { + "epoch": 0.49724002849002846, + "grad_norm": 0.4530029594898224, + "learning_rate": 0.00037352433197227315, + "loss": 0.7051, + "step": 22340 + }, + { + "epoch": 0.49746260683760685, + "grad_norm": 0.8396738171577454, + "learning_rate": 0.00037350114551930005, + "loss": 0.738, + "step": 22350 + }, + { + "epoch": 0.4976851851851852, + "grad_norm": 0.6398061513900757, + "learning_rate": 0.0003734779496381326, + "loss": 0.6236, + "step": 22360 + }, + { + "epoch": 0.49790776353276356, + "grad_norm": 0.7329898476600647, + "learning_rate": 0.0003734547443300313, + "loss": 0.7076, + "step": 22370 + }, + { + "epoch": 0.4981303418803419, + "grad_norm": 0.8609546422958374, + "learning_rate": 0.0003734315295962573, + "loss": 0.6196, + "step": 22380 + }, + { + "epoch": 0.4983529202279202, + "grad_norm": 0.746404230594635, + "learning_rate": 0.00037340830543807196, + "loss": 0.6858, + "step": 22390 + }, + { + "epoch": 0.4985754985754986, + "grad_norm": 0.4643799066543579, + "learning_rate": 0.0003733850718567373, + "loss": 0.583, + "step": 22400 + }, + { + "epoch": 0.4987980769230769, + "grad_norm": 0.7553014755249023, + "learning_rate": 0.00037336182885351594, + "loss": 0.661, + "step": 22410 + }, + { + "epoch": 0.49902065527065526, + "grad_norm": 0.5482897758483887, + "learning_rate": 0.0003733385764296709, + "loss": 0.7295, + "step": 22420 + }, + { + "epoch": 0.49924323361823364, + "grad_norm": 0.5720972418785095, + "learning_rate": 0.0003733153145864657, + "loss": 0.5406, + "step": 22430 + }, + { + "epoch": 0.49946581196581197, + "grad_norm": 0.8809942007064819, + "learning_rate": 0.0003732920433251644, + "loss": 0.8006, + "step": 22440 + }, + { + "epoch": 0.4996883903133903, + "grad_norm": 0.5236983895301819, + "learning_rate": 0.00037326876264703163, + "loss": 0.5946, + "step": 22450 + }, + { + "epoch": 0.4999109686609687, + "grad_norm": 0.8043753504753113, + "learning_rate": 0.0003732454725533324, + "loss": 0.5971, + "step": 22460 + }, + { + "epoch": 0.5001335470085471, + "grad_norm": 0.8286345601081848, + "learning_rate": 0.00037322217304533244, + "loss": 0.6566, + "step": 22470 + }, + { + "epoch": 0.5003561253561254, + "grad_norm": 0.7317217588424683, + "learning_rate": 0.00037319886412429777, + "loss": 0.6861, + "step": 22480 + }, + { + "epoch": 0.5005787037037037, + "grad_norm": 0.9986227750778198, + "learning_rate": 0.000373175545791495, + "loss": 0.8038, + "step": 22490 + }, + { + "epoch": 0.500801282051282, + "grad_norm": 0.5715426802635193, + "learning_rate": 0.00037315221804819134, + "loss": 0.6179, + "step": 22500 + }, + { + "epoch": 0.5010238603988604, + "grad_norm": 0.7587363719940186, + "learning_rate": 0.0003731288808956544, + "loss": 0.5958, + "step": 22510 + }, + { + "epoch": 0.5012464387464387, + "grad_norm": 0.5153867602348328, + "learning_rate": 0.0003731055343351523, + "loss": 0.607, + "step": 22520 + }, + { + "epoch": 0.5014690170940171, + "grad_norm": 0.71803879737854, + "learning_rate": 0.0003730821783679538, + "loss": 0.6498, + "step": 22530 + }, + { + "epoch": 0.5016915954415955, + "grad_norm": 0.8072484731674194, + "learning_rate": 0.000373058812995328, + "loss": 0.6912, + "step": 22540 + }, + { + "epoch": 0.5019141737891738, + "grad_norm": 0.9070923328399658, + "learning_rate": 0.0003730354382185447, + "loss": 0.6367, + "step": 22550 + }, + { + "epoch": 0.5021367521367521, + "grad_norm": 0.6757694482803345, + "learning_rate": 0.00037301205403887395, + "loss": 0.6196, + "step": 22560 + }, + { + "epoch": 0.5023593304843305, + "grad_norm": 0.6036407947540283, + "learning_rate": 0.00037298866045758656, + "loss": 0.6573, + "step": 22570 + }, + { + "epoch": 0.5025819088319088, + "grad_norm": 0.8025512099266052, + "learning_rate": 0.0003729652574759538, + "loss": 0.6912, + "step": 22580 + }, + { + "epoch": 0.5028044871794872, + "grad_norm": 0.9629495143890381, + "learning_rate": 0.0003729418450952473, + "loss": 0.6471, + "step": 22590 + }, + { + "epoch": 0.5030270655270656, + "grad_norm": 0.7720807194709778, + "learning_rate": 0.00037291842331673943, + "loss": 0.6282, + "step": 22600 + }, + { + "epoch": 0.5032496438746439, + "grad_norm": 1.0801931619644165, + "learning_rate": 0.0003728949921417028, + "loss": 0.8203, + "step": 22610 + }, + { + "epoch": 0.5034722222222222, + "grad_norm": 0.6417792439460754, + "learning_rate": 0.0003728715515714108, + "loss": 0.6012, + "step": 22620 + }, + { + "epoch": 0.5036948005698005, + "grad_norm": 0.8479206562042236, + "learning_rate": 0.00037284810160713715, + "loss": 0.5524, + "step": 22630 + }, + { + "epoch": 0.5039173789173789, + "grad_norm": 1.038094401359558, + "learning_rate": 0.00037282464225015617, + "loss": 0.7643, + "step": 22640 + }, + { + "epoch": 0.5041399572649573, + "grad_norm": 0.48345646262168884, + "learning_rate": 0.0003728011735017427, + "loss": 0.5751, + "step": 22650 + }, + { + "epoch": 0.5043625356125356, + "grad_norm": 0.5323746204376221, + "learning_rate": 0.0003727776953631719, + "loss": 0.6605, + "step": 22660 + }, + { + "epoch": 0.504585113960114, + "grad_norm": 0.6944118142127991, + "learning_rate": 0.0003727542078357197, + "loss": 0.6518, + "step": 22670 + }, + { + "epoch": 0.5048076923076923, + "grad_norm": 0.6447984576225281, + "learning_rate": 0.0003727307109206625, + "loss": 0.5313, + "step": 22680 + }, + { + "epoch": 0.5050302706552706, + "grad_norm": 0.7850675582885742, + "learning_rate": 0.00037270720461927704, + "loss": 0.6458, + "step": 22690 + }, + { + "epoch": 0.5052528490028491, + "grad_norm": 0.8559784889221191, + "learning_rate": 0.0003726836889328407, + "loss": 0.6545, + "step": 22700 + }, + { + "epoch": 0.5054754273504274, + "grad_norm": 0.54237961769104, + "learning_rate": 0.00037266016386263123, + "loss": 0.5886, + "step": 22710 + }, + { + "epoch": 0.5056980056980057, + "grad_norm": 0.6112391948699951, + "learning_rate": 0.00037263662940992725, + "loss": 0.6634, + "step": 22720 + }, + { + "epoch": 0.505920584045584, + "grad_norm": 0.7948022484779358, + "learning_rate": 0.0003726130855760074, + "loss": 0.7228, + "step": 22730 + }, + { + "epoch": 0.5061431623931624, + "grad_norm": 0.5578857660293579, + "learning_rate": 0.00037258953236215126, + "loss": 0.5033, + "step": 22740 + }, + { + "epoch": 0.5063657407407407, + "grad_norm": 0.6403529047966003, + "learning_rate": 0.00037256596976963866, + "loss": 0.6009, + "step": 22750 + }, + { + "epoch": 0.5065883190883191, + "grad_norm": 0.7904924154281616, + "learning_rate": 0.0003725423977997499, + "loss": 0.5801, + "step": 22760 + }, + { + "epoch": 0.5068108974358975, + "grad_norm": 0.5248593091964722, + "learning_rate": 0.00037251881645376605, + "loss": 0.5125, + "step": 22770 + }, + { + "epoch": 0.5070334757834758, + "grad_norm": 0.6650285720825195, + "learning_rate": 0.0003724952257329684, + "loss": 0.6307, + "step": 22780 + }, + { + "epoch": 0.5072560541310541, + "grad_norm": 0.8868667483329773, + "learning_rate": 0.00037247162563863907, + "loss": 0.7645, + "step": 22790 + }, + { + "epoch": 0.5074786324786325, + "grad_norm": 0.8926972150802612, + "learning_rate": 0.0003724480161720604, + "loss": 0.6528, + "step": 22800 + }, + { + "epoch": 0.5077012108262108, + "grad_norm": 0.6713107824325562, + "learning_rate": 0.00037242439733451533, + "loss": 0.5721, + "step": 22810 + }, + { + "epoch": 0.5079237891737892, + "grad_norm": 0.9303070306777954, + "learning_rate": 0.00037240076912728736, + "loss": 0.664, + "step": 22820 + }, + { + "epoch": 0.5081463675213675, + "grad_norm": 0.9031490683555603, + "learning_rate": 0.0003723771315516605, + "loss": 0.7006, + "step": 22830 + }, + { + "epoch": 0.5083689458689459, + "grad_norm": 0.5676231980323792, + "learning_rate": 0.00037235348460891915, + "loss": 0.6106, + "step": 22840 + }, + { + "epoch": 0.5085915242165242, + "grad_norm": 0.6862084865570068, + "learning_rate": 0.00037232982830034836, + "loss": 0.6386, + "step": 22850 + }, + { + "epoch": 0.5088141025641025, + "grad_norm": 0.5832529664039612, + "learning_rate": 0.00037230616262723366, + "loss": 0.5575, + "step": 22860 + }, + { + "epoch": 0.5090366809116809, + "grad_norm": 0.4869333803653717, + "learning_rate": 0.00037228248759086095, + "loss": 0.6385, + "step": 22870 + }, + { + "epoch": 0.5092592592592593, + "grad_norm": 0.8570073246955872, + "learning_rate": 0.0003722588031925169, + "loss": 0.6212, + "step": 22880 + }, + { + "epoch": 0.5094818376068376, + "grad_norm": 0.7280600666999817, + "learning_rate": 0.0003722351094334884, + "loss": 0.5418, + "step": 22890 + }, + { + "epoch": 0.509704415954416, + "grad_norm": 0.8902294635772705, + "learning_rate": 0.0003722114063150631, + "loss": 0.5825, + "step": 22900 + }, + { + "epoch": 0.5099269943019943, + "grad_norm": 0.9598889946937561, + "learning_rate": 0.00037218769383852906, + "loss": 0.528, + "step": 22910 + }, + { + "epoch": 0.5101495726495726, + "grad_norm": 0.6155282258987427, + "learning_rate": 0.00037216397200517465, + "loss": 0.5101, + "step": 22920 + }, + { + "epoch": 0.5103721509971509, + "grad_norm": 0.604421854019165, + "learning_rate": 0.00037214024081628914, + "loss": 0.558, + "step": 22930 + }, + { + "epoch": 0.5105947293447294, + "grad_norm": 0.8476591110229492, + "learning_rate": 0.000372116500273162, + "loss": 0.6325, + "step": 22940 + }, + { + "epoch": 0.5108173076923077, + "grad_norm": 0.6499559283256531, + "learning_rate": 0.00037209275037708336, + "loss": 0.7019, + "step": 22950 + }, + { + "epoch": 0.511039886039886, + "grad_norm": 0.6757542490959167, + "learning_rate": 0.0003720689911293437, + "loss": 0.6507, + "step": 22960 + }, + { + "epoch": 0.5112624643874644, + "grad_norm": 0.6575201153755188, + "learning_rate": 0.0003720452225312343, + "loss": 0.6491, + "step": 22970 + }, + { + "epoch": 0.5114850427350427, + "grad_norm": 0.3498440682888031, + "learning_rate": 0.00037202144458404665, + "loss": 0.6026, + "step": 22980 + }, + { + "epoch": 0.5117076210826211, + "grad_norm": 0.8307576775550842, + "learning_rate": 0.0003719976572890729, + "loss": 0.573, + "step": 22990 + }, + { + "epoch": 0.5119301994301995, + "grad_norm": 0.8176911473274231, + "learning_rate": 0.0003719738606476056, + "loss": 0.719, + "step": 23000 + }, + { + "epoch": 0.5121527777777778, + "grad_norm": 0.7571500539779663, + "learning_rate": 0.00037195005466093795, + "loss": 0.6963, + "step": 23010 + }, + { + "epoch": 0.5123753561253561, + "grad_norm": 0.9647487998008728, + "learning_rate": 0.0003719262393303635, + "loss": 0.5809, + "step": 23020 + }, + { + "epoch": 0.5125979344729344, + "grad_norm": 0.7572385668754578, + "learning_rate": 0.0003719024146571765, + "loss": 0.7812, + "step": 23030 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 1.1750061511993408, + "learning_rate": 0.0003718785806426716, + "loss": 0.6111, + "step": 23040 + }, + { + "epoch": 0.5130430911680912, + "grad_norm": 0.5060396790504456, + "learning_rate": 0.00037185473728814386, + "loss": 0.7578, + "step": 23050 + }, + { + "epoch": 0.5132656695156695, + "grad_norm": 0.5580783486366272, + "learning_rate": 0.00037183088459488906, + "loss": 0.6576, + "step": 23060 + }, + { + "epoch": 0.5134882478632479, + "grad_norm": 0.6911407709121704, + "learning_rate": 0.0003718070225642033, + "loss": 0.6099, + "step": 23070 + }, + { + "epoch": 0.5137108262108262, + "grad_norm": 0.6477782726287842, + "learning_rate": 0.00037178315119738327, + "loss": 0.7283, + "step": 23080 + }, + { + "epoch": 0.5139334045584045, + "grad_norm": 0.5770291090011597, + "learning_rate": 0.00037175927049572623, + "loss": 0.6806, + "step": 23090 + }, + { + "epoch": 0.5141559829059829, + "grad_norm": 0.7728904485702515, + "learning_rate": 0.00037173538046052977, + "loss": 0.6663, + "step": 23100 + }, + { + "epoch": 0.5143785612535613, + "grad_norm": 0.8347638249397278, + "learning_rate": 0.0003717114810930922, + "loss": 0.5948, + "step": 23110 + }, + { + "epoch": 0.5146011396011396, + "grad_norm": 0.6052911281585693, + "learning_rate": 0.0003716875723947121, + "loss": 0.5974, + "step": 23120 + }, + { + "epoch": 0.514823717948718, + "grad_norm": 0.9551176428794861, + "learning_rate": 0.0003716636543666888, + "loss": 0.6198, + "step": 23130 + }, + { + "epoch": 0.5150462962962963, + "grad_norm": 0.4984239339828491, + "learning_rate": 0.00037163972701032206, + "loss": 0.5537, + "step": 23140 + }, + { + "epoch": 0.5152688746438746, + "grad_norm": 0.5534871220588684, + "learning_rate": 0.000371615790326912, + "loss": 0.7402, + "step": 23150 + }, + { + "epoch": 0.5154914529914529, + "grad_norm": 0.6838101744651794, + "learning_rate": 0.00037159184431775937, + "loss": 0.604, + "step": 23160 + }, + { + "epoch": 0.5157140313390314, + "grad_norm": 0.5246807932853699, + "learning_rate": 0.0003715678889841654, + "loss": 0.5308, + "step": 23170 + }, + { + "epoch": 0.5159366096866097, + "grad_norm": 1.1374447345733643, + "learning_rate": 0.000371543924327432, + "loss": 0.7181, + "step": 23180 + }, + { + "epoch": 0.516159188034188, + "grad_norm": 0.8713901042938232, + "learning_rate": 0.0003715199503488613, + "loss": 0.5903, + "step": 23190 + }, + { + "epoch": 0.5163817663817664, + "grad_norm": 0.5066902041435242, + "learning_rate": 0.00037149596704975604, + "loss": 0.5557, + "step": 23200 + }, + { + "epoch": 0.5166043447293447, + "grad_norm": 0.7605969309806824, + "learning_rate": 0.00037147197443141957, + "loss": 0.56, + "step": 23210 + }, + { + "epoch": 0.5168269230769231, + "grad_norm": 0.9316904544830322, + "learning_rate": 0.0003714479724951556, + "loss": 0.6541, + "step": 23220 + }, + { + "epoch": 0.5170495014245015, + "grad_norm": 0.6615206599235535, + "learning_rate": 0.00037142396124226847, + "loss": 0.7339, + "step": 23230 + }, + { + "epoch": 0.5172720797720798, + "grad_norm": 0.4588244557380676, + "learning_rate": 0.000371399940674063, + "loss": 0.6555, + "step": 23240 + }, + { + "epoch": 0.5174946581196581, + "grad_norm": 0.8920488357543945, + "learning_rate": 0.00037137591079184436, + "loss": 0.6542, + "step": 23250 + }, + { + "epoch": 0.5177172364672364, + "grad_norm": 0.6547673344612122, + "learning_rate": 0.0003713518715969185, + "loss": 0.6547, + "step": 23260 + }, + { + "epoch": 0.5179398148148148, + "grad_norm": 0.8283072710037231, + "learning_rate": 0.00037132782309059163, + "loss": 0.7011, + "step": 23270 + }, + { + "epoch": 0.5181623931623932, + "grad_norm": 0.7763789296150208, + "learning_rate": 0.0003713037652741706, + "loss": 0.688, + "step": 23280 + }, + { + "epoch": 0.5183849715099715, + "grad_norm": 0.6761392951011658, + "learning_rate": 0.0003712796981489627, + "loss": 0.6504, + "step": 23290 + }, + { + "epoch": 0.5186075498575499, + "grad_norm": 0.6442679166793823, + "learning_rate": 0.0003712556217162758, + "loss": 0.6057, + "step": 23300 + }, + { + "epoch": 0.5188301282051282, + "grad_norm": 0.6514276266098022, + "learning_rate": 0.00037123153597741823, + "loss": 0.5104, + "step": 23310 + }, + { + "epoch": 0.5190527065527065, + "grad_norm": 0.79853355884552, + "learning_rate": 0.00037120744093369887, + "loss": 0.6224, + "step": 23320 + }, + { + "epoch": 0.5192752849002849, + "grad_norm": 0.630884051322937, + "learning_rate": 0.00037118333658642694, + "loss": 0.5658, + "step": 23330 + }, + { + "epoch": 0.5194978632478633, + "grad_norm": 1.0390676259994507, + "learning_rate": 0.00037115922293691245, + "loss": 0.7549, + "step": 23340 + }, + { + "epoch": 0.5197204415954416, + "grad_norm": 0.7197343111038208, + "learning_rate": 0.00037113509998646554, + "loss": 0.6401, + "step": 23350 + }, + { + "epoch": 0.51994301994302, + "grad_norm": 0.7944161295890808, + "learning_rate": 0.00037111096773639727, + "loss": 0.5773, + "step": 23360 + }, + { + "epoch": 0.5201655982905983, + "grad_norm": 0.5929605960845947, + "learning_rate": 0.00037108682618801895, + "loss": 0.5739, + "step": 23370 + }, + { + "epoch": 0.5203881766381766, + "grad_norm": 0.679068922996521, + "learning_rate": 0.0003710626753426424, + "loss": 0.5944, + "step": 23380 + }, + { + "epoch": 0.5206107549857549, + "grad_norm": 0.867856502532959, + "learning_rate": 0.00037103851520158004, + "loss": 0.7019, + "step": 23390 + }, + { + "epoch": 0.5208333333333334, + "grad_norm": 0.4647440016269684, + "learning_rate": 0.0003710143457661448, + "loss": 0.5525, + "step": 23400 + }, + { + "epoch": 0.5210559116809117, + "grad_norm": 0.7468436360359192, + "learning_rate": 0.00037099016703764996, + "loss": 0.5012, + "step": 23410 + }, + { + "epoch": 0.52127849002849, + "grad_norm": 0.8206691145896912, + "learning_rate": 0.00037096597901740947, + "loss": 0.7897, + "step": 23420 + }, + { + "epoch": 0.5215010683760684, + "grad_norm": 0.6338486671447754, + "learning_rate": 0.00037094178170673765, + "loss": 0.589, + "step": 23430 + }, + { + "epoch": 0.5217236467236467, + "grad_norm": 0.6311833262443542, + "learning_rate": 0.0003709175751069496, + "loss": 0.5387, + "step": 23440 + }, + { + "epoch": 0.5219462250712251, + "grad_norm": 0.8822476267814636, + "learning_rate": 0.00037089335921936054, + "loss": 0.6912, + "step": 23450 + }, + { + "epoch": 0.5221688034188035, + "grad_norm": 0.5450563430786133, + "learning_rate": 0.0003708691340452865, + "loss": 0.5881, + "step": 23460 + }, + { + "epoch": 0.5223913817663818, + "grad_norm": 0.845903754234314, + "learning_rate": 0.00037084489958604373, + "loss": 0.6549, + "step": 23470 + }, + { + "epoch": 0.5226139601139601, + "grad_norm": 0.7649895548820496, + "learning_rate": 0.00037082065584294934, + "loss": 0.7001, + "step": 23480 + }, + { + "epoch": 0.5228365384615384, + "grad_norm": 0.521933376789093, + "learning_rate": 0.00037079640281732063, + "loss": 0.7073, + "step": 23490 + }, + { + "epoch": 0.5230591168091168, + "grad_norm": 0.6133053302764893, + "learning_rate": 0.00037077214051047555, + "loss": 0.5488, + "step": 23500 + }, + { + "epoch": 0.5232816951566952, + "grad_norm": 0.422405481338501, + "learning_rate": 0.0003707478689237326, + "loss": 0.593, + "step": 23510 + }, + { + "epoch": 0.5235042735042735, + "grad_norm": 0.8190181255340576, + "learning_rate": 0.00037072358805841066, + "loss": 0.673, + "step": 23520 + }, + { + "epoch": 0.5237268518518519, + "grad_norm": 0.7577298879623413, + "learning_rate": 0.0003706992979158292, + "loss": 0.6556, + "step": 23530 + }, + { + "epoch": 0.5239494301994302, + "grad_norm": 0.447704941034317, + "learning_rate": 0.00037067499849730815, + "loss": 0.7052, + "step": 23540 + }, + { + "epoch": 0.5241720085470085, + "grad_norm": 0.5758539438247681, + "learning_rate": 0.0003706506898041679, + "loss": 0.5871, + "step": 23550 + }, + { + "epoch": 0.5243945868945868, + "grad_norm": 0.669194221496582, + "learning_rate": 0.0003706263718377295, + "loss": 0.5948, + "step": 23560 + }, + { + "epoch": 0.5246171652421653, + "grad_norm": 0.8515828847885132, + "learning_rate": 0.00037060204459931435, + "loss": 0.6965, + "step": 23570 + }, + { + "epoch": 0.5248397435897436, + "grad_norm": 0.594035267829895, + "learning_rate": 0.0003705777080902445, + "loss": 0.6552, + "step": 23580 + }, + { + "epoch": 0.5250623219373219, + "grad_norm": 0.4388732612133026, + "learning_rate": 0.0003705533623118423, + "loss": 0.6365, + "step": 23590 + }, + { + "epoch": 0.5252849002849003, + "grad_norm": 0.8432769179344177, + "learning_rate": 0.00037052900726543085, + "loss": 0.4611, + "step": 23600 + }, + { + "epoch": 0.5255074786324786, + "grad_norm": 0.6244149208068848, + "learning_rate": 0.0003705046429523335, + "loss": 0.6372, + "step": 23610 + }, + { + "epoch": 0.5257300569800569, + "grad_norm": 0.4183826744556427, + "learning_rate": 0.0003704802693738742, + "loss": 0.6894, + "step": 23620 + }, + { + "epoch": 0.5259526353276354, + "grad_norm": 0.8266147971153259, + "learning_rate": 0.00037045588653137755, + "loss": 0.7249, + "step": 23630 + }, + { + "epoch": 0.5261752136752137, + "grad_norm": 0.669316291809082, + "learning_rate": 0.00037043149442616847, + "loss": 0.5349, + "step": 23640 + }, + { + "epoch": 0.526397792022792, + "grad_norm": 0.7408722639083862, + "learning_rate": 0.0003704070930595725, + "loss": 0.7687, + "step": 23650 + }, + { + "epoch": 0.5266203703703703, + "grad_norm": 0.9338363409042358, + "learning_rate": 0.0003703826824329155, + "loss": 0.6314, + "step": 23660 + }, + { + "epoch": 0.5268429487179487, + "grad_norm": 1.0505681037902832, + "learning_rate": 0.00037035826254752413, + "loss": 0.6607, + "step": 23670 + }, + { + "epoch": 0.5270655270655271, + "grad_norm": 1.1003633737564087, + "learning_rate": 0.00037033383340472536, + "loss": 0.7044, + "step": 23680 + }, + { + "epoch": 0.5272881054131054, + "grad_norm": 0.5831704139709473, + "learning_rate": 0.00037030939500584654, + "loss": 0.7256, + "step": 23690 + }, + { + "epoch": 0.5275106837606838, + "grad_norm": 0.8710972666740417, + "learning_rate": 0.0003702849473522158, + "loss": 0.5331, + "step": 23700 + }, + { + "epoch": 0.5277332621082621, + "grad_norm": 0.6843211650848389, + "learning_rate": 0.00037026049044516166, + "loss": 0.6554, + "step": 23710 + }, + { + "epoch": 0.5279558404558404, + "grad_norm": 0.5306840538978577, + "learning_rate": 0.0003702360242860131, + "loss": 0.5618, + "step": 23720 + }, + { + "epoch": 0.5281784188034188, + "grad_norm": 0.6895719766616821, + "learning_rate": 0.00037021154887609953, + "loss": 0.5243, + "step": 23730 + }, + { + "epoch": 0.5284009971509972, + "grad_norm": 0.41083499789237976, + "learning_rate": 0.0003701870642167511, + "loss": 0.7049, + "step": 23740 + }, + { + "epoch": 0.5286235754985755, + "grad_norm": 0.9922495484352112, + "learning_rate": 0.0003701625703092983, + "loss": 0.641, + "step": 23750 + }, + { + "epoch": 0.5288461538461539, + "grad_norm": 0.6313428282737732, + "learning_rate": 0.00037013806715507214, + "loss": 0.662, + "step": 23760 + }, + { + "epoch": 0.5290687321937322, + "grad_norm": 0.5346630811691284, + "learning_rate": 0.00037011355475540414, + "loss": 0.7308, + "step": 23770 + }, + { + "epoch": 0.5292913105413105, + "grad_norm": 0.7471534013748169, + "learning_rate": 0.00037008903311162617, + "loss": 0.57, + "step": 23780 + }, + { + "epoch": 0.5295138888888888, + "grad_norm": 0.6286147832870483, + "learning_rate": 0.0003700645022250711, + "loss": 0.6702, + "step": 23790 + }, + { + "epoch": 0.5297364672364673, + "grad_norm": 0.526298463344574, + "learning_rate": 0.00037003996209707157, + "loss": 0.6621, + "step": 23800 + }, + { + "epoch": 0.5299590455840456, + "grad_norm": 0.579492449760437, + "learning_rate": 0.00037001541272896143, + "loss": 0.6442, + "step": 23810 + }, + { + "epoch": 0.5301816239316239, + "grad_norm": 0.40422266721725464, + "learning_rate": 0.00036999085412207455, + "loss": 0.6136, + "step": 23820 + }, + { + "epoch": 0.5304042022792023, + "grad_norm": 0.5754667520523071, + "learning_rate": 0.0003699662862777455, + "loss": 0.7358, + "step": 23830 + }, + { + "epoch": 0.5306267806267806, + "grad_norm": 0.3586984872817993, + "learning_rate": 0.00036994170919730926, + "loss": 0.572, + "step": 23840 + }, + { + "epoch": 0.5308493589743589, + "grad_norm": 0.5701540112495422, + "learning_rate": 0.00036991712288210146, + "loss": 0.5319, + "step": 23850 + }, + { + "epoch": 0.5310719373219374, + "grad_norm": 0.6356937885284424, + "learning_rate": 0.0003698925273334581, + "loss": 0.5811, + "step": 23860 + }, + { + "epoch": 0.5312945156695157, + "grad_norm": 0.7801693081855774, + "learning_rate": 0.0003698679225527157, + "loss": 0.5285, + "step": 23870 + }, + { + "epoch": 0.531517094017094, + "grad_norm": 0.6584993600845337, + "learning_rate": 0.0003698433085412114, + "loss": 0.5872, + "step": 23880 + }, + { + "epoch": 0.5317396723646723, + "grad_norm": 0.9852097630500793, + "learning_rate": 0.00036981868530028267, + "loss": 0.5429, + "step": 23890 + }, + { + "epoch": 0.5319622507122507, + "grad_norm": 1.0301185846328735, + "learning_rate": 0.00036979405283126747, + "loss": 0.7417, + "step": 23900 + }, + { + "epoch": 0.5321848290598291, + "grad_norm": 0.712128758430481, + "learning_rate": 0.00036976941113550454, + "loss": 0.5969, + "step": 23910 + }, + { + "epoch": 0.5324074074074074, + "grad_norm": 0.6619728207588196, + "learning_rate": 0.00036974476021433276, + "loss": 0.4735, + "step": 23920 + }, + { + "epoch": 0.5326299857549858, + "grad_norm": 0.6637291312217712, + "learning_rate": 0.00036972010006909177, + "loss": 0.5637, + "step": 23930 + }, + { + "epoch": 0.5328525641025641, + "grad_norm": 0.670197606086731, + "learning_rate": 0.00036969543070112154, + "loss": 0.5575, + "step": 23940 + }, + { + "epoch": 0.5330751424501424, + "grad_norm": 0.6946614384651184, + "learning_rate": 0.00036967075211176285, + "loss": 0.733, + "step": 23950 + }, + { + "epoch": 0.5332977207977208, + "grad_norm": 0.9224454760551453, + "learning_rate": 0.00036964606430235647, + "loss": 0.5485, + "step": 23960 + }, + { + "epoch": 0.5335202991452992, + "grad_norm": 0.6943668127059937, + "learning_rate": 0.0003696213672742441, + "loss": 0.7453, + "step": 23970 + }, + { + "epoch": 0.5337428774928775, + "grad_norm": 0.7649568915367126, + "learning_rate": 0.0003695966610287677, + "loss": 0.7363, + "step": 23980 + }, + { + "epoch": 0.5339654558404558, + "grad_norm": 0.8155977129936218, + "learning_rate": 0.00036957194556727, + "loss": 0.5108, + "step": 23990 + }, + { + "epoch": 0.5341880341880342, + "grad_norm": 0.8972145915031433, + "learning_rate": 0.00036954722089109395, + "loss": 0.4441, + "step": 24000 + }, + { + "epoch": 0.5344106125356125, + "grad_norm": 0.7684455513954163, + "learning_rate": 0.00036952248700158305, + "loss": 0.543, + "step": 24010 + }, + { + "epoch": 0.5346331908831908, + "grad_norm": 0.8472932577133179, + "learning_rate": 0.0003694977439000815, + "loss": 0.7048, + "step": 24020 + }, + { + "epoch": 0.5348557692307693, + "grad_norm": 0.8715865612030029, + "learning_rate": 0.0003694729915879338, + "loss": 0.6962, + "step": 24030 + }, + { + "epoch": 0.5350783475783476, + "grad_norm": 0.8468021750450134, + "learning_rate": 0.00036944823006648494, + "loss": 0.6192, + "step": 24040 + }, + { + "epoch": 0.5353009259259259, + "grad_norm": 0.5794544219970703, + "learning_rate": 0.0003694234593370806, + "loss": 0.6453, + "step": 24050 + }, + { + "epoch": 0.5355235042735043, + "grad_norm": 1.058516263961792, + "learning_rate": 0.00036939867940106677, + "loss": 0.593, + "step": 24060 + }, + { + "epoch": 0.5357460826210826, + "grad_norm": 0.6857909560203552, + "learning_rate": 0.00036937389025979, + "loss": 0.6452, + "step": 24070 + }, + { + "epoch": 0.5359686609686609, + "grad_norm": 0.5412909388542175, + "learning_rate": 0.00036934909191459734, + "loss": 0.6902, + "step": 24080 + }, + { + "epoch": 0.5361912393162394, + "grad_norm": 0.696171760559082, + "learning_rate": 0.0003693242843668365, + "loss": 0.7975, + "step": 24090 + }, + { + "epoch": 0.5364138176638177, + "grad_norm": 0.5655274987220764, + "learning_rate": 0.00036929946761785537, + "loss": 0.507, + "step": 24100 + }, + { + "epoch": 0.536636396011396, + "grad_norm": 0.8406558632850647, + "learning_rate": 0.00036927464166900255, + "loss": 0.7638, + "step": 24110 + }, + { + "epoch": 0.5368589743589743, + "grad_norm": 0.831794023513794, + "learning_rate": 0.00036924980652162714, + "loss": 0.7466, + "step": 24120 + }, + { + "epoch": 0.5370815527065527, + "grad_norm": 0.7433670163154602, + "learning_rate": 0.0003692249621770787, + "loss": 0.6373, + "step": 24130 + }, + { + "epoch": 0.5373041310541311, + "grad_norm": 1.1349642276763916, + "learning_rate": 0.0003692001086367073, + "loss": 0.6649, + "step": 24140 + }, + { + "epoch": 0.5375267094017094, + "grad_norm": 0.5549025535583496, + "learning_rate": 0.0003691752459018634, + "loss": 0.7769, + "step": 24150 + }, + { + "epoch": 0.5377492877492878, + "grad_norm": 0.4678022861480713, + "learning_rate": 0.00036915037397389824, + "loss": 0.6035, + "step": 24160 + }, + { + "epoch": 0.5379718660968661, + "grad_norm": 0.6058796644210815, + "learning_rate": 0.0003691254928541633, + "loss": 0.5491, + "step": 24170 + }, + { + "epoch": 0.5381944444444444, + "grad_norm": 0.7192685008049011, + "learning_rate": 0.00036910060254401054, + "loss": 0.7346, + "step": 24180 + }, + { + "epoch": 0.5384170227920227, + "grad_norm": 0.5943816304206848, + "learning_rate": 0.00036907570304479264, + "loss": 0.7317, + "step": 24190 + }, + { + "epoch": 0.5386396011396012, + "grad_norm": 0.8074010014533997, + "learning_rate": 0.00036905079435786264, + "loss": 0.7167, + "step": 24200 + }, + { + "epoch": 0.5388621794871795, + "grad_norm": 0.5450646877288818, + "learning_rate": 0.0003690258764845741, + "loss": 0.6761, + "step": 24210 + }, + { + "epoch": 0.5390847578347578, + "grad_norm": 0.8199670314788818, + "learning_rate": 0.00036900094942628105, + "loss": 0.5708, + "step": 24220 + }, + { + "epoch": 0.5393073361823362, + "grad_norm": 0.5967410206794739, + "learning_rate": 0.000368976013184338, + "loss": 0.4851, + "step": 24230 + }, + { + "epoch": 0.5395299145299145, + "grad_norm": 0.8061283826828003, + "learning_rate": 0.00036895106776010006, + "loss": 0.5799, + "step": 24240 + }, + { + "epoch": 0.5397524928774928, + "grad_norm": 0.6298791170120239, + "learning_rate": 0.0003689261131549229, + "loss": 0.6722, + "step": 24250 + }, + { + "epoch": 0.5399750712250713, + "grad_norm": 0.8000302314758301, + "learning_rate": 0.0003689011493701624, + "loss": 0.5899, + "step": 24260 + }, + { + "epoch": 0.5400641025641025, + "eval_loss": 0.6258811950683594, + "eval_runtime": 337.2215, + "eval_samples_per_second": 7.013, + "eval_steps_per_second": 7.013, + "step": 24264 + }, + { + "epoch": 0.5401976495726496, + "grad_norm": 0.8310365080833435, + "learning_rate": 0.00036887617640717513, + "loss": 0.6246, + "step": 24270 + }, + { + "epoch": 0.5404202279202279, + "grad_norm": 0.7653577327728271, + "learning_rate": 0.0003688511942673182, + "loss": 0.5665, + "step": 24280 + }, + { + "epoch": 0.5406428062678063, + "grad_norm": 0.6216724514961243, + "learning_rate": 0.0003688262029519492, + "loss": 0.5356, + "step": 24290 + }, + { + "epoch": 0.5408653846153846, + "grad_norm": 0.5706295967102051, + "learning_rate": 0.0003688012024624261, + "loss": 0.7744, + "step": 24300 + }, + { + "epoch": 0.5410879629629629, + "grad_norm": 0.7092143297195435, + "learning_rate": 0.00036877619280010744, + "loss": 0.6825, + "step": 24310 + }, + { + "epoch": 0.5413105413105413, + "grad_norm": 0.8476496338844299, + "learning_rate": 0.00036875117396635234, + "loss": 0.768, + "step": 24320 + }, + { + "epoch": 0.5415331196581197, + "grad_norm": 0.6610291004180908, + "learning_rate": 0.0003687261459625203, + "loss": 0.6644, + "step": 24330 + }, + { + "epoch": 0.541755698005698, + "grad_norm": 0.4774007797241211, + "learning_rate": 0.0003687011087899713, + "loss": 0.6664, + "step": 24340 + }, + { + "epoch": 0.5419782763532763, + "grad_norm": 1.256393313407898, + "learning_rate": 0.00036867606245006597, + "loss": 0.6652, + "step": 24350 + }, + { + "epoch": 0.5422008547008547, + "grad_norm": 0.5507504343986511, + "learning_rate": 0.00036865100694416535, + "loss": 0.5793, + "step": 24360 + }, + { + "epoch": 0.5424234330484331, + "grad_norm": 1.010922908782959, + "learning_rate": 0.000368625942273631, + "loss": 0.6169, + "step": 24370 + }, + { + "epoch": 0.5426460113960114, + "grad_norm": 0.7326360940933228, + "learning_rate": 0.0003686008684398248, + "loss": 0.6562, + "step": 24380 + }, + { + "epoch": 0.5428685897435898, + "grad_norm": 0.8364658951759338, + "learning_rate": 0.0003685757854441095, + "loss": 0.6444, + "step": 24390 + }, + { + "epoch": 0.5430911680911681, + "grad_norm": 0.73721843957901, + "learning_rate": 0.0003685506932878479, + "loss": 0.74, + "step": 24400 + }, + { + "epoch": 0.5433137464387464, + "grad_norm": 0.689838707447052, + "learning_rate": 0.00036852559197240363, + "loss": 0.6273, + "step": 24410 + }, + { + "epoch": 0.5435363247863247, + "grad_norm": 0.5031551122665405, + "learning_rate": 0.0003685004814991408, + "loss": 0.5855, + "step": 24420 + }, + { + "epoch": 0.5437589031339032, + "grad_norm": 0.8085066676139832, + "learning_rate": 0.0003684753618694239, + "loss": 0.5039, + "step": 24430 + }, + { + "epoch": 0.5439814814814815, + "grad_norm": 0.5739592909812927, + "learning_rate": 0.00036845023308461783, + "loss": 0.6338, + "step": 24440 + }, + { + "epoch": 0.5442040598290598, + "grad_norm": 0.7855496406555176, + "learning_rate": 0.00036842509514608824, + "loss": 0.489, + "step": 24450 + }, + { + "epoch": 0.5444266381766382, + "grad_norm": 0.5251293182373047, + "learning_rate": 0.00036839994805520107, + "loss": 0.6098, + "step": 24460 + }, + { + "epoch": 0.5446492165242165, + "grad_norm": 0.8609602451324463, + "learning_rate": 0.00036837479181332295, + "loss": 0.5285, + "step": 24470 + }, + { + "epoch": 0.5448717948717948, + "grad_norm": 0.628017783164978, + "learning_rate": 0.00036834962642182074, + "loss": 0.6919, + "step": 24480 + }, + { + "epoch": 0.5450943732193733, + "grad_norm": 0.5945170521736145, + "learning_rate": 0.000368324451882062, + "loss": 0.7378, + "step": 24490 + }, + { + "epoch": 0.5453169515669516, + "grad_norm": 0.6073055267333984, + "learning_rate": 0.00036829926819541476, + "loss": 0.7293, + "step": 24500 + }, + { + "epoch": 0.5455395299145299, + "grad_norm": 0.8239802122116089, + "learning_rate": 0.00036827407536324747, + "loss": 0.7061, + "step": 24510 + }, + { + "epoch": 0.5457621082621082, + "grad_norm": 0.5099710822105408, + "learning_rate": 0.00036824887338692924, + "loss": 0.6795, + "step": 24520 + }, + { + "epoch": 0.5459846866096866, + "grad_norm": 0.5891330242156982, + "learning_rate": 0.00036822366226782943, + "loss": 0.6871, + "step": 24530 + }, + { + "epoch": 0.5462072649572649, + "grad_norm": 0.8029281497001648, + "learning_rate": 0.00036819844200731814, + "loss": 0.6513, + "step": 24540 + }, + { + "epoch": 0.5464298433048433, + "grad_norm": 0.7131868600845337, + "learning_rate": 0.0003681732126067659, + "loss": 0.6778, + "step": 24550 + }, + { + "epoch": 0.5466524216524217, + "grad_norm": 0.6184797883033752, + "learning_rate": 0.0003681479740675435, + "loss": 0.7158, + "step": 24560 + }, + { + "epoch": 0.546875, + "grad_norm": 0.8021351099014282, + "learning_rate": 0.0003681227263910225, + "loss": 0.5596, + "step": 24570 + }, + { + "epoch": 0.5470975783475783, + "grad_norm": 0.71075040102005, + "learning_rate": 0.00036809746957857504, + "loss": 0.6611, + "step": 24580 + }, + { + "epoch": 0.5473201566951567, + "grad_norm": 0.813607931137085, + "learning_rate": 0.0003680722036315734, + "loss": 0.6434, + "step": 24590 + }, + { + "epoch": 0.5475427350427351, + "grad_norm": 0.4937489330768585, + "learning_rate": 0.00036804692855139064, + "loss": 0.5076, + "step": 24600 + }, + { + "epoch": 0.5477653133903134, + "grad_norm": 0.7622106075286865, + "learning_rate": 0.00036802164433940025, + "loss": 0.7269, + "step": 24610 + }, + { + "epoch": 0.5479878917378918, + "grad_norm": 0.8251647353172302, + "learning_rate": 0.00036799635099697605, + "loss": 0.6925, + "step": 24620 + }, + { + "epoch": 0.5482104700854701, + "grad_norm": 0.5652473568916321, + "learning_rate": 0.00036797104852549274, + "loss": 0.4639, + "step": 24630 + }, + { + "epoch": 0.5484330484330484, + "grad_norm": 0.6232659816741943, + "learning_rate": 0.00036794573692632503, + "loss": 0.6967, + "step": 24640 + }, + { + "epoch": 0.5486556267806267, + "grad_norm": 0.8803776502609253, + "learning_rate": 0.0003679204162008485, + "loss": 0.6616, + "step": 24650 + }, + { + "epoch": 0.5488782051282052, + "grad_norm": 0.9684003591537476, + "learning_rate": 0.0003678950863504392, + "loss": 0.7329, + "step": 24660 + }, + { + "epoch": 0.5491007834757835, + "grad_norm": 1.0482511520385742, + "learning_rate": 0.00036786974737647337, + "loss": 0.6262, + "step": 24670 + }, + { + "epoch": 0.5493233618233618, + "grad_norm": 0.8103609681129456, + "learning_rate": 0.0003678443992803281, + "loss": 0.63, + "step": 24680 + }, + { + "epoch": 0.5495459401709402, + "grad_norm": 0.7665788531303406, + "learning_rate": 0.0003678190420633807, + "loss": 0.6572, + "step": 24690 + }, + { + "epoch": 0.5497685185185185, + "grad_norm": 0.6839772462844849, + "learning_rate": 0.0003677936757270092, + "loss": 0.589, + "step": 24700 + }, + { + "epoch": 0.5499910968660968, + "grad_norm": 0.7093663215637207, + "learning_rate": 0.00036776830027259204, + "loss": 0.676, + "step": 24710 + }, + { + "epoch": 0.5502136752136753, + "grad_norm": 0.8419938087463379, + "learning_rate": 0.000367742915701508, + "loss": 0.6811, + "step": 24720 + }, + { + "epoch": 0.5504362535612536, + "grad_norm": 0.6843752861022949, + "learning_rate": 0.0003677175220151367, + "loss": 0.6703, + "step": 24730 + }, + { + "epoch": 0.5506588319088319, + "grad_norm": 0.8185937404632568, + "learning_rate": 0.0003676921192148579, + "loss": 0.6178, + "step": 24740 + }, + { + "epoch": 0.5508814102564102, + "grad_norm": 0.5972009301185608, + "learning_rate": 0.0003676667073020521, + "loss": 0.6306, + "step": 24750 + }, + { + "epoch": 0.5511039886039886, + "grad_norm": 0.7188451290130615, + "learning_rate": 0.00036764128627810017, + "loss": 0.5118, + "step": 24760 + }, + { + "epoch": 0.5513265669515669, + "grad_norm": 0.565358579158783, + "learning_rate": 0.0003676158561443835, + "loss": 0.6679, + "step": 24770 + }, + { + "epoch": 0.5515491452991453, + "grad_norm": 0.6097283959388733, + "learning_rate": 0.00036759041690228396, + "loss": 0.6096, + "step": 24780 + }, + { + "epoch": 0.5517717236467237, + "grad_norm": 0.641505777835846, + "learning_rate": 0.00036756496855318396, + "loss": 0.6809, + "step": 24790 + }, + { + "epoch": 0.551994301994302, + "grad_norm": 0.8230586647987366, + "learning_rate": 0.00036753951109846645, + "loss": 0.5768, + "step": 24800 + }, + { + "epoch": 0.5522168803418803, + "grad_norm": 1.1163147687911987, + "learning_rate": 0.0003675140445395147, + "loss": 0.637, + "step": 24810 + }, + { + "epoch": 0.5524394586894587, + "grad_norm": 1.0802885293960571, + "learning_rate": 0.0003674885688777127, + "loss": 0.5016, + "step": 24820 + }, + { + "epoch": 0.5526620370370371, + "grad_norm": 0.955658495426178, + "learning_rate": 0.00036746308411444463, + "loss": 0.609, + "step": 24830 + }, + { + "epoch": 0.5528846153846154, + "grad_norm": 0.5523378849029541, + "learning_rate": 0.0003674375902510956, + "loss": 0.5569, + "step": 24840 + }, + { + "epoch": 0.5531071937321937, + "grad_norm": 0.8032726645469666, + "learning_rate": 0.0003674120872890508, + "loss": 0.6498, + "step": 24850 + }, + { + "epoch": 0.5533297720797721, + "grad_norm": 0.5347462296485901, + "learning_rate": 0.0003673865752296961, + "loss": 0.6242, + "step": 24860 + }, + { + "epoch": 0.5535523504273504, + "grad_norm": 0.603898823261261, + "learning_rate": 0.0003673610540744179, + "loss": 0.5649, + "step": 24870 + }, + { + "epoch": 0.5537749287749287, + "grad_norm": 0.3848460614681244, + "learning_rate": 0.00036733552382460304, + "loss": 0.5996, + "step": 24880 + }, + { + "epoch": 0.5539975071225072, + "grad_norm": 0.7490546107292175, + "learning_rate": 0.0003673099844816388, + "loss": 0.6149, + "step": 24890 + }, + { + "epoch": 0.5542200854700855, + "grad_norm": 0.630919873714447, + "learning_rate": 0.0003672844360469131, + "loss": 0.59, + "step": 24900 + }, + { + "epoch": 0.5544426638176638, + "grad_norm": 1.0681458711624146, + "learning_rate": 0.00036725887852181413, + "loss": 0.7436, + "step": 24910 + }, + { + "epoch": 0.5546652421652422, + "grad_norm": 0.5887709856033325, + "learning_rate": 0.0003672333119077307, + "loss": 0.604, + "step": 24920 + }, + { + "epoch": 0.5548878205128205, + "grad_norm": 0.621523916721344, + "learning_rate": 0.0003672077362060524, + "loss": 0.6135, + "step": 24930 + }, + { + "epoch": 0.5551103988603988, + "grad_norm": 0.5083357095718384, + "learning_rate": 0.0003671821514181686, + "loss": 0.6993, + "step": 24940 + }, + { + "epoch": 0.5553329772079773, + "grad_norm": 0.7470302581787109, + "learning_rate": 0.00036715655754547, + "loss": 0.692, + "step": 24950 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.5239513516426086, + "learning_rate": 0.0003671309545893472, + "loss": 0.6401, + "step": 24960 + }, + { + "epoch": 0.5557781339031339, + "grad_norm": 0.6273417472839355, + "learning_rate": 0.00036710534255119146, + "loss": 0.5611, + "step": 24970 + }, + { + "epoch": 0.5560007122507122, + "grad_norm": 0.4738331735134125, + "learning_rate": 0.00036707972143239465, + "loss": 0.6627, + "step": 24980 + }, + { + "epoch": 0.5562232905982906, + "grad_norm": 0.5370237827301025, + "learning_rate": 0.000367054091234349, + "loss": 0.6241, + "step": 24990 + }, + { + "epoch": 0.5564458689458689, + "grad_norm": 0.5510512590408325, + "learning_rate": 0.0003670284519584472, + "loss": 0.6591, + "step": 25000 + }, + { + "epoch": 0.5566684472934473, + "grad_norm": 0.6586712002754211, + "learning_rate": 0.0003670028036060826, + "loss": 0.6104, + "step": 25010 + }, + { + "epoch": 0.5568910256410257, + "grad_norm": 0.8063983917236328, + "learning_rate": 0.000366977146178649, + "loss": 0.6048, + "step": 25020 + }, + { + "epoch": 0.557113603988604, + "grad_norm": 0.8083786368370056, + "learning_rate": 0.0003669514796775406, + "loss": 0.6517, + "step": 25030 + }, + { + "epoch": 0.5573361823361823, + "grad_norm": 0.481600821018219, + "learning_rate": 0.00036692580410415207, + "loss": 0.6172, + "step": 25040 + }, + { + "epoch": 0.5575587606837606, + "grad_norm": 0.7515348196029663, + "learning_rate": 0.0003669001194598787, + "loss": 0.6766, + "step": 25050 + }, + { + "epoch": 0.5577813390313391, + "grad_norm": 0.5784083008766174, + "learning_rate": 0.00036687442574611615, + "loss": 0.5886, + "step": 25060 + }, + { + "epoch": 0.5580039173789174, + "grad_norm": 0.8006730079650879, + "learning_rate": 0.0003668487229642608, + "loss": 0.6038, + "step": 25070 + }, + { + "epoch": 0.5582264957264957, + "grad_norm": 0.6441351771354675, + "learning_rate": 0.0003668230111157092, + "loss": 0.5502, + "step": 25080 + }, + { + "epoch": 0.5584490740740741, + "grad_norm": 0.8060956597328186, + "learning_rate": 0.0003667972902018586, + "loss": 0.6798, + "step": 25090 + }, + { + "epoch": 0.5586716524216524, + "grad_norm": 0.5438023805618286, + "learning_rate": 0.00036677156022410674, + "loss": 0.7869, + "step": 25100 + }, + { + "epoch": 0.5588942307692307, + "grad_norm": 0.6967006921768188, + "learning_rate": 0.00036674582118385174, + "loss": 0.7252, + "step": 25110 + }, + { + "epoch": 0.5591168091168092, + "grad_norm": 1.0618207454681396, + "learning_rate": 0.0003667200730824924, + "loss": 0.7587, + "step": 25120 + }, + { + "epoch": 0.5593393874643875, + "grad_norm": 0.8812199234962463, + "learning_rate": 0.0003666943159214276, + "loss": 0.6272, + "step": 25130 + }, + { + "epoch": 0.5595619658119658, + "grad_norm": 0.7430441379547119, + "learning_rate": 0.0003666685497020574, + "loss": 0.6784, + "step": 25140 + }, + { + "epoch": 0.5597845441595442, + "grad_norm": 0.6264715790748596, + "learning_rate": 0.0003666427744257817, + "loss": 0.5762, + "step": 25150 + }, + { + "epoch": 0.5600071225071225, + "grad_norm": 0.7243641018867493, + "learning_rate": 0.00036661699009400125, + "loss": 0.564, + "step": 25160 + }, + { + "epoch": 0.5602297008547008, + "grad_norm": 0.7546271681785583, + "learning_rate": 0.0003665911967081171, + "loss": 0.5746, + "step": 25170 + }, + { + "epoch": 0.5604522792022792, + "grad_norm": 0.7522180080413818, + "learning_rate": 0.00036656539426953104, + "loss": 0.6288, + "step": 25180 + }, + { + "epoch": 0.5606748575498576, + "grad_norm": 0.7609201073646545, + "learning_rate": 0.000366539582779645, + "loss": 0.7739, + "step": 25190 + }, + { + "epoch": 0.5608974358974359, + "grad_norm": 0.804742157459259, + "learning_rate": 0.0003665137622398617, + "loss": 0.7322, + "step": 25200 + }, + { + "epoch": 0.5611200142450142, + "grad_norm": 0.7445507049560547, + "learning_rate": 0.0003664879326515843, + "loss": 0.6174, + "step": 25210 + }, + { + "epoch": 0.5613425925925926, + "grad_norm": 0.5064197182655334, + "learning_rate": 0.0003664620940162164, + "loss": 0.6026, + "step": 25220 + }, + { + "epoch": 0.5615651709401709, + "grad_norm": 0.7717465758323669, + "learning_rate": 0.0003664362463351619, + "loss": 0.6095, + "step": 25230 + }, + { + "epoch": 0.5617877492877493, + "grad_norm": 0.8355364799499512, + "learning_rate": 0.0003664103896098256, + "loss": 0.7885, + "step": 25240 + }, + { + "epoch": 0.5620103276353277, + "grad_norm": 0.9792073369026184, + "learning_rate": 0.0003663845238416125, + "loss": 0.7913, + "step": 25250 + }, + { + "epoch": 0.562232905982906, + "grad_norm": 0.5464093089103699, + "learning_rate": 0.0003663586490319281, + "loss": 0.6366, + "step": 25260 + }, + { + "epoch": 0.5624554843304843, + "grad_norm": 1.1562082767486572, + "learning_rate": 0.0003663327651821786, + "loss": 0.6432, + "step": 25270 + }, + { + "epoch": 0.5626780626780626, + "grad_norm": 0.7032479047775269, + "learning_rate": 0.00036630687229377047, + "loss": 0.6814, + "step": 25280 + }, + { + "epoch": 0.5629006410256411, + "grad_norm": 0.855377733707428, + "learning_rate": 0.0003662809703681107, + "loss": 0.6815, + "step": 25290 + }, + { + "epoch": 0.5631232193732194, + "grad_norm": 0.7194737792015076, + "learning_rate": 0.00036625505940660687, + "loss": 0.6352, + "step": 25300 + }, + { + "epoch": 0.5633457977207977, + "grad_norm": 0.6877269744873047, + "learning_rate": 0.00036622913941066707, + "loss": 0.5418, + "step": 25310 + }, + { + "epoch": 0.5635683760683761, + "grad_norm": 0.6020021438598633, + "learning_rate": 0.0003662032103816998, + "loss": 0.4835, + "step": 25320 + }, + { + "epoch": 0.5637909544159544, + "grad_norm": 0.9626232385635376, + "learning_rate": 0.00036617727232111393, + "loss": 0.7111, + "step": 25330 + }, + { + "epoch": 0.5640135327635327, + "grad_norm": 0.5444539785385132, + "learning_rate": 0.000366151325230319, + "loss": 0.5697, + "step": 25340 + }, + { + "epoch": 0.5642361111111112, + "grad_norm": 0.8597228527069092, + "learning_rate": 0.00036612536911072513, + "loss": 0.7058, + "step": 25350 + }, + { + "epoch": 0.5644586894586895, + "grad_norm": 0.46579617261886597, + "learning_rate": 0.0003660994039637427, + "loss": 0.5261, + "step": 25360 + }, + { + "epoch": 0.5646812678062678, + "grad_norm": 0.7070087790489197, + "learning_rate": 0.0003660734297907826, + "loss": 0.664, + "step": 25370 + }, + { + "epoch": 0.5649038461538461, + "grad_norm": 0.7118215560913086, + "learning_rate": 0.0003660474465932565, + "loss": 0.7071, + "step": 25380 + }, + { + "epoch": 0.5651264245014245, + "grad_norm": 0.6779159307479858, + "learning_rate": 0.00036602145437257614, + "loss": 0.5657, + "step": 25390 + }, + { + "epoch": 0.5653490028490028, + "grad_norm": 0.5989352464675903, + "learning_rate": 0.00036599545313015404, + "loss": 0.5557, + "step": 25400 + }, + { + "epoch": 0.5655715811965812, + "grad_norm": 0.8546493053436279, + "learning_rate": 0.0003659694428674032, + "loss": 0.7181, + "step": 25410 + }, + { + "epoch": 0.5657941595441596, + "grad_norm": 0.6739733219146729, + "learning_rate": 0.00036594342358573683, + "loss": 0.566, + "step": 25420 + }, + { + "epoch": 0.5660167378917379, + "grad_norm": 0.9013862013816833, + "learning_rate": 0.00036591739528656905, + "loss": 0.7043, + "step": 25430 + }, + { + "epoch": 0.5662393162393162, + "grad_norm": 0.7019580602645874, + "learning_rate": 0.0003658913579713142, + "loss": 0.6938, + "step": 25440 + }, + { + "epoch": 0.5664618945868946, + "grad_norm": 0.6055698394775391, + "learning_rate": 0.00036586531164138706, + "loss": 0.6043, + "step": 25450 + }, + { + "epoch": 0.5666844729344729, + "grad_norm": 0.5486964583396912, + "learning_rate": 0.0003658392562982032, + "loss": 0.6373, + "step": 25460 + }, + { + "epoch": 0.5669070512820513, + "grad_norm": 0.9835435748100281, + "learning_rate": 0.0003658131919431784, + "loss": 0.6492, + "step": 25470 + }, + { + "epoch": 0.5671296296296297, + "grad_norm": 1.1097290515899658, + "learning_rate": 0.000365787118577729, + "loss": 0.52, + "step": 25480 + }, + { + "epoch": 0.567352207977208, + "grad_norm": 0.7063470482826233, + "learning_rate": 0.0003657610362032718, + "loss": 0.6542, + "step": 25490 + }, + { + "epoch": 0.5675747863247863, + "grad_norm": 1.0969784259796143, + "learning_rate": 0.00036573494482122423, + "loss": 0.68, + "step": 25500 + }, + { + "epoch": 0.5677973646723646, + "grad_norm": 0.7936034202575684, + "learning_rate": 0.00036570884443300406, + "loss": 0.5786, + "step": 25510 + }, + { + "epoch": 0.5680199430199431, + "grad_norm": 0.7072309851646423, + "learning_rate": 0.00036568273504002964, + "loss": 0.6274, + "step": 25520 + }, + { + "epoch": 0.5682425213675214, + "grad_norm": 1.030142903327942, + "learning_rate": 0.0003656566166437198, + "loss": 0.6594, + "step": 25530 + }, + { + "epoch": 0.5684650997150997, + "grad_norm": 0.7467629909515381, + "learning_rate": 0.00036563048924549376, + "loss": 0.5441, + "step": 25540 + }, + { + "epoch": 0.5686876780626781, + "grad_norm": 0.5796084403991699, + "learning_rate": 0.0003656043528467714, + "loss": 0.5869, + "step": 25550 + }, + { + "epoch": 0.5689102564102564, + "grad_norm": 0.5709526538848877, + "learning_rate": 0.00036557820744897285, + "loss": 0.5471, + "step": 25560 + }, + { + "epoch": 0.5691328347578347, + "grad_norm": 1.1228928565979004, + "learning_rate": 0.000365552053053519, + "loss": 0.7536, + "step": 25570 + }, + { + "epoch": 0.5693554131054132, + "grad_norm": 0.6233734488487244, + "learning_rate": 0.00036552588966183103, + "loss": 0.638, + "step": 25580 + }, + { + "epoch": 0.5695779914529915, + "grad_norm": 0.848446249961853, + "learning_rate": 0.00036549971727533074, + "loss": 0.5712, + "step": 25590 + }, + { + "epoch": 0.5698005698005698, + "grad_norm": 0.7704927921295166, + "learning_rate": 0.00036547353589544033, + "loss": 0.5454, + "step": 25600 + }, + { + "epoch": 0.5700231481481481, + "grad_norm": 0.5541343092918396, + "learning_rate": 0.00036544734552358254, + "loss": 0.5125, + "step": 25610 + }, + { + "epoch": 0.5702457264957265, + "grad_norm": 0.6036107540130615, + "learning_rate": 0.0003654211461611805, + "loss": 0.6403, + "step": 25620 + }, + { + "epoch": 0.5704683048433048, + "grad_norm": 0.996376097202301, + "learning_rate": 0.000365394937809658, + "loss": 0.608, + "step": 25630 + }, + { + "epoch": 0.5706908831908832, + "grad_norm": 0.5010185837745667, + "learning_rate": 0.0003653687204704391, + "loss": 0.4907, + "step": 25640 + }, + { + "epoch": 0.5709134615384616, + "grad_norm": 1.5402705669403076, + "learning_rate": 0.0003653424941449487, + "loss": 0.6147, + "step": 25650 + }, + { + "epoch": 0.5711360398860399, + "grad_norm": 0.9346346259117126, + "learning_rate": 0.0003653162588346117, + "loss": 0.7224, + "step": 25660 + }, + { + "epoch": 0.5713586182336182, + "grad_norm": 0.8029820919036865, + "learning_rate": 0.00036529001454085387, + "loss": 0.5599, + "step": 25670 + }, + { + "epoch": 0.5715811965811965, + "grad_norm": 0.7116772532463074, + "learning_rate": 0.00036526376126510136, + "loss": 0.6636, + "step": 25680 + }, + { + "epoch": 0.5718037749287749, + "grad_norm": 0.4518112242221832, + "learning_rate": 0.0003652374990087807, + "loss": 0.6618, + "step": 25690 + }, + { + "epoch": 0.5720263532763533, + "grad_norm": 0.6178964376449585, + "learning_rate": 0.0003652112277733192, + "loss": 0.4945, + "step": 25700 + }, + { + "epoch": 0.5722489316239316, + "grad_norm": 0.5029324889183044, + "learning_rate": 0.0003651849475601443, + "loss": 0.5303, + "step": 25710 + }, + { + "epoch": 0.57247150997151, + "grad_norm": 0.6093102693557739, + "learning_rate": 0.00036515865837068406, + "loss": 0.5364, + "step": 25720 + }, + { + "epoch": 0.5726940883190883, + "grad_norm": 0.6700037121772766, + "learning_rate": 0.0003651323602063672, + "loss": 0.6722, + "step": 25730 + }, + { + "epoch": 0.5729166666666666, + "grad_norm": 0.5653918385505676, + "learning_rate": 0.0003651060530686226, + "loss": 0.5539, + "step": 25740 + }, + { + "epoch": 0.5731392450142451, + "grad_norm": 1.0705516338348389, + "learning_rate": 0.00036507973695888, + "loss": 0.7888, + "step": 25750 + }, + { + "epoch": 0.5733618233618234, + "grad_norm": 0.5369784235954285, + "learning_rate": 0.0003650534118785693, + "loss": 0.5768, + "step": 25760 + }, + { + "epoch": 0.5735844017094017, + "grad_norm": 0.7054560780525208, + "learning_rate": 0.0003650270778291211, + "loss": 0.6524, + "step": 25770 + }, + { + "epoch": 0.57380698005698, + "grad_norm": 0.5634106397628784, + "learning_rate": 0.00036500073481196646, + "loss": 0.6372, + "step": 25780 + }, + { + "epoch": 0.5740295584045584, + "grad_norm": 0.6334834098815918, + "learning_rate": 0.0003649743828285368, + "loss": 0.6347, + "step": 25790 + }, + { + "epoch": 0.5742521367521367, + "grad_norm": 0.8807213306427002, + "learning_rate": 0.00036494802188026396, + "loss": 0.6003, + "step": 25800 + }, + { + "epoch": 0.5744747150997151, + "grad_norm": 0.8929731845855713, + "learning_rate": 0.0003649216519685807, + "loss": 0.7382, + "step": 25810 + }, + { + "epoch": 0.5746972934472935, + "grad_norm": 0.43042662739753723, + "learning_rate": 0.0003648952730949199, + "loss": 0.6431, + "step": 25820 + }, + { + "epoch": 0.5749198717948718, + "grad_norm": 0.6679039597511292, + "learning_rate": 0.00036486888526071496, + "loss": 0.7355, + "step": 25830 + }, + { + "epoch": 0.5751424501424501, + "grad_norm": 0.8297902941703796, + "learning_rate": 0.00036484248846739976, + "loss": 0.7014, + "step": 25840 + }, + { + "epoch": 0.5753650284900285, + "grad_norm": 0.4913148283958435, + "learning_rate": 0.0003648160827164088, + "loss": 0.5351, + "step": 25850 + }, + { + "epoch": 0.5755876068376068, + "grad_norm": 0.7515653371810913, + "learning_rate": 0.000364789668009177, + "loss": 0.6329, + "step": 25860 + }, + { + "epoch": 0.5758101851851852, + "grad_norm": 0.7551255822181702, + "learning_rate": 0.0003647632443471398, + "loss": 0.6201, + "step": 25870 + }, + { + "epoch": 0.5760327635327636, + "grad_norm": 0.8179723024368286, + "learning_rate": 0.00036473681173173294, + "loss": 0.6514, + "step": 25880 + }, + { + "epoch": 0.5762553418803419, + "grad_norm": 0.4706272482872009, + "learning_rate": 0.00036471037016439287, + "loss": 0.7006, + "step": 25890 + }, + { + "epoch": 0.5764779202279202, + "grad_norm": 0.6636437773704529, + "learning_rate": 0.0003646839196465565, + "loss": 0.6523, + "step": 25900 + }, + { + "epoch": 0.5767004985754985, + "grad_norm": 0.7169828414916992, + "learning_rate": 0.0003646574601796611, + "loss": 0.6445, + "step": 25910 + }, + { + "epoch": 0.5769230769230769, + "grad_norm": 0.7686861753463745, + "learning_rate": 0.00036463099176514447, + "loss": 0.5783, + "step": 25920 + }, + { + "epoch": 0.5771456552706553, + "grad_norm": 0.6391460299491882, + "learning_rate": 0.000364604514404445, + "loss": 0.5322, + "step": 25930 + }, + { + "epoch": 0.5773682336182336, + "grad_norm": 0.79499751329422, + "learning_rate": 0.0003645780280990015, + "loss": 0.659, + "step": 25940 + }, + { + "epoch": 0.577590811965812, + "grad_norm": 0.7389349341392517, + "learning_rate": 0.0003645515328502532, + "loss": 0.5739, + "step": 25950 + }, + { + "epoch": 0.5778133903133903, + "grad_norm": 0.8088229894638062, + "learning_rate": 0.0003645250286596399, + "loss": 0.528, + "step": 25960 + }, + { + "epoch": 0.5780359686609686, + "grad_norm": 0.4672555923461914, + "learning_rate": 0.00036449851552860184, + "loss": 0.6496, + "step": 25970 + }, + { + "epoch": 0.5782585470085471, + "grad_norm": 0.7340976595878601, + "learning_rate": 0.00036447199345857983, + "loss": 0.7119, + "step": 25980 + }, + { + "epoch": 0.5784811253561254, + "grad_norm": 0.5094794631004333, + "learning_rate": 0.000364445462451015, + "loss": 0.6694, + "step": 25990 + }, + { + "epoch": 0.5787037037037037, + "grad_norm": 0.6030073165893555, + "learning_rate": 0.00036441892250734914, + "loss": 0.6226, + "step": 26000 + }, + { + "epoch": 0.578926282051282, + "grad_norm": 0.7402897477149963, + "learning_rate": 0.0003643923736290244, + "loss": 0.6389, + "step": 26010 + }, + { + "epoch": 0.5791488603988604, + "grad_norm": 0.48543375730514526, + "learning_rate": 0.00036436581581748357, + "loss": 0.5502, + "step": 26020 + }, + { + "epoch": 0.5793714387464387, + "grad_norm": 0.7590353488922119, + "learning_rate": 0.0003643392490741697, + "loss": 0.715, + "step": 26030 + }, + { + "epoch": 0.5795940170940171, + "grad_norm": 0.7811538577079773, + "learning_rate": 0.0003643126734005265, + "loss": 0.6448, + "step": 26040 + }, + { + "epoch": 0.5798165954415955, + "grad_norm": 0.7644380927085876, + "learning_rate": 0.00036428608879799816, + "loss": 0.6684, + "step": 26050 + }, + { + "epoch": 0.5800391737891738, + "grad_norm": 0.7908637523651123, + "learning_rate": 0.0003642594952680292, + "loss": 0.7409, + "step": 26060 + }, + { + "epoch": 0.5802617521367521, + "grad_norm": 0.539625346660614, + "learning_rate": 0.00036423289281206487, + "loss": 0.6192, + "step": 26070 + }, + { + "epoch": 0.5804843304843305, + "grad_norm": 1.0244312286376953, + "learning_rate": 0.0003642062814315506, + "loss": 0.7173, + "step": 26080 + }, + { + "epoch": 0.5807069088319088, + "grad_norm": 0.5301309823989868, + "learning_rate": 0.0003641796611279327, + "loss": 0.5922, + "step": 26090 + }, + { + "epoch": 0.5809294871794872, + "grad_norm": 0.769178569316864, + "learning_rate": 0.00036415303190265747, + "loss": 0.5554, + "step": 26100 + }, + { + "epoch": 0.5811520655270656, + "grad_norm": 0.47896939516067505, + "learning_rate": 0.0003641263937571722, + "loss": 0.5712, + "step": 26110 + }, + { + "epoch": 0.5813746438746439, + "grad_norm": 0.49155372381210327, + "learning_rate": 0.0003640997466929243, + "loss": 0.5569, + "step": 26120 + }, + { + "epoch": 0.5815972222222222, + "grad_norm": 0.9530248641967773, + "learning_rate": 0.00036407309071136184, + "loss": 0.7337, + "step": 26130 + }, + { + "epoch": 0.5818198005698005, + "grad_norm": 0.639849841594696, + "learning_rate": 0.0003640464258139333, + "loss": 0.7232, + "step": 26140 + }, + { + "epoch": 0.5820423789173789, + "grad_norm": 0.5914270877838135, + "learning_rate": 0.0003640197520020877, + "loss": 0.6236, + "step": 26150 + }, + { + "epoch": 0.5822649572649573, + "grad_norm": 1.2797378301620483, + "learning_rate": 0.00036399306927727446, + "loss": 0.6709, + "step": 26160 + }, + { + "epoch": 0.5824875356125356, + "grad_norm": 0.7411321997642517, + "learning_rate": 0.00036396637764094365, + "loss": 0.6829, + "step": 26170 + }, + { + "epoch": 0.582710113960114, + "grad_norm": 0.7942419648170471, + "learning_rate": 0.0003639396770945456, + "loss": 0.6094, + "step": 26180 + }, + { + "epoch": 0.5829326923076923, + "grad_norm": 0.5377127528190613, + "learning_rate": 0.0003639129676395313, + "loss": 0.5491, + "step": 26190 + }, + { + "epoch": 0.5831552706552706, + "grad_norm": 0.6508163213729858, + "learning_rate": 0.00036388624927735224, + "loss": 0.6355, + "step": 26200 + }, + { + "epoch": 0.5833778490028491, + "grad_norm": 0.761579692363739, + "learning_rate": 0.0003638595220094601, + "loss": 0.509, + "step": 26210 + }, + { + "epoch": 0.5836004273504274, + "grad_norm": 0.6185252666473389, + "learning_rate": 0.00036383278583730747, + "loss": 0.6034, + "step": 26220 + }, + { + "epoch": 0.5838230056980057, + "grad_norm": 0.7071433067321777, + "learning_rate": 0.0003638060407623471, + "loss": 0.4562, + "step": 26230 + }, + { + "epoch": 0.584045584045584, + "grad_norm": 0.4515877068042755, + "learning_rate": 0.00036377928678603237, + "loss": 0.5233, + "step": 26240 + }, + { + "epoch": 0.5842681623931624, + "grad_norm": 0.5538159608840942, + "learning_rate": 0.0003637525239098172, + "loss": 0.4903, + "step": 26250 + }, + { + "epoch": 0.5844907407407407, + "grad_norm": 0.6422973275184631, + "learning_rate": 0.00036372575213515577, + "loss": 0.6706, + "step": 26260 + }, + { + "epoch": 0.5847133190883191, + "grad_norm": 0.6675984263420105, + "learning_rate": 0.0003636989714635029, + "loss": 0.6804, + "step": 26270 + }, + { + "epoch": 0.5849358974358975, + "grad_norm": 0.6627989411354065, + "learning_rate": 0.0003636721818963139, + "loss": 0.6538, + "step": 26280 + }, + { + "epoch": 0.5851584757834758, + "grad_norm": 0.5040997862815857, + "learning_rate": 0.0003636453834350446, + "loss": 0.5534, + "step": 26290 + }, + { + "epoch": 0.5853810541310541, + "grad_norm": 0.8540357351303101, + "learning_rate": 0.0003636185760811512, + "loss": 0.701, + "step": 26300 + }, + { + "epoch": 0.5856036324786325, + "grad_norm": 0.5554685592651367, + "learning_rate": 0.0003635917598360904, + "loss": 0.5512, + "step": 26310 + }, + { + "epoch": 0.5858262108262108, + "grad_norm": 0.6635841131210327, + "learning_rate": 0.0003635649347013195, + "loss": 0.6747, + "step": 26320 + }, + { + "epoch": 0.5860487891737892, + "grad_norm": 1.081513524055481, + "learning_rate": 0.00036353810067829616, + "loss": 0.5939, + "step": 26330 + }, + { + "epoch": 0.5862713675213675, + "grad_norm": 0.5436891913414001, + "learning_rate": 0.0003635112577684785, + "loss": 0.5855, + "step": 26340 + }, + { + "epoch": 0.5864939458689459, + "grad_norm": 0.586488664150238, + "learning_rate": 0.00036348440597332523, + "loss": 0.6242, + "step": 26350 + }, + { + "epoch": 0.5867165242165242, + "grad_norm": 0.5410413146018982, + "learning_rate": 0.00036345754529429553, + "loss": 0.5378, + "step": 26360 + }, + { + "epoch": 0.5869391025641025, + "grad_norm": 0.9179112315177917, + "learning_rate": 0.000363430675732849, + "loss": 0.6296, + "step": 26370 + }, + { + "epoch": 0.5871616809116809, + "grad_norm": 0.6292691826820374, + "learning_rate": 0.00036340379729044573, + "loss": 0.6738, + "step": 26380 + }, + { + "epoch": 0.5873842592592593, + "grad_norm": 0.47527605295181274, + "learning_rate": 0.00036337690996854637, + "loss": 0.6117, + "step": 26390 + }, + { + "epoch": 0.5876068376068376, + "grad_norm": 0.779073178768158, + "learning_rate": 0.00036335001376861203, + "loss": 0.6211, + "step": 26400 + }, + { + "epoch": 0.587829415954416, + "grad_norm": 0.765661358833313, + "learning_rate": 0.0003633231086921042, + "loss": 0.5958, + "step": 26410 + }, + { + "epoch": 0.5880519943019943, + "grad_norm": 0.7019172310829163, + "learning_rate": 0.00036329619474048485, + "loss": 0.5939, + "step": 26420 + }, + { + "epoch": 0.5882745726495726, + "grad_norm": 0.4380790591239929, + "learning_rate": 0.00036326927191521663, + "loss": 0.6325, + "step": 26430 + }, + { + "epoch": 0.5884971509971509, + "grad_norm": 0.6067693829536438, + "learning_rate": 0.0003632423402177626, + "loss": 0.5798, + "step": 26440 + }, + { + "epoch": 0.5887197293447294, + "grad_norm": 0.6987513899803162, + "learning_rate": 0.00036321539964958606, + "loss": 0.6136, + "step": 26450 + }, + { + "epoch": 0.5889423076923077, + "grad_norm": 0.7827839255332947, + "learning_rate": 0.0003631884502121511, + "loss": 0.5435, + "step": 26460 + }, + { + "epoch": 0.589164886039886, + "grad_norm": 0.6228511333465576, + "learning_rate": 0.00036316149190692223, + "loss": 0.5427, + "step": 26470 + }, + { + "epoch": 0.5893874643874644, + "grad_norm": 0.7204742431640625, + "learning_rate": 0.00036313452473536425, + "loss": 0.6434, + "step": 26480 + }, + { + "epoch": 0.5896100427350427, + "grad_norm": 0.6403375864028931, + "learning_rate": 0.0003631075486989427, + "loss": 0.6231, + "step": 26490 + }, + { + "epoch": 0.5898326210826211, + "grad_norm": 0.46823328733444214, + "learning_rate": 0.00036308056379912344, + "loss": 0.5668, + "step": 26500 + }, + { + "epoch": 0.5900551994301995, + "grad_norm": 0.5952900052070618, + "learning_rate": 0.00036305357003737284, + "loss": 0.5498, + "step": 26510 + }, + { + "epoch": 0.5902777777777778, + "grad_norm": 0.8604204654693604, + "learning_rate": 0.0003630265674151577, + "loss": 0.6581, + "step": 26520 + }, + { + "epoch": 0.5905003561253561, + "grad_norm": 1.0747565031051636, + "learning_rate": 0.00036299955593394544, + "loss": 0.6561, + "step": 26530 + }, + { + "epoch": 0.5907229344729344, + "grad_norm": 0.7867896556854248, + "learning_rate": 0.0003629725355952039, + "loss": 0.668, + "step": 26540 + }, + { + "epoch": 0.5909455128205128, + "grad_norm": 0.6911695599555969, + "learning_rate": 0.0003629455064004014, + "loss": 0.7318, + "step": 26550 + }, + { + "epoch": 0.5911680911680912, + "grad_norm": 0.65760737657547, + "learning_rate": 0.00036291846835100663, + "loss": 0.709, + "step": 26560 + }, + { + "epoch": 0.5913906695156695, + "grad_norm": 0.5564249157905579, + "learning_rate": 0.0003628914214484889, + "loss": 0.6305, + "step": 26570 + }, + { + "epoch": 0.5916132478632479, + "grad_norm": 0.6941995620727539, + "learning_rate": 0.00036286436569431805, + "loss": 0.7899, + "step": 26580 + }, + { + "epoch": 0.5918358262108262, + "grad_norm": 0.4514349699020386, + "learning_rate": 0.0003628373010899642, + "loss": 0.5176, + "step": 26590 + }, + { + "epoch": 0.5920584045584045, + "grad_norm": 0.4993559420108795, + "learning_rate": 0.0003628102276368981, + "loss": 0.6228, + "step": 26600 + }, + { + "epoch": 0.5922809829059829, + "grad_norm": 0.6619306802749634, + "learning_rate": 0.00036278314533659095, + "loss": 0.6596, + "step": 26610 + }, + { + "epoch": 0.5925035612535613, + "grad_norm": 0.6388404965400696, + "learning_rate": 0.0003627560541905144, + "loss": 0.5418, + "step": 26620 + }, + { + "epoch": 0.5927261396011396, + "grad_norm": 0.7671358585357666, + "learning_rate": 0.00036272895420014066, + "loss": 0.6312, + "step": 26630 + }, + { + "epoch": 0.592948717948718, + "grad_norm": 0.4864520728588104, + "learning_rate": 0.0003627018453669423, + "loss": 0.528, + "step": 26640 + }, + { + "epoch": 0.5931712962962963, + "grad_norm": 0.6411517262458801, + "learning_rate": 0.0003626747276923925, + "loss": 0.5324, + "step": 26650 + }, + { + "epoch": 0.5933938746438746, + "grad_norm": 0.5414084792137146, + "learning_rate": 0.00036264760117796484, + "loss": 0.5629, + "step": 26660 + }, + { + "epoch": 0.5936164529914529, + "grad_norm": 0.7658395171165466, + "learning_rate": 0.00036262046582513337, + "loss": 0.6198, + "step": 26670 + }, + { + "epoch": 0.5938390313390314, + "grad_norm": 0.5669873356819153, + "learning_rate": 0.00036259332163537266, + "loss": 0.5935, + "step": 26680 + }, + { + "epoch": 0.5940616096866097, + "grad_norm": 0.635143518447876, + "learning_rate": 0.0003625661686101578, + "loss": 0.5584, + "step": 26690 + }, + { + "epoch": 0.594284188034188, + "grad_norm": 0.8045666217803955, + "learning_rate": 0.0003625390067509641, + "loss": 0.5575, + "step": 26700 + }, + { + "epoch": 0.5945067663817664, + "grad_norm": 0.5953568816184998, + "learning_rate": 0.0003625118360592678, + "loss": 0.6082, + "step": 26710 + }, + { + "epoch": 0.5947293447293447, + "grad_norm": 0.6741862893104553, + "learning_rate": 0.0003624846565365453, + "loss": 0.626, + "step": 26720 + }, + { + "epoch": 0.5949519230769231, + "grad_norm": 0.4664577841758728, + "learning_rate": 0.0003624574681842736, + "loss": 0.6427, + "step": 26730 + }, + { + "epoch": 0.5951745014245015, + "grad_norm": 0.9127620458602905, + "learning_rate": 0.00036243027100393, + "loss": 0.7419, + "step": 26740 + }, + { + "epoch": 0.5953970797720798, + "grad_norm": 0.6853809952735901, + "learning_rate": 0.00036240306499699256, + "loss": 0.5637, + "step": 26750 + }, + { + "epoch": 0.5956196581196581, + "grad_norm": 0.8793456554412842, + "learning_rate": 0.00036237585016493955, + "loss": 0.7329, + "step": 26760 + }, + { + "epoch": 0.5958422364672364, + "grad_norm": 0.7338986396789551, + "learning_rate": 0.00036234862650925, + "loss": 0.7303, + "step": 26770 + }, + { + "epoch": 0.5960648148148148, + "grad_norm": 1.601108431816101, + "learning_rate": 0.00036232139403140313, + "loss": 0.7491, + "step": 26780 + }, + { + "epoch": 0.5962873931623932, + "grad_norm": 0.8078438639640808, + "learning_rate": 0.0003622941527328788, + "loss": 0.6711, + "step": 26790 + }, + { + "epoch": 0.5965099715099715, + "grad_norm": 0.7182638049125671, + "learning_rate": 0.00036226690261515734, + "loss": 0.6057, + "step": 26800 + }, + { + "epoch": 0.5967325498575499, + "grad_norm": 0.5949748158454895, + "learning_rate": 0.0003622396436797196, + "loss": 0.6201, + "step": 26810 + }, + { + "epoch": 0.5969551282051282, + "grad_norm": 0.6344726085662842, + "learning_rate": 0.0003622123759280468, + "loss": 0.5434, + "step": 26820 + }, + { + "epoch": 0.5971777065527065, + "grad_norm": 0.6737859845161438, + "learning_rate": 0.00036218509936162077, + "loss": 0.6778, + "step": 26830 + }, + { + "epoch": 0.5974002849002849, + "grad_norm": 0.6358687281608582, + "learning_rate": 0.0003621578139819236, + "loss": 0.5214, + "step": 26840 + }, + { + "epoch": 0.5976228632478633, + "grad_norm": 0.6937312483787537, + "learning_rate": 0.00036213051979043807, + "loss": 0.5482, + "step": 26850 + }, + { + "epoch": 0.5978454415954416, + "grad_norm": 0.809160053730011, + "learning_rate": 0.0003621032167886473, + "loss": 0.6438, + "step": 26860 + }, + { + "epoch": 0.59806801994302, + "grad_norm": 0.7212644815444946, + "learning_rate": 0.0003620759049780351, + "loss": 0.6243, + "step": 26870 + }, + { + "epoch": 0.5982905982905983, + "grad_norm": 0.8717097640037537, + "learning_rate": 0.0003620485843600856, + "loss": 0.6122, + "step": 26880 + }, + { + "epoch": 0.5985131766381766, + "grad_norm": 0.546472430229187, + "learning_rate": 0.00036202125493628326, + "loss": 0.6884, + "step": 26890 + }, + { + "epoch": 0.5987357549857549, + "grad_norm": 0.8792861700057983, + "learning_rate": 0.00036199391670811335, + "loss": 0.7374, + "step": 26900 + }, + { + "epoch": 0.5989583333333334, + "grad_norm": 0.6809139251708984, + "learning_rate": 0.0003619665696770614, + "loss": 0.6165, + "step": 26910 + }, + { + "epoch": 0.5991809116809117, + "grad_norm": 0.7540663480758667, + "learning_rate": 0.00036193921384461344, + "loss": 0.605, + "step": 26920 + }, + { + "epoch": 0.59940349002849, + "grad_norm": 0.6481123566627502, + "learning_rate": 0.000361911849212256, + "loss": 0.6943, + "step": 26930 + }, + { + "epoch": 0.5996260683760684, + "grad_norm": 0.8331771492958069, + "learning_rate": 0.00036188447578147615, + "loss": 0.737, + "step": 26940 + }, + { + "epoch": 0.5998486467236467, + "grad_norm": 0.6901444792747498, + "learning_rate": 0.0003618570935537614, + "loss": 0.6721, + "step": 26950 + }, + { + "epoch": 0.6000712250712251, + "grad_norm": 0.763639509677887, + "learning_rate": 0.00036182970253059965, + "loss": 0.554, + "step": 26960 + }, + { + "epoch": 0.6000712250712251, + "eval_loss": 0.6276374459266663, + "eval_runtime": 337.4029, + "eval_samples_per_second": 7.009, + "eval_steps_per_second": 7.009, + "step": 26960 + }, + { + "epoch": 0.6002938034188035, + "grad_norm": 0.6949964761734009, + "learning_rate": 0.0003618023027134794, + "loss": 0.6418, + "step": 26970 + }, + { + "epoch": 0.6005163817663818, + "grad_norm": 0.8426802754402161, + "learning_rate": 0.00036177489410388954, + "loss": 0.5959, + "step": 26980 + }, + { + "epoch": 0.6007389601139601, + "grad_norm": 0.709507167339325, + "learning_rate": 0.0003617474767033195, + "loss": 0.6305, + "step": 26990 + }, + { + "epoch": 0.6009615384615384, + "grad_norm": 0.7175819873809814, + "learning_rate": 0.00036172005051325916, + "loss": 0.4853, + "step": 27000 + }, + { + "epoch": 0.6011841168091168, + "grad_norm": 0.4408554434776306, + "learning_rate": 0.00036169261553519887, + "loss": 0.6845, + "step": 27010 + }, + { + "epoch": 0.6014066951566952, + "grad_norm": 0.7172206044197083, + "learning_rate": 0.00036166517177062957, + "loss": 0.5734, + "step": 27020 + }, + { + "epoch": 0.6016292735042735, + "grad_norm": 1.5862455368041992, + "learning_rate": 0.00036163771922104236, + "loss": 0.6894, + "step": 27030 + }, + { + "epoch": 0.6018518518518519, + "grad_norm": 0.9772451519966125, + "learning_rate": 0.0003616102578879293, + "loss": 0.6468, + "step": 27040 + }, + { + "epoch": 0.6020744301994302, + "grad_norm": 1.0505484342575073, + "learning_rate": 0.00036158278777278244, + "loss": 0.5143, + "step": 27050 + }, + { + "epoch": 0.6022970085470085, + "grad_norm": 0.5887817740440369, + "learning_rate": 0.0003615553088770946, + "loss": 0.5918, + "step": 27060 + }, + { + "epoch": 0.6025195868945868, + "grad_norm": 0.7041050791740417, + "learning_rate": 0.0003615278212023591, + "loss": 0.5378, + "step": 27070 + }, + { + "epoch": 0.6027421652421653, + "grad_norm": 0.8429660201072693, + "learning_rate": 0.00036150032475006945, + "loss": 0.7211, + "step": 27080 + }, + { + "epoch": 0.6029647435897436, + "grad_norm": 0.8379825353622437, + "learning_rate": 0.00036147281952172, + "loss": 0.671, + "step": 27090 + }, + { + "epoch": 0.6031873219373219, + "grad_norm": 0.9291252493858337, + "learning_rate": 0.0003614453055188054, + "loss": 0.581, + "step": 27100 + }, + { + "epoch": 0.6034099002849003, + "grad_norm": 0.8719322681427002, + "learning_rate": 0.0003614177827428207, + "loss": 0.6302, + "step": 27110 + }, + { + "epoch": 0.6036324786324786, + "grad_norm": 0.6128899455070496, + "learning_rate": 0.0003613902511952615, + "loss": 0.7683, + "step": 27120 + }, + { + "epoch": 0.6038550569800569, + "grad_norm": 0.9219420552253723, + "learning_rate": 0.00036136271087762396, + "loss": 0.5287, + "step": 27130 + }, + { + "epoch": 0.6040776353276354, + "grad_norm": 0.6347882747650146, + "learning_rate": 0.0003613351617914047, + "loss": 0.6586, + "step": 27140 + }, + { + "epoch": 0.6043002136752137, + "grad_norm": 0.7034119963645935, + "learning_rate": 0.0003613076039381006, + "loss": 0.5183, + "step": 27150 + }, + { + "epoch": 0.604522792022792, + "grad_norm": 0.8866810202598572, + "learning_rate": 0.00036128003731920915, + "loss": 0.6867, + "step": 27160 + }, + { + "epoch": 0.6047453703703703, + "grad_norm": 0.9018061757087708, + "learning_rate": 0.0003612524619362286, + "loss": 0.5517, + "step": 27170 + }, + { + "epoch": 0.6049679487179487, + "grad_norm": 0.6314164400100708, + "learning_rate": 0.00036122487779065716, + "loss": 0.6991, + "step": 27180 + }, + { + "epoch": 0.6051905270655271, + "grad_norm": 0.7263935804367065, + "learning_rate": 0.00036119728488399395, + "loss": 0.5825, + "step": 27190 + }, + { + "epoch": 0.6054131054131054, + "grad_norm": 0.6667831540107727, + "learning_rate": 0.00036116968321773824, + "loss": 0.5671, + "step": 27200 + }, + { + "epoch": 0.6056356837606838, + "grad_norm": 0.8208069801330566, + "learning_rate": 0.0003611420727933901, + "loss": 0.6483, + "step": 27210 + }, + { + "epoch": 0.6058582621082621, + "grad_norm": 0.7313910126686096, + "learning_rate": 0.00036111445361244974, + "loss": 0.585, + "step": 27220 + }, + { + "epoch": 0.6060808404558404, + "grad_norm": 0.6981318593025208, + "learning_rate": 0.00036108682567641807, + "loss": 0.6057, + "step": 27230 + }, + { + "epoch": 0.6063034188034188, + "grad_norm": 0.40817198157310486, + "learning_rate": 0.00036105918898679643, + "loss": 0.5288, + "step": 27240 + }, + { + "epoch": 0.6065259971509972, + "grad_norm": 1.2144339084625244, + "learning_rate": 0.0003610315435450866, + "loss": 0.8191, + "step": 27250 + }, + { + "epoch": 0.6067485754985755, + "grad_norm": 0.712494432926178, + "learning_rate": 0.0003610038893527909, + "loss": 0.5641, + "step": 27260 + }, + { + "epoch": 0.6069711538461539, + "grad_norm": 0.724338710308075, + "learning_rate": 0.000360976226411412, + "loss": 0.589, + "step": 27270 + }, + { + "epoch": 0.6071937321937322, + "grad_norm": 0.6945069432258606, + "learning_rate": 0.00036094855472245323, + "loss": 0.6487, + "step": 27280 + }, + { + "epoch": 0.6074163105413105, + "grad_norm": 0.6646556854248047, + "learning_rate": 0.0003609208742874182, + "loss": 0.6672, + "step": 27290 + }, + { + "epoch": 0.6076388888888888, + "grad_norm": 0.6772031188011169, + "learning_rate": 0.0003608931851078111, + "loss": 0.5538, + "step": 27300 + }, + { + "epoch": 0.6078614672364673, + "grad_norm": 0.4915063679218292, + "learning_rate": 0.00036086548718513667, + "loss": 0.7779, + "step": 27310 + }, + { + "epoch": 0.6080840455840456, + "grad_norm": 0.669331431388855, + "learning_rate": 0.0003608377805209, + "loss": 0.6369, + "step": 27320 + }, + { + "epoch": 0.6083066239316239, + "grad_norm": 0.5951343178749084, + "learning_rate": 0.00036081006511660664, + "loss": 0.6196, + "step": 27330 + }, + { + "epoch": 0.6085292022792023, + "grad_norm": 0.6411831378936768, + "learning_rate": 0.0003607823409737627, + "loss": 0.4927, + "step": 27340 + }, + { + "epoch": 0.6087517806267806, + "grad_norm": 0.7948203682899475, + "learning_rate": 0.00036075460809387465, + "loss": 0.772, + "step": 27350 + }, + { + "epoch": 0.6089743589743589, + "grad_norm": 0.5307528972625732, + "learning_rate": 0.00036072686647844966, + "loss": 0.5684, + "step": 27360 + }, + { + "epoch": 0.6091969373219374, + "grad_norm": 0.9788020253181458, + "learning_rate": 0.0003606991161289952, + "loss": 0.5741, + "step": 27370 + }, + { + "epoch": 0.6094195156695157, + "grad_norm": 0.5573745369911194, + "learning_rate": 0.0003606713570470192, + "loss": 0.6287, + "step": 27380 + }, + { + "epoch": 0.609642094017094, + "grad_norm": 0.655276894569397, + "learning_rate": 0.00036064358923403007, + "loss": 0.4993, + "step": 27390 + }, + { + "epoch": 0.6098646723646723, + "grad_norm": 0.6795368194580078, + "learning_rate": 0.00036061581269153684, + "loss": 0.5322, + "step": 27400 + }, + { + "epoch": 0.6100872507122507, + "grad_norm": 0.8472557067871094, + "learning_rate": 0.00036058802742104884, + "loss": 0.6817, + "step": 27410 + }, + { + "epoch": 0.6103098290598291, + "grad_norm": 0.38561925292015076, + "learning_rate": 0.00036056023342407597, + "loss": 0.548, + "step": 27420 + }, + { + "epoch": 0.6105324074074074, + "grad_norm": 0.5530402064323425, + "learning_rate": 0.0003605324307021286, + "loss": 0.599, + "step": 27430 + }, + { + "epoch": 0.6107549857549858, + "grad_norm": 0.6635321974754333, + "learning_rate": 0.00036050461925671756, + "loss": 0.5592, + "step": 27440 + }, + { + "epoch": 0.6109775641025641, + "grad_norm": 0.6213598847389221, + "learning_rate": 0.000360476799089354, + "loss": 0.6556, + "step": 27450 + }, + { + "epoch": 0.6112001424501424, + "grad_norm": 0.8107698559761047, + "learning_rate": 0.0003604489702015499, + "loss": 0.6625, + "step": 27460 + }, + { + "epoch": 0.6114227207977208, + "grad_norm": 0.8441813588142395, + "learning_rate": 0.0003604211325948174, + "loss": 0.659, + "step": 27470 + }, + { + "epoch": 0.6116452991452992, + "grad_norm": 0.959496021270752, + "learning_rate": 0.00036039328627066915, + "loss": 0.7157, + "step": 27480 + }, + { + "epoch": 0.6118678774928775, + "grad_norm": 1.1635382175445557, + "learning_rate": 0.0003603654312306185, + "loss": 0.6214, + "step": 27490 + }, + { + "epoch": 0.6120904558404558, + "grad_norm": 0.6326111555099487, + "learning_rate": 0.000360337567476179, + "loss": 0.6334, + "step": 27500 + }, + { + "epoch": 0.6123130341880342, + "grad_norm": 0.41505324840545654, + "learning_rate": 0.00036030969500886487, + "loss": 0.5372, + "step": 27510 + }, + { + "epoch": 0.6125356125356125, + "grad_norm": 0.7522630095481873, + "learning_rate": 0.00036028181383019063, + "loss": 0.6667, + "step": 27520 + }, + { + "epoch": 0.6127581908831908, + "grad_norm": 0.7173981070518494, + "learning_rate": 0.00036025392394167143, + "loss": 0.5472, + "step": 27530 + }, + { + "epoch": 0.6129807692307693, + "grad_norm": 0.5531005859375, + "learning_rate": 0.0003602260253448228, + "loss": 0.5993, + "step": 27540 + }, + { + "epoch": 0.6132033475783476, + "grad_norm": 0.9125895500183105, + "learning_rate": 0.00036019811804116077, + "loss": 0.5588, + "step": 27550 + }, + { + "epoch": 0.6134259259259259, + "grad_norm": 0.6552771329879761, + "learning_rate": 0.00036017020203220194, + "loss": 0.4663, + "step": 27560 + }, + { + "epoch": 0.6136485042735043, + "grad_norm": 0.9296733736991882, + "learning_rate": 0.00036014227731946316, + "loss": 0.6318, + "step": 27570 + }, + { + "epoch": 0.6138710826210826, + "grad_norm": 0.82627272605896, + "learning_rate": 0.0003601143439044619, + "loss": 0.6888, + "step": 27580 + }, + { + "epoch": 0.6140936609686609, + "grad_norm": 0.8942478895187378, + "learning_rate": 0.0003600864017887162, + "loss": 0.713, + "step": 27590 + }, + { + "epoch": 0.6143162393162394, + "grad_norm": 1.0062716007232666, + "learning_rate": 0.0003600584509737443, + "loss": 0.5045, + "step": 27600 + }, + { + "epoch": 0.6145388176638177, + "grad_norm": 0.904301643371582, + "learning_rate": 0.00036003049146106516, + "loss": 0.5635, + "step": 27610 + }, + { + "epoch": 0.614761396011396, + "grad_norm": 0.5842049717903137, + "learning_rate": 0.0003600025232521981, + "loss": 0.7147, + "step": 27620 + }, + { + "epoch": 0.6149839743589743, + "grad_norm": 0.3948265016078949, + "learning_rate": 0.0003599745463486629, + "loss": 0.6557, + "step": 27630 + }, + { + "epoch": 0.6152065527065527, + "grad_norm": 0.9642991423606873, + "learning_rate": 0.00035994656075198, + "loss": 0.6969, + "step": 27640 + }, + { + "epoch": 0.6154291310541311, + "grad_norm": 0.5985532999038696, + "learning_rate": 0.00035991856646367, + "loss": 0.5313, + "step": 27650 + }, + { + "epoch": 0.6156517094017094, + "grad_norm": 0.510049045085907, + "learning_rate": 0.00035989056348525414, + "loss": 0.6087, + "step": 27660 + }, + { + "epoch": 0.6158742877492878, + "grad_norm": 0.41505852341651917, + "learning_rate": 0.00035986255181825425, + "loss": 0.4565, + "step": 27670 + }, + { + "epoch": 0.6160968660968661, + "grad_norm": 0.7733290791511536, + "learning_rate": 0.00035983453146419233, + "loss": 0.6904, + "step": 27680 + }, + { + "epoch": 0.6163194444444444, + "grad_norm": 0.8328065872192383, + "learning_rate": 0.0003598065024245912, + "loss": 0.607, + "step": 27690 + }, + { + "epoch": 0.6165420227920227, + "grad_norm": 0.5937551856040955, + "learning_rate": 0.00035977846470097393, + "loss": 0.6013, + "step": 27700 + }, + { + "epoch": 0.6167646011396012, + "grad_norm": 0.4843963086605072, + "learning_rate": 0.000359750418294864, + "loss": 0.6265, + "step": 27710 + }, + { + "epoch": 0.6169871794871795, + "grad_norm": 0.6373627185821533, + "learning_rate": 0.00035972236320778555, + "loss": 0.5394, + "step": 27720 + }, + { + "epoch": 0.6172097578347578, + "grad_norm": 0.6972571611404419, + "learning_rate": 0.0003596942994412632, + "loss": 0.7018, + "step": 27730 + }, + { + "epoch": 0.6174323361823362, + "grad_norm": 0.6809747219085693, + "learning_rate": 0.00035966622699682186, + "loss": 0.6933, + "step": 27740 + }, + { + "epoch": 0.6176549145299145, + "grad_norm": 0.6674181818962097, + "learning_rate": 0.0003596381458759871, + "loss": 0.6342, + "step": 27750 + }, + { + "epoch": 0.6178774928774928, + "grad_norm": 0.5982582569122314, + "learning_rate": 0.0003596100560802847, + "loss": 0.5072, + "step": 27760 + }, + { + "epoch": 0.6181000712250713, + "grad_norm": 0.7734548449516296, + "learning_rate": 0.00035958195761124126, + "loss": 0.5844, + "step": 27770 + }, + { + "epoch": 0.6183226495726496, + "grad_norm": 0.5733859539031982, + "learning_rate": 0.00035955385047038355, + "loss": 0.5021, + "step": 27780 + }, + { + "epoch": 0.6185452279202279, + "grad_norm": 0.8854355216026306, + "learning_rate": 0.000359525734659239, + "loss": 0.7261, + "step": 27790 + }, + { + "epoch": 0.6187678062678063, + "grad_norm": 0.534376859664917, + "learning_rate": 0.00035949761017933546, + "loss": 0.5296, + "step": 27800 + }, + { + "epoch": 0.6189903846153846, + "grad_norm": 0.5967615246772766, + "learning_rate": 0.00035946947703220124, + "loss": 0.5746, + "step": 27810 + }, + { + "epoch": 0.6192129629629629, + "grad_norm": 0.9212782382965088, + "learning_rate": 0.0003594413352193651, + "loss": 0.6819, + "step": 27820 + }, + { + "epoch": 0.6194355413105413, + "grad_norm": 0.6893104910850525, + "learning_rate": 0.0003594131847423562, + "loss": 0.5115, + "step": 27830 + }, + { + "epoch": 0.6196581196581197, + "grad_norm": 0.46085241436958313, + "learning_rate": 0.0003593850256027044, + "loss": 0.6953, + "step": 27840 + }, + { + "epoch": 0.619880698005698, + "grad_norm": 0.5326361656188965, + "learning_rate": 0.00035935685780193974, + "loss": 0.5761, + "step": 27850 + }, + { + "epoch": 0.6201032763532763, + "grad_norm": 0.8955104947090149, + "learning_rate": 0.0003593286813415931, + "loss": 0.6468, + "step": 27860 + }, + { + "epoch": 0.6203258547008547, + "grad_norm": 0.6292840242385864, + "learning_rate": 0.00035930049622319535, + "loss": 0.5546, + "step": 27870 + }, + { + "epoch": 0.6205484330484331, + "grad_norm": 0.48575925827026367, + "learning_rate": 0.00035927230244827833, + "loss": 0.7017, + "step": 27880 + }, + { + "epoch": 0.6207710113960114, + "grad_norm": 0.7187725305557251, + "learning_rate": 0.00035924410001837395, + "loss": 0.6553, + "step": 27890 + }, + { + "epoch": 0.6209935897435898, + "grad_norm": 0.6629842519760132, + "learning_rate": 0.00035921588893501487, + "loss": 0.5627, + "step": 27900 + }, + { + "epoch": 0.6212161680911681, + "grad_norm": 0.8785918354988098, + "learning_rate": 0.00035918766919973395, + "loss": 0.6831, + "step": 27910 + }, + { + "epoch": 0.6214387464387464, + "grad_norm": 0.847978949546814, + "learning_rate": 0.0003591594408140649, + "loss": 0.6169, + "step": 27920 + }, + { + "epoch": 0.6216613247863247, + "grad_norm": 0.7552899122238159, + "learning_rate": 0.0003591312037795414, + "loss": 0.5807, + "step": 27930 + }, + { + "epoch": 0.6218839031339032, + "grad_norm": 0.5452701449394226, + "learning_rate": 0.0003591029580976981, + "loss": 0.7221, + "step": 27940 + }, + { + "epoch": 0.6221064814814815, + "grad_norm": 0.5792766213417053, + "learning_rate": 0.0003590747037700698, + "loss": 0.6582, + "step": 27950 + }, + { + "epoch": 0.6223290598290598, + "grad_norm": 0.7792395353317261, + "learning_rate": 0.0003590464407981919, + "loss": 0.7478, + "step": 27960 + }, + { + "epoch": 0.6225516381766382, + "grad_norm": 0.4148932993412018, + "learning_rate": 0.00035901816918360014, + "loss": 0.6012, + "step": 27970 + }, + { + "epoch": 0.6227742165242165, + "grad_norm": 0.8461902737617493, + "learning_rate": 0.00035898988892783096, + "loss": 0.6691, + "step": 27980 + }, + { + "epoch": 0.6229967948717948, + "grad_norm": 0.7921462059020996, + "learning_rate": 0.00035896160003242103, + "loss": 0.593, + "step": 27990 + }, + { + "epoch": 0.6232193732193733, + "grad_norm": 0.6527276635169983, + "learning_rate": 0.00035893330249890757, + "loss": 0.6857, + "step": 28000 + }, + { + "epoch": 0.6234419515669516, + "grad_norm": 0.35611993074417114, + "learning_rate": 0.0003589049963288284, + "loss": 0.7097, + "step": 28010 + }, + { + "epoch": 0.6236645299145299, + "grad_norm": 0.6560295224189758, + "learning_rate": 0.0003588766815237216, + "loss": 0.5787, + "step": 28020 + }, + { + "epoch": 0.6238871082621082, + "grad_norm": 0.669855535030365, + "learning_rate": 0.00035884835808512594, + "loss": 0.6569, + "step": 28030 + }, + { + "epoch": 0.6241096866096866, + "grad_norm": 0.386393278837204, + "learning_rate": 0.00035882002601458045, + "loss": 0.6923, + "step": 28040 + }, + { + "epoch": 0.6243322649572649, + "grad_norm": 0.5557284951210022, + "learning_rate": 0.0003587916853136247, + "loss": 0.606, + "step": 28050 + }, + { + "epoch": 0.6245548433048433, + "grad_norm": 0.5414696931838989, + "learning_rate": 0.00035876333598379873, + "loss": 0.5686, + "step": 28060 + }, + { + "epoch": 0.6247774216524217, + "grad_norm": 0.6101901531219482, + "learning_rate": 0.00035873497802664316, + "loss": 0.7799, + "step": 28070 + }, + { + "epoch": 0.625, + "grad_norm": 0.7300838232040405, + "learning_rate": 0.0003587066114436989, + "loss": 0.6007, + "step": 28080 + }, + { + "epoch": 0.6252225783475783, + "grad_norm": 0.42730575799942017, + "learning_rate": 0.00035867823623650754, + "loss": 0.5635, + "step": 28090 + }, + { + "epoch": 0.6254451566951567, + "grad_norm": 0.7427262663841248, + "learning_rate": 0.00035864985240661085, + "loss": 0.7147, + "step": 28100 + }, + { + "epoch": 0.6256677350427351, + "grad_norm": 0.6119892001152039, + "learning_rate": 0.00035862145995555134, + "loss": 0.6254, + "step": 28110 + }, + { + "epoch": 0.6258903133903134, + "grad_norm": 0.6742544770240784, + "learning_rate": 0.00035859305888487185, + "loss": 0.6997, + "step": 28120 + }, + { + "epoch": 0.6261128917378918, + "grad_norm": 0.7052304148674011, + "learning_rate": 0.0003585646491961157, + "loss": 0.5544, + "step": 28130 + }, + { + "epoch": 0.6263354700854701, + "grad_norm": 0.5233950018882751, + "learning_rate": 0.00035853623089082665, + "loss": 0.5589, + "step": 28140 + }, + { + "epoch": 0.6265580484330484, + "grad_norm": 0.6088318228721619, + "learning_rate": 0.0003585078039705491, + "loss": 0.8011, + "step": 28150 + }, + { + "epoch": 0.6267806267806267, + "grad_norm": 0.8205341100692749, + "learning_rate": 0.0003584793684368277, + "loss": 0.675, + "step": 28160 + }, + { + "epoch": 0.6270032051282052, + "grad_norm": 0.47185105085372925, + "learning_rate": 0.0003584509242912076, + "loss": 0.794, + "step": 28170 + }, + { + "epoch": 0.6272257834757835, + "grad_norm": 0.48162466287612915, + "learning_rate": 0.0003584224715352347, + "loss": 0.8218, + "step": 28180 + }, + { + "epoch": 0.6274483618233618, + "grad_norm": 0.6121693253517151, + "learning_rate": 0.0003583940101704549, + "loss": 0.6535, + "step": 28190 + }, + { + "epoch": 0.6276709401709402, + "grad_norm": 0.8618494868278503, + "learning_rate": 0.00035836554019841495, + "loss": 0.6823, + "step": 28200 + }, + { + "epoch": 0.6278935185185185, + "grad_norm": 0.9534609913825989, + "learning_rate": 0.00035833706162066194, + "loss": 0.6325, + "step": 28210 + }, + { + "epoch": 0.6281160968660968, + "grad_norm": 0.6802915334701538, + "learning_rate": 0.0003583085744387433, + "loss": 0.6319, + "step": 28220 + }, + { + "epoch": 0.6283386752136753, + "grad_norm": 0.7972056865692139, + "learning_rate": 0.0003582800786542072, + "loss": 0.5217, + "step": 28230 + }, + { + "epoch": 0.6285612535612536, + "grad_norm": 0.8219149708747864, + "learning_rate": 0.00035825157426860204, + "loss": 0.7513, + "step": 28240 + }, + { + "epoch": 0.6287838319088319, + "grad_norm": 0.813468337059021, + "learning_rate": 0.0003582230612834768, + "loss": 0.6351, + "step": 28250 + }, + { + "epoch": 0.6290064102564102, + "grad_norm": 0.6350398659706116, + "learning_rate": 0.0003581945397003809, + "loss": 0.558, + "step": 28260 + }, + { + "epoch": 0.6292289886039886, + "grad_norm": 0.8708319664001465, + "learning_rate": 0.0003581660095208641, + "loss": 0.5868, + "step": 28270 + }, + { + "epoch": 0.6294515669515669, + "grad_norm": 0.7231774926185608, + "learning_rate": 0.00035813747074647697, + "loss": 0.6014, + "step": 28280 + }, + { + "epoch": 0.6296741452991453, + "grad_norm": 0.9997678995132446, + "learning_rate": 0.0003581089233787702, + "loss": 0.6687, + "step": 28290 + }, + { + "epoch": 0.6298967236467237, + "grad_norm": 0.5924893617630005, + "learning_rate": 0.00035808036741929506, + "loss": 0.6482, + "step": 28300 + }, + { + "epoch": 0.630119301994302, + "grad_norm": 0.4346763789653778, + "learning_rate": 0.0003580518028696034, + "loss": 0.5784, + "step": 28310 + }, + { + "epoch": 0.6303418803418803, + "grad_norm": 1.0329116582870483, + "learning_rate": 0.00035802322973124733, + "loss": 0.5514, + "step": 28320 + }, + { + "epoch": 0.6305644586894587, + "grad_norm": 0.7910727262496948, + "learning_rate": 0.0003579946480057796, + "loss": 0.6541, + "step": 28330 + }, + { + "epoch": 0.6307870370370371, + "grad_norm": 0.7650035619735718, + "learning_rate": 0.00035796605769475336, + "loss": 0.5975, + "step": 28340 + }, + { + "epoch": 0.6310096153846154, + "grad_norm": 0.6159213185310364, + "learning_rate": 0.00035793745879972224, + "loss": 0.6394, + "step": 28350 + }, + { + "epoch": 0.6312321937321937, + "grad_norm": 0.5805545449256897, + "learning_rate": 0.0003579088513222403, + "loss": 0.6183, + "step": 28360 + }, + { + "epoch": 0.6314547720797721, + "grad_norm": 0.7439352869987488, + "learning_rate": 0.00035788023526386214, + "loss": 0.636, + "step": 28370 + }, + { + "epoch": 0.6316773504273504, + "grad_norm": 0.7128954529762268, + "learning_rate": 0.0003578516106261427, + "loss": 0.7803, + "step": 28380 + }, + { + "epoch": 0.6318999287749287, + "grad_norm": 0.7685819864273071, + "learning_rate": 0.0003578229774106376, + "loss": 0.5954, + "step": 28390 + }, + { + "epoch": 0.6321225071225072, + "grad_norm": 0.5727129578590393, + "learning_rate": 0.0003577943356189026, + "loss": 0.5705, + "step": 28400 + }, + { + "epoch": 0.6323450854700855, + "grad_norm": 0.6094703674316406, + "learning_rate": 0.0003577656852524943, + "loss": 0.6513, + "step": 28410 + }, + { + "epoch": 0.6325676638176638, + "grad_norm": 0.5944792032241821, + "learning_rate": 0.00035773702631296955, + "loss": 0.5142, + "step": 28420 + }, + { + "epoch": 0.6327902421652422, + "grad_norm": 0.4907169044017792, + "learning_rate": 0.00035770835880188554, + "loss": 0.6042, + "step": 28430 + }, + { + "epoch": 0.6330128205128205, + "grad_norm": 0.8299042582511902, + "learning_rate": 0.00035767968272080027, + "loss": 0.6913, + "step": 28440 + }, + { + "epoch": 0.6332353988603988, + "grad_norm": 0.5609634518623352, + "learning_rate": 0.00035765099807127194, + "loss": 0.6383, + "step": 28450 + }, + { + "epoch": 0.6334579772079773, + "grad_norm": 0.713447093963623, + "learning_rate": 0.00035762230485485933, + "loss": 0.5828, + "step": 28460 + }, + { + "epoch": 0.6336805555555556, + "grad_norm": 0.4866918623447418, + "learning_rate": 0.0003575936030731216, + "loss": 0.6244, + "step": 28470 + }, + { + "epoch": 0.6339031339031339, + "grad_norm": 0.4822397828102112, + "learning_rate": 0.00035756489272761855, + "loss": 0.5602, + "step": 28480 + }, + { + "epoch": 0.6341257122507122, + "grad_norm": 0.6143772602081299, + "learning_rate": 0.0003575361738199102, + "loss": 0.5137, + "step": 28490 + }, + { + "epoch": 0.6343482905982906, + "grad_norm": 0.7375980019569397, + "learning_rate": 0.0003575074463515572, + "loss": 0.6496, + "step": 28500 + }, + { + "epoch": 0.6345708689458689, + "grad_norm": 1.0098057985305786, + "learning_rate": 0.0003574787103241206, + "loss": 0.616, + "step": 28510 + }, + { + "epoch": 0.6347934472934473, + "grad_norm": 0.6686311364173889, + "learning_rate": 0.000357449965739162, + "loss": 0.5475, + "step": 28520 + }, + { + "epoch": 0.6350160256410257, + "grad_norm": 0.7210583090782166, + "learning_rate": 0.0003574212125982434, + "loss": 0.6409, + "step": 28530 + }, + { + "epoch": 0.635238603988604, + "grad_norm": 0.8931546211242676, + "learning_rate": 0.00035739245090292713, + "loss": 0.7039, + "step": 28540 + }, + { + "epoch": 0.6354611823361823, + "grad_norm": 0.4405869245529175, + "learning_rate": 0.0003573636806547763, + "loss": 0.5408, + "step": 28550 + }, + { + "epoch": 0.6356837606837606, + "grad_norm": 0.7087313532829285, + "learning_rate": 0.00035733490185535424, + "loss": 0.7057, + "step": 28560 + }, + { + "epoch": 0.6359063390313391, + "grad_norm": 0.6314172744750977, + "learning_rate": 0.00035730611450622476, + "loss": 0.5741, + "step": 28570 + }, + { + "epoch": 0.6361289173789174, + "grad_norm": 0.6478539705276489, + "learning_rate": 0.0003572773186089523, + "loss": 0.7413, + "step": 28580 + }, + { + "epoch": 0.6363514957264957, + "grad_norm": 0.47926536202430725, + "learning_rate": 0.0003572485141651016, + "loss": 0.5663, + "step": 28590 + }, + { + "epoch": 0.6365740740740741, + "grad_norm": 0.8513566851615906, + "learning_rate": 0.0003572197011762378, + "loss": 0.6291, + "step": 28600 + }, + { + "epoch": 0.6367966524216524, + "grad_norm": 0.7166478037834167, + "learning_rate": 0.00035719087964392683, + "loss": 0.7298, + "step": 28610 + }, + { + "epoch": 0.6370192307692307, + "grad_norm": 0.6767480969429016, + "learning_rate": 0.0003571620495697348, + "loss": 0.647, + "step": 28620 + }, + { + "epoch": 0.6372418091168092, + "grad_norm": 0.5176422595977783, + "learning_rate": 0.0003571332109552283, + "loss": 0.4404, + "step": 28630 + }, + { + "epoch": 0.6374643874643875, + "grad_norm": 0.7984474897384644, + "learning_rate": 0.00035710436380197445, + "loss": 0.6743, + "step": 28640 + }, + { + "epoch": 0.6376869658119658, + "grad_norm": 0.5979481339454651, + "learning_rate": 0.00035707550811154095, + "loss": 0.6531, + "step": 28650 + }, + { + "epoch": 0.6379095441595442, + "grad_norm": 0.6247133016586304, + "learning_rate": 0.00035704664388549567, + "loss": 0.7876, + "step": 28660 + }, + { + "epoch": 0.6381321225071225, + "grad_norm": 0.4504818916320801, + "learning_rate": 0.0003570177711254072, + "loss": 0.7235, + "step": 28670 + }, + { + "epoch": 0.6383547008547008, + "grad_norm": 0.6310564279556274, + "learning_rate": 0.00035698888983284454, + "loss": 0.5558, + "step": 28680 + }, + { + "epoch": 0.6385772792022792, + "grad_norm": 0.8027581572532654, + "learning_rate": 0.00035696000000937707, + "loss": 0.6422, + "step": 28690 + }, + { + "epoch": 0.6387998575498576, + "grad_norm": 0.55892413854599, + "learning_rate": 0.0003569311016565747, + "loss": 0.5426, + "step": 28700 + }, + { + "epoch": 0.6390224358974359, + "grad_norm": 0.5204585194587708, + "learning_rate": 0.0003569021947760078, + "loss": 0.4812, + "step": 28710 + }, + { + "epoch": 0.6392450142450142, + "grad_norm": 0.6362113356590271, + "learning_rate": 0.00035687327936924726, + "loss": 0.5975, + "step": 28720 + }, + { + "epoch": 0.6394675925925926, + "grad_norm": 0.45520856976509094, + "learning_rate": 0.0003568443554378642, + "loss": 0.4732, + "step": 28730 + }, + { + "epoch": 0.6396901709401709, + "grad_norm": 0.6635448336601257, + "learning_rate": 0.0003568154229834305, + "loss": 0.542, + "step": 28740 + }, + { + "epoch": 0.6399127492877493, + "grad_norm": 0.8992020487785339, + "learning_rate": 0.0003567864820075183, + "loss": 0.6056, + "step": 28750 + }, + { + "epoch": 0.6401353276353277, + "grad_norm": 0.5336700081825256, + "learning_rate": 0.00035675753251170045, + "loss": 0.6984, + "step": 28760 + }, + { + "epoch": 0.640357905982906, + "grad_norm": 0.5950776934623718, + "learning_rate": 0.00035672857449754985, + "loss": 0.7067, + "step": 28770 + }, + { + "epoch": 0.6405804843304843, + "grad_norm": 0.4215049743652344, + "learning_rate": 0.00035669960796664023, + "loss": 0.6264, + "step": 28780 + }, + { + "epoch": 0.6408030626780626, + "grad_norm": 1.0368537902832031, + "learning_rate": 0.0003566706329205456, + "loss": 0.4807, + "step": 28790 + }, + { + "epoch": 0.6410256410256411, + "grad_norm": 0.517913281917572, + "learning_rate": 0.00035664164936084053, + "loss": 0.5554, + "step": 28800 + }, + { + "epoch": 0.6412482193732194, + "grad_norm": 0.5046729445457458, + "learning_rate": 0.0003566126572891, + "loss": 0.5988, + "step": 28810 + }, + { + "epoch": 0.6414707977207977, + "grad_norm": 0.7036557197570801, + "learning_rate": 0.00035658365670689947, + "loss": 0.5184, + "step": 28820 + }, + { + "epoch": 0.6416933760683761, + "grad_norm": 0.4869050681591034, + "learning_rate": 0.0003565546476158149, + "loss": 0.6969, + "step": 28830 + }, + { + "epoch": 0.6419159544159544, + "grad_norm": 1.1376734972000122, + "learning_rate": 0.00035652563001742257, + "loss": 0.5895, + "step": 28840 + }, + { + "epoch": 0.6421385327635327, + "grad_norm": 0.6507123112678528, + "learning_rate": 0.00035649660391329934, + "loss": 0.7427, + "step": 28850 + }, + { + "epoch": 0.6423611111111112, + "grad_norm": 0.49312824010849, + "learning_rate": 0.00035646756930502257, + "loss": 0.5665, + "step": 28860 + }, + { + "epoch": 0.6425836894586895, + "grad_norm": 0.7473471760749817, + "learning_rate": 0.00035643852619417004, + "loss": 0.6141, + "step": 28870 + }, + { + "epoch": 0.6428062678062678, + "grad_norm": 0.602100133895874, + "learning_rate": 0.00035640947458231986, + "loss": 0.6248, + "step": 28880 + }, + { + "epoch": 0.6430288461538461, + "grad_norm": 0.4823180139064789, + "learning_rate": 0.0003563804144710508, + "loss": 0.5684, + "step": 28890 + }, + { + "epoch": 0.6432514245014245, + "grad_norm": 0.6477437019348145, + "learning_rate": 0.00035635134586194204, + "loss": 0.6773, + "step": 28900 + }, + { + "epoch": 0.6434740028490028, + "grad_norm": 0.6315403580665588, + "learning_rate": 0.00035632226875657316, + "loss": 0.6367, + "step": 28910 + }, + { + "epoch": 0.6436965811965812, + "grad_norm": 0.7338327169418335, + "learning_rate": 0.00035629318315652417, + "loss": 0.5882, + "step": 28920 + }, + { + "epoch": 0.6439191595441596, + "grad_norm": 0.4510704278945923, + "learning_rate": 0.0003562640890633756, + "loss": 0.6198, + "step": 28930 + }, + { + "epoch": 0.6441417378917379, + "grad_norm": 0.6776688694953918, + "learning_rate": 0.00035623498647870865, + "loss": 0.6024, + "step": 28940 + }, + { + "epoch": 0.6443643162393162, + "grad_norm": 0.4859476685523987, + "learning_rate": 0.0003562058754041045, + "loss": 0.5885, + "step": 28950 + }, + { + "epoch": 0.6445868945868946, + "grad_norm": 0.8991909027099609, + "learning_rate": 0.0003561767558411453, + "loss": 0.6528, + "step": 28960 + }, + { + "epoch": 0.6448094729344729, + "grad_norm": 0.6131160855293274, + "learning_rate": 0.00035614762779141333, + "loss": 0.7233, + "step": 28970 + }, + { + "epoch": 0.6450320512820513, + "grad_norm": 0.723871648311615, + "learning_rate": 0.0003561184912564914, + "loss": 0.7137, + "step": 28980 + }, + { + "epoch": 0.6452546296296297, + "grad_norm": 0.6209283471107483, + "learning_rate": 0.0003560893462379629, + "loss": 0.6371, + "step": 28990 + }, + { + "epoch": 0.645477207977208, + "grad_norm": 0.7736254334449768, + "learning_rate": 0.0003560601927374115, + "loss": 0.711, + "step": 29000 + }, + { + "epoch": 0.6456997863247863, + "grad_norm": 0.7672634720802307, + "learning_rate": 0.0003560310307564215, + "loss": 0.7021, + "step": 29010 + }, + { + "epoch": 0.6459223646723646, + "grad_norm": 0.7883049249649048, + "learning_rate": 0.0003560018602965775, + "loss": 0.6126, + "step": 29020 + }, + { + "epoch": 0.6461449430199431, + "grad_norm": 0.7914969325065613, + "learning_rate": 0.00035597268135946475, + "loss": 0.6034, + "step": 29030 + }, + { + "epoch": 0.6463675213675214, + "grad_norm": 0.5914848446846008, + "learning_rate": 0.00035594349394666884, + "loss": 0.5519, + "step": 29040 + }, + { + "epoch": 0.6465900997150997, + "grad_norm": 0.647603452205658, + "learning_rate": 0.00035591429805977573, + "loss": 0.7662, + "step": 29050 + }, + { + "epoch": 0.6468126780626781, + "grad_norm": 0.5746428370475769, + "learning_rate": 0.00035588509370037207, + "loss": 0.6192, + "step": 29060 + }, + { + "epoch": 0.6470352564102564, + "grad_norm": 0.7971509099006653, + "learning_rate": 0.0003558558808700448, + "loss": 0.727, + "step": 29070 + }, + { + "epoch": 0.6472578347578347, + "grad_norm": 0.8022960424423218, + "learning_rate": 0.0003558266595703814, + "loss": 0.5038, + "step": 29080 + }, + { + "epoch": 0.6474804131054132, + "grad_norm": 0.7282474040985107, + "learning_rate": 0.00035579742980296967, + "loss": 0.5816, + "step": 29090 + }, + { + "epoch": 0.6477029914529915, + "grad_norm": 0.667869508266449, + "learning_rate": 0.00035576819156939816, + "loss": 0.683, + "step": 29100 + }, + { + "epoch": 0.6479255698005698, + "grad_norm": 0.6066399812698364, + "learning_rate": 0.00035573894487125554, + "loss": 0.569, + "step": 29110 + }, + { + "epoch": 0.6481481481481481, + "grad_norm": 0.776150643825531, + "learning_rate": 0.0003557096897101312, + "loss": 0.6937, + "step": 29120 + }, + { + "epoch": 0.6483707264957265, + "grad_norm": 0.5233556628227234, + "learning_rate": 0.0003556804260876148, + "loss": 0.6897, + "step": 29130 + }, + { + "epoch": 0.6485933048433048, + "grad_norm": 1.1723662614822388, + "learning_rate": 0.00035565115400529665, + "loss": 0.802, + "step": 29140 + }, + { + "epoch": 0.6488158831908832, + "grad_norm": 1.0054035186767578, + "learning_rate": 0.00035562187346476734, + "loss": 0.5343, + "step": 29150 + }, + { + "epoch": 0.6490384615384616, + "grad_norm": 1.1244323253631592, + "learning_rate": 0.00035559258446761803, + "loss": 0.571, + "step": 29160 + }, + { + "epoch": 0.6492610398860399, + "grad_norm": 0.7842982411384583, + "learning_rate": 0.0003555632870154403, + "loss": 0.5818, + "step": 29170 + }, + { + "epoch": 0.6494836182336182, + "grad_norm": 0.5447773337364197, + "learning_rate": 0.00035553398110982625, + "loss": 0.6002, + "step": 29180 + }, + { + "epoch": 0.6497061965811965, + "grad_norm": 0.47989046573638916, + "learning_rate": 0.00035550466675236835, + "loss": 0.5581, + "step": 29190 + }, + { + "epoch": 0.6499287749287749, + "grad_norm": 0.9793609976768494, + "learning_rate": 0.0003554753439446595, + "loss": 0.6088, + "step": 29200 + }, + { + "epoch": 0.6501513532763533, + "grad_norm": 0.7243526577949524, + "learning_rate": 0.0003554460126882932, + "loss": 0.7478, + "step": 29210 + }, + { + "epoch": 0.6503739316239316, + "grad_norm": 0.43227654695510864, + "learning_rate": 0.00035541667298486326, + "loss": 0.6304, + "step": 29220 + }, + { + "epoch": 0.65059650997151, + "grad_norm": 0.8305972218513489, + "learning_rate": 0.00035538732483596415, + "loss": 0.5699, + "step": 29230 + }, + { + "epoch": 0.6508190883190883, + "grad_norm": 0.6374602317810059, + "learning_rate": 0.00035535796824319064, + "loss": 0.7133, + "step": 29240 + }, + { + "epoch": 0.6510416666666666, + "grad_norm": 0.3493801951408386, + "learning_rate": 0.00035532860320813787, + "loss": 0.6185, + "step": 29250 + }, + { + "epoch": 0.6512642450142451, + "grad_norm": 0.5009222626686096, + "learning_rate": 0.00035529922973240167, + "loss": 0.6071, + "step": 29260 + }, + { + "epoch": 0.6514868233618234, + "grad_norm": 0.9517810344696045, + "learning_rate": 0.0003552698478175782, + "loss": 0.6672, + "step": 29270 + }, + { + "epoch": 0.6517094017094017, + "grad_norm": 0.8424570560455322, + "learning_rate": 0.0003552404574652641, + "loss": 0.6607, + "step": 29280 + }, + { + "epoch": 0.65193198005698, + "grad_norm": 0.7108397483825684, + "learning_rate": 0.00035521105867705646, + "loss": 0.4565, + "step": 29290 + }, + { + "epoch": 0.6521545584045584, + "grad_norm": 0.7616039514541626, + "learning_rate": 0.0003551816514545528, + "loss": 0.6568, + "step": 29300 + }, + { + "epoch": 0.6523771367521367, + "grad_norm": 0.6160005331039429, + "learning_rate": 0.0003551522357993512, + "loss": 0.4912, + "step": 29310 + }, + { + "epoch": 0.6525997150997151, + "grad_norm": 0.7282902002334595, + "learning_rate": 0.00035512281171305, + "loss": 0.5883, + "step": 29320 + }, + { + "epoch": 0.6528222934472935, + "grad_norm": 1.0402792692184448, + "learning_rate": 0.0003550933791972483, + "loss": 0.5807, + "step": 29330 + }, + { + "epoch": 0.6530448717948718, + "grad_norm": 0.607012152671814, + "learning_rate": 0.00035506393825354547, + "loss": 0.5477, + "step": 29340 + }, + { + "epoch": 0.6532674501424501, + "grad_norm": 0.7881841063499451, + "learning_rate": 0.0003550344888835412, + "loss": 0.6495, + "step": 29350 + }, + { + "epoch": 0.6534900284900285, + "grad_norm": 0.6767070889472961, + "learning_rate": 0.0003550050310888359, + "loss": 0.5842, + "step": 29360 + }, + { + "epoch": 0.6537126068376068, + "grad_norm": 0.9461158514022827, + "learning_rate": 0.00035497556487103037, + "loss": 0.6715, + "step": 29370 + }, + { + "epoch": 0.6539351851851852, + "grad_norm": 0.7762941718101501, + "learning_rate": 0.0003549460902317257, + "loss": 0.6558, + "step": 29380 + }, + { + "epoch": 0.6541577635327636, + "grad_norm": 0.6619521379470825, + "learning_rate": 0.0003549166071725237, + "loss": 0.68, + "step": 29390 + }, + { + "epoch": 0.6543803418803419, + "grad_norm": 0.8690246343612671, + "learning_rate": 0.0003548871156950264, + "loss": 0.6592, + "step": 29400 + }, + { + "epoch": 0.6546029202279202, + "grad_norm": 0.9152123332023621, + "learning_rate": 0.00035485761580083646, + "loss": 0.6177, + "step": 29410 + }, + { + "epoch": 0.6548254985754985, + "grad_norm": 0.32496699690818787, + "learning_rate": 0.0003548281074915569, + "loss": 0.607, + "step": 29420 + }, + { + "epoch": 0.6550480769230769, + "grad_norm": 0.5853595733642578, + "learning_rate": 0.00035479859076879123, + "loss": 0.5845, + "step": 29430 + }, + { + "epoch": 0.6552706552706553, + "grad_norm": 0.792079508304596, + "learning_rate": 0.00035476906563414347, + "loss": 0.6061, + "step": 29440 + }, + { + "epoch": 0.6554932336182336, + "grad_norm": 0.6753571629524231, + "learning_rate": 0.00035473953208921787, + "loss": 0.6051, + "step": 29450 + }, + { + "epoch": 0.655715811965812, + "grad_norm": 0.6931265592575073, + "learning_rate": 0.00035470999013561947, + "loss": 0.6763, + "step": 29460 + }, + { + "epoch": 0.6559383903133903, + "grad_norm": 0.7315155267715454, + "learning_rate": 0.0003546804397749536, + "loss": 0.6601, + "step": 29470 + }, + { + "epoch": 0.6561609686609686, + "grad_norm": 0.826026201248169, + "learning_rate": 0.000354650881008826, + "loss": 0.6453, + "step": 29480 + }, + { + "epoch": 0.6563835470085471, + "grad_norm": 0.8723484873771667, + "learning_rate": 0.0003546213138388429, + "loss": 0.63, + "step": 29490 + }, + { + "epoch": 0.6566061253561254, + "grad_norm": 0.9157624244689941, + "learning_rate": 0.000354591738266611, + "loss": 0.523, + "step": 29500 + }, + { + "epoch": 0.6568287037037037, + "grad_norm": 0.7259867191314697, + "learning_rate": 0.0003545621542937375, + "loss": 0.6265, + "step": 29510 + }, + { + "epoch": 0.657051282051282, + "grad_norm": 0.6993746161460876, + "learning_rate": 0.00035453256192183, + "loss": 0.4993, + "step": 29520 + }, + { + "epoch": 0.6572738603988604, + "grad_norm": 0.7524179816246033, + "learning_rate": 0.00035450296115249665, + "loss": 0.5703, + "step": 29530 + }, + { + "epoch": 0.6574964387464387, + "grad_norm": 0.7260290384292603, + "learning_rate": 0.0003544733519873458, + "loss": 0.6222, + "step": 29540 + }, + { + "epoch": 0.6577190170940171, + "grad_norm": 0.4377947449684143, + "learning_rate": 0.00035444373442798666, + "loss": 0.5503, + "step": 29550 + }, + { + "epoch": 0.6579415954415955, + "grad_norm": 0.5709770321846008, + "learning_rate": 0.00035441410847602845, + "loss": 0.6104, + "step": 29560 + }, + { + "epoch": 0.6581641737891738, + "grad_norm": 0.6281164884567261, + "learning_rate": 0.0003543844741330812, + "loss": 0.6844, + "step": 29570 + }, + { + "epoch": 0.6583867521367521, + "grad_norm": 0.7790220379829407, + "learning_rate": 0.0003543548314007553, + "loss": 0.5133, + "step": 29580 + }, + { + "epoch": 0.6586093304843305, + "grad_norm": 0.5882272124290466, + "learning_rate": 0.00035432518028066145, + "loss": 0.5498, + "step": 29590 + }, + { + "epoch": 0.6588319088319088, + "grad_norm": 0.733991265296936, + "learning_rate": 0.00035429552077441103, + "loss": 0.664, + "step": 29600 + }, + { + "epoch": 0.6590544871794872, + "grad_norm": 0.7435101270675659, + "learning_rate": 0.0003542658528836156, + "loss": 0.782, + "step": 29610 + }, + { + "epoch": 0.6592770655270656, + "grad_norm": 0.5844705104827881, + "learning_rate": 0.0003542361766098875, + "loss": 0.6496, + "step": 29620 + }, + { + "epoch": 0.6594996438746439, + "grad_norm": 0.895987868309021, + "learning_rate": 0.0003542064919548393, + "loss": 0.6748, + "step": 29630 + }, + { + "epoch": 0.6597222222222222, + "grad_norm": 0.5698703527450562, + "learning_rate": 0.00035417679892008405, + "loss": 0.5785, + "step": 29640 + }, + { + "epoch": 0.6599448005698005, + "grad_norm": 0.761055588722229, + "learning_rate": 0.0003541470975072354, + "loss": 0.6263, + "step": 29650 + }, + { + "epoch": 0.6600783475783476, + "eval_loss": 0.6208131909370422, + "eval_runtime": 337.1211, + "eval_samples_per_second": 7.015, + "eval_steps_per_second": 7.015, + "step": 29656 + }, + { + "epoch": 0.6601673789173789, + "grad_norm": 0.6658109426498413, + "learning_rate": 0.0003541173877179072, + "loss": 0.7333, + "step": 29660 + }, + { + "epoch": 0.6603899572649573, + "grad_norm": 0.6604793667793274, + "learning_rate": 0.000354087669553714, + "loss": 0.4947, + "step": 29670 + }, + { + "epoch": 0.6606125356125356, + "grad_norm": 0.742704451084137, + "learning_rate": 0.00035405794301627077, + "loss": 0.5558, + "step": 29680 + }, + { + "epoch": 0.660835113960114, + "grad_norm": 0.7001523375511169, + "learning_rate": 0.0003540282081071927, + "loss": 0.7059, + "step": 29690 + }, + { + "epoch": 0.6610576923076923, + "grad_norm": 1.10336172580719, + "learning_rate": 0.0003539984648280958, + "loss": 0.7525, + "step": 29700 + }, + { + "epoch": 0.6612802706552706, + "grad_norm": 0.7263476252555847, + "learning_rate": 0.00035396871318059615, + "loss": 0.6185, + "step": 29710 + }, + { + "epoch": 0.6615028490028491, + "grad_norm": 0.9817376732826233, + "learning_rate": 0.0003539389531663107, + "loss": 0.6282, + "step": 29720 + }, + { + "epoch": 0.6617254273504274, + "grad_norm": 0.5954140424728394, + "learning_rate": 0.0003539091847868564, + "loss": 0.5171, + "step": 29730 + }, + { + "epoch": 0.6619480056980057, + "grad_norm": 0.6389563679695129, + "learning_rate": 0.00035387940804385107, + "loss": 0.6703, + "step": 29740 + }, + { + "epoch": 0.662170584045584, + "grad_norm": 0.5653455853462219, + "learning_rate": 0.0003538496229389127, + "loss": 0.5959, + "step": 29750 + }, + { + "epoch": 0.6623931623931624, + "grad_norm": 0.5209896564483643, + "learning_rate": 0.0003538198294736599, + "loss": 0.5791, + "step": 29760 + }, + { + "epoch": 0.6626157407407407, + "grad_norm": 0.8491448760032654, + "learning_rate": 0.00035379002764971166, + "loss": 0.5636, + "step": 29770 + }, + { + "epoch": 0.6628383190883191, + "grad_norm": 0.48439210653305054, + "learning_rate": 0.0003537602174686874, + "loss": 0.5535, + "step": 29780 + }, + { + "epoch": 0.6630608974358975, + "grad_norm": 0.5228064060211182, + "learning_rate": 0.00035373039893220706, + "loss": 0.5412, + "step": 29790 + }, + { + "epoch": 0.6632834757834758, + "grad_norm": 0.5903054475784302, + "learning_rate": 0.00035370057204189094, + "loss": 0.5664, + "step": 29800 + }, + { + "epoch": 0.6635060541310541, + "grad_norm": 0.7371571660041809, + "learning_rate": 0.0003536707367993599, + "loss": 0.7484, + "step": 29810 + }, + { + "epoch": 0.6637286324786325, + "grad_norm": 0.7421772480010986, + "learning_rate": 0.0003536408932062353, + "loss": 0.6104, + "step": 29820 + }, + { + "epoch": 0.6639512108262108, + "grad_norm": 0.7558051943778992, + "learning_rate": 0.0003536110412641388, + "loss": 0.491, + "step": 29830 + }, + { + "epoch": 0.6641737891737892, + "grad_norm": 0.4967116117477417, + "learning_rate": 0.0003535811809746925, + "loss": 0.5694, + "step": 29840 + }, + { + "epoch": 0.6643963675213675, + "grad_norm": 0.6911063194274902, + "learning_rate": 0.0003535513123395191, + "loss": 0.5885, + "step": 29850 + }, + { + "epoch": 0.6646189458689459, + "grad_norm": 0.6311195492744446, + "learning_rate": 0.00035352143536024165, + "loss": 0.5241, + "step": 29860 + }, + { + "epoch": 0.6648415242165242, + "grad_norm": 0.5731965899467468, + "learning_rate": 0.00035349155003848383, + "loss": 0.679, + "step": 29870 + }, + { + "epoch": 0.6650641025641025, + "grad_norm": 0.8735531568527222, + "learning_rate": 0.00035346165637586946, + "loss": 0.7353, + "step": 29880 + }, + { + "epoch": 0.6652866809116809, + "grad_norm": 0.663013756275177, + "learning_rate": 0.00035343175437402307, + "loss": 0.5838, + "step": 29890 + }, + { + "epoch": 0.6655092592592593, + "grad_norm": 0.6095181107521057, + "learning_rate": 0.0003534018440345696, + "loss": 0.5743, + "step": 29900 + }, + { + "epoch": 0.6657318376068376, + "grad_norm": 0.7778880596160889, + "learning_rate": 0.00035337192535913426, + "loss": 0.563, + "step": 29910 + }, + { + "epoch": 0.665954415954416, + "grad_norm": 0.7310755252838135, + "learning_rate": 0.00035334199834934294, + "loss": 0.6542, + "step": 29920 + }, + { + "epoch": 0.6661769943019943, + "grad_norm": 1.2421523332595825, + "learning_rate": 0.0003533120630068219, + "loss": 0.6713, + "step": 29930 + }, + { + "epoch": 0.6663995726495726, + "grad_norm": 0.7105190753936768, + "learning_rate": 0.0003532821193331979, + "loss": 0.6277, + "step": 29940 + }, + { + "epoch": 0.6666221509971509, + "grad_norm": 0.7447794675827026, + "learning_rate": 0.000353252167330098, + "loss": 0.4767, + "step": 29950 + }, + { + "epoch": 0.6668447293447294, + "grad_norm": 0.8217668533325195, + "learning_rate": 0.0003532222069991499, + "loss": 0.6367, + "step": 29960 + }, + { + "epoch": 0.6670673076923077, + "grad_norm": 0.7856162190437317, + "learning_rate": 0.0003531922383419816, + "loss": 0.7551, + "step": 29970 + }, + { + "epoch": 0.667289886039886, + "grad_norm": 0.7897430062294006, + "learning_rate": 0.00035316226136022173, + "loss": 0.5759, + "step": 29980 + }, + { + "epoch": 0.6675124643874644, + "grad_norm": 0.9113932251930237, + "learning_rate": 0.00035313227605549913, + "loss": 0.7267, + "step": 29990 + }, + { + "epoch": 0.6677350427350427, + "grad_norm": 0.7457767724990845, + "learning_rate": 0.0003531022824294433, + "loss": 0.6099, + "step": 30000 + }, + { + "epoch": 0.6679576210826211, + "grad_norm": 0.49353641271591187, + "learning_rate": 0.0003530722804836842, + "loss": 0.5984, + "step": 30010 + }, + { + "epoch": 0.6681801994301995, + "grad_norm": 0.5130963325500488, + "learning_rate": 0.00035304227021985195, + "loss": 0.5404, + "step": 30020 + }, + { + "epoch": 0.6684027777777778, + "grad_norm": 0.7923216819763184, + "learning_rate": 0.0003530122516395775, + "loss": 0.5814, + "step": 30030 + }, + { + "epoch": 0.6686253561253561, + "grad_norm": 0.7568293809890747, + "learning_rate": 0.000352982224744492, + "loss": 0.5782, + "step": 30040 + }, + { + "epoch": 0.6688479344729344, + "grad_norm": 0.6948940753936768, + "learning_rate": 0.00035295218953622717, + "loss": 0.5985, + "step": 30050 + }, + { + "epoch": 0.6690705128205128, + "grad_norm": 0.6068517565727234, + "learning_rate": 0.0003529221460164152, + "loss": 0.5879, + "step": 30060 + }, + { + "epoch": 0.6692930911680912, + "grad_norm": 0.7442594170570374, + "learning_rate": 0.0003528920941866886, + "loss": 0.6432, + "step": 30070 + }, + { + "epoch": 0.6695156695156695, + "grad_norm": 0.5323207378387451, + "learning_rate": 0.00035286203404868044, + "loss": 0.4737, + "step": 30080 + }, + { + "epoch": 0.6697382478632479, + "grad_norm": 0.9668570160865784, + "learning_rate": 0.00035283196560402416, + "loss": 0.7099, + "step": 30090 + }, + { + "epoch": 0.6699608262108262, + "grad_norm": 0.5750164985656738, + "learning_rate": 0.00035280188885435386, + "loss": 0.6165, + "step": 30100 + }, + { + "epoch": 0.6701834045584045, + "grad_norm": 0.46717461943626404, + "learning_rate": 0.00035277180380130377, + "loss": 0.4929, + "step": 30110 + }, + { + "epoch": 0.6704059829059829, + "grad_norm": 0.7870973348617554, + "learning_rate": 0.0003527417104465088, + "loss": 0.6755, + "step": 30120 + }, + { + "epoch": 0.6706285612535613, + "grad_norm": 0.6971989274024963, + "learning_rate": 0.0003527116087916042, + "loss": 0.6212, + "step": 30130 + }, + { + "epoch": 0.6708511396011396, + "grad_norm": 0.689439594745636, + "learning_rate": 0.0003526814988382258, + "loss": 0.5999, + "step": 30140 + }, + { + "epoch": 0.671073717948718, + "grad_norm": 0.48943307995796204, + "learning_rate": 0.0003526513805880098, + "loss": 0.5033, + "step": 30150 + }, + { + "epoch": 0.6712962962962963, + "grad_norm": 0.8167397379875183, + "learning_rate": 0.0003526212540425928, + "loss": 0.6477, + "step": 30160 + }, + { + "epoch": 0.6715188746438746, + "grad_norm": 0.8747900128364563, + "learning_rate": 0.00035259111920361185, + "loss": 0.6257, + "step": 30170 + }, + { + "epoch": 0.6717414529914529, + "grad_norm": 0.701403796672821, + "learning_rate": 0.0003525609760727046, + "loss": 0.4335, + "step": 30180 + }, + { + "epoch": 0.6719640313390314, + "grad_norm": 0.6762571930885315, + "learning_rate": 0.00035253082465150907, + "loss": 0.5561, + "step": 30190 + }, + { + "epoch": 0.6721866096866097, + "grad_norm": 0.8309723734855652, + "learning_rate": 0.00035250066494166364, + "loss": 0.678, + "step": 30200 + }, + { + "epoch": 0.672409188034188, + "grad_norm": 0.7671947479248047, + "learning_rate": 0.0003524704969448072, + "loss": 0.6803, + "step": 30210 + }, + { + "epoch": 0.6726317663817664, + "grad_norm": 0.7105935215950012, + "learning_rate": 0.00035244032066257915, + "loss": 0.6202, + "step": 30220 + }, + { + "epoch": 0.6728543447293447, + "grad_norm": 0.6643106937408447, + "learning_rate": 0.0003524101360966193, + "loss": 0.6799, + "step": 30230 + }, + { + "epoch": 0.6730769230769231, + "grad_norm": 0.6214847564697266, + "learning_rate": 0.00035237994324856784, + "loss": 0.7154, + "step": 30240 + }, + { + "epoch": 0.6732995014245015, + "grad_norm": 0.8612035512924194, + "learning_rate": 0.00035234974212006555, + "loss": 0.6859, + "step": 30250 + }, + { + "epoch": 0.6735220797720798, + "grad_norm": 0.6152910590171814, + "learning_rate": 0.00035231953271275355, + "loss": 0.6501, + "step": 30260 + }, + { + "epoch": 0.6737446581196581, + "grad_norm": 0.600477933883667, + "learning_rate": 0.0003522893150282735, + "loss": 0.446, + "step": 30270 + }, + { + "epoch": 0.6739672364672364, + "grad_norm": 0.7982945442199707, + "learning_rate": 0.0003522590890682673, + "loss": 0.681, + "step": 30280 + }, + { + "epoch": 0.6741898148148148, + "grad_norm": 0.7654717564582825, + "learning_rate": 0.00035222885483437766, + "loss": 0.6666, + "step": 30290 + }, + { + "epoch": 0.6744123931623932, + "grad_norm": 1.2194135189056396, + "learning_rate": 0.0003521986123282473, + "loss": 0.7611, + "step": 30300 + }, + { + "epoch": 0.6746349715099715, + "grad_norm": 0.6501625776290894, + "learning_rate": 0.0003521683615515198, + "loss": 0.566, + "step": 30310 + }, + { + "epoch": 0.6748575498575499, + "grad_norm": 0.714274525642395, + "learning_rate": 0.00035213810250583904, + "loss": 0.6326, + "step": 30320 + }, + { + "epoch": 0.6750801282051282, + "grad_norm": 0.7483553886413574, + "learning_rate": 0.0003521078351928492, + "loss": 0.7211, + "step": 30330 + }, + { + "epoch": 0.6753027065527065, + "grad_norm": 1.305942177772522, + "learning_rate": 0.00035207755961419506, + "loss": 0.7585, + "step": 30340 + }, + { + "epoch": 0.6755252849002849, + "grad_norm": 0.820911169052124, + "learning_rate": 0.00035204727577152186, + "loss": 0.6037, + "step": 30350 + }, + { + "epoch": 0.6757478632478633, + "grad_norm": 1.0651403665542603, + "learning_rate": 0.0003520169836664752, + "loss": 0.5951, + "step": 30360 + }, + { + "epoch": 0.6759704415954416, + "grad_norm": 0.9064886569976807, + "learning_rate": 0.0003519866833007012, + "loss": 0.6801, + "step": 30370 + }, + { + "epoch": 0.67619301994302, + "grad_norm": 0.5420659184455872, + "learning_rate": 0.0003519563746758464, + "loss": 0.6233, + "step": 30380 + }, + { + "epoch": 0.6764155982905983, + "grad_norm": 0.6689449548721313, + "learning_rate": 0.0003519260577935578, + "loss": 0.6992, + "step": 30390 + }, + { + "epoch": 0.6766381766381766, + "grad_norm": 0.48055005073547363, + "learning_rate": 0.0003518957326554829, + "loss": 0.5653, + "step": 30400 + }, + { + "epoch": 0.6768607549857549, + "grad_norm": 0.702401340007782, + "learning_rate": 0.0003518653992632695, + "loss": 0.5689, + "step": 30410 + }, + { + "epoch": 0.6770833333333334, + "grad_norm": 0.7911334037780762, + "learning_rate": 0.000351835057618566, + "loss": 0.6179, + "step": 30420 + }, + { + "epoch": 0.6773059116809117, + "grad_norm": 0.9225923418998718, + "learning_rate": 0.0003518047077230211, + "loss": 0.6146, + "step": 30430 + }, + { + "epoch": 0.67752849002849, + "grad_norm": 0.5468271374702454, + "learning_rate": 0.0003517743495782842, + "loss": 0.6134, + "step": 30440 + }, + { + "epoch": 0.6777510683760684, + "grad_norm": 0.7671268582344055, + "learning_rate": 0.0003517439831860048, + "loss": 0.5387, + "step": 30450 + }, + { + "epoch": 0.6779736467236467, + "grad_norm": 0.6207623481750488, + "learning_rate": 0.00035171360854783324, + "loss": 0.5735, + "step": 30460 + }, + { + "epoch": 0.6781962250712251, + "grad_norm": 0.5850103497505188, + "learning_rate": 0.00035168322566541993, + "loss": 0.5557, + "step": 30470 + }, + { + "epoch": 0.6784188034188035, + "grad_norm": 0.5051694512367249, + "learning_rate": 0.000351652834540416, + "loss": 0.6332, + "step": 30480 + }, + { + "epoch": 0.6786413817663818, + "grad_norm": 0.7209334373474121, + "learning_rate": 0.0003516224351744729, + "loss": 0.6913, + "step": 30490 + }, + { + "epoch": 0.6788639601139601, + "grad_norm": 0.8140676617622375, + "learning_rate": 0.0003515920275692425, + "loss": 0.6377, + "step": 30500 + }, + { + "epoch": 0.6790865384615384, + "grad_norm": 0.9659133553504944, + "learning_rate": 0.0003515616117263772, + "loss": 0.6876, + "step": 30510 + }, + { + "epoch": 0.6793091168091168, + "grad_norm": 0.875493586063385, + "learning_rate": 0.00035153118764752987, + "loss": 0.5625, + "step": 30520 + }, + { + "epoch": 0.6795316951566952, + "grad_norm": 0.4299869239330292, + "learning_rate": 0.0003515007553343538, + "loss": 0.4195, + "step": 30530 + }, + { + "epoch": 0.6797542735042735, + "grad_norm": 0.8948302268981934, + "learning_rate": 0.0003514703147885026, + "loss": 0.6594, + "step": 30540 + }, + { + "epoch": 0.6799768518518519, + "grad_norm": 0.5883477330207825, + "learning_rate": 0.00035143986601163057, + "loss": 0.6186, + "step": 30550 + }, + { + "epoch": 0.6801994301994302, + "grad_norm": 0.8271781206130981, + "learning_rate": 0.00035140940900539217, + "loss": 0.56, + "step": 30560 + }, + { + "epoch": 0.6804220085470085, + "grad_norm": 0.8564368486404419, + "learning_rate": 0.00035137894377144257, + "loss": 0.6942, + "step": 30570 + }, + { + "epoch": 0.6806445868945868, + "grad_norm": 1.0115987062454224, + "learning_rate": 0.0003513484703114372, + "loss": 0.681, + "step": 30580 + }, + { + "epoch": 0.6808671652421653, + "grad_norm": 0.6871652603149414, + "learning_rate": 0.00035131798862703215, + "loss": 0.5671, + "step": 30590 + }, + { + "epoch": 0.6810897435897436, + "grad_norm": 0.7678980827331543, + "learning_rate": 0.0003512874987198837, + "loss": 0.8507, + "step": 30600 + }, + { + "epoch": 0.6813123219373219, + "grad_norm": 0.8469058275222778, + "learning_rate": 0.00035125700059164864, + "loss": 0.6523, + "step": 30610 + }, + { + "epoch": 0.6815349002849003, + "grad_norm": 0.5048143863677979, + "learning_rate": 0.0003512264942439844, + "loss": 0.6415, + "step": 30620 + }, + { + "epoch": 0.6817574786324786, + "grad_norm": 0.6894258260726929, + "learning_rate": 0.0003511959796785486, + "loss": 0.6732, + "step": 30630 + }, + { + "epoch": 0.6819800569800569, + "grad_norm": 0.5911911129951477, + "learning_rate": 0.0003511654568969996, + "loss": 0.6205, + "step": 30640 + }, + { + "epoch": 0.6822026353276354, + "grad_norm": 0.519760012626648, + "learning_rate": 0.0003511349259009958, + "loss": 0.6151, + "step": 30650 + }, + { + "epoch": 0.6824252136752137, + "grad_norm": 0.4933334290981293, + "learning_rate": 0.00035110438669219647, + "loss": 0.5582, + "step": 30660 + }, + { + "epoch": 0.682647792022792, + "grad_norm": 0.7749009728431702, + "learning_rate": 0.0003510738392722611, + "loss": 0.6336, + "step": 30670 + }, + { + "epoch": 0.6828703703703703, + "grad_norm": 0.9254588484764099, + "learning_rate": 0.00035104328364284954, + "loss": 0.7357, + "step": 30680 + }, + { + "epoch": 0.6830929487179487, + "grad_norm": 0.8186060786247253, + "learning_rate": 0.0003510127198056224, + "loss": 0.635, + "step": 30690 + }, + { + "epoch": 0.6833155270655271, + "grad_norm": 1.0061089992523193, + "learning_rate": 0.0003509821477622404, + "loss": 0.5805, + "step": 30700 + }, + { + "epoch": 0.6835381054131054, + "grad_norm": 0.4683438241481781, + "learning_rate": 0.00035095156751436483, + "loss": 0.6817, + "step": 30710 + }, + { + "epoch": 0.6837606837606838, + "grad_norm": 0.7145074605941772, + "learning_rate": 0.0003509209790636576, + "loss": 0.5704, + "step": 30720 + }, + { + "epoch": 0.6839832621082621, + "grad_norm": 0.7668792605400085, + "learning_rate": 0.0003508903824117807, + "loss": 0.6027, + "step": 30730 + }, + { + "epoch": 0.6842058404558404, + "grad_norm": 0.5148479342460632, + "learning_rate": 0.00035085977756039695, + "loss": 0.7058, + "step": 30740 + }, + { + "epoch": 0.6844284188034188, + "grad_norm": 0.7288443446159363, + "learning_rate": 0.0003508291645111695, + "loss": 0.6624, + "step": 30750 + }, + { + "epoch": 0.6846509971509972, + "grad_norm": 2.099850654602051, + "learning_rate": 0.0003507985432657616, + "loss": 0.7333, + "step": 30760 + }, + { + "epoch": 0.6848735754985755, + "grad_norm": 0.8458130359649658, + "learning_rate": 0.0003507679138258375, + "loss": 0.5342, + "step": 30770 + }, + { + "epoch": 0.6850961538461539, + "grad_norm": 0.6164962649345398, + "learning_rate": 0.0003507372761930616, + "loss": 0.7124, + "step": 30780 + }, + { + "epoch": 0.6853187321937322, + "grad_norm": 0.706070601940155, + "learning_rate": 0.0003507066303690986, + "loss": 0.5651, + "step": 30790 + }, + { + "epoch": 0.6855413105413105, + "grad_norm": 0.6817519664764404, + "learning_rate": 0.000350675976355614, + "loss": 0.6462, + "step": 30800 + }, + { + "epoch": 0.6857638888888888, + "grad_norm": 0.8440865278244019, + "learning_rate": 0.00035064531415427347, + "loss": 0.4929, + "step": 30810 + }, + { + "epoch": 0.6859864672364673, + "grad_norm": 0.5422286987304688, + "learning_rate": 0.00035061464376674327, + "loss": 0.563, + "step": 30820 + }, + { + "epoch": 0.6862090455840456, + "grad_norm": 0.7093590497970581, + "learning_rate": 0.0003505839651946899, + "loss": 0.6696, + "step": 30830 + }, + { + "epoch": 0.6864316239316239, + "grad_norm": 0.6214165091514587, + "learning_rate": 0.00035055327843978076, + "loss": 0.7322, + "step": 30840 + }, + { + "epoch": 0.6866542022792023, + "grad_norm": 0.5425866842269897, + "learning_rate": 0.0003505225835036831, + "loss": 0.5924, + "step": 30850 + }, + { + "epoch": 0.6868767806267806, + "grad_norm": 0.6415085196495056, + "learning_rate": 0.0003504918803880651, + "loss": 0.5754, + "step": 30860 + }, + { + "epoch": 0.6870993589743589, + "grad_norm": 0.7075197696685791, + "learning_rate": 0.0003504611690945951, + "loss": 0.7709, + "step": 30870 + }, + { + "epoch": 0.6873219373219374, + "grad_norm": 0.6782618165016174, + "learning_rate": 0.00035043044962494203, + "loss": 0.6097, + "step": 30880 + }, + { + "epoch": 0.6875445156695157, + "grad_norm": 0.7449636459350586, + "learning_rate": 0.0003503997219807751, + "loss": 0.7023, + "step": 30890 + }, + { + "epoch": 0.687767094017094, + "grad_norm": 0.5584663152694702, + "learning_rate": 0.00035036898616376425, + "loss": 0.6441, + "step": 30900 + }, + { + "epoch": 0.6879896723646723, + "grad_norm": 0.7006946802139282, + "learning_rate": 0.0003503382421755795, + "loss": 0.6955, + "step": 30910 + }, + { + "epoch": 0.6882122507122507, + "grad_norm": 0.7925690412521362, + "learning_rate": 0.0003503074900178917, + "loss": 0.5764, + "step": 30920 + }, + { + "epoch": 0.6884348290598291, + "grad_norm": 0.520116925239563, + "learning_rate": 0.0003502767296923718, + "loss": 0.4167, + "step": 30930 + }, + { + "epoch": 0.6886574074074074, + "grad_norm": 0.6459872126579285, + "learning_rate": 0.0003502459612006913, + "loss": 0.5849, + "step": 30940 + }, + { + "epoch": 0.6888799857549858, + "grad_norm": 0.8839649558067322, + "learning_rate": 0.00035021518454452237, + "loss": 0.643, + "step": 30950 + }, + { + "epoch": 0.6891025641025641, + "grad_norm": 0.7639024257659912, + "learning_rate": 0.00035018439972553736, + "loss": 0.6718, + "step": 30960 + }, + { + "epoch": 0.6893251424501424, + "grad_norm": 0.8284814953804016, + "learning_rate": 0.0003501536067454091, + "loss": 0.6959, + "step": 30970 + }, + { + "epoch": 0.6895477207977208, + "grad_norm": 0.6810000538825989, + "learning_rate": 0.00035012280560581087, + "loss": 0.6354, + "step": 30980 + }, + { + "epoch": 0.6897702991452992, + "grad_norm": 0.6328868865966797, + "learning_rate": 0.00035009199630841654, + "loss": 0.6258, + "step": 30990 + }, + { + "epoch": 0.6899928774928775, + "grad_norm": 0.5300467610359192, + "learning_rate": 0.0003500611788549002, + "loss": 0.5189, + "step": 31000 + }, + { + "epoch": 0.6902154558404558, + "grad_norm": 0.7155830264091492, + "learning_rate": 0.0003500303532469366, + "loss": 0.6178, + "step": 31010 + }, + { + "epoch": 0.6904380341880342, + "grad_norm": 0.7034480571746826, + "learning_rate": 0.00034999951948620084, + "loss": 0.551, + "step": 31020 + }, + { + "epoch": 0.6906606125356125, + "grad_norm": 0.7898311018943787, + "learning_rate": 0.00034996867757436834, + "loss": 0.8097, + "step": 31030 + }, + { + "epoch": 0.6908831908831908, + "grad_norm": 0.7837551236152649, + "learning_rate": 0.0003499378275131151, + "loss": 0.598, + "step": 31040 + }, + { + "epoch": 0.6911057692307693, + "grad_norm": 0.6349511742591858, + "learning_rate": 0.00034990696930411764, + "loss": 0.6538, + "step": 31050 + }, + { + "epoch": 0.6913283475783476, + "grad_norm": 0.8751807808876038, + "learning_rate": 0.00034987610294905265, + "loss": 0.691, + "step": 31060 + }, + { + "epoch": 0.6915509259259259, + "grad_norm": 0.6790831089019775, + "learning_rate": 0.0003498452284495976, + "loss": 0.5019, + "step": 31070 + }, + { + "epoch": 0.6917735042735043, + "grad_norm": 0.6484060883522034, + "learning_rate": 0.0003498143458074302, + "loss": 0.6183, + "step": 31080 + }, + { + "epoch": 0.6919960826210826, + "grad_norm": 0.7466754913330078, + "learning_rate": 0.0003497834550242285, + "loss": 0.5818, + "step": 31090 + }, + { + "epoch": 0.6922186609686609, + "grad_norm": 0.5304076671600342, + "learning_rate": 0.0003497525561016713, + "loss": 0.5433, + "step": 31100 + }, + { + "epoch": 0.6924412393162394, + "grad_norm": 0.43836620450019836, + "learning_rate": 0.00034972164904143767, + "loss": 0.6111, + "step": 31110 + }, + { + "epoch": 0.6926638176638177, + "grad_norm": 0.5781193375587463, + "learning_rate": 0.0003496907338452069, + "loss": 0.6533, + "step": 31120 + }, + { + "epoch": 0.692886396011396, + "grad_norm": 0.4405815899372101, + "learning_rate": 0.00034965981051465923, + "loss": 0.6361, + "step": 31130 + }, + { + "epoch": 0.6931089743589743, + "grad_norm": 0.7085813879966736, + "learning_rate": 0.00034962887905147494, + "loss": 0.6169, + "step": 31140 + }, + { + "epoch": 0.6933315527065527, + "grad_norm": 0.412544846534729, + "learning_rate": 0.00034959793945733484, + "loss": 0.4898, + "step": 31150 + }, + { + "epoch": 0.6935541310541311, + "grad_norm": 0.8504443764686584, + "learning_rate": 0.00034956699173392024, + "loss": 0.6557, + "step": 31160 + }, + { + "epoch": 0.6937767094017094, + "grad_norm": 0.7342494130134583, + "learning_rate": 0.0003495360358829129, + "loss": 0.5863, + "step": 31170 + }, + { + "epoch": 0.6939992877492878, + "grad_norm": 1.312693476676941, + "learning_rate": 0.00034950507190599495, + "loss": 0.6473, + "step": 31180 + }, + { + "epoch": 0.6942218660968661, + "grad_norm": 0.7119789719581604, + "learning_rate": 0.000349474099804849, + "loss": 0.7246, + "step": 31190 + }, + { + "epoch": 0.6944444444444444, + "grad_norm": 0.7937467694282532, + "learning_rate": 0.0003494431195811581, + "loss": 0.6255, + "step": 31200 + }, + { + "epoch": 0.6946670227920227, + "grad_norm": 0.6429157257080078, + "learning_rate": 0.00034941213123660574, + "loss": 0.5958, + "step": 31210 + }, + { + "epoch": 0.6948896011396012, + "grad_norm": 0.8229726552963257, + "learning_rate": 0.0003493811347728758, + "loss": 0.5507, + "step": 31220 + }, + { + "epoch": 0.6951121794871795, + "grad_norm": 0.3695134222507477, + "learning_rate": 0.00034935013019165277, + "loss": 0.5767, + "step": 31230 + }, + { + "epoch": 0.6953347578347578, + "grad_norm": 0.6000204086303711, + "learning_rate": 0.00034931911749462135, + "loss": 0.6586, + "step": 31240 + }, + { + "epoch": 0.6955573361823362, + "grad_norm": 0.7966252565383911, + "learning_rate": 0.0003492880966834669, + "loss": 0.5889, + "step": 31250 + }, + { + "epoch": 0.6957799145299145, + "grad_norm": 0.7294942140579224, + "learning_rate": 0.00034925706775987503, + "loss": 0.6751, + "step": 31260 + }, + { + "epoch": 0.6960024928774928, + "grad_norm": 0.5740212798118591, + "learning_rate": 0.0003492260307255319, + "loss": 0.5789, + "step": 31270 + }, + { + "epoch": 0.6962250712250713, + "grad_norm": 0.6787393689155579, + "learning_rate": 0.00034919498558212415, + "loss": 0.6466, + "step": 31280 + }, + { + "epoch": 0.6964476495726496, + "grad_norm": 0.8797711730003357, + "learning_rate": 0.0003491639323313387, + "loss": 0.6238, + "step": 31290 + }, + { + "epoch": 0.6966702279202279, + "grad_norm": 0.7877987623214722, + "learning_rate": 0.0003491328709748631, + "loss": 0.6028, + "step": 31300 + }, + { + "epoch": 0.6968928062678063, + "grad_norm": 0.6889671683311462, + "learning_rate": 0.0003491018015143852, + "loss": 0.5021, + "step": 31310 + }, + { + "epoch": 0.6971153846153846, + "grad_norm": 0.38437703251838684, + "learning_rate": 0.0003490707239515933, + "loss": 0.676, + "step": 31320 + }, + { + "epoch": 0.6973379629629629, + "grad_norm": 0.6212413311004639, + "learning_rate": 0.00034903963828817626, + "loss": 0.6266, + "step": 31330 + }, + { + "epoch": 0.6975605413105413, + "grad_norm": 0.6480898857116699, + "learning_rate": 0.0003490085445258233, + "loss": 0.5704, + "step": 31340 + }, + { + "epoch": 0.6977831196581197, + "grad_norm": 0.658523678779602, + "learning_rate": 0.000348977442666224, + "loss": 0.7526, + "step": 31350 + }, + { + "epoch": 0.698005698005698, + "grad_norm": 1.17215096950531, + "learning_rate": 0.00034894633271106843, + "loss": 0.6893, + "step": 31360 + }, + { + "epoch": 0.6982282763532763, + "grad_norm": 0.8638959527015686, + "learning_rate": 0.0003489152146620473, + "loss": 0.7922, + "step": 31370 + }, + { + "epoch": 0.6984508547008547, + "grad_norm": 0.823543131351471, + "learning_rate": 0.00034888408852085155, + "loss": 0.5136, + "step": 31380 + }, + { + "epoch": 0.6986734330484331, + "grad_norm": 0.6390395760536194, + "learning_rate": 0.00034885295428917245, + "loss": 0.6816, + "step": 31390 + }, + { + "epoch": 0.6988960113960114, + "grad_norm": 0.6792179346084595, + "learning_rate": 0.000348821811968702, + "loss": 0.4866, + "step": 31400 + }, + { + "epoch": 0.6991185897435898, + "grad_norm": 0.5242462158203125, + "learning_rate": 0.00034879066156113245, + "loss": 0.5386, + "step": 31410 + }, + { + "epoch": 0.6993411680911681, + "grad_norm": 0.6970105767250061, + "learning_rate": 0.00034875950306815655, + "loss": 0.5325, + "step": 31420 + }, + { + "epoch": 0.6995637464387464, + "grad_norm": 0.7846478223800659, + "learning_rate": 0.0003487283364914674, + "loss": 0.6103, + "step": 31430 + }, + { + "epoch": 0.6997863247863247, + "grad_norm": 0.747134804725647, + "learning_rate": 0.0003486971618327588, + "loss": 0.7325, + "step": 31440 + }, + { + "epoch": 0.7000089031339032, + "grad_norm": 0.7611316442489624, + "learning_rate": 0.0003486659790937246, + "loss": 0.5463, + "step": 31450 + }, + { + "epoch": 0.7002314814814815, + "grad_norm": 1.1192259788513184, + "learning_rate": 0.0003486347882760595, + "loss": 0.7405, + "step": 31460 + }, + { + "epoch": 0.7004540598290598, + "grad_norm": 0.771552324295044, + "learning_rate": 0.00034860358938145825, + "loss": 0.5434, + "step": 31470 + }, + { + "epoch": 0.7006766381766382, + "grad_norm": 0.6160570383071899, + "learning_rate": 0.0003485723824116163, + "loss": 0.6583, + "step": 31480 + }, + { + "epoch": 0.7008992165242165, + "grad_norm": 0.7527022361755371, + "learning_rate": 0.00034854116736822953, + "loss": 0.6388, + "step": 31490 + }, + { + "epoch": 0.7011217948717948, + "grad_norm": 0.870607852935791, + "learning_rate": 0.00034850994425299404, + "loss": 0.6683, + "step": 31500 + }, + { + "epoch": 0.7013443732193733, + "grad_norm": 0.7308458685874939, + "learning_rate": 0.00034847871306760664, + "loss": 0.6992, + "step": 31510 + }, + { + "epoch": 0.7015669515669516, + "grad_norm": 0.7584428787231445, + "learning_rate": 0.0003484474738137644, + "loss": 0.649, + "step": 31520 + }, + { + "epoch": 0.7017895299145299, + "grad_norm": 0.7453662157058716, + "learning_rate": 0.000348416226493165, + "loss": 0.6592, + "step": 31530 + }, + { + "epoch": 0.7020121082621082, + "grad_norm": 0.7075641751289368, + "learning_rate": 0.00034838497110750623, + "loss": 0.6829, + "step": 31540 + }, + { + "epoch": 0.7022346866096866, + "grad_norm": 0.9151403307914734, + "learning_rate": 0.0003483537076584867, + "loss": 0.6963, + "step": 31550 + }, + { + "epoch": 0.7024572649572649, + "grad_norm": 0.5822266340255737, + "learning_rate": 0.0003483224361478053, + "loss": 0.5676, + "step": 31560 + }, + { + "epoch": 0.7026798433048433, + "grad_norm": 1.0856817960739136, + "learning_rate": 0.00034829115657716126, + "loss": 0.8445, + "step": 31570 + }, + { + "epoch": 0.7029024216524217, + "grad_norm": 0.5421498417854309, + "learning_rate": 0.00034825986894825435, + "loss": 0.5714, + "step": 31580 + }, + { + "epoch": 0.703125, + "grad_norm": 0.6657152771949768, + "learning_rate": 0.0003482285732627848, + "loss": 0.5931, + "step": 31590 + }, + { + "epoch": 0.7033475783475783, + "grad_norm": 0.8101097345352173, + "learning_rate": 0.00034819726952245325, + "loss": 0.6062, + "step": 31600 + }, + { + "epoch": 0.7035701566951567, + "grad_norm": 0.6771178841590881, + "learning_rate": 0.00034816595772896075, + "loss": 0.7197, + "step": 31610 + }, + { + "epoch": 0.7037927350427351, + "grad_norm": 0.8368094563484192, + "learning_rate": 0.0003481346378840088, + "loss": 0.6603, + "step": 31620 + }, + { + "epoch": 0.7040153133903134, + "grad_norm": 0.567988932132721, + "learning_rate": 0.00034810330998929936, + "loss": 0.6759, + "step": 31630 + }, + { + "epoch": 0.7042378917378918, + "grad_norm": 0.5423449277877808, + "learning_rate": 0.0003480719740465348, + "loss": 0.5562, + "step": 31640 + }, + { + "epoch": 0.7044604700854701, + "grad_norm": 0.704281210899353, + "learning_rate": 0.000348040630057418, + "loss": 0.5836, + "step": 31650 + }, + { + "epoch": 0.7046830484330484, + "grad_norm": 0.6200312972068787, + "learning_rate": 0.00034800927802365215, + "loss": 0.4597, + "step": 31660 + }, + { + "epoch": 0.7049056267806267, + "grad_norm": 0.5637904405593872, + "learning_rate": 0.00034797791794694097, + "loss": 0.7486, + "step": 31670 + }, + { + "epoch": 0.7051282051282052, + "grad_norm": 0.8819078207015991, + "learning_rate": 0.00034794654982898856, + "loss": 0.6331, + "step": 31680 + }, + { + "epoch": 0.7053507834757835, + "grad_norm": 0.8103722929954529, + "learning_rate": 0.00034791517367149956, + "loss": 0.6076, + "step": 31690 + }, + { + "epoch": 0.7055733618233618, + "grad_norm": 0.5280768275260925, + "learning_rate": 0.0003478837894761789, + "loss": 0.5688, + "step": 31700 + }, + { + "epoch": 0.7057959401709402, + "grad_norm": 0.8044383525848389, + "learning_rate": 0.0003478523972447321, + "loss": 0.7396, + "step": 31710 + }, + { + "epoch": 0.7060185185185185, + "grad_norm": 0.7547085881233215, + "learning_rate": 0.0003478209969788649, + "loss": 0.5739, + "step": 31720 + }, + { + "epoch": 0.7062410968660968, + "grad_norm": 0.7226533889770508, + "learning_rate": 0.0003477895886802838, + "loss": 0.5934, + "step": 31730 + }, + { + "epoch": 0.7064636752136753, + "grad_norm": 0.5839481353759766, + "learning_rate": 0.0003477581723506955, + "loss": 0.5804, + "step": 31740 + }, + { + "epoch": 0.7066862535612536, + "grad_norm": 0.6620025634765625, + "learning_rate": 0.000347726747991807, + "loss": 0.6022, + "step": 31750 + }, + { + "epoch": 0.7069088319088319, + "grad_norm": 0.7869464159011841, + "learning_rate": 0.0003476953156053262, + "loss": 0.6546, + "step": 31760 + }, + { + "epoch": 0.7071314102564102, + "grad_norm": 0.6986963748931885, + "learning_rate": 0.0003476638751929611, + "loss": 0.666, + "step": 31770 + }, + { + "epoch": 0.7073539886039886, + "grad_norm": 0.8924002051353455, + "learning_rate": 0.00034763242675642003, + "loss": 0.6925, + "step": 31780 + }, + { + "epoch": 0.7075765669515669, + "grad_norm": 0.5170907378196716, + "learning_rate": 0.000347600970297412, + "loss": 0.5942, + "step": 31790 + }, + { + "epoch": 0.7077991452991453, + "grad_norm": 0.9141911268234253, + "learning_rate": 0.0003475695058176465, + "loss": 0.6946, + "step": 31800 + }, + { + "epoch": 0.7080217236467237, + "grad_norm": 0.6999244689941406, + "learning_rate": 0.0003475380333188332, + "loss": 0.7155, + "step": 31810 + }, + { + "epoch": 0.708244301994302, + "grad_norm": 0.8103520274162292, + "learning_rate": 0.0003475065528026824, + "loss": 0.5796, + "step": 31820 + }, + { + "epoch": 0.7084668803418803, + "grad_norm": 0.6898837089538574, + "learning_rate": 0.0003474750642709048, + "loss": 0.5836, + "step": 31830 + }, + { + "epoch": 0.7086894586894587, + "grad_norm": 0.7483912706375122, + "learning_rate": 0.0003474435677252115, + "loss": 0.5765, + "step": 31840 + }, + { + "epoch": 0.7089120370370371, + "grad_norm": 0.9520956873893738, + "learning_rate": 0.0003474120631673139, + "loss": 0.6258, + "step": 31850 + }, + { + "epoch": 0.7091346153846154, + "grad_norm": 0.5112124681472778, + "learning_rate": 0.0003473805505989242, + "loss": 0.6141, + "step": 31860 + }, + { + "epoch": 0.7093571937321937, + "grad_norm": 0.7033752202987671, + "learning_rate": 0.0003473490300217547, + "loss": 0.5962, + "step": 31870 + }, + { + "epoch": 0.7095797720797721, + "grad_norm": 0.8038191795349121, + "learning_rate": 0.00034731750143751833, + "loss": 0.5635, + "step": 31880 + }, + { + "epoch": 0.7098023504273504, + "grad_norm": 0.7981531023979187, + "learning_rate": 0.0003472859648479283, + "loss": 0.6539, + "step": 31890 + }, + { + "epoch": 0.7100249287749287, + "grad_norm": 0.8092349767684937, + "learning_rate": 0.0003472544202546984, + "loss": 0.5704, + "step": 31900 + }, + { + "epoch": 0.7102475071225072, + "grad_norm": 0.7940104007720947, + "learning_rate": 0.00034722286765954274, + "loss": 0.6257, + "step": 31910 + }, + { + "epoch": 0.7104700854700855, + "grad_norm": 0.9252044558525085, + "learning_rate": 0.00034719130706417585, + "loss": 0.6482, + "step": 31920 + }, + { + "epoch": 0.7106926638176638, + "grad_norm": 1.02089524269104, + "learning_rate": 0.00034715973847031294, + "loss": 0.5812, + "step": 31930 + }, + { + "epoch": 0.7109152421652422, + "grad_norm": 0.7068274617195129, + "learning_rate": 0.0003471281618796693, + "loss": 0.6511, + "step": 31940 + }, + { + "epoch": 0.7111378205128205, + "grad_norm": 0.8781763315200806, + "learning_rate": 0.0003470965772939609, + "loss": 0.5816, + "step": 31950 + }, + { + "epoch": 0.7113603988603988, + "grad_norm": 0.909304678440094, + "learning_rate": 0.00034706498471490414, + "loss": 0.6203, + "step": 31960 + }, + { + "epoch": 0.7115829772079773, + "grad_norm": 0.7124899625778198, + "learning_rate": 0.0003470333841442157, + "loss": 1.022, + "step": 31970 + }, + { + "epoch": 0.7118055555555556, + "grad_norm": 0.8078319430351257, + "learning_rate": 0.00034700177558361273, + "loss": 0.6832, + "step": 31980 + }, + { + "epoch": 0.7120281339031339, + "grad_norm": 0.8073205351829529, + "learning_rate": 0.00034697015903481304, + "loss": 0.5991, + "step": 31990 + }, + { + "epoch": 0.7122507122507122, + "grad_norm": 0.9433754682540894, + "learning_rate": 0.0003469385344995345, + "loss": 0.6174, + "step": 32000 + }, + { + "epoch": 0.7124732905982906, + "grad_norm": 0.44269195199012756, + "learning_rate": 0.0003469069019794958, + "loss": 0.6549, + "step": 32010 + }, + { + "epoch": 0.7126958689458689, + "grad_norm": 0.5322273373603821, + "learning_rate": 0.0003468752614764156, + "loss": 0.6386, + "step": 32020 + }, + { + "epoch": 0.7129184472934473, + "grad_norm": 0.6944203972816467, + "learning_rate": 0.00034684361299201365, + "loss": 0.5912, + "step": 32030 + }, + { + "epoch": 0.7131410256410257, + "grad_norm": 0.7633023858070374, + "learning_rate": 0.00034681195652800945, + "loss": 0.5741, + "step": 32040 + }, + { + "epoch": 0.713363603988604, + "grad_norm": 0.8351055383682251, + "learning_rate": 0.00034678029208612345, + "loss": 0.5826, + "step": 32050 + }, + { + "epoch": 0.7135861823361823, + "grad_norm": 0.5108731985092163, + "learning_rate": 0.00034674861966807615, + "loss": 0.5997, + "step": 32060 + }, + { + "epoch": 0.7138087606837606, + "grad_norm": 0.48932331800460815, + "learning_rate": 0.0003467169392755887, + "loss": 0.5924, + "step": 32070 + }, + { + "epoch": 0.7140313390313391, + "grad_norm": 0.3925066888332367, + "learning_rate": 0.00034668525091038265, + "loss": 0.6352, + "step": 32080 + }, + { + "epoch": 0.7142539173789174, + "grad_norm": 0.9368833303451538, + "learning_rate": 0.00034665355457418, + "loss": 0.679, + "step": 32090 + }, + { + "epoch": 0.7144764957264957, + "grad_norm": 0.8664660453796387, + "learning_rate": 0.00034662185026870324, + "loss": 0.6223, + "step": 32100 + }, + { + "epoch": 0.7146990740740741, + "grad_norm": 0.8929124474525452, + "learning_rate": 0.000346590137995675, + "loss": 0.5348, + "step": 32110 + }, + { + "epoch": 0.7149216524216524, + "grad_norm": 0.5446501970291138, + "learning_rate": 0.0003465584177568187, + "loss": 0.5645, + "step": 32120 + }, + { + "epoch": 0.7151442307692307, + "grad_norm": 0.5276156663894653, + "learning_rate": 0.0003465266895538579, + "loss": 0.6256, + "step": 32130 + }, + { + "epoch": 0.7153668091168092, + "grad_norm": 0.5158522129058838, + "learning_rate": 0.000346494953388517, + "loss": 0.6573, + "step": 32140 + }, + { + "epoch": 0.7155893874643875, + "grad_norm": 0.8431310653686523, + "learning_rate": 0.00034646320926252027, + "loss": 0.8396, + "step": 32150 + }, + { + "epoch": 0.7158119658119658, + "grad_norm": 0.6748244762420654, + "learning_rate": 0.0003464314571775929, + "loss": 0.5628, + "step": 32160 + }, + { + "epoch": 0.7160345441595442, + "grad_norm": 0.668067216873169, + "learning_rate": 0.0003463996971354603, + "loss": 0.5305, + "step": 32170 + }, + { + "epoch": 0.7162571225071225, + "grad_norm": 0.5041331648826599, + "learning_rate": 0.0003463679291378483, + "loss": 0.63, + "step": 32180 + }, + { + "epoch": 0.7164797008547008, + "grad_norm": 0.749484658241272, + "learning_rate": 0.0003463361531864831, + "loss": 0.677, + "step": 32190 + }, + { + "epoch": 0.7167022792022792, + "grad_norm": 0.6030726432800293, + "learning_rate": 0.0003463043692830917, + "loss": 0.5044, + "step": 32200 + }, + { + "epoch": 0.7169248575498576, + "grad_norm": 0.4747789204120636, + "learning_rate": 0.000346272577429401, + "loss": 0.5998, + "step": 32210 + }, + { + "epoch": 0.7171474358974359, + "grad_norm": 0.6367260217666626, + "learning_rate": 0.0003462407776271388, + "loss": 0.6071, + "step": 32220 + }, + { + "epoch": 0.7173700142450142, + "grad_norm": 0.453498899936676, + "learning_rate": 0.00034620896987803295, + "loss": 0.7674, + "step": 32230 + }, + { + "epoch": 0.7175925925925926, + "grad_norm": 0.8879516124725342, + "learning_rate": 0.00034617715418381196, + "loss": 0.5801, + "step": 32240 + }, + { + "epoch": 0.7178151709401709, + "grad_norm": 0.5511301159858704, + "learning_rate": 0.00034614533054620473, + "loss": 0.5343, + "step": 32250 + }, + { + "epoch": 0.7180377492877493, + "grad_norm": 0.45961880683898926, + "learning_rate": 0.0003461134989669407, + "loss": 0.5569, + "step": 32260 + }, + { + "epoch": 0.7182603276353277, + "grad_norm": 0.8157079815864563, + "learning_rate": 0.00034608165944774943, + "loss": 0.6672, + "step": 32270 + }, + { + "epoch": 0.718482905982906, + "grad_norm": 1.0624051094055176, + "learning_rate": 0.0003460498119903613, + "loss": 0.6932, + "step": 32280 + }, + { + "epoch": 0.7187054843304843, + "grad_norm": 0.6351505517959595, + "learning_rate": 0.0003460179565965067, + "loss": 0.6836, + "step": 32290 + }, + { + "epoch": 0.7189280626780626, + "grad_norm": 0.5927011370658875, + "learning_rate": 0.0003459860932679169, + "loss": 0.7204, + "step": 32300 + }, + { + "epoch": 0.7191506410256411, + "grad_norm": 0.6216453909873962, + "learning_rate": 0.00034595422200632325, + "loss": 0.4918, + "step": 32310 + }, + { + "epoch": 0.7193732193732194, + "grad_norm": 0.9162493944168091, + "learning_rate": 0.00034592234281345766, + "loss": 0.601, + "step": 32320 + }, + { + "epoch": 0.7195957977207977, + "grad_norm": 0.6896253228187561, + "learning_rate": 0.00034589045569105256, + "loss": 0.706, + "step": 32330 + }, + { + "epoch": 0.7198183760683761, + "grad_norm": 0.5422629714012146, + "learning_rate": 0.00034585856064084066, + "loss": 0.63, + "step": 32340 + }, + { + "epoch": 0.7200409544159544, + "grad_norm": 0.6882944107055664, + "learning_rate": 0.0003458266576645551, + "loss": 0.5267, + "step": 32350 + }, + { + "epoch": 0.7200854700854701, + "eval_loss": 0.6198766827583313, + "eval_runtime": 337.4337, + "eval_samples_per_second": 7.009, + "eval_steps_per_second": 7.009, + "step": 32352 + }, + { + "epoch": 0.7202635327635327, + "grad_norm": 0.8453060388565063, + "learning_rate": 0.0003457947467639296, + "loss": 0.6723, + "step": 32360 + }, + { + "epoch": 0.7204861111111112, + "grad_norm": 0.6546816229820251, + "learning_rate": 0.00034576282794069826, + "loss": 0.607, + "step": 32370 + }, + { + "epoch": 0.7207086894586895, + "grad_norm": 0.5675908327102661, + "learning_rate": 0.0003457309011965955, + "loss": 0.6337, + "step": 32380 + }, + { + "epoch": 0.7209312678062678, + "grad_norm": 0.45033371448516846, + "learning_rate": 0.00034569896653335625, + "loss": 0.6044, + "step": 32390 + }, + { + "epoch": 0.7211538461538461, + "grad_norm": 0.6744174361228943, + "learning_rate": 0.00034566702395271597, + "loss": 0.6373, + "step": 32400 + }, + { + "epoch": 0.7213764245014245, + "grad_norm": 0.7250241041183472, + "learning_rate": 0.0003456350734564103, + "loss": 0.7083, + "step": 32410 + }, + { + "epoch": 0.7215990028490028, + "grad_norm": 0.3396146297454834, + "learning_rate": 0.0003456031150461755, + "loss": 0.5993, + "step": 32420 + }, + { + "epoch": 0.7218215811965812, + "grad_norm": 0.6883082985877991, + "learning_rate": 0.00034557114872374824, + "loss": 0.6734, + "step": 32430 + }, + { + "epoch": 0.7220441595441596, + "grad_norm": 0.7887311577796936, + "learning_rate": 0.00034553917449086556, + "loss": 0.6377, + "step": 32440 + }, + { + "epoch": 0.7222667378917379, + "grad_norm": 0.7964504957199097, + "learning_rate": 0.00034550719234926504, + "loss": 0.7179, + "step": 32450 + }, + { + "epoch": 0.7224893162393162, + "grad_norm": 0.7036212086677551, + "learning_rate": 0.00034547520230068454, + "loss": 0.6612, + "step": 32460 + }, + { + "epoch": 0.7227118945868946, + "grad_norm": 0.4557095170021057, + "learning_rate": 0.00034544320434686253, + "loss": 0.5499, + "step": 32470 + }, + { + "epoch": 0.7229344729344729, + "grad_norm": 0.9200230240821838, + "learning_rate": 0.00034541119848953764, + "loss": 0.592, + "step": 32480 + }, + { + "epoch": 0.7231570512820513, + "grad_norm": 0.48239457607269287, + "learning_rate": 0.00034537918473044924, + "loss": 0.6185, + "step": 32490 + }, + { + "epoch": 0.7233796296296297, + "grad_norm": 0.6177913546562195, + "learning_rate": 0.00034534716307133684, + "loss": 0.7293, + "step": 32500 + }, + { + "epoch": 0.723602207977208, + "grad_norm": 0.8865199685096741, + "learning_rate": 0.0003453151335139407, + "loss": 0.5699, + "step": 32510 + }, + { + "epoch": 0.7238247863247863, + "grad_norm": 0.6912305951118469, + "learning_rate": 0.0003452830960600012, + "loss": 0.5005, + "step": 32520 + }, + { + "epoch": 0.7240473646723646, + "grad_norm": 0.5994203090667725, + "learning_rate": 0.00034525105071125933, + "loss": 0.6104, + "step": 32530 + }, + { + "epoch": 0.7242699430199431, + "grad_norm": 1.0352870225906372, + "learning_rate": 0.0003452189974694565, + "loss": 0.8157, + "step": 32540 + }, + { + "epoch": 0.7244925213675214, + "grad_norm": 0.4674214720726013, + "learning_rate": 0.0003451869363363344, + "loss": 0.5084, + "step": 32550 + }, + { + "epoch": 0.7247150997150997, + "grad_norm": 0.9042168259620667, + "learning_rate": 0.0003451548673136354, + "loss": 0.6597, + "step": 32560 + }, + { + "epoch": 0.7249376780626781, + "grad_norm": 0.7562392950057983, + "learning_rate": 0.000345122790403102, + "loss": 0.6503, + "step": 32570 + }, + { + "epoch": 0.7251602564102564, + "grad_norm": 0.5220441222190857, + "learning_rate": 0.0003450907056064774, + "loss": 0.6522, + "step": 32580 + }, + { + "epoch": 0.7253828347578347, + "grad_norm": 0.32991522550582886, + "learning_rate": 0.00034505861292550514, + "loss": 0.5881, + "step": 32590 + }, + { + "epoch": 0.7256054131054132, + "grad_norm": 0.7591485381126404, + "learning_rate": 0.00034502651236192905, + "loss": 0.6425, + "step": 32600 + }, + { + "epoch": 0.7258279914529915, + "grad_norm": 0.6426007151603699, + "learning_rate": 0.0003449944039174935, + "loss": 0.626, + "step": 32610 + }, + { + "epoch": 0.7260505698005698, + "grad_norm": 0.5632911920547485, + "learning_rate": 0.0003449622875939435, + "loss": 0.67, + "step": 32620 + }, + { + "epoch": 0.7262731481481481, + "grad_norm": 0.6731725931167603, + "learning_rate": 0.00034493016339302396, + "loss": 0.5885, + "step": 32630 + }, + { + "epoch": 0.7264957264957265, + "grad_norm": 0.6174749732017517, + "learning_rate": 0.00034489803131648077, + "loss": 0.6846, + "step": 32640 + }, + { + "epoch": 0.7267183048433048, + "grad_norm": 1.0542770624160767, + "learning_rate": 0.00034486589136605993, + "loss": 0.6128, + "step": 32650 + }, + { + "epoch": 0.7269408831908832, + "grad_norm": 0.5963648557662964, + "learning_rate": 0.00034483374354350797, + "loss": 0.6567, + "step": 32660 + }, + { + "epoch": 0.7271634615384616, + "grad_norm": 0.912215530872345, + "learning_rate": 0.0003448015878505718, + "loss": 0.6237, + "step": 32670 + }, + { + "epoch": 0.7273860398860399, + "grad_norm": 0.6201345324516296, + "learning_rate": 0.00034476942428899877, + "loss": 0.5933, + "step": 32680 + }, + { + "epoch": 0.7276086182336182, + "grad_norm": 0.9106643199920654, + "learning_rate": 0.0003447372528605368, + "loss": 0.7458, + "step": 32690 + }, + { + "epoch": 0.7278311965811965, + "grad_norm": 0.7907149195671082, + "learning_rate": 0.00034470507356693396, + "loss": 0.7266, + "step": 32700 + }, + { + "epoch": 0.7280537749287749, + "grad_norm": 0.44493839144706726, + "learning_rate": 0.00034467288640993896, + "loss": 0.5099, + "step": 32710 + }, + { + "epoch": 0.7282763532763533, + "grad_norm": 0.7286844849586487, + "learning_rate": 0.0003446406913913009, + "loss": 0.6397, + "step": 32720 + }, + { + "epoch": 0.7284989316239316, + "grad_norm": 0.6643046736717224, + "learning_rate": 0.00034460848851276924, + "loss": 0.6728, + "step": 32730 + }, + { + "epoch": 0.72872150997151, + "grad_norm": 0.8569263219833374, + "learning_rate": 0.000344576277776094, + "loss": 0.6978, + "step": 32740 + }, + { + "epoch": 0.7289440883190883, + "grad_norm": 0.4604608118534088, + "learning_rate": 0.0003445440591830254, + "loss": 0.5785, + "step": 32750 + }, + { + "epoch": 0.7291666666666666, + "grad_norm": 1.0402268171310425, + "learning_rate": 0.0003445118327353143, + "loss": 0.646, + "step": 32760 + }, + { + "epoch": 0.7293892450142451, + "grad_norm": 0.6748735308647156, + "learning_rate": 0.0003444795984347119, + "loss": 0.5849, + "step": 32770 + }, + { + "epoch": 0.7296118233618234, + "grad_norm": 1.337708592414856, + "learning_rate": 0.0003444473562829699, + "loss": 0.7221, + "step": 32780 + }, + { + "epoch": 0.7298344017094017, + "grad_norm": 0.633197546005249, + "learning_rate": 0.00034441510628184025, + "loss": 0.6345, + "step": 32790 + }, + { + "epoch": 0.73005698005698, + "grad_norm": 0.8810262084007263, + "learning_rate": 0.0003443828484330755, + "loss": 0.6605, + "step": 32800 + }, + { + "epoch": 0.7302795584045584, + "grad_norm": 0.6289214491844177, + "learning_rate": 0.0003443505827384286, + "loss": 0.7498, + "step": 32810 + }, + { + "epoch": 0.7305021367521367, + "grad_norm": 0.5705475807189941, + "learning_rate": 0.00034431830919965284, + "loss": 0.581, + "step": 32820 + }, + { + "epoch": 0.7307247150997151, + "grad_norm": 0.7125277519226074, + "learning_rate": 0.0003442860278185021, + "loss": 0.5964, + "step": 32830 + }, + { + "epoch": 0.7309472934472935, + "grad_norm": 0.6184514760971069, + "learning_rate": 0.0003442537385967303, + "loss": 0.4608, + "step": 32840 + }, + { + "epoch": 0.7311698717948718, + "grad_norm": 0.48804301023483276, + "learning_rate": 0.00034422144153609243, + "loss": 0.5924, + "step": 32850 + }, + { + "epoch": 0.7313924501424501, + "grad_norm": 0.661341667175293, + "learning_rate": 0.00034418913663834333, + "loss": 0.7042, + "step": 32860 + }, + { + "epoch": 0.7316150284900285, + "grad_norm": 0.6228981614112854, + "learning_rate": 0.00034415682390523844, + "loss": 0.6062, + "step": 32870 + }, + { + "epoch": 0.7318376068376068, + "grad_norm": 0.6047166585922241, + "learning_rate": 0.0003441245033385338, + "loss": 0.5581, + "step": 32880 + }, + { + "epoch": 0.7320601851851852, + "grad_norm": 0.7112004160881042, + "learning_rate": 0.0003440921749399856, + "loss": 0.6883, + "step": 32890 + }, + { + "epoch": 0.7322827635327636, + "grad_norm": 0.778866708278656, + "learning_rate": 0.0003440598387113507, + "loss": 0.6048, + "step": 32900 + }, + { + "epoch": 0.7325053418803419, + "grad_norm": 0.6084232330322266, + "learning_rate": 0.0003440274946543862, + "loss": 0.5741, + "step": 32910 + }, + { + "epoch": 0.7327279202279202, + "grad_norm": 0.6071760654449463, + "learning_rate": 0.00034399514277084976, + "loss": 0.5831, + "step": 32920 + }, + { + "epoch": 0.7329504985754985, + "grad_norm": 0.622017502784729, + "learning_rate": 0.00034396278306249935, + "loss": 0.5463, + "step": 32930 + }, + { + "epoch": 0.7331730769230769, + "grad_norm": 0.6587297320365906, + "learning_rate": 0.00034393041553109347, + "loss": 0.5529, + "step": 32940 + }, + { + "epoch": 0.7333956552706553, + "grad_norm": 0.8048000335693359, + "learning_rate": 0.00034389804017839103, + "loss": 0.6364, + "step": 32950 + }, + { + "epoch": 0.7336182336182336, + "grad_norm": 0.76850426197052, + "learning_rate": 0.00034386565700615125, + "loss": 0.5629, + "step": 32960 + }, + { + "epoch": 0.733840811965812, + "grad_norm": 0.6999525427818298, + "learning_rate": 0.00034383326601613386, + "loss": 0.6677, + "step": 32970 + }, + { + "epoch": 0.7340633903133903, + "grad_norm": 0.794244647026062, + "learning_rate": 0.0003438008672100991, + "loss": 0.7736, + "step": 32980 + }, + { + "epoch": 0.7342859686609686, + "grad_norm": 0.7045572996139526, + "learning_rate": 0.00034376846058980744, + "loss": 0.6419, + "step": 32990 + }, + { + "epoch": 0.7345085470085471, + "grad_norm": 0.6053374409675598, + "learning_rate": 0.00034373604615702, + "loss": 0.4803, + "step": 33000 + }, + { + "epoch": 0.7347311253561254, + "grad_norm": 0.7537468671798706, + "learning_rate": 0.0003437036239134981, + "loss": 0.5578, + "step": 33010 + }, + { + "epoch": 0.7349537037037037, + "grad_norm": 0.7278876304626465, + "learning_rate": 0.0003436711938610037, + "loss": 0.4338, + "step": 33020 + }, + { + "epoch": 0.735176282051282, + "grad_norm": 0.6402742862701416, + "learning_rate": 0.0003436387560012989, + "loss": 0.59, + "step": 33030 + }, + { + "epoch": 0.7353988603988604, + "grad_norm": 0.7100064158439636, + "learning_rate": 0.0003436063103361466, + "loss": 0.6198, + "step": 33040 + }, + { + "epoch": 0.7356214387464387, + "grad_norm": 0.8709797859191895, + "learning_rate": 0.0003435738568673098, + "loss": 0.7059, + "step": 33050 + }, + { + "epoch": 0.7358440170940171, + "grad_norm": 0.5005736351013184, + "learning_rate": 0.0003435413955965521, + "loss": 0.6524, + "step": 33060 + }, + { + "epoch": 0.7360665954415955, + "grad_norm": 0.6272966265678406, + "learning_rate": 0.00034350892652563737, + "loss": 0.6263, + "step": 33070 + }, + { + "epoch": 0.7362891737891738, + "grad_norm": 0.6624988317489624, + "learning_rate": 0.00034347644965633024, + "loss": 0.5786, + "step": 33080 + }, + { + "epoch": 0.7365117521367521, + "grad_norm": 0.9508379697799683, + "learning_rate": 0.00034344396499039523, + "loss": 0.5954, + "step": 33090 + }, + { + "epoch": 0.7367343304843305, + "grad_norm": 0.5892514586448669, + "learning_rate": 0.0003434114725295978, + "loss": 0.6951, + "step": 33100 + }, + { + "epoch": 0.7369569088319088, + "grad_norm": 0.6346218585968018, + "learning_rate": 0.0003433789722757036, + "loss": 0.6452, + "step": 33110 + }, + { + "epoch": 0.7371794871794872, + "grad_norm": 0.6012640595436096, + "learning_rate": 0.0003433464642304786, + "loss": 0.4765, + "step": 33120 + }, + { + "epoch": 0.7374020655270656, + "grad_norm": 0.6348587870597839, + "learning_rate": 0.00034331394839568944, + "loss": 0.5871, + "step": 33130 + }, + { + "epoch": 0.7376246438746439, + "grad_norm": 0.5616714954376221, + "learning_rate": 0.000343281424773103, + "loss": 0.6428, + "step": 33140 + }, + { + "epoch": 0.7378472222222222, + "grad_norm": 0.7842782139778137, + "learning_rate": 0.0003432488933644866, + "loss": 0.5245, + "step": 33150 + }, + { + "epoch": 0.7380698005698005, + "grad_norm": 0.5432232022285461, + "learning_rate": 0.0003432163541716081, + "loss": 0.6654, + "step": 33160 + }, + { + "epoch": 0.7382923789173789, + "grad_norm": 0.5655632615089417, + "learning_rate": 0.00034318380719623563, + "loss": 0.6713, + "step": 33170 + }, + { + "epoch": 0.7385149572649573, + "grad_norm": 0.5200754404067993, + "learning_rate": 0.0003431512524401379, + "loss": 0.6942, + "step": 33180 + }, + { + "epoch": 0.7387375356125356, + "grad_norm": 0.7658675312995911, + "learning_rate": 0.00034311868990508386, + "loss": 0.5798, + "step": 33190 + }, + { + "epoch": 0.738960113960114, + "grad_norm": 0.5904093980789185, + "learning_rate": 0.0003430861195928431, + "loss": 0.6543, + "step": 33200 + }, + { + "epoch": 0.7391826923076923, + "grad_norm": 0.9393807649612427, + "learning_rate": 0.00034305354150518554, + "loss": 0.5894, + "step": 33210 + }, + { + "epoch": 0.7394052706552706, + "grad_norm": 0.5630085468292236, + "learning_rate": 0.0003430209556438813, + "loss": 0.6652, + "step": 33220 + }, + { + "epoch": 0.7396278490028491, + "grad_norm": 0.8873823881149292, + "learning_rate": 0.0003429883620107013, + "loss": 0.5247, + "step": 33230 + }, + { + "epoch": 0.7398504273504274, + "grad_norm": 0.6598261594772339, + "learning_rate": 0.00034295576060741666, + "loss": 0.5125, + "step": 33240 + }, + { + "epoch": 0.7400730056980057, + "grad_norm": 0.5915985107421875, + "learning_rate": 0.000342923151435799, + "loss": 0.637, + "step": 33250 + }, + { + "epoch": 0.740295584045584, + "grad_norm": 0.8445550799369812, + "learning_rate": 0.0003428905344976202, + "loss": 0.6333, + "step": 33260 + }, + { + "epoch": 0.7405181623931624, + "grad_norm": 0.9027766585350037, + "learning_rate": 0.0003428579097946528, + "loss": 0.6398, + "step": 33270 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.6807312965393066, + "learning_rate": 0.0003428252773286697, + "loss": 0.6241, + "step": 33280 + }, + { + "epoch": 0.7409633190883191, + "grad_norm": 0.6065702438354492, + "learning_rate": 0.000342792637101444, + "loss": 0.7469, + "step": 33290 + }, + { + "epoch": 0.7411858974358975, + "grad_norm": 0.9258270859718323, + "learning_rate": 0.00034275998911474957, + "loss": 0.7389, + "step": 33300 + }, + { + "epoch": 0.7414084757834758, + "grad_norm": 0.5104573965072632, + "learning_rate": 0.0003427273333703605, + "loss": 0.6547, + "step": 33310 + }, + { + "epoch": 0.7416310541310541, + "grad_norm": 0.7797284126281738, + "learning_rate": 0.0003426946698700512, + "loss": 0.6665, + "step": 33320 + }, + { + "epoch": 0.7418536324786325, + "grad_norm": 0.8530895709991455, + "learning_rate": 0.00034266199861559675, + "loss": 0.5328, + "step": 33330 + }, + { + "epoch": 0.7420762108262108, + "grad_norm": 0.5849500298500061, + "learning_rate": 0.00034262931960877256, + "loss": 0.6509, + "step": 33340 + }, + { + "epoch": 0.7422987891737892, + "grad_norm": 0.7217156291007996, + "learning_rate": 0.00034259663285135435, + "loss": 0.6171, + "step": 33350 + }, + { + "epoch": 0.7425213675213675, + "grad_norm": 0.5825576186180115, + "learning_rate": 0.0003425639383451184, + "loss": 0.674, + "step": 33360 + }, + { + "epoch": 0.7427439458689459, + "grad_norm": 0.41654032468795776, + "learning_rate": 0.00034253123609184126, + "loss": 0.5258, + "step": 33370 + }, + { + "epoch": 0.7429665242165242, + "grad_norm": 0.7584710121154785, + "learning_rate": 0.0003424985260933001, + "loss": 0.4909, + "step": 33380 + }, + { + "epoch": 0.7431891025641025, + "grad_norm": 0.7973439693450928, + "learning_rate": 0.00034246580835127244, + "loss": 0.6153, + "step": 33390 + }, + { + "epoch": 0.7434116809116809, + "grad_norm": 0.6786189079284668, + "learning_rate": 0.0003424330828675361, + "loss": 0.6246, + "step": 33400 + }, + { + "epoch": 0.7436342592592593, + "grad_norm": 0.40943869948387146, + "learning_rate": 0.0003424003496438694, + "loss": 0.4983, + "step": 33410 + }, + { + "epoch": 0.7438568376068376, + "grad_norm": 0.5922877192497253, + "learning_rate": 0.00034236760868205116, + "loss": 0.6999, + "step": 33420 + }, + { + "epoch": 0.744079415954416, + "grad_norm": 0.6780363917350769, + "learning_rate": 0.00034233485998386046, + "loss": 0.7115, + "step": 33430 + }, + { + "epoch": 0.7443019943019943, + "grad_norm": 1.0446654558181763, + "learning_rate": 0.000342302103551077, + "loss": 0.5759, + "step": 33440 + }, + { + "epoch": 0.7445245726495726, + "grad_norm": 0.6994029879570007, + "learning_rate": 0.00034226933938548074, + "loss": 0.5636, + "step": 33450 + }, + { + "epoch": 0.7447471509971509, + "grad_norm": 0.6624497175216675, + "learning_rate": 0.0003422365674888521, + "loss": 0.6727, + "step": 33460 + }, + { + "epoch": 0.7449697293447294, + "grad_norm": 0.6512168645858765, + "learning_rate": 0.0003422037878629719, + "loss": 0.593, + "step": 33470 + }, + { + "epoch": 0.7451923076923077, + "grad_norm": 0.49083104729652405, + "learning_rate": 0.00034217100050962153, + "loss": 0.5841, + "step": 33480 + }, + { + "epoch": 0.745414886039886, + "grad_norm": 0.8152703046798706, + "learning_rate": 0.0003421382054305825, + "loss": 0.6204, + "step": 33490 + }, + { + "epoch": 0.7456374643874644, + "grad_norm": 0.7201706767082214, + "learning_rate": 0.0003421054026276371, + "loss": 0.6814, + "step": 33500 + }, + { + "epoch": 0.7458600427350427, + "grad_norm": 0.5058859586715698, + "learning_rate": 0.00034207259210256784, + "loss": 0.5908, + "step": 33510 + }, + { + "epoch": 0.7460826210826211, + "grad_norm": 0.5888089537620544, + "learning_rate": 0.00034203977385715754, + "loss": 0.5705, + "step": 33520 + }, + { + "epoch": 0.7463051994301995, + "grad_norm": 0.3613194227218628, + "learning_rate": 0.00034200694789318967, + "loss": 0.6557, + "step": 33530 + }, + { + "epoch": 0.7465277777777778, + "grad_norm": 0.7530273795127869, + "learning_rate": 0.00034197411421244803, + "loss": 0.6738, + "step": 33540 + }, + { + "epoch": 0.7467503561253561, + "grad_norm": 0.84321129322052, + "learning_rate": 0.00034194127281671677, + "loss": 0.5973, + "step": 33550 + }, + { + "epoch": 0.7469729344729344, + "grad_norm": 0.7950652837753296, + "learning_rate": 0.00034190842370778054, + "loss": 0.7292, + "step": 33560 + }, + { + "epoch": 0.7471955128205128, + "grad_norm": 0.7758309841156006, + "learning_rate": 0.00034187556688742443, + "loss": 0.6571, + "step": 33570 + }, + { + "epoch": 0.7474180911680912, + "grad_norm": 0.712580144405365, + "learning_rate": 0.0003418427023574338, + "loss": 0.5803, + "step": 33580 + }, + { + "epoch": 0.7476406695156695, + "grad_norm": 0.523370087146759, + "learning_rate": 0.00034180983011959464, + "loss": 0.6803, + "step": 33590 + }, + { + "epoch": 0.7478632478632479, + "grad_norm": 0.5187812447547913, + "learning_rate": 0.00034177695017569324, + "loss": 0.5895, + "step": 33600 + }, + { + "epoch": 0.7480858262108262, + "grad_norm": 0.9683651328086853, + "learning_rate": 0.0003417440625275163, + "loss": 0.6858, + "step": 33610 + }, + { + "epoch": 0.7483084045584045, + "grad_norm": 0.9042038917541504, + "learning_rate": 0.00034171116717685094, + "loss": 0.7055, + "step": 33620 + }, + { + "epoch": 0.7485309829059829, + "grad_norm": 0.5068843960762024, + "learning_rate": 0.00034167826412548477, + "loss": 0.6619, + "step": 33630 + }, + { + "epoch": 0.7487535612535613, + "grad_norm": 0.5242236852645874, + "learning_rate": 0.00034164535337520574, + "loss": 0.5987, + "step": 33640 + }, + { + "epoch": 0.7489761396011396, + "grad_norm": 0.6024574041366577, + "learning_rate": 0.00034161243492780225, + "loss": 0.6789, + "step": 33650 + }, + { + "epoch": 0.749198717948718, + "grad_norm": 0.587216854095459, + "learning_rate": 0.00034157950878506313, + "loss": 0.6347, + "step": 33660 + }, + { + "epoch": 0.7494212962962963, + "grad_norm": 0.6755560636520386, + "learning_rate": 0.0003415465749487776, + "loss": 0.6212, + "step": 33670 + }, + { + "epoch": 0.7496438746438746, + "grad_norm": 0.5546956062316895, + "learning_rate": 0.00034151363342073524, + "loss": 0.5529, + "step": 33680 + }, + { + "epoch": 0.7498664529914529, + "grad_norm": 0.7829976081848145, + "learning_rate": 0.0003414806842027263, + "loss": 0.7853, + "step": 33690 + }, + { + "epoch": 0.7500890313390314, + "grad_norm": 0.7056668996810913, + "learning_rate": 0.00034144772729654107, + "loss": 0.4261, + "step": 33700 + }, + { + "epoch": 0.7503116096866097, + "grad_norm": 0.4800557792186737, + "learning_rate": 0.00034141476270397057, + "loss": 0.6852, + "step": 33710 + }, + { + "epoch": 0.750534188034188, + "grad_norm": 0.5922639966011047, + "learning_rate": 0.0003413817904268061, + "loss": 0.5152, + "step": 33720 + }, + { + "epoch": 0.7507567663817664, + "grad_norm": 0.6094176769256592, + "learning_rate": 0.0003413488104668394, + "loss": 0.6417, + "step": 33730 + }, + { + "epoch": 0.7509793447293447, + "grad_norm": 0.5336151719093323, + "learning_rate": 0.00034131582282586264, + "loss": 0.472, + "step": 33740 + }, + { + "epoch": 0.7512019230769231, + "grad_norm": 1.060437560081482, + "learning_rate": 0.00034128282750566836, + "loss": 0.6104, + "step": 33750 + }, + { + "epoch": 0.7514245014245015, + "grad_norm": 0.8258956074714661, + "learning_rate": 0.0003412498245080496, + "loss": 0.8442, + "step": 33760 + }, + { + "epoch": 0.7516470797720798, + "grad_norm": 0.6972953081130981, + "learning_rate": 0.00034121681383479977, + "loss": 0.6083, + "step": 33770 + }, + { + "epoch": 0.7518696581196581, + "grad_norm": 0.588059663772583, + "learning_rate": 0.0003411837954877126, + "loss": 0.6383, + "step": 33780 + }, + { + "epoch": 0.7520922364672364, + "grad_norm": 0.8543235659599304, + "learning_rate": 0.00034115076946858246, + "loss": 0.5739, + "step": 33790 + }, + { + "epoch": 0.7523148148148148, + "grad_norm": 0.5297170877456665, + "learning_rate": 0.00034111773577920394, + "loss": 0.6443, + "step": 33800 + }, + { + "epoch": 0.7525373931623932, + "grad_norm": 0.5189324021339417, + "learning_rate": 0.0003410846944213721, + "loss": 0.5406, + "step": 33810 + }, + { + "epoch": 0.7527599715099715, + "grad_norm": 0.8608090877532959, + "learning_rate": 0.00034105164539688246, + "loss": 0.6248, + "step": 33820 + }, + { + "epoch": 0.7529825498575499, + "grad_norm": 0.8554071187973022, + "learning_rate": 0.000341018588707531, + "loss": 0.6015, + "step": 33830 + }, + { + "epoch": 0.7532051282051282, + "grad_norm": 0.5413314700126648, + "learning_rate": 0.000340985524355114, + "loss": 0.6217, + "step": 33840 + }, + { + "epoch": 0.7534277065527065, + "grad_norm": 0.7716408371925354, + "learning_rate": 0.0003409524523414281, + "loss": 0.5588, + "step": 33850 + }, + { + "epoch": 0.7536502849002849, + "grad_norm": 0.7285047173500061, + "learning_rate": 0.0003409193726682706, + "loss": 0.7738, + "step": 33860 + }, + { + "epoch": 0.7538728632478633, + "grad_norm": 0.4201757311820984, + "learning_rate": 0.000340886285337439, + "loss": 0.5486, + "step": 33870 + }, + { + "epoch": 0.7540954415954416, + "grad_norm": 0.5679952502250671, + "learning_rate": 0.00034085319035073134, + "loss": 0.5122, + "step": 33880 + }, + { + "epoch": 0.75431801994302, + "grad_norm": 0.5826485753059387, + "learning_rate": 0.00034082008770994606, + "loss": 0.549, + "step": 33890 + }, + { + "epoch": 0.7545405982905983, + "grad_norm": 0.3409044146537781, + "learning_rate": 0.0003407869774168819, + "loss": 0.6678, + "step": 33900 + }, + { + "epoch": 0.7547631766381766, + "grad_norm": 0.5800386667251587, + "learning_rate": 0.00034075385947333805, + "loss": 0.6745, + "step": 33910 + }, + { + "epoch": 0.7549857549857549, + "grad_norm": 0.5018765926361084, + "learning_rate": 0.0003407207338811144, + "loss": 0.4987, + "step": 33920 + }, + { + "epoch": 0.7552083333333334, + "grad_norm": 0.7626890540122986, + "learning_rate": 0.0003406876006420108, + "loss": 0.5695, + "step": 33930 + }, + { + "epoch": 0.7554309116809117, + "grad_norm": 0.6939761638641357, + "learning_rate": 0.0003406544597578278, + "loss": 0.718, + "step": 33940 + }, + { + "epoch": 0.75565349002849, + "grad_norm": 0.5773590803146362, + "learning_rate": 0.0003406213112303663, + "loss": 0.7233, + "step": 33950 + }, + { + "epoch": 0.7558760683760684, + "grad_norm": 0.6457138061523438, + "learning_rate": 0.00034058815506142763, + "loss": 0.5802, + "step": 33960 + }, + { + "epoch": 0.7560986467236467, + "grad_norm": 0.643480122089386, + "learning_rate": 0.0003405549912528135, + "loss": 0.6462, + "step": 33970 + }, + { + "epoch": 0.7563212250712251, + "grad_norm": 0.6513316631317139, + "learning_rate": 0.00034052181980632617, + "loss": 0.6008, + "step": 33980 + }, + { + "epoch": 0.7565438034188035, + "grad_norm": 0.6498247385025024, + "learning_rate": 0.00034048864072376805, + "loss": 0.6596, + "step": 33990 + }, + { + "epoch": 0.7567663817663818, + "grad_norm": 0.7045326232910156, + "learning_rate": 0.0003404554540069422, + "loss": 0.5607, + "step": 34000 + }, + { + "epoch": 0.7569889601139601, + "grad_norm": 0.9087044596672058, + "learning_rate": 0.000340422259657652, + "loss": 0.7722, + "step": 34010 + }, + { + "epoch": 0.7572115384615384, + "grad_norm": 0.6496840715408325, + "learning_rate": 0.00034038905767770123, + "loss": 0.5651, + "step": 34020 + }, + { + "epoch": 0.7574341168091168, + "grad_norm": 0.6747355461120605, + "learning_rate": 0.00034035584806889417, + "loss": 0.5881, + "step": 34030 + }, + { + "epoch": 0.7576566951566952, + "grad_norm": 1.1374226808547974, + "learning_rate": 0.0003403226308330354, + "loss": 0.725, + "step": 34040 + }, + { + "epoch": 0.7578792735042735, + "grad_norm": 0.7352719902992249, + "learning_rate": 0.00034028940597193003, + "loss": 0.6371, + "step": 34050 + }, + { + "epoch": 0.7581018518518519, + "grad_norm": 0.8843091726303101, + "learning_rate": 0.0003402561734873834, + "loss": 0.5925, + "step": 34060 + }, + { + "epoch": 0.7583244301994302, + "grad_norm": 0.4928925335407257, + "learning_rate": 0.0003402229333812016, + "loss": 0.6538, + "step": 34070 + }, + { + "epoch": 0.7585470085470085, + "grad_norm": 0.4970260262489319, + "learning_rate": 0.00034018968565519074, + "loss": 0.4335, + "step": 34080 + }, + { + "epoch": 0.7587695868945868, + "grad_norm": 1.0510859489440918, + "learning_rate": 0.0003401564303111576, + "loss": 0.6611, + "step": 34090 + }, + { + "epoch": 0.7589921652421653, + "grad_norm": 0.5787474513053894, + "learning_rate": 0.00034012316735090934, + "loss": 0.4912, + "step": 34100 + }, + { + "epoch": 0.7592147435897436, + "grad_norm": 0.7155798673629761, + "learning_rate": 0.0003400898967762535, + "loss": 0.7107, + "step": 34110 + }, + { + "epoch": 0.7594373219373219, + "grad_norm": 0.8900055885314941, + "learning_rate": 0.0003400566185889979, + "loss": 0.6932, + "step": 34120 + }, + { + "epoch": 0.7596599002849003, + "grad_norm": 0.4244909882545471, + "learning_rate": 0.00034002333279095105, + "loss": 0.5289, + "step": 34130 + }, + { + "epoch": 0.7598824786324786, + "grad_norm": 0.4296218454837799, + "learning_rate": 0.0003399900393839216, + "loss": 0.5492, + "step": 34140 + }, + { + "epoch": 0.7601050569800569, + "grad_norm": 0.5768629312515259, + "learning_rate": 0.0003399567383697188, + "loss": 0.4549, + "step": 34150 + }, + { + "epoch": 0.7603276353276354, + "grad_norm": 0.5289151668548584, + "learning_rate": 0.0003399234297501523, + "loss": 0.5806, + "step": 34160 + }, + { + "epoch": 0.7605502136752137, + "grad_norm": 0.9661064147949219, + "learning_rate": 0.0003398901135270321, + "loss": 0.6065, + "step": 34170 + }, + { + "epoch": 0.760772792022792, + "grad_norm": 0.6725865006446838, + "learning_rate": 0.00033985678970216865, + "loss": 0.6203, + "step": 34180 + }, + { + "epoch": 0.7609953703703703, + "grad_norm": 0.7186846137046814, + "learning_rate": 0.00033982345827737265, + "loss": 0.6291, + "step": 34190 + }, + { + "epoch": 0.7612179487179487, + "grad_norm": 0.6042883396148682, + "learning_rate": 0.0003397901192544555, + "loss": 0.5486, + "step": 34200 + }, + { + "epoch": 0.7614405270655271, + "grad_norm": 0.41314706206321716, + "learning_rate": 0.0003397567726352289, + "loss": 0.5823, + "step": 34210 + }, + { + "epoch": 0.7616631054131054, + "grad_norm": 0.6745661497116089, + "learning_rate": 0.0003397234184215048, + "loss": 0.6311, + "step": 34220 + }, + { + "epoch": 0.7618856837606838, + "grad_norm": 0.6066137552261353, + "learning_rate": 0.0003396900566150957, + "loss": 0.7573, + "step": 34230 + }, + { + "epoch": 0.7621082621082621, + "grad_norm": 0.6698915362358093, + "learning_rate": 0.0003396566872178147, + "loss": 0.5738, + "step": 34240 + }, + { + "epoch": 0.7623308404558404, + "grad_norm": 0.6477178931236267, + "learning_rate": 0.00033962331023147495, + "loss": 0.6469, + "step": 34250 + }, + { + "epoch": 0.7625534188034188, + "grad_norm": 0.7714323997497559, + "learning_rate": 0.00033958992565789013, + "loss": 0.5805, + "step": 34260 + }, + { + "epoch": 0.7627759971509972, + "grad_norm": 0.6476292610168457, + "learning_rate": 0.00033955653349887455, + "loss": 0.665, + "step": 34270 + }, + { + "epoch": 0.7629985754985755, + "grad_norm": 0.49801915884017944, + "learning_rate": 0.0003395231337562427, + "loss": 0.5172, + "step": 34280 + }, + { + "epoch": 0.7632211538461539, + "grad_norm": 0.4742410182952881, + "learning_rate": 0.0003394897264318095, + "loss": 0.6301, + "step": 34290 + }, + { + "epoch": 0.7634437321937322, + "grad_norm": 0.8749886751174927, + "learning_rate": 0.0003394563115273904, + "loss": 0.5594, + "step": 34300 + }, + { + "epoch": 0.7636663105413105, + "grad_norm": 0.8473349809646606, + "learning_rate": 0.00033942288904480124, + "loss": 0.6868, + "step": 34310 + }, + { + "epoch": 0.7638888888888888, + "grad_norm": 1.0486420392990112, + "learning_rate": 0.00033938945898585805, + "loss": 0.6768, + "step": 34320 + }, + { + "epoch": 0.7641114672364673, + "grad_norm": 0.5907694697380066, + "learning_rate": 0.00033935602135237757, + "loss": 0.6682, + "step": 34330 + }, + { + "epoch": 0.7643340455840456, + "grad_norm": 0.7957288026809692, + "learning_rate": 0.00033932257614617686, + "loss": 0.4985, + "step": 34340 + }, + { + "epoch": 0.7645566239316239, + "grad_norm": 0.9030253291130066, + "learning_rate": 0.00033928912336907325, + "loss": 0.6389, + "step": 34350 + }, + { + "epoch": 0.7647792022792023, + "grad_norm": 0.6879715323448181, + "learning_rate": 0.00033925566302288465, + "loss": 0.6378, + "step": 34360 + }, + { + "epoch": 0.7650017806267806, + "grad_norm": 0.6054188013076782, + "learning_rate": 0.00033922219510942934, + "loss": 0.6265, + "step": 34370 + }, + { + "epoch": 0.7652243589743589, + "grad_norm": 0.6758320331573486, + "learning_rate": 0.000339188719630526, + "loss": 0.6042, + "step": 34380 + }, + { + "epoch": 0.7654469373219374, + "grad_norm": 0.717961311340332, + "learning_rate": 0.00033915523658799366, + "loss": 0.7417, + "step": 34390 + }, + { + "epoch": 0.7656695156695157, + "grad_norm": 0.7856202721595764, + "learning_rate": 0.00033912174598365187, + "loss": 0.5175, + "step": 34400 + }, + { + "epoch": 0.765892094017094, + "grad_norm": 0.7919657826423645, + "learning_rate": 0.0003390882478193205, + "loss": 0.6577, + "step": 34410 + }, + { + "epoch": 0.7661146723646723, + "grad_norm": 0.642520546913147, + "learning_rate": 0.0003390547420968198, + "loss": 0.6411, + "step": 34420 + }, + { + "epoch": 0.7663372507122507, + "grad_norm": 0.6515674591064453, + "learning_rate": 0.0003390212288179707, + "loss": 0.5741, + "step": 34430 + }, + { + "epoch": 0.7665598290598291, + "grad_norm": 0.7228606939315796, + "learning_rate": 0.0003389877079845942, + "loss": 0.6719, + "step": 34440 + }, + { + "epoch": 0.7667824074074074, + "grad_norm": 0.6175971031188965, + "learning_rate": 0.00033895417959851177, + "loss": 0.7009, + "step": 34450 + }, + { + "epoch": 0.7670049857549858, + "grad_norm": 0.6188730597496033, + "learning_rate": 0.00033892064366154555, + "loss": 0.6157, + "step": 34460 + }, + { + "epoch": 0.7672275641025641, + "grad_norm": 0.70756596326828, + "learning_rate": 0.00033888710017551785, + "loss": 0.6073, + "step": 34470 + }, + { + "epoch": 0.7674501424501424, + "grad_norm": 0.6952245235443115, + "learning_rate": 0.0003388535491422514, + "loss": 0.658, + "step": 34480 + }, + { + "epoch": 0.7676727207977208, + "grad_norm": 0.7117817401885986, + "learning_rate": 0.0003388199905635694, + "loss": 0.635, + "step": 34490 + }, + { + "epoch": 0.7678952991452992, + "grad_norm": 0.8737725615501404, + "learning_rate": 0.00033878642444129547, + "loss": 0.6462, + "step": 34500 + }, + { + "epoch": 0.7681178774928775, + "grad_norm": 0.8220909237861633, + "learning_rate": 0.0003387528507772536, + "loss": 0.7008, + "step": 34510 + }, + { + "epoch": 0.7683404558404558, + "grad_norm": 0.728020191192627, + "learning_rate": 0.0003387192695732683, + "loss": 0.5707, + "step": 34520 + }, + { + "epoch": 0.7685630341880342, + "grad_norm": 0.7700955271720886, + "learning_rate": 0.00033868568083116426, + "loss": 0.6196, + "step": 34530 + }, + { + "epoch": 0.7687856125356125, + "grad_norm": 0.725366473197937, + "learning_rate": 0.0003386520845527668, + "loss": 0.5375, + "step": 34540 + }, + { + "epoch": 0.7690081908831908, + "grad_norm": 0.48743993043899536, + "learning_rate": 0.0003386184807399016, + "loss": 0.4771, + "step": 34550 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.777994692325592, + "learning_rate": 0.00033858486939439465, + "loss": 0.6777, + "step": 34560 + }, + { + "epoch": 0.7694533475783476, + "grad_norm": 0.8114678263664246, + "learning_rate": 0.00033855125051807246, + "loss": 0.7312, + "step": 34570 + }, + { + "epoch": 0.7696759259259259, + "grad_norm": 0.7239607572555542, + "learning_rate": 0.0003385176241127619, + "loss": 0.6857, + "step": 34580 + }, + { + "epoch": 0.7698985042735043, + "grad_norm": 0.6901978850364685, + "learning_rate": 0.00033848399018029024, + "loss": 0.6023, + "step": 34590 + }, + { + "epoch": 0.7701210826210826, + "grad_norm": 0.5981554388999939, + "learning_rate": 0.00033845034872248515, + "loss": 0.6188, + "step": 34600 + }, + { + "epoch": 0.7703436609686609, + "grad_norm": 0.9385389685630798, + "learning_rate": 0.0003384166997411748, + "loss": 0.6454, + "step": 34610 + }, + { + "epoch": 0.7705662393162394, + "grad_norm": 0.6972981095314026, + "learning_rate": 0.0003383830432381877, + "loss": 0.5368, + "step": 34620 + }, + { + "epoch": 0.7707888176638177, + "grad_norm": 0.7719412446022034, + "learning_rate": 0.0003383493792153527, + "loss": 0.5493, + "step": 34630 + }, + { + "epoch": 0.771011396011396, + "grad_norm": 0.6207097172737122, + "learning_rate": 0.0003383157076744992, + "loss": 0.4886, + "step": 34640 + }, + { + "epoch": 0.7712339743589743, + "grad_norm": 0.7396118640899658, + "learning_rate": 0.0003382820286174569, + "loss": 0.5986, + "step": 34650 + }, + { + "epoch": 0.7714565527065527, + "grad_norm": 0.40450310707092285, + "learning_rate": 0.00033824834204605595, + "loss": 0.6138, + "step": 34660 + }, + { + "epoch": 0.7716791310541311, + "grad_norm": 0.5757637023925781, + "learning_rate": 0.00033821464796212697, + "loss": 0.5282, + "step": 34670 + }, + { + "epoch": 0.7719017094017094, + "grad_norm": 0.6552518606185913, + "learning_rate": 0.00033818094636750085, + "loss": 0.5035, + "step": 34680 + }, + { + "epoch": 0.7721242877492878, + "grad_norm": 0.6848192811012268, + "learning_rate": 0.00033814723726400896, + "loss": 0.6008, + "step": 34690 + }, + { + "epoch": 0.7723468660968661, + "grad_norm": 0.5395011305809021, + "learning_rate": 0.0003381135206534832, + "loss": 0.7105, + "step": 34700 + }, + { + "epoch": 0.7725694444444444, + "grad_norm": 0.5970385670661926, + "learning_rate": 0.00033807979653775554, + "loss": 0.592, + "step": 34710 + }, + { + "epoch": 0.7727920227920227, + "grad_norm": 0.6868297457695007, + "learning_rate": 0.00033804606491865877, + "loss": 0.6598, + "step": 34720 + }, + { + "epoch": 0.7730146011396012, + "grad_norm": 0.5928608775138855, + "learning_rate": 0.0003380123257980257, + "loss": 0.5612, + "step": 34730 + }, + { + "epoch": 0.7732371794871795, + "grad_norm": 0.7232846021652222, + "learning_rate": 0.00033797857917769003, + "loss": 0.509, + "step": 34740 + }, + { + "epoch": 0.7734597578347578, + "grad_norm": 0.662609875202179, + "learning_rate": 0.0003379448250594853, + "loss": 0.6415, + "step": 34750 + }, + { + "epoch": 0.7736823361823362, + "grad_norm": 0.286639541387558, + "learning_rate": 0.00033791106344524584, + "loss": 0.6826, + "step": 34760 + }, + { + "epoch": 0.7739049145299145, + "grad_norm": 1.0770177841186523, + "learning_rate": 0.0003378772943368064, + "loss": 0.5784, + "step": 34770 + }, + { + "epoch": 0.7741274928774928, + "grad_norm": 0.6964942216873169, + "learning_rate": 0.0003378435177360019, + "loss": 0.4603, + "step": 34780 + }, + { + "epoch": 0.7743500712250713, + "grad_norm": 0.40256690979003906, + "learning_rate": 0.0003378097336446677, + "loss": 0.5608, + "step": 34790 + }, + { + "epoch": 0.7745726495726496, + "grad_norm": 0.5216047763824463, + "learning_rate": 0.0003377759420646398, + "loss": 0.4453, + "step": 34800 + }, + { + "epoch": 0.7747952279202279, + "grad_norm": 0.7011343240737915, + "learning_rate": 0.00033774214299775446, + "loss": 0.6648, + "step": 34810 + }, + { + "epoch": 0.7750178062678063, + "grad_norm": 0.6892892122268677, + "learning_rate": 0.00033770833644584827, + "loss": 0.6408, + "step": 34820 + }, + { + "epoch": 0.7752403846153846, + "grad_norm": 1.0617207288742065, + "learning_rate": 0.00033767452241075836, + "loss": 0.5153, + "step": 34830 + }, + { + "epoch": 0.7754629629629629, + "grad_norm": 0.6845688819885254, + "learning_rate": 0.00033764070089432224, + "loss": 0.718, + "step": 34840 + }, + { + "epoch": 0.7756855413105413, + "grad_norm": 0.5099952816963196, + "learning_rate": 0.00033760687189837767, + "loss": 0.7615, + "step": 34850 + }, + { + "epoch": 0.7759081196581197, + "grad_norm": 0.5010229349136353, + "learning_rate": 0.00033757303542476314, + "loss": 0.5986, + "step": 34860 + }, + { + "epoch": 0.776130698005698, + "grad_norm": 0.50068199634552, + "learning_rate": 0.00033753919147531714, + "loss": 0.6493, + "step": 34870 + }, + { + "epoch": 0.7763532763532763, + "grad_norm": 0.7622873187065125, + "learning_rate": 0.00033750534005187895, + "loss": 0.7228, + "step": 34880 + }, + { + "epoch": 0.7765758547008547, + "grad_norm": 0.769127368927002, + "learning_rate": 0.00033747148115628793, + "loss": 0.5665, + "step": 34890 + }, + { + "epoch": 0.7767984330484331, + "grad_norm": 0.8573402762413025, + "learning_rate": 0.0003374376147903842, + "loss": 0.6565, + "step": 34900 + }, + { + "epoch": 0.7770210113960114, + "grad_norm": 0.5039611458778381, + "learning_rate": 0.0003374037409560078, + "loss": 0.7783, + "step": 34910 + }, + { + "epoch": 0.7772435897435898, + "grad_norm": 0.8162313103675842, + "learning_rate": 0.0003373698596549998, + "loss": 0.6285, + "step": 34920 + }, + { + "epoch": 0.7774661680911681, + "grad_norm": 0.6889898777008057, + "learning_rate": 0.0003373359708892011, + "loss": 0.6166, + "step": 34930 + }, + { + "epoch": 0.7776887464387464, + "grad_norm": 0.4901689887046814, + "learning_rate": 0.0003373020746604533, + "loss": 0.4887, + "step": 34940 + }, + { + "epoch": 0.7779113247863247, + "grad_norm": 0.9870404005050659, + "learning_rate": 0.0003372681709705984, + "loss": 0.5822, + "step": 34950 + }, + { + "epoch": 0.7781339031339032, + "grad_norm": 0.8554285764694214, + "learning_rate": 0.0003372342598214787, + "loss": 0.6114, + "step": 34960 + }, + { + "epoch": 0.7783564814814815, + "grad_norm": 0.6954330801963806, + "learning_rate": 0.000337200341214937, + "loss": 0.6105, + "step": 34970 + }, + { + "epoch": 0.7785790598290598, + "grad_norm": 0.6738479137420654, + "learning_rate": 0.0003371664151528164, + "loss": 0.7448, + "step": 34980 + }, + { + "epoch": 0.7788016381766382, + "grad_norm": 0.5652036070823669, + "learning_rate": 0.00033713248163696054, + "loss": 0.5914, + "step": 34990 + }, + { + "epoch": 0.7790242165242165, + "grad_norm": 0.5839760899543762, + "learning_rate": 0.00033709854066921337, + "loss": 0.7341, + "step": 35000 + }, + { + "epoch": 0.7792467948717948, + "grad_norm": 0.5647395849227905, + "learning_rate": 0.0003370645922514192, + "loss": 0.5992, + "step": 35010 + }, + { + "epoch": 0.7794693732193733, + "grad_norm": 0.6457863450050354, + "learning_rate": 0.0003370306363854229, + "loss": 0.6987, + "step": 35020 + }, + { + "epoch": 0.7796919515669516, + "grad_norm": 0.9118708968162537, + "learning_rate": 0.0003369966730730697, + "loss": 0.5721, + "step": 35030 + }, + { + "epoch": 0.7799145299145299, + "grad_norm": 0.4689837694168091, + "learning_rate": 0.00033696270231620514, + "loss": 0.6103, + "step": 35040 + }, + { + "epoch": 0.7800925925925926, + "eval_loss": 0.6148799061775208, + "eval_runtime": 337.3683, + "eval_samples_per_second": 7.01, + "eval_steps_per_second": 7.01, + "step": 35048 + }, + { + "epoch": 0.7801371082621082, + "grad_norm": 0.6478744149208069, + "learning_rate": 0.00033692872411667516, + "loss": 0.6644, + "step": 35050 + }, + { + "epoch": 0.7803596866096866, + "grad_norm": 0.4604703187942505, + "learning_rate": 0.0003368947384763263, + "loss": 0.5932, + "step": 35060 + }, + { + "epoch": 0.7805822649572649, + "grad_norm": 0.5048819780349731, + "learning_rate": 0.00033686074539700516, + "loss": 0.5681, + "step": 35070 + }, + { + "epoch": 0.7808048433048433, + "grad_norm": 0.9202898740768433, + "learning_rate": 0.0003368267448805591, + "loss": 0.7438, + "step": 35080 + }, + { + "epoch": 0.7810274216524217, + "grad_norm": 0.7058709859848022, + "learning_rate": 0.00033679273692883575, + "loss": 0.607, + "step": 35090 + }, + { + "epoch": 0.78125, + "grad_norm": 0.4312692880630493, + "learning_rate": 0.00033675872154368314, + "loss": 0.5137, + "step": 35100 + }, + { + "epoch": 0.7814725783475783, + "grad_norm": 0.9358759522438049, + "learning_rate": 0.00033672469872694956, + "loss": 0.6285, + "step": 35110 + }, + { + "epoch": 0.7816951566951567, + "grad_norm": 0.5190538763999939, + "learning_rate": 0.00033669066848048397, + "loss": 0.4727, + "step": 35120 + }, + { + "epoch": 0.7819177350427351, + "grad_norm": 0.6738810539245605, + "learning_rate": 0.0003366566308061355, + "loss": 0.7522, + "step": 35130 + }, + { + "epoch": 0.7821403133903134, + "grad_norm": 0.903041660785675, + "learning_rate": 0.0003366225857057539, + "loss": 0.5667, + "step": 35140 + }, + { + "epoch": 0.7823628917378918, + "grad_norm": 0.8216555714607239, + "learning_rate": 0.00033658853318118916, + "loss": 0.6111, + "step": 35150 + }, + { + "epoch": 0.7825854700854701, + "grad_norm": 0.9077396392822266, + "learning_rate": 0.0003365544732342917, + "loss": 0.6179, + "step": 35160 + }, + { + "epoch": 0.7828080484330484, + "grad_norm": 1.029372215270996, + "learning_rate": 0.00033652040586691233, + "loss": 0.6137, + "step": 35170 + }, + { + "epoch": 0.7830306267806267, + "grad_norm": 0.569773256778717, + "learning_rate": 0.0003364863310809024, + "loss": 0.5054, + "step": 35180 + }, + { + "epoch": 0.7832532051282052, + "grad_norm": 0.8230850696563721, + "learning_rate": 0.00033645224887811343, + "loss": 0.4942, + "step": 35190 + }, + { + "epoch": 0.7834757834757835, + "grad_norm": 0.5945245623588562, + "learning_rate": 0.0003364181592603976, + "loss": 0.6899, + "step": 35200 + }, + { + "epoch": 0.7836983618233618, + "grad_norm": 0.7154036164283752, + "learning_rate": 0.00033638406222960733, + "loss": 0.6614, + "step": 35210 + }, + { + "epoch": 0.7839209401709402, + "grad_norm": 0.7123019695281982, + "learning_rate": 0.00033634995778759544, + "loss": 0.8057, + "step": 35220 + }, + { + "epoch": 0.7841435185185185, + "grad_norm": 0.733191728591919, + "learning_rate": 0.00033631584593621524, + "loss": 0.6401, + "step": 35230 + }, + { + "epoch": 0.7843660968660968, + "grad_norm": 0.5150040984153748, + "learning_rate": 0.0003362817266773204, + "loss": 0.5645, + "step": 35240 + }, + { + "epoch": 0.7845886752136753, + "grad_norm": 0.8314915895462036, + "learning_rate": 0.0003362476000127649, + "loss": 0.6106, + "step": 35250 + }, + { + "epoch": 0.7848112535612536, + "grad_norm": 0.42255085706710815, + "learning_rate": 0.00033621346594440337, + "loss": 0.5478, + "step": 35260 + }, + { + "epoch": 0.7850338319088319, + "grad_norm": 0.6707674860954285, + "learning_rate": 0.0003361793244740905, + "loss": 0.7008, + "step": 35270 + }, + { + "epoch": 0.7852564102564102, + "grad_norm": 1.109925627708435, + "learning_rate": 0.0003361451756036817, + "loss": 0.724, + "step": 35280 + }, + { + "epoch": 0.7854789886039886, + "grad_norm": 0.9113699197769165, + "learning_rate": 0.0003361110193350326, + "loss": 0.6307, + "step": 35290 + }, + { + "epoch": 0.7857015669515669, + "grad_norm": 0.8045711517333984, + "learning_rate": 0.0003360768556699993, + "loss": 0.5713, + "step": 35300 + }, + { + "epoch": 0.7859241452991453, + "grad_norm": 0.5049736499786377, + "learning_rate": 0.00033604268461043826, + "loss": 0.6299, + "step": 35310 + }, + { + "epoch": 0.7861467236467237, + "grad_norm": 0.6946551203727722, + "learning_rate": 0.0003360085061582064, + "loss": 0.6118, + "step": 35320 + }, + { + "epoch": 0.786369301994302, + "grad_norm": 0.6586059927940369, + "learning_rate": 0.00033597432031516085, + "loss": 0.5925, + "step": 35330 + }, + { + "epoch": 0.7865918803418803, + "grad_norm": 0.8724303245544434, + "learning_rate": 0.00033594012708315955, + "loss": 0.4647, + "step": 35340 + }, + { + "epoch": 0.7868144586894587, + "grad_norm": 0.6088204383850098, + "learning_rate": 0.0003359059264640604, + "loss": 0.6173, + "step": 35350 + }, + { + "epoch": 0.7870370370370371, + "grad_norm": 0.718809187412262, + "learning_rate": 0.000335871718459722, + "loss": 0.6609, + "step": 35360 + }, + { + "epoch": 0.7872596153846154, + "grad_norm": 0.7139840722084045, + "learning_rate": 0.0003358375030720031, + "loss": 0.6287, + "step": 35370 + }, + { + "epoch": 0.7874821937321937, + "grad_norm": 0.6466283202171326, + "learning_rate": 0.0003358032803027632, + "loss": 0.5697, + "step": 35380 + }, + { + "epoch": 0.7877047720797721, + "grad_norm": 0.6111293435096741, + "learning_rate": 0.0003357690501538618, + "loss": 0.6616, + "step": 35390 + }, + { + "epoch": 0.7879273504273504, + "grad_norm": 0.7320126891136169, + "learning_rate": 0.000335734812627159, + "loss": 0.7165, + "step": 35400 + }, + { + "epoch": 0.7881499287749287, + "grad_norm": 1.3482513427734375, + "learning_rate": 0.00033570056772451543, + "loss": 0.4982, + "step": 35410 + }, + { + "epoch": 0.7883725071225072, + "grad_norm": 0.4562806785106659, + "learning_rate": 0.00033566631544779195, + "loss": 0.6573, + "step": 35420 + }, + { + "epoch": 0.7885950854700855, + "grad_norm": 0.663861870765686, + "learning_rate": 0.00033563205579884985, + "loss": 0.6171, + "step": 35430 + }, + { + "epoch": 0.7888176638176638, + "grad_norm": 0.6606183648109436, + "learning_rate": 0.00033559778877955077, + "loss": 0.5925, + "step": 35440 + }, + { + "epoch": 0.7890402421652422, + "grad_norm": 0.6257029175758362, + "learning_rate": 0.0003355635143917568, + "loss": 0.6735, + "step": 35450 + }, + { + "epoch": 0.7892628205128205, + "grad_norm": 0.7059953808784485, + "learning_rate": 0.0003355292326373305, + "loss": 0.6859, + "step": 35460 + }, + { + "epoch": 0.7894853988603988, + "grad_norm": 0.5329318642616272, + "learning_rate": 0.00033549494351813475, + "loss": 0.5843, + "step": 35470 + }, + { + "epoch": 0.7897079772079773, + "grad_norm": 0.6622263193130493, + "learning_rate": 0.00033546064703603287, + "loss": 0.6743, + "step": 35480 + }, + { + "epoch": 0.7899305555555556, + "grad_norm": 0.7779316306114197, + "learning_rate": 0.00033542634319288855, + "loss": 0.597, + "step": 35490 + }, + { + "epoch": 0.7901531339031339, + "grad_norm": 0.49835607409477234, + "learning_rate": 0.0003353920319905658, + "loss": 0.7376, + "step": 35500 + }, + { + "epoch": 0.7903757122507122, + "grad_norm": 0.7341712713241577, + "learning_rate": 0.00033535771343092935, + "loss": 0.5066, + "step": 35510 + }, + { + "epoch": 0.7905982905982906, + "grad_norm": 0.8437432646751404, + "learning_rate": 0.0003353233875158438, + "loss": 0.702, + "step": 35520 + }, + { + "epoch": 0.7908208689458689, + "grad_norm": 0.6561760306358337, + "learning_rate": 0.00033528905424717463, + "loss": 0.5098, + "step": 35530 + }, + { + "epoch": 0.7910434472934473, + "grad_norm": 0.2741072177886963, + "learning_rate": 0.0003352547136267875, + "loss": 0.5497, + "step": 35540 + }, + { + "epoch": 0.7912660256410257, + "grad_norm": 0.6874168515205383, + "learning_rate": 0.00033522036565654845, + "loss": 0.6892, + "step": 35550 + }, + { + "epoch": 0.791488603988604, + "grad_norm": 0.8610624074935913, + "learning_rate": 0.0003351860103383241, + "loss": 0.5923, + "step": 35560 + }, + { + "epoch": 0.7917111823361823, + "grad_norm": 0.5035836696624756, + "learning_rate": 0.00033515164767398134, + "loss": 0.6147, + "step": 35570 + }, + { + "epoch": 0.7919337606837606, + "grad_norm": 0.730381429195404, + "learning_rate": 0.0003351172776653873, + "loss": 0.6442, + "step": 35580 + }, + { + "epoch": 0.7921563390313391, + "grad_norm": 0.5736960768699646, + "learning_rate": 0.00033508290031440983, + "loss": 0.523, + "step": 35590 + }, + { + "epoch": 0.7923789173789174, + "grad_norm": 0.8420835733413696, + "learning_rate": 0.00033504851562291693, + "loss": 0.7223, + "step": 35600 + }, + { + "epoch": 0.7926014957264957, + "grad_norm": 1.1751604080200195, + "learning_rate": 0.00033501412359277714, + "loss": 0.6019, + "step": 35610 + }, + { + "epoch": 0.7928240740740741, + "grad_norm": 0.6272011399269104, + "learning_rate": 0.0003349797242258594, + "loss": 0.6303, + "step": 35620 + }, + { + "epoch": 0.7930466524216524, + "grad_norm": 0.7410932183265686, + "learning_rate": 0.00033494531752403296, + "loss": 0.5262, + "step": 35630 + }, + { + "epoch": 0.7932692307692307, + "grad_norm": 0.9739224910736084, + "learning_rate": 0.0003349109034891674, + "loss": 0.7398, + "step": 35640 + }, + { + "epoch": 0.7934918091168092, + "grad_norm": 1.0735089778900146, + "learning_rate": 0.00033487648212313293, + "loss": 0.7595, + "step": 35650 + }, + { + "epoch": 0.7937143874643875, + "grad_norm": 0.7913787961006165, + "learning_rate": 0.00033484205342780007, + "loss": 0.6099, + "step": 35660 + }, + { + "epoch": 0.7939369658119658, + "grad_norm": 0.9250198006629944, + "learning_rate": 0.0003348076174050396, + "loss": 0.6167, + "step": 35670 + }, + { + "epoch": 0.7941595441595442, + "grad_norm": 0.5305607318878174, + "learning_rate": 0.0003347731740567228, + "loss": 0.5329, + "step": 35680 + }, + { + "epoch": 0.7943821225071225, + "grad_norm": 0.7727921009063721, + "learning_rate": 0.0003347387233847215, + "loss": 0.5638, + "step": 35690 + }, + { + "epoch": 0.7946047008547008, + "grad_norm": 0.7801950573921204, + "learning_rate": 0.00033470426539090756, + "loss": 0.6296, + "step": 35700 + }, + { + "epoch": 0.7948272792022792, + "grad_norm": 0.5226908326148987, + "learning_rate": 0.00033466980007715357, + "loss": 0.6034, + "step": 35710 + }, + { + "epoch": 0.7950498575498576, + "grad_norm": 0.7990700006484985, + "learning_rate": 0.00033463532744533247, + "loss": 0.6582, + "step": 35720 + }, + { + "epoch": 0.7952724358974359, + "grad_norm": 0.6833288669586182, + "learning_rate": 0.0003346008474973174, + "loss": 0.553, + "step": 35730 + }, + { + "epoch": 0.7954950142450142, + "grad_norm": 0.6693017482757568, + "learning_rate": 0.0003345663602349821, + "loss": 0.5633, + "step": 35740 + }, + { + "epoch": 0.7957175925925926, + "grad_norm": 0.8235263228416443, + "learning_rate": 0.00033453186566020064, + "loss": 0.6119, + "step": 35750 + }, + { + "epoch": 0.7959401709401709, + "grad_norm": 0.5683590769767761, + "learning_rate": 0.0003344973637748475, + "loss": 0.7155, + "step": 35760 + }, + { + "epoch": 0.7961627492877493, + "grad_norm": 1.1679894924163818, + "learning_rate": 0.0003344628545807974, + "loss": 0.4866, + "step": 35770 + }, + { + "epoch": 0.7963853276353277, + "grad_norm": 0.5607545375823975, + "learning_rate": 0.0003344283380799258, + "loss": 0.5868, + "step": 35780 + }, + { + "epoch": 0.796607905982906, + "grad_norm": 0.7906357049942017, + "learning_rate": 0.00033439381427410826, + "loss": 0.6184, + "step": 35790 + }, + { + "epoch": 0.7968304843304843, + "grad_norm": 0.5747066140174866, + "learning_rate": 0.00033435928316522077, + "loss": 0.5405, + "step": 35800 + }, + { + "epoch": 0.7970530626780626, + "grad_norm": 0.5270879864692688, + "learning_rate": 0.00033432474475513993, + "loss": 0.5811, + "step": 35810 + }, + { + "epoch": 0.7972756410256411, + "grad_norm": 0.7191861867904663, + "learning_rate": 0.0003342901990457424, + "loss": 0.6492, + "step": 35820 + }, + { + "epoch": 0.7974982193732194, + "grad_norm": 0.9543971419334412, + "learning_rate": 0.0003342556460389056, + "loss": 0.6754, + "step": 35830 + }, + { + "epoch": 0.7977207977207977, + "grad_norm": 0.5868616104125977, + "learning_rate": 0.00033422108573650703, + "loss": 0.6033, + "step": 35840 + }, + { + "epoch": 0.7979433760683761, + "grad_norm": 0.5501631498336792, + "learning_rate": 0.0003341865181404248, + "loss": 0.7264, + "step": 35850 + }, + { + "epoch": 0.7981659544159544, + "grad_norm": 0.5450555682182312, + "learning_rate": 0.0003341519432525373, + "loss": 0.5298, + "step": 35860 + }, + { + "epoch": 0.7983885327635327, + "grad_norm": 0.7831109762191772, + "learning_rate": 0.0003341173610747235, + "loss": 0.83, + "step": 35870 + }, + { + "epoch": 0.7986111111111112, + "grad_norm": 0.8205850720405579, + "learning_rate": 0.0003340827716088624, + "loss": 0.5863, + "step": 35880 + }, + { + "epoch": 0.7988336894586895, + "grad_norm": 0.6535419225692749, + "learning_rate": 0.0003340481748568337, + "loss": 0.5711, + "step": 35890 + }, + { + "epoch": 0.7990562678062678, + "grad_norm": 0.55476975440979, + "learning_rate": 0.00033401357082051746, + "loss": 0.6778, + "step": 35900 + }, + { + "epoch": 0.7992788461538461, + "grad_norm": 0.7246994972229004, + "learning_rate": 0.0003339789595017941, + "loss": 0.7777, + "step": 35910 + }, + { + "epoch": 0.7995014245014245, + "grad_norm": 0.5746579170227051, + "learning_rate": 0.0003339443409025444, + "loss": 0.6582, + "step": 35920 + }, + { + "epoch": 0.7997240028490028, + "grad_norm": 0.5451569557189941, + "learning_rate": 0.0003339097150246496, + "loss": 0.6411, + "step": 35930 + }, + { + "epoch": 0.7999465811965812, + "grad_norm": 0.6169950366020203, + "learning_rate": 0.00033387508186999117, + "loss": 0.6785, + "step": 35940 + }, + { + "epoch": 0.8001691595441596, + "grad_norm": 0.5016130805015564, + "learning_rate": 0.0003338404414404513, + "loss": 0.6404, + "step": 35950 + }, + { + "epoch": 0.8003917378917379, + "grad_norm": 0.432508647441864, + "learning_rate": 0.0003338057937379122, + "loss": 0.5448, + "step": 35960 + }, + { + "epoch": 0.8006143162393162, + "grad_norm": 0.6061640381813049, + "learning_rate": 0.00033377113876425677, + "loss": 0.6483, + "step": 35970 + }, + { + "epoch": 0.8008368945868946, + "grad_norm": 0.6601110696792603, + "learning_rate": 0.0003337364765213681, + "loss": 0.5811, + "step": 35980 + }, + { + "epoch": 0.8010594729344729, + "grad_norm": 0.6183525919914246, + "learning_rate": 0.0003337018070111299, + "loss": 0.7447, + "step": 35990 + }, + { + "epoch": 0.8012820512820513, + "grad_norm": 0.7363453507423401, + "learning_rate": 0.00033366713023542596, + "loss": 0.6509, + "step": 36000 + }, + { + "epoch": 0.8015046296296297, + "grad_norm": 0.5352987051010132, + "learning_rate": 0.00033363244619614074, + "loss": 0.6707, + "step": 36010 + }, + { + "epoch": 0.801727207977208, + "grad_norm": 0.6848888993263245, + "learning_rate": 0.00033359775489515906, + "loss": 0.685, + "step": 36020 + }, + { + "epoch": 0.8019497863247863, + "grad_norm": 0.8171915411949158, + "learning_rate": 0.000333563056334366, + "loss": 0.5364, + "step": 36030 + }, + { + "epoch": 0.8021723646723646, + "grad_norm": 0.6944659352302551, + "learning_rate": 0.0003335283505156471, + "loss": 0.6257, + "step": 36040 + }, + { + "epoch": 0.8023949430199431, + "grad_norm": 0.744377851486206, + "learning_rate": 0.00033349363744088835, + "loss": 0.7668, + "step": 36050 + }, + { + "epoch": 0.8026175213675214, + "grad_norm": 0.5722485184669495, + "learning_rate": 0.00033345891711197595, + "loss": 0.6662, + "step": 36060 + }, + { + "epoch": 0.8028400997150997, + "grad_norm": 0.7393105626106262, + "learning_rate": 0.0003334241895307969, + "loss": 0.6016, + "step": 36070 + }, + { + "epoch": 0.8030626780626781, + "grad_norm": 0.37272927165031433, + "learning_rate": 0.0003333894546992381, + "loss": 0.5908, + "step": 36080 + }, + { + "epoch": 0.8032852564102564, + "grad_norm": 0.8427095413208008, + "learning_rate": 0.0003333547126191871, + "loss": 0.4997, + "step": 36090 + }, + { + "epoch": 0.8035078347578347, + "grad_norm": 0.641094982624054, + "learning_rate": 0.00033331996329253184, + "loss": 0.7035, + "step": 36100 + }, + { + "epoch": 0.8037304131054132, + "grad_norm": 0.6963313221931458, + "learning_rate": 0.0003332852067211607, + "loss": 0.7224, + "step": 36110 + }, + { + "epoch": 0.8039529914529915, + "grad_norm": 0.5846840739250183, + "learning_rate": 0.0003332504429069623, + "loss": 0.592, + "step": 36120 + }, + { + "epoch": 0.8041755698005698, + "grad_norm": 0.67938232421875, + "learning_rate": 0.0003332156718518257, + "loss": 0.7094, + "step": 36130 + }, + { + "epoch": 0.8043981481481481, + "grad_norm": 0.8473862409591675, + "learning_rate": 0.00033318089355764046, + "loss": 0.6621, + "step": 36140 + }, + { + "epoch": 0.8046207264957265, + "grad_norm": 0.665254533290863, + "learning_rate": 0.00033314610802629644, + "loss": 0.6013, + "step": 36150 + }, + { + "epoch": 0.8048433048433048, + "grad_norm": 0.4536101818084717, + "learning_rate": 0.0003331113152596839, + "loss": 0.5538, + "step": 36160 + }, + { + "epoch": 0.8050658831908832, + "grad_norm": 0.6429980993270874, + "learning_rate": 0.00033307651525969355, + "loss": 0.7499, + "step": 36170 + }, + { + "epoch": 0.8052884615384616, + "grad_norm": 0.9357435703277588, + "learning_rate": 0.0003330417080282164, + "loss": 0.6593, + "step": 36180 + }, + { + "epoch": 0.8055110398860399, + "grad_norm": 0.6779121160507202, + "learning_rate": 0.0003330068935671439, + "loss": 0.6892, + "step": 36190 + }, + { + "epoch": 0.8057336182336182, + "grad_norm": 0.587925910949707, + "learning_rate": 0.0003329720718783679, + "loss": 0.525, + "step": 36200 + }, + { + "epoch": 0.8059561965811965, + "grad_norm": 0.9141523241996765, + "learning_rate": 0.00033293724296378077, + "loss": 0.62, + "step": 36210 + }, + { + "epoch": 0.8061787749287749, + "grad_norm": 0.7711586356163025, + "learning_rate": 0.0003329024068252749, + "loss": 0.6833, + "step": 36220 + }, + { + "epoch": 0.8064013532763533, + "grad_norm": 0.722883939743042, + "learning_rate": 0.00033286756346474354, + "loss": 0.6203, + "step": 36230 + }, + { + "epoch": 0.8066239316239316, + "grad_norm": 0.6778685450553894, + "learning_rate": 0.0003328327128840799, + "loss": 0.6965, + "step": 36240 + }, + { + "epoch": 0.80684650997151, + "grad_norm": 0.47538506984710693, + "learning_rate": 0.00033279785508517803, + "loss": 0.6006, + "step": 36250 + }, + { + "epoch": 0.8070690883190883, + "grad_norm": 0.7078222632408142, + "learning_rate": 0.000332762990069932, + "loss": 0.6305, + "step": 36260 + }, + { + "epoch": 0.8072916666666666, + "grad_norm": 1.0178117752075195, + "learning_rate": 0.00033272811784023623, + "loss": 0.7016, + "step": 36270 + }, + { + "epoch": 0.8075142450142451, + "grad_norm": 0.5266718864440918, + "learning_rate": 0.0003326932383979861, + "loss": 0.565, + "step": 36280 + }, + { + "epoch": 0.8077368233618234, + "grad_norm": 0.4685933589935303, + "learning_rate": 0.00033265835174507664, + "loss": 0.4594, + "step": 36290 + }, + { + "epoch": 0.8079594017094017, + "grad_norm": 0.7532642483711243, + "learning_rate": 0.00033262345788340376, + "loss": 0.5572, + "step": 36300 + }, + { + "epoch": 0.80818198005698, + "grad_norm": 0.7520643472671509, + "learning_rate": 0.0003325885568148636, + "loss": 0.5862, + "step": 36310 + }, + { + "epoch": 0.8084045584045584, + "grad_norm": 0.6257230043411255, + "learning_rate": 0.00033255364854135275, + "loss": 0.5583, + "step": 36320 + }, + { + "epoch": 0.8086271367521367, + "grad_norm": 0.7662017941474915, + "learning_rate": 0.00033251873306476814, + "loss": 0.5729, + "step": 36330 + }, + { + "epoch": 0.8088497150997151, + "grad_norm": 0.7460561990737915, + "learning_rate": 0.0003324838103870071, + "loss": 0.6806, + "step": 36340 + }, + { + "epoch": 0.8090722934472935, + "grad_norm": 0.9843911528587341, + "learning_rate": 0.0003324488805099673, + "loss": 0.7306, + "step": 36350 + }, + { + "epoch": 0.8092948717948718, + "grad_norm": 0.5303930640220642, + "learning_rate": 0.0003324139434355469, + "loss": 0.5246, + "step": 36360 + }, + { + "epoch": 0.8095174501424501, + "grad_norm": 0.5533699989318848, + "learning_rate": 0.0003323789991656444, + "loss": 0.6466, + "step": 36370 + }, + { + "epoch": 0.8097400284900285, + "grad_norm": 0.6484217643737793, + "learning_rate": 0.0003323440477021587, + "loss": 0.5728, + "step": 36380 + }, + { + "epoch": 0.8099626068376068, + "grad_norm": 0.6987613439559937, + "learning_rate": 0.0003323090890469892, + "loss": 0.5621, + "step": 36390 + }, + { + "epoch": 0.8101851851851852, + "grad_norm": 0.9706330299377441, + "learning_rate": 0.00033227412320203546, + "loss": 0.655, + "step": 36400 + }, + { + "epoch": 0.8104077635327636, + "grad_norm": 0.5594670176506042, + "learning_rate": 0.00033223915016919757, + "loss": 0.7187, + "step": 36410 + }, + { + "epoch": 0.8106303418803419, + "grad_norm": 0.5286457538604736, + "learning_rate": 0.00033220416995037604, + "loss": 0.5207, + "step": 36420 + }, + { + "epoch": 0.8108529202279202, + "grad_norm": 0.49551501870155334, + "learning_rate": 0.0003321691825474716, + "loss": 0.4212, + "step": 36430 + }, + { + "epoch": 0.8110754985754985, + "grad_norm": 0.6061522364616394, + "learning_rate": 0.00033213418796238566, + "loss": 0.5597, + "step": 36440 + }, + { + "epoch": 0.8112980769230769, + "grad_norm": 0.9590118527412415, + "learning_rate": 0.0003320991861970197, + "loss": 0.6513, + "step": 36450 + }, + { + "epoch": 0.8115206552706553, + "grad_norm": 0.7732698321342468, + "learning_rate": 0.0003320641772532759, + "loss": 0.6888, + "step": 36460 + }, + { + "epoch": 0.8117432336182336, + "grad_norm": 0.7111770510673523, + "learning_rate": 0.00033202916113305657, + "loss": 0.6541, + "step": 36470 + }, + { + "epoch": 0.811965811965812, + "grad_norm": 0.6621142625808716, + "learning_rate": 0.0003319941378382645, + "loss": 0.6328, + "step": 36480 + }, + { + "epoch": 0.8121883903133903, + "grad_norm": 0.5446364283561707, + "learning_rate": 0.00033195910737080295, + "loss": 0.5501, + "step": 36490 + }, + { + "epoch": 0.8124109686609686, + "grad_norm": 0.7416331768035889, + "learning_rate": 0.00033192406973257555, + "loss": 0.6579, + "step": 36500 + }, + { + "epoch": 0.8126335470085471, + "grad_norm": 0.7212896943092346, + "learning_rate": 0.0003318890249254861, + "loss": 0.7042, + "step": 36510 + }, + { + "epoch": 0.8128561253561254, + "grad_norm": 0.5195409655570984, + "learning_rate": 0.0003318539729514391, + "loss": 0.6703, + "step": 36520 + }, + { + "epoch": 0.8130787037037037, + "grad_norm": 0.572192370891571, + "learning_rate": 0.0003318189138123392, + "loss": 0.5075, + "step": 36530 + }, + { + "epoch": 0.813301282051282, + "grad_norm": 0.4907376766204834, + "learning_rate": 0.0003317838475100918, + "loss": 0.6511, + "step": 36540 + }, + { + "epoch": 0.8135238603988604, + "grad_norm": 0.7664365768432617, + "learning_rate": 0.0003317487740466021, + "loss": 0.4856, + "step": 36550 + }, + { + "epoch": 0.8137464387464387, + "grad_norm": 0.5438021421432495, + "learning_rate": 0.00033171369342377616, + "loss": 0.6068, + "step": 36560 + }, + { + "epoch": 0.8139690170940171, + "grad_norm": 0.5583726763725281, + "learning_rate": 0.00033167860564352027, + "loss": 0.6175, + "step": 36570 + }, + { + "epoch": 0.8141915954415955, + "grad_norm": 0.5016492009162903, + "learning_rate": 0.00033164351070774124, + "loss": 0.6092, + "step": 36580 + }, + { + "epoch": 0.8144141737891738, + "grad_norm": 1.1724473237991333, + "learning_rate": 0.0003316084086183461, + "loss": 0.5914, + "step": 36590 + }, + { + "epoch": 0.8146367521367521, + "grad_norm": 0.5008374452590942, + "learning_rate": 0.00033157329937724217, + "loss": 0.6082, + "step": 36600 + }, + { + "epoch": 0.8148593304843305, + "grad_norm": 0.4436315596103668, + "learning_rate": 0.0003315381829863375, + "loss": 0.6483, + "step": 36610 + }, + { + "epoch": 0.8150819088319088, + "grad_norm": 1.088879942893982, + "learning_rate": 0.0003315030594475403, + "loss": 0.7563, + "step": 36620 + }, + { + "epoch": 0.8153044871794872, + "grad_norm": 0.5001720190048218, + "learning_rate": 0.00033146792876275914, + "loss": 0.5567, + "step": 36630 + }, + { + "epoch": 0.8155270655270656, + "grad_norm": 0.7010923624038696, + "learning_rate": 0.00033143279093390316, + "loss": 0.6492, + "step": 36640 + }, + { + "epoch": 0.8157496438746439, + "grad_norm": 0.7686439752578735, + "learning_rate": 0.0003313976459628817, + "loss": 0.5096, + "step": 36650 + }, + { + "epoch": 0.8159722222222222, + "grad_norm": 1.0666576623916626, + "learning_rate": 0.0003313624938516046, + "loss": 0.5674, + "step": 36660 + }, + { + "epoch": 0.8161948005698005, + "grad_norm": 0.8343584537506104, + "learning_rate": 0.0003313273346019821, + "loss": 0.6541, + "step": 36670 + }, + { + "epoch": 0.8164173789173789, + "grad_norm": 0.48941463232040405, + "learning_rate": 0.00033129216821592465, + "loss": 0.5769, + "step": 36680 + }, + { + "epoch": 0.8166399572649573, + "grad_norm": 0.5967426300048828, + "learning_rate": 0.00033125699469534333, + "loss": 0.6193, + "step": 36690 + }, + { + "epoch": 0.8168625356125356, + "grad_norm": 0.6565297245979309, + "learning_rate": 0.0003312218140421495, + "loss": 0.5518, + "step": 36700 + }, + { + "epoch": 0.817085113960114, + "grad_norm": 0.7968794107437134, + "learning_rate": 0.0003311866262582548, + "loss": 0.564, + "step": 36710 + }, + { + "epoch": 0.8173076923076923, + "grad_norm": 0.8082167506217957, + "learning_rate": 0.00033115143134557147, + "loss": 0.5872, + "step": 36720 + }, + { + "epoch": 0.8175302706552706, + "grad_norm": 0.5784871578216553, + "learning_rate": 0.00033111622930601196, + "loss": 0.595, + "step": 36730 + }, + { + "epoch": 0.8177528490028491, + "grad_norm": 0.8299255967140198, + "learning_rate": 0.0003310810201414893, + "loss": 0.5521, + "step": 36740 + }, + { + "epoch": 0.8179754273504274, + "grad_norm": 0.6350299715995789, + "learning_rate": 0.0003310458038539166, + "loss": 0.6748, + "step": 36750 + }, + { + "epoch": 0.8181980056980057, + "grad_norm": 0.5850403308868408, + "learning_rate": 0.0003310105804452077, + "loss": 0.6192, + "step": 36760 + }, + { + "epoch": 0.818420584045584, + "grad_norm": 0.6726582050323486, + "learning_rate": 0.0003309753499172766, + "loss": 0.6385, + "step": 36770 + }, + { + "epoch": 0.8186431623931624, + "grad_norm": 0.49928078055381775, + "learning_rate": 0.00033094011227203775, + "loss": 0.5801, + "step": 36780 + }, + { + "epoch": 0.8188657407407407, + "grad_norm": 0.5119436979293823, + "learning_rate": 0.00033090486751140606, + "loss": 0.7312, + "step": 36790 + }, + { + "epoch": 0.8190883190883191, + "grad_norm": 0.48218220472335815, + "learning_rate": 0.00033086961563729664, + "loss": 0.4542, + "step": 36800 + }, + { + "epoch": 0.8193108974358975, + "grad_norm": 0.7016624212265015, + "learning_rate": 0.00033083435665162524, + "loss": 0.6112, + "step": 36810 + }, + { + "epoch": 0.8195334757834758, + "grad_norm": 0.5343238115310669, + "learning_rate": 0.0003307990905563077, + "loss": 0.4985, + "step": 36820 + }, + { + "epoch": 0.8197560541310541, + "grad_norm": 1.0482207536697388, + "learning_rate": 0.0003307638173532605, + "loss": 0.4824, + "step": 36830 + }, + { + "epoch": 0.8199786324786325, + "grad_norm": 0.6126405596733093, + "learning_rate": 0.00033072853704440046, + "loss": 0.5322, + "step": 36840 + }, + { + "epoch": 0.8202012108262108, + "grad_norm": 0.8611001372337341, + "learning_rate": 0.00033069324963164474, + "loss": 0.7653, + "step": 36850 + }, + { + "epoch": 0.8204237891737892, + "grad_norm": 0.5939532518386841, + "learning_rate": 0.0003306579551169108, + "loss": 0.6571, + "step": 36860 + }, + { + "epoch": 0.8206463675213675, + "grad_norm": 1.0668377876281738, + "learning_rate": 0.0003306226535021166, + "loss": 0.7, + "step": 36870 + }, + { + "epoch": 0.8208689458689459, + "grad_norm": 0.8537008762359619, + "learning_rate": 0.0003305873447891806, + "loss": 0.8041, + "step": 36880 + }, + { + "epoch": 0.8210915242165242, + "grad_norm": 0.8350732922554016, + "learning_rate": 0.0003305520289800212, + "loss": 0.5502, + "step": 36890 + }, + { + "epoch": 0.8213141025641025, + "grad_norm": 0.7095655202865601, + "learning_rate": 0.0003305167060765578, + "loss": 0.5797, + "step": 36900 + }, + { + "epoch": 0.8215366809116809, + "grad_norm": 0.607930600643158, + "learning_rate": 0.0003304813760807097, + "loss": 0.6496, + "step": 36910 + }, + { + "epoch": 0.8217592592592593, + "grad_norm": 0.7336176037788391, + "learning_rate": 0.00033044603899439677, + "loss": 0.5963, + "step": 36920 + }, + { + "epoch": 0.8219818376068376, + "grad_norm": 0.4098057150840759, + "learning_rate": 0.00033041069481953936, + "loss": 0.5506, + "step": 36930 + }, + { + "epoch": 0.822204415954416, + "grad_norm": 0.8852785229682922, + "learning_rate": 0.0003303753435580579, + "loss": 0.5608, + "step": 36940 + }, + { + "epoch": 0.8224269943019943, + "grad_norm": 0.6913530826568604, + "learning_rate": 0.00033033998521187375, + "loss": 0.494, + "step": 36950 + }, + { + "epoch": 0.8226495726495726, + "grad_norm": 0.8665242195129395, + "learning_rate": 0.0003303046197829079, + "loss": 0.5654, + "step": 36960 + }, + { + "epoch": 0.8228721509971509, + "grad_norm": 0.41242697834968567, + "learning_rate": 0.0003302692472730825, + "loss": 0.5263, + "step": 36970 + }, + { + "epoch": 0.8230947293447294, + "grad_norm": 0.4868178367614746, + "learning_rate": 0.00033023386768431945, + "loss": 0.7242, + "step": 36980 + }, + { + "epoch": 0.8233173076923077, + "grad_norm": 0.8877532482147217, + "learning_rate": 0.00033019848101854143, + "loss": 0.6203, + "step": 36990 + }, + { + "epoch": 0.823539886039886, + "grad_norm": 0.5433663725852966, + "learning_rate": 0.00033016308727767143, + "loss": 0.5508, + "step": 37000 + }, + { + "epoch": 0.8237624643874644, + "grad_norm": 0.82059645652771, + "learning_rate": 0.0003301276864636327, + "loss": 0.6424, + "step": 37010 + }, + { + "epoch": 0.8239850427350427, + "grad_norm": 0.4982110857963562, + "learning_rate": 0.0003300922785783489, + "loss": 0.592, + "step": 37020 + }, + { + "epoch": 0.8242076210826211, + "grad_norm": 0.6042728424072266, + "learning_rate": 0.0003300568636237442, + "loss": 0.6052, + "step": 37030 + }, + { + "epoch": 0.8244301994301995, + "grad_norm": 0.6098039746284485, + "learning_rate": 0.0003300214416017431, + "loss": 0.7938, + "step": 37040 + }, + { + "epoch": 0.8246527777777778, + "grad_norm": 0.8635123372077942, + "learning_rate": 0.00032998601251427043, + "loss": 0.7422, + "step": 37050 + }, + { + "epoch": 0.8248753561253561, + "grad_norm": 0.842786967754364, + "learning_rate": 0.00032995057636325137, + "loss": 0.5865, + "step": 37060 + }, + { + "epoch": 0.8250979344729344, + "grad_norm": 0.703381359577179, + "learning_rate": 0.00032991513315061165, + "loss": 0.5967, + "step": 37070 + }, + { + "epoch": 0.8253205128205128, + "grad_norm": 0.652272641658783, + "learning_rate": 0.00032987968287827724, + "loss": 0.5933, + "step": 37080 + }, + { + "epoch": 0.8255430911680912, + "grad_norm": 0.467128187417984, + "learning_rate": 0.00032984422554817447, + "loss": 0.5171, + "step": 37090 + }, + { + "epoch": 0.8257656695156695, + "grad_norm": 0.7063168287277222, + "learning_rate": 0.0003298087611622303, + "loss": 0.5875, + "step": 37100 + }, + { + "epoch": 0.8259882478632479, + "grad_norm": 0.3760249614715576, + "learning_rate": 0.0003297732897223717, + "loss": 0.5645, + "step": 37110 + }, + { + "epoch": 0.8262108262108262, + "grad_norm": 0.796840488910675, + "learning_rate": 0.0003297378112305263, + "loss": 0.6644, + "step": 37120 + }, + { + "epoch": 0.8264334045584045, + "grad_norm": 0.6814299821853638, + "learning_rate": 0.0003297023256886221, + "loss": 0.5546, + "step": 37130 + }, + { + "epoch": 0.8266559829059829, + "grad_norm": 0.7078778147697449, + "learning_rate": 0.0003296668330985873, + "loss": 0.582, + "step": 37140 + }, + { + "epoch": 0.8268785612535613, + "grad_norm": 0.5444463491439819, + "learning_rate": 0.00032963133346235066, + "loss": 0.4931, + "step": 37150 + }, + { + "epoch": 0.8271011396011396, + "grad_norm": 0.7480208873748779, + "learning_rate": 0.0003295958267818412, + "loss": 0.6201, + "step": 37160 + }, + { + "epoch": 0.827323717948718, + "grad_norm": 0.6348940134048462, + "learning_rate": 0.00032956031305898836, + "loss": 0.5632, + "step": 37170 + }, + { + "epoch": 0.8275462962962963, + "grad_norm": 0.43813246488571167, + "learning_rate": 0.0003295247922957222, + "loss": 0.6038, + "step": 37180 + }, + { + "epoch": 0.8277688746438746, + "grad_norm": 0.6713902950286865, + "learning_rate": 0.00032948926449397265, + "loss": 0.5168, + "step": 37190 + }, + { + "epoch": 0.8279914529914529, + "grad_norm": 0.7052330374717712, + "learning_rate": 0.0003294537296556706, + "loss": 0.5426, + "step": 37200 + }, + { + "epoch": 0.8282140313390314, + "grad_norm": 0.6007211804389954, + "learning_rate": 0.00032941818778274676, + "loss": 0.5749, + "step": 37210 + }, + { + "epoch": 0.8284366096866097, + "grad_norm": 0.5694396495819092, + "learning_rate": 0.00032938263887713275, + "loss": 0.4963, + "step": 37220 + }, + { + "epoch": 0.828659188034188, + "grad_norm": 0.8777496218681335, + "learning_rate": 0.0003293470829407602, + "loss": 0.6211, + "step": 37230 + }, + { + "epoch": 0.8288817663817664, + "grad_norm": 0.8059148788452148, + "learning_rate": 0.00032931151997556124, + "loss": 0.6515, + "step": 37240 + }, + { + "epoch": 0.8291043447293447, + "grad_norm": 0.6097303628921509, + "learning_rate": 0.00032927594998346854, + "loss": 0.6599, + "step": 37250 + }, + { + "epoch": 0.8293269230769231, + "grad_norm": 0.8829268217086792, + "learning_rate": 0.00032924037296641476, + "loss": 0.619, + "step": 37260 + }, + { + "epoch": 0.8295495014245015, + "grad_norm": 0.7852076292037964, + "learning_rate": 0.0003292047889263334, + "loss": 0.6172, + "step": 37270 + }, + { + "epoch": 0.8297720797720798, + "grad_norm": 0.6745901703834534, + "learning_rate": 0.00032916919786515794, + "loss": 0.5426, + "step": 37280 + }, + { + "epoch": 0.8299946581196581, + "grad_norm": 0.5950226783752441, + "learning_rate": 0.00032913359978482256, + "loss": 0.6676, + "step": 37290 + }, + { + "epoch": 0.8302172364672364, + "grad_norm": 0.5079329609870911, + "learning_rate": 0.0003290979946872617, + "loss": 0.5057, + "step": 37300 + }, + { + "epoch": 0.8304398148148148, + "grad_norm": 0.5150577425956726, + "learning_rate": 0.0003290623825744101, + "loss": 0.4958, + "step": 37310 + }, + { + "epoch": 0.8306623931623932, + "grad_norm": 0.6359812617301941, + "learning_rate": 0.000329026763448203, + "loss": 0.5448, + "step": 37320 + }, + { + "epoch": 0.8308849715099715, + "grad_norm": 0.9012143611907959, + "learning_rate": 0.0003289911373105759, + "loss": 0.5756, + "step": 37330 + }, + { + "epoch": 0.8311075498575499, + "grad_norm": 0.5244525074958801, + "learning_rate": 0.00032895550416346485, + "loss": 0.5128, + "step": 37340 + }, + { + "epoch": 0.8313301282051282, + "grad_norm": 0.5555477142333984, + "learning_rate": 0.00032891986400880607, + "loss": 0.6683, + "step": 37350 + }, + { + "epoch": 0.8315527065527065, + "grad_norm": 0.5338780879974365, + "learning_rate": 0.0003288842168485364, + "loss": 0.582, + "step": 37360 + }, + { + "epoch": 0.8317752849002849, + "grad_norm": 0.5661174654960632, + "learning_rate": 0.00032884856268459284, + "loss": 0.5865, + "step": 37370 + }, + { + "epoch": 0.8319978632478633, + "grad_norm": 0.7979121804237366, + "learning_rate": 0.0003288129015189129, + "loss": 0.5493, + "step": 37380 + }, + { + "epoch": 0.8322204415954416, + "grad_norm": 0.6640493869781494, + "learning_rate": 0.0003287772333534345, + "loss": 0.5727, + "step": 37390 + }, + { + "epoch": 0.83244301994302, + "grad_norm": 0.6453659534454346, + "learning_rate": 0.0003287415581900958, + "loss": 0.5334, + "step": 37400 + }, + { + "epoch": 0.8326655982905983, + "grad_norm": 0.8953363299369812, + "learning_rate": 0.0003287058760308354, + "loss": 0.8172, + "step": 37410 + }, + { + "epoch": 0.8328881766381766, + "grad_norm": 0.5227009654045105, + "learning_rate": 0.0003286701868775923, + "loss": 0.6285, + "step": 37420 + }, + { + "epoch": 0.8331107549857549, + "grad_norm": 0.6829435229301453, + "learning_rate": 0.000328634490732306, + "loss": 0.4957, + "step": 37430 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.6302551627159119, + "learning_rate": 0.0003285987875969161, + "loss": 0.5354, + "step": 37440 + }, + { + "epoch": 0.8335559116809117, + "grad_norm": 0.7687264680862427, + "learning_rate": 0.00032856307747336277, + "loss": 0.6332, + "step": 37450 + }, + { + "epoch": 0.83377849002849, + "grad_norm": 0.7602887749671936, + "learning_rate": 0.0003285273603635867, + "loss": 0.5534, + "step": 37460 + }, + { + "epoch": 0.8340010683760684, + "grad_norm": 0.7089099884033203, + "learning_rate": 0.00032849163626952853, + "loss": 0.4901, + "step": 37470 + }, + { + "epoch": 0.8342236467236467, + "grad_norm": 0.5121763944625854, + "learning_rate": 0.0003284559051931297, + "loss": 0.5413, + "step": 37480 + }, + { + "epoch": 0.8344462250712251, + "grad_norm": 0.38285666704177856, + "learning_rate": 0.00032842016713633185, + "loss": 0.5369, + "step": 37490 + }, + { + "epoch": 0.8346688034188035, + "grad_norm": 0.7840296030044556, + "learning_rate": 0.00032838442210107694, + "loss": 0.4726, + "step": 37500 + }, + { + "epoch": 0.8348913817663818, + "grad_norm": 0.8556367754936218, + "learning_rate": 0.00032834867008930745, + "loss": 0.5818, + "step": 37510 + }, + { + "epoch": 0.8351139601139601, + "grad_norm": 0.5417022109031677, + "learning_rate": 0.0003283129111029662, + "loss": 0.5866, + "step": 37520 + }, + { + "epoch": 0.8353365384615384, + "grad_norm": 0.7985814809799194, + "learning_rate": 0.0003282771451439963, + "loss": 0.7183, + "step": 37530 + }, + { + "epoch": 0.8355591168091168, + "grad_norm": 0.6315109133720398, + "learning_rate": 0.0003282413722143413, + "loss": 0.6315, + "step": 37540 + }, + { + "epoch": 0.8357816951566952, + "grad_norm": 0.45857003331184387, + "learning_rate": 0.00032820559231594513, + "loss": 0.5245, + "step": 37550 + }, + { + "epoch": 0.8360042735042735, + "grad_norm": 0.6412796378135681, + "learning_rate": 0.00032816980545075216, + "loss": 0.6218, + "step": 37560 + }, + { + "epoch": 0.8362268518518519, + "grad_norm": 0.5888071060180664, + "learning_rate": 0.000328134011620707, + "loss": 0.57, + "step": 37570 + }, + { + "epoch": 0.8364494301994302, + "grad_norm": 0.5844222903251648, + "learning_rate": 0.0003280982108277548, + "loss": 0.6996, + "step": 37580 + }, + { + "epoch": 0.8366720085470085, + "grad_norm": 0.6059329509735107, + "learning_rate": 0.0003280624030738409, + "loss": 0.5442, + "step": 37590 + }, + { + "epoch": 0.8368945868945868, + "grad_norm": 0.38470956683158875, + "learning_rate": 0.00032802658836091124, + "loss": 0.5368, + "step": 37600 + }, + { + "epoch": 0.8371171652421653, + "grad_norm": 0.8118173480033875, + "learning_rate": 0.0003279907666909119, + "loss": 0.6644, + "step": 37610 + }, + { + "epoch": 0.8373397435897436, + "grad_norm": 0.5427059531211853, + "learning_rate": 0.0003279549380657896, + "loss": 0.5515, + "step": 37620 + }, + { + "epoch": 0.8375623219373219, + "grad_norm": 0.7684349417686462, + "learning_rate": 0.0003279191024874911, + "loss": 0.7565, + "step": 37630 + }, + { + "epoch": 0.8377849002849003, + "grad_norm": 0.8485226631164551, + "learning_rate": 0.00032788325995796396, + "loss": 0.6957, + "step": 37640 + }, + { + "epoch": 0.8380074786324786, + "grad_norm": 1.1634833812713623, + "learning_rate": 0.0003278474104791557, + "loss": 0.6749, + "step": 37650 + }, + { + "epoch": 0.8382300569800569, + "grad_norm": 0.7527285814285278, + "learning_rate": 0.0003278115540530146, + "loss": 0.5256, + "step": 37660 + }, + { + "epoch": 0.8384526353276354, + "grad_norm": 0.46826934814453125, + "learning_rate": 0.00032777569068148893, + "loss": 0.6188, + "step": 37670 + }, + { + "epoch": 0.8386752136752137, + "grad_norm": 0.6066411137580872, + "learning_rate": 0.00032773982036652765, + "loss": 0.6407, + "step": 37680 + }, + { + "epoch": 0.838897792022792, + "grad_norm": 0.8002700805664062, + "learning_rate": 0.00032770394311007993, + "loss": 0.6213, + "step": 37690 + }, + { + "epoch": 0.8391203703703703, + "grad_norm": 0.5692296028137207, + "learning_rate": 0.0003276680589140955, + "loss": 0.7038, + "step": 37700 + }, + { + "epoch": 0.8393429487179487, + "grad_norm": 0.5736252665519714, + "learning_rate": 0.0003276321677805241, + "loss": 0.531, + "step": 37710 + }, + { + "epoch": 0.8395655270655271, + "grad_norm": 0.6136668920516968, + "learning_rate": 0.0003275962697113163, + "loss": 0.5928, + "step": 37720 + }, + { + "epoch": 0.8397881054131054, + "grad_norm": 0.8906228542327881, + "learning_rate": 0.00032756036470842277, + "loss": 0.6129, + "step": 37730 + }, + { + "epoch": 0.8400106837606838, + "grad_norm": 0.7700905203819275, + "learning_rate": 0.0003275244527737945, + "loss": 0.5606, + "step": 37740 + }, + { + "epoch": 0.8400997150997151, + "eval_loss": 0.6122514605522156, + "eval_runtime": 337.0919, + "eval_samples_per_second": 7.016, + "eval_steps_per_second": 7.016, + "step": 37744 + }, + { + "epoch": 0.8402332621082621, + "grad_norm": 0.7092208862304688, + "learning_rate": 0.00032748853390938314, + "loss": 0.573, + "step": 37750 + }, + { + "epoch": 0.8404558404558404, + "grad_norm": 0.6493269801139832, + "learning_rate": 0.00032745260811714046, + "loss": 0.5398, + "step": 37760 + }, + { + "epoch": 0.8406784188034188, + "grad_norm": 0.7034374475479126, + "learning_rate": 0.00032741667539901875, + "loss": 0.593, + "step": 37770 + }, + { + "epoch": 0.8409009971509972, + "grad_norm": 0.7925833463668823, + "learning_rate": 0.00032738073575697054, + "loss": 0.5922, + "step": 37780 + }, + { + "epoch": 0.8411235754985755, + "grad_norm": 0.7417179942131042, + "learning_rate": 0.000327344789192949, + "loss": 0.527, + "step": 37790 + }, + { + "epoch": 0.8413461538461539, + "grad_norm": 0.5861806869506836, + "learning_rate": 0.0003273088357089072, + "loss": 0.6919, + "step": 37800 + }, + { + "epoch": 0.8415687321937322, + "grad_norm": 0.7780978083610535, + "learning_rate": 0.00032727287530679914, + "loss": 0.6734, + "step": 37810 + }, + { + "epoch": 0.8417913105413105, + "grad_norm": 0.6386851072311401, + "learning_rate": 0.00032723690798857876, + "loss": 0.6519, + "step": 37820 + }, + { + "epoch": 0.8420138888888888, + "grad_norm": 0.7099935412406921, + "learning_rate": 0.00032720093375620065, + "loss": 0.5304, + "step": 37830 + }, + { + "epoch": 0.8422364672364673, + "grad_norm": 0.6465808153152466, + "learning_rate": 0.0003271649526116198, + "loss": 0.6308, + "step": 37840 + }, + { + "epoch": 0.8424590455840456, + "grad_norm": 0.4721263349056244, + "learning_rate": 0.00032712896455679125, + "loss": 0.6353, + "step": 37850 + }, + { + "epoch": 0.8426816239316239, + "grad_norm": 0.46420302987098694, + "learning_rate": 0.0003270929695936706, + "loss": 0.6307, + "step": 37860 + }, + { + "epoch": 0.8429042022792023, + "grad_norm": 0.8225671052932739, + "learning_rate": 0.00032705696772421407, + "loss": 0.6044, + "step": 37870 + }, + { + "epoch": 0.8431267806267806, + "grad_norm": 0.6508604288101196, + "learning_rate": 0.00032702095895037784, + "loss": 0.6072, + "step": 37880 + }, + { + "epoch": 0.8433493589743589, + "grad_norm": 0.47768697142601013, + "learning_rate": 0.0003269849432741187, + "loss": 0.5711, + "step": 37890 + }, + { + "epoch": 0.8435719373219374, + "grad_norm": 0.7702714204788208, + "learning_rate": 0.00032694892069739384, + "loss": 0.739, + "step": 37900 + }, + { + "epoch": 0.8437945156695157, + "grad_norm": 0.6471502780914307, + "learning_rate": 0.0003269128912221607, + "loss": 0.5683, + "step": 37910 + }, + { + "epoch": 0.844017094017094, + "grad_norm": 0.5908994078636169, + "learning_rate": 0.0003268768548503771, + "loss": 0.64, + "step": 37920 + }, + { + "epoch": 0.8442396723646723, + "grad_norm": 0.5731729865074158, + "learning_rate": 0.00032684081158400135, + "loss": 0.6329, + "step": 37930 + }, + { + "epoch": 0.8444622507122507, + "grad_norm": 0.4691145718097687, + "learning_rate": 0.0003268047614249921, + "loss": 0.5713, + "step": 37940 + }, + { + "epoch": 0.8446848290598291, + "grad_norm": 0.5721814036369324, + "learning_rate": 0.0003267687043753083, + "loss": 0.5048, + "step": 37950 + }, + { + "epoch": 0.8449074074074074, + "grad_norm": 1.1615492105484009, + "learning_rate": 0.0003267326404369093, + "loss": 0.8903, + "step": 37960 + }, + { + "epoch": 0.8451299857549858, + "grad_norm": 0.6510785818099976, + "learning_rate": 0.0003266965696117549, + "loss": 0.6599, + "step": 37970 + }, + { + "epoch": 0.8453525641025641, + "grad_norm": 0.7093966007232666, + "learning_rate": 0.0003266604919018052, + "loss": 0.6341, + "step": 37980 + }, + { + "epoch": 0.8455751424501424, + "grad_norm": 0.8076547980308533, + "learning_rate": 0.0003266244073090206, + "loss": 0.505, + "step": 37990 + }, + { + "epoch": 0.8457977207977208, + "grad_norm": 1.5181430578231812, + "learning_rate": 0.00032658831583536215, + "loss": 0.6225, + "step": 38000 + }, + { + "epoch": 0.8460202991452992, + "grad_norm": 1.261081576347351, + "learning_rate": 0.00032655221748279097, + "loss": 0.6192, + "step": 38010 + }, + { + "epoch": 0.8462428774928775, + "grad_norm": 0.7477047443389893, + "learning_rate": 0.00032651611225326864, + "loss": 0.5815, + "step": 38020 + }, + { + "epoch": 0.8464654558404558, + "grad_norm": 0.582606315612793, + "learning_rate": 0.00032648000014875723, + "loss": 0.5552, + "step": 38030 + }, + { + "epoch": 0.8466880341880342, + "grad_norm": 0.5662571787834167, + "learning_rate": 0.00032644388117121905, + "loss": 0.6245, + "step": 38040 + }, + { + "epoch": 0.8469106125356125, + "grad_norm": 0.9054650664329529, + "learning_rate": 0.0003264077553226169, + "loss": 0.6222, + "step": 38050 + }, + { + "epoch": 0.8471331908831908, + "grad_norm": 1.1601990461349487, + "learning_rate": 0.00032637162260491386, + "loss": 0.6315, + "step": 38060 + }, + { + "epoch": 0.8473557692307693, + "grad_norm": 0.6006155610084534, + "learning_rate": 0.0003263354830200733, + "loss": 0.7051, + "step": 38070 + }, + { + "epoch": 0.8475783475783476, + "grad_norm": 0.7477725148200989, + "learning_rate": 0.0003262993365700592, + "loss": 0.5918, + "step": 38080 + }, + { + "epoch": 0.8478009259259259, + "grad_norm": 0.5139760971069336, + "learning_rate": 0.0003262631832568358, + "loss": 0.6576, + "step": 38090 + }, + { + "epoch": 0.8480235042735043, + "grad_norm": 0.5229126214981079, + "learning_rate": 0.00032622702308236757, + "loss": 0.4205, + "step": 38100 + }, + { + "epoch": 0.8482460826210826, + "grad_norm": 0.6959442496299744, + "learning_rate": 0.0003261908560486197, + "loss": 0.6536, + "step": 38110 + }, + { + "epoch": 0.8484686609686609, + "grad_norm": 0.5733725428581238, + "learning_rate": 0.0003261546821575573, + "loss": 0.5371, + "step": 38120 + }, + { + "epoch": 0.8486912393162394, + "grad_norm": 0.4880182445049286, + "learning_rate": 0.00032611850141114624, + "loss": 0.4979, + "step": 38130 + }, + { + "epoch": 0.8489138176638177, + "grad_norm": 0.7861484289169312, + "learning_rate": 0.0003260823138113526, + "loss": 0.5845, + "step": 38140 + }, + { + "epoch": 0.849136396011396, + "grad_norm": 0.8785387277603149, + "learning_rate": 0.0003260461193601428, + "loss": 0.769, + "step": 38150 + }, + { + "epoch": 0.8493589743589743, + "grad_norm": 0.9648477435112, + "learning_rate": 0.0003260099180594836, + "loss": 0.7094, + "step": 38160 + }, + { + "epoch": 0.8495815527065527, + "grad_norm": 0.475644052028656, + "learning_rate": 0.00032597370991134235, + "loss": 0.5446, + "step": 38170 + }, + { + "epoch": 0.8498041310541311, + "grad_norm": 0.6651561260223389, + "learning_rate": 0.00032593749491768663, + "loss": 0.5502, + "step": 38180 + }, + { + "epoch": 0.8500267094017094, + "grad_norm": 0.6330234408378601, + "learning_rate": 0.0003259012730804843, + "loss": 0.6349, + "step": 38190 + }, + { + "epoch": 0.8502492877492878, + "grad_norm": 0.7780224084854126, + "learning_rate": 0.0003258650444017037, + "loss": 0.7218, + "step": 38200 + }, + { + "epoch": 0.8504718660968661, + "grad_norm": 0.6567363142967224, + "learning_rate": 0.0003258288088833136, + "loss": 0.5092, + "step": 38210 + }, + { + "epoch": 0.8506944444444444, + "grad_norm": 0.7575187683105469, + "learning_rate": 0.000325792566527283, + "loss": 0.4875, + "step": 38220 + }, + { + "epoch": 0.8509170227920227, + "grad_norm": 0.6900361776351929, + "learning_rate": 0.00032575631733558133, + "loss": 0.6574, + "step": 38230 + }, + { + "epoch": 0.8511396011396012, + "grad_norm": 1.2023710012435913, + "learning_rate": 0.00032572006131017844, + "loss": 0.5447, + "step": 38240 + }, + { + "epoch": 0.8513621794871795, + "grad_norm": 0.594343900680542, + "learning_rate": 0.00032568379845304446, + "loss": 0.4553, + "step": 38250 + }, + { + "epoch": 0.8515847578347578, + "grad_norm": 0.7324010133743286, + "learning_rate": 0.00032564752876615004, + "loss": 0.5672, + "step": 38260 + }, + { + "epoch": 0.8518073361823362, + "grad_norm": 0.6826527118682861, + "learning_rate": 0.00032561125225146604, + "loss": 0.6463, + "step": 38270 + }, + { + "epoch": 0.8520299145299145, + "grad_norm": 0.8564441204071045, + "learning_rate": 0.00032557496891096375, + "loss": 0.5814, + "step": 38280 + }, + { + "epoch": 0.8522524928774928, + "grad_norm": 0.6812259554862976, + "learning_rate": 0.00032553867874661485, + "loss": 0.6826, + "step": 38290 + }, + { + "epoch": 0.8524750712250713, + "grad_norm": 0.7946615219116211, + "learning_rate": 0.0003255023817603914, + "loss": 0.6607, + "step": 38300 + }, + { + "epoch": 0.8526976495726496, + "grad_norm": 0.5488380193710327, + "learning_rate": 0.00032546607795426577, + "loss": 0.5731, + "step": 38310 + }, + { + "epoch": 0.8529202279202279, + "grad_norm": 0.7387109398841858, + "learning_rate": 0.0003254297673302108, + "loss": 0.6652, + "step": 38320 + }, + { + "epoch": 0.8531428062678063, + "grad_norm": 1.0683709383010864, + "learning_rate": 0.00032539344989019947, + "loss": 0.5992, + "step": 38330 + }, + { + "epoch": 0.8533653846153846, + "grad_norm": 0.5943742990493774, + "learning_rate": 0.0003253571256362055, + "loss": 0.6335, + "step": 38340 + }, + { + "epoch": 0.8535879629629629, + "grad_norm": 0.7003427743911743, + "learning_rate": 0.0003253207945702027, + "loss": 0.5376, + "step": 38350 + }, + { + "epoch": 0.8538105413105413, + "grad_norm": 0.6439740657806396, + "learning_rate": 0.00032528445669416524, + "loss": 0.6638, + "step": 38360 + }, + { + "epoch": 0.8540331196581197, + "grad_norm": 0.45052602887153625, + "learning_rate": 0.000325248112010068, + "loss": 0.6284, + "step": 38370 + }, + { + "epoch": 0.854255698005698, + "grad_norm": 0.8613705635070801, + "learning_rate": 0.00032521176051988573, + "loss": 0.5812, + "step": 38380 + }, + { + "epoch": 0.8544782763532763, + "grad_norm": 0.9319939613342285, + "learning_rate": 0.0003251754022255939, + "loss": 0.6173, + "step": 38390 + }, + { + "epoch": 0.8547008547008547, + "grad_norm": 0.8355406522750854, + "learning_rate": 0.00032513903712916823, + "loss": 0.6346, + "step": 38400 + }, + { + "epoch": 0.8549234330484331, + "grad_norm": 0.6055408716201782, + "learning_rate": 0.0003251026652325848, + "loss": 0.5601, + "step": 38410 + }, + { + "epoch": 0.8551460113960114, + "grad_norm": 0.6037227511405945, + "learning_rate": 0.0003250662865378202, + "loss": 0.5399, + "step": 38420 + }, + { + "epoch": 0.8553685897435898, + "grad_norm": 0.7371780276298523, + "learning_rate": 0.0003250299010468512, + "loss": 0.5711, + "step": 38430 + }, + { + "epoch": 0.8555911680911681, + "grad_norm": 0.4835084080696106, + "learning_rate": 0.000324993508761655, + "loss": 0.6015, + "step": 38440 + }, + { + "epoch": 0.8558137464387464, + "grad_norm": 0.8498573303222656, + "learning_rate": 0.0003249571096842092, + "loss": 0.6905, + "step": 38450 + }, + { + "epoch": 0.8560363247863247, + "grad_norm": 0.5593055486679077, + "learning_rate": 0.00032492070381649177, + "loss": 0.5372, + "step": 38460 + }, + { + "epoch": 0.8562589031339032, + "grad_norm": 0.46001946926116943, + "learning_rate": 0.0003248842911604811, + "loss": 0.559, + "step": 38470 + }, + { + "epoch": 0.8564814814814815, + "grad_norm": 0.7182387113571167, + "learning_rate": 0.00032484787171815574, + "loss": 0.7162, + "step": 38480 + }, + { + "epoch": 0.8567040598290598, + "grad_norm": 0.8392935395240784, + "learning_rate": 0.0003248114454914948, + "loss": 0.7595, + "step": 38490 + }, + { + "epoch": 0.8569266381766382, + "grad_norm": 0.8445121645927429, + "learning_rate": 0.00032477501248247775, + "loss": 0.5439, + "step": 38500 + }, + { + "epoch": 0.8571492165242165, + "grad_norm": 0.8420943021774292, + "learning_rate": 0.00032473857269308445, + "loss": 0.6484, + "step": 38510 + }, + { + "epoch": 0.8573717948717948, + "grad_norm": 0.5731983184814453, + "learning_rate": 0.00032470212612529495, + "loss": 0.6886, + "step": 38520 + }, + { + "epoch": 0.8575943732193733, + "grad_norm": 0.7260280251502991, + "learning_rate": 0.0003246656727810898, + "loss": 0.5561, + "step": 38530 + }, + { + "epoch": 0.8578169515669516, + "grad_norm": 0.8293136358261108, + "learning_rate": 0.00032462921266245, + "loss": 0.6339, + "step": 38540 + }, + { + "epoch": 0.8580395299145299, + "grad_norm": 1.0204569101333618, + "learning_rate": 0.0003245927457713567, + "loss": 0.7484, + "step": 38550 + }, + { + "epoch": 0.8582621082621082, + "grad_norm": 0.9848127961158752, + "learning_rate": 0.0003245562721097916, + "loss": 0.7431, + "step": 38560 + }, + { + "epoch": 0.8584846866096866, + "grad_norm": 0.4912465810775757, + "learning_rate": 0.00032451979167973674, + "loss": 0.6141, + "step": 38570 + }, + { + "epoch": 0.8587072649572649, + "grad_norm": 0.5672870874404907, + "learning_rate": 0.00032448330448317444, + "loss": 0.5866, + "step": 38580 + }, + { + "epoch": 0.8589298433048433, + "grad_norm": 0.6375393867492676, + "learning_rate": 0.0003244468105220875, + "loss": 0.6703, + "step": 38590 + }, + { + "epoch": 0.8591524216524217, + "grad_norm": 0.3784322142601013, + "learning_rate": 0.00032441030979845893, + "loss": 0.6274, + "step": 38600 + }, + { + "epoch": 0.859375, + "grad_norm": 0.6850314736366272, + "learning_rate": 0.0003243738023142723, + "loss": 0.6696, + "step": 38610 + }, + { + "epoch": 0.8595975783475783, + "grad_norm": 0.9629839062690735, + "learning_rate": 0.00032433728807151153, + "loss": 0.561, + "step": 38620 + }, + { + "epoch": 0.8598201566951567, + "grad_norm": 0.885299563407898, + "learning_rate": 0.00032430076707216064, + "loss": 0.6774, + "step": 38630 + }, + { + "epoch": 0.8600427350427351, + "grad_norm": 0.668006956577301, + "learning_rate": 0.00032426423931820436, + "loss": 0.5257, + "step": 38640 + }, + { + "epoch": 0.8602653133903134, + "grad_norm": 0.9198821187019348, + "learning_rate": 0.00032422770481162753, + "loss": 0.6907, + "step": 38650 + }, + { + "epoch": 0.8604878917378918, + "grad_norm": 0.6214486360549927, + "learning_rate": 0.00032419116355441555, + "loss": 0.7267, + "step": 38660 + }, + { + "epoch": 0.8607104700854701, + "grad_norm": 0.7451410889625549, + "learning_rate": 0.00032415461554855413, + "loss": 0.5338, + "step": 38670 + }, + { + "epoch": 0.8609330484330484, + "grad_norm": 0.43799296021461487, + "learning_rate": 0.0003241180607960292, + "loss": 0.6098, + "step": 38680 + }, + { + "epoch": 0.8611556267806267, + "grad_norm": 0.693584680557251, + "learning_rate": 0.00032408149929882726, + "loss": 0.5971, + "step": 38690 + }, + { + "epoch": 0.8613782051282052, + "grad_norm": 0.7107314467430115, + "learning_rate": 0.00032404493105893503, + "loss": 0.6508, + "step": 38700 + }, + { + "epoch": 0.8616007834757835, + "grad_norm": 0.6193886399269104, + "learning_rate": 0.00032400835607833975, + "loss": 0.6652, + "step": 38710 + }, + { + "epoch": 0.8618233618233618, + "grad_norm": 0.664205014705658, + "learning_rate": 0.0003239717743590289, + "loss": 0.6026, + "step": 38720 + }, + { + "epoch": 0.8620459401709402, + "grad_norm": 0.7307686805725098, + "learning_rate": 0.00032393518590299023, + "loss": 0.6434, + "step": 38730 + }, + { + "epoch": 0.8622685185185185, + "grad_norm": 0.815770149230957, + "learning_rate": 0.0003238985907122122, + "loss": 0.6246, + "step": 38740 + }, + { + "epoch": 0.8624910968660968, + "grad_norm": 0.8578146696090698, + "learning_rate": 0.0003238619887886833, + "loss": 0.5791, + "step": 38750 + }, + { + "epoch": 0.8627136752136753, + "grad_norm": 0.672818124294281, + "learning_rate": 0.0003238253801343925, + "loss": 0.9415, + "step": 38760 + }, + { + "epoch": 0.8629362535612536, + "grad_norm": 0.6374601125717163, + "learning_rate": 0.00032378876475132925, + "loss": 0.6183, + "step": 38770 + }, + { + "epoch": 0.8631588319088319, + "grad_norm": 0.4592694640159607, + "learning_rate": 0.00032375214264148317, + "loss": 0.5575, + "step": 38780 + }, + { + "epoch": 0.8633814102564102, + "grad_norm": 0.5796618461608887, + "learning_rate": 0.0003237155138068444, + "loss": 0.5472, + "step": 38790 + }, + { + "epoch": 0.8636039886039886, + "grad_norm": 0.5853553414344788, + "learning_rate": 0.00032367887824940315, + "loss": 0.5363, + "step": 38800 + }, + { + "epoch": 0.8638265669515669, + "grad_norm": 0.7586563229560852, + "learning_rate": 0.0003236422359711505, + "loss": 0.652, + "step": 38810 + }, + { + "epoch": 0.8640491452991453, + "grad_norm": 0.3684835135936737, + "learning_rate": 0.00032360558697407755, + "loss": 0.6623, + "step": 38820 + }, + { + "epoch": 0.8642717236467237, + "grad_norm": 0.686464250087738, + "learning_rate": 0.0003235689312601758, + "loss": 0.5202, + "step": 38830 + }, + { + "epoch": 0.864494301994302, + "grad_norm": 0.9118617177009583, + "learning_rate": 0.00032353226883143716, + "loss": 0.5926, + "step": 38840 + }, + { + "epoch": 0.8647168803418803, + "grad_norm": 0.6207898259162903, + "learning_rate": 0.00032349559968985396, + "loss": 0.5538, + "step": 38850 + }, + { + "epoch": 0.8649394586894587, + "grad_norm": 1.0657782554626465, + "learning_rate": 0.0003234589238374187, + "loss": 0.6224, + "step": 38860 + }, + { + "epoch": 0.8651620370370371, + "grad_norm": 0.5780830979347229, + "learning_rate": 0.0003234222412761245, + "loss": 0.5131, + "step": 38870 + }, + { + "epoch": 0.8653846153846154, + "grad_norm": 0.6197191476821899, + "learning_rate": 0.00032338555200796466, + "loss": 0.6776, + "step": 38880 + }, + { + "epoch": 0.8656071937321937, + "grad_norm": 0.5804508924484253, + "learning_rate": 0.00032334885603493293, + "loss": 0.5172, + "step": 38890 + }, + { + "epoch": 0.8658297720797721, + "grad_norm": 0.6176242232322693, + "learning_rate": 0.00032331215335902337, + "loss": 0.6256, + "step": 38900 + }, + { + "epoch": 0.8660523504273504, + "grad_norm": 0.9177045226097107, + "learning_rate": 0.0003232754439822304, + "loss": 0.6757, + "step": 38910 + }, + { + "epoch": 0.8662749287749287, + "grad_norm": 1.1486618518829346, + "learning_rate": 0.00032323872790654894, + "loss": 0.55, + "step": 38920 + }, + { + "epoch": 0.8664975071225072, + "grad_norm": 0.6468950510025024, + "learning_rate": 0.00032320200513397416, + "loss": 0.6339, + "step": 38930 + }, + { + "epoch": 0.8667200854700855, + "grad_norm": 0.7570953965187073, + "learning_rate": 0.0003231652756665015, + "loss": 0.7275, + "step": 38940 + }, + { + "epoch": 0.8669426638176638, + "grad_norm": 0.7113020420074463, + "learning_rate": 0.0003231285395061269, + "loss": 0.5602, + "step": 38950 + }, + { + "epoch": 0.8671652421652422, + "grad_norm": 0.4715264141559601, + "learning_rate": 0.0003230917966548467, + "loss": 0.6001, + "step": 38960 + }, + { + "epoch": 0.8673878205128205, + "grad_norm": 0.8866527676582336, + "learning_rate": 0.00032305504711465754, + "loss": 0.6429, + "step": 38970 + }, + { + "epoch": 0.8676103988603988, + "grad_norm": 0.7117496132850647, + "learning_rate": 0.00032301829088755634, + "loss": 0.5025, + "step": 38980 + }, + { + "epoch": 0.8678329772079773, + "grad_norm": 0.7015672326087952, + "learning_rate": 0.00032298152797554053, + "loss": 0.6292, + "step": 38990 + }, + { + "epoch": 0.8680555555555556, + "grad_norm": 0.7511372566223145, + "learning_rate": 0.0003229447583806078, + "loss": 0.6656, + "step": 39000 + }, + { + "epoch": 0.8682781339031339, + "grad_norm": 0.5627785325050354, + "learning_rate": 0.00032290798210475623, + "loss": 0.5419, + "step": 39010 + }, + { + "epoch": 0.8685007122507122, + "grad_norm": 0.5964632034301758, + "learning_rate": 0.00032287119914998434, + "loss": 0.6173, + "step": 39020 + }, + { + "epoch": 0.8687232905982906, + "grad_norm": 0.6726840734481812, + "learning_rate": 0.0003228344095182909, + "loss": 0.4604, + "step": 39030 + }, + { + "epoch": 0.8689458689458689, + "grad_norm": 0.6931193470954895, + "learning_rate": 0.00032279761321167506, + "loss": 0.5933, + "step": 39040 + }, + { + "epoch": 0.8691684472934473, + "grad_norm": 0.8055073022842407, + "learning_rate": 0.0003227608102321364, + "loss": 0.581, + "step": 39050 + }, + { + "epoch": 0.8693910256410257, + "grad_norm": 0.7156325578689575, + "learning_rate": 0.0003227240005816748, + "loss": 0.5492, + "step": 39060 + }, + { + "epoch": 0.869613603988604, + "grad_norm": 0.8013405203819275, + "learning_rate": 0.0003226871842622906, + "loss": 0.5804, + "step": 39070 + }, + { + "epoch": 0.8698361823361823, + "grad_norm": 0.61861252784729, + "learning_rate": 0.0003226503612759843, + "loss": 0.573, + "step": 39080 + }, + { + "epoch": 0.8700587606837606, + "grad_norm": 0.7677802443504333, + "learning_rate": 0.000322613531624757, + "loss": 0.7337, + "step": 39090 + }, + { + "epoch": 0.8702813390313391, + "grad_norm": 0.6915109753608704, + "learning_rate": 0.00032257669531061, + "loss": 0.5641, + "step": 39100 + }, + { + "epoch": 0.8705039173789174, + "grad_norm": 0.8183631896972656, + "learning_rate": 0.000322539852335545, + "loss": 0.5222, + "step": 39110 + }, + { + "epoch": 0.8707264957264957, + "grad_norm": 0.5204386711120605, + "learning_rate": 0.00032250300270156415, + "loss": 0.5386, + "step": 39120 + }, + { + "epoch": 0.8709490740740741, + "grad_norm": 0.4840696156024933, + "learning_rate": 0.0003224661464106698, + "loss": 0.5482, + "step": 39130 + }, + { + "epoch": 0.8711716524216524, + "grad_norm": 0.7944409847259521, + "learning_rate": 0.0003224292834648649, + "loss": 0.6042, + "step": 39140 + }, + { + "epoch": 0.8713942307692307, + "grad_norm": 0.8521105647087097, + "learning_rate": 0.00032239241386615246, + "loss": 0.7336, + "step": 39150 + }, + { + "epoch": 0.8716168091168092, + "grad_norm": 0.8539936542510986, + "learning_rate": 0.00032235553761653606, + "loss": 0.612, + "step": 39160 + }, + { + "epoch": 0.8718393874643875, + "grad_norm": 0.5302663445472717, + "learning_rate": 0.0003223186547180196, + "loss": 0.6096, + "step": 39170 + }, + { + "epoch": 0.8720619658119658, + "grad_norm": 0.8561046719551086, + "learning_rate": 0.00032228176517260724, + "loss": 0.5625, + "step": 39180 + }, + { + "epoch": 0.8722845441595442, + "grad_norm": 0.7265895009040833, + "learning_rate": 0.0003222448689823037, + "loss": 0.6372, + "step": 39190 + }, + { + "epoch": 0.8725071225071225, + "grad_norm": 0.6467223167419434, + "learning_rate": 0.00032220796614911386, + "loss": 0.6874, + "step": 39200 + }, + { + "epoch": 0.8727297008547008, + "grad_norm": 0.8323819041252136, + "learning_rate": 0.00032217105667504313, + "loss": 0.6516, + "step": 39210 + }, + { + "epoch": 0.8729522792022792, + "grad_norm": 0.42687639594078064, + "learning_rate": 0.0003221341405620972, + "loss": 0.551, + "step": 39220 + }, + { + "epoch": 0.8731748575498576, + "grad_norm": 0.5661822557449341, + "learning_rate": 0.000322097217812282, + "loss": 0.5575, + "step": 39230 + }, + { + "epoch": 0.8733974358974359, + "grad_norm": 0.9337760806083679, + "learning_rate": 0.00032206028842760416, + "loss": 0.6917, + "step": 39240 + }, + { + "epoch": 0.8736200142450142, + "grad_norm": 0.6879515647888184, + "learning_rate": 0.00032202335241007026, + "loss": 0.5838, + "step": 39250 + }, + { + "epoch": 0.8738425925925926, + "grad_norm": 0.8157190084457397, + "learning_rate": 0.00032198640976168743, + "loss": 0.5214, + "step": 39260 + }, + { + "epoch": 0.8740651709401709, + "grad_norm": 0.7548959255218506, + "learning_rate": 0.0003219494604844633, + "loss": 0.6848, + "step": 39270 + }, + { + "epoch": 0.8742877492877493, + "grad_norm": 0.6269037127494812, + "learning_rate": 0.00032191250458040566, + "loss": 0.5813, + "step": 39280 + }, + { + "epoch": 0.8745103276353277, + "grad_norm": 0.9379816651344299, + "learning_rate": 0.0003218755420515227, + "loss": 0.6591, + "step": 39290 + }, + { + "epoch": 0.874732905982906, + "grad_norm": 0.6595720052719116, + "learning_rate": 0.00032183857289982303, + "loss": 0.6301, + "step": 39300 + }, + { + "epoch": 0.8749554843304843, + "grad_norm": 0.5300357341766357, + "learning_rate": 0.00032180159712731556, + "loss": 0.6853, + "step": 39310 + }, + { + "epoch": 0.8751780626780626, + "grad_norm": 0.7907893061637878, + "learning_rate": 0.0003217646147360096, + "loss": 0.7163, + "step": 39320 + }, + { + "epoch": 0.8754006410256411, + "grad_norm": 0.819496750831604, + "learning_rate": 0.00032172762572791475, + "loss": 0.6733, + "step": 39330 + }, + { + "epoch": 0.8756232193732194, + "grad_norm": 0.7941359281539917, + "learning_rate": 0.00032169063010504113, + "loss": 0.6714, + "step": 39340 + }, + { + "epoch": 0.8758457977207977, + "grad_norm": 1.1781067848205566, + "learning_rate": 0.000321653627869399, + "loss": 0.7206, + "step": 39350 + }, + { + "epoch": 0.8760683760683761, + "grad_norm": 0.7022318243980408, + "learning_rate": 0.00032161661902299914, + "loss": 0.5669, + "step": 39360 + }, + { + "epoch": 0.8762909544159544, + "grad_norm": 0.7623576521873474, + "learning_rate": 0.0003215796035678527, + "loss": 0.5496, + "step": 39370 + }, + { + "epoch": 0.8765135327635327, + "grad_norm": 0.4989342987537384, + "learning_rate": 0.00032154258150597105, + "loss": 0.5955, + "step": 39380 + }, + { + "epoch": 0.8767361111111112, + "grad_norm": 0.5586682558059692, + "learning_rate": 0.00032150555283936595, + "loss": 0.5446, + "step": 39390 + }, + { + "epoch": 0.8769586894586895, + "grad_norm": 0.8281471133232117, + "learning_rate": 0.0003214685175700497, + "loss": 0.5533, + "step": 39400 + }, + { + "epoch": 0.8771812678062678, + "grad_norm": 0.6243104338645935, + "learning_rate": 0.0003214314757000347, + "loss": 0.6527, + "step": 39410 + }, + { + "epoch": 0.8774038461538461, + "grad_norm": 0.602491557598114, + "learning_rate": 0.00032139442723133404, + "loss": 0.6527, + "step": 39420 + }, + { + "epoch": 0.8776264245014245, + "grad_norm": 0.8057866096496582, + "learning_rate": 0.00032135737216596073, + "loss": 0.6958, + "step": 39430 + }, + { + "epoch": 0.8778490028490028, + "grad_norm": 0.6868727803230286, + "learning_rate": 0.0003213203105059285, + "loss": 0.5553, + "step": 39440 + }, + { + "epoch": 0.8780715811965812, + "grad_norm": 0.4937323033809662, + "learning_rate": 0.0003212832422532512, + "loss": 0.7375, + "step": 39450 + }, + { + "epoch": 0.8782941595441596, + "grad_norm": 0.7090859413146973, + "learning_rate": 0.00032124616740994335, + "loss": 0.6359, + "step": 39460 + }, + { + "epoch": 0.8785167378917379, + "grad_norm": 0.7405864000320435, + "learning_rate": 0.00032120908597801944, + "loss": 0.6074, + "step": 39470 + }, + { + "epoch": 0.8787393162393162, + "grad_norm": 0.5793715715408325, + "learning_rate": 0.0003211719979594946, + "loss": 0.5593, + "step": 39480 + }, + { + "epoch": 0.8789618945868946, + "grad_norm": 0.9205363988876343, + "learning_rate": 0.0003211349033563842, + "loss": 0.7185, + "step": 39490 + }, + { + "epoch": 0.8791844729344729, + "grad_norm": 0.8902425169944763, + "learning_rate": 0.00032109780217070395, + "loss": 0.5122, + "step": 39500 + }, + { + "epoch": 0.8794070512820513, + "grad_norm": 0.5989090204238892, + "learning_rate": 0.00032106069440447, + "loss": 0.6292, + "step": 39510 + }, + { + "epoch": 0.8796296296296297, + "grad_norm": 0.7132798433303833, + "learning_rate": 0.00032102358005969877, + "loss": 0.6775, + "step": 39520 + }, + { + "epoch": 0.879852207977208, + "grad_norm": 0.6441278457641602, + "learning_rate": 0.0003209864591384072, + "loss": 0.6522, + "step": 39530 + }, + { + "epoch": 0.8800747863247863, + "grad_norm": 0.7071154117584229, + "learning_rate": 0.00032094933164261236, + "loss": 0.8055, + "step": 39540 + }, + { + "epoch": 0.8802973646723646, + "grad_norm": 0.5231700539588928, + "learning_rate": 0.00032091219757433186, + "loss": 0.6843, + "step": 39550 + }, + { + "epoch": 0.8805199430199431, + "grad_norm": 0.9400531053543091, + "learning_rate": 0.0003208750569355835, + "loss": 0.659, + "step": 39560 + }, + { + "epoch": 0.8807425213675214, + "grad_norm": 0.6579751372337341, + "learning_rate": 0.00032083790972838565, + "loss": 0.624, + "step": 39570 + }, + { + "epoch": 0.8809650997150997, + "grad_norm": 0.683641791343689, + "learning_rate": 0.00032080075595475685, + "loss": 0.6428, + "step": 39580 + }, + { + "epoch": 0.8811876780626781, + "grad_norm": 0.8134473562240601, + "learning_rate": 0.00032076359561671606, + "loss": 0.6543, + "step": 39590 + }, + { + "epoch": 0.8814102564102564, + "grad_norm": 0.4869878888130188, + "learning_rate": 0.00032072642871628265, + "loss": 0.5401, + "step": 39600 + }, + { + "epoch": 0.8816328347578347, + "grad_norm": 0.734983503818512, + "learning_rate": 0.0003206892552554762, + "loss": 0.6767, + "step": 39610 + }, + { + "epoch": 0.8818554131054132, + "grad_norm": 1.119775652885437, + "learning_rate": 0.00032065207523631695, + "loss": 0.637, + "step": 39620 + }, + { + "epoch": 0.8820779914529915, + "grad_norm": 0.5646172165870667, + "learning_rate": 0.0003206148886608251, + "loss": 0.6219, + "step": 39630 + }, + { + "epoch": 0.8823005698005698, + "grad_norm": 0.5112728476524353, + "learning_rate": 0.00032057769553102145, + "loss": 0.6746, + "step": 39640 + }, + { + "epoch": 0.8825231481481481, + "grad_norm": 0.5482468605041504, + "learning_rate": 0.0003205404958489271, + "loss": 0.8427, + "step": 39650 + }, + { + "epoch": 0.8827457264957265, + "grad_norm": 0.6891186833381653, + "learning_rate": 0.0003205032896165635, + "loss": 0.6567, + "step": 39660 + }, + { + "epoch": 0.8829683048433048, + "grad_norm": 1.1482949256896973, + "learning_rate": 0.0003204660768359525, + "loss": 0.6634, + "step": 39670 + }, + { + "epoch": 0.8831908831908832, + "grad_norm": 0.5539254546165466, + "learning_rate": 0.00032042885750911633, + "loss": 0.5657, + "step": 39680 + }, + { + "epoch": 0.8834134615384616, + "grad_norm": 1.1480712890625, + "learning_rate": 0.00032039163163807746, + "loss": 0.6426, + "step": 39690 + }, + { + "epoch": 0.8836360398860399, + "grad_norm": 0.7224293351173401, + "learning_rate": 0.0003203543992248587, + "loss": 0.5478, + "step": 39700 + }, + { + "epoch": 0.8838586182336182, + "grad_norm": 0.8542203903198242, + "learning_rate": 0.0003203171602714834, + "loss": 0.6527, + "step": 39710 + }, + { + "epoch": 0.8840811965811965, + "grad_norm": 0.6108570694923401, + "learning_rate": 0.0003202799147799751, + "loss": 0.5726, + "step": 39720 + }, + { + "epoch": 0.8843037749287749, + "grad_norm": 0.5748672485351562, + "learning_rate": 0.00032024266275235776, + "loss": 0.6126, + "step": 39730 + }, + { + "epoch": 0.8845263532763533, + "grad_norm": 0.7551204562187195, + "learning_rate": 0.0003202054041906557, + "loss": 0.7203, + "step": 39740 + }, + { + "epoch": 0.8847489316239316, + "grad_norm": 0.7964481115341187, + "learning_rate": 0.00032016813909689363, + "loss": 0.5755, + "step": 39750 + }, + { + "epoch": 0.88497150997151, + "grad_norm": 0.5368321537971497, + "learning_rate": 0.0003201308674730964, + "loss": 0.5515, + "step": 39760 + }, + { + "epoch": 0.8851940883190883, + "grad_norm": 0.5676383376121521, + "learning_rate": 0.00032009358932128955, + "loss": 0.5616, + "step": 39770 + }, + { + "epoch": 0.8854166666666666, + "grad_norm": 0.7017415165901184, + "learning_rate": 0.00032005630464349873, + "loss": 0.5178, + "step": 39780 + }, + { + "epoch": 0.8856392450142451, + "grad_norm": 0.7300966382026672, + "learning_rate": 0.00032001901344175005, + "loss": 0.5538, + "step": 39790 + }, + { + "epoch": 0.8858618233618234, + "grad_norm": 0.6249212026596069, + "learning_rate": 0.00031998171571806993, + "loss": 0.5435, + "step": 39800 + }, + { + "epoch": 0.8860844017094017, + "grad_norm": 0.4156573414802551, + "learning_rate": 0.0003199444114744851, + "loss": 0.5803, + "step": 39810 + }, + { + "epoch": 0.88630698005698, + "grad_norm": 0.6538870930671692, + "learning_rate": 0.0003199071007130228, + "loss": 0.7323, + "step": 39820 + }, + { + "epoch": 0.8865295584045584, + "grad_norm": 0.7236296534538269, + "learning_rate": 0.0003198697834357105, + "loss": 0.6266, + "step": 39830 + }, + { + "epoch": 0.8867521367521367, + "grad_norm": 0.7877007126808167, + "learning_rate": 0.000319832459644576, + "loss": 0.6534, + "step": 39840 + }, + { + "epoch": 0.8869747150997151, + "grad_norm": 0.7191532254219055, + "learning_rate": 0.0003197951293416476, + "loss": 0.8359, + "step": 39850 + }, + { + "epoch": 0.8871972934472935, + "grad_norm": 0.5475645661354065, + "learning_rate": 0.00031975779252895375, + "loss": 0.7697, + "step": 39860 + }, + { + "epoch": 0.8874198717948718, + "grad_norm": 1.168534755706787, + "learning_rate": 0.0003197204492085234, + "loss": 0.6695, + "step": 39870 + }, + { + "epoch": 0.8876424501424501, + "grad_norm": 0.6283919215202332, + "learning_rate": 0.0003196830993823859, + "loss": 0.5903, + "step": 39880 + }, + { + "epoch": 0.8878650284900285, + "grad_norm": 3.3726868629455566, + "learning_rate": 0.00031964574305257083, + "loss": 0.5589, + "step": 39890 + }, + { + "epoch": 0.8880876068376068, + "grad_norm": 1.0309503078460693, + "learning_rate": 0.00031960838022110805, + "loss": 0.6842, + "step": 39900 + }, + { + "epoch": 0.8883101851851852, + "grad_norm": 0.6248919367790222, + "learning_rate": 0.00031957101089002797, + "loss": 0.5577, + "step": 39910 + }, + { + "epoch": 0.8885327635327636, + "grad_norm": 0.8905360102653503, + "learning_rate": 0.0003195336350613613, + "loss": 0.6414, + "step": 39920 + }, + { + "epoch": 0.8887553418803419, + "grad_norm": 0.7523095607757568, + "learning_rate": 0.00031949625273713906, + "loss": 0.7783, + "step": 39930 + }, + { + "epoch": 0.8889779202279202, + "grad_norm": 0.6982303261756897, + "learning_rate": 0.00031945886391939257, + "loss": 0.7337, + "step": 39940 + }, + { + "epoch": 0.8892004985754985, + "grad_norm": 0.5420619249343872, + "learning_rate": 0.00031942146861015374, + "loss": 0.7451, + "step": 39950 + }, + { + "epoch": 0.8894230769230769, + "grad_norm": 0.589128851890564, + "learning_rate": 0.0003193840668114544, + "loss": 0.4996, + "step": 39960 + }, + { + "epoch": 0.8896456552706553, + "grad_norm": 0.8386995196342468, + "learning_rate": 0.00031934665852532723, + "loss": 0.6711, + "step": 39970 + }, + { + "epoch": 0.8898682336182336, + "grad_norm": 0.9134801030158997, + "learning_rate": 0.0003193092437538049, + "loss": 0.5935, + "step": 39980 + }, + { + "epoch": 0.890090811965812, + "grad_norm": 0.9789322018623352, + "learning_rate": 0.00031927182249892063, + "loss": 0.7247, + "step": 39990 + }, + { + "epoch": 0.8903133903133903, + "grad_norm": 0.8390123844146729, + "learning_rate": 0.0003192343947627078, + "loss": 0.6485, + "step": 40000 + }, + { + "epoch": 0.8905359686609686, + "grad_norm": 0.5787250399589539, + "learning_rate": 0.0003191969605472004, + "loss": 0.4625, + "step": 40010 + }, + { + "epoch": 0.8907585470085471, + "grad_norm": 0.8755205273628235, + "learning_rate": 0.0003191595198544326, + "loss": 0.6651, + "step": 40020 + }, + { + "epoch": 0.8909811253561254, + "grad_norm": 0.8312947750091553, + "learning_rate": 0.000319122072686439, + "loss": 0.6148, + "step": 40030 + }, + { + "epoch": 0.8912037037037037, + "grad_norm": 0.7560413479804993, + "learning_rate": 0.0003190846190452543, + "loss": 0.5487, + "step": 40040 + }, + { + "epoch": 0.891426282051282, + "grad_norm": 0.7159906029701233, + "learning_rate": 0.000319047158932914, + "loss": 0.525, + "step": 40050 + }, + { + "epoch": 0.8916488603988604, + "grad_norm": 0.8458636403083801, + "learning_rate": 0.00031900969235145366, + "loss": 0.6415, + "step": 40060 + }, + { + "epoch": 0.8918714387464387, + "grad_norm": 0.7349972128868103, + "learning_rate": 0.0003189722193029091, + "loss": 0.6364, + "step": 40070 + }, + { + "epoch": 0.8920940170940171, + "grad_norm": 0.8532900810241699, + "learning_rate": 0.0003189347397893169, + "loss": 0.6439, + "step": 40080 + }, + { + "epoch": 0.8923165954415955, + "grad_norm": 1.060137391090393, + "learning_rate": 0.0003188972538127135, + "loss": 0.7015, + "step": 40090 + }, + { + "epoch": 0.8925391737891738, + "grad_norm": 0.6980665922164917, + "learning_rate": 0.00031885976137513596, + "loss": 0.7174, + "step": 40100 + }, + { + "epoch": 0.8927617521367521, + "grad_norm": 0.6002524495124817, + "learning_rate": 0.0003188222624786217, + "loss": 0.6698, + "step": 40110 + }, + { + "epoch": 0.8929843304843305, + "grad_norm": 0.45758989453315735, + "learning_rate": 0.00031878475712520846, + "loss": 0.545, + "step": 40120 + }, + { + "epoch": 0.8932069088319088, + "grad_norm": 0.8262528777122498, + "learning_rate": 0.0003187472453169343, + "loss": 0.4717, + "step": 40130 + }, + { + "epoch": 0.8934294871794872, + "grad_norm": 0.8281633853912354, + "learning_rate": 0.00031870972705583755, + "loss": 0.6197, + "step": 40140 + }, + { + "epoch": 0.8936520655270656, + "grad_norm": 0.739718496799469, + "learning_rate": 0.0003186722023439571, + "loss": 0.5508, + "step": 40150 + }, + { + "epoch": 0.8938746438746439, + "grad_norm": 0.6326023936271667, + "learning_rate": 0.000318634671183332, + "loss": 0.6486, + "step": 40160 + }, + { + "epoch": 0.8940972222222222, + "grad_norm": 0.649603009223938, + "learning_rate": 0.0003185971335760017, + "loss": 0.6006, + "step": 40170 + }, + { + "epoch": 0.8943198005698005, + "grad_norm": 0.5098690390586853, + "learning_rate": 0.0003185595895240061, + "loss": 0.6338, + "step": 40180 + }, + { + "epoch": 0.8945423789173789, + "grad_norm": 0.7413763999938965, + "learning_rate": 0.0003185220390293854, + "loss": 0.5233, + "step": 40190 + }, + { + "epoch": 0.8947649572649573, + "grad_norm": 0.5394598245620728, + "learning_rate": 0.00031848448209418007, + "loss": 0.5585, + "step": 40200 + }, + { + "epoch": 0.8949875356125356, + "grad_norm": 0.8628697991371155, + "learning_rate": 0.00031844691872043096, + "loss": 0.7985, + "step": 40210 + }, + { + "epoch": 0.895210113960114, + "grad_norm": 0.5882583260536194, + "learning_rate": 0.0003184093489101793, + "loss": 0.5264, + "step": 40220 + }, + { + "epoch": 0.8954326923076923, + "grad_norm": 0.8291221857070923, + "learning_rate": 0.0003183717726654667, + "loss": 0.6469, + "step": 40230 + }, + { + "epoch": 0.8956552706552706, + "grad_norm": 0.843272864818573, + "learning_rate": 0.00031833418998833505, + "loss": 0.7344, + "step": 40240 + }, + { + "epoch": 0.8958778490028491, + "grad_norm": 0.49712222814559937, + "learning_rate": 0.00031829660088082673, + "loss": 0.593, + "step": 40250 + }, + { + "epoch": 0.8961004273504274, + "grad_norm": 0.6992495656013489, + "learning_rate": 0.0003182590053449842, + "loss": 0.6134, + "step": 40260 + }, + { + "epoch": 0.8963230056980057, + "grad_norm": 0.48579835891723633, + "learning_rate": 0.0003182214033828505, + "loss": 0.6549, + "step": 40270 + }, + { + "epoch": 0.896545584045584, + "grad_norm": 0.558591365814209, + "learning_rate": 0.000318183794996469, + "loss": 0.6306, + "step": 40280 + }, + { + "epoch": 0.8967681623931624, + "grad_norm": 0.7068283557891846, + "learning_rate": 0.00031814618018788333, + "loss": 0.6821, + "step": 40290 + }, + { + "epoch": 0.8969907407407407, + "grad_norm": 0.7181311845779419, + "learning_rate": 0.0003181085589591375, + "loss": 0.6187, + "step": 40300 + }, + { + "epoch": 0.8972133190883191, + "grad_norm": 0.6119157075881958, + "learning_rate": 0.0003180709313122759, + "loss": 0.6697, + "step": 40310 + }, + { + "epoch": 0.8974358974358975, + "grad_norm": 0.5585033297538757, + "learning_rate": 0.00031803329724934315, + "loss": 0.6431, + "step": 40320 + }, + { + "epoch": 0.8976584757834758, + "grad_norm": 0.6253518462181091, + "learning_rate": 0.00031799565677238453, + "loss": 0.5559, + "step": 40330 + }, + { + "epoch": 0.8978810541310541, + "grad_norm": 0.6004060506820679, + "learning_rate": 0.00031795800988344527, + "loss": 0.6072, + "step": 40340 + }, + { + "epoch": 0.8981036324786325, + "grad_norm": 0.6097738742828369, + "learning_rate": 0.00031792035658457113, + "loss": 0.5182, + "step": 40350 + }, + { + "epoch": 0.8983262108262108, + "grad_norm": 1.233040690422058, + "learning_rate": 0.00031788269687780835, + "loss": 0.5693, + "step": 40360 + }, + { + "epoch": 0.8985487891737892, + "grad_norm": 0.9896701574325562, + "learning_rate": 0.0003178450307652033, + "loss": 0.5541, + "step": 40370 + }, + { + "epoch": 0.8987713675213675, + "grad_norm": 0.5857483744621277, + "learning_rate": 0.00031780735824880283, + "loss": 0.5526, + "step": 40380 + }, + { + "epoch": 0.8989939458689459, + "grad_norm": 0.7917017936706543, + "learning_rate": 0.00031776967933065404, + "loss": 0.6533, + "step": 40390 + }, + { + "epoch": 0.8992165242165242, + "grad_norm": 0.8823297023773193, + "learning_rate": 0.0003177319940128045, + "loss": 0.6673, + "step": 40400 + }, + { + "epoch": 0.8994391025641025, + "grad_norm": 0.5655326843261719, + "learning_rate": 0.000317694302297302, + "loss": 0.584, + "step": 40410 + }, + { + "epoch": 0.8996616809116809, + "grad_norm": 0.4943428933620453, + "learning_rate": 0.0003176566041861947, + "loss": 0.4593, + "step": 40420 + }, + { + "epoch": 0.8998842592592593, + "grad_norm": 0.6836668252944946, + "learning_rate": 0.0003176188996815313, + "loss": 0.6873, + "step": 40430 + }, + { + "epoch": 0.9001068376068376, + "grad_norm": 0.5697005987167358, + "learning_rate": 0.00031758118878536055, + "loss": 0.5643, + "step": 40440 + }, + { + "epoch": 0.9001068376068376, + "eval_loss": 0.6068045496940613, + "eval_runtime": 337.3079, + "eval_samples_per_second": 7.011, + "eval_steps_per_second": 7.011, + "step": 40440 + }, + { + "epoch": 0.900329415954416, + "grad_norm": 0.9452328681945801, + "learning_rate": 0.0003175434714997318, + "loss": 0.6032, + "step": 40450 + }, + { + "epoch": 0.9005519943019943, + "grad_norm": 0.3648838400840759, + "learning_rate": 0.0003175057478266945, + "loss": 0.7467, + "step": 40460 + }, + { + "epoch": 0.9007745726495726, + "grad_norm": 0.5086342096328735, + "learning_rate": 0.00031746801776829877, + "loss": 0.4958, + "step": 40470 + }, + { + "epoch": 0.9009971509971509, + "grad_norm": 0.4477662444114685, + "learning_rate": 0.0003174302813265948, + "loss": 0.6573, + "step": 40480 + }, + { + "epoch": 0.9012197293447294, + "grad_norm": 1.764902949333191, + "learning_rate": 0.00031739253850363307, + "loss": 0.5559, + "step": 40490 + }, + { + "epoch": 0.9014423076923077, + "grad_norm": 0.6134351491928101, + "learning_rate": 0.0003173547893014648, + "loss": 0.4968, + "step": 40500 + }, + { + "epoch": 0.901664886039886, + "grad_norm": 0.51532381772995, + "learning_rate": 0.00031731703372214114, + "loss": 0.6746, + "step": 40510 + }, + { + "epoch": 0.9018874643874644, + "grad_norm": 0.6712770462036133, + "learning_rate": 0.0003172792717677139, + "loss": 0.6102, + "step": 40520 + }, + { + "epoch": 0.9021100427350427, + "grad_norm": 0.5384175777435303, + "learning_rate": 0.000317241503440235, + "loss": 0.5977, + "step": 40530 + }, + { + "epoch": 0.9023326210826211, + "grad_norm": 0.7249717712402344, + "learning_rate": 0.00031720372874175683, + "loss": 0.5999, + "step": 40540 + }, + { + "epoch": 0.9025551994301995, + "grad_norm": 0.47203508019447327, + "learning_rate": 0.0003171659476743321, + "loss": 0.6368, + "step": 40550 + }, + { + "epoch": 0.9027777777777778, + "grad_norm": 0.6192301511764526, + "learning_rate": 0.0003171281602400138, + "loss": 0.521, + "step": 40560 + }, + { + "epoch": 0.9030003561253561, + "grad_norm": 0.8027661442756653, + "learning_rate": 0.00031709036644085543, + "loss": 0.5966, + "step": 40570 + }, + { + "epoch": 0.9032229344729344, + "grad_norm": 0.61912602186203, + "learning_rate": 0.0003170525662789107, + "loss": 0.6183, + "step": 40580 + }, + { + "epoch": 0.9034455128205128, + "grad_norm": 0.953890860080719, + "learning_rate": 0.0003170147597562337, + "loss": 0.6791, + "step": 40590 + }, + { + "epoch": 0.9036680911680912, + "grad_norm": 0.6121290326118469, + "learning_rate": 0.0003169769468748788, + "loss": 0.5699, + "step": 40600 + }, + { + "epoch": 0.9038906695156695, + "grad_norm": 0.3996066451072693, + "learning_rate": 0.00031693912763690096, + "loss": 0.5991, + "step": 40610 + }, + { + "epoch": 0.9041132478632479, + "grad_norm": 0.7684537768363953, + "learning_rate": 0.0003169013020443551, + "loss": 0.6052, + "step": 40620 + }, + { + "epoch": 0.9043358262108262, + "grad_norm": 0.628394365310669, + "learning_rate": 0.0003168634700992968, + "loss": 0.5785, + "step": 40630 + }, + { + "epoch": 0.9045584045584045, + "grad_norm": 0.8542444109916687, + "learning_rate": 0.0003168256318037819, + "loss": 0.6702, + "step": 40640 + }, + { + "epoch": 0.9047809829059829, + "grad_norm": 0.7758163213729858, + "learning_rate": 0.00031678778715986655, + "loss": 0.6605, + "step": 40650 + }, + { + "epoch": 0.9050035612535613, + "grad_norm": 0.6627446413040161, + "learning_rate": 0.0003167499361696071, + "loss": 0.5987, + "step": 40660 + }, + { + "epoch": 0.9052261396011396, + "grad_norm": 0.4540365934371948, + "learning_rate": 0.0003167120788350607, + "loss": 0.5297, + "step": 40670 + }, + { + "epoch": 0.905448717948718, + "grad_norm": 0.5813724398612976, + "learning_rate": 0.00031667421515828433, + "loss": 0.6154, + "step": 40680 + }, + { + "epoch": 0.9056712962962963, + "grad_norm": 0.9838677048683167, + "learning_rate": 0.0003166363451413356, + "loss": 0.5669, + "step": 40690 + }, + { + "epoch": 0.9058938746438746, + "grad_norm": 0.9327704310417175, + "learning_rate": 0.00031659846878627235, + "loss": 0.588, + "step": 40700 + }, + { + "epoch": 0.9061164529914529, + "grad_norm": 0.5827953219413757, + "learning_rate": 0.0003165605860951529, + "loss": 0.5702, + "step": 40710 + }, + { + "epoch": 0.9063390313390314, + "grad_norm": 0.8706266283988953, + "learning_rate": 0.0003165226970700358, + "loss": 0.7091, + "step": 40720 + }, + { + "epoch": 0.9065616096866097, + "grad_norm": 0.6274487972259521, + "learning_rate": 0.0003164848017129799, + "loss": 0.6592, + "step": 40730 + }, + { + "epoch": 0.906784188034188, + "grad_norm": 0.7041484117507935, + "learning_rate": 0.00031644690002604454, + "loss": 0.7755, + "step": 40740 + }, + { + "epoch": 0.9070067663817664, + "grad_norm": 0.6966767311096191, + "learning_rate": 0.0003164089920112893, + "loss": 0.612, + "step": 40750 + }, + { + "epoch": 0.9072293447293447, + "grad_norm": 0.5250440835952759, + "learning_rate": 0.0003163710776707742, + "loss": 0.4996, + "step": 40760 + }, + { + "epoch": 0.9074519230769231, + "grad_norm": 0.777004599571228, + "learning_rate": 0.00031633315700655936, + "loss": 0.6096, + "step": 40770 + }, + { + "epoch": 0.9076745014245015, + "grad_norm": 0.6018292307853699, + "learning_rate": 0.00031629523002070563, + "loss": 0.5804, + "step": 40780 + }, + { + "epoch": 0.9078970797720798, + "grad_norm": 0.8258164525032043, + "learning_rate": 0.00031625729671527384, + "loss": 0.6846, + "step": 40790 + }, + { + "epoch": 0.9081196581196581, + "grad_norm": 0.6959264278411865, + "learning_rate": 0.0003162193570923254, + "loss": 0.6592, + "step": 40800 + }, + { + "epoch": 0.9083422364672364, + "grad_norm": 0.784415602684021, + "learning_rate": 0.00031618141115392196, + "loss": 0.7515, + "step": 40810 + }, + { + "epoch": 0.9085648148148148, + "grad_norm": 1.0342206954956055, + "learning_rate": 0.0003161434589021255, + "loss": 0.6266, + "step": 40820 + }, + { + "epoch": 0.9087873931623932, + "grad_norm": 0.8727086186408997, + "learning_rate": 0.0003161055003389984, + "loss": 0.5163, + "step": 40830 + }, + { + "epoch": 0.9090099715099715, + "grad_norm": 0.5465406179428101, + "learning_rate": 0.00031606753546660336, + "loss": 0.5507, + "step": 40840 + }, + { + "epoch": 0.9092325498575499, + "grad_norm": 0.5344588756561279, + "learning_rate": 0.0003160295642870034, + "loss": 0.5908, + "step": 40850 + }, + { + "epoch": 0.9094551282051282, + "grad_norm": 0.5498618483543396, + "learning_rate": 0.000315991586802262, + "loss": 0.4823, + "step": 40860 + }, + { + "epoch": 0.9096777065527065, + "grad_norm": 0.700057327747345, + "learning_rate": 0.00031595360301444285, + "loss": 0.6203, + "step": 40870 + }, + { + "epoch": 0.9099002849002849, + "grad_norm": 0.6060965657234192, + "learning_rate": 0.0003159156129256099, + "loss": 0.6212, + "step": 40880 + }, + { + "epoch": 0.9101228632478633, + "grad_norm": 0.9121140837669373, + "learning_rate": 0.0003158776165378277, + "loss": 0.73, + "step": 40890 + }, + { + "epoch": 0.9103454415954416, + "grad_norm": 0.7383050322532654, + "learning_rate": 0.0003158396138531609, + "loss": 0.616, + "step": 40900 + }, + { + "epoch": 0.91056801994302, + "grad_norm": 0.5925201773643494, + "learning_rate": 0.00031580160487367474, + "loss": 0.5811, + "step": 40910 + }, + { + "epoch": 0.9107905982905983, + "grad_norm": 0.6484251022338867, + "learning_rate": 0.00031576358960143445, + "loss": 0.6064, + "step": 40920 + }, + { + "epoch": 0.9110131766381766, + "grad_norm": 0.7028173804283142, + "learning_rate": 0.00031572556803850603, + "loss": 0.6034, + "step": 40930 + }, + { + "epoch": 0.9112357549857549, + "grad_norm": 0.6228461265563965, + "learning_rate": 0.0003156875401869555, + "loss": 0.4492, + "step": 40940 + }, + { + "epoch": 0.9114583333333334, + "grad_norm": 0.5296692252159119, + "learning_rate": 0.0003156495060488493, + "loss": 0.6993, + "step": 40950 + }, + { + "epoch": 0.9116809116809117, + "grad_norm": 0.5507607460021973, + "learning_rate": 0.0003156114656262543, + "loss": 0.6036, + "step": 40960 + }, + { + "epoch": 0.91190349002849, + "grad_norm": 0.6303926110267639, + "learning_rate": 0.0003155734189212375, + "loss": 0.5727, + "step": 40970 + }, + { + "epoch": 0.9121260683760684, + "grad_norm": 0.7401798963546753, + "learning_rate": 0.00031553536593586666, + "loss": 0.4791, + "step": 40980 + }, + { + "epoch": 0.9123486467236467, + "grad_norm": 0.8919644951820374, + "learning_rate": 0.00031549730667220936, + "loss": 0.7607, + "step": 40990 + }, + { + "epoch": 0.9125712250712251, + "grad_norm": 0.4974423050880432, + "learning_rate": 0.0003154592411323339, + "loss": 0.6209, + "step": 41000 + }, + { + "epoch": 0.9127938034188035, + "grad_norm": 0.9230018258094788, + "learning_rate": 0.00031542116931830875, + "loss": 0.5109, + "step": 41010 + }, + { + "epoch": 0.9130163817663818, + "grad_norm": 0.6732121109962463, + "learning_rate": 0.0003153830912322028, + "loss": 0.6689, + "step": 41020 + }, + { + "epoch": 0.9132389601139601, + "grad_norm": 0.5105556845664978, + "learning_rate": 0.0003153450068760852, + "loss": 0.5894, + "step": 41030 + }, + { + "epoch": 0.9134615384615384, + "grad_norm": 0.6419169306755066, + "learning_rate": 0.0003153069162520255, + "loss": 0.4713, + "step": 41040 + }, + { + "epoch": 0.9136841168091168, + "grad_norm": 0.8677101731300354, + "learning_rate": 0.0003152688193620935, + "loss": 0.7491, + "step": 41050 + }, + { + "epoch": 0.9139066951566952, + "grad_norm": 0.7640895843505859, + "learning_rate": 0.0003152307162083595, + "loss": 0.6545, + "step": 41060 + }, + { + "epoch": 0.9141292735042735, + "grad_norm": 0.37812137603759766, + "learning_rate": 0.00031519260679289414, + "loss": 0.5917, + "step": 41070 + }, + { + "epoch": 0.9143518518518519, + "grad_norm": 0.74465411901474, + "learning_rate": 0.00031515449111776825, + "loss": 0.7212, + "step": 41080 + }, + { + "epoch": 0.9145744301994302, + "grad_norm": 0.6022646427154541, + "learning_rate": 0.0003151163691850529, + "loss": 0.6398, + "step": 41090 + }, + { + "epoch": 0.9147970085470085, + "grad_norm": 0.5874007344245911, + "learning_rate": 0.00031507824099681993, + "loss": 0.5747, + "step": 41100 + }, + { + "epoch": 0.9150195868945868, + "grad_norm": 0.4806864857673645, + "learning_rate": 0.0003150401065551411, + "loss": 0.5708, + "step": 41110 + }, + { + "epoch": 0.9152421652421653, + "grad_norm": 0.6752296686172485, + "learning_rate": 0.0003150019658620887, + "loss": 0.548, + "step": 41120 + }, + { + "epoch": 0.9154647435897436, + "grad_norm": 0.5574817061424255, + "learning_rate": 0.00031496381891973533, + "loss": 0.6457, + "step": 41130 + }, + { + "epoch": 0.9156873219373219, + "grad_norm": 0.7656465768814087, + "learning_rate": 0.0003149256657301539, + "loss": 0.5559, + "step": 41140 + }, + { + "epoch": 0.9159099002849003, + "grad_norm": 0.5118249654769897, + "learning_rate": 0.0003148875062954177, + "loss": 0.7083, + "step": 41150 + }, + { + "epoch": 0.9161324786324786, + "grad_norm": 0.7542393207550049, + "learning_rate": 0.00031484934061760044, + "loss": 0.5739, + "step": 41160 + }, + { + "epoch": 0.9163550569800569, + "grad_norm": 0.5904927849769592, + "learning_rate": 0.00031481116869877594, + "loss": 0.5556, + "step": 41170 + }, + { + "epoch": 0.9165776353276354, + "grad_norm": 0.46618613600730896, + "learning_rate": 0.00031477299054101857, + "loss": 0.5605, + "step": 41180 + }, + { + "epoch": 0.9168002136752137, + "grad_norm": 0.5759552717208862, + "learning_rate": 0.00031473480614640294, + "loss": 0.6537, + "step": 41190 + }, + { + "epoch": 0.917022792022792, + "grad_norm": 0.8385375738143921, + "learning_rate": 0.00031469661551700395, + "loss": 0.6678, + "step": 41200 + }, + { + "epoch": 0.9172453703703703, + "grad_norm": 0.44897398352622986, + "learning_rate": 0.00031465841865489704, + "loss": 0.549, + "step": 41210 + }, + { + "epoch": 0.9174679487179487, + "grad_norm": 0.679852306842804, + "learning_rate": 0.0003146202155621578, + "loss": 0.547, + "step": 41220 + }, + { + "epoch": 0.9176905270655271, + "grad_norm": 0.7378935813903809, + "learning_rate": 0.0003145820062408622, + "loss": 0.7427, + "step": 41230 + }, + { + "epoch": 0.9179131054131054, + "grad_norm": 0.7794041633605957, + "learning_rate": 0.0003145437906930865, + "loss": 0.6781, + "step": 41240 + }, + { + "epoch": 0.9181356837606838, + "grad_norm": 0.5164105892181396, + "learning_rate": 0.00031450556892090753, + "loss": 0.5227, + "step": 41250 + }, + { + "epoch": 0.9183582621082621, + "grad_norm": 0.5497879385948181, + "learning_rate": 0.00031446734092640226, + "loss": 0.6573, + "step": 41260 + }, + { + "epoch": 0.9185808404558404, + "grad_norm": 0.5047428011894226, + "learning_rate": 0.0003144291067116479, + "loss": 0.6054, + "step": 41270 + }, + { + "epoch": 0.9188034188034188, + "grad_norm": 0.6174345016479492, + "learning_rate": 0.0003143908662787222, + "loss": 0.6215, + "step": 41280 + }, + { + "epoch": 0.9190259971509972, + "grad_norm": 0.6649008393287659, + "learning_rate": 0.0003143526196297032, + "loss": 0.5482, + "step": 41290 + }, + { + "epoch": 0.9192485754985755, + "grad_norm": 0.7139372825622559, + "learning_rate": 0.0003143143667666692, + "loss": 0.5291, + "step": 41300 + }, + { + "epoch": 0.9194711538461539, + "grad_norm": 0.661760151386261, + "learning_rate": 0.0003142761076916989, + "loss": 0.6053, + "step": 41310 + }, + { + "epoch": 0.9196937321937322, + "grad_norm": 0.5792003870010376, + "learning_rate": 0.0003142378424068715, + "loss": 0.5836, + "step": 41320 + }, + { + "epoch": 0.9199163105413105, + "grad_norm": 0.8285216093063354, + "learning_rate": 0.0003141995709142662, + "loss": 0.511, + "step": 41330 + }, + { + "epoch": 0.9201388888888888, + "grad_norm": 0.7349622845649719, + "learning_rate": 0.00031416129321596266, + "loss": 0.735, + "step": 41340 + }, + { + "epoch": 0.9203614672364673, + "grad_norm": 0.6661806106567383, + "learning_rate": 0.00031412300931404094, + "loss": 0.5323, + "step": 41350 + }, + { + "epoch": 0.9205840455840456, + "grad_norm": 0.6408804655075073, + "learning_rate": 0.0003140847192105815, + "loss": 0.5903, + "step": 41360 + }, + { + "epoch": 0.9208066239316239, + "grad_norm": 0.9972931742668152, + "learning_rate": 0.00031404642290766506, + "loss": 0.6596, + "step": 41370 + }, + { + "epoch": 0.9210292022792023, + "grad_norm": 0.6347147822380066, + "learning_rate": 0.00031400812040737266, + "loss": 0.5938, + "step": 41380 + }, + { + "epoch": 0.9212517806267806, + "grad_norm": 0.509615421295166, + "learning_rate": 0.00031396981171178567, + "loss": 0.7123, + "step": 41390 + }, + { + "epoch": 0.9214743589743589, + "grad_norm": 0.42907026410102844, + "learning_rate": 0.00031393149682298573, + "loss": 0.4879, + "step": 41400 + }, + { + "epoch": 0.9216969373219374, + "grad_norm": 0.5011876225471497, + "learning_rate": 0.000313893175743055, + "loss": 0.5529, + "step": 41410 + }, + { + "epoch": 0.9219195156695157, + "grad_norm": 0.5521091222763062, + "learning_rate": 0.0003138548484740759, + "loss": 0.5901, + "step": 41420 + }, + { + "epoch": 0.922142094017094, + "grad_norm": 0.6726789474487305, + "learning_rate": 0.00031381651501813113, + "loss": 0.568, + "step": 41430 + }, + { + "epoch": 0.9223646723646723, + "grad_norm": 0.479596883058548, + "learning_rate": 0.00031377817537730373, + "loss": 0.6254, + "step": 41440 + }, + { + "epoch": 0.9225872507122507, + "grad_norm": 0.8564170598983765, + "learning_rate": 0.00031373982955367716, + "loss": 0.4415, + "step": 41450 + }, + { + "epoch": 0.9228098290598291, + "grad_norm": 0.8632370233535767, + "learning_rate": 0.0003137014775493352, + "loss": 0.4761, + "step": 41460 + }, + { + "epoch": 0.9230324074074074, + "grad_norm": 0.674905002117157, + "learning_rate": 0.0003136631193663618, + "loss": 0.6691, + "step": 41470 + }, + { + "epoch": 0.9232549857549858, + "grad_norm": 0.8387262225151062, + "learning_rate": 0.0003136247550068414, + "loss": 0.6602, + "step": 41480 + }, + { + "epoch": 0.9234775641025641, + "grad_norm": 0.7182347774505615, + "learning_rate": 0.00031358638447285885, + "loss": 0.4812, + "step": 41490 + }, + { + "epoch": 0.9237001424501424, + "grad_norm": 0.8412067294120789, + "learning_rate": 0.0003135480077664992, + "loss": 0.5495, + "step": 41500 + }, + { + "epoch": 0.9239227207977208, + "grad_norm": 0.6454422473907471, + "learning_rate": 0.00031350962488984783, + "loss": 0.6778, + "step": 41510 + }, + { + "epoch": 0.9241452991452992, + "grad_norm": 0.8959324955940247, + "learning_rate": 0.0003134712358449905, + "loss": 0.6016, + "step": 41520 + }, + { + "epoch": 0.9243678774928775, + "grad_norm": 0.6741381287574768, + "learning_rate": 0.0003134328406340134, + "loss": 0.4758, + "step": 41530 + }, + { + "epoch": 0.9245904558404558, + "grad_norm": 0.6390167474746704, + "learning_rate": 0.0003133944392590028, + "loss": 0.492, + "step": 41540 + }, + { + "epoch": 0.9248130341880342, + "grad_norm": 0.6453841924667358, + "learning_rate": 0.00031335603172204555, + "loss": 0.5103, + "step": 41550 + }, + { + "epoch": 0.9250356125356125, + "grad_norm": 0.6824930310249329, + "learning_rate": 0.0003133176180252288, + "loss": 0.5694, + "step": 41560 + }, + { + "epoch": 0.9252581908831908, + "grad_norm": 0.8084796071052551, + "learning_rate": 0.0003132791981706398, + "loss": 0.5371, + "step": 41570 + }, + { + "epoch": 0.9254807692307693, + "grad_norm": 0.6897899508476257, + "learning_rate": 0.0003132407721603666, + "loss": 0.6974, + "step": 41580 + }, + { + "epoch": 0.9257033475783476, + "grad_norm": 0.8142766952514648, + "learning_rate": 0.000313202339996497, + "loss": 0.5623, + "step": 41590 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 1.095940351486206, + "learning_rate": 0.0003131639016811196, + "loss": 0.7174, + "step": 41600 + }, + { + "epoch": 0.9261485042735043, + "grad_norm": 1.131062388420105, + "learning_rate": 0.00031312545721632316, + "loss": 0.6568, + "step": 41610 + }, + { + "epoch": 0.9263710826210826, + "grad_norm": 0.42202919721603394, + "learning_rate": 0.00031308700660419677, + "loss": 0.4689, + "step": 41620 + }, + { + "epoch": 0.9265936609686609, + "grad_norm": 0.3474844992160797, + "learning_rate": 0.0003130485498468299, + "loss": 0.5296, + "step": 41630 + }, + { + "epoch": 0.9268162393162394, + "grad_norm": 0.6784580945968628, + "learning_rate": 0.0003130100869463122, + "loss": 0.5368, + "step": 41640 + }, + { + "epoch": 0.9270388176638177, + "grad_norm": 0.5489214062690735, + "learning_rate": 0.00031297161790473394, + "loss": 0.589, + "step": 41650 + }, + { + "epoch": 0.927261396011396, + "grad_norm": 0.7951946258544922, + "learning_rate": 0.0003129331427241855, + "loss": 0.6161, + "step": 41660 + }, + { + "epoch": 0.9274839743589743, + "grad_norm": 0.9211964011192322, + "learning_rate": 0.00031289466140675757, + "loss": 0.5625, + "step": 41670 + }, + { + "epoch": 0.9277065527065527, + "grad_norm": 0.7942441701889038, + "learning_rate": 0.0003128561739545413, + "loss": 0.7409, + "step": 41680 + }, + { + "epoch": 0.9279291310541311, + "grad_norm": 0.5457864999771118, + "learning_rate": 0.0003128176803696282, + "loss": 0.637, + "step": 41690 + }, + { + "epoch": 0.9281517094017094, + "grad_norm": 0.7595346570014954, + "learning_rate": 0.00031277918065410995, + "loss": 0.4984, + "step": 41700 + }, + { + "epoch": 0.9283742877492878, + "grad_norm": 0.6316278576850891, + "learning_rate": 0.00031274067481007874, + "loss": 0.6209, + "step": 41710 + }, + { + "epoch": 0.9285968660968661, + "grad_norm": 0.5016762018203735, + "learning_rate": 0.00031270216283962697, + "loss": 0.5954, + "step": 41720 + }, + { + "epoch": 0.9288194444444444, + "grad_norm": 0.6339188814163208, + "learning_rate": 0.0003126636447448473, + "loss": 0.6837, + "step": 41730 + }, + { + "epoch": 0.9290420227920227, + "grad_norm": 0.7948164343833923, + "learning_rate": 0.0003126251205278331, + "loss": 0.6081, + "step": 41740 + }, + { + "epoch": 0.9292646011396012, + "grad_norm": 0.6831077337265015, + "learning_rate": 0.0003125865901906776, + "loss": 0.6106, + "step": 41750 + }, + { + "epoch": 0.9294871794871795, + "grad_norm": 0.7580824494361877, + "learning_rate": 0.00031254805373547457, + "loss": 0.5183, + "step": 41760 + }, + { + "epoch": 0.9297097578347578, + "grad_norm": 0.6469782590866089, + "learning_rate": 0.0003125095111643182, + "loss": 0.602, + "step": 41770 + }, + { + "epoch": 0.9299323361823362, + "grad_norm": 0.6362723112106323, + "learning_rate": 0.0003124709624793029, + "loss": 0.673, + "step": 41780 + }, + { + "epoch": 0.9301549145299145, + "grad_norm": 0.6553264856338501, + "learning_rate": 0.0003124324076825234, + "loss": 0.5923, + "step": 41790 + }, + { + "epoch": 0.9303774928774928, + "grad_norm": 0.6719655394554138, + "learning_rate": 0.0003123938467760748, + "loss": 0.5773, + "step": 41800 + }, + { + "epoch": 0.9306000712250713, + "grad_norm": 0.46676400303840637, + "learning_rate": 0.0003123552797620526, + "loss": 0.6049, + "step": 41810 + }, + { + "epoch": 0.9308226495726496, + "grad_norm": 0.8152366876602173, + "learning_rate": 0.0003123167066425525, + "loss": 0.5892, + "step": 41820 + }, + { + "epoch": 0.9310452279202279, + "grad_norm": 0.7665014266967773, + "learning_rate": 0.00031227812741967066, + "loss": 0.5525, + "step": 41830 + }, + { + "epoch": 0.9312678062678063, + "grad_norm": 0.9132956862449646, + "learning_rate": 0.00031223954209550343, + "loss": 0.5866, + "step": 41840 + }, + { + "epoch": 0.9314903846153846, + "grad_norm": 0.5664402842521667, + "learning_rate": 0.0003122009506721476, + "loss": 0.5722, + "step": 41850 + }, + { + "epoch": 0.9317129629629629, + "grad_norm": 0.6909135580062866, + "learning_rate": 0.00031216235315170026, + "loss": 0.5854, + "step": 41860 + }, + { + "epoch": 0.9319355413105413, + "grad_norm": 0.6950509548187256, + "learning_rate": 0.00031212374953625883, + "loss": 0.6248, + "step": 41870 + }, + { + "epoch": 0.9321581196581197, + "grad_norm": 0.800899863243103, + "learning_rate": 0.0003120851398279211, + "loss": 0.6557, + "step": 41880 + }, + { + "epoch": 0.932380698005698, + "grad_norm": 0.8420230150222778, + "learning_rate": 0.00031204652402878506, + "loss": 0.6, + "step": 41890 + }, + { + "epoch": 0.9326032763532763, + "grad_norm": 0.6417230367660522, + "learning_rate": 0.0003120079021409492, + "loss": 0.7307, + "step": 41900 + }, + { + "epoch": 0.9328258547008547, + "grad_norm": 0.8272371292114258, + "learning_rate": 0.0003119692741665122, + "loss": 0.7223, + "step": 41910 + }, + { + "epoch": 0.9330484330484331, + "grad_norm": 0.7413893342018127, + "learning_rate": 0.0003119306401075732, + "loss": 0.6794, + "step": 41920 + }, + { + "epoch": 0.9332710113960114, + "grad_norm": 0.51690673828125, + "learning_rate": 0.0003118919999662316, + "loss": 0.5408, + "step": 41930 + }, + { + "epoch": 0.9334935897435898, + "grad_norm": 0.5625960230827332, + "learning_rate": 0.00031185335374458713, + "loss": 0.6769, + "step": 41940 + }, + { + "epoch": 0.9337161680911681, + "grad_norm": 0.8159250020980835, + "learning_rate": 0.00031181470144473985, + "loss": 0.6423, + "step": 41950 + }, + { + "epoch": 0.9339387464387464, + "grad_norm": 0.6201074123382568, + "learning_rate": 0.0003117760430687901, + "loss": 0.4813, + "step": 41960 + }, + { + "epoch": 0.9341613247863247, + "grad_norm": 0.6656062006950378, + "learning_rate": 0.00031173737861883873, + "loss": 0.5664, + "step": 41970 + }, + { + "epoch": 0.9343839031339032, + "grad_norm": 0.6236233115196228, + "learning_rate": 0.0003116987080969867, + "loss": 0.5341, + "step": 41980 + }, + { + "epoch": 0.9346064814814815, + "grad_norm": 0.5731833577156067, + "learning_rate": 0.0003116600315053355, + "loss": 0.566, + "step": 41990 + }, + { + "epoch": 0.9348290598290598, + "grad_norm": 0.724296510219574, + "learning_rate": 0.00031162134884598676, + "loss": 0.6318, + "step": 42000 + }, + { + "epoch": 0.9350516381766382, + "grad_norm": 0.8767474889755249, + "learning_rate": 0.0003115826601210425, + "loss": 0.5942, + "step": 42010 + }, + { + "epoch": 0.9352742165242165, + "grad_norm": 0.6816518306732178, + "learning_rate": 0.00031154396533260515, + "loss": 0.5605, + "step": 42020 + }, + { + "epoch": 0.9354967948717948, + "grad_norm": 0.8070791959762573, + "learning_rate": 0.0003115052644827774, + "loss": 0.5516, + "step": 42030 + }, + { + "epoch": 0.9357193732193733, + "grad_norm": 0.6746892333030701, + "learning_rate": 0.0003114665575736623, + "loss": 0.5388, + "step": 42040 + }, + { + "epoch": 0.9359419515669516, + "grad_norm": 0.6569458842277527, + "learning_rate": 0.00031142784460736324, + "loss": 0.6843, + "step": 42050 + }, + { + "epoch": 0.9361645299145299, + "grad_norm": 0.8369061350822449, + "learning_rate": 0.0003113891255859839, + "loss": 0.6018, + "step": 42060 + }, + { + "epoch": 0.9363871082621082, + "grad_norm": 0.692882776260376, + "learning_rate": 0.00031135040051162826, + "loss": 0.6985, + "step": 42070 + }, + { + "epoch": 0.9366096866096866, + "grad_norm": 0.5424764752388, + "learning_rate": 0.0003113116693864007, + "loss": 0.4902, + "step": 42080 + }, + { + "epoch": 0.9368322649572649, + "grad_norm": 0.5952123999595642, + "learning_rate": 0.00031127293221240587, + "loss": 0.6561, + "step": 42090 + }, + { + "epoch": 0.9370548433048433, + "grad_norm": 0.6257585287094116, + "learning_rate": 0.0003112341889917488, + "loss": 0.637, + "step": 42100 + }, + { + "epoch": 0.9372774216524217, + "grad_norm": 0.6995611786842346, + "learning_rate": 0.0003111954397265349, + "loss": 0.5221, + "step": 42110 + }, + { + "epoch": 0.9375, + "grad_norm": 0.62469482421875, + "learning_rate": 0.00031115668441886976, + "loss": 0.6586, + "step": 42120 + }, + { + "epoch": 0.9377225783475783, + "grad_norm": 0.7522687911987305, + "learning_rate": 0.0003111179230708594, + "loss": 0.6675, + "step": 42130 + }, + { + "epoch": 0.9379451566951567, + "grad_norm": 0.44003018736839294, + "learning_rate": 0.00031107915568461014, + "loss": 0.6966, + "step": 42140 + }, + { + "epoch": 0.9381677350427351, + "grad_norm": 0.4076796770095825, + "learning_rate": 0.0003110403822622286, + "loss": 0.6358, + "step": 42150 + }, + { + "epoch": 0.9383903133903134, + "grad_norm": 0.7261608839035034, + "learning_rate": 0.0003110016028058218, + "loss": 0.6083, + "step": 42160 + }, + { + "epoch": 0.9386128917378918, + "grad_norm": 0.5442433953285217, + "learning_rate": 0.000310962817317497, + "loss": 0.5734, + "step": 42170 + }, + { + "epoch": 0.9388354700854701, + "grad_norm": 0.5819172263145447, + "learning_rate": 0.0003109240257993619, + "loss": 0.5775, + "step": 42180 + }, + { + "epoch": 0.9390580484330484, + "grad_norm": 0.9003576636314392, + "learning_rate": 0.00031088522825352443, + "loss": 0.5348, + "step": 42190 + }, + { + "epoch": 0.9392806267806267, + "grad_norm": 0.6950390934944153, + "learning_rate": 0.00031084642468209286, + "loss": 0.5162, + "step": 42200 + }, + { + "epoch": 0.9395032051282052, + "grad_norm": 0.4344139099121094, + "learning_rate": 0.0003108076150871759, + "loss": 0.5719, + "step": 42210 + }, + { + "epoch": 0.9397257834757835, + "grad_norm": 1.0126549005508423, + "learning_rate": 0.00031076879947088235, + "loss": 0.6178, + "step": 42220 + }, + { + "epoch": 0.9399483618233618, + "grad_norm": 0.7256695628166199, + "learning_rate": 0.0003107299778353216, + "loss": 0.5811, + "step": 42230 + }, + { + "epoch": 0.9401709401709402, + "grad_norm": 0.9113925099372864, + "learning_rate": 0.00031069115018260315, + "loss": 0.574, + "step": 42240 + }, + { + "epoch": 0.9403935185185185, + "grad_norm": 0.5826119780540466, + "learning_rate": 0.0003106523165148371, + "loss": 0.5027, + "step": 42250 + }, + { + "epoch": 0.9406160968660968, + "grad_norm": 0.8413066864013672, + "learning_rate": 0.0003106134768341335, + "loss": 0.6589, + "step": 42260 + }, + { + "epoch": 0.9408386752136753, + "grad_norm": 0.5362156629562378, + "learning_rate": 0.000310574631142603, + "loss": 0.4235, + "step": 42270 + }, + { + "epoch": 0.9410612535612536, + "grad_norm": 0.7126069068908691, + "learning_rate": 0.00031053577944235654, + "loss": 0.5982, + "step": 42280 + }, + { + "epoch": 0.9412838319088319, + "grad_norm": 0.9489959478378296, + "learning_rate": 0.0003104969217355053, + "loss": 0.5615, + "step": 42290 + }, + { + "epoch": 0.9415064102564102, + "grad_norm": 0.8428272008895874, + "learning_rate": 0.00031045805802416094, + "loss": 0.6999, + "step": 42300 + }, + { + "epoch": 0.9417289886039886, + "grad_norm": 0.39634135365486145, + "learning_rate": 0.0003104191883104353, + "loss": 0.8428, + "step": 42310 + }, + { + "epoch": 0.9419515669515669, + "grad_norm": 0.7322662472724915, + "learning_rate": 0.00031038031259644056, + "loss": 0.6709, + "step": 42320 + }, + { + "epoch": 0.9421741452991453, + "grad_norm": 0.875149130821228, + "learning_rate": 0.00031034143088428924, + "loss": 0.678, + "step": 42330 + }, + { + "epoch": 0.9423967236467237, + "grad_norm": 0.7805944681167603, + "learning_rate": 0.00031030254317609426, + "loss": 0.6248, + "step": 42340 + }, + { + "epoch": 0.942619301994302, + "grad_norm": 0.6153649091720581, + "learning_rate": 0.0003102636494739687, + "loss": 0.7148, + "step": 42350 + }, + { + "epoch": 0.9428418803418803, + "grad_norm": 0.4850127100944519, + "learning_rate": 0.00031022474978002626, + "loss": 0.5177, + "step": 42360 + }, + { + "epoch": 0.9430644586894587, + "grad_norm": 0.5072647929191589, + "learning_rate": 0.00031018584409638067, + "loss": 0.5377, + "step": 42370 + }, + { + "epoch": 0.9432870370370371, + "grad_norm": 0.5038172006607056, + "learning_rate": 0.0003101469324251461, + "loss": 0.5688, + "step": 42380 + }, + { + "epoch": 0.9435096153846154, + "grad_norm": 0.736919641494751, + "learning_rate": 0.00031010801476843704, + "loss": 0.7235, + "step": 42390 + }, + { + "epoch": 0.9437321937321937, + "grad_norm": 0.6360201239585876, + "learning_rate": 0.0003100690911283683, + "loss": 0.5508, + "step": 42400 + }, + { + "epoch": 0.9439547720797721, + "grad_norm": 0.4708540439605713, + "learning_rate": 0.0003100301615070551, + "loss": 0.5771, + "step": 42410 + }, + { + "epoch": 0.9441773504273504, + "grad_norm": 0.4812738001346588, + "learning_rate": 0.0003099912259066128, + "loss": 0.5551, + "step": 42420 + }, + { + "epoch": 0.9443999287749287, + "grad_norm": 0.5777409076690674, + "learning_rate": 0.0003099522843291572, + "loss": 0.6947, + "step": 42430 + }, + { + "epoch": 0.9446225071225072, + "grad_norm": 0.6029115319252014, + "learning_rate": 0.00030991333677680456, + "loss": 0.6122, + "step": 42440 + }, + { + "epoch": 0.9448450854700855, + "grad_norm": 0.6521156430244446, + "learning_rate": 0.00030987438325167114, + "loss": 0.576, + "step": 42450 + }, + { + "epoch": 0.9450676638176638, + "grad_norm": 0.7701613306999207, + "learning_rate": 0.0003098354237558739, + "loss": 0.513, + "step": 42460 + }, + { + "epoch": 0.9452902421652422, + "grad_norm": 0.5667462944984436, + "learning_rate": 0.00030979645829152966, + "loss": 0.6486, + "step": 42470 + }, + { + "epoch": 0.9455128205128205, + "grad_norm": 0.8962845206260681, + "learning_rate": 0.0003097574868607561, + "loss": 0.5852, + "step": 42480 + }, + { + "epoch": 0.9457353988603988, + "grad_norm": 0.5495222210884094, + "learning_rate": 0.00030971850946567083, + "loss": 0.5829, + "step": 42490 + }, + { + "epoch": 0.9459579772079773, + "grad_norm": 0.6287023425102234, + "learning_rate": 0.0003096795261083919, + "loss": 0.6074, + "step": 42500 + }, + { + "epoch": 0.9461805555555556, + "grad_norm": 0.60505211353302, + "learning_rate": 0.00030964053679103775, + "loss": 0.5515, + "step": 42510 + }, + { + "epoch": 0.9464031339031339, + "grad_norm": 0.6437878608703613, + "learning_rate": 0.0003096015415157271, + "loss": 0.588, + "step": 42520 + }, + { + "epoch": 0.9466257122507122, + "grad_norm": 0.6274718642234802, + "learning_rate": 0.000309562540284579, + "loss": 0.5576, + "step": 42530 + }, + { + "epoch": 0.9468482905982906, + "grad_norm": 0.6374098062515259, + "learning_rate": 0.0003095235330997127, + "loss": 0.5445, + "step": 42540 + }, + { + "epoch": 0.9470708689458689, + "grad_norm": 0.8956291675567627, + "learning_rate": 0.000309484519963248, + "loss": 0.5241, + "step": 42550 + }, + { + "epoch": 0.9472934472934473, + "grad_norm": 0.8904242515563965, + "learning_rate": 0.0003094455008773048, + "loss": 0.7352, + "step": 42560 + }, + { + "epoch": 0.9475160256410257, + "grad_norm": 0.9804019331932068, + "learning_rate": 0.0003094064758440035, + "loss": 0.639, + "step": 42570 + }, + { + "epoch": 0.947738603988604, + "grad_norm": 0.5462363958358765, + "learning_rate": 0.0003093674448654648, + "loss": 0.6243, + "step": 42580 + }, + { + "epoch": 0.9479611823361823, + "grad_norm": 0.641595721244812, + "learning_rate": 0.00030932840794380953, + "loss": 0.6183, + "step": 42590 + }, + { + "epoch": 0.9481837606837606, + "grad_norm": 0.4465916156768799, + "learning_rate": 0.00030928936508115907, + "loss": 0.5259, + "step": 42600 + }, + { + "epoch": 0.9484063390313391, + "grad_norm": 0.7366055846214294, + "learning_rate": 0.0003092503162796351, + "loss": 0.7196, + "step": 42610 + }, + { + "epoch": 0.9486289173789174, + "grad_norm": 0.5737682580947876, + "learning_rate": 0.0003092112615413595, + "loss": 0.5638, + "step": 42620 + }, + { + "epoch": 0.9488514957264957, + "grad_norm": 0.6066577434539795, + "learning_rate": 0.0003091722008684545, + "loss": 0.6529, + "step": 42630 + }, + { + "epoch": 0.9490740740740741, + "grad_norm": 0.8688057661056519, + "learning_rate": 0.0003091331342630428, + "loss": 0.6387, + "step": 42640 + }, + { + "epoch": 0.9492966524216524, + "grad_norm": 0.749236524105072, + "learning_rate": 0.00030909406172724717, + "loss": 0.481, + "step": 42650 + }, + { + "epoch": 0.9495192307692307, + "grad_norm": 0.4578602910041809, + "learning_rate": 0.00030905498326319093, + "loss": 0.6296, + "step": 42660 + }, + { + "epoch": 0.9497418091168092, + "grad_norm": 0.9280728697776794, + "learning_rate": 0.0003090158988729977, + "loss": 0.5446, + "step": 42670 + }, + { + "epoch": 0.9499643874643875, + "grad_norm": 0.7775986790657043, + "learning_rate": 0.0003089768085587912, + "loss": 0.591, + "step": 42680 + }, + { + "epoch": 0.9501869658119658, + "grad_norm": 0.8049306869506836, + "learning_rate": 0.00030893771232269574, + "loss": 0.6957, + "step": 42690 + }, + { + "epoch": 0.9504095441595442, + "grad_norm": 0.6778254508972168, + "learning_rate": 0.0003088986101668358, + "loss": 0.6916, + "step": 42700 + }, + { + "epoch": 0.9506321225071225, + "grad_norm": 0.5649783611297607, + "learning_rate": 0.00030885950209333623, + "loss": 0.577, + "step": 42710 + }, + { + "epoch": 0.9508547008547008, + "grad_norm": 0.4995388984680176, + "learning_rate": 0.00030882038810432224, + "loss": 0.5385, + "step": 42720 + }, + { + "epoch": 0.9510772792022792, + "grad_norm": 0.492776095867157, + "learning_rate": 0.0003087812682019192, + "loss": 0.6314, + "step": 42730 + }, + { + "epoch": 0.9512998575498576, + "grad_norm": 0.4935969412326813, + "learning_rate": 0.0003087421423882531, + "loss": 0.6476, + "step": 42740 + }, + { + "epoch": 0.9515224358974359, + "grad_norm": 0.6174660921096802, + "learning_rate": 0.0003087030106654499, + "loss": 0.5734, + "step": 42750 + }, + { + "epoch": 0.9517450142450142, + "grad_norm": 0.799747884273529, + "learning_rate": 0.0003086638730356361, + "loss": 0.5207, + "step": 42760 + }, + { + "epoch": 0.9519675925925926, + "grad_norm": 0.5621200203895569, + "learning_rate": 0.0003086247295009385, + "loss": 0.7379, + "step": 42770 + }, + { + "epoch": 0.9521901709401709, + "grad_norm": 0.867217481136322, + "learning_rate": 0.00030858558006348417, + "loss": 0.5859, + "step": 42780 + }, + { + "epoch": 0.9524127492877493, + "grad_norm": 0.6303733587265015, + "learning_rate": 0.0003085464247254005, + "loss": 0.5096, + "step": 42790 + }, + { + "epoch": 0.9526353276353277, + "grad_norm": 0.7135583758354187, + "learning_rate": 0.0003085072634888153, + "loss": 0.5051, + "step": 42800 + }, + { + "epoch": 0.952857905982906, + "grad_norm": 0.7252805233001709, + "learning_rate": 0.00030846809635585656, + "loss": 0.6157, + "step": 42810 + }, + { + "epoch": 0.9530804843304843, + "grad_norm": 0.672892689704895, + "learning_rate": 0.00030842892332865265, + "loss": 0.6354, + "step": 42820 + }, + { + "epoch": 0.9533030626780626, + "grad_norm": 0.5883125066757202, + "learning_rate": 0.0003083897444093323, + "loss": 0.4532, + "step": 42830 + }, + { + "epoch": 0.9535256410256411, + "grad_norm": 0.5267878770828247, + "learning_rate": 0.00030835055960002456, + "loss": 0.4659, + "step": 42840 + }, + { + "epoch": 0.9537482193732194, + "grad_norm": 0.9347203373908997, + "learning_rate": 0.00030831136890285864, + "loss": 0.5361, + "step": 42850 + }, + { + "epoch": 0.9539707977207977, + "grad_norm": 0.3487144410610199, + "learning_rate": 0.0003082721723199643, + "loss": 0.5186, + "step": 42860 + }, + { + "epoch": 0.9541933760683761, + "grad_norm": 0.7461722493171692, + "learning_rate": 0.00030823296985347143, + "loss": 0.5533, + "step": 42870 + }, + { + "epoch": 0.9544159544159544, + "grad_norm": 0.5731413960456848, + "learning_rate": 0.0003081937615055104, + "loss": 0.4873, + "step": 42880 + }, + { + "epoch": 0.9546385327635327, + "grad_norm": 0.7203046083450317, + "learning_rate": 0.0003081545472782118, + "loss": 0.604, + "step": 42890 + }, + { + "epoch": 0.9548611111111112, + "grad_norm": 0.802855372428894, + "learning_rate": 0.00030811532717370656, + "loss": 0.5076, + "step": 42900 + }, + { + "epoch": 0.9550836894586895, + "grad_norm": 0.5875809192657471, + "learning_rate": 0.00030807610119412595, + "loss": 0.559, + "step": 42910 + }, + { + "epoch": 0.9553062678062678, + "grad_norm": 0.5268616676330566, + "learning_rate": 0.00030803686934160145, + "loss": 0.502, + "step": 42920 + }, + { + "epoch": 0.9555288461538461, + "grad_norm": 0.9944384694099426, + "learning_rate": 0.00030799763161826514, + "loss": 0.7, + "step": 42930 + }, + { + "epoch": 0.9557514245014245, + "grad_norm": 1.0348243713378906, + "learning_rate": 0.00030795838802624904, + "loss": 0.6141, + "step": 42940 + }, + { + "epoch": 0.9559740028490028, + "grad_norm": 0.5023366808891296, + "learning_rate": 0.00030791913856768573, + "loss": 0.541, + "step": 42950 + }, + { + "epoch": 0.9561965811965812, + "grad_norm": 0.9290866851806641, + "learning_rate": 0.0003078798832447082, + "loss": 0.7567, + "step": 42960 + }, + { + "epoch": 0.9564191595441596, + "grad_norm": 0.5600704550743103, + "learning_rate": 0.00030784062205944945, + "loss": 0.5905, + "step": 42970 + }, + { + "epoch": 0.9566417378917379, + "grad_norm": 0.559613823890686, + "learning_rate": 0.000307801355014043, + "loss": 0.5237, + "step": 42980 + }, + { + "epoch": 0.9568643162393162, + "grad_norm": 0.5399856567382812, + "learning_rate": 0.0003077620821106226, + "loss": 0.6838, + "step": 42990 + }, + { + "epoch": 0.9570868945868946, + "grad_norm": 0.42649152874946594, + "learning_rate": 0.0003077228033513226, + "loss": 0.6487, + "step": 43000 + }, + { + "epoch": 0.9573094729344729, + "grad_norm": 0.8213827610015869, + "learning_rate": 0.00030768351873827716, + "loss": 0.5688, + "step": 43010 + }, + { + "epoch": 0.9575320512820513, + "grad_norm": 0.6761860847473145, + "learning_rate": 0.0003076442282736212, + "loss": 0.6261, + "step": 43020 + }, + { + "epoch": 0.9577546296296297, + "grad_norm": 0.6132062673568726, + "learning_rate": 0.0003076049319594898, + "loss": 0.6003, + "step": 43030 + }, + { + "epoch": 0.957977207977208, + "grad_norm": 0.3853282034397125, + "learning_rate": 0.0003075656297980182, + "loss": 0.5995, + "step": 43040 + }, + { + "epoch": 0.9581997863247863, + "grad_norm": 0.4675573408603668, + "learning_rate": 0.00030752632179134224, + "loss": 0.6665, + "step": 43050 + }, + { + "epoch": 0.9584223646723646, + "grad_norm": 0.5293225646018982, + "learning_rate": 0.00030748700794159796, + "loss": 0.5195, + "step": 43060 + }, + { + "epoch": 0.9586449430199431, + "grad_norm": 0.7288942337036133, + "learning_rate": 0.00030744768825092167, + "loss": 0.6372, + "step": 43070 + }, + { + "epoch": 0.9588675213675214, + "grad_norm": 0.8237767815589905, + "learning_rate": 0.00030740836272145005, + "loss": 0.6325, + "step": 43080 + }, + { + "epoch": 0.9590900997150997, + "grad_norm": 0.7287095785140991, + "learning_rate": 0.00030736903135532, + "loss": 0.5934, + "step": 43090 + }, + { + "epoch": 0.9593126780626781, + "grad_norm": 0.8140029311180115, + "learning_rate": 0.00030732969415466903, + "loss": 0.5926, + "step": 43100 + }, + { + "epoch": 0.9595352564102564, + "grad_norm": 0.6723431944847107, + "learning_rate": 0.00030729035112163453, + "loss": 0.5835, + "step": 43110 + }, + { + "epoch": 0.9597578347578347, + "grad_norm": 0.5774185657501221, + "learning_rate": 0.0003072510022583545, + "loss": 0.6066, + "step": 43120 + }, + { + "epoch": 0.9599804131054132, + "grad_norm": 0.8784375190734863, + "learning_rate": 0.0003072116475669672, + "loss": 0.665, + "step": 43130 + }, + { + "epoch": 0.9601139601139601, + "eval_loss": 0.6042813658714294, + "eval_runtime": 337.231, + "eval_samples_per_second": 7.013, + "eval_steps_per_second": 7.013, + "step": 43136 + }, + { + "epoch": 0.9602029914529915, + "grad_norm": 0.7646319270133972, + "learning_rate": 0.0003071722870496113, + "loss": 0.6678, + "step": 43140 + }, + { + "epoch": 0.9604255698005698, + "grad_norm": 0.5818307399749756, + "learning_rate": 0.0003071329207084255, + "loss": 0.5272, + "step": 43150 + }, + { + "epoch": 0.9606481481481481, + "grad_norm": 0.65185546875, + "learning_rate": 0.0003070935485455491, + "loss": 0.6083, + "step": 43160 + }, + { + "epoch": 0.9608707264957265, + "grad_norm": 0.8646022081375122, + "learning_rate": 0.00030705417056312164, + "loss": 0.66, + "step": 43170 + }, + { + "epoch": 0.9610933048433048, + "grad_norm": 0.4757365882396698, + "learning_rate": 0.0003070147867632829, + "loss": 0.5159, + "step": 43180 + }, + { + "epoch": 0.9613158831908832, + "grad_norm": 0.6112939715385437, + "learning_rate": 0.00030697539714817304, + "loss": 0.5295, + "step": 43190 + }, + { + "epoch": 0.9615384615384616, + "grad_norm": 0.8913406133651733, + "learning_rate": 0.00030693600171993247, + "loss": 0.5462, + "step": 43200 + }, + { + "epoch": 0.9617610398860399, + "grad_norm": 0.8653389811515808, + "learning_rate": 0.0003068966004807021, + "loss": 0.6887, + "step": 43210 + }, + { + "epoch": 0.9619836182336182, + "grad_norm": 0.8796609044075012, + "learning_rate": 0.00030685719343262284, + "loss": 0.8114, + "step": 43220 + }, + { + "epoch": 0.9622061965811965, + "grad_norm": 0.5240429639816284, + "learning_rate": 0.0003068177805778364, + "loss": 0.5522, + "step": 43230 + }, + { + "epoch": 0.9624287749287749, + "grad_norm": 0.6439476609230042, + "learning_rate": 0.0003067783619184842, + "loss": 0.605, + "step": 43240 + }, + { + "epoch": 0.9626513532763533, + "grad_norm": 0.851402223110199, + "learning_rate": 0.0003067389374567084, + "loss": 0.5514, + "step": 43250 + }, + { + "epoch": 0.9628739316239316, + "grad_norm": 0.8971279263496399, + "learning_rate": 0.0003066995071946513, + "loss": 0.6248, + "step": 43260 + }, + { + "epoch": 0.96309650997151, + "grad_norm": 0.44522354006767273, + "learning_rate": 0.0003066600711344557, + "loss": 0.5892, + "step": 43270 + }, + { + "epoch": 0.9633190883190883, + "grad_norm": 0.6671565771102905, + "learning_rate": 0.0003066206292782645, + "loss": 0.7284, + "step": 43280 + }, + { + "epoch": 0.9635416666666666, + "grad_norm": 0.8489352464675903, + "learning_rate": 0.000306581181628221, + "loss": 0.4295, + "step": 43290 + }, + { + "epoch": 0.9637642450142451, + "grad_norm": 0.7675427794456482, + "learning_rate": 0.00030654172818646893, + "loss": 0.6216, + "step": 43300 + }, + { + "epoch": 0.9639868233618234, + "grad_norm": 0.9285682439804077, + "learning_rate": 0.0003065022689551521, + "loss": 0.7118, + "step": 43310 + }, + { + "epoch": 0.9642094017094017, + "grad_norm": 0.5413016676902771, + "learning_rate": 0.00030646280393641475, + "loss": 0.5437, + "step": 43320 + }, + { + "epoch": 0.96443198005698, + "grad_norm": 0.7694899439811707, + "learning_rate": 0.00030642333313240144, + "loss": 0.732, + "step": 43330 + }, + { + "epoch": 0.9646545584045584, + "grad_norm": 0.6586807370185852, + "learning_rate": 0.00030638385654525716, + "loss": 0.5091, + "step": 43340 + }, + { + "epoch": 0.9648771367521367, + "grad_norm": 0.4724593162536621, + "learning_rate": 0.000306344374177127, + "loss": 0.5117, + "step": 43350 + }, + { + "epoch": 0.9650997150997151, + "grad_norm": 0.8600608110427856, + "learning_rate": 0.00030630488603015655, + "loss": 0.546, + "step": 43360 + }, + { + "epoch": 0.9653222934472935, + "grad_norm": 0.6995387673377991, + "learning_rate": 0.0003062653921064915, + "loss": 0.5045, + "step": 43370 + }, + { + "epoch": 0.9655448717948718, + "grad_norm": 0.8419876098632812, + "learning_rate": 0.00030622589240827806, + "loss": 0.6705, + "step": 43380 + }, + { + "epoch": 0.9657674501424501, + "grad_norm": 0.5719366669654846, + "learning_rate": 0.00030618638693766264, + "loss": 0.6209, + "step": 43390 + }, + { + "epoch": 0.9659900284900285, + "grad_norm": 0.5143960118293762, + "learning_rate": 0.000306146875696792, + "loss": 0.5871, + "step": 43400 + }, + { + "epoch": 0.9662126068376068, + "grad_norm": 0.8171896934509277, + "learning_rate": 0.00030610735868781335, + "loss": 0.656, + "step": 43410 + }, + { + "epoch": 0.9664351851851852, + "grad_norm": 0.9602444767951965, + "learning_rate": 0.00030606783591287384, + "loss": 0.6494, + "step": 43420 + }, + { + "epoch": 0.9666577635327636, + "grad_norm": 0.8292070031166077, + "learning_rate": 0.0003060283073741214, + "loss": 0.5705, + "step": 43430 + }, + { + "epoch": 0.9668803418803419, + "grad_norm": 0.5162893533706665, + "learning_rate": 0.0003059887730737039, + "loss": 0.5762, + "step": 43440 + }, + { + "epoch": 0.9671029202279202, + "grad_norm": 1.0457990169525146, + "learning_rate": 0.00030594923301376976, + "loss": 0.6512, + "step": 43450 + }, + { + "epoch": 0.9673254985754985, + "grad_norm": 0.6627582311630249, + "learning_rate": 0.00030590968719646747, + "loss": 0.5665, + "step": 43460 + }, + { + "epoch": 0.9675480769230769, + "grad_norm": 0.6730947494506836, + "learning_rate": 0.0003058701356239461, + "loss": 0.5233, + "step": 43470 + }, + { + "epoch": 0.9677706552706553, + "grad_norm": 0.7079799175262451, + "learning_rate": 0.0003058305782983549, + "loss": 0.6293, + "step": 43480 + }, + { + "epoch": 0.9679932336182336, + "grad_norm": 0.6541440486907959, + "learning_rate": 0.0003057910152218435, + "loss": 0.5677, + "step": 43490 + }, + { + "epoch": 0.968215811965812, + "grad_norm": 0.5727503299713135, + "learning_rate": 0.00030575144639656164, + "loss": 0.6793, + "step": 43500 + }, + { + "epoch": 0.9684383903133903, + "grad_norm": 0.6658546924591064, + "learning_rate": 0.0003057118718246597, + "loss": 0.6498, + "step": 43510 + }, + { + "epoch": 0.9686609686609686, + "grad_norm": 1.0690386295318604, + "learning_rate": 0.000305672291508288, + "loss": 0.577, + "step": 43520 + }, + { + "epoch": 0.9688835470085471, + "grad_norm": 0.6312462091445923, + "learning_rate": 0.0003056327054495975, + "loss": 0.5497, + "step": 43530 + }, + { + "epoch": 0.9691061253561254, + "grad_norm": 0.49718987941741943, + "learning_rate": 0.0003055931136507393, + "loss": 0.5781, + "step": 43540 + }, + { + "epoch": 0.9693287037037037, + "grad_norm": 0.6868771910667419, + "learning_rate": 0.00030555351611386494, + "loss": 0.62, + "step": 43550 + }, + { + "epoch": 0.969551282051282, + "grad_norm": 0.5947691798210144, + "learning_rate": 0.0003055139128411261, + "loss": 0.4976, + "step": 43560 + }, + { + "epoch": 0.9697738603988604, + "grad_norm": 0.6794607043266296, + "learning_rate": 0.00030547430383467473, + "loss": 0.509, + "step": 43570 + }, + { + "epoch": 0.9699964387464387, + "grad_norm": 0.5065454840660095, + "learning_rate": 0.00030543468909666346, + "loss": 0.6121, + "step": 43580 + }, + { + "epoch": 0.9702190170940171, + "grad_norm": 0.7035951018333435, + "learning_rate": 0.00030539506862924484, + "loss": 0.7243, + "step": 43590 + }, + { + "epoch": 0.9704415954415955, + "grad_norm": 0.6408650279045105, + "learning_rate": 0.0003053554424345719, + "loss": 0.5817, + "step": 43600 + }, + { + "epoch": 0.9706641737891738, + "grad_norm": 0.5592657327651978, + "learning_rate": 0.00030531581051479803, + "loss": 0.623, + "step": 43610 + }, + { + "epoch": 0.9708867521367521, + "grad_norm": 0.6538555026054382, + "learning_rate": 0.00030527617287207673, + "loss": 0.4772, + "step": 43620 + }, + { + "epoch": 0.9711093304843305, + "grad_norm": 0.8865237236022949, + "learning_rate": 0.00030523652950856207, + "loss": 0.5248, + "step": 43630 + }, + { + "epoch": 0.9713319088319088, + "grad_norm": 0.7268497943878174, + "learning_rate": 0.0003051968804264082, + "loss": 0.6957, + "step": 43640 + }, + { + "epoch": 0.9715544871794872, + "grad_norm": 0.9204932451248169, + "learning_rate": 0.00030515722562776974, + "loss": 0.6097, + "step": 43650 + }, + { + "epoch": 0.9717770655270656, + "grad_norm": 0.7186421155929565, + "learning_rate": 0.00030511756511480157, + "loss": 0.5993, + "step": 43660 + }, + { + "epoch": 0.9719996438746439, + "grad_norm": 0.8013362288475037, + "learning_rate": 0.0003050778988896588, + "loss": 0.678, + "step": 43670 + }, + { + "epoch": 0.9722222222222222, + "grad_norm": 0.5864419937133789, + "learning_rate": 0.000305038226954497, + "loss": 0.662, + "step": 43680 + }, + { + "epoch": 0.9724448005698005, + "grad_norm": 0.8709241151809692, + "learning_rate": 0.000304998549311472, + "loss": 0.524, + "step": 43690 + }, + { + "epoch": 0.9726673789173789, + "grad_norm": 0.5410850644111633, + "learning_rate": 0.0003049588659627399, + "loss": 0.531, + "step": 43700 + }, + { + "epoch": 0.9728899572649573, + "grad_norm": 0.5670583844184875, + "learning_rate": 0.00030491917691045705, + "loss": 0.6241, + "step": 43710 + }, + { + "epoch": 0.9731125356125356, + "grad_norm": 0.7049233913421631, + "learning_rate": 0.00030487948215678025, + "loss": 0.6616, + "step": 43720 + }, + { + "epoch": 0.973335113960114, + "grad_norm": 0.46305641531944275, + "learning_rate": 0.0003048397817038665, + "loss": 0.4417, + "step": 43730 + }, + { + "epoch": 0.9735576923076923, + "grad_norm": 0.8418055772781372, + "learning_rate": 0.0003048000755538732, + "loss": 0.6689, + "step": 43740 + }, + { + "epoch": 0.9737802706552706, + "grad_norm": 0.6632723212242126, + "learning_rate": 0.000304760363708958, + "loss": 0.4771, + "step": 43750 + }, + { + "epoch": 0.9740028490028491, + "grad_norm": 0.7734985947608948, + "learning_rate": 0.0003047206461712789, + "loss": 0.5589, + "step": 43760 + }, + { + "epoch": 0.9742254273504274, + "grad_norm": 0.774679958820343, + "learning_rate": 0.0003046809229429942, + "loss": 0.5736, + "step": 43770 + }, + { + "epoch": 0.9744480056980057, + "grad_norm": 0.6929972171783447, + "learning_rate": 0.00030464119402626237, + "loss": 0.6126, + "step": 43780 + }, + { + "epoch": 0.974670584045584, + "grad_norm": 0.4365656077861786, + "learning_rate": 0.0003046014594232424, + "loss": 0.6184, + "step": 43790 + }, + { + "epoch": 0.9748931623931624, + "grad_norm": 0.7632807493209839, + "learning_rate": 0.0003045617191360935, + "loss": 0.5464, + "step": 43800 + }, + { + "epoch": 0.9751157407407407, + "grad_norm": 0.7853899598121643, + "learning_rate": 0.00030452197316697526, + "loss": 0.505, + "step": 43810 + }, + { + "epoch": 0.9753383190883191, + "grad_norm": 0.511212944984436, + "learning_rate": 0.00030448222151804736, + "loss": 0.597, + "step": 43820 + }, + { + "epoch": 0.9755608974358975, + "grad_norm": 0.5872905850410461, + "learning_rate": 0.00030444246419147, + "loss": 0.551, + "step": 43830 + }, + { + "epoch": 0.9757834757834758, + "grad_norm": 0.7174262404441833, + "learning_rate": 0.0003044027011894037, + "loss": 0.5746, + "step": 43840 + }, + { + "epoch": 0.9760060541310541, + "grad_norm": 0.6769015192985535, + "learning_rate": 0.0003043629325140091, + "loss": 0.5102, + "step": 43850 + }, + { + "epoch": 0.9762286324786325, + "grad_norm": 0.4921936094760895, + "learning_rate": 0.0003043231581674473, + "loss": 0.5115, + "step": 43860 + }, + { + "epoch": 0.9764512108262108, + "grad_norm": 0.9814968705177307, + "learning_rate": 0.00030428337815187974, + "loss": 0.7997, + "step": 43870 + }, + { + "epoch": 0.9766737891737892, + "grad_norm": 0.6479461789131165, + "learning_rate": 0.00030424359246946796, + "loss": 0.6602, + "step": 43880 + }, + { + "epoch": 0.9768963675213675, + "grad_norm": 1.096750020980835, + "learning_rate": 0.0003042038011223741, + "loss": 0.6912, + "step": 43890 + }, + { + "epoch": 0.9771189458689459, + "grad_norm": 0.8794252872467041, + "learning_rate": 0.0003041640041127603, + "loss": 0.6069, + "step": 43900 + }, + { + "epoch": 0.9773415242165242, + "grad_norm": 0.9189031720161438, + "learning_rate": 0.0003041242014427894, + "loss": 0.6475, + "step": 43910 + }, + { + "epoch": 0.9775641025641025, + "grad_norm": 0.6648414134979248, + "learning_rate": 0.000304084393114624, + "loss": 0.5758, + "step": 43920 + }, + { + "epoch": 0.9777866809116809, + "grad_norm": 0.694111704826355, + "learning_rate": 0.0003040445791304275, + "loss": 0.5805, + "step": 43930 + }, + { + "epoch": 0.9780092592592593, + "grad_norm": 0.4170455038547516, + "learning_rate": 0.00030400475949236345, + "loss": 0.6003, + "step": 43940 + }, + { + "epoch": 0.9782318376068376, + "grad_norm": 0.6975066065788269, + "learning_rate": 0.0003039649342025956, + "loss": 0.5608, + "step": 43950 + }, + { + "epoch": 0.978454415954416, + "grad_norm": 0.5400005578994751, + "learning_rate": 0.0003039251032632881, + "loss": 0.6245, + "step": 43960 + }, + { + "epoch": 0.9786769943019943, + "grad_norm": 0.43616047501564026, + "learning_rate": 0.0003038852666766054, + "loss": 0.5996, + "step": 43970 + }, + { + "epoch": 0.9788995726495726, + "grad_norm": 0.4482074975967407, + "learning_rate": 0.0003038454244447123, + "loss": 0.4164, + "step": 43980 + }, + { + "epoch": 0.9791221509971509, + "grad_norm": 0.8714448809623718, + "learning_rate": 0.00030380557656977384, + "loss": 0.5505, + "step": 43990 + }, + { + "epoch": 0.9793447293447294, + "grad_norm": 0.9814329743385315, + "learning_rate": 0.00030376572305395525, + "loss": 0.6242, + "step": 44000 + }, + { + "epoch": 0.9795673076923077, + "grad_norm": 0.5086197257041931, + "learning_rate": 0.00030372586389942245, + "loss": 0.5395, + "step": 44010 + }, + { + "epoch": 0.979789886039886, + "grad_norm": 0.49613261222839355, + "learning_rate": 0.00030368599910834124, + "loss": 0.6055, + "step": 44020 + }, + { + "epoch": 0.9800124643874644, + "grad_norm": 0.8347887396812439, + "learning_rate": 0.000303646128682878, + "loss": 0.6295, + "step": 44030 + }, + { + "epoch": 0.9802350427350427, + "grad_norm": 0.4856536090373993, + "learning_rate": 0.00030360625262519924, + "loss": 0.6006, + "step": 44040 + }, + { + "epoch": 0.9804576210826211, + "grad_norm": 0.7615097761154175, + "learning_rate": 0.0003035663709374719, + "loss": 0.5862, + "step": 44050 + }, + { + "epoch": 0.9806801994301995, + "grad_norm": 0.6671962738037109, + "learning_rate": 0.0003035264836218632, + "loss": 0.6552, + "step": 44060 + }, + { + "epoch": 0.9809027777777778, + "grad_norm": 0.597777783870697, + "learning_rate": 0.0003034865906805406, + "loss": 0.584, + "step": 44070 + }, + { + "epoch": 0.9811253561253561, + "grad_norm": 0.6560484766960144, + "learning_rate": 0.000303446692115672, + "loss": 0.6308, + "step": 44080 + }, + { + "epoch": 0.9813479344729344, + "grad_norm": 0.7486174702644348, + "learning_rate": 0.00030340678792942536, + "loss": 0.5648, + "step": 44090 + }, + { + "epoch": 0.9815705128205128, + "grad_norm": 0.6524199843406677, + "learning_rate": 0.0003033668781239693, + "loss": 0.6903, + "step": 44100 + }, + { + "epoch": 0.9817930911680912, + "grad_norm": 0.9349985122680664, + "learning_rate": 0.00030332696270147244, + "loss": 0.6595, + "step": 44110 + }, + { + "epoch": 0.9820156695156695, + "grad_norm": 0.4954359233379364, + "learning_rate": 0.0003032870416641038, + "loss": 0.5674, + "step": 44120 + }, + { + "epoch": 0.9822382478632479, + "grad_norm": 0.6777122020721436, + "learning_rate": 0.00030324711501403284, + "loss": 0.5881, + "step": 44130 + }, + { + "epoch": 0.9824608262108262, + "grad_norm": 0.5254316926002502, + "learning_rate": 0.0003032071827534291, + "loss": 0.5998, + "step": 44140 + }, + { + "epoch": 0.9826834045584045, + "grad_norm": 1.055670976638794, + "learning_rate": 0.0003031672448844625, + "loss": 0.4804, + "step": 44150 + }, + { + "epoch": 0.9829059829059829, + "grad_norm": 0.6267298460006714, + "learning_rate": 0.0003031273014093035, + "loss": 0.5552, + "step": 44160 + }, + { + "epoch": 0.9831285612535613, + "grad_norm": 0.7355030179023743, + "learning_rate": 0.00030308735233012233, + "loss": 0.7159, + "step": 44170 + }, + { + "epoch": 0.9833511396011396, + "grad_norm": 0.7559900283813477, + "learning_rate": 0.0003030473976490901, + "loss": 0.5603, + "step": 44180 + }, + { + "epoch": 0.983573717948718, + "grad_norm": 1.1423794031143188, + "learning_rate": 0.000303007437368378, + "loss": 0.5983, + "step": 44190 + }, + { + "epoch": 0.9837962962962963, + "grad_norm": 0.7574682831764221, + "learning_rate": 0.00030296747149015737, + "loss": 0.5575, + "step": 44200 + }, + { + "epoch": 0.9840188746438746, + "grad_norm": 0.4764862060546875, + "learning_rate": 0.0003029275000166001, + "loss": 0.5668, + "step": 44210 + }, + { + "epoch": 0.9842414529914529, + "grad_norm": 0.6267021894454956, + "learning_rate": 0.0003028875229498782, + "loss": 0.6212, + "step": 44220 + }, + { + "epoch": 0.9844640313390314, + "grad_norm": 0.5620948076248169, + "learning_rate": 0.00030284754029216406, + "loss": 0.6254, + "step": 44230 + }, + { + "epoch": 0.9846866096866097, + "grad_norm": 0.4687175750732422, + "learning_rate": 0.00030280755204563034, + "loss": 0.5893, + "step": 44240 + }, + { + "epoch": 0.984909188034188, + "grad_norm": 0.9068381786346436, + "learning_rate": 0.0003027675582124501, + "loss": 0.6576, + "step": 44250 + }, + { + "epoch": 0.9851317663817664, + "grad_norm": 0.5325086712837219, + "learning_rate": 0.00030272755879479665, + "loss": 0.587, + "step": 44260 + }, + { + "epoch": 0.9853543447293447, + "grad_norm": 0.9383319616317749, + "learning_rate": 0.0003026875537948436, + "loss": 0.6018, + "step": 44270 + }, + { + "epoch": 0.9855769230769231, + "grad_norm": 0.8008456230163574, + "learning_rate": 0.0003026475432147647, + "loss": 0.5127, + "step": 44280 + }, + { + "epoch": 0.9857995014245015, + "grad_norm": 0.6528541445732117, + "learning_rate": 0.00030260752705673434, + "loss": 0.6322, + "step": 44290 + }, + { + "epoch": 0.9860220797720798, + "grad_norm": 0.6493551135063171, + "learning_rate": 0.00030256750532292695, + "loss": 0.5318, + "step": 44300 + }, + { + "epoch": 0.9862446581196581, + "grad_norm": 0.7947010397911072, + "learning_rate": 0.00030252747801551733, + "loss": 0.6309, + "step": 44310 + }, + { + "epoch": 0.9864672364672364, + "grad_norm": 0.6355711817741394, + "learning_rate": 0.00030248744513668065, + "loss": 0.659, + "step": 44320 + }, + { + "epoch": 0.9866898148148148, + "grad_norm": 0.7108096480369568, + "learning_rate": 0.0003024474066885923, + "loss": 0.6309, + "step": 44330 + }, + { + "epoch": 0.9869123931623932, + "grad_norm": 0.6758264899253845, + "learning_rate": 0.000302407362673428, + "loss": 0.6833, + "step": 44340 + }, + { + "epoch": 0.9871349715099715, + "grad_norm": 0.6141546964645386, + "learning_rate": 0.0003023673130933638, + "loss": 0.5538, + "step": 44350 + }, + { + "epoch": 0.9873575498575499, + "grad_norm": 0.5645638704299927, + "learning_rate": 0.000302327257950576, + "loss": 0.6953, + "step": 44360 + }, + { + "epoch": 0.9875801282051282, + "grad_norm": 0.6685665845870972, + "learning_rate": 0.00030228719724724126, + "loss": 0.5587, + "step": 44370 + }, + { + "epoch": 0.9878027065527065, + "grad_norm": 0.8498179316520691, + "learning_rate": 0.0003022471309855364, + "loss": 0.6689, + "step": 44380 + }, + { + "epoch": 0.9880252849002849, + "grad_norm": 0.6236562132835388, + "learning_rate": 0.00030220705916763875, + "loss": 0.5571, + "step": 44390 + }, + { + "epoch": 0.9882478632478633, + "grad_norm": 0.7307772636413574, + "learning_rate": 0.00030216698179572586, + "loss": 0.5927, + "step": 44400 + }, + { + "epoch": 0.9884704415954416, + "grad_norm": 0.6887956261634827, + "learning_rate": 0.0003021268988719756, + "loss": 0.6221, + "step": 44410 + }, + { + "epoch": 0.98869301994302, + "grad_norm": 0.5145651698112488, + "learning_rate": 0.000302086810398566, + "loss": 0.7497, + "step": 44420 + }, + { + "epoch": 0.9889155982905983, + "grad_norm": 0.6188973784446716, + "learning_rate": 0.0003020467163776755, + "loss": 0.6187, + "step": 44430 + }, + { + "epoch": 0.9891381766381766, + "grad_norm": 0.8421676158905029, + "learning_rate": 0.000302006616811483, + "loss": 0.8401, + "step": 44440 + }, + { + "epoch": 0.9893607549857549, + "grad_norm": 0.5950721502304077, + "learning_rate": 0.0003019665117021673, + "loss": 0.5747, + "step": 44450 + }, + { + "epoch": 0.9895833333333334, + "grad_norm": 0.46402913331985474, + "learning_rate": 0.000301926401051908, + "loss": 0.6544, + "step": 44460 + }, + { + "epoch": 0.9898059116809117, + "grad_norm": 0.8954247236251831, + "learning_rate": 0.0003018862848628846, + "loss": 0.5758, + "step": 44470 + }, + { + "epoch": 0.99002849002849, + "grad_norm": 0.6421555876731873, + "learning_rate": 0.00030184616313727705, + "loss": 0.5715, + "step": 44480 + }, + { + "epoch": 0.9902510683760684, + "grad_norm": 0.6220232248306274, + "learning_rate": 0.0003018060358772656, + "loss": 0.4822, + "step": 44490 + }, + { + "epoch": 0.9904736467236467, + "grad_norm": 0.6873606443405151, + "learning_rate": 0.0003017659030850308, + "loss": 0.6926, + "step": 44500 + }, + { + "epoch": 0.9906962250712251, + "grad_norm": 0.6596918702125549, + "learning_rate": 0.00030172576476275364, + "loss": 0.7499, + "step": 44510 + }, + { + "epoch": 0.9909188034188035, + "grad_norm": 0.8093087673187256, + "learning_rate": 0.00030168562091261505, + "loss": 0.6116, + "step": 44520 + }, + { + "epoch": 0.9911413817663818, + "grad_norm": 0.5271719694137573, + "learning_rate": 0.00030164547153679655, + "loss": 0.6351, + "step": 44530 + }, + { + "epoch": 0.9913639601139601, + "grad_norm": 0.7143903374671936, + "learning_rate": 0.00030160531663748005, + "loss": 0.7094, + "step": 44540 + }, + { + "epoch": 0.9915865384615384, + "grad_norm": 0.8920359015464783, + "learning_rate": 0.0003015651562168474, + "loss": 0.5879, + "step": 44550 + }, + { + "epoch": 0.9918091168091168, + "grad_norm": 0.6196558475494385, + "learning_rate": 0.00030152499027708094, + "loss": 0.6776, + "step": 44560 + }, + { + "epoch": 0.9920316951566952, + "grad_norm": 0.7099842429161072, + "learning_rate": 0.0003014848188203634, + "loss": 0.6097, + "step": 44570 + }, + { + "epoch": 0.9922542735042735, + "grad_norm": 0.5325936079025269, + "learning_rate": 0.00030144464184887786, + "loss": 0.4899, + "step": 44580 + }, + { + "epoch": 0.9924768518518519, + "grad_norm": 0.5483936071395874, + "learning_rate": 0.0003014044593648073, + "loss": 0.4846, + "step": 44590 + }, + { + "epoch": 0.9926994301994302, + "grad_norm": 0.6429783701896667, + "learning_rate": 0.0003013642713703354, + "loss": 0.6581, + "step": 44600 + }, + { + "epoch": 0.9929220085470085, + "grad_norm": 0.7688935399055481, + "learning_rate": 0.0003013240778676462, + "loss": 0.5981, + "step": 44610 + }, + { + "epoch": 0.9931445868945868, + "grad_norm": 0.7688998579978943, + "learning_rate": 0.0003012838788589234, + "loss": 0.676, + "step": 44620 + }, + { + "epoch": 0.9933671652421653, + "grad_norm": 0.6492190361022949, + "learning_rate": 0.00030124367434635185, + "loss": 0.6548, + "step": 44630 + }, + { + "epoch": 0.9935897435897436, + "grad_norm": 0.9331862330436707, + "learning_rate": 0.00030120346433211616, + "loss": 0.7739, + "step": 44640 + }, + { + "epoch": 0.9938123219373219, + "grad_norm": 0.4795180559158325, + "learning_rate": 0.0003011632488184014, + "loss": 0.6069, + "step": 44650 + }, + { + "epoch": 0.9940349002849003, + "grad_norm": 0.6427105069160461, + "learning_rate": 0.00030112302780739276, + "loss": 0.6617, + "step": 44660 + }, + { + "epoch": 0.9942574786324786, + "grad_norm": 0.8687929511070251, + "learning_rate": 0.00030108280130127613, + "loss": 0.5745, + "step": 44670 + }, + { + "epoch": 0.9944800569800569, + "grad_norm": 0.3637758791446686, + "learning_rate": 0.00030104256930223725, + "loss": 0.5226, + "step": 44680 + }, + { + "epoch": 0.9947026353276354, + "grad_norm": 0.614262580871582, + "learning_rate": 0.00030100233181246246, + "loss": 0.6666, + "step": 44690 + }, + { + "epoch": 0.9949252136752137, + "grad_norm": 0.735403299331665, + "learning_rate": 0.0003009620888341383, + "loss": 0.4987, + "step": 44700 + }, + { + "epoch": 0.995147792022792, + "grad_norm": 0.7281267642974854, + "learning_rate": 0.0003009218403694516, + "loss": 0.6631, + "step": 44710 + }, + { + "epoch": 0.9953703703703703, + "grad_norm": 0.6882529258728027, + "learning_rate": 0.0003008815864205894, + "loss": 0.612, + "step": 44720 + }, + { + "epoch": 0.9955929487179487, + "grad_norm": 0.7215903997421265, + "learning_rate": 0.00030084132698973915, + "loss": 0.6723, + "step": 44730 + }, + { + "epoch": 0.9958155270655271, + "grad_norm": 1.017327070236206, + "learning_rate": 0.0003008010620790888, + "loss": 0.6662, + "step": 44740 + }, + { + "epoch": 0.9960381054131054, + "grad_norm": 0.5069785714149475, + "learning_rate": 0.00030076079169082614, + "loss": 0.478, + "step": 44750 + }, + { + "epoch": 0.9962606837606838, + "grad_norm": 0.7547106742858887, + "learning_rate": 0.0003007205158271396, + "loss": 0.7862, + "step": 44760 + }, + { + "epoch": 0.9964832621082621, + "grad_norm": 0.5410762429237366, + "learning_rate": 0.00030068023449021777, + "loss": 0.5717, + "step": 44770 + }, + { + "epoch": 0.9967058404558404, + "grad_norm": 0.6520301699638367, + "learning_rate": 0.0003006399476822495, + "loss": 0.6349, + "step": 44780 + }, + { + "epoch": 0.9969284188034188, + "grad_norm": 0.4946806728839874, + "learning_rate": 0.0003005996554054242, + "loss": 0.5792, + "step": 44790 + }, + { + "epoch": 0.9971509971509972, + "grad_norm": 0.6016153693199158, + "learning_rate": 0.00030055935766193115, + "loss": 0.5773, + "step": 44800 + }, + { + "epoch": 0.9973735754985755, + "grad_norm": 0.5813798904418945, + "learning_rate": 0.00030051905445396045, + "loss": 0.5932, + "step": 44810 + }, + { + "epoch": 0.9975961538461539, + "grad_norm": 0.52809739112854, + "learning_rate": 0.00030047874578370196, + "loss": 0.6489, + "step": 44820 + }, + { + "epoch": 0.9978187321937322, + "grad_norm": 0.7697903513908386, + "learning_rate": 0.0003004384316533461, + "loss": 0.604, + "step": 44830 + }, + { + "epoch": 0.9980413105413105, + "grad_norm": 0.750226616859436, + "learning_rate": 0.0003003981120650837, + "loss": 0.6535, + "step": 44840 + }, + { + "epoch": 0.9982638888888888, + "grad_norm": 0.8747888207435608, + "learning_rate": 0.0003003577870211057, + "loss": 0.5326, + "step": 44850 + }, + { + "epoch": 0.9984864672364673, + "grad_norm": 0.7685528993606567, + "learning_rate": 0.00030031745652360335, + "loss": 0.6057, + "step": 44860 + }, + { + "epoch": 0.9987090455840456, + "grad_norm": 0.690782904624939, + "learning_rate": 0.0003002771205747684, + "loss": 0.7289, + "step": 44870 + }, + { + "epoch": 0.9989316239316239, + "grad_norm": 0.6289418339729309, + "learning_rate": 0.00030023677917679253, + "loss": 0.6548, + "step": 44880 + }, + { + "epoch": 0.9991542022792023, + "grad_norm": 0.6266171336174011, + "learning_rate": 0.00030019643233186803, + "loss": 0.6579, + "step": 44890 + }, + { + "epoch": 0.9993767806267806, + "grad_norm": 0.602969229221344, + "learning_rate": 0.00030015608004218744, + "loss": 0.4845, + "step": 44900 + }, + { + "epoch": 0.9995993589743589, + "grad_norm": 0.6873510479927063, + "learning_rate": 0.0003001157223099434, + "loss": 0.5004, + "step": 44910 + }, + { + "epoch": 0.9998219373219374, + "grad_norm": 0.5698683261871338, + "learning_rate": 0.00030007535913732905, + "loss": 0.5475, + "step": 44920 + }, + { + "epoch": 1.0000445156695157, + "grad_norm": 0.7336957454681396, + "learning_rate": 0.00030003499052653777, + "loss": 0.6564, + "step": 44930 + }, + { + "epoch": 1.0002670940170941, + "grad_norm": 0.809277355670929, + "learning_rate": 0.0002999946164797633, + "loss": 0.6366, + "step": 44940 + }, + { + "epoch": 1.0004896723646723, + "grad_norm": 0.5942495465278625, + "learning_rate": 0.0002999542369991994, + "loss": 0.5138, + "step": 44950 + }, + { + "epoch": 1.0007122507122508, + "grad_norm": 0.5038458704948425, + "learning_rate": 0.0002999138520870405, + "loss": 0.5748, + "step": 44960 + }, + { + "epoch": 1.000934829059829, + "grad_norm": 0.8980821371078491, + "learning_rate": 0.00029987346174548097, + "loss": 0.4932, + "step": 44970 + }, + { + "epoch": 1.0011574074074074, + "grad_norm": 0.6963857412338257, + "learning_rate": 0.00029983306597671584, + "loss": 0.5204, + "step": 44980 + }, + { + "epoch": 1.0013799857549857, + "grad_norm": 0.8751989603042603, + "learning_rate": 0.00029979266478294024, + "loss": 0.5569, + "step": 44990 + }, + { + "epoch": 1.001602564102564, + "grad_norm": 0.9675778150558472, + "learning_rate": 0.00029975225816634954, + "loss": 0.6096, + "step": 45000 + }, + { + "epoch": 1.0018251424501425, + "grad_norm": 0.7282714247703552, + "learning_rate": 0.0002997118461291394, + "loss": 0.5134, + "step": 45010 + }, + { + "epoch": 1.0020477207977208, + "grad_norm": 0.6374226212501526, + "learning_rate": 0.0002996714286735059, + "loss": 0.5018, + "step": 45020 + }, + { + "epoch": 1.0022702991452992, + "grad_norm": 0.50523841381073, + "learning_rate": 0.0002996310058016454, + "loss": 0.7654, + "step": 45030 + }, + { + "epoch": 1.0024928774928774, + "grad_norm": 0.5092727541923523, + "learning_rate": 0.0002995905775157545, + "loss": 0.4078, + "step": 45040 + }, + { + "epoch": 1.0027154558404558, + "grad_norm": 0.6609806418418884, + "learning_rate": 0.0002995501438180301, + "loss": 0.4932, + "step": 45050 + }, + { + "epoch": 1.0029380341880343, + "grad_norm": 0.5364968776702881, + "learning_rate": 0.00029950970471066937, + "loss": 0.5699, + "step": 45060 + }, + { + "epoch": 1.0031606125356125, + "grad_norm": 0.6287388801574707, + "learning_rate": 0.0002994692601958698, + "loss": 0.6062, + "step": 45070 + }, + { + "epoch": 1.003383190883191, + "grad_norm": 0.5691414475440979, + "learning_rate": 0.00029942881027582925, + "loss": 0.5667, + "step": 45080 + }, + { + "epoch": 1.0036057692307692, + "grad_norm": 0.7531828284263611, + "learning_rate": 0.0002993883549527457, + "loss": 0.6336, + "step": 45090 + }, + { + "epoch": 1.0038283475783476, + "grad_norm": 0.5090972781181335, + "learning_rate": 0.0002993478942288176, + "loss": 0.5269, + "step": 45100 + }, + { + "epoch": 1.0040509259259258, + "grad_norm": 0.7268334031105042, + "learning_rate": 0.0002993074281062436, + "loss": 0.5477, + "step": 45110 + }, + { + "epoch": 1.0042735042735043, + "grad_norm": 0.8607544898986816, + "learning_rate": 0.0002992669565872227, + "loss": 0.4817, + "step": 45120 + }, + { + "epoch": 1.0044960826210827, + "grad_norm": 0.5529241561889648, + "learning_rate": 0.0002992264796739541, + "loss": 0.4315, + "step": 45130 + }, + { + "epoch": 1.004718660968661, + "grad_norm": 0.5200088620185852, + "learning_rate": 0.0002991859973686373, + "loss": 0.5856, + "step": 45140 + }, + { + "epoch": 1.0049412393162394, + "grad_norm": 0.7229819893836975, + "learning_rate": 0.0002991455096734723, + "loss": 0.537, + "step": 45150 + }, + { + "epoch": 1.0051638176638176, + "grad_norm": 0.8149036765098572, + "learning_rate": 0.00029910501659065905, + "loss": 0.63, + "step": 45160 + }, + { + "epoch": 1.005386396011396, + "grad_norm": 0.5828270316123962, + "learning_rate": 0.0002990645181223981, + "loss": 0.5111, + "step": 45170 + }, + { + "epoch": 1.0056089743589745, + "grad_norm": 0.7001296877861023, + "learning_rate": 0.00029902401427089014, + "loss": 0.6598, + "step": 45180 + }, + { + "epoch": 1.0058315527065527, + "grad_norm": 0.6515721082687378, + "learning_rate": 0.0002989835050383362, + "loss": 0.6275, + "step": 45190 + }, + { + "epoch": 1.006054131054131, + "grad_norm": 0.6579293012619019, + "learning_rate": 0.0002989429904269375, + "loss": 0.5268, + "step": 45200 + }, + { + "epoch": 1.0062767094017093, + "grad_norm": 0.9173814654350281, + "learning_rate": 0.0002989024704388958, + "loss": 0.5915, + "step": 45210 + }, + { + "epoch": 1.0064992877492878, + "grad_norm": 0.6272097229957581, + "learning_rate": 0.00029886194507641284, + "loss": 0.4741, + "step": 45220 + }, + { + "epoch": 1.0067218660968662, + "grad_norm": 0.694480836391449, + "learning_rate": 0.00029882141434169084, + "loss": 0.5071, + "step": 45230 + }, + { + "epoch": 1.0069444444444444, + "grad_norm": 0.664422869682312, + "learning_rate": 0.0002987808782369323, + "loss": 0.6094, + "step": 45240 + }, + { + "epoch": 1.0071670227920229, + "grad_norm": 0.5120536684989929, + "learning_rate": 0.00029874033676433997, + "loss": 0.5058, + "step": 45250 + }, + { + "epoch": 1.007389601139601, + "grad_norm": 0.5487117171287537, + "learning_rate": 0.0002986997899261169, + "loss": 0.6141, + "step": 45260 + }, + { + "epoch": 1.0076121794871795, + "grad_norm": 0.5553765296936035, + "learning_rate": 0.00029865923772446647, + "loss": 0.6366, + "step": 45270 + }, + { + "epoch": 1.0078347578347577, + "grad_norm": 0.5263356566429138, + "learning_rate": 0.0002986186801615923, + "loss": 0.4694, + "step": 45280 + }, + { + "epoch": 1.0080573361823362, + "grad_norm": 0.8697277903556824, + "learning_rate": 0.00029857811723969823, + "loss": 0.6524, + "step": 45290 + }, + { + "epoch": 1.0082799145299146, + "grad_norm": 0.6866238713264465, + "learning_rate": 0.0002985375489609886, + "loss": 0.555, + "step": 45300 + }, + { + "epoch": 1.0085024928774928, + "grad_norm": 0.623012900352478, + "learning_rate": 0.000298496975327668, + "loss": 0.5244, + "step": 45310 + }, + { + "epoch": 1.0087250712250713, + "grad_norm": 0.42710331082344055, + "learning_rate": 0.0002984563963419411, + "loss": 0.5599, + "step": 45320 + }, + { + "epoch": 1.0089476495726495, + "grad_norm": 0.8865259289741516, + "learning_rate": 0.00029841581200601295, + "loss": 0.551, + "step": 45330 + }, + { + "epoch": 1.009170227920228, + "grad_norm": 0.49819451570510864, + "learning_rate": 0.000298375222322089, + "loss": 0.6152, + "step": 45340 + }, + { + "epoch": 1.0093928062678064, + "grad_norm": 0.5800816416740417, + "learning_rate": 0.000298334627292375, + "loss": 0.6104, + "step": 45350 + }, + { + "epoch": 1.0096153846153846, + "grad_norm": 0.7009190320968628, + "learning_rate": 0.00029829402691907683, + "loss": 0.625, + "step": 45360 + }, + { + "epoch": 1.009837962962963, + "grad_norm": 0.3883427083492279, + "learning_rate": 0.00029825342120440074, + "loss": 0.5493, + "step": 45370 + }, + { + "epoch": 1.0100605413105412, + "grad_norm": 0.7414738535881042, + "learning_rate": 0.0002982128101505533, + "loss": 0.6544, + "step": 45380 + }, + { + "epoch": 1.0102831196581197, + "grad_norm": 0.6679602265357971, + "learning_rate": 0.0002981721937597414, + "loss": 0.5879, + "step": 45390 + }, + { + "epoch": 1.010505698005698, + "grad_norm": 0.7506459355354309, + "learning_rate": 0.00029813157203417213, + "loss": 0.4979, + "step": 45400 + }, + { + "epoch": 1.0107282763532763, + "grad_norm": 0.723677933216095, + "learning_rate": 0.0002980909449760529, + "loss": 0.5513, + "step": 45410 + }, + { + "epoch": 1.0109508547008548, + "grad_norm": 0.7994402647018433, + "learning_rate": 0.0002980503125875914, + "loss": 0.4788, + "step": 45420 + }, + { + "epoch": 1.011173433048433, + "grad_norm": 0.7365484833717346, + "learning_rate": 0.00029800967487099563, + "loss": 0.6212, + "step": 45430 + }, + { + "epoch": 1.0113960113960114, + "grad_norm": 0.3517953157424927, + "learning_rate": 0.00029796903182847386, + "loss": 0.5521, + "step": 45440 + }, + { + "epoch": 1.0116185897435896, + "grad_norm": 0.6479682326316833, + "learning_rate": 0.00029792838346223477, + "loss": 0.6885, + "step": 45450 + }, + { + "epoch": 1.011841168091168, + "grad_norm": 0.5270677208900452, + "learning_rate": 0.00029788772977448725, + "loss": 0.5095, + "step": 45460 + }, + { + "epoch": 1.0120637464387465, + "grad_norm": 0.7885897159576416, + "learning_rate": 0.00029784707076744015, + "loss": 0.532, + "step": 45470 + }, + { + "epoch": 1.0122863247863247, + "grad_norm": 0.7115561366081238, + "learning_rate": 0.00029780640644330324, + "loss": 0.428, + "step": 45480 + }, + { + "epoch": 1.0125089031339032, + "grad_norm": 0.8711773157119751, + "learning_rate": 0.0002977657368042861, + "loss": 0.5233, + "step": 45490 + }, + { + "epoch": 1.0127314814814814, + "grad_norm": 0.480892151594162, + "learning_rate": 0.0002977250618525988, + "loss": 0.5043, + "step": 45500 + }, + { + "epoch": 1.0129540598290598, + "grad_norm": 0.513482391834259, + "learning_rate": 0.0002976843815904516, + "loss": 0.3921, + "step": 45510 + }, + { + "epoch": 1.0131766381766383, + "grad_norm": 0.6706327199935913, + "learning_rate": 0.0002976436960200552, + "loss": 0.5297, + "step": 45520 + }, + { + "epoch": 1.0133992165242165, + "grad_norm": 0.7570107579231262, + "learning_rate": 0.00029760300514362046, + "loss": 0.6574, + "step": 45530 + }, + { + "epoch": 1.013621794871795, + "grad_norm": 0.6689755320549011, + "learning_rate": 0.00029756230896335845, + "loss": 0.564, + "step": 45540 + }, + { + "epoch": 1.0138443732193732, + "grad_norm": 0.35196173191070557, + "learning_rate": 0.0002975216074814807, + "loss": 0.5871, + "step": 45550 + }, + { + "epoch": 1.0140669515669516, + "grad_norm": 0.5093061923980713, + "learning_rate": 0.00029748090070019897, + "loss": 0.6584, + "step": 45560 + }, + { + "epoch": 1.0142895299145298, + "grad_norm": 0.7802603244781494, + "learning_rate": 0.0002974401886217253, + "loss": 0.4538, + "step": 45570 + }, + { + "epoch": 1.0145121082621082, + "grad_norm": 1.1860054731369019, + "learning_rate": 0.00029739947124827196, + "loss": 0.5224, + "step": 45580 + }, + { + "epoch": 1.0147346866096867, + "grad_norm": 0.5448476672172546, + "learning_rate": 0.0002973587485820517, + "loss": 0.6472, + "step": 45590 + }, + { + "epoch": 1.014957264957265, + "grad_norm": 0.5475285053253174, + "learning_rate": 0.00029731802062527734, + "loss": 0.5009, + "step": 45600 + }, + { + "epoch": 1.0151798433048433, + "grad_norm": 0.6277813911437988, + "learning_rate": 0.000297277287380162, + "loss": 0.536, + "step": 45610 + }, + { + "epoch": 1.0154024216524216, + "grad_norm": 0.9393863081932068, + "learning_rate": 0.00029723654884891926, + "loss": 0.4785, + "step": 45620 + }, + { + "epoch": 1.015625, + "grad_norm": 0.6652750968933105, + "learning_rate": 0.0002971958050337629, + "loss": 0.5289, + "step": 45630 + }, + { + "epoch": 1.0158475783475784, + "grad_norm": 0.6442055106163025, + "learning_rate": 0.00029715505593690686, + "loss": 0.654, + "step": 45640 + }, + { + "epoch": 1.0160701566951567, + "grad_norm": 1.0009881258010864, + "learning_rate": 0.00029711430156056554, + "loss": 0.6465, + "step": 45650 + }, + { + "epoch": 1.016292735042735, + "grad_norm": 0.5782968401908875, + "learning_rate": 0.00029707354190695363, + "loss": 0.6587, + "step": 45660 + }, + { + "epoch": 1.0165153133903133, + "grad_norm": 0.8827396631240845, + "learning_rate": 0.000297032776978286, + "loss": 0.6876, + "step": 45670 + }, + { + "epoch": 1.0167378917378918, + "grad_norm": 0.5242670774459839, + "learning_rate": 0.0002969920067767778, + "loss": 0.4636, + "step": 45680 + }, + { + "epoch": 1.0169604700854702, + "grad_norm": 0.5782427191734314, + "learning_rate": 0.0002969512313046445, + "loss": 0.6092, + "step": 45690 + }, + { + "epoch": 1.0171830484330484, + "grad_norm": 0.5887977480888367, + "learning_rate": 0.000296910450564102, + "loss": 0.4911, + "step": 45700 + }, + { + "epoch": 1.0174056267806268, + "grad_norm": 0.49322015047073364, + "learning_rate": 0.00029686966455736623, + "loss": 0.468, + "step": 45710 + }, + { + "epoch": 1.017628205128205, + "grad_norm": 0.7434693574905396, + "learning_rate": 0.0002968288732866536, + "loss": 0.5743, + "step": 45720 + }, + { + "epoch": 1.0178507834757835, + "grad_norm": 0.4505147337913513, + "learning_rate": 0.00029678807675418073, + "loss": 0.5177, + "step": 45730 + }, + { + "epoch": 1.0180733618233617, + "grad_norm": 0.593360185623169, + "learning_rate": 0.00029674727496216457, + "loss": 0.6319, + "step": 45740 + }, + { + "epoch": 1.0182959401709402, + "grad_norm": 0.8309880495071411, + "learning_rate": 0.00029670646791282225, + "loss": 0.6012, + "step": 45750 + }, + { + "epoch": 1.0185185185185186, + "grad_norm": 0.5800014734268188, + "learning_rate": 0.0002966656556083713, + "loss": 0.4977, + "step": 45760 + }, + { + "epoch": 1.0187410968660968, + "grad_norm": 0.6049439311027527, + "learning_rate": 0.00029662483805102945, + "loss": 0.5165, + "step": 45770 + }, + { + "epoch": 1.0189636752136753, + "grad_norm": 0.730975866317749, + "learning_rate": 0.0002965840152430149, + "loss": 0.5126, + "step": 45780 + }, + { + "epoch": 1.0191862535612535, + "grad_norm": 0.712568461894989, + "learning_rate": 0.00029654318718654586, + "loss": 0.4807, + "step": 45790 + }, + { + "epoch": 1.019408831908832, + "grad_norm": 0.6383031606674194, + "learning_rate": 0.00029650235388384093, + "loss": 0.5993, + "step": 45800 + }, + { + "epoch": 1.0196314102564104, + "grad_norm": 1.111724615097046, + "learning_rate": 0.00029646151533711915, + "loss": 0.5663, + "step": 45810 + }, + { + "epoch": 1.0198539886039886, + "grad_norm": 0.7301346063613892, + "learning_rate": 0.00029642067154859965, + "loss": 0.646, + "step": 45820 + }, + { + "epoch": 1.020076566951567, + "grad_norm": 0.9122412800788879, + "learning_rate": 0.00029637982252050184, + "loss": 0.5205, + "step": 45830 + }, + { + "epoch": 1.0201210826210827, + "eval_loss": 0.5998291373252869, + "eval_runtime": 337.5403, + "eval_samples_per_second": 7.007, + "eval_steps_per_second": 7.007, + "step": 45832 + }, + { + "epoch": 1.0202991452991452, + "grad_norm": 0.8955168128013611, + "learning_rate": 0.0002963389682550457, + "loss": 0.4934, + "step": 45840 + }, + { + "epoch": 1.0205217236467237, + "grad_norm": 0.7003114223480225, + "learning_rate": 0.0002962981087544511, + "loss": 0.5329, + "step": 45850 + }, + { + "epoch": 1.020744301994302, + "grad_norm": 0.9676564931869507, + "learning_rate": 0.00029625724402093846, + "loss": 0.6922, + "step": 45860 + }, + { + "epoch": 1.0209668803418803, + "grad_norm": 0.31678351759910583, + "learning_rate": 0.0002962163740567283, + "loss": 0.5997, + "step": 45870 + }, + { + "epoch": 1.0211894586894588, + "grad_norm": 0.7018308639526367, + "learning_rate": 0.00029617549886404174, + "loss": 0.6506, + "step": 45880 + }, + { + "epoch": 1.021412037037037, + "grad_norm": 0.6368674039840698, + "learning_rate": 0.00029613461844509975, + "loss": 0.5323, + "step": 45890 + }, + { + "epoch": 1.0216346153846154, + "grad_norm": 0.5889302492141724, + "learning_rate": 0.0002960937328021239, + "loss": 0.6061, + "step": 45900 + }, + { + "epoch": 1.0218571937321936, + "grad_norm": 0.6018082499504089, + "learning_rate": 0.000296052841937336, + "loss": 0.6059, + "step": 45910 + }, + { + "epoch": 1.022079772079772, + "grad_norm": 0.8754140138626099, + "learning_rate": 0.000296011945852958, + "loss": 0.5061, + "step": 45920 + }, + { + "epoch": 1.0223023504273505, + "grad_norm": 0.373577743768692, + "learning_rate": 0.00029597104455121224, + "loss": 0.5363, + "step": 45930 + }, + { + "epoch": 1.0225249287749287, + "grad_norm": 0.582492470741272, + "learning_rate": 0.0002959301380343214, + "loss": 0.6262, + "step": 45940 + }, + { + "epoch": 1.0227475071225072, + "grad_norm": 0.529481053352356, + "learning_rate": 0.00029588922630450825, + "loss": 0.5704, + "step": 45950 + }, + { + "epoch": 1.0229700854700854, + "grad_norm": 0.6251696944236755, + "learning_rate": 0.0002958483093639961, + "loss": 0.5239, + "step": 45960 + }, + { + "epoch": 1.0231926638176638, + "grad_norm": 0.8448797464370728, + "learning_rate": 0.0002958073872150083, + "loss": 0.498, + "step": 45970 + }, + { + "epoch": 1.0234152421652423, + "grad_norm": 0.8178675174713135, + "learning_rate": 0.00029576645985976874, + "loss": 0.6725, + "step": 45980 + }, + { + "epoch": 1.0236378205128205, + "grad_norm": 0.7064274549484253, + "learning_rate": 0.0002957255273005013, + "loss": 0.5908, + "step": 45990 + }, + { + "epoch": 1.023860398860399, + "grad_norm": 0.5764282941818237, + "learning_rate": 0.00029568458953943035, + "loss": 0.4453, + "step": 46000 + }, + { + "epoch": 1.0240829772079771, + "grad_norm": 0.6136001348495483, + "learning_rate": 0.0002956436465787805, + "loss": 0.6059, + "step": 46010 + }, + { + "epoch": 1.0243055555555556, + "grad_norm": 0.6569823622703552, + "learning_rate": 0.00029560269842077657, + "loss": 0.4823, + "step": 46020 + }, + { + "epoch": 1.0245281339031338, + "grad_norm": 0.6871667504310608, + "learning_rate": 0.0002955617450676437, + "loss": 0.5419, + "step": 46030 + }, + { + "epoch": 1.0247507122507122, + "grad_norm": 0.5179684162139893, + "learning_rate": 0.0002955207865216074, + "loss": 0.6193, + "step": 46040 + }, + { + "epoch": 1.0249732905982907, + "grad_norm": 0.5849087834358215, + "learning_rate": 0.0002954798227848934, + "loss": 0.78, + "step": 46050 + }, + { + "epoch": 1.025195868945869, + "grad_norm": 0.58812016248703, + "learning_rate": 0.0002954388538597277, + "loss": 0.6257, + "step": 46060 + }, + { + "epoch": 1.0254184472934473, + "grad_norm": 0.3955261707305908, + "learning_rate": 0.00029539787974833644, + "loss": 0.5079, + "step": 46070 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 0.5466234087944031, + "learning_rate": 0.0002953569004529464, + "loss": 0.4818, + "step": 46080 + }, + { + "epoch": 1.025863603988604, + "grad_norm": 1.1418683528900146, + "learning_rate": 0.00029531591597578425, + "loss": 0.6846, + "step": 46090 + }, + { + "epoch": 1.0260861823361824, + "grad_norm": 0.7915250062942505, + "learning_rate": 0.0002952749263190772, + "loss": 0.6076, + "step": 46100 + }, + { + "epoch": 1.0263087606837606, + "grad_norm": 0.5871915817260742, + "learning_rate": 0.0002952339314850527, + "loss": 0.6356, + "step": 46110 + }, + { + "epoch": 1.026531339031339, + "grad_norm": 0.6075290441513062, + "learning_rate": 0.00029519293147593843, + "loss": 0.4626, + "step": 46120 + }, + { + "epoch": 1.0267539173789173, + "grad_norm": 0.3844590485095978, + "learning_rate": 0.0002951519262939623, + "loss": 0.5026, + "step": 46130 + }, + { + "epoch": 1.0269764957264957, + "grad_norm": 0.6965892910957336, + "learning_rate": 0.00029511091594135256, + "loss": 0.4676, + "step": 46140 + }, + { + "epoch": 1.0271990740740742, + "grad_norm": 0.8521528244018555, + "learning_rate": 0.00029506990042033786, + "loss": 0.4739, + "step": 46150 + }, + { + "epoch": 1.0274216524216524, + "grad_norm": 0.7694055438041687, + "learning_rate": 0.00029502887973314687, + "loss": 0.6516, + "step": 46160 + }, + { + "epoch": 1.0276442307692308, + "grad_norm": 0.5086039900779724, + "learning_rate": 0.0002949878538820088, + "loss": 0.5088, + "step": 46170 + }, + { + "epoch": 1.027866809116809, + "grad_norm": 0.4185430705547333, + "learning_rate": 0.000294946822869153, + "loss": 0.4825, + "step": 46180 + }, + { + "epoch": 1.0280893874643875, + "grad_norm": 0.7606822848320007, + "learning_rate": 0.0002949057866968091, + "loss": 0.698, + "step": 46190 + }, + { + "epoch": 1.0283119658119657, + "grad_norm": 0.6091985702514648, + "learning_rate": 0.00029486474536720707, + "loss": 0.4992, + "step": 46200 + }, + { + "epoch": 1.0285345441595442, + "grad_norm": 0.6582888960838318, + "learning_rate": 0.000294823698882577, + "loss": 0.522, + "step": 46210 + }, + { + "epoch": 1.0287571225071226, + "grad_norm": 0.5502168536186218, + "learning_rate": 0.00029478264724514967, + "loss": 0.4766, + "step": 46220 + }, + { + "epoch": 1.0289797008547008, + "grad_norm": 0.487687885761261, + "learning_rate": 0.0002947415904571556, + "loss": 0.4861, + "step": 46230 + }, + { + "epoch": 1.0292022792022792, + "grad_norm": 1.1762452125549316, + "learning_rate": 0.000294700528520826, + "loss": 0.6959, + "step": 46240 + }, + { + "epoch": 1.0294248575498575, + "grad_norm": 0.814188539981842, + "learning_rate": 0.0002946594614383922, + "loss": 0.6722, + "step": 46250 + }, + { + "epoch": 1.029647435897436, + "grad_norm": 0.46749377250671387, + "learning_rate": 0.0002946183892120857, + "loss": 0.5451, + "step": 46260 + }, + { + "epoch": 1.0298700142450143, + "grad_norm": 0.6072514653205872, + "learning_rate": 0.0002945773118441385, + "loss": 0.5755, + "step": 46270 + }, + { + "epoch": 1.0300925925925926, + "grad_norm": 0.48334813117980957, + "learning_rate": 0.00029453622933678266, + "loss": 0.7137, + "step": 46280 + }, + { + "epoch": 1.030315170940171, + "grad_norm": 0.8070351481437683, + "learning_rate": 0.0002944951416922509, + "loss": 0.61, + "step": 46290 + }, + { + "epoch": 1.0305377492877492, + "grad_norm": 0.5267581343650818, + "learning_rate": 0.0002944540489127756, + "loss": 0.5429, + "step": 46300 + }, + { + "epoch": 1.0307603276353277, + "grad_norm": 0.6215224266052246, + "learning_rate": 0.0002944129510005901, + "loss": 0.5384, + "step": 46310 + }, + { + "epoch": 1.0309829059829059, + "grad_norm": 0.8042832612991333, + "learning_rate": 0.0002943718479579275, + "loss": 0.5225, + "step": 46320 + }, + { + "epoch": 1.0312054843304843, + "grad_norm": 0.6923868656158447, + "learning_rate": 0.0002943307397870214, + "loss": 0.5047, + "step": 46330 + }, + { + "epoch": 1.0314280626780628, + "grad_norm": 0.7418728470802307, + "learning_rate": 0.0002942896264901057, + "loss": 0.6181, + "step": 46340 + }, + { + "epoch": 1.031650641025641, + "grad_norm": 0.5733642578125, + "learning_rate": 0.00029424850806941444, + "loss": 0.5159, + "step": 46350 + }, + { + "epoch": 1.0318732193732194, + "grad_norm": 0.42896318435668945, + "learning_rate": 0.00029420738452718223, + "loss": 0.5517, + "step": 46360 + }, + { + "epoch": 1.0320957977207976, + "grad_norm": 0.8593009114265442, + "learning_rate": 0.0002941662558656435, + "loss": 0.5949, + "step": 46370 + }, + { + "epoch": 1.032318376068376, + "grad_norm": 0.6495790481567383, + "learning_rate": 0.00029412512208703347, + "loss": 0.4398, + "step": 46380 + }, + { + "epoch": 1.0325409544159545, + "grad_norm": 0.6790677905082703, + "learning_rate": 0.0002940839831935871, + "loss": 0.4867, + "step": 46390 + }, + { + "epoch": 1.0327635327635327, + "grad_norm": 0.7355635166168213, + "learning_rate": 0.0002940428391875402, + "loss": 0.4541, + "step": 46400 + }, + { + "epoch": 1.0329861111111112, + "grad_norm": 0.5827793478965759, + "learning_rate": 0.00029400169007112834, + "loss": 0.5366, + "step": 46410 + }, + { + "epoch": 1.0332086894586894, + "grad_norm": 0.47678840160369873, + "learning_rate": 0.0002939605358465877, + "loss": 0.4735, + "step": 46420 + }, + { + "epoch": 1.0334312678062678, + "grad_norm": 0.6106163263320923, + "learning_rate": 0.00029391937651615464, + "loss": 0.4913, + "step": 46430 + }, + { + "epoch": 1.0336538461538463, + "grad_norm": 0.6181490421295166, + "learning_rate": 0.00029387821208206574, + "loss": 0.6202, + "step": 46440 + }, + { + "epoch": 1.0338764245014245, + "grad_norm": 0.7003015875816345, + "learning_rate": 0.000293837042546558, + "loss": 0.5188, + "step": 46450 + }, + { + "epoch": 1.034099002849003, + "grad_norm": 0.7870360612869263, + "learning_rate": 0.00029379586791186853, + "loss": 0.6886, + "step": 46460 + }, + { + "epoch": 1.0343215811965811, + "grad_norm": 0.9469454288482666, + "learning_rate": 0.0002937546881802348, + "loss": 0.5678, + "step": 46470 + }, + { + "epoch": 1.0345441595441596, + "grad_norm": 0.5496917963027954, + "learning_rate": 0.00029371350335389456, + "loss": 0.5574, + "step": 46480 + }, + { + "epoch": 1.0347667378917378, + "grad_norm": 0.6661463379859924, + "learning_rate": 0.00029367231343508586, + "loss": 0.6327, + "step": 46490 + }, + { + "epoch": 1.0349893162393162, + "grad_norm": 0.7526752948760986, + "learning_rate": 0.00029363111842604694, + "loss": 0.5669, + "step": 46500 + }, + { + "epoch": 1.0352118945868947, + "grad_norm": 0.6870206594467163, + "learning_rate": 0.0002935899183290165, + "loss": 0.5794, + "step": 46510 + }, + { + "epoch": 1.0354344729344729, + "grad_norm": 0.8028680682182312, + "learning_rate": 0.0002935487131462331, + "loss": 0.6761, + "step": 46520 + }, + { + "epoch": 1.0356570512820513, + "grad_norm": 0.7931338548660278, + "learning_rate": 0.00029350750287993613, + "loss": 0.6181, + "step": 46530 + }, + { + "epoch": 1.0358796296296295, + "grad_norm": 0.3808395564556122, + "learning_rate": 0.00029346628753236493, + "loss": 0.6137, + "step": 46540 + }, + { + "epoch": 1.036102207977208, + "grad_norm": 0.8837310671806335, + "learning_rate": 0.00029342506710575904, + "loss": 0.5217, + "step": 46550 + }, + { + "epoch": 1.0363247863247864, + "grad_norm": 0.8158642649650574, + "learning_rate": 0.00029338384160235864, + "loss": 0.6392, + "step": 46560 + }, + { + "epoch": 1.0365473646723646, + "grad_norm": 0.7373179197311401, + "learning_rate": 0.0002933426110244038, + "loss": 0.6444, + "step": 46570 + }, + { + "epoch": 1.036769943019943, + "grad_norm": 0.46640434861183167, + "learning_rate": 0.00029330137537413514, + "loss": 0.5789, + "step": 46580 + }, + { + "epoch": 1.0369925213675213, + "grad_norm": 0.6358410716056824, + "learning_rate": 0.0002932601346537932, + "loss": 0.5697, + "step": 46590 + }, + { + "epoch": 1.0372150997150997, + "grad_norm": 0.7933520078659058, + "learning_rate": 0.00029321888886561933, + "loss": 0.4974, + "step": 46600 + }, + { + "epoch": 1.0374376780626782, + "grad_norm": 0.5213042497634888, + "learning_rate": 0.00029317763801185465, + "loss": 0.6363, + "step": 46610 + }, + { + "epoch": 1.0376602564102564, + "grad_norm": 0.5368013978004456, + "learning_rate": 0.0002931363820947409, + "loss": 0.5479, + "step": 46620 + }, + { + "epoch": 1.0378828347578348, + "grad_norm": 0.7525256872177124, + "learning_rate": 0.0002930951211165198, + "loss": 0.6095, + "step": 46630 + }, + { + "epoch": 1.038105413105413, + "grad_norm": 0.8799393177032471, + "learning_rate": 0.00029305385507943373, + "loss": 0.5226, + "step": 46640 + }, + { + "epoch": 1.0383279914529915, + "grad_norm": 0.597439706325531, + "learning_rate": 0.000293012583985725, + "loss": 0.6052, + "step": 46650 + }, + { + "epoch": 1.0385505698005697, + "grad_norm": 0.7384129762649536, + "learning_rate": 0.00029297130783763624, + "loss": 0.5461, + "step": 46660 + }, + { + "epoch": 1.0387731481481481, + "grad_norm": 0.7148140072822571, + "learning_rate": 0.00029293002663741054, + "loss": 0.4575, + "step": 46670 + }, + { + "epoch": 1.0389957264957266, + "grad_norm": 0.8531787991523743, + "learning_rate": 0.00029288874038729107, + "loss": 0.6255, + "step": 46680 + }, + { + "epoch": 1.0392183048433048, + "grad_norm": 0.6198742985725403, + "learning_rate": 0.0002928474490895214, + "loss": 0.4236, + "step": 46690 + }, + { + "epoch": 1.0394408831908832, + "grad_norm": 0.6757825016975403, + "learning_rate": 0.0002928061527463454, + "loss": 0.6227, + "step": 46700 + }, + { + "epoch": 1.0396634615384615, + "grad_norm": 0.7546913623809814, + "learning_rate": 0.00029276485136000706, + "loss": 0.5614, + "step": 46710 + }, + { + "epoch": 1.03988603988604, + "grad_norm": 0.7523819804191589, + "learning_rate": 0.0002927235449327508, + "loss": 0.4941, + "step": 46720 + }, + { + "epoch": 1.0401086182336183, + "grad_norm": 0.9511393308639526, + "learning_rate": 0.0002926822334668211, + "loss": 0.5301, + "step": 46730 + }, + { + "epoch": 1.0403311965811965, + "grad_norm": 0.8329669833183289, + "learning_rate": 0.00029264091696446306, + "loss": 0.6019, + "step": 46740 + }, + { + "epoch": 1.040553774928775, + "grad_norm": 0.6324247717857361, + "learning_rate": 0.0002925995954279217, + "loss": 0.5551, + "step": 46750 + }, + { + "epoch": 1.0407763532763532, + "grad_norm": 0.6515164375305176, + "learning_rate": 0.0002925582688594425, + "loss": 0.5437, + "step": 46760 + }, + { + "epoch": 1.0409989316239316, + "grad_norm": 0.4561392664909363, + "learning_rate": 0.0002925169372612713, + "loss": 0.4715, + "step": 46770 + }, + { + "epoch": 1.04122150997151, + "grad_norm": 0.6248541474342346, + "learning_rate": 0.0002924756006356539, + "loss": 0.6171, + "step": 46780 + }, + { + "epoch": 1.0414440883190883, + "grad_norm": 0.9043360352516174, + "learning_rate": 0.0002924342589848367, + "loss": 0.5258, + "step": 46790 + }, + { + "epoch": 1.0416666666666667, + "grad_norm": 0.8318607807159424, + "learning_rate": 0.0002923929123110661, + "loss": 0.5527, + "step": 46800 + }, + { + "epoch": 1.041889245014245, + "grad_norm": 0.764336347579956, + "learning_rate": 0.0002923515606165891, + "loss": 0.6689, + "step": 46810 + }, + { + "epoch": 1.0421118233618234, + "grad_norm": 0.5335542559623718, + "learning_rate": 0.0002923102039036527, + "loss": 0.424, + "step": 46820 + }, + { + "epoch": 1.0423344017094016, + "grad_norm": 0.6081898808479309, + "learning_rate": 0.0002922688421745042, + "loss": 0.6146, + "step": 46830 + }, + { + "epoch": 1.04255698005698, + "grad_norm": 0.5974134802818298, + "learning_rate": 0.00029222747543139135, + "loss": 0.5533, + "step": 46840 + }, + { + "epoch": 1.0427795584045585, + "grad_norm": 0.7155280709266663, + "learning_rate": 0.0002921861036765619, + "loss": 0.6214, + "step": 46850 + }, + { + "epoch": 1.0430021367521367, + "grad_norm": 0.5848572254180908, + "learning_rate": 0.0002921447269122642, + "loss": 0.6203, + "step": 46860 + }, + { + "epoch": 1.0432247150997151, + "grad_norm": 0.5690578818321228, + "learning_rate": 0.00029210334514074654, + "loss": 0.4135, + "step": 46870 + }, + { + "epoch": 1.0434472934472934, + "grad_norm": 0.8502477407455444, + "learning_rate": 0.00029206195836425767, + "loss": 0.6178, + "step": 46880 + }, + { + "epoch": 1.0436698717948718, + "grad_norm": 0.7801943421363831, + "learning_rate": 0.0002920205665850467, + "loss": 0.6523, + "step": 46890 + }, + { + "epoch": 1.0438924501424502, + "grad_norm": 0.48374879360198975, + "learning_rate": 0.00029197916980536274, + "loss": 0.4545, + "step": 46900 + }, + { + "epoch": 1.0441150284900285, + "grad_norm": 0.4702428877353668, + "learning_rate": 0.00029193776802745547, + "loss": 0.4453, + "step": 46910 + }, + { + "epoch": 1.044337606837607, + "grad_norm": 0.6450238227844238, + "learning_rate": 0.0002918963612535746, + "loss": 0.5475, + "step": 46920 + }, + { + "epoch": 1.0445601851851851, + "grad_norm": 0.596511721611023, + "learning_rate": 0.00029185494948597024, + "loss": 0.6097, + "step": 46930 + }, + { + "epoch": 1.0447827635327636, + "grad_norm": 0.6973013281822205, + "learning_rate": 0.0002918135327268927, + "loss": 0.6392, + "step": 46940 + }, + { + "epoch": 1.0450053418803418, + "grad_norm": 0.6602265238761902, + "learning_rate": 0.0002917721109785926, + "loss": 0.5507, + "step": 46950 + }, + { + "epoch": 1.0452279202279202, + "grad_norm": 0.6385603547096252, + "learning_rate": 0.00029173068424332094, + "loss": 0.623, + "step": 46960 + }, + { + "epoch": 1.0454504985754987, + "grad_norm": 0.49321964383125305, + "learning_rate": 0.0002916892525233288, + "loss": 0.5145, + "step": 46970 + }, + { + "epoch": 1.0456730769230769, + "grad_norm": 0.6037874817848206, + "learning_rate": 0.00029164781582086753, + "loss": 0.5974, + "step": 46980 + }, + { + "epoch": 1.0458956552706553, + "grad_norm": 0.5289360880851746, + "learning_rate": 0.000291606374138189, + "loss": 0.6204, + "step": 46990 + }, + { + "epoch": 1.0461182336182335, + "grad_norm": 1.165456771850586, + "learning_rate": 0.0002915649274775451, + "loss": 0.5946, + "step": 47000 + }, + { + "epoch": 1.046340811965812, + "grad_norm": 0.9170607924461365, + "learning_rate": 0.000291523475841188, + "loss": 0.678, + "step": 47010 + }, + { + "epoch": 1.0465633903133904, + "grad_norm": 0.6722274422645569, + "learning_rate": 0.0002914820192313704, + "loss": 0.6116, + "step": 47020 + }, + { + "epoch": 1.0467859686609686, + "grad_norm": 0.738879919052124, + "learning_rate": 0.0002914405576503449, + "loss": 0.4964, + "step": 47030 + }, + { + "epoch": 1.047008547008547, + "grad_norm": 0.9060617685317993, + "learning_rate": 0.0002913990911003647, + "loss": 0.5929, + "step": 47040 + }, + { + "epoch": 1.0472311253561253, + "grad_norm": 0.45788079500198364, + "learning_rate": 0.00029135761958368303, + "loss": 0.5966, + "step": 47050 + }, + { + "epoch": 1.0474537037037037, + "grad_norm": 1.0104951858520508, + "learning_rate": 0.00029131614310255353, + "loss": 0.5632, + "step": 47060 + }, + { + "epoch": 1.047676282051282, + "grad_norm": 0.4078928530216217, + "learning_rate": 0.0002912746616592301, + "loss": 0.5451, + "step": 47070 + }, + { + "epoch": 1.0478988603988604, + "grad_norm": 0.43406420946121216, + "learning_rate": 0.0002912331752559668, + "loss": 0.4626, + "step": 47080 + }, + { + "epoch": 1.0481214387464388, + "grad_norm": 0.7026646137237549, + "learning_rate": 0.00029119168389501803, + "loss": 0.6345, + "step": 47090 + }, + { + "epoch": 1.048344017094017, + "grad_norm": 0.4128137528896332, + "learning_rate": 0.0002911501875786386, + "loss": 0.5166, + "step": 47100 + }, + { + "epoch": 1.0485665954415955, + "grad_norm": 0.6436150670051575, + "learning_rate": 0.00029110868630908334, + "loss": 0.5672, + "step": 47110 + }, + { + "epoch": 1.0487891737891737, + "grad_norm": 0.7415764927864075, + "learning_rate": 0.00029106718008860743, + "loss": 0.5835, + "step": 47120 + }, + { + "epoch": 1.0490117521367521, + "grad_norm": 0.6365389227867126, + "learning_rate": 0.00029102566891946635, + "loss": 0.6038, + "step": 47130 + }, + { + "epoch": 1.0492343304843306, + "grad_norm": 0.8669440746307373, + "learning_rate": 0.000290984152803916, + "loss": 0.6079, + "step": 47140 + }, + { + "epoch": 1.0494569088319088, + "grad_norm": 0.6483818888664246, + "learning_rate": 0.0002909426317442123, + "loss": 0.5265, + "step": 47150 + }, + { + "epoch": 1.0496794871794872, + "grad_norm": 0.6815274953842163, + "learning_rate": 0.00029090110574261154, + "loss": 0.6666, + "step": 47160 + }, + { + "epoch": 1.0499020655270654, + "grad_norm": 0.9355290532112122, + "learning_rate": 0.0002908595748013702, + "loss": 0.6024, + "step": 47170 + }, + { + "epoch": 1.0501246438746439, + "grad_norm": 0.883792519569397, + "learning_rate": 0.00029081803892274527, + "loss": 0.5186, + "step": 47180 + }, + { + "epoch": 1.0503472222222223, + "grad_norm": 0.6974046230316162, + "learning_rate": 0.00029077649810899374, + "loss": 0.5896, + "step": 47190 + }, + { + "epoch": 1.0505698005698005, + "grad_norm": 0.6341229677200317, + "learning_rate": 0.000290734952362373, + "loss": 0.4812, + "step": 47200 + }, + { + "epoch": 1.050792378917379, + "grad_norm": 0.8992486596107483, + "learning_rate": 0.0002906934016851406, + "loss": 0.6442, + "step": 47210 + }, + { + "epoch": 1.0510149572649572, + "grad_norm": 0.526816725730896, + "learning_rate": 0.0002906518460795546, + "loss": 0.5308, + "step": 47220 + }, + { + "epoch": 1.0512375356125356, + "grad_norm": 0.8405412435531616, + "learning_rate": 0.00029061028554787306, + "loss": 0.5937, + "step": 47230 + }, + { + "epoch": 1.0514601139601139, + "grad_norm": 0.7325668334960938, + "learning_rate": 0.0002905687200923544, + "loss": 0.6085, + "step": 47240 + }, + { + "epoch": 1.0516826923076923, + "grad_norm": 0.8678027987480164, + "learning_rate": 0.00029052714971525734, + "loss": 0.5914, + "step": 47250 + }, + { + "epoch": 1.0519052706552707, + "grad_norm": 0.6910134553909302, + "learning_rate": 0.00029048557441884083, + "loss": 0.601, + "step": 47260 + }, + { + "epoch": 1.052127849002849, + "grad_norm": 0.8204662203788757, + "learning_rate": 0.0002904439942053641, + "loss": 0.6606, + "step": 47270 + }, + { + "epoch": 1.0523504273504274, + "grad_norm": 0.5311092734336853, + "learning_rate": 0.0002904024090770868, + "loss": 0.5108, + "step": 47280 + }, + { + "epoch": 1.0525730056980056, + "grad_norm": 0.7201509475708008, + "learning_rate": 0.0002903608190362685, + "loss": 0.5197, + "step": 47290 + }, + { + "epoch": 1.052795584045584, + "grad_norm": 0.7765344381332397, + "learning_rate": 0.0002903192240851694, + "loss": 0.5792, + "step": 47300 + }, + { + "epoch": 1.0530181623931625, + "grad_norm": 0.6695293188095093, + "learning_rate": 0.0002902776242260497, + "loss": 0.5312, + "step": 47310 + }, + { + "epoch": 1.0532407407407407, + "grad_norm": 0.703255295753479, + "learning_rate": 0.00029023601946116996, + "loss": 0.6347, + "step": 47320 + }, + { + "epoch": 1.0534633190883191, + "grad_norm": 0.7623192071914673, + "learning_rate": 0.0002901944097927911, + "loss": 0.6112, + "step": 47330 + }, + { + "epoch": 1.0536858974358974, + "grad_norm": 0.920707106590271, + "learning_rate": 0.00029015279522317405, + "loss": 0.5685, + "step": 47340 + }, + { + "epoch": 1.0539084757834758, + "grad_norm": 0.744606077671051, + "learning_rate": 0.00029011117575458045, + "loss": 0.4672, + "step": 47350 + }, + { + "epoch": 1.0541310541310542, + "grad_norm": 0.7063766121864319, + "learning_rate": 0.0002900695513892717, + "loss": 0.5412, + "step": 47360 + }, + { + "epoch": 1.0543536324786325, + "grad_norm": 0.6504448056221008, + "learning_rate": 0.00029002792212950984, + "loss": 0.6068, + "step": 47370 + }, + { + "epoch": 1.054576210826211, + "grad_norm": 0.7272416353225708, + "learning_rate": 0.000289986287977557, + "loss": 0.5946, + "step": 47380 + }, + { + "epoch": 1.054798789173789, + "grad_norm": 0.4593257009983063, + "learning_rate": 0.00028994464893567553, + "loss": 0.4715, + "step": 47390 + }, + { + "epoch": 1.0550213675213675, + "grad_norm": 1.023835301399231, + "learning_rate": 0.0002899030050061283, + "loss": 0.6391, + "step": 47400 + }, + { + "epoch": 1.0552439458689458, + "grad_norm": 0.6272522211074829, + "learning_rate": 0.0002898613561911781, + "loss": 0.5719, + "step": 47410 + }, + { + "epoch": 1.0554665242165242, + "grad_norm": 0.784984290599823, + "learning_rate": 0.0002898197024930883, + "loss": 0.5492, + "step": 47420 + }, + { + "epoch": 1.0556891025641026, + "grad_norm": 0.8112901449203491, + "learning_rate": 0.0002897780439141223, + "loss": 0.5022, + "step": 47430 + }, + { + "epoch": 1.0559116809116809, + "grad_norm": 0.8352168202400208, + "learning_rate": 0.00028973638045654395, + "loss": 0.6067, + "step": 47440 + }, + { + "epoch": 1.0561342592592593, + "grad_norm": 0.8020703792572021, + "learning_rate": 0.0002896947121226172, + "loss": 0.5609, + "step": 47450 + }, + { + "epoch": 1.0563568376068375, + "grad_norm": 0.6858905553817749, + "learning_rate": 0.00028965303891460636, + "loss": 0.5885, + "step": 47460 + }, + { + "epoch": 1.056579415954416, + "grad_norm": 0.8974461555480957, + "learning_rate": 0.0002896113608347759, + "loss": 0.5669, + "step": 47470 + }, + { + "epoch": 1.0568019943019944, + "grad_norm": 1.0188039541244507, + "learning_rate": 0.0002895696778853908, + "loss": 0.5348, + "step": 47480 + }, + { + "epoch": 1.0570245726495726, + "grad_norm": 0.798266589641571, + "learning_rate": 0.0002895279900687161, + "loss": 0.6305, + "step": 47490 + }, + { + "epoch": 1.057247150997151, + "grad_norm": 0.7277517318725586, + "learning_rate": 0.0002894862973870172, + "loss": 0.5724, + "step": 47500 + }, + { + "epoch": 1.0574697293447293, + "grad_norm": 0.862324059009552, + "learning_rate": 0.00028944459984255955, + "loss": 0.561, + "step": 47510 + }, + { + "epoch": 1.0576923076923077, + "grad_norm": 0.45276638865470886, + "learning_rate": 0.00028940289743760916, + "loss": 0.5345, + "step": 47520 + }, + { + "epoch": 1.0579148860398861, + "grad_norm": 0.6970518231391907, + "learning_rate": 0.0002893611901744321, + "loss": 0.4926, + "step": 47530 + }, + { + "epoch": 1.0581374643874644, + "grad_norm": 1.012525200843811, + "learning_rate": 0.0002893194780552948, + "loss": 0.5492, + "step": 47540 + }, + { + "epoch": 1.0583600427350428, + "grad_norm": 0.7472673654556274, + "learning_rate": 0.000289277761082464, + "loss": 0.5119, + "step": 47550 + }, + { + "epoch": 1.058582621082621, + "grad_norm": 0.8029112815856934, + "learning_rate": 0.00028923603925820656, + "loss": 0.5215, + "step": 47560 + }, + { + "epoch": 1.0588051994301995, + "grad_norm": 0.5627382397651672, + "learning_rate": 0.00028919431258478965, + "loss": 0.5293, + "step": 47570 + }, + { + "epoch": 1.0590277777777777, + "grad_norm": 0.8303996324539185, + "learning_rate": 0.0002891525810644808, + "loss": 0.5951, + "step": 47580 + }, + { + "epoch": 1.0592503561253561, + "grad_norm": 0.46132200956344604, + "learning_rate": 0.0002891108446995477, + "loss": 0.5242, + "step": 47590 + }, + { + "epoch": 1.0594729344729346, + "grad_norm": 0.5943209528923035, + "learning_rate": 0.0002890691034922584, + "loss": 0.5589, + "step": 47600 + }, + { + "epoch": 1.0596955128205128, + "grad_norm": 0.5154911875724792, + "learning_rate": 0.00028902735744488106, + "loss": 0.623, + "step": 47610 + }, + { + "epoch": 1.0599180911680912, + "grad_norm": 0.5615541338920593, + "learning_rate": 0.0002889856065596842, + "loss": 0.6751, + "step": 47620 + }, + { + "epoch": 1.0601406695156694, + "grad_norm": 1.0154298543930054, + "learning_rate": 0.00028894385083893674, + "loss": 0.7437, + "step": 47630 + }, + { + "epoch": 1.0603632478632479, + "grad_norm": 1.1956875324249268, + "learning_rate": 0.0002889020902849075, + "loss": 0.6716, + "step": 47640 + }, + { + "epoch": 1.0605858262108263, + "grad_norm": 0.6510722637176514, + "learning_rate": 0.00028886032489986596, + "loss": 0.5675, + "step": 47650 + }, + { + "epoch": 1.0608084045584045, + "grad_norm": 0.8667111992835999, + "learning_rate": 0.0002888185546860816, + "loss": 0.6497, + "step": 47660 + }, + { + "epoch": 1.061030982905983, + "grad_norm": 0.794508159160614, + "learning_rate": 0.0002887767796458243, + "loss": 0.6908, + "step": 47670 + }, + { + "epoch": 1.0612535612535612, + "grad_norm": 0.8232386112213135, + "learning_rate": 0.000288734999781364, + "loss": 0.619, + "step": 47680 + }, + { + "epoch": 1.0614761396011396, + "grad_norm": 0.3844764530658722, + "learning_rate": 0.0002886932150949713, + "loss": 0.6195, + "step": 47690 + }, + { + "epoch": 1.061698717948718, + "grad_norm": 0.6595999002456665, + "learning_rate": 0.0002886514255889167, + "loss": 0.5477, + "step": 47700 + }, + { + "epoch": 1.0619212962962963, + "grad_norm": 0.6769583225250244, + "learning_rate": 0.00028860963126547094, + "loss": 0.4758, + "step": 47710 + }, + { + "epoch": 1.0621438746438747, + "grad_norm": 0.5123394727706909, + "learning_rate": 0.00028856783212690535, + "loss": 0.5444, + "step": 47720 + }, + { + "epoch": 1.062366452991453, + "grad_norm": 0.6297634840011597, + "learning_rate": 0.00028852602817549123, + "loss": 0.5524, + "step": 47730 + }, + { + "epoch": 1.0625890313390314, + "grad_norm": 0.8499884009361267, + "learning_rate": 0.0002884842194135003, + "loss": 0.5867, + "step": 47740 + }, + { + "epoch": 1.0628116096866096, + "grad_norm": 0.6831232309341431, + "learning_rate": 0.00028844240584320445, + "loss": 0.7312, + "step": 47750 + }, + { + "epoch": 1.063034188034188, + "grad_norm": 0.534511387348175, + "learning_rate": 0.00028840058746687584, + "loss": 0.5455, + "step": 47760 + }, + { + "epoch": 1.0632567663817665, + "grad_norm": 0.6291297078132629, + "learning_rate": 0.000288358764286787, + "loss": 0.5431, + "step": 47770 + }, + { + "epoch": 1.0634793447293447, + "grad_norm": 0.3100387454032898, + "learning_rate": 0.0002883169363052105, + "loss": 0.6269, + "step": 47780 + }, + { + "epoch": 1.0637019230769231, + "grad_norm": 0.43622884154319763, + "learning_rate": 0.0002882751035244194, + "loss": 0.5198, + "step": 47790 + }, + { + "epoch": 1.0639245014245013, + "grad_norm": 0.6858329772949219, + "learning_rate": 0.00028823326594668697, + "loss": 0.6213, + "step": 47800 + }, + { + "epoch": 1.0641470797720798, + "grad_norm": 0.8117880821228027, + "learning_rate": 0.0002881914235742865, + "loss": 0.5976, + "step": 47810 + }, + { + "epoch": 1.064369658119658, + "grad_norm": 0.7009490132331848, + "learning_rate": 0.000288149576409492, + "loss": 0.5409, + "step": 47820 + }, + { + "epoch": 1.0645922364672364, + "grad_norm": 0.5884186029434204, + "learning_rate": 0.0002881077244545773, + "loss": 0.6015, + "step": 47830 + }, + { + "epoch": 1.0648148148148149, + "grad_norm": 0.8653995394706726, + "learning_rate": 0.0002880658677118168, + "loss": 0.5313, + "step": 47840 + }, + { + "epoch": 1.065037393162393, + "grad_norm": 0.6076104044914246, + "learning_rate": 0.0002880240061834849, + "loss": 0.5347, + "step": 47850 + }, + { + "epoch": 1.0652599715099715, + "grad_norm": 0.4640044569969177, + "learning_rate": 0.0002879821398718564, + "loss": 0.4839, + "step": 47860 + }, + { + "epoch": 1.0654825498575498, + "grad_norm": 0.9877360463142395, + "learning_rate": 0.0002879402687792064, + "loss": 0.5968, + "step": 47870 + }, + { + "epoch": 1.0657051282051282, + "grad_norm": 0.857296347618103, + "learning_rate": 0.00028789839290781026, + "loss": 0.6268, + "step": 47880 + }, + { + "epoch": 1.0659277065527066, + "grad_norm": 0.6341042518615723, + "learning_rate": 0.00028785651225994346, + "loss": 0.5405, + "step": 47890 + }, + { + "epoch": 1.0661502849002849, + "grad_norm": 0.7061439156532288, + "learning_rate": 0.00028781462683788185, + "loss": 0.5172, + "step": 47900 + }, + { + "epoch": 1.0663728632478633, + "grad_norm": 0.7179070115089417, + "learning_rate": 0.0002877727366439016, + "loss": 0.6257, + "step": 47910 + }, + { + "epoch": 1.0665954415954415, + "grad_norm": 0.6872360110282898, + "learning_rate": 0.0002877308416802789, + "loss": 0.5932, + "step": 47920 + }, + { + "epoch": 1.06681801994302, + "grad_norm": 0.5205060839653015, + "learning_rate": 0.00028768894194929046, + "loss": 0.5841, + "step": 47930 + }, + { + "epoch": 1.0670405982905984, + "grad_norm": 0.6895760893821716, + "learning_rate": 0.0002876470374532132, + "loss": 0.6624, + "step": 47940 + }, + { + "epoch": 1.0672631766381766, + "grad_norm": 0.7013905048370361, + "learning_rate": 0.0002876051281943241, + "loss": 0.5467, + "step": 47950 + }, + { + "epoch": 1.067485754985755, + "grad_norm": 0.5206983089447021, + "learning_rate": 0.00028756321417490064, + "loss": 0.599, + "step": 47960 + }, + { + "epoch": 1.0677083333333333, + "grad_norm": 0.6484621167182922, + "learning_rate": 0.0002875212953972204, + "loss": 0.5841, + "step": 47970 + }, + { + "epoch": 1.0679309116809117, + "grad_norm": 0.47715893387794495, + "learning_rate": 0.00028747937186356136, + "loss": 0.6958, + "step": 47980 + }, + { + "epoch": 1.06815349002849, + "grad_norm": 0.343030720949173, + "learning_rate": 0.00028743744357620163, + "loss": 0.5377, + "step": 47990 + }, + { + "epoch": 1.0683760683760684, + "grad_norm": 0.7047831416130066, + "learning_rate": 0.0002873955105374196, + "loss": 0.6889, + "step": 48000 + }, + { + "epoch": 1.0685986467236468, + "grad_norm": 0.8576478958129883, + "learning_rate": 0.00028735357274949406, + "loss": 0.5977, + "step": 48010 + }, + { + "epoch": 1.068821225071225, + "grad_norm": 0.8345176577568054, + "learning_rate": 0.0002873116302147039, + "loss": 0.6413, + "step": 48020 + }, + { + "epoch": 1.0690438034188035, + "grad_norm": 0.9953126311302185, + "learning_rate": 0.0002872696829353282, + "loss": 0.4842, + "step": 48030 + }, + { + "epoch": 1.0692663817663817, + "grad_norm": 0.7177563905715942, + "learning_rate": 0.0002872277309136464, + "loss": 0.6846, + "step": 48040 + }, + { + "epoch": 1.06948896011396, + "grad_norm": 0.37509727478027344, + "learning_rate": 0.00028718577415193843, + "loss": 0.5025, + "step": 48050 + }, + { + "epoch": 1.0697115384615385, + "grad_norm": 0.6414170861244202, + "learning_rate": 0.0002871438126524841, + "loss": 0.5031, + "step": 48060 + }, + { + "epoch": 1.0699341168091168, + "grad_norm": 0.6359707713127136, + "learning_rate": 0.0002871018464175636, + "loss": 0.5773, + "step": 48070 + }, + { + "epoch": 1.0701566951566952, + "grad_norm": 0.6460133194923401, + "learning_rate": 0.0002870598754494575, + "loss": 0.6274, + "step": 48080 + }, + { + "epoch": 1.0703792735042734, + "grad_norm": 0.5895810127258301, + "learning_rate": 0.0002870178997504465, + "loss": 0.5491, + "step": 48090 + }, + { + "epoch": 1.0706018518518519, + "grad_norm": 0.7487587332725525, + "learning_rate": 0.0002869759193228116, + "loss": 0.5297, + "step": 48100 + }, + { + "epoch": 1.0708244301994303, + "grad_norm": 1.058447003364563, + "learning_rate": 0.000286933934168834, + "loss": 0.7013, + "step": 48110 + }, + { + "epoch": 1.0710470085470085, + "grad_norm": 0.6013885140419006, + "learning_rate": 0.0002868919442907953, + "loss": 0.6068, + "step": 48120 + }, + { + "epoch": 1.071269586894587, + "grad_norm": 0.5251742601394653, + "learning_rate": 0.00028684994969097716, + "loss": 0.7068, + "step": 48130 + }, + { + "epoch": 1.0714921652421652, + "grad_norm": 0.5119776725769043, + "learning_rate": 0.00028680795037166166, + "loss": 0.6008, + "step": 48140 + }, + { + "epoch": 1.0717147435897436, + "grad_norm": 1.0093798637390137, + "learning_rate": 0.0002867659463351311, + "loss": 0.6564, + "step": 48150 + }, + { + "epoch": 1.0719373219373218, + "grad_norm": 0.8163377046585083, + "learning_rate": 0.000286723937583668, + "loss": 0.6341, + "step": 48160 + }, + { + "epoch": 1.0721599002849003, + "grad_norm": 0.7382004857063293, + "learning_rate": 0.00028668192411955513, + "loss": 0.5709, + "step": 48170 + }, + { + "epoch": 1.0723824786324787, + "grad_norm": 0.5771430730819702, + "learning_rate": 0.0002866399059450755, + "loss": 0.5574, + "step": 48180 + }, + { + "epoch": 1.072605056980057, + "grad_norm": 0.5379883646965027, + "learning_rate": 0.00028659788306251247, + "loss": 0.5961, + "step": 48190 + }, + { + "epoch": 1.0728276353276354, + "grad_norm": 0.404498815536499, + "learning_rate": 0.0002865558554741496, + "loss": 0.6153, + "step": 48200 + }, + { + "epoch": 1.0730502136752136, + "grad_norm": 0.7866203188896179, + "learning_rate": 0.0002865138231822706, + "loss": 0.6833, + "step": 48210 + }, + { + "epoch": 1.073272792022792, + "grad_norm": 0.8402442932128906, + "learning_rate": 0.0002864717861891598, + "loss": 0.6441, + "step": 48220 + }, + { + "epoch": 1.0734953703703705, + "grad_norm": 0.4625440239906311, + "learning_rate": 0.0002864297444971013, + "loss": 0.6017, + "step": 48230 + }, + { + "epoch": 1.0737179487179487, + "grad_norm": 0.6925306916236877, + "learning_rate": 0.0002863876981083796, + "loss": 0.5939, + "step": 48240 + }, + { + "epoch": 1.0739405270655271, + "grad_norm": 0.7206820249557495, + "learning_rate": 0.00028634564702527973, + "loss": 0.4961, + "step": 48250 + }, + { + "epoch": 1.0741631054131053, + "grad_norm": 0.9899659752845764, + "learning_rate": 0.00028630359125008677, + "loss": 0.6196, + "step": 48260 + }, + { + "epoch": 1.0743856837606838, + "grad_norm": 0.505004346370697, + "learning_rate": 0.00028626153078508597, + "loss": 0.5804, + "step": 48270 + }, + { + "epoch": 1.0746082621082622, + "grad_norm": 0.46326836943626404, + "learning_rate": 0.000286219465632563, + "loss": 0.6611, + "step": 48280 + }, + { + "epoch": 1.0748308404558404, + "grad_norm": 0.6721147298812866, + "learning_rate": 0.0002861773957948036, + "loss": 0.5854, + "step": 48290 + }, + { + "epoch": 1.0750534188034189, + "grad_norm": 0.8311693072319031, + "learning_rate": 0.0002861353212740941, + "loss": 0.5729, + "step": 48300 + }, + { + "epoch": 1.075275997150997, + "grad_norm": 0.47469931840896606, + "learning_rate": 0.0002860932420727206, + "loss": 0.5694, + "step": 48310 + }, + { + "epoch": 1.0754985754985755, + "grad_norm": 0.44514262676239014, + "learning_rate": 0.0002860511581929699, + "loss": 0.671, + "step": 48320 + }, + { + "epoch": 1.0757211538461537, + "grad_norm": 0.6041156649589539, + "learning_rate": 0.00028600906963712885, + "loss": 0.6614, + "step": 48330 + }, + { + "epoch": 1.0759437321937322, + "grad_norm": 0.5938594341278076, + "learning_rate": 0.00028596697640748445, + "loss": 0.4862, + "step": 48340 + }, + { + "epoch": 1.0761663105413106, + "grad_norm": 0.8382576704025269, + "learning_rate": 0.0002859248785063243, + "loss": 0.5732, + "step": 48350 + }, + { + "epoch": 1.0763888888888888, + "grad_norm": 0.6398228406906128, + "learning_rate": 0.0002858827759359358, + "loss": 0.519, + "step": 48360 + }, + { + "epoch": 1.0766114672364673, + "grad_norm": 1.0226026773452759, + "learning_rate": 0.00028584066869860705, + "loss": 0.4869, + "step": 48370 + }, + { + "epoch": 1.0768340455840455, + "grad_norm": 0.42375320196151733, + "learning_rate": 0.00028579855679662603, + "loss": 0.4687, + "step": 48380 + }, + { + "epoch": 1.077056623931624, + "grad_norm": 0.6117098331451416, + "learning_rate": 0.00028575644023228115, + "loss": 0.5195, + "step": 48390 + }, + { + "epoch": 1.0772792022792024, + "grad_norm": 0.6376365423202515, + "learning_rate": 0.0002857143190078612, + "loss": 0.4911, + "step": 48400 + }, + { + "epoch": 1.0775017806267806, + "grad_norm": 0.5959184169769287, + "learning_rate": 0.00028567219312565495, + "loss": 0.6362, + "step": 48410 + }, + { + "epoch": 1.077724358974359, + "grad_norm": 0.5974500775337219, + "learning_rate": 0.0002856300625879516, + "loss": 0.6091, + "step": 48420 + }, + { + "epoch": 1.0779469373219372, + "grad_norm": 0.685723602771759, + "learning_rate": 0.00028558792739704047, + "loss": 0.5275, + "step": 48430 + }, + { + "epoch": 1.0781695156695157, + "grad_norm": 0.7660711407661438, + "learning_rate": 0.00028554578755521137, + "loss": 0.6141, + "step": 48440 + }, + { + "epoch": 1.0783920940170941, + "grad_norm": 0.6586264967918396, + "learning_rate": 0.00028550364306475416, + "loss": 0.5741, + "step": 48450 + }, + { + "epoch": 1.0786146723646723, + "grad_norm": 0.6372463703155518, + "learning_rate": 0.0002854614939279589, + "loss": 0.5087, + "step": 48460 + }, + { + "epoch": 1.0788372507122508, + "grad_norm": 0.6863496899604797, + "learning_rate": 0.00028541934014711623, + "loss": 0.6393, + "step": 48470 + }, + { + "epoch": 1.079059829059829, + "grad_norm": 0.3809172511100769, + "learning_rate": 0.00028537718172451664, + "loss": 0.5958, + "step": 48480 + }, + { + "epoch": 1.0792824074074074, + "grad_norm": 0.7188693284988403, + "learning_rate": 0.00028533501866245104, + "loss": 0.4992, + "step": 48490 + }, + { + "epoch": 1.0795049857549857, + "grad_norm": 0.5671699643135071, + "learning_rate": 0.00028529285096321074, + "loss": 0.5711, + "step": 48500 + }, + { + "epoch": 1.079727564102564, + "grad_norm": 0.669846773147583, + "learning_rate": 0.0002852506786290871, + "loss": 0.5551, + "step": 48510 + }, + { + "epoch": 1.0799501424501425, + "grad_norm": 0.6436952352523804, + "learning_rate": 0.0002852085016623717, + "loss": 0.5843, + "step": 48520 + }, + { + "epoch": 1.080128205128205, + "eval_loss": 0.597112238407135, + "eval_runtime": 337.36, + "eval_samples_per_second": 7.01, + "eval_steps_per_second": 7.01, + "step": 48528 + }, + { + "epoch": 1.0801727207977208, + "grad_norm": 0.5778088569641113, + "learning_rate": 0.0002851663200653566, + "loss": 0.5245, + "step": 48530 + }, + { + "epoch": 1.0803952991452992, + "grad_norm": 0.6053064465522766, + "learning_rate": 0.000285124133840334, + "loss": 0.5655, + "step": 48540 + }, + { + "epoch": 1.0806178774928774, + "grad_norm": 0.7350863814353943, + "learning_rate": 0.0002850819429895963, + "loss": 0.5324, + "step": 48550 + }, + { + "epoch": 1.0808404558404558, + "grad_norm": 0.568662703037262, + "learning_rate": 0.0002850397475154361, + "loss": 0.586, + "step": 48560 + }, + { + "epoch": 1.0810630341880343, + "grad_norm": 0.5690873861312866, + "learning_rate": 0.00028499754742014637, + "loss": 0.4823, + "step": 48570 + }, + { + "epoch": 1.0812856125356125, + "grad_norm": 0.7732914686203003, + "learning_rate": 0.00028495534270602045, + "loss": 0.5866, + "step": 48580 + }, + { + "epoch": 1.081508190883191, + "grad_norm": 0.856427788734436, + "learning_rate": 0.00028491313337535154, + "loss": 0.6257, + "step": 48590 + }, + { + "epoch": 1.0817307692307692, + "grad_norm": 0.6734537482261658, + "learning_rate": 0.0002848709194304335, + "loss": 0.5378, + "step": 48600 + }, + { + "epoch": 1.0819533475783476, + "grad_norm": 0.6803973317146301, + "learning_rate": 0.00028482870087356024, + "loss": 0.4872, + "step": 48610 + }, + { + "epoch": 1.082175925925926, + "grad_norm": 0.6282123923301697, + "learning_rate": 0.00028478647770702593, + "loss": 0.5887, + "step": 48620 + }, + { + "epoch": 1.0823985042735043, + "grad_norm": 0.4611707627773285, + "learning_rate": 0.000284744249933125, + "loss": 0.5363, + "step": 48630 + }, + { + "epoch": 1.0826210826210827, + "grad_norm": 0.7379218339920044, + "learning_rate": 0.0002847020175541521, + "loss": 0.6323, + "step": 48640 + }, + { + "epoch": 1.082843660968661, + "grad_norm": 0.745467483997345, + "learning_rate": 0.00028465978057240233, + "loss": 0.5655, + "step": 48650 + }, + { + "epoch": 1.0830662393162394, + "grad_norm": 1.1102386713027954, + "learning_rate": 0.0002846175389901707, + "loss": 0.6453, + "step": 48660 + }, + { + "epoch": 1.0832888176638176, + "grad_norm": 0.4855097532272339, + "learning_rate": 0.0002845752928097527, + "loss": 0.4949, + "step": 48670 + }, + { + "epoch": 1.083511396011396, + "grad_norm": 0.6703089475631714, + "learning_rate": 0.00028453304203344417, + "loss": 0.6062, + "step": 48680 + }, + { + "epoch": 1.0837339743589745, + "grad_norm": 0.6551437377929688, + "learning_rate": 0.0002844907866635409, + "loss": 0.5481, + "step": 48690 + }, + { + "epoch": 1.0839565527065527, + "grad_norm": 0.49450287222862244, + "learning_rate": 0.00028444852670233905, + "loss": 0.712, + "step": 48700 + }, + { + "epoch": 1.084179131054131, + "grad_norm": 0.9869162440299988, + "learning_rate": 0.00028440626215213514, + "loss": 0.6178, + "step": 48710 + }, + { + "epoch": 1.0844017094017093, + "grad_norm": 0.7060110569000244, + "learning_rate": 0.0002843639930152259, + "loss": 0.5479, + "step": 48720 + }, + { + "epoch": 1.0846242877492878, + "grad_norm": 0.6050060391426086, + "learning_rate": 0.00028432171929390816, + "loss": 0.5437, + "step": 48730 + }, + { + "epoch": 1.084846866096866, + "grad_norm": 0.5769473314285278, + "learning_rate": 0.0002842794409904792, + "loss": 0.5704, + "step": 48740 + }, + { + "epoch": 1.0850694444444444, + "grad_norm": 0.6124515533447266, + "learning_rate": 0.00028423715810723646, + "loss": 0.4613, + "step": 48750 + }, + { + "epoch": 1.0852920227920229, + "grad_norm": 0.7483047842979431, + "learning_rate": 0.00028419487064647753, + "loss": 0.5731, + "step": 48760 + }, + { + "epoch": 1.085514601139601, + "grad_norm": 0.6910437345504761, + "learning_rate": 0.0002841525786105003, + "loss": 0.5796, + "step": 48770 + }, + { + "epoch": 1.0857371794871795, + "grad_norm": 0.9437000751495361, + "learning_rate": 0.00028411028200160324, + "loss": 0.6652, + "step": 48780 + }, + { + "epoch": 1.0859597578347577, + "grad_norm": 0.8151460886001587, + "learning_rate": 0.0002840679808220845, + "loss": 0.649, + "step": 48790 + }, + { + "epoch": 1.0861823361823362, + "grad_norm": 0.6039183735847473, + "learning_rate": 0.0002840256750742429, + "loss": 0.509, + "step": 48800 + }, + { + "epoch": 1.0864049145299146, + "grad_norm": 0.7109172344207764, + "learning_rate": 0.00028398336476037736, + "loss": 0.4449, + "step": 48810 + }, + { + "epoch": 1.0866274928774928, + "grad_norm": 0.5566388964653015, + "learning_rate": 0.000283941049882787, + "loss": 0.5998, + "step": 48820 + }, + { + "epoch": 1.0868500712250713, + "grad_norm": 0.5644212365150452, + "learning_rate": 0.00028389873044377126, + "loss": 0.5577, + "step": 48830 + }, + { + "epoch": 1.0870726495726495, + "grad_norm": 0.5297816395759583, + "learning_rate": 0.0002838564064456298, + "loss": 0.4875, + "step": 48840 + }, + { + "epoch": 1.087295227920228, + "grad_norm": 0.792813241481781, + "learning_rate": 0.0002838140778906626, + "loss": 0.6341, + "step": 48850 + }, + { + "epoch": 1.0875178062678064, + "grad_norm": 0.5224297642707825, + "learning_rate": 0.0002837717447811698, + "loss": 0.5436, + "step": 48860 + }, + { + "epoch": 1.0877403846153846, + "grad_norm": 0.7507627606391907, + "learning_rate": 0.0002837294071194518, + "loss": 0.531, + "step": 48870 + }, + { + "epoch": 1.087962962962963, + "grad_norm": 0.8729502558708191, + "learning_rate": 0.0002836870649078092, + "loss": 0.7076, + "step": 48880 + }, + { + "epoch": 1.0881855413105412, + "grad_norm": 0.6775315403938293, + "learning_rate": 0.00028364471814854307, + "loss": 0.5732, + "step": 48890 + }, + { + "epoch": 1.0884081196581197, + "grad_norm": 0.43340983986854553, + "learning_rate": 0.00028360236684395445, + "loss": 0.6266, + "step": 48900 + }, + { + "epoch": 1.088630698005698, + "grad_norm": 0.7650466561317444, + "learning_rate": 0.00028356001099634476, + "loss": 0.594, + "step": 48910 + }, + { + "epoch": 1.0888532763532763, + "grad_norm": 0.7358598709106445, + "learning_rate": 0.00028351765060801576, + "loss": 0.5908, + "step": 48920 + }, + { + "epoch": 1.0890758547008548, + "grad_norm": 0.688077986240387, + "learning_rate": 0.00028347528568126916, + "loss": 0.6863, + "step": 48930 + }, + { + "epoch": 1.089298433048433, + "grad_norm": 0.714925229549408, + "learning_rate": 0.00028343291621840726, + "loss": 0.6128, + "step": 48940 + }, + { + "epoch": 1.0895210113960114, + "grad_norm": 0.7040088176727295, + "learning_rate": 0.00028339054222173237, + "loss": 0.5468, + "step": 48950 + }, + { + "epoch": 1.0897435897435896, + "grad_norm": 0.5987093448638916, + "learning_rate": 0.00028334816369354716, + "loss": 0.704, + "step": 48960 + }, + { + "epoch": 1.089966168091168, + "grad_norm": 0.6418634653091431, + "learning_rate": 0.00028330578063615443, + "loss": 0.4999, + "step": 48970 + }, + { + "epoch": 1.0901887464387465, + "grad_norm": 0.6263144612312317, + "learning_rate": 0.0002832633930518574, + "loss": 0.5361, + "step": 48980 + }, + { + "epoch": 1.0904113247863247, + "grad_norm": 0.6335980296134949, + "learning_rate": 0.00028322100094295953, + "loss": 0.6292, + "step": 48990 + }, + { + "epoch": 1.0906339031339032, + "grad_norm": 0.8092513084411621, + "learning_rate": 0.0002831786043117643, + "loss": 0.6042, + "step": 49000 + }, + { + "epoch": 1.0908564814814814, + "grad_norm": 0.5754400491714478, + "learning_rate": 0.00028313620316057557, + "loss": 0.4429, + "step": 49010 + }, + { + "epoch": 1.0910790598290598, + "grad_norm": 0.5414375066757202, + "learning_rate": 0.0002830937974916975, + "loss": 0.5623, + "step": 49020 + }, + { + "epoch": 1.0913016381766383, + "grad_norm": 0.699462890625, + "learning_rate": 0.00028305138730743453, + "loss": 0.605, + "step": 49030 + }, + { + "epoch": 1.0915242165242165, + "grad_norm": 0.5149260759353638, + "learning_rate": 0.0002830089726100911, + "loss": 0.4677, + "step": 49040 + }, + { + "epoch": 1.091746794871795, + "grad_norm": 0.36808982491493225, + "learning_rate": 0.00028296655340197214, + "loss": 0.559, + "step": 49050 + }, + { + "epoch": 1.0919693732193732, + "grad_norm": 0.4984835386276245, + "learning_rate": 0.00028292412968538287, + "loss": 0.6101, + "step": 49060 + }, + { + "epoch": 1.0921919515669516, + "grad_norm": 0.5984585881233215, + "learning_rate": 0.0002828817014626284, + "loss": 0.5243, + "step": 49070 + }, + { + "epoch": 1.0924145299145298, + "grad_norm": 0.469533771276474, + "learning_rate": 0.0002828392687360144, + "loss": 0.5437, + "step": 49080 + }, + { + "epoch": 1.0926371082621082, + "grad_norm": 0.5558397173881531, + "learning_rate": 0.00028279683150784677, + "loss": 0.5478, + "step": 49090 + }, + { + "epoch": 1.0928596866096867, + "grad_norm": 0.6726979613304138, + "learning_rate": 0.00028275438978043153, + "loss": 0.4915, + "step": 49100 + }, + { + "epoch": 1.093082264957265, + "grad_norm": 0.8622997403144836, + "learning_rate": 0.000282711943556075, + "loss": 0.5943, + "step": 49110 + }, + { + "epoch": 1.0933048433048433, + "grad_norm": 0.43874213099479675, + "learning_rate": 0.00028266949283708374, + "loss": 0.5687, + "step": 49120 + }, + { + "epoch": 1.0935274216524216, + "grad_norm": 0.6451436281204224, + "learning_rate": 0.0002826270376257646, + "loss": 0.6372, + "step": 49130 + }, + { + "epoch": 1.09375, + "grad_norm": 0.7769515514373779, + "learning_rate": 0.0002825845779244246, + "loss": 0.6425, + "step": 49140 + }, + { + "epoch": 1.0939725783475784, + "grad_norm": 0.4474250078201294, + "learning_rate": 0.000282542113735371, + "loss": 0.6699, + "step": 49150 + }, + { + "epoch": 1.0941951566951567, + "grad_norm": 0.9048413634300232, + "learning_rate": 0.00028249964506091134, + "loss": 0.7362, + "step": 49160 + }, + { + "epoch": 1.094417735042735, + "grad_norm": 0.9284833669662476, + "learning_rate": 0.0002824571719033535, + "loss": 0.5964, + "step": 49170 + }, + { + "epoch": 1.0946403133903133, + "grad_norm": 0.45629414916038513, + "learning_rate": 0.00028241469426500533, + "loss": 0.502, + "step": 49180 + }, + { + "epoch": 1.0948628917378918, + "grad_norm": 0.6365179419517517, + "learning_rate": 0.00028237221214817525, + "loss": 0.5897, + "step": 49190 + }, + { + "epoch": 1.0950854700854702, + "grad_norm": 0.7972654104232788, + "learning_rate": 0.00028232972555517177, + "loss": 0.5398, + "step": 49200 + }, + { + "epoch": 1.0953080484330484, + "grad_norm": 0.6874790787696838, + "learning_rate": 0.0002822872344883036, + "loss": 0.6816, + "step": 49210 + }, + { + "epoch": 1.0955306267806268, + "grad_norm": 0.6386187672615051, + "learning_rate": 0.0002822447389498797, + "loss": 0.5956, + "step": 49220 + }, + { + "epoch": 1.095753205128205, + "grad_norm": 0.6826030611991882, + "learning_rate": 0.00028220223894220934, + "loss": 0.5611, + "step": 49230 + }, + { + "epoch": 1.0959757834757835, + "grad_norm": 0.8523663282394409, + "learning_rate": 0.0002821597344676021, + "loss": 0.5323, + "step": 49240 + }, + { + "epoch": 1.0961983618233617, + "grad_norm": 0.8153828382492065, + "learning_rate": 0.0002821172255283676, + "loss": 0.6641, + "step": 49250 + }, + { + "epoch": 1.0964209401709402, + "grad_norm": 0.638305127620697, + "learning_rate": 0.00028207471212681585, + "loss": 0.6519, + "step": 49260 + }, + { + "epoch": 1.0966435185185186, + "grad_norm": 0.4358106553554535, + "learning_rate": 0.000282032194265257, + "loss": 0.5503, + "step": 49270 + }, + { + "epoch": 1.0968660968660968, + "grad_norm": 0.6441114544868469, + "learning_rate": 0.0002819896719460016, + "loss": 0.4837, + "step": 49280 + }, + { + "epoch": 1.0970886752136753, + "grad_norm": 0.43398475646972656, + "learning_rate": 0.00028194714517136034, + "loss": 0.4673, + "step": 49290 + }, + { + "epoch": 1.0973112535612535, + "grad_norm": 0.6699478626251221, + "learning_rate": 0.00028190461394364405, + "loss": 0.611, + "step": 49300 + }, + { + "epoch": 1.097533831908832, + "grad_norm": 0.519451379776001, + "learning_rate": 0.00028186207826516406, + "loss": 0.5189, + "step": 49310 + }, + { + "epoch": 1.0977564102564104, + "grad_norm": 0.5254443883895874, + "learning_rate": 0.0002818195381382316, + "loss": 0.5701, + "step": 49320 + }, + { + "epoch": 1.0979789886039886, + "grad_norm": 0.6886885762214661, + "learning_rate": 0.0002817769935651586, + "loss": 0.4703, + "step": 49330 + }, + { + "epoch": 1.098201566951567, + "grad_norm": 0.7456334233283997, + "learning_rate": 0.0002817344445482568, + "loss": 0.6408, + "step": 49340 + }, + { + "epoch": 1.0984241452991452, + "grad_norm": 0.7715759873390198, + "learning_rate": 0.00028169189108983835, + "loss": 0.53, + "step": 49350 + }, + { + "epoch": 1.0986467236467237, + "grad_norm": 0.647322952747345, + "learning_rate": 0.0002816493331922156, + "loss": 0.5447, + "step": 49360 + }, + { + "epoch": 1.098869301994302, + "grad_norm": 0.6035053730010986, + "learning_rate": 0.0002816067708577013, + "loss": 0.5208, + "step": 49370 + }, + { + "epoch": 1.0990918803418803, + "grad_norm": 0.6071327328681946, + "learning_rate": 0.0002815642040886083, + "loss": 0.5828, + "step": 49380 + }, + { + "epoch": 1.0993144586894588, + "grad_norm": 1.0424257516860962, + "learning_rate": 0.0002815216328872496, + "loss": 0.5788, + "step": 49390 + }, + { + "epoch": 1.099537037037037, + "grad_norm": 0.7601616978645325, + "learning_rate": 0.0002814790572559387, + "loss": 0.5718, + "step": 49400 + }, + { + "epoch": 1.0997596153846154, + "grad_norm": 0.7542137503623962, + "learning_rate": 0.0002814364771969891, + "loss": 0.5238, + "step": 49410 + }, + { + "epoch": 1.0999821937321936, + "grad_norm": 0.7142971754074097, + "learning_rate": 0.0002813938927127147, + "loss": 0.5532, + "step": 49420 + }, + { + "epoch": 1.100204772079772, + "grad_norm": 0.8098645806312561, + "learning_rate": 0.00028135130380542943, + "loss": 0.5718, + "step": 49430 + }, + { + "epoch": 1.1004273504273505, + "grad_norm": 0.7008213400840759, + "learning_rate": 0.0002813087104774478, + "loss": 0.6006, + "step": 49440 + }, + { + "epoch": 1.1006499287749287, + "grad_norm": 0.7402480244636536, + "learning_rate": 0.0002812661127310843, + "loss": 0.5456, + "step": 49450 + }, + { + "epoch": 1.1008725071225072, + "grad_norm": 0.36383122205734253, + "learning_rate": 0.0002812235105686537, + "loss": 0.5996, + "step": 49460 + }, + { + "epoch": 1.1010950854700854, + "grad_norm": 0.5641146302223206, + "learning_rate": 0.00028118090399247096, + "loss": 0.5792, + "step": 49470 + }, + { + "epoch": 1.1013176638176638, + "grad_norm": 0.8477437496185303, + "learning_rate": 0.0002811382930048515, + "loss": 0.5853, + "step": 49480 + }, + { + "epoch": 1.1015402421652423, + "grad_norm": 0.6013820171356201, + "learning_rate": 0.0002810956776081108, + "loss": 0.5816, + "step": 49490 + }, + { + "epoch": 1.1017628205128205, + "grad_norm": 1.032570242881775, + "learning_rate": 0.00028105305780456454, + "loss": 0.5702, + "step": 49500 + }, + { + "epoch": 1.101985398860399, + "grad_norm": 0.7165135145187378, + "learning_rate": 0.00028101043359652874, + "loss": 0.6409, + "step": 49510 + }, + { + "epoch": 1.1022079772079771, + "grad_norm": 0.7688490152359009, + "learning_rate": 0.0002809678049863197, + "loss": 0.589, + "step": 49520 + }, + { + "epoch": 1.1024305555555556, + "grad_norm": 0.5828442573547363, + "learning_rate": 0.00028092517197625394, + "loss": 0.4621, + "step": 49530 + }, + { + "epoch": 1.102653133903134, + "grad_norm": 0.4801251292228699, + "learning_rate": 0.00028088253456864796, + "loss": 0.6135, + "step": 49540 + }, + { + "epoch": 1.1028757122507122, + "grad_norm": 0.5696801543235779, + "learning_rate": 0.00028083989276581886, + "loss": 0.6186, + "step": 49550 + }, + { + "epoch": 1.1030982905982907, + "grad_norm": 0.6546761989593506, + "learning_rate": 0.00028079724657008385, + "loss": 0.592, + "step": 49560 + }, + { + "epoch": 1.103320868945869, + "grad_norm": 0.6570590734481812, + "learning_rate": 0.0002807545959837603, + "loss": 0.6159, + "step": 49570 + }, + { + "epoch": 1.1035434472934473, + "grad_norm": 0.5510005354881287, + "learning_rate": 0.0002807119410091659, + "loss": 0.4846, + "step": 49580 + }, + { + "epoch": 1.1037660256410255, + "grad_norm": 0.841663122177124, + "learning_rate": 0.00028066928164861854, + "loss": 0.618, + "step": 49590 + }, + { + "epoch": 1.103988603988604, + "grad_norm": 0.5082978010177612, + "learning_rate": 0.0002806266179044364, + "loss": 0.6095, + "step": 49600 + }, + { + "epoch": 1.1042111823361824, + "grad_norm": 0.6725553274154663, + "learning_rate": 0.0002805839497789378, + "loss": 0.5768, + "step": 49610 + }, + { + "epoch": 1.1044337606837606, + "grad_norm": 0.5792170166969299, + "learning_rate": 0.0002805412772744414, + "loss": 0.6151, + "step": 49620 + }, + { + "epoch": 1.104656339031339, + "grad_norm": 0.6467113494873047, + "learning_rate": 0.0002804986003932661, + "loss": 0.5936, + "step": 49630 + }, + { + "epoch": 1.1048789173789173, + "grad_norm": 0.5447518229484558, + "learning_rate": 0.0002804559191377309, + "loss": 0.4788, + "step": 49640 + }, + { + "epoch": 1.1051014957264957, + "grad_norm": 0.6909335851669312, + "learning_rate": 0.00028041323351015525, + "loss": 0.5433, + "step": 49650 + }, + { + "epoch": 1.105324074074074, + "grad_norm": 0.6738542318344116, + "learning_rate": 0.00028037054351285864, + "loss": 0.4256, + "step": 49660 + }, + { + "epoch": 1.1055466524216524, + "grad_norm": 0.4187549352645874, + "learning_rate": 0.0002803278491481609, + "loss": 0.5291, + "step": 49670 + }, + { + "epoch": 1.1057692307692308, + "grad_norm": 0.5423128604888916, + "learning_rate": 0.00028028515041838203, + "loss": 0.5666, + "step": 49680 + }, + { + "epoch": 1.105991809116809, + "grad_norm": 0.41569629311561584, + "learning_rate": 0.00028024244732584243, + "loss": 0.625, + "step": 49690 + }, + { + "epoch": 1.1062143874643875, + "grad_norm": 3.5168192386627197, + "learning_rate": 0.00028019973987286254, + "loss": 0.8, + "step": 49700 + }, + { + "epoch": 1.1064369658119657, + "grad_norm": 0.8853825926780701, + "learning_rate": 0.00028015702806176315, + "loss": 0.4617, + "step": 49710 + }, + { + "epoch": 1.1066595441595442, + "grad_norm": 0.7445874214172363, + "learning_rate": 0.00028011431189486517, + "loss": 0.5553, + "step": 49720 + }, + { + "epoch": 1.1068821225071226, + "grad_norm": 1.18168044090271, + "learning_rate": 0.00028007159137448997, + "loss": 0.582, + "step": 49730 + }, + { + "epoch": 1.1071047008547008, + "grad_norm": 0.5334780216217041, + "learning_rate": 0.0002800288665029589, + "loss": 0.5465, + "step": 49740 + }, + { + "epoch": 1.1073272792022792, + "grad_norm": 0.9836109280586243, + "learning_rate": 0.00027998613728259374, + "loss": 0.6869, + "step": 49750 + }, + { + "epoch": 1.1075498575498575, + "grad_norm": 0.9165136218070984, + "learning_rate": 0.00027994340371571635, + "loss": 0.6017, + "step": 49760 + }, + { + "epoch": 1.107772435897436, + "grad_norm": 0.8517610430717468, + "learning_rate": 0.00027990066580464896, + "loss": 0.5321, + "step": 49770 + }, + { + "epoch": 1.1079950142450143, + "grad_norm": 0.6246629953384399, + "learning_rate": 0.00027985792355171406, + "loss": 0.5605, + "step": 49780 + }, + { + "epoch": 1.1082175925925926, + "grad_norm": 0.9364470839500427, + "learning_rate": 0.0002798151769592342, + "loss": 0.6157, + "step": 49790 + }, + { + "epoch": 1.108440170940171, + "grad_norm": 0.7243967652320862, + "learning_rate": 0.0002797724260295322, + "loss": 0.6616, + "step": 49800 + }, + { + "epoch": 1.1086627492877492, + "grad_norm": 0.9982474446296692, + "learning_rate": 0.00027972967076493133, + "loss": 0.5218, + "step": 49810 + }, + { + "epoch": 1.1088853276353277, + "grad_norm": 0.7943207621574402, + "learning_rate": 0.0002796869111677548, + "loss": 0.6087, + "step": 49820 + }, + { + "epoch": 1.1091079059829059, + "grad_norm": 1.0676311254501343, + "learning_rate": 0.0002796441472403264, + "loss": 0.6007, + "step": 49830 + }, + { + "epoch": 1.1093304843304843, + "grad_norm": 0.6319277882575989, + "learning_rate": 0.0002796013789849698, + "loss": 0.6302, + "step": 49840 + }, + { + "epoch": 1.1095530626780628, + "grad_norm": 0.6356170773506165, + "learning_rate": 0.0002795586064040091, + "loss": 0.583, + "step": 49850 + }, + { + "epoch": 1.109775641025641, + "grad_norm": 0.652543306350708, + "learning_rate": 0.00027951582949976855, + "loss": 0.5332, + "step": 49860 + }, + { + "epoch": 1.1099982193732194, + "grad_norm": 1.1238377094268799, + "learning_rate": 0.00027947304827457273, + "loss": 0.6134, + "step": 49870 + }, + { + "epoch": 1.1102207977207976, + "grad_norm": 0.459655225276947, + "learning_rate": 0.0002794302627307465, + "loss": 0.497, + "step": 49880 + }, + { + "epoch": 1.110443376068376, + "grad_norm": 0.7160477042198181, + "learning_rate": 0.0002793874728706147, + "loss": 0.5633, + "step": 49890 + }, + { + "epoch": 1.1106659544159545, + "grad_norm": 0.5809594988822937, + "learning_rate": 0.0002793446786965026, + "loss": 0.5299, + "step": 49900 + }, + { + "epoch": 1.1108885327635327, + "grad_norm": 0.36635035276412964, + "learning_rate": 0.0002793018802107358, + "loss": 0.4768, + "step": 49910 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.7522535920143127, + "learning_rate": 0.0002792590774156399, + "loss": 0.5788, + "step": 49920 + }, + { + "epoch": 1.1113336894586894, + "grad_norm": 0.687225878238678, + "learning_rate": 0.0002792162703135408, + "loss": 0.6284, + "step": 49930 + }, + { + "epoch": 1.1115562678062678, + "grad_norm": 0.766548216342926, + "learning_rate": 0.0002791734589067647, + "loss": 0.5067, + "step": 49940 + }, + { + "epoch": 1.1117788461538463, + "grad_norm": 0.6063271164894104, + "learning_rate": 0.00027913064319763805, + "loss": 0.6553, + "step": 49950 + }, + { + "epoch": 1.1120014245014245, + "grad_norm": 0.5717689990997314, + "learning_rate": 0.0002790878231884875, + "loss": 0.4677, + "step": 49960 + }, + { + "epoch": 1.112224002849003, + "grad_norm": 0.8794634342193604, + "learning_rate": 0.00027904499888163983, + "loss": 0.5599, + "step": 49970 + }, + { + "epoch": 1.1124465811965811, + "grad_norm": 0.7475899457931519, + "learning_rate": 0.0002790021702794223, + "loss": 0.5733, + "step": 49980 + }, + { + "epoch": 1.1126691595441596, + "grad_norm": 0.4899514615535736, + "learning_rate": 0.0002789593373841621, + "loss": 0.4454, + "step": 49990 + }, + { + "epoch": 1.1128917378917378, + "grad_norm": 0.5497984290122986, + "learning_rate": 0.0002789165001981869, + "loss": 0.5753, + "step": 50000 + }, + { + "epoch": 1.1131143162393162, + "grad_norm": 0.5439674854278564, + "learning_rate": 0.00027887365872382447, + "loss": 0.5884, + "step": 50010 + }, + { + "epoch": 1.1133368945868947, + "grad_norm": 0.4920559227466583, + "learning_rate": 0.00027883081296340285, + "loss": 0.4844, + "step": 50020 + }, + { + "epoch": 1.1135594729344729, + "grad_norm": 0.6336165070533752, + "learning_rate": 0.0002787879629192503, + "loss": 0.5899, + "step": 50030 + }, + { + "epoch": 1.1137820512820513, + "grad_norm": 0.42336025834083557, + "learning_rate": 0.00027874510859369534, + "loss": 0.4811, + "step": 50040 + }, + { + "epoch": 1.1140046296296295, + "grad_norm": 0.649666965007782, + "learning_rate": 0.0002787022499890668, + "loss": 0.5941, + "step": 50050 + }, + { + "epoch": 1.114227207977208, + "grad_norm": 0.6693488955497742, + "learning_rate": 0.0002786593871076935, + "loss": 0.5914, + "step": 50060 + }, + { + "epoch": 1.1144497863247864, + "grad_norm": 0.8287727236747742, + "learning_rate": 0.0002786165199519047, + "loss": 0.5427, + "step": 50070 + }, + { + "epoch": 1.1146723646723646, + "grad_norm": 0.9317525029182434, + "learning_rate": 0.0002785736485240299, + "loss": 0.4981, + "step": 50080 + }, + { + "epoch": 1.114894943019943, + "grad_norm": 0.4562414884567261, + "learning_rate": 0.00027853077282639867, + "loss": 0.536, + "step": 50090 + }, + { + "epoch": 1.1151175213675213, + "grad_norm": 0.7502874135971069, + "learning_rate": 0.0002784878928613409, + "loss": 0.6141, + "step": 50100 + }, + { + "epoch": 1.1153400997150997, + "grad_norm": 0.6522567868232727, + "learning_rate": 0.00027844500863118685, + "loss": 0.528, + "step": 50110 + }, + { + "epoch": 1.1155626780626782, + "grad_norm": 0.6541271209716797, + "learning_rate": 0.0002784021201382669, + "loss": 0.4591, + "step": 50120 + }, + { + "epoch": 1.1157852564102564, + "grad_norm": 0.48647746443748474, + "learning_rate": 0.0002783592273849114, + "loss": 0.5641, + "step": 50130 + }, + { + "epoch": 1.1160078347578348, + "grad_norm": 0.5013317465782166, + "learning_rate": 0.00027831633037345144, + "loss": 0.5116, + "step": 50140 + }, + { + "epoch": 1.116230413105413, + "grad_norm": 0.8025067448616028, + "learning_rate": 0.0002782734291062179, + "loss": 0.6091, + "step": 50150 + }, + { + "epoch": 1.1164529914529915, + "grad_norm": 0.8498494625091553, + "learning_rate": 0.0002782305235855422, + "loss": 0.5829, + "step": 50160 + }, + { + "epoch": 1.1166755698005697, + "grad_norm": 0.6301447749137878, + "learning_rate": 0.00027818761381375573, + "loss": 0.5449, + "step": 50170 + }, + { + "epoch": 1.1168981481481481, + "grad_norm": 0.3991430103778839, + "learning_rate": 0.00027814469979319043, + "loss": 0.6215, + "step": 50180 + }, + { + "epoch": 1.1171207264957266, + "grad_norm": 0.8412891626358032, + "learning_rate": 0.00027810178152617814, + "loss": 0.4809, + "step": 50190 + }, + { + "epoch": 1.1173433048433048, + "grad_norm": 0.7133082747459412, + "learning_rate": 0.00027805885901505107, + "loss": 0.4103, + "step": 50200 + }, + { + "epoch": 1.1175658831908832, + "grad_norm": 0.8361467719078064, + "learning_rate": 0.0002780159322621417, + "loss": 0.5977, + "step": 50210 + }, + { + "epoch": 1.1177884615384615, + "grad_norm": 0.7250277996063232, + "learning_rate": 0.0002779730012697827, + "loss": 0.6668, + "step": 50220 + }, + { + "epoch": 1.11801103988604, + "grad_norm": 0.5908347368240356, + "learning_rate": 0.00027793006604030703, + "loss": 0.5778, + "step": 50230 + }, + { + "epoch": 1.1182336182336183, + "grad_norm": 0.8042272925376892, + "learning_rate": 0.0002778871265760477, + "loss": 0.6422, + "step": 50240 + }, + { + "epoch": 1.1184561965811965, + "grad_norm": 0.867863655090332, + "learning_rate": 0.0002778441828793382, + "loss": 0.6648, + "step": 50250 + }, + { + "epoch": 1.118678774928775, + "grad_norm": 0.3790102005004883, + "learning_rate": 0.0002778012349525121, + "loss": 0.6024, + "step": 50260 + }, + { + "epoch": 1.1189013532763532, + "grad_norm": 0.6354236006736755, + "learning_rate": 0.00027775828279790317, + "loss": 0.5493, + "step": 50270 + }, + { + "epoch": 1.1191239316239316, + "grad_norm": 0.9611608386039734, + "learning_rate": 0.00027771532641784544, + "loss": 0.7362, + "step": 50280 + }, + { + "epoch": 1.11934650997151, + "grad_norm": 0.5919707417488098, + "learning_rate": 0.00027767236581467333, + "loss": 0.5445, + "step": 50290 + }, + { + "epoch": 1.1195690883190883, + "grad_norm": 0.6382510662078857, + "learning_rate": 0.00027762940099072125, + "loss": 0.6089, + "step": 50300 + }, + { + "epoch": 1.1197916666666667, + "grad_norm": 0.6243929862976074, + "learning_rate": 0.0002775864319483239, + "loss": 0.6145, + "step": 50310 + }, + { + "epoch": 1.120014245014245, + "grad_norm": 0.5520569086074829, + "learning_rate": 0.0002775434586898165, + "loss": 0.4577, + "step": 50320 + }, + { + "epoch": 1.1202368233618234, + "grad_norm": 0.5697688460350037, + "learning_rate": 0.00027750048121753394, + "loss": 0.4462, + "step": 50330 + }, + { + "epoch": 1.1204594017094016, + "grad_norm": 0.7084352374076843, + "learning_rate": 0.0002774574995338118, + "loss": 0.5839, + "step": 50340 + }, + { + "epoch": 1.12068198005698, + "grad_norm": 0.45827311277389526, + "learning_rate": 0.0002774145136409858, + "loss": 0.6047, + "step": 50350 + }, + { + "epoch": 1.1209045584045585, + "grad_norm": 0.7183460593223572, + "learning_rate": 0.00027737152354139166, + "loss": 0.5396, + "step": 50360 + }, + { + "epoch": 1.1211271367521367, + "grad_norm": 0.6099705696105957, + "learning_rate": 0.0002773285292373656, + "loss": 0.6081, + "step": 50370 + }, + { + "epoch": 1.1213497150997151, + "grad_norm": 0.8301957845687866, + "learning_rate": 0.00027728553073124405, + "loss": 0.5672, + "step": 50380 + }, + { + "epoch": 1.1215722934472934, + "grad_norm": 0.6909571886062622, + "learning_rate": 0.00027724252802536337, + "loss": 0.4832, + "step": 50390 + }, + { + "epoch": 1.1217948717948718, + "grad_norm": 0.5145627856254578, + "learning_rate": 0.00027719952112206054, + "loss": 0.5603, + "step": 50400 + }, + { + "epoch": 1.1220174501424502, + "grad_norm": 0.7964401841163635, + "learning_rate": 0.0002771565100236726, + "loss": 0.6, + "step": 50410 + }, + { + "epoch": 1.1222400284900285, + "grad_norm": 0.4715334475040436, + "learning_rate": 0.00027711349473253657, + "loss": 0.4057, + "step": 50420 + }, + { + "epoch": 1.122462606837607, + "grad_norm": 0.6473780870437622, + "learning_rate": 0.0002770704752509903, + "loss": 0.5549, + "step": 50430 + }, + { + "epoch": 1.1226851851851851, + "grad_norm": 0.5076059699058533, + "learning_rate": 0.00027702745158137115, + "loss": 0.6064, + "step": 50440 + }, + { + "epoch": 1.1229077635327636, + "grad_norm": 0.8316875100135803, + "learning_rate": 0.00027698442372601736, + "loss": 0.5, + "step": 50450 + }, + { + "epoch": 1.123130341880342, + "grad_norm": 0.718682587146759, + "learning_rate": 0.0002769413916872669, + "loss": 0.6023, + "step": 50460 + }, + { + "epoch": 1.1233529202279202, + "grad_norm": 0.7767266631126404, + "learning_rate": 0.00027689835546745823, + "loss": 0.5666, + "step": 50470 + }, + { + "epoch": 1.1235754985754987, + "grad_norm": 0.5583837628364563, + "learning_rate": 0.00027685531506892993, + "loss": 0.57, + "step": 50480 + }, + { + "epoch": 1.1237980769230769, + "grad_norm": 0.6625356674194336, + "learning_rate": 0.00027681227049402093, + "loss": 0.4804, + "step": 50490 + }, + { + "epoch": 1.1240206552706553, + "grad_norm": 0.42987099289894104, + "learning_rate": 0.0002767692217450703, + "loss": 0.4838, + "step": 50500 + }, + { + "epoch": 1.1242432336182335, + "grad_norm": 0.4361467957496643, + "learning_rate": 0.00027672616882441726, + "loss": 0.6805, + "step": 50510 + }, + { + "epoch": 1.124465811965812, + "grad_norm": 0.7139217257499695, + "learning_rate": 0.00027668311173440147, + "loss": 0.5663, + "step": 50520 + }, + { + "epoch": 1.1246883903133904, + "grad_norm": 0.7087649703025818, + "learning_rate": 0.0002766400504773625, + "loss": 0.5915, + "step": 50530 + }, + { + "epoch": 1.1249109686609686, + "grad_norm": 0.6644691824913025, + "learning_rate": 0.00027659698505564056, + "loss": 0.5914, + "step": 50540 + }, + { + "epoch": 1.125133547008547, + "grad_norm": 0.6349228620529175, + "learning_rate": 0.0002765539154715757, + "loss": 0.4877, + "step": 50550 + }, + { + "epoch": 1.1253561253561253, + "grad_norm": 0.710770308971405, + "learning_rate": 0.0002765108417275084, + "loss": 0.4708, + "step": 50560 + }, + { + "epoch": 1.1255787037037037, + "grad_norm": 0.699897825717926, + "learning_rate": 0.00027646776382577934, + "loss": 0.7282, + "step": 50570 + }, + { + "epoch": 1.125801282051282, + "grad_norm": 0.6822852492332458, + "learning_rate": 0.0002764246817687294, + "loss": 0.6721, + "step": 50580 + }, + { + "epoch": 1.1260238603988604, + "grad_norm": 0.6469874978065491, + "learning_rate": 0.00027638159555869966, + "loss": 0.6107, + "step": 50590 + }, + { + "epoch": 1.1262464387464388, + "grad_norm": 0.5671929121017456, + "learning_rate": 0.00027633850519803146, + "loss": 0.5588, + "step": 50600 + }, + { + "epoch": 1.126469017094017, + "grad_norm": 0.6890249848365784, + "learning_rate": 0.00027629541068906644, + "loss": 0.5867, + "step": 50610 + }, + { + "epoch": 1.1266915954415955, + "grad_norm": 0.6389486789703369, + "learning_rate": 0.0002762523120341463, + "loss": 0.5792, + "step": 50620 + }, + { + "epoch": 1.126914173789174, + "grad_norm": 1.1844886541366577, + "learning_rate": 0.0002762092092356131, + "loss": 0.6745, + "step": 50630 + }, + { + "epoch": 1.1271367521367521, + "grad_norm": 0.6294887065887451, + "learning_rate": 0.0002761661022958092, + "loss": 0.6385, + "step": 50640 + }, + { + "epoch": 1.1273593304843306, + "grad_norm": 0.6752757430076599, + "learning_rate": 0.00027612299121707685, + "loss": 0.5475, + "step": 50650 + }, + { + "epoch": 1.1275819088319088, + "grad_norm": 1.0090526342391968, + "learning_rate": 0.0002760798760017589, + "loss": 0.5784, + "step": 50660 + }, + { + "epoch": 1.1278044871794872, + "grad_norm": 0.7660955786705017, + "learning_rate": 0.0002760367566521981, + "loss": 0.5528, + "step": 50670 + }, + { + "epoch": 1.1280270655270654, + "grad_norm": 0.6590836644172668, + "learning_rate": 0.0002759936331707378, + "loss": 0.692, + "step": 50680 + }, + { + "epoch": 1.1282496438746439, + "grad_norm": 0.8564108610153198, + "learning_rate": 0.0002759505055597212, + "loss": 0.5678, + "step": 50690 + }, + { + "epoch": 1.1284722222222223, + "grad_norm": 0.587363600730896, + "learning_rate": 0.000275907373821492, + "loss": 0.5068, + "step": 50700 + }, + { + "epoch": 1.1286948005698005, + "grad_norm": 0.3763004541397095, + "learning_rate": 0.00027586423795839394, + "loss": 0.5296, + "step": 50710 + }, + { + "epoch": 1.128917378917379, + "grad_norm": 0.8793259859085083, + "learning_rate": 0.0002758210979727711, + "loss": 0.6651, + "step": 50720 + }, + { + "epoch": 1.1291399572649572, + "grad_norm": 0.8572914600372314, + "learning_rate": 0.0002757779538669677, + "loss": 0.5349, + "step": 50730 + }, + { + "epoch": 1.1293625356125356, + "grad_norm": 0.5334573984146118, + "learning_rate": 0.0002757348056433282, + "loss": 0.4848, + "step": 50740 + }, + { + "epoch": 1.1295851139601139, + "grad_norm": 0.5879189968109131, + "learning_rate": 0.0002756916533041975, + "loss": 0.5391, + "step": 50750 + }, + { + "epoch": 1.1298076923076923, + "grad_norm": 0.7693625688552856, + "learning_rate": 0.0002756484968519203, + "loss": 0.6844, + "step": 50760 + }, + { + "epoch": 1.1300302706552707, + "grad_norm": 0.9467472434043884, + "learning_rate": 0.0002756053362888419, + "loss": 0.655, + "step": 50770 + }, + { + "epoch": 1.130252849002849, + "grad_norm": 0.684518575668335, + "learning_rate": 0.00027556217161730763, + "loss": 0.5942, + "step": 50780 + }, + { + "epoch": 1.1304754273504274, + "grad_norm": 1.1190756559371948, + "learning_rate": 0.0002755190028396631, + "loss": 0.5974, + "step": 50790 + }, + { + "epoch": 1.1306980056980056, + "grad_norm": 0.7289356589317322, + "learning_rate": 0.00027547582995825405, + "loss": 0.7014, + "step": 50800 + }, + { + "epoch": 1.130920584045584, + "grad_norm": 0.8716086149215698, + "learning_rate": 0.00027543265297542665, + "loss": 0.6442, + "step": 50810 + }, + { + "epoch": 1.1311431623931625, + "grad_norm": 0.5718209743499756, + "learning_rate": 0.0002753894718935272, + "loss": 0.5779, + "step": 50820 + }, + { + "epoch": 1.1313657407407407, + "grad_norm": 0.6775786876678467, + "learning_rate": 0.0002753462867149021, + "loss": 0.6441, + "step": 50830 + }, + { + "epoch": 1.1315883190883191, + "grad_norm": 0.35854214429855347, + "learning_rate": 0.00027530309744189805, + "loss": 0.5515, + "step": 50840 + }, + { + "epoch": 1.1318108974358974, + "grad_norm": 0.3995623290538788, + "learning_rate": 0.00027525990407686207, + "loss": 0.5819, + "step": 50850 + }, + { + "epoch": 1.1320334757834758, + "grad_norm": 0.6870381832122803, + "learning_rate": 0.0002752167066221413, + "loss": 0.6606, + "step": 50860 + }, + { + "epoch": 1.1322560541310542, + "grad_norm": 0.9938977360725403, + "learning_rate": 0.00027517350508008315, + "loss": 0.6918, + "step": 50870 + }, + { + "epoch": 1.1324786324786325, + "grad_norm": 0.5590777397155762, + "learning_rate": 0.0002751302994530351, + "loss": 0.6583, + "step": 50880 + }, + { + "epoch": 1.132701210826211, + "grad_norm": 0.6947663426399231, + "learning_rate": 0.0002750870897433451, + "loss": 0.4942, + "step": 50890 + }, + { + "epoch": 1.132923789173789, + "grad_norm": 0.45011070370674133, + "learning_rate": 0.0002750438759533612, + "loss": 0.4522, + "step": 50900 + }, + { + "epoch": 1.1331463675213675, + "grad_norm": 2.3322110176086426, + "learning_rate": 0.0002750006580854317, + "loss": 0.4981, + "step": 50910 + }, + { + "epoch": 1.1333689458689458, + "grad_norm": 0.560584545135498, + "learning_rate": 0.00027495743614190497, + "loss": 0.6019, + "step": 50920 + }, + { + "epoch": 1.1335915242165242, + "grad_norm": 0.6565274596214294, + "learning_rate": 0.0002749142101251299, + "loss": 0.7166, + "step": 50930 + }, + { + "epoch": 1.1338141025641026, + "grad_norm": 0.7385024428367615, + "learning_rate": 0.00027487098003745514, + "loss": 0.5216, + "step": 50940 + }, + { + "epoch": 1.1340366809116809, + "grad_norm": 0.5504783391952515, + "learning_rate": 0.00027482774588123016, + "loss": 0.5478, + "step": 50950 + }, + { + "epoch": 1.1342592592592593, + "grad_norm": 0.6359137892723083, + "learning_rate": 0.00027478450765880424, + "loss": 0.6607, + "step": 50960 + }, + { + "epoch": 1.1344818376068375, + "grad_norm": 0.6465204954147339, + "learning_rate": 0.0002747412653725269, + "loss": 0.6833, + "step": 50970 + }, + { + "epoch": 1.134704415954416, + "grad_norm": 0.6576072573661804, + "learning_rate": 0.000274698019024748, + "loss": 0.5211, + "step": 50980 + }, + { + "epoch": 1.1349269943019944, + "grad_norm": 0.6555946469306946, + "learning_rate": 0.0002746547686178176, + "loss": 0.5938, + "step": 50990 + }, + { + "epoch": 1.1351495726495726, + "grad_norm": 0.5931433439254761, + "learning_rate": 0.00027461151415408597, + "loss": 0.4851, + "step": 51000 + }, + { + "epoch": 1.135372150997151, + "grad_norm": 0.48684531450271606, + "learning_rate": 0.00027456825563590355, + "loss": 0.4995, + "step": 51010 + }, + { + "epoch": 1.1355947293447293, + "grad_norm": 0.813859760761261, + "learning_rate": 0.00027452499306562106, + "loss": 0.5633, + "step": 51020 + }, + { + "epoch": 1.1358173076923077, + "grad_norm": 0.42580199241638184, + "learning_rate": 0.00027448172644558953, + "loss": 0.483, + "step": 51030 + }, + { + "epoch": 1.1360398860398861, + "grad_norm": 0.6813814043998718, + "learning_rate": 0.00027443845577816, + "loss": 0.4505, + "step": 51040 + }, + { + "epoch": 1.1362624643874644, + "grad_norm": 1.0832009315490723, + "learning_rate": 0.0002743951810656838, + "loss": 0.5776, + "step": 51050 + }, + { + "epoch": 1.1364850427350428, + "grad_norm": 0.6177014112472534, + "learning_rate": 0.0002743519023105125, + "loss": 0.6503, + "step": 51060 + }, + { + "epoch": 1.136707621082621, + "grad_norm": 0.8180685043334961, + "learning_rate": 0.0002743086195149981, + "loss": 0.6229, + "step": 51070 + }, + { + "epoch": 1.1369301994301995, + "grad_norm": 0.8289804458618164, + "learning_rate": 0.00027426533268149237, + "loss": 0.6538, + "step": 51080 + }, + { + "epoch": 1.1371527777777777, + "grad_norm": 0.7466900944709778, + "learning_rate": 0.00027422204181234766, + "loss": 0.6753, + "step": 51090 + }, + { + "epoch": 1.1373753561253561, + "grad_norm": 0.7393838763237, + "learning_rate": 0.00027417874690991654, + "loss": 0.5579, + "step": 51100 + }, + { + "epoch": 1.1375979344729346, + "grad_norm": 0.4931185841560364, + "learning_rate": 0.00027413544797655153, + "loss": 0.5762, + "step": 51110 + }, + { + "epoch": 1.1378205128205128, + "grad_norm": 0.6497383117675781, + "learning_rate": 0.0002740921450146056, + "loss": 0.3733, + "step": 51120 + }, + { + "epoch": 1.1380430911680912, + "grad_norm": 0.9239633679389954, + "learning_rate": 0.0002740488380264318, + "loss": 0.5591, + "step": 51130 + }, + { + "epoch": 1.1382656695156694, + "grad_norm": 0.6807590126991272, + "learning_rate": 0.0002740055270143836, + "loss": 0.5626, + "step": 51140 + }, + { + "epoch": 1.1384882478632479, + "grad_norm": 0.8736845254898071, + "learning_rate": 0.0002739622119808144, + "loss": 0.6351, + "step": 51150 + }, + { + "epoch": 1.138710826210826, + "grad_norm": 0.49122872948646545, + "learning_rate": 0.0002739188929280781, + "loss": 0.5252, + "step": 51160 + }, + { + "epoch": 1.1389334045584045, + "grad_norm": 0.5275121331214905, + "learning_rate": 0.00027387556985852867, + "loss": 0.5161, + "step": 51170 + }, + { + "epoch": 1.139155982905983, + "grad_norm": 0.8667572140693665, + "learning_rate": 0.00027383224277452027, + "loss": 0.617, + "step": 51180 + }, + { + "epoch": 1.1393785612535612, + "grad_norm": 0.885334849357605, + "learning_rate": 0.0002737889116784073, + "loss": 0.5347, + "step": 51190 + }, + { + "epoch": 1.1396011396011396, + "grad_norm": 0.5012974739074707, + "learning_rate": 0.0002737455765725445, + "loss": 0.6395, + "step": 51200 + }, + { + "epoch": 1.139823717948718, + "grad_norm": 1.1351536512374878, + "learning_rate": 0.00027370223745928673, + "loss": 0.5427, + "step": 51210 + }, + { + "epoch": 1.1400462962962963, + "grad_norm": 0.4724816381931305, + "learning_rate": 0.000273658894340989, + "loss": 0.5503, + "step": 51220 + }, + { + "epoch": 1.1401353276353277, + "eval_loss": 0.5929179787635803, + "eval_runtime": 337.1971, + "eval_samples_per_second": 7.014, + "eval_steps_per_second": 7.014, + "step": 51224 + }, + { + "epoch": 1.1402688746438747, + "grad_norm": 0.9337814450263977, + "learning_rate": 0.0002736155472200067, + "loss": 0.7102, + "step": 51230 + }, + { + "epoch": 1.140491452991453, + "grad_norm": 0.6762569546699524, + "learning_rate": 0.0002735721960986953, + "loss": 0.5636, + "step": 51240 + }, + { + "epoch": 1.1407140313390314, + "grad_norm": 0.6992948651313782, + "learning_rate": 0.0002735288409794105, + "loss": 0.623, + "step": 51250 + }, + { + "epoch": 1.1409366096866096, + "grad_norm": 0.6342442631721497, + "learning_rate": 0.00027348548186450827, + "loss": 0.5408, + "step": 51260 + }, + { + "epoch": 1.141159188034188, + "grad_norm": 0.5864853262901306, + "learning_rate": 0.0002734421187563448, + "loss": 0.5168, + "step": 51270 + }, + { + "epoch": 1.1413817663817665, + "grad_norm": 0.5342767834663391, + "learning_rate": 0.00027339875165727657, + "loss": 0.6507, + "step": 51280 + }, + { + "epoch": 1.1416043447293447, + "grad_norm": 0.7018928527832031, + "learning_rate": 0.00027335538056966, + "loss": 0.7081, + "step": 51290 + }, + { + "epoch": 1.1418269230769231, + "grad_norm": 0.7439033389091492, + "learning_rate": 0.000273312005495852, + "loss": 0.5094, + "step": 51300 + }, + { + "epoch": 1.1420495014245013, + "grad_norm": 0.7723569273948669, + "learning_rate": 0.00027326862643820964, + "loss": 0.5982, + "step": 51310 + }, + { + "epoch": 1.1422720797720798, + "grad_norm": 0.4509701132774353, + "learning_rate": 0.00027322524339909015, + "loss": 0.4447, + "step": 51320 + }, + { + "epoch": 1.142494658119658, + "grad_norm": 0.5816810131072998, + "learning_rate": 0.00027318185638085094, + "loss": 0.5588, + "step": 51330 + }, + { + "epoch": 1.1427172364672364, + "grad_norm": 0.7561889290809631, + "learning_rate": 0.00027313846538584973, + "loss": 0.6732, + "step": 51340 + }, + { + "epoch": 1.1429398148148149, + "grad_norm": 0.8186838030815125, + "learning_rate": 0.0002730950704164445, + "loss": 0.4714, + "step": 51350 + }, + { + "epoch": 1.143162393162393, + "grad_norm": 1.4625139236450195, + "learning_rate": 0.00027305167147499324, + "loss": 0.576, + "step": 51360 + }, + { + "epoch": 1.1433849715099715, + "grad_norm": 0.896793007850647, + "learning_rate": 0.0002730082685638544, + "loss": 0.5008, + "step": 51370 + }, + { + "epoch": 1.14360754985755, + "grad_norm": 0.9344685077667236, + "learning_rate": 0.00027296486168538645, + "loss": 0.4954, + "step": 51380 + }, + { + "epoch": 1.1438301282051282, + "grad_norm": 0.6186417937278748, + "learning_rate": 0.00027292145084194827, + "loss": 0.5856, + "step": 51390 + }, + { + "epoch": 1.1440527065527066, + "grad_norm": 1.0679600238800049, + "learning_rate": 0.00027287803603589866, + "loss": 0.5139, + "step": 51400 + }, + { + "epoch": 1.1442752849002849, + "grad_norm": 0.4196697771549225, + "learning_rate": 0.0002728346172695969, + "loss": 0.443, + "step": 51410 + }, + { + "epoch": 1.1444978632478633, + "grad_norm": 0.5441080927848816, + "learning_rate": 0.00027279119454540245, + "loss": 0.4916, + "step": 51420 + }, + { + "epoch": 1.1447204415954415, + "grad_norm": 0.919732928276062, + "learning_rate": 0.0002727477678656749, + "loss": 0.5201, + "step": 51430 + }, + { + "epoch": 1.14494301994302, + "grad_norm": 0.7137250900268555, + "learning_rate": 0.00027270433723277406, + "loss": 0.6276, + "step": 51440 + }, + { + "epoch": 1.1451655982905984, + "grad_norm": 0.535349428653717, + "learning_rate": 0.0002726609026490601, + "loss": 0.5404, + "step": 51450 + }, + { + "epoch": 1.1453881766381766, + "grad_norm": 0.6664863228797913, + "learning_rate": 0.00027261746411689315, + "loss": 0.4525, + "step": 51460 + }, + { + "epoch": 1.145610754985755, + "grad_norm": 0.5625571012496948, + "learning_rate": 0.00027257402163863374, + "loss": 0.5836, + "step": 51470 + }, + { + "epoch": 1.1458333333333333, + "grad_norm": 0.6733038425445557, + "learning_rate": 0.0002725305752166426, + "loss": 0.564, + "step": 51480 + }, + { + "epoch": 1.1460559116809117, + "grad_norm": 0.7813240885734558, + "learning_rate": 0.0002724871248532806, + "loss": 0.4939, + "step": 51490 + }, + { + "epoch": 1.14627849002849, + "grad_norm": 0.7392660975456238, + "learning_rate": 0.00027244367055090894, + "loss": 0.7043, + "step": 51500 + }, + { + "epoch": 1.1465010683760684, + "grad_norm": 0.4674152731895447, + "learning_rate": 0.00027240021231188883, + "loss": 0.5184, + "step": 51510 + }, + { + "epoch": 1.1467236467236468, + "grad_norm": 0.656161367893219, + "learning_rate": 0.000272356750138582, + "loss": 0.6081, + "step": 51520 + }, + { + "epoch": 1.146946225071225, + "grad_norm": 0.6295078992843628, + "learning_rate": 0.0002723132840333501, + "loss": 0.5184, + "step": 51530 + }, + { + "epoch": 1.1471688034188035, + "grad_norm": 0.769020676612854, + "learning_rate": 0.00027226981399855514, + "loss": 0.6159, + "step": 51540 + }, + { + "epoch": 1.147391381766382, + "grad_norm": 0.6695685386657715, + "learning_rate": 0.0002722263400365594, + "loss": 0.6638, + "step": 51550 + }, + { + "epoch": 1.14761396011396, + "grad_norm": 0.61558997631073, + "learning_rate": 0.00027218286214972514, + "loss": 0.5738, + "step": 51560 + }, + { + "epoch": 1.1478365384615385, + "grad_norm": 0.8256414532661438, + "learning_rate": 0.0002721393803404151, + "loss": 0.6167, + "step": 51570 + }, + { + "epoch": 1.1480591168091168, + "grad_norm": 0.8803865909576416, + "learning_rate": 0.00027209589461099203, + "loss": 0.6634, + "step": 51580 + }, + { + "epoch": 1.1482816951566952, + "grad_norm": 0.41930705308914185, + "learning_rate": 0.0002720524049638191, + "loss": 0.6836, + "step": 51590 + }, + { + "epoch": 1.1485042735042734, + "grad_norm": 1.0584372282028198, + "learning_rate": 0.0002720089114012594, + "loss": 0.4381, + "step": 51600 + }, + { + "epoch": 1.1487268518518519, + "grad_norm": 0.5026808381080627, + "learning_rate": 0.0002719654139256765, + "loss": 0.6022, + "step": 51610 + }, + { + "epoch": 1.1489494301994303, + "grad_norm": 1.0847352743148804, + "learning_rate": 0.00027192191253943415, + "loss": 0.6017, + "step": 51620 + }, + { + "epoch": 1.1491720085470085, + "grad_norm": 0.6440492272377014, + "learning_rate": 0.0002718784072448963, + "loss": 0.5381, + "step": 51630 + }, + { + "epoch": 1.149394586894587, + "grad_norm": 0.7534992694854736, + "learning_rate": 0.0002718348980444268, + "loss": 0.603, + "step": 51640 + }, + { + "epoch": 1.1496171652421652, + "grad_norm": 0.55170738697052, + "learning_rate": 0.0002717913849403901, + "loss": 0.7346, + "step": 51650 + }, + { + "epoch": 1.1498397435897436, + "grad_norm": 0.5600918531417847, + "learning_rate": 0.0002717478679351509, + "loss": 0.6366, + "step": 51660 + }, + { + "epoch": 1.1500623219373218, + "grad_norm": 0.6963495016098022, + "learning_rate": 0.0002717043470310738, + "loss": 0.586, + "step": 51670 + }, + { + "epoch": 1.1502849002849003, + "grad_norm": 0.6831533908843994, + "learning_rate": 0.00027166082223052375, + "loss": 0.6305, + "step": 51680 + }, + { + "epoch": 1.1505074786324787, + "grad_norm": 0.7113856673240662, + "learning_rate": 0.00027161729353586595, + "loss": 0.5272, + "step": 51690 + }, + { + "epoch": 1.150730056980057, + "grad_norm": 0.7831088900566101, + "learning_rate": 0.0002715737609494658, + "loss": 0.6085, + "step": 51700 + }, + { + "epoch": 1.1509526353276354, + "grad_norm": 0.401477575302124, + "learning_rate": 0.0002715302244736889, + "loss": 0.5864, + "step": 51710 + }, + { + "epoch": 1.1511752136752136, + "grad_norm": 0.814154863357544, + "learning_rate": 0.00027148668411090105, + "loss": 0.5873, + "step": 51720 + }, + { + "epoch": 1.151397792022792, + "grad_norm": 0.5306657552719116, + "learning_rate": 0.00027144313986346826, + "loss": 0.5309, + "step": 51730 + }, + { + "epoch": 1.1516203703703705, + "grad_norm": 0.45474058389663696, + "learning_rate": 0.0002713995917337567, + "loss": 0.5582, + "step": 51740 + }, + { + "epoch": 1.1518429487179487, + "grad_norm": 0.7120450139045715, + "learning_rate": 0.0002713560397241329, + "loss": 0.6194, + "step": 51750 + }, + { + "epoch": 1.1520655270655271, + "grad_norm": 0.650835394859314, + "learning_rate": 0.00027131248383696356, + "loss": 0.6751, + "step": 51760 + }, + { + "epoch": 1.1522881054131053, + "grad_norm": 0.8832502961158752, + "learning_rate": 0.00027126892407461546, + "loss": 0.5607, + "step": 51770 + }, + { + "epoch": 1.1525106837606838, + "grad_norm": 0.9212602376937866, + "learning_rate": 0.0002712253604394556, + "loss": 0.5538, + "step": 51780 + }, + { + "epoch": 1.1527332621082622, + "grad_norm": 0.6670851707458496, + "learning_rate": 0.0002711817929338514, + "loss": 0.5079, + "step": 51790 + }, + { + "epoch": 1.1529558404558404, + "grad_norm": 0.4576922059059143, + "learning_rate": 0.0002711382215601703, + "loss": 0.6051, + "step": 51800 + }, + { + "epoch": 1.1531784188034189, + "grad_norm": 0.4248937666416168, + "learning_rate": 0.00027109464632078, + "loss": 0.5919, + "step": 51810 + }, + { + "epoch": 1.153400997150997, + "grad_norm": 0.46840816736221313, + "learning_rate": 0.00027105106721804837, + "loss": 0.5576, + "step": 51820 + }, + { + "epoch": 1.1536235754985755, + "grad_norm": 0.5896355509757996, + "learning_rate": 0.00027100748425434363, + "loss": 0.574, + "step": 51830 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 0.8185621500015259, + "learning_rate": 0.0002709638974320341, + "loss": 0.5869, + "step": 51840 + }, + { + "epoch": 1.1540687321937322, + "grad_norm": 0.5478002429008484, + "learning_rate": 0.00027092030675348824, + "loss": 0.4995, + "step": 51850 + }, + { + "epoch": 1.1542913105413106, + "grad_norm": 0.7509670257568359, + "learning_rate": 0.00027087671222107484, + "loss": 0.6434, + "step": 51860 + }, + { + "epoch": 1.1545138888888888, + "grad_norm": 0.5304856300354004, + "learning_rate": 0.0002708331138371629, + "loss": 0.5819, + "step": 51870 + }, + { + "epoch": 1.1547364672364673, + "grad_norm": 0.39571112394332886, + "learning_rate": 0.00027078951160412155, + "loss": 0.6543, + "step": 51880 + }, + { + "epoch": 1.1549590455840455, + "grad_norm": 0.6003801226615906, + "learning_rate": 0.00027074590552432026, + "loss": 0.603, + "step": 51890 + }, + { + "epoch": 1.155181623931624, + "grad_norm": 0.5318456292152405, + "learning_rate": 0.00027070229560012847, + "loss": 0.4693, + "step": 51900 + }, + { + "epoch": 1.1554042022792024, + "grad_norm": 0.4450821578502655, + "learning_rate": 0.0002706586818339161, + "loss": 0.4332, + "step": 51910 + }, + { + "epoch": 1.1556267806267806, + "grad_norm": 0.5957626104354858, + "learning_rate": 0.0002706150642280531, + "loss": 0.5313, + "step": 51920 + }, + { + "epoch": 1.155849358974359, + "grad_norm": 0.7766496539115906, + "learning_rate": 0.00027057144278490964, + "loss": 0.5441, + "step": 51930 + }, + { + "epoch": 1.1560719373219372, + "grad_norm": 0.48109349608421326, + "learning_rate": 0.0002705278175068563, + "loss": 0.5063, + "step": 51940 + }, + { + "epoch": 1.1562945156695157, + "grad_norm": 0.7788119316101074, + "learning_rate": 0.0002704841883962636, + "loss": 0.5365, + "step": 51950 + }, + { + "epoch": 1.1565170940170941, + "grad_norm": 0.7502115368843079, + "learning_rate": 0.0002704405554555024, + "loss": 0.6355, + "step": 51960 + }, + { + "epoch": 1.1567396723646723, + "grad_norm": 0.6445801854133606, + "learning_rate": 0.00027039691868694374, + "loss": 0.5152, + "step": 51970 + }, + { + "epoch": 1.1569622507122508, + "grad_norm": 0.651919424533844, + "learning_rate": 0.00027035327809295885, + "loss": 0.559, + "step": 51980 + }, + { + "epoch": 1.157184829059829, + "grad_norm": 0.5348412990570068, + "learning_rate": 0.00027030963367591924, + "loss": 0.6066, + "step": 51990 + }, + { + "epoch": 1.1574074074074074, + "grad_norm": 0.5168819427490234, + "learning_rate": 0.00027026598543819665, + "loss": 0.6061, + "step": 52000 + }, + { + "epoch": 1.1576299857549857, + "grad_norm": 0.48285502195358276, + "learning_rate": 0.00027022233338216283, + "loss": 0.5346, + "step": 52010 + }, + { + "epoch": 1.157852564102564, + "grad_norm": 0.6535779237747192, + "learning_rate": 0.00027017867751019, + "loss": 0.5861, + "step": 52020 + }, + { + "epoch": 1.1580751424501425, + "grad_norm": 0.50959312915802, + "learning_rate": 0.0002701350178246503, + "loss": 0.5394, + "step": 52030 + }, + { + "epoch": 1.1582977207977208, + "grad_norm": 0.7481958270072937, + "learning_rate": 0.0002700913543279163, + "loss": 0.5979, + "step": 52040 + }, + { + "epoch": 1.1585202991452992, + "grad_norm": 0.5922252535820007, + "learning_rate": 0.0002700476870223608, + "loss": 0.5435, + "step": 52050 + }, + { + "epoch": 1.1587428774928774, + "grad_norm": 0.6566937565803528, + "learning_rate": 0.00027000401591035665, + "loss": 0.5208, + "step": 52060 + }, + { + "epoch": 1.1589654558404558, + "grad_norm": 0.6020127534866333, + "learning_rate": 0.0002699603409942769, + "loss": 0.5358, + "step": 52070 + }, + { + "epoch": 1.159188034188034, + "grad_norm": 0.5534032583236694, + "learning_rate": 0.00026991666227649497, + "loss": 0.5399, + "step": 52080 + }, + { + "epoch": 1.1594106125356125, + "grad_norm": 0.6564294695854187, + "learning_rate": 0.0002698729797593844, + "loss": 0.6216, + "step": 52090 + }, + { + "epoch": 1.159633190883191, + "grad_norm": 0.5825326442718506, + "learning_rate": 0.0002698292934453189, + "loss": 0.5339, + "step": 52100 + }, + { + "epoch": 1.1598557692307692, + "grad_norm": 0.48360762000083923, + "learning_rate": 0.0002697856033366724, + "loss": 0.5682, + "step": 52110 + }, + { + "epoch": 1.1600783475783476, + "grad_norm": 0.7247682213783264, + "learning_rate": 0.0002697419094358192, + "loss": 0.537, + "step": 52120 + }, + { + "epoch": 1.160300925925926, + "grad_norm": 0.7545642256736755, + "learning_rate": 0.0002696982117451334, + "loss": 0.5773, + "step": 52130 + }, + { + "epoch": 1.1605235042735043, + "grad_norm": 0.6881244778633118, + "learning_rate": 0.0002696545102669897, + "loss": 0.5605, + "step": 52140 + }, + { + "epoch": 1.1607460826210827, + "grad_norm": 0.463986873626709, + "learning_rate": 0.000269610805003763, + "loss": 0.5707, + "step": 52150 + }, + { + "epoch": 1.160968660968661, + "grad_norm": 0.41395312547683716, + "learning_rate": 0.0002695670959578282, + "loss": 0.6493, + "step": 52160 + }, + { + "epoch": 1.1611912393162394, + "grad_norm": 0.9375655055046082, + "learning_rate": 0.00026952338313156036, + "loss": 0.5997, + "step": 52170 + }, + { + "epoch": 1.1614138176638176, + "grad_norm": 0.6170309782028198, + "learning_rate": 0.00026947966652733494, + "loss": 0.5741, + "step": 52180 + }, + { + "epoch": 1.161636396011396, + "grad_norm": 0.6665390729904175, + "learning_rate": 0.0002694359461475277, + "loss": 0.4955, + "step": 52190 + }, + { + "epoch": 1.1618589743589745, + "grad_norm": 0.6789956092834473, + "learning_rate": 0.0002693922219945142, + "loss": 0.6042, + "step": 52200 + }, + { + "epoch": 1.1620815527065527, + "grad_norm": 0.38877052068710327, + "learning_rate": 0.00026934849407067054, + "loss": 0.5404, + "step": 52210 + }, + { + "epoch": 1.162304131054131, + "grad_norm": 0.8757748007774353, + "learning_rate": 0.000269304762378373, + "loss": 0.5848, + "step": 52220 + }, + { + "epoch": 1.1625267094017093, + "grad_norm": 0.8811602592468262, + "learning_rate": 0.0002692610269199979, + "loss": 0.5052, + "step": 52230 + }, + { + "epoch": 1.1627492877492878, + "grad_norm": 0.5613075494766235, + "learning_rate": 0.0002692172876979219, + "loss": 0.5791, + "step": 52240 + }, + { + "epoch": 1.162971866096866, + "grad_norm": 0.41427573561668396, + "learning_rate": 0.00026917354471452185, + "loss": 0.6177, + "step": 52250 + }, + { + "epoch": 1.1631944444444444, + "grad_norm": 0.8255056738853455, + "learning_rate": 0.0002691297979721747, + "loss": 0.5372, + "step": 52260 + }, + { + "epoch": 1.1634170227920229, + "grad_norm": 0.5964106917381287, + "learning_rate": 0.0002690860474732578, + "loss": 0.459, + "step": 52270 + }, + { + "epoch": 1.163639601139601, + "grad_norm": 0.875540018081665, + "learning_rate": 0.0002690422932201485, + "loss": 0.5354, + "step": 52280 + }, + { + "epoch": 1.1638621794871795, + "grad_norm": 0.5432913899421692, + "learning_rate": 0.0002689985352152244, + "loss": 0.6149, + "step": 52290 + }, + { + "epoch": 1.164084757834758, + "grad_norm": 0.732105553150177, + "learning_rate": 0.0002689547734608635, + "loss": 0.52, + "step": 52300 + }, + { + "epoch": 1.1643073361823362, + "grad_norm": 0.7695441246032715, + "learning_rate": 0.00026891100795944375, + "loss": 0.6225, + "step": 52310 + }, + { + "epoch": 1.1645299145299146, + "grad_norm": 0.9869396686553955, + "learning_rate": 0.00026886723871334336, + "loss": 0.5307, + "step": 52320 + }, + { + "epoch": 1.1647524928774928, + "grad_norm": 0.6698643565177917, + "learning_rate": 0.0002688234657249409, + "loss": 0.6412, + "step": 52330 + }, + { + "epoch": 1.1649750712250713, + "grad_norm": 0.8857219219207764, + "learning_rate": 0.0002687796889966149, + "loss": 0.5894, + "step": 52340 + }, + { + "epoch": 1.1651976495726495, + "grad_norm": 0.6016620397567749, + "learning_rate": 0.0002687359085307444, + "loss": 0.5249, + "step": 52350 + }, + { + "epoch": 1.165420227920228, + "grad_norm": 0.6211386322975159, + "learning_rate": 0.00026869212432970827, + "loss": 0.5685, + "step": 52360 + }, + { + "epoch": 1.1656428062678064, + "grad_norm": 0.622130811214447, + "learning_rate": 0.00026864833639588594, + "loss": 0.6297, + "step": 52370 + }, + { + "epoch": 1.1658653846153846, + "grad_norm": 0.4021855890750885, + "learning_rate": 0.0002686045447316567, + "loss": 0.6885, + "step": 52380 + }, + { + "epoch": 1.166087962962963, + "grad_norm": 0.6166818737983704, + "learning_rate": 0.0002685607493394004, + "loss": 0.6306, + "step": 52390 + }, + { + "epoch": 1.1663105413105412, + "grad_norm": 0.588339626789093, + "learning_rate": 0.0002685169502214969, + "loss": 0.6566, + "step": 52400 + }, + { + "epoch": 1.1665331196581197, + "grad_norm": 0.7152255177497864, + "learning_rate": 0.0002684731473803262, + "loss": 0.5807, + "step": 52410 + }, + { + "epoch": 1.166755698005698, + "grad_norm": 0.8684418201446533, + "learning_rate": 0.0002684293408182686, + "loss": 0.6132, + "step": 52420 + }, + { + "epoch": 1.1669782763532763, + "grad_norm": 0.6165472865104675, + "learning_rate": 0.0002683855305377046, + "loss": 0.5071, + "step": 52430 + }, + { + "epoch": 1.1672008547008548, + "grad_norm": 0.7422646880149841, + "learning_rate": 0.0002683417165410149, + "loss": 0.6087, + "step": 52440 + }, + { + "epoch": 1.167423433048433, + "grad_norm": 0.5423184037208557, + "learning_rate": 0.0002682978988305804, + "loss": 0.5059, + "step": 52450 + }, + { + "epoch": 1.1676460113960114, + "grad_norm": 0.8247236013412476, + "learning_rate": 0.0002682540774087821, + "loss": 0.5573, + "step": 52460 + }, + { + "epoch": 1.1678685897435896, + "grad_norm": 0.5281010866165161, + "learning_rate": 0.00026821025227800145, + "loss": 0.4842, + "step": 52470 + }, + { + "epoch": 1.168091168091168, + "grad_norm": 0.5985293388366699, + "learning_rate": 0.00026816642344061983, + "loss": 0.5485, + "step": 52480 + }, + { + "epoch": 1.1683137464387465, + "grad_norm": 0.5074992775917053, + "learning_rate": 0.0002681225908990189, + "loss": 0.5867, + "step": 52490 + }, + { + "epoch": 1.1685363247863247, + "grad_norm": 0.598908543586731, + "learning_rate": 0.00026807875465558064, + "loss": 0.4858, + "step": 52500 + }, + { + "epoch": 1.1687589031339032, + "grad_norm": 0.7070103883743286, + "learning_rate": 0.00026803491471268716, + "loss": 0.5633, + "step": 52510 + }, + { + "epoch": 1.1689814814814814, + "grad_norm": 0.8883180618286133, + "learning_rate": 0.00026799107107272066, + "loss": 0.5992, + "step": 52520 + }, + { + "epoch": 1.1692040598290598, + "grad_norm": 0.456495076417923, + "learning_rate": 0.00026794722373806365, + "loss": 0.4145, + "step": 52530 + }, + { + "epoch": 1.1694266381766383, + "grad_norm": 0.3848685026168823, + "learning_rate": 0.000267903372711099, + "loss": 0.6011, + "step": 52540 + }, + { + "epoch": 1.1696492165242165, + "grad_norm": 0.5944089889526367, + "learning_rate": 0.0002678595179942095, + "loss": 0.5212, + "step": 52550 + }, + { + "epoch": 1.169871794871795, + "grad_norm": 0.5271813273429871, + "learning_rate": 0.00026781565958977816, + "loss": 0.6392, + "step": 52560 + }, + { + "epoch": 1.1700943732193732, + "grad_norm": 0.6273509860038757, + "learning_rate": 0.0002677717975001883, + "loss": 0.5713, + "step": 52570 + }, + { + "epoch": 1.1703169515669516, + "grad_norm": 0.6436539888381958, + "learning_rate": 0.00026772793172782363, + "loss": 0.6242, + "step": 52580 + }, + { + "epoch": 1.1705395299145298, + "grad_norm": 0.49243152141571045, + "learning_rate": 0.0002676840622750676, + "loss": 0.5868, + "step": 52590 + }, + { + "epoch": 1.1707621082621082, + "grad_norm": 0.6814731955528259, + "learning_rate": 0.00026764018914430426, + "loss": 0.4408, + "step": 52600 + }, + { + "epoch": 1.1709846866096867, + "grad_norm": 0.49323052167892456, + "learning_rate": 0.00026759631233791767, + "loss": 0.4908, + "step": 52610 + }, + { + "epoch": 1.171207264957265, + "grad_norm": 0.48324576020240784, + "learning_rate": 0.00026755243185829213, + "loss": 0.5272, + "step": 52620 + }, + { + "epoch": 1.1714298433048433, + "grad_norm": 0.5585512518882751, + "learning_rate": 0.0002675085477078121, + "loss": 0.5014, + "step": 52630 + }, + { + "epoch": 1.1716524216524216, + "grad_norm": 0.5609182119369507, + "learning_rate": 0.0002674646598888624, + "loss": 0.5145, + "step": 52640 + }, + { + "epoch": 1.171875, + "grad_norm": 0.6564444899559021, + "learning_rate": 0.0002674207684038278, + "loss": 0.6005, + "step": 52650 + }, + { + "epoch": 1.1720975783475784, + "grad_norm": 0.6718125343322754, + "learning_rate": 0.00026737687325509345, + "loss": 0.6004, + "step": 52660 + }, + { + "epoch": 1.1723201566951567, + "grad_norm": 0.6030959486961365, + "learning_rate": 0.0002673329744450447, + "loss": 0.5524, + "step": 52670 + }, + { + "epoch": 1.172542735042735, + "grad_norm": 0.48877060413360596, + "learning_rate": 0.00026728907197606696, + "loss": 0.4131, + "step": 52680 + }, + { + "epoch": 1.1727653133903133, + "grad_norm": 0.5057005286216736, + "learning_rate": 0.00026724516585054596, + "loss": 0.694, + "step": 52690 + }, + { + "epoch": 1.1729878917378918, + "grad_norm": 1.0375540256500244, + "learning_rate": 0.0002672012560708676, + "loss": 0.486, + "step": 52700 + }, + { + "epoch": 1.1732104700854702, + "grad_norm": 0.7084336876869202, + "learning_rate": 0.00026715734263941794, + "loss": 0.5738, + "step": 52710 + }, + { + "epoch": 1.1734330484330484, + "grad_norm": 0.6579006314277649, + "learning_rate": 0.0002671134255585834, + "loss": 0.7216, + "step": 52720 + }, + { + "epoch": 1.1736556267806268, + "grad_norm": 0.7561776638031006, + "learning_rate": 0.0002670695048307502, + "loss": 0.6214, + "step": 52730 + }, + { + "epoch": 1.173878205128205, + "grad_norm": 0.7474552392959595, + "learning_rate": 0.0002670255804583054, + "loss": 0.6917, + "step": 52740 + }, + { + "epoch": 1.1741007834757835, + "grad_norm": 0.49505066871643066, + "learning_rate": 0.00026698165244363564, + "loss": 0.6039, + "step": 52750 + }, + { + "epoch": 1.1743233618233617, + "grad_norm": 1.0127018690109253, + "learning_rate": 0.00026693772078912795, + "loss": 0.5748, + "step": 52760 + }, + { + "epoch": 1.1745459401709402, + "grad_norm": 0.4878261685371399, + "learning_rate": 0.0002668937854971698, + "loss": 0.4545, + "step": 52770 + }, + { + "epoch": 1.1747685185185186, + "grad_norm": 0.7447136044502258, + "learning_rate": 0.0002668498465701485, + "loss": 0.7469, + "step": 52780 + }, + { + "epoch": 1.1749910968660968, + "grad_norm": 0.581995964050293, + "learning_rate": 0.00026680590401045195, + "loss": 0.6097, + "step": 52790 + }, + { + "epoch": 1.1752136752136753, + "grad_norm": 0.5570381283760071, + "learning_rate": 0.00026676195782046776, + "loss": 0.4755, + "step": 52800 + }, + { + "epoch": 1.1754362535612535, + "grad_norm": 0.49623483419418335, + "learning_rate": 0.0002667180080025842, + "loss": 0.5373, + "step": 52810 + }, + { + "epoch": 1.175658831908832, + "grad_norm": 0.5563613176345825, + "learning_rate": 0.00026667405455918947, + "loss": 0.5642, + "step": 52820 + }, + { + "epoch": 1.1758814102564104, + "grad_norm": 0.6269398927688599, + "learning_rate": 0.000266630097492672, + "loss": 0.6702, + "step": 52830 + }, + { + "epoch": 1.1761039886039886, + "grad_norm": 0.6564904451370239, + "learning_rate": 0.0002665861368054205, + "loss": 0.6917, + "step": 52840 + }, + { + "epoch": 1.176326566951567, + "grad_norm": 0.9609503746032715, + "learning_rate": 0.00026654217249982376, + "loss": 0.6088, + "step": 52850 + }, + { + "epoch": 1.1765491452991452, + "grad_norm": 0.6753114461898804, + "learning_rate": 0.00026649820457827093, + "loss": 0.5698, + "step": 52860 + }, + { + "epoch": 1.1767717236467237, + "grad_norm": 0.6223951578140259, + "learning_rate": 0.0002664542330431513, + "loss": 0.4725, + "step": 52870 + }, + { + "epoch": 1.176994301994302, + "grad_norm": 0.9456238150596619, + "learning_rate": 0.0002664102578968541, + "loss": 0.6092, + "step": 52880 + }, + { + "epoch": 1.1772168803418803, + "grad_norm": 0.5020238757133484, + "learning_rate": 0.0002663662791417693, + "loss": 0.5609, + "step": 52890 + }, + { + "epoch": 1.1774394586894588, + "grad_norm": 0.5456209778785706, + "learning_rate": 0.0002663222967802864, + "loss": 0.6475, + "step": 52900 + }, + { + "epoch": 1.177662037037037, + "grad_norm": 0.8311290144920349, + "learning_rate": 0.00026627831081479567, + "loss": 0.6265, + "step": 52910 + }, + { + "epoch": 1.1778846153846154, + "grad_norm": 0.6691579818725586, + "learning_rate": 0.00026623432124768726, + "loss": 0.6148, + "step": 52920 + }, + { + "epoch": 1.1781071937321936, + "grad_norm": 0.7260137796401978, + "learning_rate": 0.0002661903280813516, + "loss": 0.5983, + "step": 52930 + }, + { + "epoch": 1.178329772079772, + "grad_norm": 0.7961229085922241, + "learning_rate": 0.00026614633131817936, + "loss": 0.5465, + "step": 52940 + }, + { + "epoch": 1.1785523504273505, + "grad_norm": 0.7031990885734558, + "learning_rate": 0.00026610233096056136, + "loss": 0.7622, + "step": 52950 + }, + { + "epoch": 1.1787749287749287, + "grad_norm": 0.6887387037277222, + "learning_rate": 0.00026605832701088853, + "loss": 0.582, + "step": 52960 + }, + { + "epoch": 1.1789975071225072, + "grad_norm": 0.9699671864509583, + "learning_rate": 0.0002660143194715521, + "loss": 0.5528, + "step": 52970 + }, + { + "epoch": 1.1792200854700854, + "grad_norm": 0.6452189683914185, + "learning_rate": 0.0002659703083449435, + "loss": 0.5524, + "step": 52980 + }, + { + "epoch": 1.1794426638176638, + "grad_norm": 0.6887800097465515, + "learning_rate": 0.00026592629363345445, + "loss": 0.6631, + "step": 52990 + }, + { + "epoch": 1.179665242165242, + "grad_norm": 0.5450928211212158, + "learning_rate": 0.00026588227533947653, + "loss": 0.5136, + "step": 53000 + }, + { + "epoch": 1.1798878205128205, + "grad_norm": 0.6100490689277649, + "learning_rate": 0.0002658382534654019, + "loss": 0.5468, + "step": 53010 + }, + { + "epoch": 1.180110398860399, + "grad_norm": 0.3202427327632904, + "learning_rate": 0.0002657942280136226, + "loss": 0.4887, + "step": 53020 + }, + { + "epoch": 1.1803329772079771, + "grad_norm": 0.5512992739677429, + "learning_rate": 0.00026575019898653117, + "loss": 0.673, + "step": 53030 + }, + { + "epoch": 1.1805555555555556, + "grad_norm": 1.174062728881836, + "learning_rate": 0.00026570616638652006, + "loss": 0.5801, + "step": 53040 + }, + { + "epoch": 1.180778133903134, + "grad_norm": 0.7160769701004028, + "learning_rate": 0.0002656621302159821, + "loss": 0.5151, + "step": 53050 + }, + { + "epoch": 1.1810007122507122, + "grad_norm": 0.6652625203132629, + "learning_rate": 0.0002656180904773102, + "loss": 0.6656, + "step": 53060 + }, + { + "epoch": 1.1812232905982907, + "grad_norm": 0.3985412120819092, + "learning_rate": 0.00026557404717289756, + "loss": 0.4716, + "step": 53070 + }, + { + "epoch": 1.181445868945869, + "grad_norm": 0.6421456336975098, + "learning_rate": 0.0002655300003051375, + "loss": 0.597, + "step": 53080 + }, + { + "epoch": 1.1816684472934473, + "grad_norm": 0.4710211455821991, + "learning_rate": 0.00026548594987642365, + "loss": 0.4481, + "step": 53090 + }, + { + "epoch": 1.1818910256410255, + "grad_norm": 0.4872575104236603, + "learning_rate": 0.00026544189588914964, + "loss": 0.4522, + "step": 53100 + }, + { + "epoch": 1.182113603988604, + "grad_norm": 0.7041962146759033, + "learning_rate": 0.0002653978383457094, + "loss": 0.5575, + "step": 53110 + }, + { + "epoch": 1.1823361823361824, + "grad_norm": 0.5078232288360596, + "learning_rate": 0.00026535377724849703, + "loss": 0.6371, + "step": 53120 + }, + { + "epoch": 1.1825587606837606, + "grad_norm": 0.8470007181167603, + "learning_rate": 0.00026530971259990696, + "loss": 0.4924, + "step": 53130 + }, + { + "epoch": 1.182781339031339, + "grad_norm": 0.8424126505851746, + "learning_rate": 0.0002652656444023338, + "loss": 0.6354, + "step": 53140 + }, + { + "epoch": 1.1830039173789173, + "grad_norm": 0.6890965104103088, + "learning_rate": 0.0002652215726581719, + "loss": 0.6349, + "step": 53150 + }, + { + "epoch": 1.1832264957264957, + "grad_norm": 0.6198452115058899, + "learning_rate": 0.00026517749736981635, + "loss": 0.4863, + "step": 53160 + }, + { + "epoch": 1.183449074074074, + "grad_norm": 0.7459619641304016, + "learning_rate": 0.0002651334185396623, + "loss": 0.5476, + "step": 53170 + }, + { + "epoch": 1.1836716524216524, + "grad_norm": 0.7808142304420471, + "learning_rate": 0.000265089336170105, + "loss": 0.6134, + "step": 53180 + }, + { + "epoch": 1.1838942307692308, + "grad_norm": 0.6265953779220581, + "learning_rate": 0.0002650452502635398, + "loss": 0.5628, + "step": 53190 + }, + { + "epoch": 1.184116809116809, + "grad_norm": 0.5564361810684204, + "learning_rate": 0.0002650011608223625, + "loss": 0.5641, + "step": 53200 + }, + { + "epoch": 1.1843393874643875, + "grad_norm": 0.6715850234031677, + "learning_rate": 0.0002649570678489689, + "loss": 0.5415, + "step": 53210 + }, + { + "epoch": 1.184561965811966, + "grad_norm": 0.7137435078620911, + "learning_rate": 0.00026491297134575504, + "loss": 0.6647, + "step": 53220 + }, + { + "epoch": 1.1847845441595442, + "grad_norm": 0.6090908646583557, + "learning_rate": 0.0002648688713151172, + "loss": 0.5755, + "step": 53230 + }, + { + "epoch": 1.1850071225071226, + "grad_norm": 0.7091788649559021, + "learning_rate": 0.0002648247677594518, + "loss": 0.5249, + "step": 53240 + }, + { + "epoch": 1.1852297008547008, + "grad_norm": 0.5731387734413147, + "learning_rate": 0.0002647806606811554, + "loss": 0.5721, + "step": 53250 + }, + { + "epoch": 1.1854522792022792, + "grad_norm": 0.851579487323761, + "learning_rate": 0.00026473655008262486, + "loss": 0.6269, + "step": 53260 + }, + { + "epoch": 1.1856748575498575, + "grad_norm": 0.6304162740707397, + "learning_rate": 0.0002646924359662573, + "loss": 0.6449, + "step": 53270 + }, + { + "epoch": 1.185897435897436, + "grad_norm": 0.8386474847793579, + "learning_rate": 0.00026464831833444976, + "loss": 0.5611, + "step": 53280 + }, + { + "epoch": 1.1861200142450143, + "grad_norm": 0.6858943700790405, + "learning_rate": 0.00026460419718959965, + "loss": 0.6902, + "step": 53290 + }, + { + "epoch": 1.1863425925925926, + "grad_norm": 0.6749021410942078, + "learning_rate": 0.0002645600725341046, + "loss": 0.5678, + "step": 53300 + }, + { + "epoch": 1.186565170940171, + "grad_norm": 0.6653118133544922, + "learning_rate": 0.00026451594437036234, + "loss": 0.6369, + "step": 53310 + }, + { + "epoch": 1.1867877492877492, + "grad_norm": 0.4555003345012665, + "learning_rate": 0.00026447181270077084, + "loss": 0.5047, + "step": 53320 + }, + { + "epoch": 1.1870103276353277, + "grad_norm": 0.44015443325042725, + "learning_rate": 0.0002644276775277283, + "loss": 0.5344, + "step": 53330 + }, + { + "epoch": 1.1872329059829059, + "grad_norm": 0.4752315878868103, + "learning_rate": 0.00026438353885363297, + "loss": 0.5221, + "step": 53340 + }, + { + "epoch": 1.1874554843304843, + "grad_norm": 0.8851911425590515, + "learning_rate": 0.00026433939668088344, + "loss": 0.6015, + "step": 53350 + }, + { + "epoch": 1.1876780626780628, + "grad_norm": 0.4986988604068756, + "learning_rate": 0.0002642952510118785, + "loss": 0.6087, + "step": 53360 + }, + { + "epoch": 1.187900641025641, + "grad_norm": 1.0384236574172974, + "learning_rate": 0.00026425110184901687, + "loss": 0.4508, + "step": 53370 + }, + { + "epoch": 1.1881232193732194, + "grad_norm": 0.7926008105278015, + "learning_rate": 0.00026420694919469784, + "loss": 0.6285, + "step": 53380 + }, + { + "epoch": 1.1883457977207976, + "grad_norm": 0.9093513488769531, + "learning_rate": 0.0002641627930513206, + "loss": 0.532, + "step": 53390 + }, + { + "epoch": 1.188568376068376, + "grad_norm": 0.739948570728302, + "learning_rate": 0.0002641186334212847, + "loss": 0.576, + "step": 53400 + }, + { + "epoch": 1.1887909544159545, + "grad_norm": 0.4455528259277344, + "learning_rate": 0.00026407447030698974, + "loss": 0.437, + "step": 53410 + }, + { + "epoch": 1.1890135327635327, + "grad_norm": 0.6131460070610046, + "learning_rate": 0.00026403030371083557, + "loss": 0.5818, + "step": 53420 + }, + { + "epoch": 1.1892361111111112, + "grad_norm": 0.6285380721092224, + "learning_rate": 0.0002639861336352223, + "loss": 0.4712, + "step": 53430 + }, + { + "epoch": 1.1894586894586894, + "grad_norm": 0.8274964094161987, + "learning_rate": 0.00026394196008255015, + "loss": 0.5606, + "step": 53440 + }, + { + "epoch": 1.1896812678062678, + "grad_norm": 0.6058911681175232, + "learning_rate": 0.0002638977830552196, + "loss": 0.5672, + "step": 53450 + }, + { + "epoch": 1.1899038461538463, + "grad_norm": 0.6424249410629272, + "learning_rate": 0.0002638536025556312, + "loss": 0.5608, + "step": 53460 + }, + { + "epoch": 1.1901264245014245, + "grad_norm": 0.5659881234169006, + "learning_rate": 0.0002638094185861857, + "loss": 0.4935, + "step": 53470 + }, + { + "epoch": 1.190349002849003, + "grad_norm": 0.5742921829223633, + "learning_rate": 0.0002637652311492842, + "loss": 0.624, + "step": 53480 + }, + { + "epoch": 1.1905715811965811, + "grad_norm": 0.6720833778381348, + "learning_rate": 0.00026372104024732784, + "loss": 0.5983, + "step": 53490 + }, + { + "epoch": 1.1907941595441596, + "grad_norm": 0.9703565835952759, + "learning_rate": 0.00026367684588271794, + "loss": 0.6631, + "step": 53500 + }, + { + "epoch": 1.1910167378917378, + "grad_norm": 0.7967060208320618, + "learning_rate": 0.00026363264805785616, + "loss": 0.5349, + "step": 53510 + }, + { + "epoch": 1.1912393162393162, + "grad_norm": 0.5237561464309692, + "learning_rate": 0.0002635884467751442, + "loss": 0.5134, + "step": 53520 + }, + { + "epoch": 1.1914618945868947, + "grad_norm": 0.5853220820426941, + "learning_rate": 0.000263544242036984, + "loss": 0.5488, + "step": 53530 + }, + { + "epoch": 1.1916844729344729, + "grad_norm": 0.6570531129837036, + "learning_rate": 0.0002635000338457776, + "loss": 0.5671, + "step": 53540 + }, + { + "epoch": 1.1919070512820513, + "grad_norm": 0.6014338135719299, + "learning_rate": 0.00026345582220392734, + "loss": 0.4712, + "step": 53550 + }, + { + "epoch": 1.1921296296296295, + "grad_norm": 0.7948473691940308, + "learning_rate": 0.0002634116071138359, + "loss": 0.5277, + "step": 53560 + }, + { + "epoch": 1.192352207977208, + "grad_norm": 0.5840214490890503, + "learning_rate": 0.0002633673885779057, + "loss": 0.5787, + "step": 53570 + }, + { + "epoch": 1.1925747863247864, + "grad_norm": 0.5520692467689514, + "learning_rate": 0.00026332316659853975, + "loss": 0.5515, + "step": 53580 + }, + { + "epoch": 1.1927973646723646, + "grad_norm": 0.49704843759536743, + "learning_rate": 0.00026327894117814116, + "loss": 0.5606, + "step": 53590 + }, + { + "epoch": 1.193019943019943, + "grad_norm": 0.7613427639007568, + "learning_rate": 0.00026323471231911303, + "loss": 0.5976, + "step": 53600 + }, + { + "epoch": 1.1932425213675213, + "grad_norm": 0.615630567073822, + "learning_rate": 0.0002631904800238589, + "loss": 0.5749, + "step": 53610 + }, + { + "epoch": 1.1934650997150997, + "grad_norm": 0.38871222734451294, + "learning_rate": 0.0002631462442947823, + "loss": 0.5895, + "step": 53620 + }, + { + "epoch": 1.1936876780626782, + "grad_norm": 0.47710564732551575, + "learning_rate": 0.0002631020051342872, + "loss": 0.4715, + "step": 53630 + }, + { + "epoch": 1.1939102564102564, + "grad_norm": 0.6385695934295654, + "learning_rate": 0.00026305776254477735, + "loss": 0.6086, + "step": 53640 + }, + { + "epoch": 1.1941328347578348, + "grad_norm": 0.49752333760261536, + "learning_rate": 0.0002630135165286571, + "loss": 0.4322, + "step": 53650 + }, + { + "epoch": 1.194355413105413, + "grad_norm": 0.6448046565055847, + "learning_rate": 0.00026296926708833083, + "loss": 0.5492, + "step": 53660 + }, + { + "epoch": 1.1945779914529915, + "grad_norm": 0.6780838966369629, + "learning_rate": 0.00026292501422620307, + "loss": 0.5434, + "step": 53670 + }, + { + "epoch": 1.1948005698005697, + "grad_norm": 0.6584859490394592, + "learning_rate": 0.00026288075794467843, + "loss": 0.5916, + "step": 53680 + }, + { + "epoch": 1.1950231481481481, + "grad_norm": 0.6336795091629028, + "learning_rate": 0.00026283649824616195, + "loss": 0.5295, + "step": 53690 + }, + { + "epoch": 1.1952457264957266, + "grad_norm": 0.5201059579849243, + "learning_rate": 0.0002627922351330588, + "loss": 0.5759, + "step": 53700 + }, + { + "epoch": 1.1954683048433048, + "grad_norm": 0.4014904499053955, + "learning_rate": 0.0002627479686077741, + "loss": 0.5278, + "step": 53710 + }, + { + "epoch": 1.1956908831908832, + "grad_norm": 0.532713770866394, + "learning_rate": 0.00026270369867271336, + "loss": 0.4821, + "step": 53720 + }, + { + "epoch": 1.1959134615384615, + "grad_norm": 0.46067413687705994, + "learning_rate": 0.0002626594253302824, + "loss": 0.5367, + "step": 53730 + }, + { + "epoch": 1.19613603988604, + "grad_norm": 0.7721562385559082, + "learning_rate": 0.000262615148582887, + "loss": 0.4979, + "step": 53740 + }, + { + "epoch": 1.196358618233618, + "grad_norm": 0.6185513734817505, + "learning_rate": 0.0002625708684329331, + "loss": 0.5486, + "step": 53750 + }, + { + "epoch": 1.1965811965811965, + "grad_norm": 0.6002604365348816, + "learning_rate": 0.00026252658488282697, + "loss": 0.4288, + "step": 53760 + }, + { + "epoch": 1.196803774928775, + "grad_norm": 0.5647562742233276, + "learning_rate": 0.00026248229793497506, + "loss": 0.4885, + "step": 53770 + }, + { + "epoch": 1.1970263532763532, + "grad_norm": 0.7083515524864197, + "learning_rate": 0.00026243800759178396, + "loss": 0.7101, + "step": 53780 + }, + { + "epoch": 1.1972489316239316, + "grad_norm": 0.7820031642913818, + "learning_rate": 0.00026239371385566044, + "loss": 0.6792, + "step": 53790 + }, + { + "epoch": 1.19747150997151, + "grad_norm": 0.4229520857334137, + "learning_rate": 0.00026234941672901137, + "loss": 0.5298, + "step": 53800 + }, + { + "epoch": 1.1976940883190883, + "grad_norm": 0.8767134547233582, + "learning_rate": 0.00026230511621424396, + "loss": 0.6235, + "step": 53810 + }, + { + "epoch": 1.1979166666666667, + "grad_norm": 0.6357962489128113, + "learning_rate": 0.0002622608123137655, + "loss": 0.5968, + "step": 53820 + }, + { + "epoch": 1.198139245014245, + "grad_norm": 0.5747750997543335, + "learning_rate": 0.00026221650502998356, + "loss": 0.5996, + "step": 53830 + }, + { + "epoch": 1.1983618233618234, + "grad_norm": 0.6612414717674255, + "learning_rate": 0.0002621721943653058, + "loss": 0.6281, + "step": 53840 + }, + { + "epoch": 1.1985844017094016, + "grad_norm": 0.4362661838531494, + "learning_rate": 0.0002621278803221401, + "loss": 0.6812, + "step": 53850 + }, + { + "epoch": 1.19880698005698, + "grad_norm": 0.4877786934375763, + "learning_rate": 0.0002620835629028946, + "loss": 0.5982, + "step": 53860 + }, + { + "epoch": 1.1990295584045585, + "grad_norm": 0.6957717537879944, + "learning_rate": 0.00026203924210997735, + "loss": 0.5806, + "step": 53870 + }, + { + "epoch": 1.1992521367521367, + "grad_norm": 0.5992857813835144, + "learning_rate": 0.00026199491794579694, + "loss": 0.5286, + "step": 53880 + }, + { + "epoch": 1.1994747150997151, + "grad_norm": 0.6934967637062073, + "learning_rate": 0.0002619505904127619, + "loss": 0.7629, + "step": 53890 + }, + { + "epoch": 1.1996972934472934, + "grad_norm": 0.5664094090461731, + "learning_rate": 0.0002619062595132811, + "loss": 0.7004, + "step": 53900 + }, + { + "epoch": 1.1999198717948718, + "grad_norm": 1.015615701675415, + "learning_rate": 0.00026186192524976353, + "loss": 0.5354, + "step": 53910 + }, + { + "epoch": 1.20014245014245, + "grad_norm": 0.5994182825088501, + "learning_rate": 0.00026181758762461825, + "loss": 0.549, + "step": 53920 + }, + { + "epoch": 1.20014245014245, + "eval_loss": 0.5869694352149963, + "eval_runtime": 337.2585, + "eval_samples_per_second": 7.012, + "eval_steps_per_second": 7.012, + "step": 53920 + }, + { + "epoch": 1.2003650284900285, + "grad_norm": 0.9583232402801514, + "learning_rate": 0.0002617732466402547, + "loss": 0.6322, + "step": 53930 + }, + { + "epoch": 1.200587606837607, + "grad_norm": 0.7566698789596558, + "learning_rate": 0.00026172890229908226, + "loss": 0.7383, + "step": 53940 + }, + { + "epoch": 1.2008101851851851, + "grad_norm": 0.767450213432312, + "learning_rate": 0.0002616845546035108, + "loss": 0.5751, + "step": 53950 + }, + { + "epoch": 1.2010327635327636, + "grad_norm": 0.5808753967285156, + "learning_rate": 0.00026164020355595014, + "loss": 0.5769, + "step": 53960 + }, + { + "epoch": 1.201255341880342, + "grad_norm": 0.5383173823356628, + "learning_rate": 0.0002615958491588103, + "loss": 0.4942, + "step": 53970 + }, + { + "epoch": 1.2014779202279202, + "grad_norm": 1.001544713973999, + "learning_rate": 0.0002615514914145017, + "loss": 0.6011, + "step": 53980 + }, + { + "epoch": 1.2017004985754987, + "grad_norm": 0.48997971415519714, + "learning_rate": 0.0002615071303254346, + "loss": 0.496, + "step": 53990 + }, + { + "epoch": 1.2019230769230769, + "grad_norm": 0.7488561868667603, + "learning_rate": 0.00026146276589401966, + "loss": 0.5785, + "step": 54000 + }, + { + "epoch": 1.2021456552706553, + "grad_norm": 0.6256828904151917, + "learning_rate": 0.0002614183981226678, + "loss": 0.5895, + "step": 54010 + }, + { + "epoch": 1.2023682336182335, + "grad_norm": 0.5626097917556763, + "learning_rate": 0.00026137402701378984, + "loss": 0.4916, + "step": 54020 + }, + { + "epoch": 1.202590811965812, + "grad_norm": 0.5621308088302612, + "learning_rate": 0.000261329652569797, + "loss": 0.5951, + "step": 54030 + }, + { + "epoch": 1.2028133903133904, + "grad_norm": 0.6779301166534424, + "learning_rate": 0.00026128527479310064, + "loss": 0.5456, + "step": 54040 + }, + { + "epoch": 1.2030359686609686, + "grad_norm": 0.816089391708374, + "learning_rate": 0.0002612408936861123, + "loss": 0.482, + "step": 54050 + }, + { + "epoch": 1.203258547008547, + "grad_norm": 0.6551616191864014, + "learning_rate": 0.00026119650925124366, + "loss": 0.4211, + "step": 54060 + }, + { + "epoch": 1.2034811253561253, + "grad_norm": 0.6255223751068115, + "learning_rate": 0.0002611521214909066, + "loss": 0.5114, + "step": 54070 + }, + { + "epoch": 1.2037037037037037, + "grad_norm": 0.514076292514801, + "learning_rate": 0.0002611077304075132, + "loss": 0.4465, + "step": 54080 + }, + { + "epoch": 1.203926282051282, + "grad_norm": 0.4324864149093628, + "learning_rate": 0.00026106333600347566, + "loss": 0.4174, + "step": 54090 + }, + { + "epoch": 1.2041488603988604, + "grad_norm": 0.6271386742591858, + "learning_rate": 0.0002610189382812065, + "loss": 0.628, + "step": 54100 + }, + { + "epoch": 1.2043714387464388, + "grad_norm": 0.6508593559265137, + "learning_rate": 0.0002609745372431183, + "loss": 0.4914, + "step": 54110 + }, + { + "epoch": 1.204594017094017, + "grad_norm": 0.5108374953269958, + "learning_rate": 0.00026093013289162385, + "loss": 0.563, + "step": 54120 + }, + { + "epoch": 1.2048165954415955, + "grad_norm": 0.8042571544647217, + "learning_rate": 0.00026088572522913606, + "loss": 0.5608, + "step": 54130 + }, + { + "epoch": 1.205039173789174, + "grad_norm": 0.7624759674072266, + "learning_rate": 0.000260841314258068, + "loss": 0.456, + "step": 54140 + }, + { + "epoch": 1.2052617521367521, + "grad_norm": 0.789258599281311, + "learning_rate": 0.0002607968999808333, + "loss": 0.4817, + "step": 54150 + }, + { + "epoch": 1.2054843304843306, + "grad_norm": 0.7475525140762329, + "learning_rate": 0.0002607524823998452, + "loss": 0.5729, + "step": 54160 + }, + { + "epoch": 1.2057069088319088, + "grad_norm": 0.5522698163986206, + "learning_rate": 0.0002607080615175175, + "loss": 0.4242, + "step": 54170 + }, + { + "epoch": 1.2059294871794872, + "grad_norm": 0.9012867212295532, + "learning_rate": 0.00026066363733626396, + "loss": 0.5816, + "step": 54180 + }, + { + "epoch": 1.2061520655270654, + "grad_norm": 0.6278597712516785, + "learning_rate": 0.0002606192098584988, + "loss": 0.5937, + "step": 54190 + }, + { + "epoch": 1.2063746438746439, + "grad_norm": 0.674605131149292, + "learning_rate": 0.00026057477908663615, + "loss": 0.4871, + "step": 54200 + }, + { + "epoch": 1.2065972222222223, + "grad_norm": 0.5200433135032654, + "learning_rate": 0.00026053034502309037, + "loss": 0.5309, + "step": 54210 + }, + { + "epoch": 1.2068198005698005, + "grad_norm": 0.6211512088775635, + "learning_rate": 0.0002604859076702761, + "loss": 0.7805, + "step": 54220 + }, + { + "epoch": 1.207042378917379, + "grad_norm": 0.5976756811141968, + "learning_rate": 0.0002604414670306081, + "loss": 0.4788, + "step": 54230 + }, + { + "epoch": 1.2072649572649572, + "grad_norm": 0.7606221437454224, + "learning_rate": 0.0002603970231065013, + "loss": 0.4546, + "step": 54240 + }, + { + "epoch": 1.2074875356125356, + "grad_norm": 0.5437316298484802, + "learning_rate": 0.00026035257590037084, + "loss": 0.5922, + "step": 54250 + }, + { + "epoch": 1.2077101139601139, + "grad_norm": 0.7171007990837097, + "learning_rate": 0.000260308125414632, + "loss": 0.59, + "step": 54260 + }, + { + "epoch": 1.2079326923076923, + "grad_norm": 0.7423374652862549, + "learning_rate": 0.00026026367165170024, + "loss": 0.5753, + "step": 54270 + }, + { + "epoch": 1.2081552706552707, + "grad_norm": 0.42859748005867004, + "learning_rate": 0.0002602192146139912, + "loss": 0.5436, + "step": 54280 + }, + { + "epoch": 1.208377849002849, + "grad_norm": 0.5075267553329468, + "learning_rate": 0.0002601747543039207, + "loss": 0.513, + "step": 54290 + }, + { + "epoch": 1.2086004273504274, + "grad_norm": 0.6833608746528625, + "learning_rate": 0.0002601302907239049, + "loss": 0.5712, + "step": 54300 + }, + { + "epoch": 1.2088230056980056, + "grad_norm": 0.8390812873840332, + "learning_rate": 0.0002600858238763598, + "loss": 0.5476, + "step": 54310 + }, + { + "epoch": 1.209045584045584, + "grad_norm": 0.7078325748443604, + "learning_rate": 0.0002600413537637019, + "loss": 0.6077, + "step": 54320 + }, + { + "epoch": 1.2092681623931625, + "grad_norm": 0.8721582889556885, + "learning_rate": 0.0002599968803883477, + "loss": 0.6386, + "step": 54330 + }, + { + "epoch": 1.2094907407407407, + "grad_norm": 0.6040824055671692, + "learning_rate": 0.0002599524037527138, + "loss": 0.6223, + "step": 54340 + }, + { + "epoch": 1.2097133190883191, + "grad_norm": 0.8039819002151489, + "learning_rate": 0.00025990792385921724, + "loss": 0.5922, + "step": 54350 + }, + { + "epoch": 1.2099358974358974, + "grad_norm": 0.8088663816452026, + "learning_rate": 0.00025986344071027507, + "loss": 0.707, + "step": 54360 + }, + { + "epoch": 1.2101584757834758, + "grad_norm": 0.38190963864326477, + "learning_rate": 0.00025981895430830456, + "loss": 0.5355, + "step": 54370 + }, + { + "epoch": 1.2103810541310542, + "grad_norm": 0.5602192282676697, + "learning_rate": 0.00025977446465572313, + "loss": 0.6918, + "step": 54380 + }, + { + "epoch": 1.2106036324786325, + "grad_norm": 0.3187742531299591, + "learning_rate": 0.00025972997175494826, + "loss": 0.3899, + "step": 54390 + }, + { + "epoch": 1.210826210826211, + "grad_norm": 0.46724361181259155, + "learning_rate": 0.0002596854756083979, + "loss": 0.5282, + "step": 54400 + }, + { + "epoch": 1.211048789173789, + "grad_norm": 0.44511914253234863, + "learning_rate": 0.0002596409762184899, + "loss": 0.4879, + "step": 54410 + }, + { + "epoch": 1.2112713675213675, + "grad_norm": 0.5991200804710388, + "learning_rate": 0.00025959647358764237, + "loss": 0.5285, + "step": 54420 + }, + { + "epoch": 1.2114939458689458, + "grad_norm": 0.5048407316207886, + "learning_rate": 0.00025955196771827374, + "loss": 0.5108, + "step": 54430 + }, + { + "epoch": 1.2117165242165242, + "grad_norm": 0.4918557405471802, + "learning_rate": 0.00025950745861280243, + "loss": 0.4733, + "step": 54440 + }, + { + "epoch": 1.2119391025641026, + "grad_norm": 0.56037837266922, + "learning_rate": 0.00025946294627364713, + "loss": 0.5976, + "step": 54450 + }, + { + "epoch": 1.2121616809116809, + "grad_norm": 0.3522513508796692, + "learning_rate": 0.0002594184307032266, + "loss": 0.5696, + "step": 54460 + }, + { + "epoch": 1.2123842592592593, + "grad_norm": 0.6364935040473938, + "learning_rate": 0.0002593739119039599, + "loss": 0.651, + "step": 54470 + }, + { + "epoch": 1.2126068376068375, + "grad_norm": 0.6097647547721863, + "learning_rate": 0.00025932938987826626, + "loss": 0.6558, + "step": 54480 + }, + { + "epoch": 1.212829415954416, + "grad_norm": 0.5620438456535339, + "learning_rate": 0.000259284864628565, + "loss": 0.5837, + "step": 54490 + }, + { + "epoch": 1.2130519943019944, + "grad_norm": 0.7545850276947021, + "learning_rate": 0.00025924033615727567, + "loss": 0.6373, + "step": 54500 + }, + { + "epoch": 1.2132745726495726, + "grad_norm": 0.49159228801727295, + "learning_rate": 0.000259195804466818, + "loss": 0.4563, + "step": 54510 + }, + { + "epoch": 1.213497150997151, + "grad_norm": 0.5741070508956909, + "learning_rate": 0.0002591512695596118, + "loss": 0.537, + "step": 54520 + }, + { + "epoch": 1.2137197293447293, + "grad_norm": 0.7356338500976562, + "learning_rate": 0.0002591067314380772, + "loss": 0.5082, + "step": 54530 + }, + { + "epoch": 1.2139423076923077, + "grad_norm": 0.49523648619651794, + "learning_rate": 0.00025906219010463446, + "loss": 0.6259, + "step": 54540 + }, + { + "epoch": 1.2141648860398861, + "grad_norm": 0.5031092762947083, + "learning_rate": 0.00025901764556170387, + "loss": 0.5766, + "step": 54550 + }, + { + "epoch": 1.2143874643874644, + "grad_norm": 0.5970885753631592, + "learning_rate": 0.0002589730978117062, + "loss": 0.6695, + "step": 54560 + }, + { + "epoch": 1.2146100427350428, + "grad_norm": 0.8675116300582886, + "learning_rate": 0.0002589285468570621, + "loss": 0.5188, + "step": 54570 + }, + { + "epoch": 1.214832621082621, + "grad_norm": 0.6658830046653748, + "learning_rate": 0.0002588839927001925, + "loss": 0.6013, + "step": 54580 + }, + { + "epoch": 1.2150551994301995, + "grad_norm": 0.37236538529396057, + "learning_rate": 0.0002588394353435185, + "loss": 0.4736, + "step": 54590 + }, + { + "epoch": 1.2152777777777777, + "grad_norm": 0.6953976154327393, + "learning_rate": 0.0002587948747894615, + "loss": 0.4716, + "step": 54600 + }, + { + "epoch": 1.2155003561253561, + "grad_norm": 0.5283172726631165, + "learning_rate": 0.00025875031104044283, + "loss": 0.5972, + "step": 54610 + }, + { + "epoch": 1.2157229344729346, + "grad_norm": 0.9671580195426941, + "learning_rate": 0.00025870574409888415, + "loss": 0.5456, + "step": 54620 + }, + { + "epoch": 1.2159455128205128, + "grad_norm": 0.4424217641353607, + "learning_rate": 0.00025866117396720727, + "loss": 0.6644, + "step": 54630 + }, + { + "epoch": 1.2161680911680912, + "grad_norm": 0.6921446919441223, + "learning_rate": 0.0002586166006478342, + "loss": 0.7807, + "step": 54640 + }, + { + "epoch": 1.2163906695156694, + "grad_norm": 0.6515398025512695, + "learning_rate": 0.00025857202414318706, + "loss": 0.6287, + "step": 54650 + }, + { + "epoch": 1.2166132478632479, + "grad_norm": 0.6656025052070618, + "learning_rate": 0.0002585274444556882, + "loss": 0.5519, + "step": 54660 + }, + { + "epoch": 1.216835826210826, + "grad_norm": 0.5132244825363159, + "learning_rate": 0.00025848286158776005, + "loss": 0.5149, + "step": 54670 + }, + { + "epoch": 1.2170584045584045, + "grad_norm": 0.755617618560791, + "learning_rate": 0.00025843827554182535, + "loss": 0.5941, + "step": 54680 + }, + { + "epoch": 1.217280982905983, + "grad_norm": 0.4537512958049774, + "learning_rate": 0.0002583936863203069, + "loss": 0.5314, + "step": 54690 + }, + { + "epoch": 1.2175035612535612, + "grad_norm": 0.5429509282112122, + "learning_rate": 0.00025834909392562775, + "loss": 0.5425, + "step": 54700 + }, + { + "epoch": 1.2177261396011396, + "grad_norm": 0.766391396522522, + "learning_rate": 0.0002583044983602111, + "loss": 0.671, + "step": 54710 + }, + { + "epoch": 1.217948717948718, + "grad_norm": 0.7556502819061279, + "learning_rate": 0.00025825989962648024, + "loss": 0.5542, + "step": 54720 + }, + { + "epoch": 1.2181712962962963, + "grad_norm": 0.5049962401390076, + "learning_rate": 0.00025821529772685874, + "loss": 0.5729, + "step": 54730 + }, + { + "epoch": 1.2183938746438747, + "grad_norm": 0.6555672287940979, + "learning_rate": 0.00025817069266377026, + "loss": 0.6085, + "step": 54740 + }, + { + "epoch": 1.218616452991453, + "grad_norm": 0.6055775880813599, + "learning_rate": 0.00025812608443963884, + "loss": 0.6407, + "step": 54750 + }, + { + "epoch": 1.2188390313390314, + "grad_norm": 0.7035840749740601, + "learning_rate": 0.0002580814730568883, + "loss": 0.586, + "step": 54760 + }, + { + "epoch": 1.2190616096866096, + "grad_norm": 0.39872151613235474, + "learning_rate": 0.000258036858517943, + "loss": 0.555, + "step": 54770 + }, + { + "epoch": 1.219284188034188, + "grad_norm": 0.5271748900413513, + "learning_rate": 0.0002579922408252273, + "loss": 0.5704, + "step": 54780 + }, + { + "epoch": 1.2195067663817665, + "grad_norm": 0.5622698664665222, + "learning_rate": 0.00025794761998116576, + "loss": 0.6895, + "step": 54790 + }, + { + "epoch": 1.2197293447293447, + "grad_norm": 0.5202801823616028, + "learning_rate": 0.0002579029959881831, + "loss": 0.4322, + "step": 54800 + }, + { + "epoch": 1.2199519230769231, + "grad_norm": 0.7955455183982849, + "learning_rate": 0.00025785836884870426, + "loss": 0.5425, + "step": 54810 + }, + { + "epoch": 1.2201745014245013, + "grad_norm": 0.801661491394043, + "learning_rate": 0.00025781373856515426, + "loss": 0.5512, + "step": 54820 + }, + { + "epoch": 1.2203970797720798, + "grad_norm": 0.4858386516571045, + "learning_rate": 0.0002577691051399584, + "loss": 0.6204, + "step": 54830 + }, + { + "epoch": 1.220619658119658, + "grad_norm": 0.7022749185562134, + "learning_rate": 0.0002577244685755421, + "loss": 0.5774, + "step": 54840 + }, + { + "epoch": 1.2208422364672364, + "grad_norm": 0.6417854428291321, + "learning_rate": 0.00025767982887433085, + "loss": 0.4721, + "step": 54850 + }, + { + "epoch": 1.2210648148148149, + "grad_norm": 0.7101449966430664, + "learning_rate": 0.00025763518603875063, + "loss": 0.5465, + "step": 54860 + }, + { + "epoch": 1.221287393162393, + "grad_norm": 0.6686637997627258, + "learning_rate": 0.00025759054007122703, + "loss": 0.4875, + "step": 54870 + }, + { + "epoch": 1.2215099715099715, + "grad_norm": 0.6626191139221191, + "learning_rate": 0.00025754589097418644, + "loss": 0.4817, + "step": 54880 + }, + { + "epoch": 1.22173254985755, + "grad_norm": 0.8132066130638123, + "learning_rate": 0.00025750123875005503, + "loss": 0.5493, + "step": 54890 + }, + { + "epoch": 1.2219551282051282, + "grad_norm": 0.5762627720832825, + "learning_rate": 0.0002574565834012592, + "loss": 0.6158, + "step": 54900 + }, + { + "epoch": 1.2221777065527066, + "grad_norm": 0.4756952226161957, + "learning_rate": 0.0002574119249302256, + "loss": 0.5642, + "step": 54910 + }, + { + "epoch": 1.2224002849002849, + "grad_norm": 0.6637765765190125, + "learning_rate": 0.00025736726333938095, + "loss": 0.5216, + "step": 54920 + }, + { + "epoch": 1.2226228632478633, + "grad_norm": 0.7495138049125671, + "learning_rate": 0.0002573225986311523, + "loss": 0.6827, + "step": 54930 + }, + { + "epoch": 1.2228454415954415, + "grad_norm": 0.7420310974121094, + "learning_rate": 0.00025727793080796677, + "loss": 0.683, + "step": 54940 + }, + { + "epoch": 1.22306801994302, + "grad_norm": 0.5983907580375671, + "learning_rate": 0.0002572332598722515, + "loss": 0.5965, + "step": 54950 + }, + { + "epoch": 1.2232905982905984, + "grad_norm": 0.8083733320236206, + "learning_rate": 0.00025718858582643407, + "loss": 0.5241, + "step": 54960 + }, + { + "epoch": 1.2235131766381766, + "grad_norm": 0.630792498588562, + "learning_rate": 0.0002571439086729421, + "loss": 0.4769, + "step": 54970 + }, + { + "epoch": 1.223735754985755, + "grad_norm": 0.4133424758911133, + "learning_rate": 0.00025709922841420324, + "loss": 0.6182, + "step": 54980 + }, + { + "epoch": 1.2239583333333333, + "grad_norm": 0.6753107905387878, + "learning_rate": 0.00025705454505264565, + "loss": 0.488, + "step": 54990 + }, + { + "epoch": 1.2241809116809117, + "grad_norm": 0.6498923897743225, + "learning_rate": 0.0002570098585906974, + "loss": 0.533, + "step": 55000 + }, + { + "epoch": 1.22440349002849, + "grad_norm": 0.7844968438148499, + "learning_rate": 0.0002569651690307867, + "loss": 0.4815, + "step": 55010 + }, + { + "epoch": 1.2246260683760684, + "grad_norm": 0.644562304019928, + "learning_rate": 0.0002569204763753421, + "loss": 0.5618, + "step": 55020 + }, + { + "epoch": 1.2248486467236468, + "grad_norm": 0.342278391122818, + "learning_rate": 0.00025687578062679226, + "loss": 0.6154, + "step": 55030 + }, + { + "epoch": 1.225071225071225, + "grad_norm": 0.3919949233531952, + "learning_rate": 0.00025683108178756593, + "loss": 0.5703, + "step": 55040 + }, + { + "epoch": 1.2252938034188035, + "grad_norm": 0.655899167060852, + "learning_rate": 0.00025678637986009206, + "loss": 0.6265, + "step": 55050 + }, + { + "epoch": 1.225516381766382, + "grad_norm": 0.5508276224136353, + "learning_rate": 0.0002567416748467998, + "loss": 0.51, + "step": 55060 + }, + { + "epoch": 1.22573896011396, + "grad_norm": 0.5614111423492432, + "learning_rate": 0.00025669696675011854, + "loss": 0.692, + "step": 55070 + }, + { + "epoch": 1.2259615384615385, + "grad_norm": 0.5933703184127808, + "learning_rate": 0.00025665225557247763, + "loss": 0.696, + "step": 55080 + }, + { + "epoch": 1.2261841168091168, + "grad_norm": 0.9442933797836304, + "learning_rate": 0.0002566075413163068, + "loss": 0.4774, + "step": 55090 + }, + { + "epoch": 1.2264066951566952, + "grad_norm": 0.45799508690834045, + "learning_rate": 0.00025656282398403584, + "loss": 0.5187, + "step": 55100 + }, + { + "epoch": 1.2266292735042734, + "grad_norm": 1.2259610891342163, + "learning_rate": 0.00025651810357809474, + "loss": 0.5338, + "step": 55110 + }, + { + "epoch": 1.2268518518518519, + "grad_norm": 0.6217261552810669, + "learning_rate": 0.0002564733801009136, + "loss": 0.569, + "step": 55120 + }, + { + "epoch": 1.2270744301994303, + "grad_norm": 0.5604465007781982, + "learning_rate": 0.00025642865355492275, + "loss": 0.5324, + "step": 55130 + }, + { + "epoch": 1.2272970085470085, + "grad_norm": 0.9130906462669373, + "learning_rate": 0.0002563839239425527, + "loss": 0.5891, + "step": 55140 + }, + { + "epoch": 1.227519586894587, + "grad_norm": 0.42064061760902405, + "learning_rate": 0.00025633919126623404, + "loss": 0.6374, + "step": 55150 + }, + { + "epoch": 1.2277421652421652, + "grad_norm": 0.6115346550941467, + "learning_rate": 0.00025629445552839756, + "loss": 0.5738, + "step": 55160 + }, + { + "epoch": 1.2279647435897436, + "grad_norm": 0.7261988520622253, + "learning_rate": 0.00025624971673147436, + "loss": 0.5057, + "step": 55170 + }, + { + "epoch": 1.2281873219373218, + "grad_norm": 0.6409322023391724, + "learning_rate": 0.0002562049748778955, + "loss": 0.6314, + "step": 55180 + }, + { + "epoch": 1.2284099002849003, + "grad_norm": 0.7145477533340454, + "learning_rate": 0.00025616022997009225, + "loss": 0.6691, + "step": 55190 + }, + { + "epoch": 1.2286324786324787, + "grad_norm": 0.8280013203620911, + "learning_rate": 0.0002561154820104961, + "loss": 0.5243, + "step": 55200 + }, + { + "epoch": 1.228855056980057, + "grad_norm": 0.7874851822853088, + "learning_rate": 0.0002560707310015388, + "loss": 0.4837, + "step": 55210 + }, + { + "epoch": 1.2290776353276354, + "grad_norm": 0.6673729419708252, + "learning_rate": 0.00025602597694565204, + "loss": 0.5322, + "step": 55220 + }, + { + "epoch": 1.2293002136752136, + "grad_norm": 0.7367050647735596, + "learning_rate": 0.0002559812198452678, + "loss": 0.5932, + "step": 55230 + }, + { + "epoch": 1.229522792022792, + "grad_norm": 0.7605116367340088, + "learning_rate": 0.0002559364597028183, + "loss": 0.6529, + "step": 55240 + }, + { + "epoch": 1.2297453703703705, + "grad_norm": 0.7067446708679199, + "learning_rate": 0.0002558916965207358, + "loss": 0.5559, + "step": 55250 + }, + { + "epoch": 1.2299679487179487, + "grad_norm": 0.7943964004516602, + "learning_rate": 0.0002558469303014527, + "loss": 0.6749, + "step": 55260 + }, + { + "epoch": 1.2301905270655271, + "grad_norm": 0.7154000997543335, + "learning_rate": 0.00025580216104740167, + "loss": 0.5852, + "step": 55270 + }, + { + "epoch": 1.2304131054131053, + "grad_norm": 0.7004664540290833, + "learning_rate": 0.00025575738876101563, + "loss": 0.6099, + "step": 55280 + }, + { + "epoch": 1.2306356837606838, + "grad_norm": 0.37125012278556824, + "learning_rate": 0.0002557126134447273, + "loss": 0.5734, + "step": 55290 + }, + { + "epoch": 1.2308582621082622, + "grad_norm": 0.6853071451187134, + "learning_rate": 0.0002556678351009701, + "loss": 0.5636, + "step": 55300 + }, + { + "epoch": 1.2310808404558404, + "grad_norm": 0.5351976156234741, + "learning_rate": 0.00025562305373217703, + "loss": 0.5404, + "step": 55310 + }, + { + "epoch": 1.2313034188034189, + "grad_norm": 0.6240566372871399, + "learning_rate": 0.00025557826934078184, + "loss": 0.6675, + "step": 55320 + }, + { + "epoch": 1.231525997150997, + "grad_norm": 0.9049735069274902, + "learning_rate": 0.00025553348192921784, + "loss": 0.5755, + "step": 55330 + }, + { + "epoch": 1.2317485754985755, + "grad_norm": 0.49391135573387146, + "learning_rate": 0.000255488691499919, + "loss": 0.5853, + "step": 55340 + }, + { + "epoch": 1.2319711538461537, + "grad_norm": 0.6163642406463623, + "learning_rate": 0.0002554438980553193, + "loss": 0.5491, + "step": 55350 + }, + { + "epoch": 1.2321937321937322, + "grad_norm": 0.7874521017074585, + "learning_rate": 0.00025539910159785276, + "loss": 0.5642, + "step": 55360 + }, + { + "epoch": 1.2324163105413106, + "grad_norm": 0.7445282936096191, + "learning_rate": 0.00025535430212995366, + "loss": 0.5149, + "step": 55370 + }, + { + "epoch": 1.2326388888888888, + "grad_norm": 0.7496173977851868, + "learning_rate": 0.0002553094996540565, + "loss": 0.6407, + "step": 55380 + }, + { + "epoch": 1.2328614672364673, + "grad_norm": 0.7357646226882935, + "learning_rate": 0.00025526469417259587, + "loss": 0.5723, + "step": 55390 + }, + { + "epoch": 1.2330840455840455, + "grad_norm": 0.6955065727233887, + "learning_rate": 0.0002552198856880065, + "loss": 0.5503, + "step": 55400 + }, + { + "epoch": 1.233306623931624, + "grad_norm": 0.8007845878601074, + "learning_rate": 0.0002551750742027233, + "loss": 0.673, + "step": 55410 + }, + { + "epoch": 1.2335292022792024, + "grad_norm": 0.5288559794425964, + "learning_rate": 0.00025513025971918144, + "loss": 0.4695, + "step": 55420 + }, + { + "epoch": 1.2337517806267806, + "grad_norm": 0.4855462610721588, + "learning_rate": 0.00025508544223981617, + "loss": 0.4069, + "step": 55430 + }, + { + "epoch": 1.233974358974359, + "grad_norm": 0.5603774189949036, + "learning_rate": 0.0002550406217670628, + "loss": 0.5772, + "step": 55440 + }, + { + "epoch": 1.2341969373219372, + "grad_norm": 0.4866039454936981, + "learning_rate": 0.000254995798303357, + "loss": 0.586, + "step": 55450 + }, + { + "epoch": 1.2344195156695157, + "grad_norm": 0.5705690383911133, + "learning_rate": 0.0002549509718511345, + "loss": 0.406, + "step": 55460 + }, + { + "epoch": 1.2346420940170941, + "grad_norm": 1.1998240947723389, + "learning_rate": 0.0002549061424128312, + "loss": 0.5793, + "step": 55470 + }, + { + "epoch": 1.2348646723646723, + "grad_norm": 0.8668375015258789, + "learning_rate": 0.0002548613099908832, + "loss": 0.5561, + "step": 55480 + }, + { + "epoch": 1.2350872507122508, + "grad_norm": 0.5967520475387573, + "learning_rate": 0.0002548164745877267, + "loss": 0.5796, + "step": 55490 + }, + { + "epoch": 1.235309829059829, + "grad_norm": 0.41339319944381714, + "learning_rate": 0.00025477163620579816, + "loss": 0.5141, + "step": 55500 + }, + { + "epoch": 1.2355324074074074, + "grad_norm": 0.545566737651825, + "learning_rate": 0.00025472679484753397, + "loss": 0.5999, + "step": 55510 + }, + { + "epoch": 1.2357549857549857, + "grad_norm": 0.5253483653068542, + "learning_rate": 0.00025468195051537093, + "loss": 0.6773, + "step": 55520 + }, + { + "epoch": 1.235977564102564, + "grad_norm": 0.7213420271873474, + "learning_rate": 0.000254637103211746, + "loss": 0.5317, + "step": 55530 + }, + { + "epoch": 1.2362001424501425, + "grad_norm": 0.7994476556777954, + "learning_rate": 0.0002545922529390961, + "loss": 0.4462, + "step": 55540 + }, + { + "epoch": 1.2364227207977208, + "grad_norm": 0.4448365867137909, + "learning_rate": 0.0002545473996998585, + "loss": 0.5264, + "step": 55550 + }, + { + "epoch": 1.2366452991452992, + "grad_norm": 0.7122238874435425, + "learning_rate": 0.00025450254349647063, + "loss": 0.5363, + "step": 55560 + }, + { + "epoch": 1.2368678774928774, + "grad_norm": 0.5884889364242554, + "learning_rate": 0.0002544576843313698, + "loss": 0.4136, + "step": 55570 + }, + { + "epoch": 1.2370904558404558, + "grad_norm": 0.6523982286453247, + "learning_rate": 0.0002544128222069939, + "loss": 0.7017, + "step": 55580 + }, + { + "epoch": 1.237313034188034, + "grad_norm": 0.5850688219070435, + "learning_rate": 0.0002543679571257807, + "loss": 0.5455, + "step": 55590 + }, + { + "epoch": 1.2375356125356125, + "grad_norm": 0.6737033724784851, + "learning_rate": 0.00025432308909016817, + "loss": 0.4931, + "step": 55600 + }, + { + "epoch": 1.237758190883191, + "grad_norm": 0.4806579053401947, + "learning_rate": 0.00025427821810259456, + "loss": 0.4767, + "step": 55610 + }, + { + "epoch": 1.2379807692307692, + "grad_norm": 0.7363060116767883, + "learning_rate": 0.00025423334416549805, + "loss": 0.5505, + "step": 55620 + }, + { + "epoch": 1.2382033475783476, + "grad_norm": 0.662604570388794, + "learning_rate": 0.00025418846728131735, + "loss": 0.5909, + "step": 55630 + }, + { + "epoch": 1.238425925925926, + "grad_norm": 0.5632050633430481, + "learning_rate": 0.00025414358745249086, + "loss": 0.5021, + "step": 55640 + }, + { + "epoch": 1.2386485042735043, + "grad_norm": 0.5719519853591919, + "learning_rate": 0.0002540987046814575, + "loss": 0.4946, + "step": 55650 + }, + { + "epoch": 1.2388710826210827, + "grad_norm": 0.6867715716362, + "learning_rate": 0.00025405381897065633, + "loss": 0.5261, + "step": 55660 + }, + { + "epoch": 1.239093660968661, + "grad_norm": 0.7669585943222046, + "learning_rate": 0.00025400893032252633, + "loss": 0.5767, + "step": 55670 + }, + { + "epoch": 1.2393162393162394, + "grad_norm": 0.48740777373313904, + "learning_rate": 0.00025396403873950685, + "loss": 0.6365, + "step": 55680 + }, + { + "epoch": 1.2395388176638176, + "grad_norm": 0.5656322240829468, + "learning_rate": 0.0002539191442240373, + "loss": 0.5406, + "step": 55690 + }, + { + "epoch": 1.239761396011396, + "grad_norm": 0.4301076829433441, + "learning_rate": 0.0002538742467785574, + "loss": 0.6131, + "step": 55700 + }, + { + "epoch": 1.2399839743589745, + "grad_norm": 0.5691587924957275, + "learning_rate": 0.0002538293464055068, + "loss": 0.6222, + "step": 55710 + }, + { + "epoch": 1.2402065527065527, + "grad_norm": 0.9512937664985657, + "learning_rate": 0.00025378444310732536, + "loss": 0.7261, + "step": 55720 + }, + { + "epoch": 1.240429131054131, + "grad_norm": 0.8524298071861267, + "learning_rate": 0.0002537395368864534, + "loss": 0.5653, + "step": 55730 + }, + { + "epoch": 1.2406517094017093, + "grad_norm": 0.6196438670158386, + "learning_rate": 0.00025369462774533087, + "loss": 0.629, + "step": 55740 + }, + { + "epoch": 1.2408742877492878, + "grad_norm": 0.7953673601150513, + "learning_rate": 0.0002536497156863983, + "loss": 0.5801, + "step": 55750 + }, + { + "epoch": 1.241096866096866, + "grad_norm": 0.6761536002159119, + "learning_rate": 0.0002536048007120964, + "loss": 0.5326, + "step": 55760 + }, + { + "epoch": 1.2413194444444444, + "grad_norm": 1.063515067100525, + "learning_rate": 0.00025355988282486566, + "loss": 0.6407, + "step": 55770 + }, + { + "epoch": 1.2415420227920229, + "grad_norm": 0.559059739112854, + "learning_rate": 0.0002535149620271471, + "loss": 0.5126, + "step": 55780 + }, + { + "epoch": 1.241764601139601, + "grad_norm": 0.4550701081752777, + "learning_rate": 0.0002534700383213816, + "loss": 0.5939, + "step": 55790 + }, + { + "epoch": 1.2419871794871795, + "grad_norm": 0.49273550510406494, + "learning_rate": 0.0002534251117100105, + "loss": 0.5026, + "step": 55800 + }, + { + "epoch": 1.242209757834758, + "grad_norm": 0.8309057354927063, + "learning_rate": 0.0002533801821954751, + "loss": 0.4389, + "step": 55810 + }, + { + "epoch": 1.2424323361823362, + "grad_norm": 0.8344371914863586, + "learning_rate": 0.00025333524978021684, + "loss": 0.6484, + "step": 55820 + }, + { + "epoch": 1.2426549145299146, + "grad_norm": 0.5584569573402405, + "learning_rate": 0.0002532903144666775, + "loss": 0.6408, + "step": 55830 + }, + { + "epoch": 1.2428774928774928, + "grad_norm": 0.4493742287158966, + "learning_rate": 0.0002532453762572989, + "loss": 0.6098, + "step": 55840 + }, + { + "epoch": 1.2431000712250713, + "grad_norm": 0.623287558555603, + "learning_rate": 0.00025320043515452285, + "loss": 0.5406, + "step": 55850 + }, + { + "epoch": 1.2433226495726495, + "grad_norm": 0.4659002423286438, + "learning_rate": 0.00025315549116079164, + "loss": 0.5666, + "step": 55860 + }, + { + "epoch": 1.243545227920228, + "grad_norm": 1.6551035642623901, + "learning_rate": 0.0002531105442785476, + "loss": 0.7074, + "step": 55870 + }, + { + "epoch": 1.2437678062678064, + "grad_norm": 0.6472983360290527, + "learning_rate": 0.000253065594510233, + "loss": 0.6331, + "step": 55880 + }, + { + "epoch": 1.2439903846153846, + "grad_norm": 0.5087982416152954, + "learning_rate": 0.00025302064185829065, + "loss": 0.453, + "step": 55890 + }, + { + "epoch": 1.244212962962963, + "grad_norm": 0.49544757604599, + "learning_rate": 0.00025297568632516316, + "loss": 0.456, + "step": 55900 + }, + { + "epoch": 1.2444355413105412, + "grad_norm": 0.45202043652534485, + "learning_rate": 0.0002529307279132935, + "loss": 0.6218, + "step": 55910 + }, + { + "epoch": 1.2446581196581197, + "grad_norm": 0.6582351922988892, + "learning_rate": 0.00025288576662512477, + "loss": 0.6431, + "step": 55920 + }, + { + "epoch": 1.244880698005698, + "grad_norm": 0.5657109618186951, + "learning_rate": 0.0002528408024631002, + "loss": 0.511, + "step": 55930 + }, + { + "epoch": 1.2451032763532763, + "grad_norm": 0.7638427019119263, + "learning_rate": 0.00025279583542966316, + "loss": 0.5974, + "step": 55940 + }, + { + "epoch": 1.2453258547008548, + "grad_norm": 0.6323970556259155, + "learning_rate": 0.00025275086552725717, + "loss": 0.6025, + "step": 55950 + }, + { + "epoch": 1.245548433048433, + "grad_norm": 0.5526149868965149, + "learning_rate": 0.000252705892758326, + "loss": 0.5352, + "step": 55960 + }, + { + "epoch": 1.2457710113960114, + "grad_norm": 1.1778260469436646, + "learning_rate": 0.00025266091712531345, + "loss": 0.6064, + "step": 55970 + }, + { + "epoch": 1.2459935897435896, + "grad_norm": 0.9252521991729736, + "learning_rate": 0.00025261593863066357, + "loss": 0.7074, + "step": 55980 + }, + { + "epoch": 1.246216168091168, + "grad_norm": 0.5749571919441223, + "learning_rate": 0.0002525709572768205, + "loss": 0.5105, + "step": 55990 + }, + { + "epoch": 1.2464387464387465, + "grad_norm": 0.6217884421348572, + "learning_rate": 0.0002525259730662286, + "loss": 0.5243, + "step": 56000 + }, + { + "epoch": 1.2466613247863247, + "grad_norm": 0.6942879557609558, + "learning_rate": 0.00025248098600133225, + "loss": 0.6208, + "step": 56010 + }, + { + "epoch": 1.2468839031339032, + "grad_norm": 0.597964882850647, + "learning_rate": 0.0002524359960845763, + "loss": 0.5515, + "step": 56020 + }, + { + "epoch": 1.2471064814814814, + "grad_norm": 0.5471338033676147, + "learning_rate": 0.00025239100331840526, + "loss": 0.6837, + "step": 56030 + }, + { + "epoch": 1.2473290598290598, + "grad_norm": 0.6313568353652954, + "learning_rate": 0.00025234600770526424, + "loss": 0.5684, + "step": 56040 + }, + { + "epoch": 1.2475516381766383, + "grad_norm": 0.5859391689300537, + "learning_rate": 0.00025230100924759837, + "loss": 0.429, + "step": 56050 + }, + { + "epoch": 1.2477742165242165, + "grad_norm": 0.5821884870529175, + "learning_rate": 0.00025225600794785274, + "loss": 0.4953, + "step": 56060 + }, + { + "epoch": 1.247996794871795, + "grad_norm": 0.7767961621284485, + "learning_rate": 0.00025221100380847287, + "loss": 0.5567, + "step": 56070 + }, + { + "epoch": 1.2482193732193732, + "grad_norm": 0.4788326025009155, + "learning_rate": 0.00025216599683190445, + "loss": 0.6074, + "step": 56080 + }, + { + "epoch": 1.2484419515669516, + "grad_norm": 0.7091581225395203, + "learning_rate": 0.00025212098702059296, + "loss": 0.5637, + "step": 56090 + }, + { + "epoch": 1.2486645299145298, + "grad_norm": 0.723284125328064, + "learning_rate": 0.00025207597437698436, + "loss": 0.4697, + "step": 56100 + }, + { + "epoch": 1.2488871082621082, + "grad_norm": 0.6637664437294006, + "learning_rate": 0.00025203095890352466, + "loss": 0.5175, + "step": 56110 + }, + { + "epoch": 1.2491096866096867, + "grad_norm": 0.5903345942497253, + "learning_rate": 0.00025198594060266014, + "loss": 0.6325, + "step": 56120 + }, + { + "epoch": 1.249332264957265, + "grad_norm": 0.8338324427604675, + "learning_rate": 0.00025194091947683693, + "loss": 0.6398, + "step": 56130 + }, + { + "epoch": 1.2495548433048433, + "grad_norm": 0.6326521039009094, + "learning_rate": 0.0002518958955285017, + "loss": 0.3617, + "step": 56140 + }, + { + "epoch": 1.2497774216524216, + "grad_norm": 0.615234375, + "learning_rate": 0.00025185086876010104, + "loss": 0.7264, + "step": 56150 + }, + { + "epoch": 1.25, + "grad_norm": 0.8148561716079712, + "learning_rate": 0.00025180583917408175, + "loss": 0.6266, + "step": 56160 + }, + { + "epoch": 1.2502225783475782, + "grad_norm": 0.5812483429908752, + "learning_rate": 0.0002517608067728907, + "loss": 0.49, + "step": 56170 + }, + { + "epoch": 1.2504451566951567, + "grad_norm": 0.3880385458469391, + "learning_rate": 0.00025171577155897503, + "loss": 0.5487, + "step": 56180 + }, + { + "epoch": 1.250667735042735, + "grad_norm": 0.5918259024620056, + "learning_rate": 0.0002516707335347821, + "loss": 0.4947, + "step": 56190 + }, + { + "epoch": 1.2508903133903133, + "grad_norm": 0.6482241749763489, + "learning_rate": 0.0002516256927027591, + "loss": 0.6409, + "step": 56200 + }, + { + "epoch": 1.2511128917378918, + "grad_norm": 0.6706924438476562, + "learning_rate": 0.0002515806490653537, + "loss": 0.6098, + "step": 56210 + }, + { + "epoch": 1.2513354700854702, + "grad_norm": 0.6281888484954834, + "learning_rate": 0.00025153560262501363, + "loss": 0.5878, + "step": 56220 + }, + { + "epoch": 1.2515580484330484, + "grad_norm": 0.8230198621749878, + "learning_rate": 0.0002514905533841867, + "loss": 0.5487, + "step": 56230 + }, + { + "epoch": 1.2517806267806268, + "grad_norm": 0.6760601997375488, + "learning_rate": 0.000251445501345321, + "loss": 0.6127, + "step": 56240 + }, + { + "epoch": 1.252003205128205, + "grad_norm": 0.6492635607719421, + "learning_rate": 0.00025140044651086456, + "loss": 0.5875, + "step": 56250 + }, + { + "epoch": 1.2522257834757835, + "grad_norm": 0.6327205300331116, + "learning_rate": 0.00025135538888326585, + "loss": 0.5046, + "step": 56260 + }, + { + "epoch": 1.2524483618233617, + "grad_norm": 0.7743632793426514, + "learning_rate": 0.00025131032846497324, + "loss": 0.5061, + "step": 56270 + }, + { + "epoch": 1.2526709401709402, + "grad_norm": 0.5469549298286438, + "learning_rate": 0.0002512652652584354, + "loss": 0.5637, + "step": 56280 + }, + { + "epoch": 1.2528935185185186, + "grad_norm": 0.584494948387146, + "learning_rate": 0.00025122019926610104, + "loss": 0.4669, + "step": 56290 + }, + { + "epoch": 1.2531160968660968, + "grad_norm": 0.614250659942627, + "learning_rate": 0.00025117513049041916, + "loss": 0.7461, + "step": 56300 + }, + { + "epoch": 1.2533386752136753, + "grad_norm": 0.6992598176002502, + "learning_rate": 0.0002511300589338388, + "loss": 0.6399, + "step": 56310 + }, + { + "epoch": 1.2535612535612537, + "grad_norm": 0.7041775584220886, + "learning_rate": 0.0002510849845988091, + "loss": 0.5378, + "step": 56320 + }, + { + "epoch": 1.253783831908832, + "grad_norm": 0.7931853532791138, + "learning_rate": 0.00025103990748777963, + "loss": 0.5429, + "step": 56330 + }, + { + "epoch": 1.2540064102564101, + "grad_norm": 0.7894284725189209, + "learning_rate": 0.0002509948276031997, + "loss": 0.494, + "step": 56340 + }, + { + "epoch": 1.2542289886039886, + "grad_norm": 0.4571925103664398, + "learning_rate": 0.00025094974494751913, + "loss": 0.5855, + "step": 56350 + }, + { + "epoch": 1.254451566951567, + "grad_norm": 0.6734161376953125, + "learning_rate": 0.0002509046595231877, + "loss": 0.6186, + "step": 56360 + }, + { + "epoch": 1.2546741452991452, + "grad_norm": 0.543872594833374, + "learning_rate": 0.0002508595713326555, + "loss": 0.492, + "step": 56370 + }, + { + "epoch": 1.2548967236467237, + "grad_norm": 0.680564820766449, + "learning_rate": 0.0002508144803783724, + "loss": 0.5019, + "step": 56380 + }, + { + "epoch": 1.255119301994302, + "grad_norm": 0.5233339071273804, + "learning_rate": 0.00025076938666278894, + "loss": 0.4705, + "step": 56390 + }, + { + "epoch": 1.2553418803418803, + "grad_norm": 0.6287051439285278, + "learning_rate": 0.00025072429018835546, + "loss": 0.5671, + "step": 56400 + }, + { + "epoch": 1.2555644586894588, + "grad_norm": 0.6765820980072021, + "learning_rate": 0.00025067919095752244, + "loss": 0.619, + "step": 56410 + }, + { + "epoch": 1.255787037037037, + "grad_norm": 0.4503667950630188, + "learning_rate": 0.00025063408897274075, + "loss": 0.4486, + "step": 56420 + }, + { + "epoch": 1.2560096153846154, + "grad_norm": 0.7574901580810547, + "learning_rate": 0.00025058898423646115, + "loss": 0.5588, + "step": 56430 + }, + { + "epoch": 1.2562321937321936, + "grad_norm": 0.855404257774353, + "learning_rate": 0.00025054387675113484, + "loss": 0.4947, + "step": 56440 + }, + { + "epoch": 1.256454772079772, + "grad_norm": 0.6277042627334595, + "learning_rate": 0.00025049876651921283, + "loss": 0.6169, + "step": 56450 + }, + { + "epoch": 1.2566773504273505, + "grad_norm": 0.7483381628990173, + "learning_rate": 0.00025045365354314656, + "loss": 0.7175, + "step": 56460 + }, + { + "epoch": 1.2568999287749287, + "grad_norm": 0.5631914138793945, + "learning_rate": 0.00025040853782538734, + "loss": 0.5895, + "step": 56470 + }, + { + "epoch": 1.2571225071225072, + "grad_norm": 0.5679247975349426, + "learning_rate": 0.00025036341936838705, + "loss": 0.4367, + "step": 56480 + }, + { + "epoch": 1.2573450854700854, + "grad_norm": 0.5473216772079468, + "learning_rate": 0.00025031829817459723, + "loss": 0.5431, + "step": 56490 + }, + { + "epoch": 1.2575676638176638, + "grad_norm": 0.4608478546142578, + "learning_rate": 0.0002502731742464699, + "loss": 0.5678, + "step": 56500 + }, + { + "epoch": 1.257790242165242, + "grad_norm": 0.726207971572876, + "learning_rate": 0.00025022804758645714, + "loss": 0.6393, + "step": 56510 + }, + { + "epoch": 1.2580128205128205, + "grad_norm": 1.0481374263763428, + "learning_rate": 0.00025018291819701115, + "loss": 0.5899, + "step": 56520 + }, + { + "epoch": 1.258235398860399, + "grad_norm": 0.6271913647651672, + "learning_rate": 0.0002501377860805843, + "loss": 0.5223, + "step": 56530 + }, + { + "epoch": 1.2584579772079771, + "grad_norm": 0.626106321811676, + "learning_rate": 0.00025009265123962916, + "loss": 0.5491, + "step": 56540 + }, + { + "epoch": 1.2586805555555556, + "grad_norm": 0.48963356018066406, + "learning_rate": 0.0002500475136765983, + "loss": 0.5274, + "step": 56550 + }, + { + "epoch": 1.258903133903134, + "grad_norm": 0.4977332055568695, + "learning_rate": 0.0002500023733939446, + "loss": 0.4912, + "step": 56560 + }, + { + "epoch": 1.2591257122507122, + "grad_norm": 0.7764100432395935, + "learning_rate": 0.0002499572303941209, + "loss": 0.7082, + "step": 56570 + }, + { + "epoch": 1.2593482905982907, + "grad_norm": 0.6956349015235901, + "learning_rate": 0.00024991208467958054, + "loss": 0.6231, + "step": 56580 + }, + { + "epoch": 1.259570868945869, + "grad_norm": 0.7979551553726196, + "learning_rate": 0.00024986693625277654, + "loss": 0.5122, + "step": 56590 + }, + { + "epoch": 1.2597934472934473, + "grad_norm": 0.47603729367256165, + "learning_rate": 0.0002498217851161624, + "loss": 0.436, + "step": 56600 + }, + { + "epoch": 1.2600160256410255, + "grad_norm": 0.7334918975830078, + "learning_rate": 0.00024977663127219175, + "loss": 0.6345, + "step": 56610 + }, + { + "epoch": 1.2601495726495726, + "eval_loss": 0.5836193561553955, + "eval_runtime": 337.4574, + "eval_samples_per_second": 7.008, + "eval_steps_per_second": 7.008, + "step": 56616 + }, + { + "epoch": 1.260238603988604, + "grad_norm": 0.6111765503883362, + "learning_rate": 0.0002497314747233182, + "loss": 0.6062, + "step": 56620 + }, + { + "epoch": 1.2604611823361824, + "grad_norm": 0.4151182174682617, + "learning_rate": 0.0002496863154719955, + "loss": 0.4476, + "step": 56630 + }, + { + "epoch": 1.2606837606837606, + "grad_norm": 0.7085245251655579, + "learning_rate": 0.0002496411535206778, + "loss": 0.5815, + "step": 56640 + }, + { + "epoch": 1.260906339031339, + "grad_norm": 0.5205219388008118, + "learning_rate": 0.00024959598887181925, + "loss": 0.5473, + "step": 56650 + }, + { + "epoch": 1.2611289173789173, + "grad_norm": 0.7376794815063477, + "learning_rate": 0.000249550821527874, + "loss": 0.5397, + "step": 56660 + }, + { + "epoch": 1.2613514957264957, + "grad_norm": 0.4363914132118225, + "learning_rate": 0.00024950565149129653, + "loss": 0.5169, + "step": 56670 + }, + { + "epoch": 1.261574074074074, + "grad_norm": 0.6825656890869141, + "learning_rate": 0.0002494604787645415, + "loss": 0.5345, + "step": 56680 + }, + { + "epoch": 1.2617966524216524, + "grad_norm": 0.6744056344032288, + "learning_rate": 0.00024941530335006345, + "loss": 0.682, + "step": 56690 + }, + { + "epoch": 1.2620192307692308, + "grad_norm": 0.6776096820831299, + "learning_rate": 0.00024937012525031745, + "loss": 0.5671, + "step": 56700 + }, + { + "epoch": 1.262241809116809, + "grad_norm": 0.5862076282501221, + "learning_rate": 0.0002493249444677584, + "loss": 0.5864, + "step": 56710 + }, + { + "epoch": 1.2624643874643875, + "grad_norm": 0.6812041401863098, + "learning_rate": 0.0002492797610048415, + "loss": 0.6633, + "step": 56720 + }, + { + "epoch": 1.262686965811966, + "grad_norm": 0.610002875328064, + "learning_rate": 0.000249234574864022, + "loss": 0.5861, + "step": 56730 + }, + { + "epoch": 1.2629095441595442, + "grad_norm": 0.4847986102104187, + "learning_rate": 0.0002491893860477554, + "loss": 0.5637, + "step": 56740 + }, + { + "epoch": 1.2631321225071226, + "grad_norm": 0.641619861125946, + "learning_rate": 0.0002491441945584974, + "loss": 0.6276, + "step": 56750 + }, + { + "epoch": 1.2633547008547008, + "grad_norm": 0.6451306343078613, + "learning_rate": 0.00024909900039870355, + "loss": 0.6559, + "step": 56760 + }, + { + "epoch": 1.2635772792022792, + "grad_norm": 0.8267781138420105, + "learning_rate": 0.00024905380357082983, + "loss": 0.5444, + "step": 56770 + }, + { + "epoch": 1.2637998575498575, + "grad_norm": 0.6346902847290039, + "learning_rate": 0.00024900860407733226, + "loss": 0.5551, + "step": 56780 + }, + { + "epoch": 1.264022435897436, + "grad_norm": 0.8560341596603394, + "learning_rate": 0.00024896340192066704, + "loss": 0.4665, + "step": 56790 + }, + { + "epoch": 1.2642450142450143, + "grad_norm": 0.7849281430244446, + "learning_rate": 0.0002489181971032905, + "loss": 0.5678, + "step": 56800 + }, + { + "epoch": 1.2644675925925926, + "grad_norm": 0.5926291942596436, + "learning_rate": 0.00024887298962765903, + "loss": 0.5712, + "step": 56810 + }, + { + "epoch": 1.264690170940171, + "grad_norm": 0.5794327855110168, + "learning_rate": 0.0002488277794962293, + "loss": 0.567, + "step": 56820 + }, + { + "epoch": 1.2649127492877492, + "grad_norm": 0.5844407081604004, + "learning_rate": 0.0002487825667114581, + "loss": 0.4536, + "step": 56830 + }, + { + "epoch": 1.2651353276353277, + "grad_norm": 0.7397210597991943, + "learning_rate": 0.00024873735127580224, + "loss": 0.5212, + "step": 56840 + }, + { + "epoch": 1.2653579059829059, + "grad_norm": 0.7658064365386963, + "learning_rate": 0.0002486921331917189, + "loss": 0.681, + "step": 56850 + }, + { + "epoch": 1.2655804843304843, + "grad_norm": 0.6665608286857605, + "learning_rate": 0.0002486469124616651, + "loss": 0.5455, + "step": 56860 + }, + { + "epoch": 1.2658030626780628, + "grad_norm": 1.3184243440628052, + "learning_rate": 0.00024860168908809826, + "loss": 0.6189, + "step": 56870 + }, + { + "epoch": 1.266025641025641, + "grad_norm": 0.4860929548740387, + "learning_rate": 0.00024855646307347587, + "loss": 0.4443, + "step": 56880 + }, + { + "epoch": 1.2662482193732194, + "grad_norm": 0.6908466815948486, + "learning_rate": 0.0002485112344202555, + "loss": 0.7412, + "step": 56890 + }, + { + "epoch": 1.2664707977207978, + "grad_norm": 0.4728986620903015, + "learning_rate": 0.000248466003130895, + "loss": 0.6416, + "step": 56900 + }, + { + "epoch": 1.266693376068376, + "grad_norm": 0.5925902128219604, + "learning_rate": 0.00024842076920785215, + "loss": 0.5215, + "step": 56910 + }, + { + "epoch": 1.2669159544159543, + "grad_norm": 0.5110843777656555, + "learning_rate": 0.0002483755326535851, + "loss": 0.6172, + "step": 56920 + }, + { + "epoch": 1.2671385327635327, + "grad_norm": 0.795608401298523, + "learning_rate": 0.000248330293470552, + "loss": 0.5711, + "step": 56930 + }, + { + "epoch": 1.2673611111111112, + "grad_norm": 0.9751588702201843, + "learning_rate": 0.00024828505166121117, + "loss": 0.5147, + "step": 56940 + }, + { + "epoch": 1.2675836894586894, + "grad_norm": 0.6915881037712097, + "learning_rate": 0.0002482398072280211, + "loss": 0.6626, + "step": 56950 + }, + { + "epoch": 1.2678062678062678, + "grad_norm": 0.5895326137542725, + "learning_rate": 0.00024819456017344043, + "loss": 0.5313, + "step": 56960 + }, + { + "epoch": 1.2680288461538463, + "grad_norm": 0.6742547750473022, + "learning_rate": 0.00024814931049992793, + "loss": 0.6308, + "step": 56970 + }, + { + "epoch": 1.2682514245014245, + "grad_norm": 0.976432204246521, + "learning_rate": 0.0002481040582099424, + "loss": 0.5129, + "step": 56980 + }, + { + "epoch": 1.268474002849003, + "grad_norm": 0.6051344275474548, + "learning_rate": 0.000248058803305943, + "loss": 0.4686, + "step": 56990 + }, + { + "epoch": 1.2686965811965811, + "grad_norm": 0.47400906682014465, + "learning_rate": 0.00024801354579038896, + "loss": 0.6147, + "step": 57000 + }, + { + "epoch": 1.2689191595441596, + "grad_norm": 0.7733394503593445, + "learning_rate": 0.0002479682856657395, + "loss": 0.534, + "step": 57010 + }, + { + "epoch": 1.2691417378917378, + "grad_norm": 0.7045361995697021, + "learning_rate": 0.0002479230229344541, + "loss": 0.6043, + "step": 57020 + }, + { + "epoch": 1.2693643162393162, + "grad_norm": 0.6193640232086182, + "learning_rate": 0.0002478777575989924, + "loss": 0.6197, + "step": 57030 + }, + { + "epoch": 1.2695868945868947, + "grad_norm": 0.5314254760742188, + "learning_rate": 0.00024783248966181416, + "loss": 0.4338, + "step": 57040 + }, + { + "epoch": 1.2698094729344729, + "grad_norm": 0.5635586977005005, + "learning_rate": 0.00024778721912537926, + "loss": 0.5019, + "step": 57050 + }, + { + "epoch": 1.2700320512820513, + "grad_norm": 0.573419988155365, + "learning_rate": 0.0002477419459921478, + "loss": 0.5775, + "step": 57060 + }, + { + "epoch": 1.2702546296296298, + "grad_norm": 0.6648149490356445, + "learning_rate": 0.00024769667026457994, + "loss": 0.6239, + "step": 57070 + }, + { + "epoch": 1.270477207977208, + "grad_norm": 0.6661327481269836, + "learning_rate": 0.0002476513919451359, + "loss": 0.5564, + "step": 57080 + }, + { + "epoch": 1.2706997863247862, + "grad_norm": 0.5547678470611572, + "learning_rate": 0.0002476061110362762, + "loss": 0.6744, + "step": 57090 + }, + { + "epoch": 1.2709223646723646, + "grad_norm": 0.5434178709983826, + "learning_rate": 0.0002475608275404615, + "loss": 0.4616, + "step": 57100 + }, + { + "epoch": 1.271144943019943, + "grad_norm": 0.45356640219688416, + "learning_rate": 0.0002475155414601525, + "loss": 0.5319, + "step": 57110 + }, + { + "epoch": 1.2713675213675213, + "grad_norm": 0.7899428606033325, + "learning_rate": 0.00024747025279781004, + "loss": 0.5353, + "step": 57120 + }, + { + "epoch": 1.2715900997150997, + "grad_norm": 0.6958609223365784, + "learning_rate": 0.0002474249615558951, + "loss": 0.5495, + "step": 57130 + }, + { + "epoch": 1.2718126780626782, + "grad_norm": 0.579924464225769, + "learning_rate": 0.00024737966773686915, + "loss": 0.5014, + "step": 57140 + }, + { + "epoch": 1.2720352564102564, + "grad_norm": 0.7924838066101074, + "learning_rate": 0.0002473343713431931, + "loss": 0.6105, + "step": 57150 + }, + { + "epoch": 1.2722578347578348, + "grad_norm": 0.9225060343742371, + "learning_rate": 0.0002472890723773286, + "loss": 0.6499, + "step": 57160 + }, + { + "epoch": 1.272480413105413, + "grad_norm": 0.8588963747024536, + "learning_rate": 0.00024724377084173725, + "loss": 0.5493, + "step": 57170 + }, + { + "epoch": 1.2727029914529915, + "grad_norm": 0.6969565749168396, + "learning_rate": 0.00024719846673888063, + "loss": 0.5307, + "step": 57180 + }, + { + "epoch": 1.2729255698005697, + "grad_norm": 0.5771629810333252, + "learning_rate": 0.0002471531600712207, + "loss": 0.5566, + "step": 57190 + }, + { + "epoch": 1.2731481481481481, + "grad_norm": 0.6116702556610107, + "learning_rate": 0.0002471078508412195, + "loss": 0.5853, + "step": 57200 + }, + { + "epoch": 1.2733707264957266, + "grad_norm": 0.9617108106613159, + "learning_rate": 0.00024706253905133914, + "loss": 0.6215, + "step": 57210 + }, + { + "epoch": 1.2735933048433048, + "grad_norm": 0.4409613013267517, + "learning_rate": 0.0002470172247040418, + "loss": 0.6077, + "step": 57220 + }, + { + "epoch": 1.2738158831908832, + "grad_norm": 0.6302945613861084, + "learning_rate": 0.00024697190780179003, + "loss": 0.6608, + "step": 57230 + }, + { + "epoch": 1.2740384615384617, + "grad_norm": 0.5273117423057556, + "learning_rate": 0.00024692658834704633, + "loss": 0.5804, + "step": 57240 + }, + { + "epoch": 1.27426103988604, + "grad_norm": 0.5631963610649109, + "learning_rate": 0.0002468812663422734, + "loss": 0.6048, + "step": 57250 + }, + { + "epoch": 1.274483618233618, + "grad_norm": 0.5829416513442993, + "learning_rate": 0.00024683594178993406, + "loss": 0.605, + "step": 57260 + }, + { + "epoch": 1.2747061965811965, + "grad_norm": 0.7434841394424438, + "learning_rate": 0.00024679061469249134, + "loss": 0.5767, + "step": 57270 + }, + { + "epoch": 1.274928774928775, + "grad_norm": 0.6747451424598694, + "learning_rate": 0.0002467452850524083, + "loss": 0.6182, + "step": 57280 + }, + { + "epoch": 1.2751513532763532, + "grad_norm": 0.6527425050735474, + "learning_rate": 0.0002466999528721482, + "loss": 0.6533, + "step": 57290 + }, + { + "epoch": 1.2753739316239316, + "grad_norm": 0.577849805355072, + "learning_rate": 0.0002466546181541744, + "loss": 0.7178, + "step": 57300 + }, + { + "epoch": 1.27559650997151, + "grad_norm": 0.5812302231788635, + "learning_rate": 0.0002466092809009505, + "loss": 0.6465, + "step": 57310 + }, + { + "epoch": 1.2758190883190883, + "grad_norm": 0.6347759366035461, + "learning_rate": 0.0002465639411149401, + "loss": 0.6204, + "step": 57320 + }, + { + "epoch": 1.2760416666666667, + "grad_norm": 0.49303147196769714, + "learning_rate": 0.000246518598798607, + "loss": 0.4739, + "step": 57330 + }, + { + "epoch": 1.276264245014245, + "grad_norm": 0.6746029853820801, + "learning_rate": 0.0002464732539544152, + "loss": 0.5648, + "step": 57340 + }, + { + "epoch": 1.2764868233618234, + "grad_norm": 0.8490906953811646, + "learning_rate": 0.0002464279065848287, + "loss": 0.5943, + "step": 57350 + }, + { + "epoch": 1.2767094017094016, + "grad_norm": 1.0166454315185547, + "learning_rate": 0.0002463825566923118, + "loss": 0.564, + "step": 57360 + }, + { + "epoch": 1.27693198005698, + "grad_norm": 0.8642008304595947, + "learning_rate": 0.00024633720427932876, + "loss": 0.6205, + "step": 57370 + }, + { + "epoch": 1.2771545584045585, + "grad_norm": 0.7879709005355835, + "learning_rate": 0.0002462918493483441, + "loss": 0.5458, + "step": 57380 + }, + { + "epoch": 1.2773771367521367, + "grad_norm": 1.0417356491088867, + "learning_rate": 0.00024624649190182243, + "loss": 0.6538, + "step": 57390 + }, + { + "epoch": 1.2775997150997151, + "grad_norm": 0.517301619052887, + "learning_rate": 0.0002462011319422286, + "loss": 0.4757, + "step": 57400 + }, + { + "epoch": 1.2778222934472934, + "grad_norm": 0.7360623478889465, + "learning_rate": 0.0002461557694720274, + "loss": 0.4645, + "step": 57410 + }, + { + "epoch": 1.2780448717948718, + "grad_norm": 0.5814298391342163, + "learning_rate": 0.0002461104044936839, + "loss": 0.5377, + "step": 57420 + }, + { + "epoch": 1.27826745014245, + "grad_norm": 0.6260385513305664, + "learning_rate": 0.0002460650370096633, + "loss": 0.4785, + "step": 57430 + }, + { + "epoch": 1.2784900284900285, + "grad_norm": 0.5605562925338745, + "learning_rate": 0.0002460196670224308, + "loss": 0.7021, + "step": 57440 + }, + { + "epoch": 1.278712606837607, + "grad_norm": 0.9051653742790222, + "learning_rate": 0.000245974294534452, + "loss": 0.509, + "step": 57450 + }, + { + "epoch": 1.2789351851851851, + "grad_norm": 0.5881842970848083, + "learning_rate": 0.0002459289195481924, + "loss": 0.5765, + "step": 57460 + }, + { + "epoch": 1.2791577635327636, + "grad_norm": 0.5452075004577637, + "learning_rate": 0.0002458835420661177, + "loss": 0.7318, + "step": 57470 + }, + { + "epoch": 1.279380341880342, + "grad_norm": 0.3864049017429352, + "learning_rate": 0.0002458381620906937, + "loss": 0.4592, + "step": 57480 + }, + { + "epoch": 1.2796029202279202, + "grad_norm": 0.6464360952377319, + "learning_rate": 0.0002457927796243865, + "loss": 0.6142, + "step": 57490 + }, + { + "epoch": 1.2798254985754987, + "grad_norm": 0.680077850818634, + "learning_rate": 0.0002457473946696621, + "loss": 0.6157, + "step": 57500 + }, + { + "epoch": 1.2800480769230769, + "grad_norm": 0.6645841002464294, + "learning_rate": 0.0002457020072289869, + "loss": 0.631, + "step": 57510 + }, + { + "epoch": 1.2802706552706553, + "grad_norm": 1.004813313484192, + "learning_rate": 0.00024565661730482723, + "loss": 0.4855, + "step": 57520 + }, + { + "epoch": 1.2804932336182335, + "grad_norm": 0.743014931678772, + "learning_rate": 0.0002456112248996496, + "loss": 0.4654, + "step": 57530 + }, + { + "epoch": 1.280715811965812, + "grad_norm": 0.5669029951095581, + "learning_rate": 0.00024556583001592063, + "loss": 0.548, + "step": 57540 + }, + { + "epoch": 1.2809383903133904, + "grad_norm": 0.425339937210083, + "learning_rate": 0.0002455204326561071, + "loss": 0.4507, + "step": 57550 + }, + { + "epoch": 1.2811609686609686, + "grad_norm": 0.49417296051979065, + "learning_rate": 0.0002454750328226761, + "loss": 0.5089, + "step": 57560 + }, + { + "epoch": 1.281383547008547, + "grad_norm": 0.5269678831100464, + "learning_rate": 0.0002454296305180945, + "loss": 0.5332, + "step": 57570 + }, + { + "epoch": 1.2816061253561253, + "grad_norm": 0.44602254033088684, + "learning_rate": 0.00024538422574482964, + "loss": 0.4913, + "step": 57580 + }, + { + "epoch": 1.2818287037037037, + "grad_norm": 0.4153461754322052, + "learning_rate": 0.00024533881850534884, + "loss": 0.6558, + "step": 57590 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 0.9092263579368591, + "learning_rate": 0.0002452934088021195, + "loss": 0.5675, + "step": 57600 + }, + { + "epoch": 1.2822738603988604, + "grad_norm": 0.450931191444397, + "learning_rate": 0.0002452479966376092, + "loss": 0.595, + "step": 57610 + }, + { + "epoch": 1.2824964387464388, + "grad_norm": 0.7668671011924744, + "learning_rate": 0.0002452025820142857, + "loss": 0.6489, + "step": 57620 + }, + { + "epoch": 1.282719017094017, + "grad_norm": 0.6127539873123169, + "learning_rate": 0.000245157164934617, + "loss": 0.5202, + "step": 57630 + }, + { + "epoch": 1.2829415954415955, + "grad_norm": 0.7740516662597656, + "learning_rate": 0.0002451117454010709, + "loss": 0.535, + "step": 57640 + }, + { + "epoch": 1.283164173789174, + "grad_norm": 0.7097222208976746, + "learning_rate": 0.0002450663234161156, + "loss": 0.6147, + "step": 57650 + }, + { + "epoch": 1.2833867521367521, + "grad_norm": 0.5709938406944275, + "learning_rate": 0.0002450208989822195, + "loss": 0.5674, + "step": 57660 + }, + { + "epoch": 1.2836093304843303, + "grad_norm": 0.668928325176239, + "learning_rate": 0.00024497547210185086, + "loss": 0.522, + "step": 57670 + }, + { + "epoch": 1.2838319088319088, + "grad_norm": 0.590245246887207, + "learning_rate": 0.0002449300427774782, + "loss": 0.5693, + "step": 57680 + }, + { + "epoch": 1.2840544871794872, + "grad_norm": 0.7981787919998169, + "learning_rate": 0.00024488461101157023, + "loss": 0.556, + "step": 57690 + }, + { + "epoch": 1.2842770655270654, + "grad_norm": 0.6494070887565613, + "learning_rate": 0.0002448391768065958, + "loss": 0.4989, + "step": 57700 + }, + { + "epoch": 1.2844996438746439, + "grad_norm": 0.9221472144126892, + "learning_rate": 0.00024479374016502377, + "loss": 0.5186, + "step": 57710 + }, + { + "epoch": 1.2847222222222223, + "grad_norm": 0.5488528609275818, + "learning_rate": 0.0002447483010893232, + "loss": 0.6221, + "step": 57720 + }, + { + "epoch": 1.2849448005698005, + "grad_norm": 0.5177143216133118, + "learning_rate": 0.0002447028595819634, + "loss": 0.4592, + "step": 57730 + }, + { + "epoch": 1.285167378917379, + "grad_norm": 0.5516415238380432, + "learning_rate": 0.0002446574156454136, + "loss": 0.5079, + "step": 57740 + }, + { + "epoch": 1.2853899572649572, + "grad_norm": 0.4461617171764374, + "learning_rate": 0.0002446119692821432, + "loss": 0.4729, + "step": 57750 + }, + { + "epoch": 1.2856125356125356, + "grad_norm": 0.6107337474822998, + "learning_rate": 0.00024456652049462195, + "loss": 0.5218, + "step": 57760 + }, + { + "epoch": 1.2858351139601139, + "grad_norm": 0.6461668610572815, + "learning_rate": 0.00024452106928531944, + "loss": 0.5012, + "step": 57770 + }, + { + "epoch": 1.2860576923076923, + "grad_norm": 0.7621982097625732, + "learning_rate": 0.0002444756156567056, + "loss": 0.6168, + "step": 57780 + }, + { + "epoch": 1.2862802706552707, + "grad_norm": 0.4369259178638458, + "learning_rate": 0.0002444301596112504, + "loss": 0.5539, + "step": 57790 + }, + { + "epoch": 1.286502849002849, + "grad_norm": 0.6202419996261597, + "learning_rate": 0.00024438470115142386, + "loss": 0.5763, + "step": 57800 + }, + { + "epoch": 1.2867254273504274, + "grad_norm": 0.4778003394603729, + "learning_rate": 0.00024433924027969647, + "loss": 0.5908, + "step": 57810 + }, + { + "epoch": 1.2869480056980058, + "grad_norm": 0.9239721298217773, + "learning_rate": 0.00024429377699853835, + "loss": 0.5821, + "step": 57820 + }, + { + "epoch": 1.287170584045584, + "grad_norm": 0.7348893284797668, + "learning_rate": 0.0002442483113104202, + "loss": 0.53, + "step": 57830 + }, + { + "epoch": 1.2873931623931623, + "grad_norm": 0.4809786379337311, + "learning_rate": 0.0002442028432178126, + "loss": 0.6144, + "step": 57840 + }, + { + "epoch": 1.2876157407407407, + "grad_norm": 0.493743896484375, + "learning_rate": 0.00024415737272318625, + "loss": 0.5849, + "step": 57850 + }, + { + "epoch": 1.2878383190883191, + "grad_norm": 0.818612277507782, + "learning_rate": 0.00024411189982901217, + "loss": 0.6053, + "step": 57860 + }, + { + "epoch": 1.2880608974358974, + "grad_norm": 0.6896690726280212, + "learning_rate": 0.00024406642453776129, + "loss": 0.5601, + "step": 57870 + }, + { + "epoch": 1.2882834757834758, + "grad_norm": 0.5468393564224243, + "learning_rate": 0.0002440209468519049, + "loss": 0.4614, + "step": 57880 + }, + { + "epoch": 1.2885060541310542, + "grad_norm": 0.5655200481414795, + "learning_rate": 0.00024397546677391415, + "loss": 0.5631, + "step": 57890 + }, + { + "epoch": 1.2887286324786325, + "grad_norm": 0.3707646429538727, + "learning_rate": 0.00024392998430626056, + "loss": 0.5412, + "step": 57900 + }, + { + "epoch": 1.288951210826211, + "grad_norm": 0.6031479835510254, + "learning_rate": 0.0002438844994514157, + "loss": 0.6383, + "step": 57910 + }, + { + "epoch": 1.289173789173789, + "grad_norm": 0.6139857172966003, + "learning_rate": 0.00024383901221185114, + "loss": 0.6776, + "step": 57920 + }, + { + "epoch": 1.2893963675213675, + "grad_norm": 0.7917313575744629, + "learning_rate": 0.00024379352259003883, + "loss": 0.6386, + "step": 57930 + }, + { + "epoch": 1.2896189458689458, + "grad_norm": 0.5444529056549072, + "learning_rate": 0.00024374803058845062, + "loss": 0.6352, + "step": 57940 + }, + { + "epoch": 1.2898415242165242, + "grad_norm": 0.5975114107131958, + "learning_rate": 0.00024370253620955863, + "loss": 0.6732, + "step": 57950 + }, + { + "epoch": 1.2900641025641026, + "grad_norm": 0.6343320608139038, + "learning_rate": 0.00024365703945583502, + "loss": 0.6364, + "step": 57960 + }, + { + "epoch": 1.2902866809116809, + "grad_norm": 0.7224991321563721, + "learning_rate": 0.00024361154032975218, + "loss": 0.6188, + "step": 57970 + }, + { + "epoch": 1.2905092592592593, + "grad_norm": 0.6302068829536438, + "learning_rate": 0.0002435660388337825, + "loss": 0.6218, + "step": 57980 + }, + { + "epoch": 1.2907318376068377, + "grad_norm": 0.971505880355835, + "learning_rate": 0.00024352053497039865, + "loss": 0.5681, + "step": 57990 + }, + { + "epoch": 1.290954415954416, + "grad_norm": 1.0870121717453003, + "learning_rate": 0.00024347502874207328, + "loss": 0.4242, + "step": 58000 + }, + { + "epoch": 1.2911769943019942, + "grad_norm": 0.7964351177215576, + "learning_rate": 0.00024342952015127926, + "loss": 0.6279, + "step": 58010 + }, + { + "epoch": 1.2913995726495726, + "grad_norm": 0.5268858075141907, + "learning_rate": 0.00024338400920048955, + "loss": 0.6166, + "step": 58020 + }, + { + "epoch": 1.291622150997151, + "grad_norm": 0.6330119967460632, + "learning_rate": 0.00024333849589217726, + "loss": 0.5551, + "step": 58030 + }, + { + "epoch": 1.2918447293447293, + "grad_norm": 0.6858647465705872, + "learning_rate": 0.0002432929802288156, + "loss": 0.5315, + "step": 58040 + }, + { + "epoch": 1.2920673076923077, + "grad_norm": 0.347758412361145, + "learning_rate": 0.000243247462212878, + "loss": 0.4285, + "step": 58050 + }, + { + "epoch": 1.2922898860398861, + "grad_norm": 0.531583845615387, + "learning_rate": 0.00024320194184683795, + "loss": 0.7146, + "step": 58060 + }, + { + "epoch": 1.2925124643874644, + "grad_norm": 0.613280177116394, + "learning_rate": 0.00024315641913316891, + "loss": 0.6505, + "step": 58070 + }, + { + "epoch": 1.2927350427350428, + "grad_norm": 0.7552340030670166, + "learning_rate": 0.00024311089407434477, + "loss": 0.6316, + "step": 58080 + }, + { + "epoch": 1.292957621082621, + "grad_norm": 0.6747114658355713, + "learning_rate": 0.00024306536667283938, + "loss": 0.7003, + "step": 58090 + }, + { + "epoch": 1.2931801994301995, + "grad_norm": 0.5359660387039185, + "learning_rate": 0.00024301983693112664, + "loss": 0.5668, + "step": 58100 + }, + { + "epoch": 1.2934027777777777, + "grad_norm": 0.969078540802002, + "learning_rate": 0.00024297430485168079, + "loss": 0.5855, + "step": 58110 + }, + { + "epoch": 1.2936253561253561, + "grad_norm": 0.7394294738769531, + "learning_rate": 0.00024292877043697605, + "loss": 0.5572, + "step": 58120 + }, + { + "epoch": 1.2938479344729346, + "grad_norm": 0.7185589671134949, + "learning_rate": 0.00024288323368948676, + "loss": 0.5608, + "step": 58130 + }, + { + "epoch": 1.2940705128205128, + "grad_norm": 0.7642703056335449, + "learning_rate": 0.00024283769461168743, + "loss": 0.7009, + "step": 58140 + }, + { + "epoch": 1.2942930911680912, + "grad_norm": 0.7779717445373535, + "learning_rate": 0.00024279215320605272, + "loss": 0.516, + "step": 58150 + }, + { + "epoch": 1.2945156695156697, + "grad_norm": 0.6956257224082947, + "learning_rate": 0.0002427466094750574, + "loss": 0.5231, + "step": 58160 + }, + { + "epoch": 1.2947382478632479, + "grad_norm": 0.796648383140564, + "learning_rate": 0.00024270106342117628, + "loss": 0.6464, + "step": 58170 + }, + { + "epoch": 1.294960826210826, + "grad_norm": 0.6371486186981201, + "learning_rate": 0.00024265551504688441, + "loss": 0.6439, + "step": 58180 + }, + { + "epoch": 1.2951834045584045, + "grad_norm": 1.0725200176239014, + "learning_rate": 0.00024260996435465697, + "loss": 0.5708, + "step": 58190 + }, + { + "epoch": 1.295405982905983, + "grad_norm": 0.5749133229255676, + "learning_rate": 0.0002425644113469692, + "loss": 0.4988, + "step": 58200 + }, + { + "epoch": 1.2956285612535612, + "grad_norm": 0.4231486916542053, + "learning_rate": 0.00024251885602629645, + "loss": 0.5386, + "step": 58210 + }, + { + "epoch": 1.2958511396011396, + "grad_norm": 0.5720852017402649, + "learning_rate": 0.00024247329839511425, + "loss": 0.546, + "step": 58220 + }, + { + "epoch": 1.296073717948718, + "grad_norm": 0.6299185752868652, + "learning_rate": 0.00024242773845589827, + "loss": 0.5484, + "step": 58230 + }, + { + "epoch": 1.2962962962962963, + "grad_norm": 0.5682166814804077, + "learning_rate": 0.0002423821762111242, + "loss": 0.5864, + "step": 58240 + }, + { + "epoch": 1.2965188746438747, + "grad_norm": 0.4346299469470978, + "learning_rate": 0.00024233661166326803, + "loss": 0.5586, + "step": 58250 + }, + { + "epoch": 1.296741452991453, + "grad_norm": 0.9597857594490051, + "learning_rate": 0.00024229104481480568, + "loss": 0.5722, + "step": 58260 + }, + { + "epoch": 1.2969640313390314, + "grad_norm": 0.6724293828010559, + "learning_rate": 0.0002422454756682134, + "loss": 0.6278, + "step": 58270 + }, + { + "epoch": 1.2971866096866096, + "grad_norm": 0.4890615940093994, + "learning_rate": 0.0002421999042259673, + "loss": 0.5214, + "step": 58280 + }, + { + "epoch": 1.297409188034188, + "grad_norm": 0.7504422664642334, + "learning_rate": 0.0002421543304905439, + "loss": 0.6461, + "step": 58290 + }, + { + "epoch": 1.2976317663817665, + "grad_norm": 0.7557802200317383, + "learning_rate": 0.0002421087544644197, + "loss": 0.6423, + "step": 58300 + }, + { + "epoch": 1.2978543447293447, + "grad_norm": 0.947760820388794, + "learning_rate": 0.00024206317615007127, + "loss": 0.62, + "step": 58310 + }, + { + "epoch": 1.2980769230769231, + "grad_norm": 0.6679830551147461, + "learning_rate": 0.00024201759554997546, + "loss": 0.5337, + "step": 58320 + }, + { + "epoch": 1.2982995014245013, + "grad_norm": 0.4446559250354767, + "learning_rate": 0.0002419720126666091, + "loss": 0.4686, + "step": 58330 + }, + { + "epoch": 1.2985220797720798, + "grad_norm": 0.7188494801521301, + "learning_rate": 0.00024192642750244919, + "loss": 0.5269, + "step": 58340 + }, + { + "epoch": 1.298744658119658, + "grad_norm": 0.42798712849617004, + "learning_rate": 0.00024188084005997285, + "loss": 0.6084, + "step": 58350 + }, + { + "epoch": 1.2989672364672364, + "grad_norm": 0.5221139788627625, + "learning_rate": 0.0002418352503416574, + "loss": 0.586, + "step": 58360 + }, + { + "epoch": 1.2991898148148149, + "grad_norm": 0.8559644222259521, + "learning_rate": 0.00024178965834998023, + "loss": 0.6131, + "step": 58370 + }, + { + "epoch": 1.299412393162393, + "grad_norm": 0.7762161493301392, + "learning_rate": 0.00024174406408741876, + "loss": 0.5871, + "step": 58380 + }, + { + "epoch": 1.2996349715099715, + "grad_norm": 0.7057475447654724, + "learning_rate": 0.00024169846755645074, + "loss": 0.5825, + "step": 58390 + }, + { + "epoch": 1.29985754985755, + "grad_norm": 0.4852276146411896, + "learning_rate": 0.00024165286875955385, + "loss": 0.5527, + "step": 58400 + }, + { + "epoch": 1.3000801282051282, + "grad_norm": 0.6271269917488098, + "learning_rate": 0.00024160726769920598, + "loss": 0.5368, + "step": 58410 + }, + { + "epoch": 1.3003027065527066, + "grad_norm": 0.6222318410873413, + "learning_rate": 0.00024156166437788504, + "loss": 0.6229, + "step": 58420 + }, + { + "epoch": 1.3005252849002849, + "grad_norm": 0.9286940097808838, + "learning_rate": 0.0002415160587980693, + "loss": 0.5029, + "step": 58430 + }, + { + "epoch": 1.3007478632478633, + "grad_norm": 0.5464807152748108, + "learning_rate": 0.00024147045096223693, + "loss": 0.6235, + "step": 58440 + }, + { + "epoch": 1.3009704415954415, + "grad_norm": 0.5680151581764221, + "learning_rate": 0.00024142484087286633, + "loss": 0.5727, + "step": 58450 + }, + { + "epoch": 1.30119301994302, + "grad_norm": 0.6306223273277283, + "learning_rate": 0.00024137922853243588, + "loss": 0.778, + "step": 58460 + }, + { + "epoch": 1.3014155982905984, + "grad_norm": 0.4028373658657074, + "learning_rate": 0.0002413336139434244, + "loss": 0.5189, + "step": 58470 + }, + { + "epoch": 1.3016381766381766, + "grad_norm": 0.473341703414917, + "learning_rate": 0.0002412879971083104, + "loss": 0.6445, + "step": 58480 + }, + { + "epoch": 1.301860754985755, + "grad_norm": 0.5544946193695068, + "learning_rate": 0.00024124237802957286, + "loss": 0.5764, + "step": 58490 + }, + { + "epoch": 1.3020833333333333, + "grad_norm": 0.7774984836578369, + "learning_rate": 0.0002411967567096907, + "loss": 0.7581, + "step": 58500 + }, + { + "epoch": 1.3023059116809117, + "grad_norm": 0.664369523525238, + "learning_rate": 0.00024115113315114313, + "loss": 0.5788, + "step": 58510 + }, + { + "epoch": 1.30252849002849, + "grad_norm": 0.831189751625061, + "learning_rate": 0.00024110550735640928, + "loss": 0.6589, + "step": 58520 + }, + { + "epoch": 1.3027510683760684, + "grad_norm": 0.6030679941177368, + "learning_rate": 0.0002410598793279685, + "loss": 0.6406, + "step": 58530 + }, + { + "epoch": 1.3029736467236468, + "grad_norm": 0.534193217754364, + "learning_rate": 0.00024101424906830028, + "loss": 0.5129, + "step": 58540 + }, + { + "epoch": 1.303196225071225, + "grad_norm": 0.5773665904998779, + "learning_rate": 0.00024096861657988417, + "loss": 0.6065, + "step": 58550 + }, + { + "epoch": 1.3034188034188035, + "grad_norm": 0.8891463875770569, + "learning_rate": 0.00024092298186519987, + "loss": 0.7145, + "step": 58560 + }, + { + "epoch": 1.303641381766382, + "grad_norm": 0.5164126753807068, + "learning_rate": 0.00024087734492672725, + "loss": 0.5036, + "step": 58570 + }, + { + "epoch": 1.30386396011396, + "grad_norm": 0.5912543535232544, + "learning_rate": 0.00024083170576694635, + "loss": 0.6331, + "step": 58580 + }, + { + "epoch": 1.3040865384615383, + "grad_norm": 0.5806360244750977, + "learning_rate": 0.00024078606438833697, + "loss": 0.6346, + "step": 58590 + }, + { + "epoch": 1.3043091168091168, + "grad_norm": 0.6180190443992615, + "learning_rate": 0.00024074042079337953, + "loss": 0.518, + "step": 58600 + }, + { + "epoch": 1.3045316951566952, + "grad_norm": 1.0360801219940186, + "learning_rate": 0.0002406947749845543, + "loss": 0.593, + "step": 58610 + }, + { + "epoch": 1.3047542735042734, + "grad_norm": 0.7276124358177185, + "learning_rate": 0.0002406491269643416, + "loss": 0.6122, + "step": 58620 + }, + { + "epoch": 1.3049768518518519, + "grad_norm": 0.6221876740455627, + "learning_rate": 0.00024060347673522214, + "loss": 0.636, + "step": 58630 + }, + { + "epoch": 1.3051994301994303, + "grad_norm": 0.631127119064331, + "learning_rate": 0.00024055782429967644, + "loss": 0.6477, + "step": 58640 + }, + { + "epoch": 1.3054220085470085, + "grad_norm": 0.6096075177192688, + "learning_rate": 0.00024051216966018552, + "loss": 0.6325, + "step": 58650 + }, + { + "epoch": 1.305644586894587, + "grad_norm": 1.1462111473083496, + "learning_rate": 0.00024046651281923, + "loss": 0.6728, + "step": 58660 + }, + { + "epoch": 1.3058671652421652, + "grad_norm": 0.4736732244491577, + "learning_rate": 0.00024042085377929103, + "loss": 0.6686, + "step": 58670 + }, + { + "epoch": 1.3060897435897436, + "grad_norm": 0.6927259564399719, + "learning_rate": 0.0002403751925428499, + "loss": 0.6348, + "step": 58680 + }, + { + "epoch": 1.3063123219373218, + "grad_norm": 0.4512447416782379, + "learning_rate": 0.00024032952911238767, + "loss": 0.5676, + "step": 58690 + }, + { + "epoch": 1.3065349002849003, + "grad_norm": 0.45592087507247925, + "learning_rate": 0.00024028386349038576, + "loss": 0.5839, + "step": 58700 + }, + { + "epoch": 1.3067574786324787, + "grad_norm": 0.6498264670372009, + "learning_rate": 0.00024023819567932586, + "loss": 0.5339, + "step": 58710 + }, + { + "epoch": 1.306980056980057, + "grad_norm": 0.7101892232894897, + "learning_rate": 0.0002401925256816894, + "loss": 0.5566, + "step": 58720 + }, + { + "epoch": 1.3072026353276354, + "grad_norm": 0.6671558618545532, + "learning_rate": 0.0002401468534999582, + "loss": 0.5722, + "step": 58730 + }, + { + "epoch": 1.3074252136752138, + "grad_norm": 0.7375342845916748, + "learning_rate": 0.00024010117913661407, + "loss": 0.5704, + "step": 58740 + }, + { + "epoch": 1.307647792022792, + "grad_norm": 0.7588819861412048, + "learning_rate": 0.0002400555025941391, + "loss": 0.587, + "step": 58750 + }, + { + "epoch": 1.3078703703703702, + "grad_norm": 0.6468998789787292, + "learning_rate": 0.0002400098238750153, + "loss": 0.6558, + "step": 58760 + }, + { + "epoch": 1.3080929487179487, + "grad_norm": 0.5806763768196106, + "learning_rate": 0.00023996414298172488, + "loss": 0.5347, + "step": 58770 + }, + { + "epoch": 1.3083155270655271, + "grad_norm": 0.5419186949729919, + "learning_rate": 0.0002399184599167503, + "loss": 0.5315, + "step": 58780 + }, + { + "epoch": 1.3085381054131053, + "grad_norm": 0.5879216194152832, + "learning_rate": 0.00023987277468257386, + "loss": 0.5976, + "step": 58790 + }, + { + "epoch": 1.3087606837606838, + "grad_norm": 0.684470534324646, + "learning_rate": 0.00023982708728167822, + "loss": 0.5041, + "step": 58800 + }, + { + "epoch": 1.3089832621082622, + "grad_norm": 0.5002015829086304, + "learning_rate": 0.00023978139771654603, + "loss": 0.4835, + "step": 58810 + }, + { + "epoch": 1.3092058404558404, + "grad_norm": 0.7689092755317688, + "learning_rate": 0.00023973570598966019, + "loss": 0.5983, + "step": 58820 + }, + { + "epoch": 1.3094284188034189, + "grad_norm": 0.586552083492279, + "learning_rate": 0.0002396900121035035, + "loss": 0.4554, + "step": 58830 + }, + { + "epoch": 1.309650997150997, + "grad_norm": 0.9228227734565735, + "learning_rate": 0.00023964431606055908, + "loss": 0.6011, + "step": 58840 + }, + { + "epoch": 1.3098735754985755, + "grad_norm": 0.6102882623672485, + "learning_rate": 0.00023959861786331007, + "loss": 0.5829, + "step": 58850 + }, + { + "epoch": 1.3100961538461537, + "grad_norm": 0.6513160467147827, + "learning_rate": 0.0002395529175142398, + "loss": 0.4476, + "step": 58860 + }, + { + "epoch": 1.3103187321937322, + "grad_norm": 0.5500876307487488, + "learning_rate": 0.0002395072150158315, + "loss": 0.4558, + "step": 58870 + }, + { + "epoch": 1.3105413105413106, + "grad_norm": 0.49348214268684387, + "learning_rate": 0.00023946151037056886, + "loss": 0.5777, + "step": 58880 + }, + { + "epoch": 1.3107638888888888, + "grad_norm": 0.5179466009140015, + "learning_rate": 0.00023941580358093547, + "loss": 0.6253, + "step": 58890 + }, + { + "epoch": 1.3109864672364673, + "grad_norm": 0.8439163565635681, + "learning_rate": 0.00023937009464941497, + "loss": 0.6493, + "step": 58900 + }, + { + "epoch": 1.3112090455840457, + "grad_norm": 0.5786725878715515, + "learning_rate": 0.00023932438357849133, + "loss": 0.599, + "step": 58910 + }, + { + "epoch": 1.311431623931624, + "grad_norm": 0.5733489990234375, + "learning_rate": 0.00023927867037064853, + "loss": 0.5755, + "step": 58920 + }, + { + "epoch": 1.3116542022792022, + "grad_norm": 0.5293898582458496, + "learning_rate": 0.0002392329550283706, + "loss": 0.6261, + "step": 58930 + }, + { + "epoch": 1.3118767806267806, + "grad_norm": 1.0148966312408447, + "learning_rate": 0.00023918723755414178, + "loss": 0.714, + "step": 58940 + }, + { + "epoch": 1.312099358974359, + "grad_norm": 0.5190929174423218, + "learning_rate": 0.00023914151795044637, + "loss": 0.4459, + "step": 58950 + }, + { + "epoch": 1.3123219373219372, + "grad_norm": 0.5621257424354553, + "learning_rate": 0.00023909579621976884, + "loss": 0.4767, + "step": 58960 + }, + { + "epoch": 1.3125445156695157, + "grad_norm": 0.4501630365848541, + "learning_rate": 0.00023905007236459374, + "loss": 0.533, + "step": 58970 + }, + { + "epoch": 1.3127670940170941, + "grad_norm": 0.3523252606391907, + "learning_rate": 0.00023900434638740578, + "loss": 0.6054, + "step": 58980 + }, + { + "epoch": 1.3129896723646723, + "grad_norm": 0.4954468309879303, + "learning_rate": 0.00023895861829068964, + "loss": 0.7315, + "step": 58990 + }, + { + "epoch": 1.3132122507122508, + "grad_norm": 0.5253806710243225, + "learning_rate": 0.00023891288807693039, + "loss": 0.5097, + "step": 59000 + }, + { + "epoch": 1.313434829059829, + "grad_norm": 0.5932738780975342, + "learning_rate": 0.0002388671557486128, + "loss": 0.6076, + "step": 59010 + }, + { + "epoch": 1.3136574074074074, + "grad_norm": 0.8328933715820312, + "learning_rate": 0.00023882142130822223, + "loss": 0.5887, + "step": 59020 + }, + { + "epoch": 1.3138799857549857, + "grad_norm": 0.5181304812431335, + "learning_rate": 0.00023877568475824386, + "loss": 0.4473, + "step": 59030 + }, + { + "epoch": 1.314102564102564, + "grad_norm": 0.5174534916877747, + "learning_rate": 0.00023872994610116304, + "loss": 0.5158, + "step": 59040 + }, + { + "epoch": 1.3143251424501425, + "grad_norm": 0.8329073190689087, + "learning_rate": 0.0002386842053394652, + "loss": 0.7148, + "step": 59050 + }, + { + "epoch": 1.3145477207977208, + "grad_norm": 0.5756538510322571, + "learning_rate": 0.00023863846247563602, + "loss": 0.6326, + "step": 59060 + }, + { + "epoch": 1.3147702991452992, + "grad_norm": 0.6975627541542053, + "learning_rate": 0.00023859271751216113, + "loss": 0.6248, + "step": 59070 + }, + { + "epoch": 1.3149928774928774, + "grad_norm": 0.7237173318862915, + "learning_rate": 0.00023854697045152637, + "loss": 0.5916, + "step": 59080 + }, + { + "epoch": 1.3152154558404558, + "grad_norm": 0.6929823160171509, + "learning_rate": 0.00023850122129621766, + "loss": 0.7398, + "step": 59090 + }, + { + "epoch": 1.315438034188034, + "grad_norm": 0.6520585417747498, + "learning_rate": 0.0002384554700487211, + "loss": 0.5406, + "step": 59100 + }, + { + "epoch": 1.3156606125356125, + "grad_norm": 0.7275229692459106, + "learning_rate": 0.00023840971671152287, + "loss": 0.6248, + "step": 59110 + }, + { + "epoch": 1.315883190883191, + "grad_norm": 0.7695404291152954, + "learning_rate": 0.00023836396128710914, + "loss": 0.5023, + "step": 59120 + }, + { + "epoch": 1.3161057692307692, + "grad_norm": 0.5422924160957336, + "learning_rate": 0.00023831820377796627, + "loss": 0.6049, + "step": 59130 + }, + { + "epoch": 1.3163283475783476, + "grad_norm": 0.6437564492225647, + "learning_rate": 0.00023827244418658095, + "loss": 0.5623, + "step": 59140 + }, + { + "epoch": 1.316550925925926, + "grad_norm": 0.7772666215896606, + "learning_rate": 0.00023822668251543964, + "loss": 0.5435, + "step": 59150 + }, + { + "epoch": 1.3167735042735043, + "grad_norm": 0.578720211982727, + "learning_rate": 0.00023818091876702906, + "loss": 0.6183, + "step": 59160 + }, + { + "epoch": 1.3169960826210827, + "grad_norm": 0.5212342739105225, + "learning_rate": 0.00023813515294383622, + "loss": 0.5001, + "step": 59170 + }, + { + "epoch": 1.317218660968661, + "grad_norm": 0.3705865144729614, + "learning_rate": 0.0002380893850483479, + "loss": 0.5063, + "step": 59180 + }, + { + "epoch": 1.3174412393162394, + "grad_norm": 0.7557964324951172, + "learning_rate": 0.00023804361508305113, + "loss": 0.4601, + "step": 59190 + }, + { + "epoch": 1.3176638176638176, + "grad_norm": 0.8965262174606323, + "learning_rate": 0.00023799784305043322, + "loss": 0.5239, + "step": 59200 + }, + { + "epoch": 1.317886396011396, + "grad_norm": 0.38725170493125916, + "learning_rate": 0.00023795206895298144, + "loss": 0.4289, + "step": 59210 + }, + { + "epoch": 1.3181089743589745, + "grad_norm": 0.4773240089416504, + "learning_rate": 0.00023790629279318317, + "loss": 0.5446, + "step": 59220 + }, + { + "epoch": 1.3183315527065527, + "grad_norm": 0.8197348713874817, + "learning_rate": 0.00023786051457352585, + "loss": 0.5693, + "step": 59230 + }, + { + "epoch": 1.318554131054131, + "grad_norm": 0.5566990375518799, + "learning_rate": 0.0002378147342964973, + "loss": 0.5183, + "step": 59240 + }, + { + "epoch": 1.3187767094017093, + "grad_norm": 0.4379849433898926, + "learning_rate": 0.000237768951964585, + "loss": 0.538, + "step": 59250 + }, + { + "epoch": 1.3189992877492878, + "grad_norm": 0.4216741919517517, + "learning_rate": 0.000237723167580277, + "loss": 0.522, + "step": 59260 + }, + { + "epoch": 1.319221866096866, + "grad_norm": 0.4712885022163391, + "learning_rate": 0.00023767738114606119, + "loss": 0.4766, + "step": 59270 + }, + { + "epoch": 1.3194444444444444, + "grad_norm": 0.7440497279167175, + "learning_rate": 0.00023763159266442565, + "loss": 0.459, + "step": 59280 + }, + { + "epoch": 1.3196670227920229, + "grad_norm": 0.5086557865142822, + "learning_rate": 0.00023758580213785853, + "loss": 0.6002, + "step": 59290 + }, + { + "epoch": 1.319889601139601, + "grad_norm": 1.0026373863220215, + "learning_rate": 0.00023754000956884816, + "loss": 0.5271, + "step": 59300 + }, + { + "epoch": 1.3201121794871795, + "grad_norm": 0.7639952301979065, + "learning_rate": 0.00023749421495988294, + "loss": 0.6444, + "step": 59310 + }, + { + "epoch": 1.3201566951566952, + "eval_loss": 0.5806767344474792, + "eval_runtime": 337.354, + "eval_samples_per_second": 7.01, + "eval_steps_per_second": 7.01, + "step": 59312 + }, + { + "epoch": 1.320334757834758, + "grad_norm": 0.5289011001586914, + "learning_rate": 0.00023744841831345142, + "loss": 0.6351, + "step": 59320 + }, + { + "epoch": 1.3205573361823362, + "grad_norm": 0.6011822819709778, + "learning_rate": 0.0002374026196320421, + "loss": 0.6502, + "step": 59330 + }, + { + "epoch": 1.3207799145299146, + "grad_norm": 0.522434413433075, + "learning_rate": 0.00023735681891814386, + "loss": 0.6585, + "step": 59340 + }, + { + "epoch": 1.3210024928774928, + "grad_norm": 0.5660699009895325, + "learning_rate": 0.00023731101617424557, + "loss": 0.5665, + "step": 59350 + }, + { + "epoch": 1.3212250712250713, + "grad_norm": 0.7258313298225403, + "learning_rate": 0.00023726521140283603, + "loss": 0.5767, + "step": 59360 + }, + { + "epoch": 1.3214476495726495, + "grad_norm": 0.6533500552177429, + "learning_rate": 0.00023721940460640442, + "loss": 0.5371, + "step": 59370 + }, + { + "epoch": 1.321670227920228, + "grad_norm": 0.6535837650299072, + "learning_rate": 0.00023717359578743986, + "loss": 0.4816, + "step": 59380 + }, + { + "epoch": 1.3218928062678064, + "grad_norm": 0.812309980392456, + "learning_rate": 0.00023712778494843173, + "loss": 0.576, + "step": 59390 + }, + { + "epoch": 1.3221153846153846, + "grad_norm": 0.8531997203826904, + "learning_rate": 0.00023708197209186934, + "loss": 0.524, + "step": 59400 + }, + { + "epoch": 1.322337962962963, + "grad_norm": 0.6680968999862671, + "learning_rate": 0.00023703615722024224, + "loss": 0.4451, + "step": 59410 + }, + { + "epoch": 1.3225605413105412, + "grad_norm": 0.5203328728675842, + "learning_rate": 0.00023699034033604002, + "loss": 0.5163, + "step": 59420 + }, + { + "epoch": 1.3227831196581197, + "grad_norm": 0.5531857013702393, + "learning_rate": 0.0002369445214417525, + "loss": 0.5387, + "step": 59430 + }, + { + "epoch": 1.323005698005698, + "grad_norm": 0.693932056427002, + "learning_rate": 0.00023689870053986934, + "loss": 0.665, + "step": 59440 + }, + { + "epoch": 1.3232282763532763, + "grad_norm": 0.6742056012153625, + "learning_rate": 0.0002368528776328806, + "loss": 0.7007, + "step": 59450 + }, + { + "epoch": 1.3234508547008548, + "grad_norm": 0.6122947335243225, + "learning_rate": 0.00023680705272327636, + "loss": 0.6116, + "step": 59460 + }, + { + "epoch": 1.323673433048433, + "grad_norm": 0.44931745529174805, + "learning_rate": 0.00023676122581354673, + "loss": 0.5818, + "step": 59470 + }, + { + "epoch": 1.3238960113960114, + "grad_norm": 0.5619750022888184, + "learning_rate": 0.000236715396906182, + "loss": 0.554, + "step": 59480 + }, + { + "epoch": 1.3241185897435899, + "grad_norm": 0.5768644213676453, + "learning_rate": 0.00023666956600367254, + "loss": 0.6054, + "step": 59490 + }, + { + "epoch": 1.324341168091168, + "grad_norm": 0.9065821170806885, + "learning_rate": 0.00023662373310850886, + "loss": 0.593, + "step": 59500 + }, + { + "epoch": 1.3245637464387463, + "grad_norm": 0.5836758613586426, + "learning_rate": 0.00023657789822318154, + "loss": 0.562, + "step": 59510 + }, + { + "epoch": 1.3247863247863247, + "grad_norm": 0.5203041434288025, + "learning_rate": 0.00023653206135018122, + "loss": 0.6223, + "step": 59520 + }, + { + "epoch": 1.3250089031339032, + "grad_norm": 0.8059108853340149, + "learning_rate": 0.00023648622249199886, + "loss": 0.5299, + "step": 59530 + }, + { + "epoch": 1.3252314814814814, + "grad_norm": 0.6576545834541321, + "learning_rate": 0.0002364403816511253, + "loss": 0.6453, + "step": 59540 + }, + { + "epoch": 1.3254540598290598, + "grad_norm": 0.5824242830276489, + "learning_rate": 0.00023639453883005147, + "loss": 0.529, + "step": 59550 + }, + { + "epoch": 1.3256766381766383, + "grad_norm": 0.5908051133155823, + "learning_rate": 0.00023634869403126873, + "loss": 0.666, + "step": 59560 + }, + { + "epoch": 1.3258992165242165, + "grad_norm": 0.5934712886810303, + "learning_rate": 0.00023630284725726814, + "loss": 0.4812, + "step": 59570 + }, + { + "epoch": 1.326121794871795, + "grad_norm": 0.5465096831321716, + "learning_rate": 0.00023625699851054113, + "loss": 0.4924, + "step": 59580 + }, + { + "epoch": 1.3263443732193732, + "grad_norm": 0.4866415560245514, + "learning_rate": 0.0002362111477935791, + "loss": 0.6447, + "step": 59590 + }, + { + "epoch": 1.3265669515669516, + "grad_norm": 0.8629594445228577, + "learning_rate": 0.0002361652951088737, + "loss": 0.6519, + "step": 59600 + }, + { + "epoch": 1.3267895299145298, + "grad_norm": 0.3857871890068054, + "learning_rate": 0.0002361194404589165, + "loss": 0.6385, + "step": 59610 + }, + { + "epoch": 1.3270121082621082, + "grad_norm": 1.1052052974700928, + "learning_rate": 0.00023607358384619942, + "loss": 0.5939, + "step": 59620 + }, + { + "epoch": 1.3272346866096867, + "grad_norm": 0.7581235766410828, + "learning_rate": 0.0002360277252732143, + "loss": 0.5635, + "step": 59630 + }, + { + "epoch": 1.327457264957265, + "grad_norm": 0.7944244146347046, + "learning_rate": 0.00023598186474245298, + "loss": 0.4476, + "step": 59640 + }, + { + "epoch": 1.3276798433048433, + "grad_norm": 0.6725006699562073, + "learning_rate": 0.00023593600225640767, + "loss": 0.5455, + "step": 59650 + }, + { + "epoch": 1.3279024216524218, + "grad_norm": 0.5654692649841309, + "learning_rate": 0.00023589013781757064, + "loss": 0.5602, + "step": 59660 + }, + { + "epoch": 1.328125, + "grad_norm": 0.44069766998291016, + "learning_rate": 0.00023584427142843412, + "loss": 0.5114, + "step": 59670 + }, + { + "epoch": 1.3283475783475782, + "grad_norm": 0.6530085206031799, + "learning_rate": 0.0002357984030914905, + "loss": 0.6105, + "step": 59680 + }, + { + "epoch": 1.3285701566951567, + "grad_norm": 0.603279709815979, + "learning_rate": 0.0002357525328092324, + "loss": 0.606, + "step": 59690 + }, + { + "epoch": 1.328792735042735, + "grad_norm": 0.9058358669281006, + "learning_rate": 0.00023570666058415248, + "loss": 0.6441, + "step": 59700 + }, + { + "epoch": 1.3290153133903133, + "grad_norm": 1.116974115371704, + "learning_rate": 0.00023566078641874328, + "loss": 0.5387, + "step": 59710 + }, + { + "epoch": 1.3292378917378918, + "grad_norm": 0.5993869304656982, + "learning_rate": 0.00023561491031549774, + "loss": 0.7069, + "step": 59720 + }, + { + "epoch": 1.3294604700854702, + "grad_norm": 0.516359269618988, + "learning_rate": 0.00023556903227690885, + "loss": 0.5338, + "step": 59730 + }, + { + "epoch": 1.3296830484330484, + "grad_norm": 0.8609277009963989, + "learning_rate": 0.00023552315230546963, + "loss": 0.6723, + "step": 59740 + }, + { + "epoch": 1.3299056267806268, + "grad_norm": 0.872204601764679, + "learning_rate": 0.00023547727040367327, + "loss": 0.5943, + "step": 59750 + }, + { + "epoch": 1.330128205128205, + "grad_norm": 0.5097432732582092, + "learning_rate": 0.00023543138657401298, + "loss": 0.4863, + "step": 59760 + }, + { + "epoch": 1.3303507834757835, + "grad_norm": 0.6608929634094238, + "learning_rate": 0.0002353855008189821, + "loss": 0.4568, + "step": 59770 + }, + { + "epoch": 1.3305733618233617, + "grad_norm": 0.675301194190979, + "learning_rate": 0.0002353396131410742, + "loss": 0.7222, + "step": 59780 + }, + { + "epoch": 1.3307959401709402, + "grad_norm": 0.6722831130027771, + "learning_rate": 0.0002352937235427827, + "loss": 0.5946, + "step": 59790 + }, + { + "epoch": 1.3310185185185186, + "grad_norm": 0.3495960533618927, + "learning_rate": 0.00023524783202660143, + "loss": 0.5669, + "step": 59800 + }, + { + "epoch": 1.3312410968660968, + "grad_norm": 0.7395991683006287, + "learning_rate": 0.00023520193859502412, + "loss": 0.606, + "step": 59810 + }, + { + "epoch": 1.3314636752136753, + "grad_norm": 0.7243375182151794, + "learning_rate": 0.0002351560432505446, + "loss": 0.5941, + "step": 59820 + }, + { + "epoch": 1.3316862535612537, + "grad_norm": 0.6544011235237122, + "learning_rate": 0.00023511014599565696, + "loss": 0.4566, + "step": 59830 + }, + { + "epoch": 1.331908831908832, + "grad_norm": 0.8000948429107666, + "learning_rate": 0.00023506424683285526, + "loss": 0.5545, + "step": 59840 + }, + { + "epoch": 1.3321314102564101, + "grad_norm": 0.6928564310073853, + "learning_rate": 0.00023501834576463365, + "loss": 0.4613, + "step": 59850 + }, + { + "epoch": 1.3323539886039886, + "grad_norm": 0.5772952437400818, + "learning_rate": 0.00023497244279348643, + "loss": 0.6453, + "step": 59860 + }, + { + "epoch": 1.332576566951567, + "grad_norm": 0.6352185606956482, + "learning_rate": 0.00023492653792190802, + "loss": 0.6916, + "step": 59870 + }, + { + "epoch": 1.3327991452991452, + "grad_norm": 0.4369979798793793, + "learning_rate": 0.00023488063115239305, + "loss": 0.6148, + "step": 59880 + }, + { + "epoch": 1.3330217236467237, + "grad_norm": 0.700515866279602, + "learning_rate": 0.00023483472248743596, + "loss": 0.6073, + "step": 59890 + }, + { + "epoch": 1.333244301994302, + "grad_norm": 0.9979990124702454, + "learning_rate": 0.00023478881192953157, + "loss": 0.5441, + "step": 59900 + }, + { + "epoch": 1.3334668803418803, + "grad_norm": 0.7997949123382568, + "learning_rate": 0.00023474289948117468, + "loss": 0.6842, + "step": 59910 + }, + { + "epoch": 1.3336894586894588, + "grad_norm": 0.6371301412582397, + "learning_rate": 0.00023469698514486012, + "loss": 0.4973, + "step": 59920 + }, + { + "epoch": 1.333912037037037, + "grad_norm": 0.6061527729034424, + "learning_rate": 0.00023465106892308298, + "loss": 0.4863, + "step": 59930 + }, + { + "epoch": 1.3341346153846154, + "grad_norm": 0.7907821536064148, + "learning_rate": 0.0002346051508183384, + "loss": 0.6388, + "step": 59940 + }, + { + "epoch": 1.3343571937321936, + "grad_norm": 0.8174198865890503, + "learning_rate": 0.00023455923083312165, + "loss": 0.5345, + "step": 59950 + }, + { + "epoch": 1.334579772079772, + "grad_norm": 0.6560558080673218, + "learning_rate": 0.00023451330896992798, + "loss": 0.6792, + "step": 59960 + }, + { + "epoch": 1.3348023504273505, + "grad_norm": 0.5488092303276062, + "learning_rate": 0.0002344673852312528, + "loss": 0.4952, + "step": 59970 + }, + { + "epoch": 1.3350249287749287, + "grad_norm": 0.8640246987342834, + "learning_rate": 0.00023442145961959177, + "loss": 0.6287, + "step": 59980 + }, + { + "epoch": 1.3352475071225072, + "grad_norm": 0.5481551885604858, + "learning_rate": 0.00023437553213744039, + "loss": 0.5579, + "step": 59990 + }, + { + "epoch": 1.3354700854700854, + "grad_norm": 0.860477864742279, + "learning_rate": 0.00023432960278729444, + "loss": 0.5365, + "step": 60000 + }, + { + "epoch": 1.3356926638176638, + "grad_norm": 0.5612112879753113, + "learning_rate": 0.00023428367157164983, + "loss": 0.5128, + "step": 60010 + }, + { + "epoch": 1.335915242165242, + "grad_norm": 0.5579436421394348, + "learning_rate": 0.0002342377384930024, + "loss": 0.6091, + "step": 60020 + }, + { + "epoch": 1.3361378205128205, + "grad_norm": 0.5270305275917053, + "learning_rate": 0.00023419180355384827, + "loss": 0.4779, + "step": 60030 + }, + { + "epoch": 1.336360398860399, + "grad_norm": 0.6184577345848083, + "learning_rate": 0.00023414586675668346, + "loss": 0.6175, + "step": 60040 + }, + { + "epoch": 1.3365829772079771, + "grad_norm": 0.6760562658309937, + "learning_rate": 0.00023409992810400439, + "loss": 0.479, + "step": 60050 + }, + { + "epoch": 1.3368055555555556, + "grad_norm": 0.6230688691139221, + "learning_rate": 0.00023405398759830727, + "loss": 0.5463, + "step": 60060 + }, + { + "epoch": 1.337028133903134, + "grad_norm": 0.355333149433136, + "learning_rate": 0.00023400804524208852, + "loss": 0.5802, + "step": 60070 + }, + { + "epoch": 1.3372507122507122, + "grad_norm": 0.70462965965271, + "learning_rate": 0.00023396210103784486, + "loss": 0.5679, + "step": 60080 + }, + { + "epoch": 1.3374732905982907, + "grad_norm": 0.7561984658241272, + "learning_rate": 0.00023391615498807283, + "loss": 0.5643, + "step": 60090 + }, + { + "epoch": 1.337695868945869, + "grad_norm": 0.6452364921569824, + "learning_rate": 0.0002338702070952691, + "loss": 0.4912, + "step": 60100 + }, + { + "epoch": 1.3379184472934473, + "grad_norm": 0.759984016418457, + "learning_rate": 0.0002338242573619306, + "loss": 0.5302, + "step": 60110 + }, + { + "epoch": 1.3381410256410255, + "grad_norm": 0.5478008389472961, + "learning_rate": 0.0002337783057905543, + "loss": 0.4984, + "step": 60120 + }, + { + "epoch": 1.338363603988604, + "grad_norm": 0.3967496156692505, + "learning_rate": 0.00023373235238363717, + "loss": 0.5099, + "step": 60130 + }, + { + "epoch": 1.3385861823361824, + "grad_norm": 0.3813558518886566, + "learning_rate": 0.0002336863971436764, + "loss": 0.523, + "step": 60140 + }, + { + "epoch": 1.3388087606837606, + "grad_norm": 0.6732510328292847, + "learning_rate": 0.00023364044007316933, + "loss": 0.5673, + "step": 60150 + }, + { + "epoch": 1.339031339031339, + "grad_norm": 0.5705891847610474, + "learning_rate": 0.00023359448117461312, + "loss": 0.412, + "step": 60160 + }, + { + "epoch": 1.3392539173789173, + "grad_norm": 0.44214022159576416, + "learning_rate": 0.0002335485204505053, + "loss": 0.6245, + "step": 60170 + }, + { + "epoch": 1.3394764957264957, + "grad_norm": 0.5217635631561279, + "learning_rate": 0.0002335025579033434, + "loss": 0.4878, + "step": 60180 + }, + { + "epoch": 1.339699074074074, + "grad_norm": 0.5117211937904358, + "learning_rate": 0.00023345659353562513, + "loss": 0.616, + "step": 60190 + }, + { + "epoch": 1.3399216524216524, + "grad_norm": 0.399731308221817, + "learning_rate": 0.00023341062734984815, + "loss": 0.4766, + "step": 60200 + }, + { + "epoch": 1.3401442307692308, + "grad_norm": 0.6643040776252747, + "learning_rate": 0.0002333646593485103, + "loss": 0.4042, + "step": 60210 + }, + { + "epoch": 1.340366809116809, + "grad_norm": 0.693706214427948, + "learning_rate": 0.0002333186895341096, + "loss": 0.6591, + "step": 60220 + }, + { + "epoch": 1.3405893874643875, + "grad_norm": 0.797396719455719, + "learning_rate": 0.00023327271790914408, + "loss": 0.4986, + "step": 60230 + }, + { + "epoch": 1.340811965811966, + "grad_norm": 0.7917518615722656, + "learning_rate": 0.00023322674447611173, + "loss": 0.5472, + "step": 60240 + }, + { + "epoch": 1.3410345441595442, + "grad_norm": 0.6472679972648621, + "learning_rate": 0.0002331807692375109, + "loss": 0.5395, + "step": 60250 + }, + { + "epoch": 1.3412571225071226, + "grad_norm": 0.5727419257164001, + "learning_rate": 0.0002331347921958399, + "loss": 0.5142, + "step": 60260 + }, + { + "epoch": 1.3414797008547008, + "grad_norm": 0.4394914209842682, + "learning_rate": 0.0002330888133535972, + "loss": 0.6813, + "step": 60270 + }, + { + "epoch": 1.3417022792022792, + "grad_norm": 0.733902096748352, + "learning_rate": 0.0002330428327132813, + "loss": 0.6836, + "step": 60280 + }, + { + "epoch": 1.3419248575498575, + "grad_norm": 0.714946985244751, + "learning_rate": 0.0002329968502773908, + "loss": 0.4508, + "step": 60290 + }, + { + "epoch": 1.342147435897436, + "grad_norm": 0.5767934322357178, + "learning_rate": 0.0002329508660484245, + "loss": 0.541, + "step": 60300 + }, + { + "epoch": 1.3423700142450143, + "grad_norm": 0.5443108081817627, + "learning_rate": 0.00023290488002888107, + "loss": 0.5748, + "step": 60310 + }, + { + "epoch": 1.3425925925925926, + "grad_norm": 0.5115030407905579, + "learning_rate": 0.00023285889222125956, + "loss": 0.4549, + "step": 60320 + }, + { + "epoch": 1.342815170940171, + "grad_norm": 0.6017990112304688, + "learning_rate": 0.00023281290262805896, + "loss": 0.4661, + "step": 60330 + }, + { + "epoch": 1.3430377492877492, + "grad_norm": 0.438231498003006, + "learning_rate": 0.00023276691125177835, + "loss": 0.6032, + "step": 60340 + }, + { + "epoch": 1.3432603276353277, + "grad_norm": 0.5477312207221985, + "learning_rate": 0.000232720918094917, + "loss": 0.6482, + "step": 60350 + }, + { + "epoch": 1.3434829059829059, + "grad_norm": 0.8656781911849976, + "learning_rate": 0.00023267492315997413, + "loss": 0.5728, + "step": 60360 + }, + { + "epoch": 1.3437054843304843, + "grad_norm": 0.6571064591407776, + "learning_rate": 0.00023262892644944922, + "loss": 0.4504, + "step": 60370 + }, + { + "epoch": 1.3439280626780628, + "grad_norm": 0.45420676469802856, + "learning_rate": 0.0002325829279658417, + "loss": 0.4575, + "step": 60380 + }, + { + "epoch": 1.344150641025641, + "grad_norm": 0.668060302734375, + "learning_rate": 0.0002325369277116512, + "loss": 0.7203, + "step": 60390 + }, + { + "epoch": 1.3443732193732194, + "grad_norm": 0.6548107862472534, + "learning_rate": 0.00023249092568937744, + "loss": 0.5799, + "step": 60400 + }, + { + "epoch": 1.3445957977207978, + "grad_norm": 0.8012334704399109, + "learning_rate": 0.00023244492190152016, + "loss": 0.5629, + "step": 60410 + }, + { + "epoch": 1.344818376068376, + "grad_norm": 0.560871958732605, + "learning_rate": 0.0002323989163505793, + "loss": 0.5206, + "step": 60420 + }, + { + "epoch": 1.3450409544159543, + "grad_norm": 0.5777971744537354, + "learning_rate": 0.00023235290903905474, + "loss": 0.5262, + "step": 60430 + }, + { + "epoch": 1.3452635327635327, + "grad_norm": 0.6702395081520081, + "learning_rate": 0.0002323068999694467, + "loss": 0.5214, + "step": 60440 + }, + { + "epoch": 1.3454861111111112, + "grad_norm": 0.5977016091346741, + "learning_rate": 0.00023226088914425516, + "loss": 0.4321, + "step": 60450 + }, + { + "epoch": 1.3457086894586894, + "grad_norm": 0.44499361515045166, + "learning_rate": 0.00023221487656598056, + "loss": 0.5829, + "step": 60460 + }, + { + "epoch": 1.3459312678062678, + "grad_norm": 0.46239128708839417, + "learning_rate": 0.0002321688622371232, + "loss": 0.5084, + "step": 60470 + }, + { + "epoch": 1.3461538461538463, + "grad_norm": 0.4279974102973938, + "learning_rate": 0.00023212284616018356, + "loss": 0.577, + "step": 60480 + }, + { + "epoch": 1.3463764245014245, + "grad_norm": 0.4787989854812622, + "learning_rate": 0.0002320768283376621, + "loss": 0.5562, + "step": 60490 + }, + { + "epoch": 1.346599002849003, + "grad_norm": 0.7557083368301392, + "learning_rate": 0.0002320308087720595, + "loss": 0.5946, + "step": 60500 + }, + { + "epoch": 1.3468215811965811, + "grad_norm": 0.7249836921691895, + "learning_rate": 0.00023198478746587665, + "loss": 0.5909, + "step": 60510 + }, + { + "epoch": 1.3470441595441596, + "grad_norm": 0.7555255889892578, + "learning_rate": 0.00023193876442161417, + "loss": 0.5493, + "step": 60520 + }, + { + "epoch": 1.3472667378917378, + "grad_norm": 0.653106153011322, + "learning_rate": 0.0002318927396417731, + "loss": 0.5377, + "step": 60530 + }, + { + "epoch": 1.3474893162393162, + "grad_norm": 0.5108314156532288, + "learning_rate": 0.00023184671312885446, + "loss": 0.589, + "step": 60540 + }, + { + "epoch": 1.3477118945868947, + "grad_norm": 0.4078660011291504, + "learning_rate": 0.00023180068488535941, + "loss": 0.5749, + "step": 60550 + }, + { + "epoch": 1.3479344729344729, + "grad_norm": 0.8522250652313232, + "learning_rate": 0.00023175465491378906, + "loss": 0.6098, + "step": 60560 + }, + { + "epoch": 1.3481570512820513, + "grad_norm": 0.7734964489936829, + "learning_rate": 0.00023170862321664472, + "loss": 0.5097, + "step": 60570 + }, + { + "epoch": 1.3483796296296298, + "grad_norm": 0.47563299536705017, + "learning_rate": 0.00023166258979642792, + "loss": 0.5825, + "step": 60580 + }, + { + "epoch": 1.348602207977208, + "grad_norm": 0.7040695548057556, + "learning_rate": 0.00023161655465564, + "loss": 0.4996, + "step": 60590 + }, + { + "epoch": 1.3488247863247862, + "grad_norm": 0.5625837445259094, + "learning_rate": 0.00023157051779678262, + "loss": 0.4862, + "step": 60600 + }, + { + "epoch": 1.3490473646723646, + "grad_norm": 0.6915884017944336, + "learning_rate": 0.0002315244792223575, + "loss": 0.6076, + "step": 60610 + }, + { + "epoch": 1.349269943019943, + "grad_norm": 0.6123034954071045, + "learning_rate": 0.0002314784389348664, + "loss": 0.5694, + "step": 60620 + }, + { + "epoch": 1.3494925213675213, + "grad_norm": 0.6105090379714966, + "learning_rate": 0.00023143239693681111, + "loss": 0.5683, + "step": 60630 + }, + { + "epoch": 1.3497150997150997, + "grad_norm": 0.6517614126205444, + "learning_rate": 0.00023138635323069365, + "loss": 0.5279, + "step": 60640 + }, + { + "epoch": 1.3499376780626782, + "grad_norm": 0.6277623176574707, + "learning_rate": 0.0002313403078190161, + "loss": 0.69, + "step": 60650 + }, + { + "epoch": 1.3501602564102564, + "grad_norm": 0.49262914061546326, + "learning_rate": 0.00023129426070428045, + "loss": 0.6217, + "step": 60660 + }, + { + "epoch": 1.3503828347578348, + "grad_norm": 0.46751052141189575, + "learning_rate": 0.0002312482118889891, + "loss": 0.5562, + "step": 60670 + }, + { + "epoch": 1.350605413105413, + "grad_norm": 0.584173321723938, + "learning_rate": 0.00023120216137564441, + "loss": 0.4987, + "step": 60680 + }, + { + "epoch": 1.3508279914529915, + "grad_norm": 0.6504818797111511, + "learning_rate": 0.00023115610916674871, + "loss": 0.6683, + "step": 60690 + }, + { + "epoch": 1.3510505698005697, + "grad_norm": 0.9467840790748596, + "learning_rate": 0.00023111005526480448, + "loss": 0.5891, + "step": 60700 + }, + { + "epoch": 1.3512731481481481, + "grad_norm": 0.3871930241584778, + "learning_rate": 0.00023106399967231443, + "loss": 0.5134, + "step": 60710 + }, + { + "epoch": 1.3514957264957266, + "grad_norm": 0.6421234011650085, + "learning_rate": 0.00023101794239178118, + "loss": 0.4355, + "step": 60720 + }, + { + "epoch": 1.3517183048433048, + "grad_norm": 0.5072511434555054, + "learning_rate": 0.00023097188342570751, + "loss": 0.5835, + "step": 60730 + }, + { + "epoch": 1.3519408831908832, + "grad_norm": 0.7537820339202881, + "learning_rate": 0.00023092582277659638, + "loss": 0.4876, + "step": 60740 + }, + { + "epoch": 1.3521634615384617, + "grad_norm": 0.6739486455917358, + "learning_rate": 0.00023087976044695077, + "loss": 0.5496, + "step": 60750 + }, + { + "epoch": 1.35238603988604, + "grad_norm": 0.6261805891990662, + "learning_rate": 0.00023083369643927366, + "loss": 0.4884, + "step": 60760 + }, + { + "epoch": 1.352608618233618, + "grad_norm": 0.5273316502571106, + "learning_rate": 0.0002307876307560682, + "loss": 0.521, + "step": 60770 + }, + { + "epoch": 1.3528311965811965, + "grad_norm": 0.9039011597633362, + "learning_rate": 0.00023074156339983773, + "loss": 0.625, + "step": 60780 + }, + { + "epoch": 1.353053774928775, + "grad_norm": 0.5701965689659119, + "learning_rate": 0.00023069549437308552, + "loss": 0.5523, + "step": 60790 + }, + { + "epoch": 1.3532763532763532, + "grad_norm": 0.6095125675201416, + "learning_rate": 0.00023064942367831499, + "loss": 0.469, + "step": 60800 + }, + { + "epoch": 1.3534989316239316, + "grad_norm": 0.7715137600898743, + "learning_rate": 0.00023060335131802978, + "loss": 0.5077, + "step": 60810 + }, + { + "epoch": 1.35372150997151, + "grad_norm": 0.45640823245048523, + "learning_rate": 0.0002305572772947333, + "loss": 0.5276, + "step": 60820 + }, + { + "epoch": 1.3539440883190883, + "grad_norm": 0.6939868927001953, + "learning_rate": 0.00023051120161092942, + "loss": 0.5504, + "step": 60830 + }, + { + "epoch": 1.3541666666666667, + "grad_norm": 0.5779340267181396, + "learning_rate": 0.00023046512426912186, + "loss": 0.5491, + "step": 60840 + }, + { + "epoch": 1.354389245014245, + "grad_norm": 0.5798224210739136, + "learning_rate": 0.00023041904527181447, + "loss": 0.5363, + "step": 60850 + }, + { + "epoch": 1.3546118233618234, + "grad_norm": 0.6343097686767578, + "learning_rate": 0.00023037296462151129, + "loss": 0.5538, + "step": 60860 + }, + { + "epoch": 1.3548344017094016, + "grad_norm": 0.9751542210578918, + "learning_rate": 0.00023032688232071632, + "loss": 0.5797, + "step": 60870 + }, + { + "epoch": 1.35505698005698, + "grad_norm": 0.7180834412574768, + "learning_rate": 0.00023028079837193377, + "loss": 0.4524, + "step": 60880 + }, + { + "epoch": 1.3552795584045585, + "grad_norm": 0.5285040140151978, + "learning_rate": 0.00023023471277766784, + "loss": 0.6168, + "step": 60890 + }, + { + "epoch": 1.3555021367521367, + "grad_norm": 0.8148588538169861, + "learning_rate": 0.00023018862554042286, + "loss": 0.4125, + "step": 60900 + }, + { + "epoch": 1.3557247150997151, + "grad_norm": 0.7640627026557922, + "learning_rate": 0.00023014253666270326, + "loss": 0.5679, + "step": 60910 + }, + { + "epoch": 1.3559472934472934, + "grad_norm": 0.6860986351966858, + "learning_rate": 0.0002300964461470135, + "loss": 0.5544, + "step": 60920 + }, + { + "epoch": 1.3561698717948718, + "grad_norm": 0.5359602570533752, + "learning_rate": 0.00023005035399585827, + "loss": 0.6402, + "step": 60930 + }, + { + "epoch": 1.35639245014245, + "grad_norm": 0.7734375596046448, + "learning_rate": 0.00023000426021174225, + "loss": 0.5711, + "step": 60940 + }, + { + "epoch": 1.3566150284900285, + "grad_norm": 0.4474536180496216, + "learning_rate": 0.00022995816479717007, + "loss": 0.3943, + "step": 60950 + }, + { + "epoch": 1.356837606837607, + "grad_norm": 0.6209190487861633, + "learning_rate": 0.00022991206775464673, + "loss": 0.6773, + "step": 60960 + }, + { + "epoch": 1.3570601851851851, + "grad_norm": 0.43275436758995056, + "learning_rate": 0.00022986596908667717, + "loss": 0.4396, + "step": 60970 + }, + { + "epoch": 1.3572827635327636, + "grad_norm": 0.6500193476676941, + "learning_rate": 0.00022981986879576637, + "loss": 0.5547, + "step": 60980 + }, + { + "epoch": 1.357505341880342, + "grad_norm": 0.5044712424278259, + "learning_rate": 0.00022977376688441945, + "loss": 0.6024, + "step": 60990 + }, + { + "epoch": 1.3577279202279202, + "grad_norm": 0.6947741508483887, + "learning_rate": 0.00022972766335514172, + "loss": 0.5264, + "step": 61000 + }, + { + "epoch": 1.3579504985754987, + "grad_norm": 0.6112157106399536, + "learning_rate": 0.00022968155821043843, + "loss": 0.4964, + "step": 61010 + }, + { + "epoch": 1.3581730769230769, + "grad_norm": 0.80865079164505, + "learning_rate": 0.00022963545145281492, + "loss": 0.5743, + "step": 61020 + }, + { + "epoch": 1.3583956552706553, + "grad_norm": 0.4842277467250824, + "learning_rate": 0.00022958934308477672, + "loss": 0.6893, + "step": 61030 + }, + { + "epoch": 1.3586182336182335, + "grad_norm": 0.872617244720459, + "learning_rate": 0.00022954323310882946, + "loss": 0.6816, + "step": 61040 + }, + { + "epoch": 1.358840811965812, + "grad_norm": 0.5222064852714539, + "learning_rate": 0.00022949712152747865, + "loss": 0.4895, + "step": 61050 + }, + { + "epoch": 1.3590633903133904, + "grad_norm": 0.5121984481811523, + "learning_rate": 0.00022945100834323014, + "loss": 0.4499, + "step": 61060 + }, + { + "epoch": 1.3592859686609686, + "grad_norm": 1.0106990337371826, + "learning_rate": 0.00022940489355858978, + "loss": 0.6911, + "step": 61070 + }, + { + "epoch": 1.359508547008547, + "grad_norm": 0.6854296922683716, + "learning_rate": 0.00022935877717606337, + "loss": 0.7157, + "step": 61080 + }, + { + "epoch": 1.3597311253561253, + "grad_norm": 0.5751320719718933, + "learning_rate": 0.00022931265919815696, + "loss": 0.5062, + "step": 61090 + }, + { + "epoch": 1.3599537037037037, + "grad_norm": 0.47325485944747925, + "learning_rate": 0.0002292665396273767, + "loss": 0.5205, + "step": 61100 + }, + { + "epoch": 1.360176282051282, + "grad_norm": 0.6046233177185059, + "learning_rate": 0.0002292204184662287, + "loss": 0.4618, + "step": 61110 + }, + { + "epoch": 1.3603988603988604, + "grad_norm": 0.7212419509887695, + "learning_rate": 0.0002291742957172192, + "loss": 0.5979, + "step": 61120 + }, + { + "epoch": 1.3606214387464388, + "grad_norm": 0.4322417378425598, + "learning_rate": 0.00022912817138285462, + "loss": 0.5809, + "step": 61130 + }, + { + "epoch": 1.360844017094017, + "grad_norm": 0.7356921434402466, + "learning_rate": 0.00022908204546564145, + "loss": 0.529, + "step": 61140 + }, + { + "epoch": 1.3610665954415955, + "grad_norm": 0.3717854619026184, + "learning_rate": 0.000229035917968086, + "loss": 0.5253, + "step": 61150 + }, + { + "epoch": 1.361289173789174, + "grad_norm": 0.8095477223396301, + "learning_rate": 0.000228989788892695, + "loss": 0.5405, + "step": 61160 + }, + { + "epoch": 1.3615117521367521, + "grad_norm": 0.4735598564147949, + "learning_rate": 0.0002289436582419752, + "loss": 0.607, + "step": 61170 + }, + { + "epoch": 1.3617343304843303, + "grad_norm": 0.6034855246543884, + "learning_rate": 0.0002288975260184333, + "loss": 0.5844, + "step": 61180 + }, + { + "epoch": 1.3619569088319088, + "grad_norm": 0.5404762625694275, + "learning_rate": 0.00022885139222457616, + "loss": 0.4322, + "step": 61190 + }, + { + "epoch": 1.3621794871794872, + "grad_norm": 1.5063923597335815, + "learning_rate": 0.00022880525686291075, + "loss": 0.7102, + "step": 61200 + }, + { + "epoch": 1.3624020655270654, + "grad_norm": 0.6246905326843262, + "learning_rate": 0.00022875911993594413, + "loss": 0.5407, + "step": 61210 + }, + { + "epoch": 1.3626246438746439, + "grad_norm": 0.5119190216064453, + "learning_rate": 0.00022871298144618339, + "loss": 0.5707, + "step": 61220 + }, + { + "epoch": 1.3628472222222223, + "grad_norm": 0.5602666735649109, + "learning_rate": 0.0002286668413961357, + "loss": 0.6487, + "step": 61230 + }, + { + "epoch": 1.3630698005698005, + "grad_norm": 0.6652705073356628, + "learning_rate": 0.00022862069978830837, + "loss": 0.5519, + "step": 61240 + }, + { + "epoch": 1.363292378917379, + "grad_norm": 0.6826812028884888, + "learning_rate": 0.00022857455662520884, + "loss": 0.5748, + "step": 61250 + }, + { + "epoch": 1.3635149572649572, + "grad_norm": 0.46549534797668457, + "learning_rate": 0.00022852841190934445, + "loss": 0.5655, + "step": 61260 + }, + { + "epoch": 1.3637375356125356, + "grad_norm": 0.5537489056587219, + "learning_rate": 0.00022848226564322284, + "loss": 0.6324, + "step": 61270 + }, + { + "epoch": 1.3639601139601139, + "grad_norm": 0.46651938557624817, + "learning_rate": 0.0002284361178293516, + "loss": 0.6068, + "step": 61280 + }, + { + "epoch": 1.3641826923076923, + "grad_norm": 0.5096479058265686, + "learning_rate": 0.00022838996847023842, + "loss": 0.4882, + "step": 61290 + }, + { + "epoch": 1.3644052706552707, + "grad_norm": 0.5014933943748474, + "learning_rate": 0.0002283438175683911, + "loss": 0.5067, + "step": 61300 + }, + { + "epoch": 1.364627849002849, + "grad_norm": 0.7643182873725891, + "learning_rate": 0.00022829766512631755, + "loss": 0.6686, + "step": 61310 + }, + { + "epoch": 1.3648504273504274, + "grad_norm": 0.5372745394706726, + "learning_rate": 0.00022825151114652572, + "loss": 0.5306, + "step": 61320 + }, + { + "epoch": 1.3650730056980058, + "grad_norm": 0.5938313603401184, + "learning_rate": 0.00022820535563152362, + "loss": 0.4763, + "step": 61330 + }, + { + "epoch": 1.365295584045584, + "grad_norm": 0.5216083526611328, + "learning_rate": 0.00022815919858381944, + "loss": 0.5552, + "step": 61340 + }, + { + "epoch": 1.3655181623931623, + "grad_norm": 0.4963095486164093, + "learning_rate": 0.00022811304000592135, + "loss": 0.4957, + "step": 61350 + }, + { + "epoch": 1.3657407407407407, + "grad_norm": 0.7983525991439819, + "learning_rate": 0.00022806687990033764, + "loss": 0.531, + "step": 61360 + }, + { + "epoch": 1.3659633190883191, + "grad_norm": 1.3070868253707886, + "learning_rate": 0.00022802071826957669, + "loss": 0.6501, + "step": 61370 + }, + { + "epoch": 1.3661858974358974, + "grad_norm": 0.77958083152771, + "learning_rate": 0.00022797455511614702, + "loss": 0.7044, + "step": 61380 + }, + { + "epoch": 1.3664084757834758, + "grad_norm": 0.6154484152793884, + "learning_rate": 0.00022792839044255705, + "loss": 0.5782, + "step": 61390 + }, + { + "epoch": 1.3666310541310542, + "grad_norm": 0.5589473247528076, + "learning_rate": 0.00022788222425131554, + "loss": 0.4325, + "step": 61400 + }, + { + "epoch": 1.3668536324786325, + "grad_norm": 0.42728063464164734, + "learning_rate": 0.00022783605654493107, + "loss": 0.5843, + "step": 61410 + }, + { + "epoch": 1.367076210826211, + "grad_norm": 0.8472455739974976, + "learning_rate": 0.00022778988732591259, + "loss": 0.6619, + "step": 61420 + }, + { + "epoch": 1.367298789173789, + "grad_norm": 0.6462082862854004, + "learning_rate": 0.0002277437165967688, + "loss": 0.6922, + "step": 61430 + }, + { + "epoch": 1.3675213675213675, + "grad_norm": 0.49318283796310425, + "learning_rate": 0.00022769754436000877, + "loss": 0.5898, + "step": 61440 + }, + { + "epoch": 1.3677439458689458, + "grad_norm": 0.6925389766693115, + "learning_rate": 0.00022765137061814153, + "loss": 0.5382, + "step": 61450 + }, + { + "epoch": 1.3679665242165242, + "grad_norm": 0.6416910886764526, + "learning_rate": 0.00022760519537367614, + "loss": 0.5307, + "step": 61460 + }, + { + "epoch": 1.3681891025641026, + "grad_norm": 0.47730687260627747, + "learning_rate": 0.00022755901862912188, + "loss": 0.5994, + "step": 61470 + }, + { + "epoch": 1.3684116809116809, + "grad_norm": 0.554335355758667, + "learning_rate": 0.0002275128403869879, + "loss": 0.6278, + "step": 61480 + }, + { + "epoch": 1.3686342592592593, + "grad_norm": 0.548578143119812, + "learning_rate": 0.00022746666064978377, + "loss": 0.5798, + "step": 61490 + }, + { + "epoch": 1.3688568376068377, + "grad_norm": 0.6649680733680725, + "learning_rate": 0.00022742047942001873, + "loss": 0.5958, + "step": 61500 + }, + { + "epoch": 1.369079415954416, + "grad_norm": 0.5594968199729919, + "learning_rate": 0.00022737429670020238, + "loss": 0.5051, + "step": 61510 + }, + { + "epoch": 1.3693019943019942, + "grad_norm": 0.5725864768028259, + "learning_rate": 0.00022732811249284436, + "loss": 0.4258, + "step": 61520 + }, + { + "epoch": 1.3695245726495726, + "grad_norm": 0.4459512233734131, + "learning_rate": 0.00022728192680045438, + "loss": 0.5485, + "step": 61530 + }, + { + "epoch": 1.369747150997151, + "grad_norm": 0.6141966581344604, + "learning_rate": 0.0002272357396255421, + "loss": 0.5744, + "step": 61540 + }, + { + "epoch": 1.3699697293447293, + "grad_norm": 0.5769743919372559, + "learning_rate": 0.00022718955097061745, + "loss": 0.576, + "step": 61550 + }, + { + "epoch": 1.3701923076923077, + "grad_norm": 0.4499601721763611, + "learning_rate": 0.00022714336083819037, + "loss": 0.5567, + "step": 61560 + }, + { + "epoch": 1.3704148860398861, + "grad_norm": 0.7231540679931641, + "learning_rate": 0.0002270971692307708, + "loss": 0.552, + "step": 61570 + }, + { + "epoch": 1.3706374643874644, + "grad_norm": 0.36756712198257446, + "learning_rate": 0.00022705097615086887, + "loss": 0.5, + "step": 61580 + }, + { + "epoch": 1.3708600427350428, + "grad_norm": 0.6529523730278015, + "learning_rate": 0.0002270047816009948, + "loss": 0.5632, + "step": 61590 + }, + { + "epoch": 1.371082621082621, + "grad_norm": 0.8186941742897034, + "learning_rate": 0.00022695858558365882, + "loss": 0.518, + "step": 61600 + }, + { + "epoch": 1.3713051994301995, + "grad_norm": 0.4617755115032196, + "learning_rate": 0.00022691238810137115, + "loss": 0.5366, + "step": 61610 + }, + { + "epoch": 1.3715277777777777, + "grad_norm": 0.4704979956150055, + "learning_rate": 0.00022686618915664227, + "loss": 0.5903, + "step": 61620 + }, + { + "epoch": 1.3717503561253561, + "grad_norm": 0.5129684209823608, + "learning_rate": 0.00022681998875198275, + "loss": 0.5632, + "step": 61630 + }, + { + "epoch": 1.3719729344729346, + "grad_norm": 0.9282556176185608, + "learning_rate": 0.00022677378688990306, + "loss": 0.6957, + "step": 61640 + }, + { + "epoch": 1.3721955128205128, + "grad_norm": 0.4365776479244232, + "learning_rate": 0.00022672758357291381, + "loss": 0.5194, + "step": 61650 + }, + { + "epoch": 1.3724180911680912, + "grad_norm": 0.5178688168525696, + "learning_rate": 0.00022668137880352585, + "loss": 0.6173, + "step": 61660 + }, + { + "epoch": 1.3726406695156697, + "grad_norm": 0.7604503631591797, + "learning_rate": 0.00022663517258424994, + "loss": 0.601, + "step": 61670 + }, + { + "epoch": 1.3728632478632479, + "grad_norm": 0.7271844148635864, + "learning_rate": 0.00022658896491759692, + "loss": 0.5686, + "step": 61680 + }, + { + "epoch": 1.373085826210826, + "grad_norm": 0.5016389489173889, + "learning_rate": 0.00022654275580607776, + "loss": 0.5662, + "step": 61690 + }, + { + "epoch": 1.3733084045584045, + "grad_norm": 0.8639649152755737, + "learning_rate": 0.00022649654525220357, + "loss": 0.4636, + "step": 61700 + }, + { + "epoch": 1.373530982905983, + "grad_norm": 0.9058513045310974, + "learning_rate": 0.00022645033325848534, + "loss": 0.6237, + "step": 61710 + }, + { + "epoch": 1.3737535612535612, + "grad_norm": 0.7580538988113403, + "learning_rate": 0.0002264041198274344, + "loss": 0.5563, + "step": 61720 + }, + { + "epoch": 1.3739761396011396, + "grad_norm": 0.4570119380950928, + "learning_rate": 0.00022635790496156201, + "loss": 0.6289, + "step": 61730 + }, + { + "epoch": 1.374198717948718, + "grad_norm": 0.8195516467094421, + "learning_rate": 0.00022631168866337945, + "loss": 0.6815, + "step": 61740 + }, + { + "epoch": 1.3744212962962963, + "grad_norm": 0.6854251027107239, + "learning_rate": 0.00022626547093539817, + "loss": 0.5385, + "step": 61750 + }, + { + "epoch": 1.3746438746438747, + "grad_norm": 0.6686112880706787, + "learning_rate": 0.00022621925178012967, + "loss": 0.549, + "step": 61760 + }, + { + "epoch": 1.374866452991453, + "grad_norm": 0.46365267038345337, + "learning_rate": 0.00022617303120008565, + "loss": 0.5998, + "step": 61770 + }, + { + "epoch": 1.3750890313390314, + "grad_norm": 0.5610596537590027, + "learning_rate": 0.00022612680919777766, + "loss": 0.473, + "step": 61780 + }, + { + "epoch": 1.3753116096866096, + "grad_norm": 0.6108236908912659, + "learning_rate": 0.00022608058577571743, + "loss": 0.4008, + "step": 61790 + }, + { + "epoch": 1.375534188034188, + "grad_norm": 0.6078330874443054, + "learning_rate": 0.00022603436093641683, + "loss": 0.6347, + "step": 61800 + }, + { + "epoch": 1.3757567663817665, + "grad_norm": 0.6715290546417236, + "learning_rate": 0.00022598813468238782, + "loss": 0.4849, + "step": 61810 + }, + { + "epoch": 1.3759793447293447, + "grad_norm": 0.6614882946014404, + "learning_rate": 0.0002259419070161422, + "loss": 0.5161, + "step": 61820 + }, + { + "epoch": 1.3762019230769231, + "grad_norm": 0.3726741373538971, + "learning_rate": 0.00022589567794019212, + "loss": 0.5588, + "step": 61830 + }, + { + "epoch": 1.3764245014245013, + "grad_norm": 0.7765806913375854, + "learning_rate": 0.00022584944745704974, + "loss": 0.5807, + "step": 61840 + }, + { + "epoch": 1.3766470797720798, + "grad_norm": 0.8264938592910767, + "learning_rate": 0.00022580321556922722, + "loss": 0.624, + "step": 61850 + }, + { + "epoch": 1.376869658119658, + "grad_norm": 0.41498658061027527, + "learning_rate": 0.00022575698227923687, + "loss": 0.5096, + "step": 61860 + }, + { + "epoch": 1.3770922364672364, + "grad_norm": 0.555334210395813, + "learning_rate": 0.00022571074758959093, + "loss": 0.5419, + "step": 61870 + }, + { + "epoch": 1.3773148148148149, + "grad_norm": 0.6658076643943787, + "learning_rate": 0.00022566451150280204, + "loss": 0.5898, + "step": 61880 + }, + { + "epoch": 1.377537393162393, + "grad_norm": 0.7786030769348145, + "learning_rate": 0.00022561827402138248, + "loss": 0.6097, + "step": 61890 + }, + { + "epoch": 1.3777599715099715, + "grad_norm": 0.5594373345375061, + "learning_rate": 0.00022557203514784498, + "loss": 0.5618, + "step": 61900 + }, + { + "epoch": 1.37798254985755, + "grad_norm": 0.4955253601074219, + "learning_rate": 0.0002255257948847022, + "loss": 0.43, + "step": 61910 + }, + { + "epoch": 1.3782051282051282, + "grad_norm": 0.5867207646369934, + "learning_rate": 0.00022547955323446673, + "loss": 0.6679, + "step": 61920 + }, + { + "epoch": 1.3784277065527066, + "grad_norm": 0.7317691445350647, + "learning_rate": 0.00022543331019965154, + "loss": 0.4994, + "step": 61930 + }, + { + "epoch": 1.3786502849002849, + "grad_norm": 0.5208408832550049, + "learning_rate": 0.0002253870657827694, + "loss": 0.6063, + "step": 61940 + }, + { + "epoch": 1.3788728632478633, + "grad_norm": 0.7552675008773804, + "learning_rate": 0.00022534081998633343, + "loss": 0.6639, + "step": 61950 + }, + { + "epoch": 1.3790954415954415, + "grad_norm": 0.5934411287307739, + "learning_rate": 0.00022529457281285646, + "loss": 0.6692, + "step": 61960 + }, + { + "epoch": 1.37931801994302, + "grad_norm": 0.7339460253715515, + "learning_rate": 0.00022524832426485173, + "loss": 0.6032, + "step": 61970 + }, + { + "epoch": 1.3795405982905984, + "grad_norm": 0.7074295282363892, + "learning_rate": 0.00022520207434483238, + "loss": 0.5184, + "step": 61980 + }, + { + "epoch": 1.3797631766381766, + "grad_norm": 0.42233505845069885, + "learning_rate": 0.00022515582305531173, + "loss": 0.5433, + "step": 61990 + }, + { + "epoch": 1.379985754985755, + "grad_norm": 0.46188631653785706, + "learning_rate": 0.000225109570398803, + "loss": 0.5613, + "step": 62000 + }, + { + "epoch": 1.3801638176638176, + "eval_loss": 0.576337993144989, + "eval_runtime": 337.4259, + "eval_samples_per_second": 7.009, + "eval_steps_per_second": 7.009, + "step": 62008 + }, + { + "epoch": 1.3802083333333333, + "grad_norm": 0.5289804339408875, + "learning_rate": 0.00022506331637781965, + "loss": 0.6027, + "step": 62010 + }, + { + "epoch": 1.3804309116809117, + "grad_norm": 0.3944786787033081, + "learning_rate": 0.00022501706099487524, + "loss": 0.5538, + "step": 62020 + }, + { + "epoch": 1.38065349002849, + "grad_norm": 0.6000606417655945, + "learning_rate": 0.00022497080425248317, + "loss": 0.7033, + "step": 62030 + }, + { + "epoch": 1.3808760683760684, + "grad_norm": 0.5767252445220947, + "learning_rate": 0.0002249245461531572, + "loss": 0.5677, + "step": 62040 + }, + { + "epoch": 1.3810986467236468, + "grad_norm": 0.5308343768119812, + "learning_rate": 0.00022487828669941104, + "loss": 0.582, + "step": 62050 + }, + { + "epoch": 1.381321225071225, + "grad_norm": 0.540984034538269, + "learning_rate": 0.00022483202589375836, + "loss": 0.5718, + "step": 62060 + }, + { + "epoch": 1.3815438034188035, + "grad_norm": 0.6492820382118225, + "learning_rate": 0.00022478576373871306, + "loss": 0.5682, + "step": 62070 + }, + { + "epoch": 1.381766381766382, + "grad_norm": 0.5033608078956604, + "learning_rate": 0.00022473950023678903, + "loss": 0.5774, + "step": 62080 + }, + { + "epoch": 1.38198896011396, + "grad_norm": 0.7015234231948853, + "learning_rate": 0.00022469323539050037, + "loss": 0.6103, + "step": 62090 + }, + { + "epoch": 1.3822115384615383, + "grad_norm": 0.605975866317749, + "learning_rate": 0.00022464696920236103, + "loss": 0.502, + "step": 62100 + }, + { + "epoch": 1.3824341168091168, + "grad_norm": 0.39481988549232483, + "learning_rate": 0.00022460070167488523, + "loss": 0.4986, + "step": 62110 + }, + { + "epoch": 1.3826566951566952, + "grad_norm": 0.6863367557525635, + "learning_rate": 0.00022455443281058722, + "loss": 0.6544, + "step": 62120 + }, + { + "epoch": 1.3828792735042734, + "grad_norm": 0.6480594873428345, + "learning_rate": 0.0002245081626119812, + "loss": 0.6128, + "step": 62130 + }, + { + "epoch": 1.3831018518518519, + "grad_norm": 0.5644036531448364, + "learning_rate": 0.00022446189108158148, + "loss": 0.4863, + "step": 62140 + }, + { + "epoch": 1.3833244301994303, + "grad_norm": 0.7015679478645325, + "learning_rate": 0.00022441561822190265, + "loss": 0.5432, + "step": 62150 + }, + { + "epoch": 1.3835470085470085, + "grad_norm": 0.559040904045105, + "learning_rate": 0.00022436934403545914, + "loss": 0.4822, + "step": 62160 + }, + { + "epoch": 1.383769586894587, + "grad_norm": 0.5581338405609131, + "learning_rate": 0.00022432306852476547, + "loss": 0.602, + "step": 62170 + }, + { + "epoch": 1.3839921652421652, + "grad_norm": 0.4883200228214264, + "learning_rate": 0.00022427679169233637, + "loss": 0.5778, + "step": 62180 + }, + { + "epoch": 1.3842147435897436, + "grad_norm": 0.912431001663208, + "learning_rate": 0.00022423051354068658, + "loss": 0.5926, + "step": 62190 + }, + { + "epoch": 1.3844373219373218, + "grad_norm": 0.54873126745224, + "learning_rate": 0.00022418423407233085, + "loss": 0.4685, + "step": 62200 + }, + { + "epoch": 1.3846599002849003, + "grad_norm": 0.4148489832878113, + "learning_rate": 0.000224137953289784, + "loss": 0.5606, + "step": 62210 + }, + { + "epoch": 1.3848824786324787, + "grad_norm": 0.6492806673049927, + "learning_rate": 0.000224091671195561, + "loss": 0.4948, + "step": 62220 + }, + { + "epoch": 1.385105056980057, + "grad_norm": 0.782181978225708, + "learning_rate": 0.00022404538779217687, + "loss": 0.5751, + "step": 62230 + }, + { + "epoch": 1.3853276353276354, + "grad_norm": 0.47252678871154785, + "learning_rate": 0.00022399910308214672, + "loss": 0.4825, + "step": 62240 + }, + { + "epoch": 1.3855502136752138, + "grad_norm": 0.6602449417114258, + "learning_rate": 0.00022395281706798562, + "loss": 0.6602, + "step": 62250 + }, + { + "epoch": 1.385772792022792, + "grad_norm": 0.538071870803833, + "learning_rate": 0.00022390652975220885, + "loss": 0.4552, + "step": 62260 + }, + { + "epoch": 1.3859953703703702, + "grad_norm": 0.7834280729293823, + "learning_rate": 0.00022386024113733172, + "loss": 0.5724, + "step": 62270 + }, + { + "epoch": 1.3862179487179487, + "grad_norm": 0.7608660459518433, + "learning_rate": 0.00022381395122586945, + "loss": 0.6567, + "step": 62280 + }, + { + "epoch": 1.3864405270655271, + "grad_norm": 0.7100602388381958, + "learning_rate": 0.00022376766002033763, + "loss": 0.626, + "step": 62290 + }, + { + "epoch": 1.3866631054131053, + "grad_norm": 0.4596807062625885, + "learning_rate": 0.00022372136752325176, + "loss": 0.6701, + "step": 62300 + }, + { + "epoch": 1.3868856837606838, + "grad_norm": 0.5603671669960022, + "learning_rate": 0.00022367507373712727, + "loss": 0.5665, + "step": 62310 + }, + { + "epoch": 1.3871082621082622, + "grad_norm": 0.5945459008216858, + "learning_rate": 0.00022362877866448, + "loss": 0.5837, + "step": 62320 + }, + { + "epoch": 1.3873308404558404, + "grad_norm": 0.688590943813324, + "learning_rate": 0.00022358248230782546, + "loss": 0.5475, + "step": 62330 + }, + { + "epoch": 1.3875534188034189, + "grad_norm": 0.41545212268829346, + "learning_rate": 0.00022353618466967957, + "loss": 0.559, + "step": 62340 + }, + { + "epoch": 1.387775997150997, + "grad_norm": 0.4486536979675293, + "learning_rate": 0.0002234898857525581, + "loss": 0.4332, + "step": 62350 + }, + { + "epoch": 1.3879985754985755, + "grad_norm": 0.4931320250034332, + "learning_rate": 0.00022344358555897702, + "loss": 0.5444, + "step": 62360 + }, + { + "epoch": 1.3882211538461537, + "grad_norm": 0.6781936287879944, + "learning_rate": 0.00022339728409145236, + "loss": 0.4987, + "step": 62370 + }, + { + "epoch": 1.3884437321937322, + "grad_norm": 0.5794190764427185, + "learning_rate": 0.00022335098135250006, + "loss": 0.5242, + "step": 62380 + }, + { + "epoch": 1.3886663105413106, + "grad_norm": 0.6140381097793579, + "learning_rate": 0.00022330467734463637, + "loss": 0.5791, + "step": 62390 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.8430309891700745, + "learning_rate": 0.0002232583720703774, + "loss": 0.5393, + "step": 62400 + }, + { + "epoch": 1.3891114672364673, + "grad_norm": 0.3692317605018616, + "learning_rate": 0.00022321206553223947, + "loss": 0.3972, + "step": 62410 + }, + { + "epoch": 1.3893340455840457, + "grad_norm": 0.6154002547264099, + "learning_rate": 0.00022316575773273888, + "loss": 0.5825, + "step": 62420 + }, + { + "epoch": 1.389556623931624, + "grad_norm": 0.62913978099823, + "learning_rate": 0.00022311944867439208, + "loss": 0.596, + "step": 62430 + }, + { + "epoch": 1.3897792022792022, + "grad_norm": 0.8898218870162964, + "learning_rate": 0.00022307313835971551, + "loss": 0.6229, + "step": 62440 + }, + { + "epoch": 1.3900017806267806, + "grad_norm": 0.7014033794403076, + "learning_rate": 0.0002230268267912257, + "loss": 0.6822, + "step": 62450 + }, + { + "epoch": 1.390224358974359, + "grad_norm": 0.7933834195137024, + "learning_rate": 0.00022298051397143928, + "loss": 0.6245, + "step": 62460 + }, + { + "epoch": 1.3904469373219372, + "grad_norm": 0.7982556223869324, + "learning_rate": 0.0002229341999028729, + "loss": 0.6596, + "step": 62470 + }, + { + "epoch": 1.3906695156695157, + "grad_norm": 0.7251663208007812, + "learning_rate": 0.00022288788458804334, + "loss": 0.6613, + "step": 62480 + }, + { + "epoch": 1.3908920940170941, + "grad_norm": 0.6037178635597229, + "learning_rate": 0.00022284156802946737, + "loss": 0.6531, + "step": 62490 + }, + { + "epoch": 1.3911146723646723, + "grad_norm": 0.6063000559806824, + "learning_rate": 0.00022279525022966189, + "loss": 0.4803, + "step": 62500 + }, + { + "epoch": 1.3913372507122508, + "grad_norm": 0.5215537548065186, + "learning_rate": 0.00022274893119114388, + "loss": 0.5963, + "step": 62510 + }, + { + "epoch": 1.391559829059829, + "grad_norm": 0.7405494451522827, + "learning_rate": 0.00022270261091643034, + "loss": 0.5951, + "step": 62520 + }, + { + "epoch": 1.3917824074074074, + "grad_norm": 0.7568892240524292, + "learning_rate": 0.0002226562894080383, + "loss": 0.4655, + "step": 62530 + }, + { + "epoch": 1.3920049857549857, + "grad_norm": 0.8880526423454285, + "learning_rate": 0.00022260996666848497, + "loss": 0.7519, + "step": 62540 + }, + { + "epoch": 1.392227564102564, + "grad_norm": 0.4250199794769287, + "learning_rate": 0.00022256364270028752, + "loss": 0.5116, + "step": 62550 + }, + { + "epoch": 1.3924501424501425, + "grad_norm": 0.38995271921157837, + "learning_rate": 0.00022251731750596326, + "loss": 0.5977, + "step": 62560 + }, + { + "epoch": 1.3926727207977208, + "grad_norm": 0.7418897151947021, + "learning_rate": 0.00022247099108802952, + "loss": 0.468, + "step": 62570 + }, + { + "epoch": 1.3928952991452992, + "grad_norm": 0.5622426867485046, + "learning_rate": 0.00022242466344900383, + "loss": 0.4863, + "step": 62580 + }, + { + "epoch": 1.3931178774928774, + "grad_norm": 0.3932307958602905, + "learning_rate": 0.00022237833459140346, + "loss": 0.6142, + "step": 62590 + }, + { + "epoch": 1.3933404558404558, + "grad_norm": 0.6206909418106079, + "learning_rate": 0.00022233200451774607, + "loss": 0.5501, + "step": 62600 + }, + { + "epoch": 1.393563034188034, + "grad_norm": 0.6179700493812561, + "learning_rate": 0.00022228567323054934, + "loss": 0.4821, + "step": 62610 + }, + { + "epoch": 1.3937856125356125, + "grad_norm": 0.5563085675239563, + "learning_rate": 0.0002222393407323308, + "loss": 0.5044, + "step": 62620 + }, + { + "epoch": 1.394008190883191, + "grad_norm": 0.6906719207763672, + "learning_rate": 0.00022219300702560832, + "loss": 0.4655, + "step": 62630 + }, + { + "epoch": 1.3942307692307692, + "grad_norm": 0.7658056020736694, + "learning_rate": 0.00022214667211289965, + "loss": 0.6384, + "step": 62640 + }, + { + "epoch": 1.3944533475783476, + "grad_norm": 0.5997521877288818, + "learning_rate": 0.00022210033599672277, + "loss": 0.5264, + "step": 62650 + }, + { + "epoch": 1.394675925925926, + "grad_norm": 0.6423810124397278, + "learning_rate": 0.00022205399867959545, + "loss": 0.6993, + "step": 62660 + }, + { + "epoch": 1.3948985042735043, + "grad_norm": 0.8208138346672058, + "learning_rate": 0.00022200766016403577, + "loss": 0.5452, + "step": 62670 + }, + { + "epoch": 1.3951210826210827, + "grad_norm": 0.5556749105453491, + "learning_rate": 0.0002219613204525619, + "loss": 0.5979, + "step": 62680 + }, + { + "epoch": 1.395343660968661, + "grad_norm": 0.5077260732650757, + "learning_rate": 0.0002219149795476919, + "loss": 0.4665, + "step": 62690 + }, + { + "epoch": 1.3955662393162394, + "grad_norm": 0.6669285893440247, + "learning_rate": 0.00022186863745194392, + "loss": 0.6145, + "step": 62700 + }, + { + "epoch": 1.3957888176638176, + "grad_norm": 0.7747873663902283, + "learning_rate": 0.0002218222941678363, + "loss": 0.4928, + "step": 62710 + }, + { + "epoch": 1.396011396011396, + "grad_norm": 0.642152726650238, + "learning_rate": 0.00022177594969788732, + "loss": 0.5777, + "step": 62720 + }, + { + "epoch": 1.3962339743589745, + "grad_norm": 0.5894016027450562, + "learning_rate": 0.00022172960404461542, + "loss": 0.6064, + "step": 62730 + }, + { + "epoch": 1.3964565527065527, + "grad_norm": 0.6865715980529785, + "learning_rate": 0.00022168325721053908, + "loss": 0.5555, + "step": 62740 + }, + { + "epoch": 1.396679131054131, + "grad_norm": 0.6753658652305603, + "learning_rate": 0.00022163690919817678, + "loss": 0.5101, + "step": 62750 + }, + { + "epoch": 1.3969017094017093, + "grad_norm": 0.4573817551136017, + "learning_rate": 0.0002215905600100471, + "loss": 0.4022, + "step": 62760 + }, + { + "epoch": 1.3971242877492878, + "grad_norm": 0.5543593764305115, + "learning_rate": 0.0002215442096486687, + "loss": 0.5375, + "step": 62770 + }, + { + "epoch": 1.397346866096866, + "grad_norm": 0.6529911160469055, + "learning_rate": 0.00022149785811656036, + "loss": 0.5514, + "step": 62780 + }, + { + "epoch": 1.3975694444444444, + "grad_norm": 0.5409330725669861, + "learning_rate": 0.0002214515054162408, + "loss": 0.5565, + "step": 62790 + }, + { + "epoch": 1.3977920227920229, + "grad_norm": 0.4280526638031006, + "learning_rate": 0.00022140515155022884, + "loss": 0.5782, + "step": 62800 + }, + { + "epoch": 1.398014601139601, + "grad_norm": 0.6719635725021362, + "learning_rate": 0.00022135879652104344, + "loss": 0.5799, + "step": 62810 + }, + { + "epoch": 1.3982371794871795, + "grad_norm": 0.4967477023601532, + "learning_rate": 0.00022131244033120359, + "loss": 0.5666, + "step": 62820 + }, + { + "epoch": 1.398459757834758, + "grad_norm": 0.6547058820724487, + "learning_rate": 0.00022126608298322822, + "loss": 0.6124, + "step": 62830 + }, + { + "epoch": 1.3986823361823362, + "grad_norm": 0.8326950073242188, + "learning_rate": 0.00022121972447963653, + "loss": 0.6077, + "step": 62840 + }, + { + "epoch": 1.3989049145299146, + "grad_norm": 0.8579636812210083, + "learning_rate": 0.00022117336482294767, + "loss": 0.6663, + "step": 62850 + }, + { + "epoch": 1.3991274928774928, + "grad_norm": 0.6645912528038025, + "learning_rate": 0.0002211270040156808, + "loss": 0.5493, + "step": 62860 + }, + { + "epoch": 1.3993500712250713, + "grad_norm": 0.5138288736343384, + "learning_rate": 0.0002210806420603552, + "loss": 0.52, + "step": 62870 + }, + { + "epoch": 1.3995726495726495, + "grad_norm": 0.6342707276344299, + "learning_rate": 0.00022103427895949027, + "loss": 0.6662, + "step": 62880 + }, + { + "epoch": 1.399795227920228, + "grad_norm": 0.6028152108192444, + "learning_rate": 0.00022098791471560544, + "loss": 0.4837, + "step": 62890 + }, + { + "epoch": 1.4000178062678064, + "grad_norm": 0.4323852062225342, + "learning_rate": 0.00022094154933122008, + "loss": 0.4636, + "step": 62900 + }, + { + "epoch": 1.4002403846153846, + "grad_norm": 0.6630343198776245, + "learning_rate": 0.00022089518280885386, + "loss": 0.6073, + "step": 62910 + }, + { + "epoch": 1.400462962962963, + "grad_norm": 0.5092897415161133, + "learning_rate": 0.00022084881515102627, + "loss": 0.5255, + "step": 62920 + }, + { + "epoch": 1.4006855413105412, + "grad_norm": 0.5960636138916016, + "learning_rate": 0.00022080244636025703, + "loss": 0.591, + "step": 62930 + }, + { + "epoch": 1.4009081196581197, + "grad_norm": 0.46302658319473267, + "learning_rate": 0.00022075607643906576, + "loss": 0.5215, + "step": 62940 + }, + { + "epoch": 1.401130698005698, + "grad_norm": 0.6542785167694092, + "learning_rate": 0.00022070970538997232, + "loss": 0.639, + "step": 62950 + }, + { + "epoch": 1.4013532763532763, + "grad_norm": 0.4922322928905487, + "learning_rate": 0.00022066333321549655, + "loss": 0.6914, + "step": 62960 + }, + { + "epoch": 1.4015758547008548, + "grad_norm": 0.6241610050201416, + "learning_rate": 0.0002206169599181583, + "loss": 0.574, + "step": 62970 + }, + { + "epoch": 1.401798433048433, + "grad_norm": 0.7919242978096008, + "learning_rate": 0.00022057058550047765, + "loss": 0.5021, + "step": 62980 + }, + { + "epoch": 1.4020210113960114, + "grad_norm": 0.4039008617401123, + "learning_rate": 0.00022052420996497446, + "loss": 0.5266, + "step": 62990 + }, + { + "epoch": 1.4022435897435899, + "grad_norm": 0.7754238247871399, + "learning_rate": 0.000220477833314169, + "loss": 0.526, + "step": 63000 + }, + { + "epoch": 1.402466168091168, + "grad_norm": 0.9635992646217346, + "learning_rate": 0.00022043145555058122, + "loss": 0.5949, + "step": 63010 + }, + { + "epoch": 1.4026887464387463, + "grad_norm": 0.6123163104057312, + "learning_rate": 0.00022038507667673142, + "loss": 0.6486, + "step": 63020 + }, + { + "epoch": 1.4029113247863247, + "grad_norm": 0.6298154592514038, + "learning_rate": 0.00022033869669513996, + "loss": 0.5848, + "step": 63030 + }, + { + "epoch": 1.4031339031339032, + "grad_norm": 0.6101586222648621, + "learning_rate": 0.00022029231560832701, + "loss": 0.4707, + "step": 63040 + }, + { + "epoch": 1.4033564814814814, + "grad_norm": 0.6360435485839844, + "learning_rate": 0.000220245933418813, + "loss": 0.4835, + "step": 63050 + }, + { + "epoch": 1.4035790598290598, + "grad_norm": 0.4704650044441223, + "learning_rate": 0.0002201995501291184, + "loss": 0.4642, + "step": 63060 + }, + { + "epoch": 1.4038016381766383, + "grad_norm": 0.383497953414917, + "learning_rate": 0.00022015316574176374, + "loss": 0.5225, + "step": 63070 + }, + { + "epoch": 1.4040242165242165, + "grad_norm": 0.6637696623802185, + "learning_rate": 0.00022010678025926952, + "loss": 0.4968, + "step": 63080 + }, + { + "epoch": 1.404246794871795, + "grad_norm": 0.5835416913032532, + "learning_rate": 0.0002200603936841564, + "loss": 0.5077, + "step": 63090 + }, + { + "epoch": 1.4044693732193732, + "grad_norm": 0.3635801076889038, + "learning_rate": 0.00022001400601894512, + "loss": 0.6125, + "step": 63100 + }, + { + "epoch": 1.4046919515669516, + "grad_norm": 0.8239157795906067, + "learning_rate": 0.00021996761726615632, + "loss": 0.7048, + "step": 63110 + }, + { + "epoch": 1.4049145299145298, + "grad_norm": 0.3798117935657501, + "learning_rate": 0.00021992122742831084, + "loss": 0.5571, + "step": 63120 + }, + { + "epoch": 1.4051371082621082, + "grad_norm": 0.634911298751831, + "learning_rate": 0.00021987483650792955, + "loss": 0.5474, + "step": 63130 + }, + { + "epoch": 1.4053596866096867, + "grad_norm": 0.764754056930542, + "learning_rate": 0.00021982844450753347, + "loss": 0.6282, + "step": 63140 + }, + { + "epoch": 1.405582264957265, + "grad_norm": 0.8541821241378784, + "learning_rate": 0.00021978205142964336, + "loss": 0.5572, + "step": 63150 + }, + { + "epoch": 1.4058048433048433, + "grad_norm": 0.4713304042816162, + "learning_rate": 0.0002197356572767804, + "loss": 0.5569, + "step": 63160 + }, + { + "epoch": 1.4060274216524218, + "grad_norm": 0.6356391906738281, + "learning_rate": 0.00021968926205146575, + "loss": 0.5644, + "step": 63170 + }, + { + "epoch": 1.40625, + "grad_norm": 0.48272061347961426, + "learning_rate": 0.00021964286575622044, + "loss": 0.6363, + "step": 63180 + }, + { + "epoch": 1.4064725783475782, + "grad_norm": 0.9424344897270203, + "learning_rate": 0.0002195964683935657, + "loss": 0.7512, + "step": 63190 + }, + { + "epoch": 1.4066951566951567, + "grad_norm": 0.7565159797668457, + "learning_rate": 0.0002195500699660228, + "loss": 0.6125, + "step": 63200 + }, + { + "epoch": 1.406917735042735, + "grad_norm": 0.8030708432197571, + "learning_rate": 0.00021950367047611318, + "loss": 0.6668, + "step": 63210 + }, + { + "epoch": 1.4071403133903133, + "grad_norm": 0.46443840861320496, + "learning_rate": 0.0002194572699263581, + "loss": 0.5308, + "step": 63220 + }, + { + "epoch": 1.4073628917378918, + "grad_norm": 0.5370081067085266, + "learning_rate": 0.000219410868319279, + "loss": 0.5857, + "step": 63230 + }, + { + "epoch": 1.4075854700854702, + "grad_norm": 1.0536185503005981, + "learning_rate": 0.00021936446565739748, + "loss": 0.5349, + "step": 63240 + }, + { + "epoch": 1.4078080484330484, + "grad_norm": 0.4503636956214905, + "learning_rate": 0.00021931806194323508, + "loss": 0.5168, + "step": 63250 + }, + { + "epoch": 1.4080306267806268, + "grad_norm": 0.5147433280944824, + "learning_rate": 0.0002192716571793133, + "loss": 0.6778, + "step": 63260 + }, + { + "epoch": 1.408253205128205, + "grad_norm": 0.5514103174209595, + "learning_rate": 0.00021922525136815391, + "loss": 0.5253, + "step": 63270 + }, + { + "epoch": 1.4084757834757835, + "grad_norm": 0.6138553023338318, + "learning_rate": 0.00021917884451227865, + "loss": 0.5666, + "step": 63280 + }, + { + "epoch": 1.4086983618233617, + "grad_norm": 0.6634931564331055, + "learning_rate": 0.00021913243661420923, + "loss": 0.6384, + "step": 63290 + }, + { + "epoch": 1.4089209401709402, + "grad_norm": 0.8572415709495544, + "learning_rate": 0.00021908602767646755, + "loss": 0.5278, + "step": 63300 + }, + { + "epoch": 1.4091435185185186, + "grad_norm": 0.7475055456161499, + "learning_rate": 0.00021903961770157557, + "loss": 0.4888, + "step": 63310 + }, + { + "epoch": 1.4093660968660968, + "grad_norm": 0.634645402431488, + "learning_rate": 0.0002189932066920551, + "loss": 0.675, + "step": 63320 + }, + { + "epoch": 1.4095886752136753, + "grad_norm": 0.6861944794654846, + "learning_rate": 0.00021894679465042828, + "loss": 0.6083, + "step": 63330 + }, + { + "epoch": 1.4098112535612537, + "grad_norm": 0.5424638986587524, + "learning_rate": 0.00021890038157921707, + "loss": 0.5013, + "step": 63340 + }, + { + "epoch": 1.410033831908832, + "grad_norm": 0.7189966440200806, + "learning_rate": 0.00021885396748094372, + "loss": 0.6209, + "step": 63350 + }, + { + "epoch": 1.4102564102564101, + "grad_norm": 0.5611239075660706, + "learning_rate": 0.00021880755235813033, + "loss": 0.5145, + "step": 63360 + }, + { + "epoch": 1.4104789886039886, + "grad_norm": 0.6758479475975037, + "learning_rate": 0.00021876113621329912, + "loss": 0.7104, + "step": 63370 + }, + { + "epoch": 1.410701566951567, + "grad_norm": 0.5141634345054626, + "learning_rate": 0.00021871471904897242, + "loss": 0.4923, + "step": 63380 + }, + { + "epoch": 1.4109241452991452, + "grad_norm": 0.5466700792312622, + "learning_rate": 0.00021866830086767261, + "loss": 0.6644, + "step": 63390 + }, + { + "epoch": 1.4111467236467237, + "grad_norm": 0.4376499056816101, + "learning_rate": 0.00021862188167192197, + "loss": 0.55, + "step": 63400 + }, + { + "epoch": 1.411369301994302, + "grad_norm": 0.5034909248352051, + "learning_rate": 0.00021857546146424305, + "loss": 0.5723, + "step": 63410 + }, + { + "epoch": 1.4115918803418803, + "grad_norm": 0.656143307685852, + "learning_rate": 0.00021852904024715838, + "loss": 0.4818, + "step": 63420 + }, + { + "epoch": 1.4118144586894588, + "grad_norm": 0.5316798090934753, + "learning_rate": 0.00021848261802319047, + "loss": 0.632, + "step": 63430 + }, + { + "epoch": 1.412037037037037, + "grad_norm": 0.800273060798645, + "learning_rate": 0.000218436194794862, + "loss": 0.6068, + "step": 63440 + }, + { + "epoch": 1.4122596153846154, + "grad_norm": 0.6114327907562256, + "learning_rate": 0.00021838977056469556, + "loss": 0.5579, + "step": 63450 + }, + { + "epoch": 1.4124821937321936, + "grad_norm": 0.5441319346427917, + "learning_rate": 0.00021834334533521398, + "loss": 0.5263, + "step": 63460 + }, + { + "epoch": 1.412704772079772, + "grad_norm": 0.738415002822876, + "learning_rate": 0.00021829691910893998, + "loss": 0.4855, + "step": 63470 + }, + { + "epoch": 1.4129273504273505, + "grad_norm": 0.49261611700057983, + "learning_rate": 0.00021825049188839643, + "loss": 0.5376, + "step": 63480 + }, + { + "epoch": 1.4131499287749287, + "grad_norm": 0.5762203931808472, + "learning_rate": 0.0002182040636761062, + "loss": 0.527, + "step": 63490 + }, + { + "epoch": 1.4133725071225072, + "grad_norm": 0.46112143993377686, + "learning_rate": 0.00021815763447459227, + "loss": 0.5495, + "step": 63500 + }, + { + "epoch": 1.4135950854700854, + "grad_norm": 0.5711562037467957, + "learning_rate": 0.00021811120428637758, + "loss": 0.5324, + "step": 63510 + }, + { + "epoch": 1.4138176638176638, + "grad_norm": 0.7803882956504822, + "learning_rate": 0.00021806477311398523, + "loss": 0.5632, + "step": 63520 + }, + { + "epoch": 1.414040242165242, + "grad_norm": 0.7463172674179077, + "learning_rate": 0.00021801834095993834, + "loss": 0.5532, + "step": 63530 + }, + { + "epoch": 1.4142628205128205, + "grad_norm": 0.7782133221626282, + "learning_rate": 0.00021797190782676005, + "loss": 0.4673, + "step": 63540 + }, + { + "epoch": 1.414485398860399, + "grad_norm": 0.4096747934818268, + "learning_rate": 0.00021792547371697355, + "loss": 0.6458, + "step": 63550 + }, + { + "epoch": 1.4147079772079771, + "grad_norm": 0.7637385129928589, + "learning_rate": 0.00021787903863310218, + "loss": 0.5388, + "step": 63560 + }, + { + "epoch": 1.4149305555555556, + "grad_norm": 0.7087598443031311, + "learning_rate": 0.00021783260257766918, + "loss": 0.5184, + "step": 63570 + }, + { + "epoch": 1.415153133903134, + "grad_norm": 0.6480220556259155, + "learning_rate": 0.00021778616555319795, + "loss": 0.5198, + "step": 63580 + }, + { + "epoch": 1.4153757122507122, + "grad_norm": 0.8158738017082214, + "learning_rate": 0.0002177397275622119, + "loss": 0.5879, + "step": 63590 + }, + { + "epoch": 1.4155982905982907, + "grad_norm": 0.5854315757751465, + "learning_rate": 0.0002176932886072346, + "loss": 0.6099, + "step": 63600 + }, + { + "epoch": 1.415820868945869, + "grad_norm": 0.6004104018211365, + "learning_rate": 0.00021764684869078944, + "loss": 0.5783, + "step": 63610 + }, + { + "epoch": 1.4160434472934473, + "grad_norm": 0.5777664184570312, + "learning_rate": 0.0002176004078154001, + "loss": 0.604, + "step": 63620 + }, + { + "epoch": 1.4162660256410255, + "grad_norm": 0.40372851490974426, + "learning_rate": 0.00021755396598359026, + "loss": 0.5008, + "step": 63630 + }, + { + "epoch": 1.416488603988604, + "grad_norm": 0.8670433163642883, + "learning_rate": 0.0002175075231978835, + "loss": 0.6425, + "step": 63640 + }, + { + "epoch": 1.4167111823361824, + "grad_norm": 0.9667596817016602, + "learning_rate": 0.00021746107946080353, + "loss": 0.5284, + "step": 63650 + }, + { + "epoch": 1.4169337606837606, + "grad_norm": 0.5009735822677612, + "learning_rate": 0.00021741463477487427, + "loss": 0.494, + "step": 63660 + }, + { + "epoch": 1.417156339031339, + "grad_norm": 0.6341816186904907, + "learning_rate": 0.00021736818914261955, + "loss": 0.5357, + "step": 63670 + }, + { + "epoch": 1.4173789173789173, + "grad_norm": 0.6542185544967651, + "learning_rate": 0.0002173217425665631, + "loss": 0.6142, + "step": 63680 + }, + { + "epoch": 1.4176014957264957, + "grad_norm": 0.4323403835296631, + "learning_rate": 0.00021727529504922904, + "loss": 0.568, + "step": 63690 + }, + { + "epoch": 1.417824074074074, + "grad_norm": 0.7435312867164612, + "learning_rate": 0.00021722884659314132, + "loss": 0.4743, + "step": 63700 + }, + { + "epoch": 1.4180466524216524, + "grad_norm": 0.6980502605438232, + "learning_rate": 0.00021718239720082397, + "loss": 0.5454, + "step": 63710 + }, + { + "epoch": 1.4182692307692308, + "grad_norm": 0.9360513687133789, + "learning_rate": 0.000217135946874801, + "loss": 0.4378, + "step": 63720 + }, + { + "epoch": 1.418491809116809, + "grad_norm": 0.6106828451156616, + "learning_rate": 0.0002170894956175967, + "loss": 0.518, + "step": 63730 + }, + { + "epoch": 1.4187143874643875, + "grad_norm": 0.5087167024612427, + "learning_rate": 0.0002170430434317353, + "loss": 0.499, + "step": 63740 + }, + { + "epoch": 1.418936965811966, + "grad_norm": 0.7681663632392883, + "learning_rate": 0.00021699659031974088, + "loss": 0.5969, + "step": 63750 + }, + { + "epoch": 1.4191595441595442, + "grad_norm": 0.5107528567314148, + "learning_rate": 0.00021695013628413788, + "loss": 0.446, + "step": 63760 + }, + { + "epoch": 1.4193821225071226, + "grad_norm": 0.6044454574584961, + "learning_rate": 0.00021690368132745055, + "loss": 0.5466, + "step": 63770 + }, + { + "epoch": 1.4196047008547008, + "grad_norm": 0.5367859601974487, + "learning_rate": 0.00021685722545220336, + "loss": 0.6068, + "step": 63780 + }, + { + "epoch": 1.4198272792022792, + "grad_norm": 0.6570670008659363, + "learning_rate": 0.00021681076866092073, + "loss": 0.6939, + "step": 63790 + }, + { + "epoch": 1.4200498575498575, + "grad_norm": 0.5233334302902222, + "learning_rate": 0.00021676431095612718, + "loss": 0.5027, + "step": 63800 + }, + { + "epoch": 1.420272435897436, + "grad_norm": 0.48035338521003723, + "learning_rate": 0.00021671785234034726, + "loss": 0.524, + "step": 63810 + }, + { + "epoch": 1.4204950142450143, + "grad_norm": 0.7729887962341309, + "learning_rate": 0.0002166713928161055, + "loss": 0.6281, + "step": 63820 + }, + { + "epoch": 1.4207175925925926, + "grad_norm": 0.9266574382781982, + "learning_rate": 0.00021662493238592666, + "loss": 0.5339, + "step": 63830 + }, + { + "epoch": 1.420940170940171, + "grad_norm": 0.6719033122062683, + "learning_rate": 0.0002165784710523354, + "loss": 0.4944, + "step": 63840 + }, + { + "epoch": 1.4211627492877492, + "grad_norm": 0.6520929336547852, + "learning_rate": 0.0002165320088178564, + "loss": 0.4898, + "step": 63850 + }, + { + "epoch": 1.4213853276353277, + "grad_norm": 0.4981181025505066, + "learning_rate": 0.00021648554568501455, + "loss": 0.5676, + "step": 63860 + }, + { + "epoch": 1.4216079059829059, + "grad_norm": 0.746509313583374, + "learning_rate": 0.0002164390816563346, + "loss": 0.4484, + "step": 63870 + }, + { + "epoch": 1.4218304843304843, + "grad_norm": 0.6327930092811584, + "learning_rate": 0.00021639261673434156, + "loss": 0.5454, + "step": 63880 + }, + { + "epoch": 1.4220530626780628, + "grad_norm": 0.7335301041603088, + "learning_rate": 0.00021634615092156026, + "loss": 0.5584, + "step": 63890 + }, + { + "epoch": 1.422275641025641, + "grad_norm": 0.4814402163028717, + "learning_rate": 0.0002162996842205158, + "loss": 0.6066, + "step": 63900 + }, + { + "epoch": 1.4224982193732194, + "grad_norm": 0.666235089302063, + "learning_rate": 0.00021625321663373313, + "loss": 0.5322, + "step": 63910 + }, + { + "epoch": 1.4227207977207978, + "grad_norm": 0.8331716656684875, + "learning_rate": 0.00021620674816373733, + "loss": 0.4986, + "step": 63920 + }, + { + "epoch": 1.422943376068376, + "grad_norm": 0.6989117860794067, + "learning_rate": 0.00021616027881305353, + "loss": 0.5071, + "step": 63930 + }, + { + "epoch": 1.4231659544159543, + "grad_norm": 0.6009159088134766, + "learning_rate": 0.00021611380858420698, + "loss": 0.5468, + "step": 63940 + }, + { + "epoch": 1.4233885327635327, + "grad_norm": 0.5715070366859436, + "learning_rate": 0.0002160673374797229, + "loss": 0.6371, + "step": 63950 + }, + { + "epoch": 1.4236111111111112, + "grad_norm": 0.6078493595123291, + "learning_rate": 0.00021602086550212657, + "loss": 0.6225, + "step": 63960 + }, + { + "epoch": 1.4238336894586894, + "grad_norm": 0.8261004090309143, + "learning_rate": 0.00021597439265394326, + "loss": 0.7131, + "step": 63970 + }, + { + "epoch": 1.4240562678062678, + "grad_norm": 0.8117848634719849, + "learning_rate": 0.0002159279189376984, + "loss": 0.5462, + "step": 63980 + }, + { + "epoch": 1.4242788461538463, + "grad_norm": 0.42046865820884705, + "learning_rate": 0.0002158814443559174, + "loss": 0.4954, + "step": 63990 + }, + { + "epoch": 1.4245014245014245, + "grad_norm": 0.8559585809707642, + "learning_rate": 0.00021583496891112566, + "loss": 0.6157, + "step": 64000 + }, + { + "epoch": 1.424724002849003, + "grad_norm": 0.9661999344825745, + "learning_rate": 0.00021578849260584876, + "loss": 0.5031, + "step": 64010 + }, + { + "epoch": 1.4249465811965811, + "grad_norm": 0.6524626612663269, + "learning_rate": 0.00021574201544261225, + "loss": 0.5737, + "step": 64020 + }, + { + "epoch": 1.4251691595441596, + "grad_norm": 0.7119362354278564, + "learning_rate": 0.00021569553742394176, + "loss": 0.6444, + "step": 64030 + }, + { + "epoch": 1.4253917378917378, + "grad_norm": 0.5532019138336182, + "learning_rate": 0.0002156490585523629, + "loss": 0.4423, + "step": 64040 + }, + { + "epoch": 1.4256143162393162, + "grad_norm": 0.5825714468955994, + "learning_rate": 0.00021560257883040138, + "loss": 0.5145, + "step": 64050 + }, + { + "epoch": 1.4258368945868947, + "grad_norm": 0.5801465511322021, + "learning_rate": 0.00021555609826058296, + "loss": 0.5956, + "step": 64060 + }, + { + "epoch": 1.4260594729344729, + "grad_norm": 0.5406079888343811, + "learning_rate": 0.00021550961684543341, + "loss": 0.5333, + "step": 64070 + }, + { + "epoch": 1.4262820512820513, + "grad_norm": 0.637581467628479, + "learning_rate": 0.0002154631345874786, + "loss": 0.6044, + "step": 64080 + }, + { + "epoch": 1.4265046296296298, + "grad_norm": 0.9882199764251709, + "learning_rate": 0.00021541665148924443, + "loss": 0.5219, + "step": 64090 + }, + { + "epoch": 1.426727207977208, + "grad_norm": 0.6166510581970215, + "learning_rate": 0.00021537016755325677, + "loss": 0.5737, + "step": 64100 + }, + { + "epoch": 1.4269497863247862, + "grad_norm": 0.6122153401374817, + "learning_rate": 0.0002153236827820416, + "loss": 0.4894, + "step": 64110 + }, + { + "epoch": 1.4271723646723646, + "grad_norm": 0.4888789653778076, + "learning_rate": 0.00021527719717812498, + "loss": 0.5273, + "step": 64120 + }, + { + "epoch": 1.427394943019943, + "grad_norm": 0.8074785470962524, + "learning_rate": 0.00021523071074403295, + "loss": 0.4874, + "step": 64130 + }, + { + "epoch": 1.4276175213675213, + "grad_norm": 0.7119250893592834, + "learning_rate": 0.0002151842234822916, + "loss": 0.5333, + "step": 64140 + }, + { + "epoch": 1.4278400997150997, + "grad_norm": 0.6475407481193542, + "learning_rate": 0.00021513773539542715, + "loss": 0.5837, + "step": 64150 + }, + { + "epoch": 1.4280626780626782, + "grad_norm": 0.5739694237709045, + "learning_rate": 0.00021509124648596582, + "loss": 0.5146, + "step": 64160 + }, + { + "epoch": 1.4282852564102564, + "grad_norm": 0.6470806002616882, + "learning_rate": 0.00021504475675643376, + "loss": 0.5036, + "step": 64170 + }, + { + "epoch": 1.4285078347578348, + "grad_norm": 0.5994382500648499, + "learning_rate": 0.00021499826620935726, + "loss": 0.4994, + "step": 64180 + }, + { + "epoch": 1.428730413105413, + "grad_norm": 0.8757331371307373, + "learning_rate": 0.00021495177484726278, + "loss": 0.5777, + "step": 64190 + }, + { + "epoch": 1.4289529914529915, + "grad_norm": 0.5289104580879211, + "learning_rate": 0.00021490528267267654, + "loss": 0.535, + "step": 64200 + }, + { + "epoch": 1.4291755698005697, + "grad_norm": 0.5589162111282349, + "learning_rate": 0.00021485878968812504, + "loss": 0.5024, + "step": 64210 + }, + { + "epoch": 1.4293981481481481, + "grad_norm": 0.8635842204093933, + "learning_rate": 0.0002148122958961348, + "loss": 0.5991, + "step": 64220 + }, + { + "epoch": 1.4296207264957266, + "grad_norm": 0.7705470323562622, + "learning_rate": 0.00021476580129923227, + "loss": 0.5253, + "step": 64230 + }, + { + "epoch": 1.4298433048433048, + "grad_norm": 0.5482203960418701, + "learning_rate": 0.00021471930589994393, + "loss": 0.7469, + "step": 64240 + }, + { + "epoch": 1.4300658831908832, + "grad_norm": 0.7359560132026672, + "learning_rate": 0.0002146728097007965, + "loss": 0.579, + "step": 64250 + }, + { + "epoch": 1.4302884615384617, + "grad_norm": 0.8372468948364258, + "learning_rate": 0.0002146263127043166, + "loss": 0.684, + "step": 64260 + }, + { + "epoch": 1.43051103988604, + "grad_norm": 0.7516504526138306, + "learning_rate": 0.00021457981491303086, + "loss": 0.6032, + "step": 64270 + }, + { + "epoch": 1.430733618233618, + "grad_norm": 0.4100610613822937, + "learning_rate": 0.00021453331632946605, + "loss": 0.537, + "step": 64280 + }, + { + "epoch": 1.4309561965811965, + "grad_norm": 0.7946826815605164, + "learning_rate": 0.000214486816956149, + "loss": 0.5378, + "step": 64290 + }, + { + "epoch": 1.431178774928775, + "grad_norm": 0.6719437837600708, + "learning_rate": 0.00021444031679560644, + "loss": 0.6651, + "step": 64300 + }, + { + "epoch": 1.4314013532763532, + "grad_norm": 0.6615647673606873, + "learning_rate": 0.00021439381585036516, + "loss": 0.5943, + "step": 64310 + }, + { + "epoch": 1.4316239316239316, + "grad_norm": 0.5145480632781982, + "learning_rate": 0.00021434731412295216, + "loss": 0.5319, + "step": 64320 + }, + { + "epoch": 1.43184650997151, + "grad_norm": 0.7475807070732117, + "learning_rate": 0.00021430081161589442, + "loss": 0.5558, + "step": 64330 + }, + { + "epoch": 1.4320690883190883, + "grad_norm": 0.4000377058982849, + "learning_rate": 0.00021425430833171887, + "loss": 0.5037, + "step": 64340 + }, + { + "epoch": 1.4322916666666667, + "grad_norm": 0.45284637808799744, + "learning_rate": 0.0002142078042729525, + "loss": 0.5588, + "step": 64350 + }, + { + "epoch": 1.432514245014245, + "grad_norm": 0.5943143367767334, + "learning_rate": 0.00021416129944212245, + "loss": 0.5597, + "step": 64360 + }, + { + "epoch": 1.4327368233618234, + "grad_norm": 0.9331151843070984, + "learning_rate": 0.00021411479384175576, + "loss": 0.6329, + "step": 64370 + }, + { + "epoch": 1.4329594017094016, + "grad_norm": 0.5388138294219971, + "learning_rate": 0.00021406828747437963, + "loss": 0.5553, + "step": 64380 + }, + { + "epoch": 1.43318198005698, + "grad_norm": 0.38518887758255005, + "learning_rate": 0.0002140217803425212, + "loss": 0.4621, + "step": 64390 + }, + { + "epoch": 1.4334045584045585, + "grad_norm": 0.5624729990959167, + "learning_rate": 0.0002139752724487078, + "loss": 0.4825, + "step": 64400 + }, + { + "epoch": 1.4336271367521367, + "grad_norm": 0.7365434169769287, + "learning_rate": 0.00021392876379546665, + "loss": 0.6077, + "step": 64410 + }, + { + "epoch": 1.4338497150997151, + "grad_norm": 0.584722101688385, + "learning_rate": 0.00021388225438532507, + "loss": 0.5207, + "step": 64420 + }, + { + "epoch": 1.4340722934472934, + "grad_norm": 0.6751091480255127, + "learning_rate": 0.00021383574422081041, + "loss": 0.4445, + "step": 64430 + }, + { + "epoch": 1.4342948717948718, + "grad_norm": 0.9433859586715698, + "learning_rate": 0.00021378923330445012, + "loss": 0.6379, + "step": 64440 + }, + { + "epoch": 1.43451745014245, + "grad_norm": 0.5465646386146545, + "learning_rate": 0.00021374272163877155, + "loss": 0.6188, + "step": 64450 + }, + { + "epoch": 1.4347400284900285, + "grad_norm": 0.8962814807891846, + "learning_rate": 0.00021369620922630228, + "loss": 0.5778, + "step": 64460 + }, + { + "epoch": 1.434962606837607, + "grad_norm": 0.7114683389663696, + "learning_rate": 0.0002136496960695698, + "loss": 0.6164, + "step": 64470 + }, + { + "epoch": 1.4351851851851851, + "grad_norm": 0.5674235820770264, + "learning_rate": 0.0002136031821711016, + "loss": 0.419, + "step": 64480 + }, + { + "epoch": 1.4354077635327636, + "grad_norm": 0.5753316879272461, + "learning_rate": 0.00021355666753342537, + "loss": 0.646, + "step": 64490 + }, + { + "epoch": 1.435630341880342, + "grad_norm": 0.697969913482666, + "learning_rate": 0.00021351015215906875, + "loss": 0.5691, + "step": 64500 + }, + { + "epoch": 1.4358529202279202, + "grad_norm": 0.5356051921844482, + "learning_rate": 0.0002134636360505594, + "loss": 0.517, + "step": 64510 + }, + { + "epoch": 1.4360754985754987, + "grad_norm": 0.5579541921615601, + "learning_rate": 0.000213417119210425, + "loss": 0.5676, + "step": 64520 + }, + { + "epoch": 1.4362980769230769, + "grad_norm": 0.7190403342247009, + "learning_rate": 0.00021337060164119338, + "loss": 0.5499, + "step": 64530 + }, + { + "epoch": 1.4365206552706553, + "grad_norm": 0.40730300545692444, + "learning_rate": 0.00021332408334539236, + "loss": 0.671, + "step": 64540 + }, + { + "epoch": 1.4367432336182335, + "grad_norm": 0.5748006105422974, + "learning_rate": 0.00021327756432554975, + "loss": 0.5321, + "step": 64550 + }, + { + "epoch": 1.436965811965812, + "grad_norm": 0.7539768218994141, + "learning_rate": 0.00021323104458419336, + "loss": 0.4917, + "step": 64560 + }, + { + "epoch": 1.4371883903133904, + "grad_norm": 0.6702855229377747, + "learning_rate": 0.00021318452412385117, + "loss": 0.7121, + "step": 64570 + }, + { + "epoch": 1.4374109686609686, + "grad_norm": 0.6595850586891174, + "learning_rate": 0.0002131380029470512, + "loss": 0.5186, + "step": 64580 + }, + { + "epoch": 1.437633547008547, + "grad_norm": 0.7162553668022156, + "learning_rate": 0.00021309148105632137, + "loss": 0.4905, + "step": 64590 + }, + { + "epoch": 1.4378561253561253, + "grad_norm": 0.6966633796691895, + "learning_rate": 0.00021304495845418973, + "loss": 0.4671, + "step": 64600 + }, + { + "epoch": 1.4380787037037037, + "grad_norm": 0.9159937500953674, + "learning_rate": 0.0002129984351431844, + "loss": 0.6021, + "step": 64610 + }, + { + "epoch": 1.438301282051282, + "grad_norm": 0.6505913734436035, + "learning_rate": 0.0002129519111258335, + "loss": 0.5875, + "step": 64620 + }, + { + "epoch": 1.4385238603988604, + "grad_norm": 0.5346196889877319, + "learning_rate": 0.00021290538640466507, + "loss": 0.518, + "step": 64630 + }, + { + "epoch": 1.4387464387464388, + "grad_norm": 0.4172317683696747, + "learning_rate": 0.00021285886098220736, + "loss": 0.4858, + "step": 64640 + }, + { + "epoch": 1.438969017094017, + "grad_norm": 0.6790648698806763, + "learning_rate": 0.00021281233486098868, + "loss": 0.5761, + "step": 64650 + }, + { + "epoch": 1.4391915954415955, + "grad_norm": 0.44441673159599304, + "learning_rate": 0.0002127658080435372, + "loss": 0.4783, + "step": 64660 + }, + { + "epoch": 1.439414173789174, + "grad_norm": 0.4197510778903961, + "learning_rate": 0.00021271928053238125, + "loss": 0.503, + "step": 64670 + }, + { + "epoch": 1.4396367521367521, + "grad_norm": 0.3227827250957489, + "learning_rate": 0.00021267275233004926, + "loss": 0.4116, + "step": 64680 + }, + { + "epoch": 1.4398593304843303, + "grad_norm": 0.8242856860160828, + "learning_rate": 0.0002126262234390695, + "loss": 0.4626, + "step": 64690 + }, + { + "epoch": 1.4400819088319088, + "grad_norm": 0.5800617933273315, + "learning_rate": 0.00021257969386197042, + "loss": 0.5454, + "step": 64700 + }, + { + "epoch": 1.4401709401709402, + "eval_loss": 0.5727022886276245, + "eval_runtime": 337.62, + "eval_samples_per_second": 7.005, + "eval_steps_per_second": 7.005, + "step": 64704 + }, + { + "epoch": 1.4403044871794872, + "grad_norm": 0.45175546407699585, + "learning_rate": 0.00021253316360128038, + "loss": 0.4923, + "step": 64710 + }, + { + "epoch": 1.4405270655270654, + "grad_norm": 0.6314474940299988, + "learning_rate": 0.0002124866326595281, + "loss": 0.432, + "step": 64720 + }, + { + "epoch": 1.4407496438746439, + "grad_norm": 0.5468671321868896, + "learning_rate": 0.00021244010103924193, + "loss": 0.5427, + "step": 64730 + }, + { + "epoch": 1.4409722222222223, + "grad_norm": 0.5443201065063477, + "learning_rate": 0.00021239356874295045, + "loss": 0.5005, + "step": 64740 + }, + { + "epoch": 1.4411948005698005, + "grad_norm": 0.6474123001098633, + "learning_rate": 0.00021234703577318237, + "loss": 0.4838, + "step": 64750 + }, + { + "epoch": 1.441417378917379, + "grad_norm": 0.7085283994674683, + "learning_rate": 0.00021230050213246626, + "loss": 0.5677, + "step": 64760 + }, + { + "epoch": 1.4416399572649572, + "grad_norm": 0.5790860652923584, + "learning_rate": 0.0002122539678233307, + "loss": 0.6691, + "step": 64770 + }, + { + "epoch": 1.4418625356125356, + "grad_norm": 0.6387883424758911, + "learning_rate": 0.00021220743284830457, + "loss": 0.4925, + "step": 64780 + }, + { + "epoch": 1.4420851139601139, + "grad_norm": 0.6226698160171509, + "learning_rate": 0.00021216089720991655, + "loss": 0.5552, + "step": 64790 + }, + { + "epoch": 1.4423076923076923, + "grad_norm": 0.5773744583129883, + "learning_rate": 0.00021211436091069538, + "loss": 0.4695, + "step": 64800 + }, + { + "epoch": 1.4425302706552707, + "grad_norm": 0.6607078313827515, + "learning_rate": 0.00021206782395316996, + "loss": 0.6169, + "step": 64810 + }, + { + "epoch": 1.442752849002849, + "grad_norm": 0.6033298373222351, + "learning_rate": 0.0002120212863398691, + "loss": 0.5271, + "step": 64820 + }, + { + "epoch": 1.4429754273504274, + "grad_norm": 0.5513878464698792, + "learning_rate": 0.0002119747480733217, + "loss": 0.64, + "step": 64830 + }, + { + "epoch": 1.4431980056980058, + "grad_norm": 0.44069719314575195, + "learning_rate": 0.00021192820915605666, + "loss": 0.4799, + "step": 64840 + }, + { + "epoch": 1.443420584045584, + "grad_norm": 0.5613614320755005, + "learning_rate": 0.00021188166959060296, + "loss": 0.5006, + "step": 64850 + }, + { + "epoch": 1.4436431623931623, + "grad_norm": 0.6709674596786499, + "learning_rate": 0.00021183512937948966, + "loss": 0.6678, + "step": 64860 + }, + { + "epoch": 1.4438657407407407, + "grad_norm": 0.4399740695953369, + "learning_rate": 0.0002117885885252457, + "loss": 0.4794, + "step": 64870 + }, + { + "epoch": 1.4440883190883191, + "grad_norm": 0.513027548789978, + "learning_rate": 0.00021174204703040022, + "loss": 0.5169, + "step": 64880 + }, + { + "epoch": 1.4443108974358974, + "grad_norm": 0.6490597128868103, + "learning_rate": 0.00021169550489748225, + "loss": 0.504, + "step": 64890 + }, + { + "epoch": 1.4445334757834758, + "grad_norm": 0.7682234048843384, + "learning_rate": 0.000211648962129021, + "loss": 0.6058, + "step": 64900 + }, + { + "epoch": 1.4447560541310542, + "grad_norm": 0.6964247822761536, + "learning_rate": 0.0002116024187275456, + "loss": 0.6491, + "step": 64910 + }, + { + "epoch": 1.4449786324786325, + "grad_norm": 0.5793104767799377, + "learning_rate": 0.0002115558746955853, + "loss": 0.4567, + "step": 64920 + }, + { + "epoch": 1.445201210826211, + "grad_norm": 0.4298732876777649, + "learning_rate": 0.00021150933003566928, + "loss": 0.5615, + "step": 64930 + }, + { + "epoch": 1.445423789173789, + "grad_norm": 0.6879767179489136, + "learning_rate": 0.0002114627847503268, + "loss": 0.5527, + "step": 64940 + }, + { + "epoch": 1.4456463675213675, + "grad_norm": 0.7321903705596924, + "learning_rate": 0.00021141623884208733, + "loss": 0.5424, + "step": 64950 + }, + { + "epoch": 1.4458689458689458, + "grad_norm": 0.7019228935241699, + "learning_rate": 0.00021136969231347997, + "loss": 0.5813, + "step": 64960 + }, + { + "epoch": 1.4460915242165242, + "grad_norm": 0.5599492192268372, + "learning_rate": 0.00021132314516703434, + "loss": 0.5678, + "step": 64970 + }, + { + "epoch": 1.4463141025641026, + "grad_norm": 0.3333553373813629, + "learning_rate": 0.00021127659740527964, + "loss": 0.4524, + "step": 64980 + }, + { + "epoch": 1.4465366809116809, + "grad_norm": 0.777064323425293, + "learning_rate": 0.00021123004903074541, + "loss": 0.5751, + "step": 64990 + }, + { + "epoch": 1.4467592592592593, + "grad_norm": 0.5544342994689941, + "learning_rate": 0.00021118350004596117, + "loss": 0.6019, + "step": 65000 + }, + { + "epoch": 1.4469818376068377, + "grad_norm": 0.6325475573539734, + "learning_rate": 0.0002111369504534564, + "loss": 0.5359, + "step": 65010 + }, + { + "epoch": 1.447204415954416, + "grad_norm": 0.5768696069717407, + "learning_rate": 0.00021109040025576054, + "loss": 0.5335, + "step": 65020 + }, + { + "epoch": 1.4474269943019942, + "grad_norm": 0.7404335737228394, + "learning_rate": 0.00021104384945540327, + "loss": 0.5483, + "step": 65030 + }, + { + "epoch": 1.4476495726495726, + "grad_norm": 0.654624879360199, + "learning_rate": 0.00021099729805491423, + "loss": 0.6153, + "step": 65040 + }, + { + "epoch": 1.447872150997151, + "grad_norm": 0.5879744291305542, + "learning_rate": 0.00021095074605682296, + "loss": 0.5576, + "step": 65050 + }, + { + "epoch": 1.4480947293447293, + "grad_norm": 0.5990705490112305, + "learning_rate": 0.00021090419346365922, + "loss": 0.5858, + "step": 65060 + }, + { + "epoch": 1.4483173076923077, + "grad_norm": 0.7336555123329163, + "learning_rate": 0.0002108576402779527, + "loss": 0.4919, + "step": 65070 + }, + { + "epoch": 1.4485398860398861, + "grad_norm": 0.6426530480384827, + "learning_rate": 0.00021081108650223312, + "loss": 0.6436, + "step": 65080 + }, + { + "epoch": 1.4487624643874644, + "grad_norm": 0.5368412137031555, + "learning_rate": 0.0002107645321390302, + "loss": 0.6246, + "step": 65090 + }, + { + "epoch": 1.4489850427350428, + "grad_norm": 0.9056705832481384, + "learning_rate": 0.0002107179771908738, + "loss": 0.615, + "step": 65100 + }, + { + "epoch": 1.449207621082621, + "grad_norm": 0.9572508931159973, + "learning_rate": 0.00021067142166029376, + "loss": 0.5894, + "step": 65110 + }, + { + "epoch": 1.4494301994301995, + "grad_norm": 0.607247531414032, + "learning_rate": 0.00021062486554981988, + "loss": 0.6655, + "step": 65120 + }, + { + "epoch": 1.4496527777777777, + "grad_norm": 0.7506188750267029, + "learning_rate": 0.00021057830886198216, + "loss": 0.5114, + "step": 65130 + }, + { + "epoch": 1.4498753561253561, + "grad_norm": 0.7134078145027161, + "learning_rate": 0.00021053175159931056, + "loss": 0.636, + "step": 65140 + }, + { + "epoch": 1.4500979344729346, + "grad_norm": 0.7259635925292969, + "learning_rate": 0.00021048519376433485, + "loss": 0.5056, + "step": 65150 + }, + { + "epoch": 1.4503205128205128, + "grad_norm": 0.8849378824234009, + "learning_rate": 0.0002104386353595851, + "loss": 0.748, + "step": 65160 + }, + { + "epoch": 1.4505430911680912, + "grad_norm": 0.45202553272247314, + "learning_rate": 0.00021039207638759138, + "loss": 0.627, + "step": 65170 + }, + { + "epoch": 1.4507656695156697, + "grad_norm": 0.6308692097663879, + "learning_rate": 0.0002103455168508838, + "loss": 0.6249, + "step": 65180 + }, + { + "epoch": 1.4509882478632479, + "grad_norm": 0.748424232006073, + "learning_rate": 0.00021029895675199226, + "loss": 0.572, + "step": 65190 + }, + { + "epoch": 1.451210826210826, + "grad_norm": 0.7193168997764587, + "learning_rate": 0.00021025239609344701, + "loss": 0.4869, + "step": 65200 + }, + { + "epoch": 1.4514334045584045, + "grad_norm": 0.47657930850982666, + "learning_rate": 0.00021020583487777828, + "loss": 0.6517, + "step": 65210 + }, + { + "epoch": 1.451655982905983, + "grad_norm": 0.42479145526885986, + "learning_rate": 0.00021015927310751598, + "loss": 0.4743, + "step": 65220 + }, + { + "epoch": 1.4518785612535612, + "grad_norm": 0.8069111108779907, + "learning_rate": 0.00021011271078519054, + "loss": 0.6877, + "step": 65230 + }, + { + "epoch": 1.4521011396011396, + "grad_norm": 0.31496503949165344, + "learning_rate": 0.00021006614791333205, + "loss": 0.5257, + "step": 65240 + }, + { + "epoch": 1.452323717948718, + "grad_norm": 0.30991029739379883, + "learning_rate": 0.00021001958449447087, + "loss": 0.5881, + "step": 65250 + }, + { + "epoch": 1.4525462962962963, + "grad_norm": 0.5215082168579102, + "learning_rate": 0.0002099730205311373, + "loss": 0.4965, + "step": 65260 + }, + { + "epoch": 1.4527688746438747, + "grad_norm": 0.6242929100990295, + "learning_rate": 0.00020992645602586164, + "loss": 0.5573, + "step": 65270 + }, + { + "epoch": 1.452991452991453, + "grad_norm": 0.5348063707351685, + "learning_rate": 0.0002098798909811742, + "loss": 0.5565, + "step": 65280 + }, + { + "epoch": 1.4532140313390314, + "grad_norm": 0.527005136013031, + "learning_rate": 0.00020983332539960538, + "loss": 0.6289, + "step": 65290 + }, + { + "epoch": 1.4534366096866096, + "grad_norm": 0.792716920375824, + "learning_rate": 0.0002097867592836856, + "loss": 0.5747, + "step": 65300 + }, + { + "epoch": 1.453659188034188, + "grad_norm": 0.6454300880432129, + "learning_rate": 0.00020974019263594534, + "loss": 0.6055, + "step": 65310 + }, + { + "epoch": 1.4538817663817665, + "grad_norm": 0.4974537491798401, + "learning_rate": 0.00020969362545891507, + "loss": 0.4444, + "step": 65320 + }, + { + "epoch": 1.4541043447293447, + "grad_norm": 0.46109578013420105, + "learning_rate": 0.00020964705775512518, + "loss": 0.504, + "step": 65330 + }, + { + "epoch": 1.4543269230769231, + "grad_norm": 0.7986196279525757, + "learning_rate": 0.00020960048952710632, + "loss": 0.5724, + "step": 65340 + }, + { + "epoch": 1.4545495014245013, + "grad_norm": 0.43146848678588867, + "learning_rate": 0.00020955392077738903, + "loss": 0.5882, + "step": 65350 + }, + { + "epoch": 1.4547720797720798, + "grad_norm": 0.8425827026367188, + "learning_rate": 0.0002095073515085038, + "loss": 0.5565, + "step": 65360 + }, + { + "epoch": 1.454994658119658, + "grad_norm": 0.6973526477813721, + "learning_rate": 0.00020946078172298137, + "loss": 0.4806, + "step": 65370 + }, + { + "epoch": 1.4552172364672364, + "grad_norm": 0.4596269726753235, + "learning_rate": 0.00020941421142335224, + "loss": 0.5385, + "step": 65380 + }, + { + "epoch": 1.4554398148148149, + "grad_norm": 0.7151857018470764, + "learning_rate": 0.0002093676406121472, + "loss": 0.565, + "step": 65390 + }, + { + "epoch": 1.455662393162393, + "grad_norm": 0.5673050880432129, + "learning_rate": 0.00020932106929189695, + "loss": 0.5585, + "step": 65400 + }, + { + "epoch": 1.4558849715099715, + "grad_norm": 0.683448314666748, + "learning_rate": 0.0002092744974651321, + "loss": 0.6157, + "step": 65410 + }, + { + "epoch": 1.45610754985755, + "grad_norm": 0.5907111167907715, + "learning_rate": 0.00020922792513438347, + "loss": 0.5675, + "step": 65420 + }, + { + "epoch": 1.4563301282051282, + "grad_norm": 0.5666101574897766, + "learning_rate": 0.0002091813523021818, + "loss": 0.5541, + "step": 65430 + }, + { + "epoch": 1.4565527065527066, + "grad_norm": 0.891807496547699, + "learning_rate": 0.00020913477897105797, + "loss": 0.5664, + "step": 65440 + }, + { + "epoch": 1.4567752849002849, + "grad_norm": 0.691290020942688, + "learning_rate": 0.00020908820514354274, + "loss": 0.5416, + "step": 65450 + }, + { + "epoch": 1.4569978632478633, + "grad_norm": 0.5921575427055359, + "learning_rate": 0.00020904163082216708, + "loss": 0.5611, + "step": 65460 + }, + { + "epoch": 1.4572204415954415, + "grad_norm": 0.6684668660163879, + "learning_rate": 0.00020899505600946173, + "loss": 0.5772, + "step": 65470 + }, + { + "epoch": 1.45744301994302, + "grad_norm": 0.6148405075073242, + "learning_rate": 0.0002089484807079577, + "loss": 0.4741, + "step": 65480 + }, + { + "epoch": 1.4576655982905984, + "grad_norm": 0.6442460417747498, + "learning_rate": 0.00020890190492018596, + "loss": 0.4478, + "step": 65490 + }, + { + "epoch": 1.4578881766381766, + "grad_norm": 0.6702595949172974, + "learning_rate": 0.00020885532864867732, + "loss": 0.553, + "step": 65500 + }, + { + "epoch": 1.458110754985755, + "grad_norm": 0.8547864556312561, + "learning_rate": 0.0002088087518959629, + "loss": 0.5483, + "step": 65510 + }, + { + "epoch": 1.4583333333333333, + "grad_norm": 0.7362828850746155, + "learning_rate": 0.0002087621746645737, + "loss": 0.639, + "step": 65520 + }, + { + "epoch": 1.4585559116809117, + "grad_norm": 0.914097785949707, + "learning_rate": 0.00020871559695704073, + "loss": 0.567, + "step": 65530 + }, + { + "epoch": 1.45877849002849, + "grad_norm": 0.5775737166404724, + "learning_rate": 0.00020866901877589515, + "loss": 0.5771, + "step": 65540 + }, + { + "epoch": 1.4590010683760684, + "grad_norm": 0.8055770397186279, + "learning_rate": 0.00020862244012366792, + "loss": 0.5812, + "step": 65550 + }, + { + "epoch": 1.4592236467236468, + "grad_norm": 0.6688644289970398, + "learning_rate": 0.00020857586100289034, + "loss": 0.6124, + "step": 65560 + }, + { + "epoch": 1.459446225071225, + "grad_norm": 0.6874992847442627, + "learning_rate": 0.00020852928141609333, + "loss": 0.6507, + "step": 65570 + }, + { + "epoch": 1.4596688034188035, + "grad_norm": 0.398538202047348, + "learning_rate": 0.00020848270136580822, + "loss": 0.611, + "step": 65580 + }, + { + "epoch": 1.459891381766382, + "grad_norm": 0.6153919100761414, + "learning_rate": 0.0002084361208545662, + "loss": 0.6638, + "step": 65590 + }, + { + "epoch": 1.46011396011396, + "grad_norm": 0.6108056902885437, + "learning_rate": 0.00020838953988489852, + "loss": 0.645, + "step": 65600 + }, + { + "epoch": 1.4603365384615383, + "grad_norm": 0.5398427248001099, + "learning_rate": 0.0002083429584593363, + "loss": 0.619, + "step": 65610 + }, + { + "epoch": 1.4605591168091168, + "grad_norm": 0.5409917235374451, + "learning_rate": 0.00020829637658041086, + "loss": 0.5931, + "step": 65620 + }, + { + "epoch": 1.4607816951566952, + "grad_norm": 0.9313738346099854, + "learning_rate": 0.0002082497942506536, + "loss": 0.6415, + "step": 65630 + }, + { + "epoch": 1.4610042735042734, + "grad_norm": 0.6145248413085938, + "learning_rate": 0.0002082032114725957, + "loss": 0.6641, + "step": 65640 + }, + { + "epoch": 1.4612268518518519, + "grad_norm": 0.6647670865058899, + "learning_rate": 0.00020815662824876858, + "loss": 0.6287, + "step": 65650 + }, + { + "epoch": 1.4614494301994303, + "grad_norm": 0.7241197228431702, + "learning_rate": 0.00020811004458170366, + "loss": 0.5647, + "step": 65660 + }, + { + "epoch": 1.4616720085470085, + "grad_norm": 0.7002893686294556, + "learning_rate": 0.00020806346047393226, + "loss": 0.6097, + "step": 65670 + }, + { + "epoch": 1.461894586894587, + "grad_norm": 0.40175196528434753, + "learning_rate": 0.00020801687592798582, + "loss": 0.5823, + "step": 65680 + }, + { + "epoch": 1.4621171652421652, + "grad_norm": 0.5566397905349731, + "learning_rate": 0.00020797029094639572, + "loss": 0.5513, + "step": 65690 + }, + { + "epoch": 1.4623397435897436, + "grad_norm": 0.5421175956726074, + "learning_rate": 0.00020792370553169355, + "loss": 0.6408, + "step": 65700 + }, + { + "epoch": 1.4625623219373218, + "grad_norm": 0.4728619158267975, + "learning_rate": 0.00020787711968641071, + "loss": 0.5313, + "step": 65710 + }, + { + "epoch": 1.4627849002849003, + "grad_norm": 0.5809181332588196, + "learning_rate": 0.0002078305334130787, + "loss": 0.4613, + "step": 65720 + }, + { + "epoch": 1.4630074786324787, + "grad_norm": 0.5622936487197876, + "learning_rate": 0.00020778394671422916, + "loss": 0.4403, + "step": 65730 + }, + { + "epoch": 1.463230056980057, + "grad_norm": 0.5469698905944824, + "learning_rate": 0.0002077373595923936, + "loss": 0.6044, + "step": 65740 + }, + { + "epoch": 1.4634526353276354, + "grad_norm": 0.8442160487174988, + "learning_rate": 0.00020769077205010352, + "loss": 0.6237, + "step": 65750 + }, + { + "epoch": 1.4636752136752138, + "grad_norm": 0.5504755973815918, + "learning_rate": 0.00020764418408989062, + "loss": 0.5859, + "step": 65760 + }, + { + "epoch": 1.463897792022792, + "grad_norm": 0.7845527529716492, + "learning_rate": 0.00020759759571428648, + "loss": 0.6043, + "step": 65770 + }, + { + "epoch": 1.4641203703703702, + "grad_norm": 0.5308002829551697, + "learning_rate": 0.00020755100692582275, + "loss": 0.4304, + "step": 65780 + }, + { + "epoch": 1.4643429487179487, + "grad_norm": 1.0177899599075317, + "learning_rate": 0.00020750441772703114, + "loss": 0.5702, + "step": 65790 + }, + { + "epoch": 1.4645655270655271, + "grad_norm": 0.8551054000854492, + "learning_rate": 0.0002074578281204434, + "loss": 0.6045, + "step": 65800 + }, + { + "epoch": 1.4647881054131053, + "grad_norm": 0.6332492828369141, + "learning_rate": 0.00020741123810859112, + "loss": 0.4905, + "step": 65810 + }, + { + "epoch": 1.4650106837606838, + "grad_norm": 0.6236631274223328, + "learning_rate": 0.0002073646476940061, + "loss": 0.6526, + "step": 65820 + }, + { + "epoch": 1.4652332621082622, + "grad_norm": 0.5618539452552795, + "learning_rate": 0.00020731805687922004, + "loss": 0.5197, + "step": 65830 + }, + { + "epoch": 1.4654558404558404, + "grad_norm": 0.5351765751838684, + "learning_rate": 0.00020727146566676486, + "loss": 0.5043, + "step": 65840 + }, + { + "epoch": 1.4656784188034189, + "grad_norm": 0.5465881824493408, + "learning_rate": 0.00020722487405917223, + "loss": 0.5711, + "step": 65850 + }, + { + "epoch": 1.465900997150997, + "grad_norm": 0.7434455752372742, + "learning_rate": 0.00020717828205897405, + "loss": 0.5675, + "step": 65860 + }, + { + "epoch": 1.4661235754985755, + "grad_norm": 0.7556038498878479, + "learning_rate": 0.00020713168966870216, + "loss": 0.5711, + "step": 65870 + }, + { + "epoch": 1.4663461538461537, + "grad_norm": 0.6555251479148865, + "learning_rate": 0.0002070850968908884, + "loss": 0.5165, + "step": 65880 + }, + { + "epoch": 1.4665687321937322, + "grad_norm": 0.9948902130126953, + "learning_rate": 0.00020703850372806465, + "loss": 0.4885, + "step": 65890 + }, + { + "epoch": 1.4667913105413106, + "grad_norm": 0.7312471270561218, + "learning_rate": 0.00020699191018276288, + "loss": 0.522, + "step": 65900 + }, + { + "epoch": 1.4670138888888888, + "grad_norm": 0.567846417427063, + "learning_rate": 0.00020694531625751496, + "loss": 0.5453, + "step": 65910 + }, + { + "epoch": 1.4672364672364673, + "grad_norm": 0.5828446745872498, + "learning_rate": 0.00020689872195485287, + "loss": 0.5965, + "step": 65920 + }, + { + "epoch": 1.4674590455840457, + "grad_norm": 0.3739456534385681, + "learning_rate": 0.00020685212727730864, + "loss": 0.6246, + "step": 65930 + }, + { + "epoch": 1.467681623931624, + "grad_norm": 0.6678730845451355, + "learning_rate": 0.00020680553222741414, + "loss": 0.5216, + "step": 65940 + }, + { + "epoch": 1.4679042022792022, + "grad_norm": 0.6459123492240906, + "learning_rate": 0.0002067589368077015, + "loss": 0.5315, + "step": 65950 + }, + { + "epoch": 1.4681267806267806, + "grad_norm": 0.6650933027267456, + "learning_rate": 0.00020671234102070263, + "loss": 0.6411, + "step": 65960 + }, + { + "epoch": 1.468349358974359, + "grad_norm": 0.6498469114303589, + "learning_rate": 0.0002066657448689497, + "loss": 0.5779, + "step": 65970 + }, + { + "epoch": 1.4685719373219372, + "grad_norm": 0.5796330571174622, + "learning_rate": 0.00020661914835497474, + "loss": 0.5096, + "step": 65980 + }, + { + "epoch": 1.4687945156695157, + "grad_norm": 0.5412297248840332, + "learning_rate": 0.00020657255148130984, + "loss": 0.5169, + "step": 65990 + }, + { + "epoch": 1.4690170940170941, + "grad_norm": 0.4486381411552429, + "learning_rate": 0.00020652595425048705, + "loss": 0.5125, + "step": 66000 + }, + { + "epoch": 1.4692396723646723, + "grad_norm": 0.6411442756652832, + "learning_rate": 0.00020647935666503862, + "loss": 0.5713, + "step": 66010 + }, + { + "epoch": 1.4694622507122508, + "grad_norm": 0.5224676132202148, + "learning_rate": 0.00020643275872749665, + "loss": 0.7264, + "step": 66020 + }, + { + "epoch": 1.469684829059829, + "grad_norm": 0.4064271152019501, + "learning_rate": 0.00020638616044039328, + "loss": 0.5996, + "step": 66030 + }, + { + "epoch": 1.4699074074074074, + "grad_norm": 0.7597877383232117, + "learning_rate": 0.00020633956180626074, + "loss": 0.508, + "step": 66040 + }, + { + "epoch": 1.4701299857549857, + "grad_norm": 0.6439722180366516, + "learning_rate": 0.00020629296282763125, + "loss": 0.4433, + "step": 66050 + }, + { + "epoch": 1.470352564102564, + "grad_norm": 0.7077510952949524, + "learning_rate": 0.000206246363507037, + "loss": 0.6232, + "step": 66060 + }, + { + "epoch": 1.4705751424501425, + "grad_norm": 0.6614882349967957, + "learning_rate": 0.0002061997638470102, + "loss": 0.5851, + "step": 66070 + }, + { + "epoch": 1.4707977207977208, + "grad_norm": 0.5982300043106079, + "learning_rate": 0.00020615316385008315, + "loss": 0.4403, + "step": 66080 + }, + { + "epoch": 1.4710202991452992, + "grad_norm": 0.6720724701881409, + "learning_rate": 0.0002061065635187882, + "loss": 0.5336, + "step": 66090 + }, + { + "epoch": 1.4712428774928774, + "grad_norm": 0.6834327578544617, + "learning_rate": 0.0002060599628556575, + "loss": 0.5241, + "step": 66100 + }, + { + "epoch": 1.4714654558404558, + "grad_norm": 0.5241697430610657, + "learning_rate": 0.00020601336186322353, + "loss": 0.576, + "step": 66110 + }, + { + "epoch": 1.471688034188034, + "grad_norm": 0.7107514142990112, + "learning_rate": 0.00020596676054401858, + "loss": 0.5049, + "step": 66120 + }, + { + "epoch": 1.4719106125356125, + "grad_norm": 0.4992298185825348, + "learning_rate": 0.00020592015890057494, + "loss": 0.5305, + "step": 66130 + }, + { + "epoch": 1.472133190883191, + "grad_norm": 0.6178436875343323, + "learning_rate": 0.000205873556935425, + "loss": 0.5717, + "step": 66140 + }, + { + "epoch": 1.4723557692307692, + "grad_norm": 0.442743718624115, + "learning_rate": 0.0002058269546511012, + "loss": 0.5183, + "step": 66150 + }, + { + "epoch": 1.4725783475783476, + "grad_norm": 0.5500785708427429, + "learning_rate": 0.0002057803520501359, + "loss": 0.493, + "step": 66160 + }, + { + "epoch": 1.472800925925926, + "grad_norm": 0.6199305653572083, + "learning_rate": 0.00020573374913506148, + "loss": 0.5522, + "step": 66170 + }, + { + "epoch": 1.4730235042735043, + "grad_norm": 0.6486396193504333, + "learning_rate": 0.00020568714590841046, + "loss": 0.6705, + "step": 66180 + }, + { + "epoch": 1.4732460826210827, + "grad_norm": 0.7136431932449341, + "learning_rate": 0.00020564054237271536, + "loss": 0.6631, + "step": 66190 + }, + { + "epoch": 1.473468660968661, + "grad_norm": 0.5296502113342285, + "learning_rate": 0.00020559393853050853, + "loss": 0.5054, + "step": 66200 + }, + { + "epoch": 1.4736912393162394, + "grad_norm": 0.5589050054550171, + "learning_rate": 0.00020554733438432247, + "loss": 0.5524, + "step": 66210 + }, + { + "epoch": 1.4739138176638176, + "grad_norm": 0.5227344632148743, + "learning_rate": 0.00020550072993668974, + "loss": 0.5718, + "step": 66220 + }, + { + "epoch": 1.474136396011396, + "grad_norm": 0.6966056227684021, + "learning_rate": 0.00020545412519014285, + "loss": 0.5455, + "step": 66230 + }, + { + "epoch": 1.4743589743589745, + "grad_norm": 0.5115660429000854, + "learning_rate": 0.00020540752014721432, + "loss": 0.5812, + "step": 66240 + }, + { + "epoch": 1.4745815527065527, + "grad_norm": 0.5743473768234253, + "learning_rate": 0.00020536091481043668, + "loss": 0.5892, + "step": 66250 + }, + { + "epoch": 1.474804131054131, + "grad_norm": 0.4098198413848877, + "learning_rate": 0.00020531430918234258, + "loss": 0.5955, + "step": 66260 + }, + { + "epoch": 1.4750267094017093, + "grad_norm": 0.6191383600234985, + "learning_rate": 0.00020526770326546463, + "loss": 0.4784, + "step": 66270 + }, + { + "epoch": 1.4752492877492878, + "grad_norm": 0.6457961201667786, + "learning_rate": 0.00020522109706233525, + "loss": 0.5828, + "step": 66280 + }, + { + "epoch": 1.475471866096866, + "grad_norm": 0.628578782081604, + "learning_rate": 0.00020517449057548724, + "loss": 0.6064, + "step": 66290 + }, + { + "epoch": 1.4756944444444444, + "grad_norm": 0.6337209343910217, + "learning_rate": 0.0002051278838074532, + "loss": 0.5404, + "step": 66300 + }, + { + "epoch": 1.4759170227920229, + "grad_norm": 0.5599583387374878, + "learning_rate": 0.00020508127676076572, + "loss": 0.5639, + "step": 66310 + }, + { + "epoch": 1.476139601139601, + "grad_norm": 0.8558855652809143, + "learning_rate": 0.00020503466943795756, + "loss": 0.5392, + "step": 66320 + }, + { + "epoch": 1.4763621794871795, + "grad_norm": 0.5551844239234924, + "learning_rate": 0.00020498806184156125, + "loss": 0.5564, + "step": 66330 + }, + { + "epoch": 1.476584757834758, + "grad_norm": 0.9600586295127869, + "learning_rate": 0.00020494145397410965, + "loss": 0.5996, + "step": 66340 + }, + { + "epoch": 1.4768073361823362, + "grad_norm": 0.4699052572250366, + "learning_rate": 0.00020489484583813535, + "loss": 0.5395, + "step": 66350 + }, + { + "epoch": 1.4770299145299146, + "grad_norm": 0.6838102340698242, + "learning_rate": 0.00020484823743617114, + "loss": 0.6533, + "step": 66360 + }, + { + "epoch": 1.4772524928774928, + "grad_norm": 0.639786958694458, + "learning_rate": 0.00020480162877074975, + "loss": 0.6041, + "step": 66370 + }, + { + "epoch": 1.4774750712250713, + "grad_norm": 0.7160543203353882, + "learning_rate": 0.00020475501984440388, + "loss": 0.5567, + "step": 66380 + }, + { + "epoch": 1.4776976495726495, + "grad_norm": 0.8186416029930115, + "learning_rate": 0.0002047084106596664, + "loss": 0.6609, + "step": 66390 + }, + { + "epoch": 1.477920227920228, + "grad_norm": 0.8617456555366516, + "learning_rate": 0.00020466180121906998, + "loss": 0.717, + "step": 66400 + }, + { + "epoch": 1.4781428062678064, + "grad_norm": 0.4262973964214325, + "learning_rate": 0.00020461519152514753, + "loss": 0.6177, + "step": 66410 + }, + { + "epoch": 1.4783653846153846, + "grad_norm": 0.5646436810493469, + "learning_rate": 0.00020456858158043168, + "loss": 0.5103, + "step": 66420 + }, + { + "epoch": 1.478587962962963, + "grad_norm": 0.5096763372421265, + "learning_rate": 0.0002045219713874554, + "loss": 0.485, + "step": 66430 + }, + { + "epoch": 1.4788105413105412, + "grad_norm": 0.6648346185684204, + "learning_rate": 0.00020447536094875157, + "loss": 0.5736, + "step": 66440 + }, + { + "epoch": 1.4790331196581197, + "grad_norm": 0.6746906042098999, + "learning_rate": 0.00020442875026685297, + "loss": 0.6185, + "step": 66450 + }, + { + "epoch": 1.479255698005698, + "grad_norm": 0.6889836192131042, + "learning_rate": 0.00020438213934429237, + "loss": 0.4814, + "step": 66460 + }, + { + "epoch": 1.4794782763532763, + "grad_norm": 0.579818844795227, + "learning_rate": 0.00020433552818360275, + "loss": 0.6022, + "step": 66470 + }, + { + "epoch": 1.4797008547008548, + "grad_norm": 0.6798656582832336, + "learning_rate": 0.00020428891678731702, + "loss": 0.6192, + "step": 66480 + }, + { + "epoch": 1.479923433048433, + "grad_norm": 0.49344855546951294, + "learning_rate": 0.000204242305157968, + "loss": 0.495, + "step": 66490 + }, + { + "epoch": 1.4801460113960114, + "grad_norm": 0.393621027469635, + "learning_rate": 0.00020419569329808862, + "loss": 0.545, + "step": 66500 + }, + { + "epoch": 1.4803685897435899, + "grad_norm": 0.4905228316783905, + "learning_rate": 0.0002041490812102119, + "loss": 0.577, + "step": 66510 + }, + { + "epoch": 1.480591168091168, + "grad_norm": 0.5233932137489319, + "learning_rate": 0.00020410246889687072, + "loss": 0.4568, + "step": 66520 + }, + { + "epoch": 1.4808137464387463, + "grad_norm": 0.47157812118530273, + "learning_rate": 0.00020405585636059796, + "loss": 0.6753, + "step": 66530 + }, + { + "epoch": 1.4810363247863247, + "grad_norm": 0.35142749547958374, + "learning_rate": 0.00020400924360392667, + "loss": 0.4946, + "step": 66540 + }, + { + "epoch": 1.4812589031339032, + "grad_norm": 0.8298178911209106, + "learning_rate": 0.0002039626306293898, + "loss": 0.5958, + "step": 66550 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.7113330960273743, + "learning_rate": 0.00020391601743952032, + "loss": 0.5403, + "step": 66560 + }, + { + "epoch": 1.4817040598290598, + "grad_norm": 0.4129575788974762, + "learning_rate": 0.00020386940403685125, + "loss": 0.5357, + "step": 66570 + }, + { + "epoch": 1.4819266381766383, + "grad_norm": 0.5855956673622131, + "learning_rate": 0.00020382279042391565, + "loss": 0.4916, + "step": 66580 + }, + { + "epoch": 1.4821492165242165, + "grad_norm": 0.5066674947738647, + "learning_rate": 0.00020377617660324648, + "loss": 0.5156, + "step": 66590 + }, + { + "epoch": 1.482371794871795, + "grad_norm": 0.7026668190956116, + "learning_rate": 0.0002037295625773767, + "loss": 0.5411, + "step": 66600 + }, + { + "epoch": 1.4825943732193732, + "grad_norm": 0.8947693109512329, + "learning_rate": 0.0002036829483488395, + "loss": 0.4803, + "step": 66610 + }, + { + "epoch": 1.4828169515669516, + "grad_norm": 0.5915529131889343, + "learning_rate": 0.00020363633392016784, + "loss": 0.5067, + "step": 66620 + }, + { + "epoch": 1.4830395299145298, + "grad_norm": 0.6306509375572205, + "learning_rate": 0.00020358971929389482, + "loss": 0.4563, + "step": 66630 + }, + { + "epoch": 1.4832621082621082, + "grad_norm": 0.8169883489608765, + "learning_rate": 0.00020354310447255353, + "loss": 0.6565, + "step": 66640 + }, + { + "epoch": 1.4834846866096867, + "grad_norm": 1.0991393327713013, + "learning_rate": 0.00020349648945867715, + "loss": 0.5178, + "step": 66650 + }, + { + "epoch": 1.483707264957265, + "grad_norm": 0.5982011556625366, + "learning_rate": 0.00020344987425479852, + "loss": 0.5572, + "step": 66660 + }, + { + "epoch": 1.4839298433048433, + "grad_norm": 0.46377143263816833, + "learning_rate": 0.00020340325886345092, + "loss": 0.5503, + "step": 66670 + }, + { + "epoch": 1.4841524216524218, + "grad_norm": 0.6298932433128357, + "learning_rate": 0.00020335664328716745, + "loss": 0.4792, + "step": 66680 + }, + { + "epoch": 1.484375, + "grad_norm": 0.6804326176643372, + "learning_rate": 0.0002033100275284813, + "loss": 0.4894, + "step": 66690 + }, + { + "epoch": 1.4845975783475782, + "grad_norm": 0.5818316340446472, + "learning_rate": 0.00020326341158992547, + "loss": 0.6249, + "step": 66700 + }, + { + "epoch": 1.4848201566951567, + "grad_norm": 0.5855556726455688, + "learning_rate": 0.0002032167954740332, + "loss": 0.5472, + "step": 66710 + }, + { + "epoch": 1.485042735042735, + "grad_norm": 0.497353196144104, + "learning_rate": 0.0002031701791833377, + "loss": 0.5875, + "step": 66720 + }, + { + "epoch": 1.4852653133903133, + "grad_norm": 0.48648321628570557, + "learning_rate": 0.000203123562720372, + "loss": 0.4869, + "step": 66730 + }, + { + "epoch": 1.4854878917378918, + "grad_norm": 0.6546781063079834, + "learning_rate": 0.0002030769460876693, + "loss": 0.5434, + "step": 66740 + }, + { + "epoch": 1.4857104700854702, + "grad_norm": 1.0312706232070923, + "learning_rate": 0.0002030303292877629, + "loss": 0.6661, + "step": 66750 + }, + { + "epoch": 1.4859330484330484, + "grad_norm": 0.4677811861038208, + "learning_rate": 0.00020298371232318596, + "loss": 0.4725, + "step": 66760 + }, + { + "epoch": 1.4861556267806268, + "grad_norm": 0.6403644680976868, + "learning_rate": 0.00020293709519647157, + "loss": 0.6621, + "step": 66770 + }, + { + "epoch": 1.486378205128205, + "grad_norm": 0.4164356291294098, + "learning_rate": 0.00020289047791015308, + "loss": 0.5814, + "step": 66780 + }, + { + "epoch": 1.4866007834757835, + "grad_norm": 0.5813632607460022, + "learning_rate": 0.00020284386046676365, + "loss": 0.6091, + "step": 66790 + }, + { + "epoch": 1.4868233618233617, + "grad_norm": 0.5498270988464355, + "learning_rate": 0.0002027972428688365, + "loss": 0.6177, + "step": 66800 + }, + { + "epoch": 1.4870459401709402, + "grad_norm": 0.47491297125816345, + "learning_rate": 0.00020275062511890485, + "loss": 0.5591, + "step": 66810 + }, + { + "epoch": 1.4872685185185186, + "grad_norm": 0.5644556879997253, + "learning_rate": 0.00020270400721950202, + "loss": 0.5741, + "step": 66820 + }, + { + "epoch": 1.4874910968660968, + "grad_norm": 0.6705946326255798, + "learning_rate": 0.00020265738917316117, + "loss": 0.5214, + "step": 66830 + }, + { + "epoch": 1.4877136752136753, + "grad_norm": 0.771763801574707, + "learning_rate": 0.00020261077098241565, + "loss": 0.5585, + "step": 66840 + }, + { + "epoch": 1.4879362535612537, + "grad_norm": 0.7709307074546814, + "learning_rate": 0.00020256415264979872, + "loss": 0.5727, + "step": 66850 + }, + { + "epoch": 1.488158831908832, + "grad_norm": 0.5889680981636047, + "learning_rate": 0.00020251753417784368, + "loss": 0.5227, + "step": 66860 + }, + { + "epoch": 1.4883814102564101, + "grad_norm": 0.5113076567649841, + "learning_rate": 0.0002024709155690837, + "loss": 0.5422, + "step": 66870 + }, + { + "epoch": 1.4886039886039886, + "grad_norm": 0.6153723001480103, + "learning_rate": 0.00020242429682605214, + "loss": 0.7747, + "step": 66880 + }, + { + "epoch": 1.488826566951567, + "grad_norm": 0.3839668333530426, + "learning_rate": 0.0002023776779512823, + "loss": 0.6726, + "step": 66890 + }, + { + "epoch": 1.4890491452991452, + "grad_norm": 0.6124092936515808, + "learning_rate": 0.00020233105894730752, + "loss": 0.5958, + "step": 66900 + }, + { + "epoch": 1.4892717236467237, + "grad_norm": 0.7292911410331726, + "learning_rate": 0.0002022844398166611, + "loss": 0.4976, + "step": 66910 + }, + { + "epoch": 1.489494301994302, + "grad_norm": 0.6575756669044495, + "learning_rate": 0.00020223782056187634, + "loss": 0.7109, + "step": 66920 + }, + { + "epoch": 1.4897168803418803, + "grad_norm": 0.680202066898346, + "learning_rate": 0.0002021912011854866, + "loss": 0.6018, + "step": 66930 + }, + { + "epoch": 1.4899394586894588, + "grad_norm": 0.6214728951454163, + "learning_rate": 0.00020214458169002514, + "loss": 0.6302, + "step": 66940 + }, + { + "epoch": 1.490162037037037, + "grad_norm": 0.6200801730155945, + "learning_rate": 0.00020209796207802536, + "loss": 0.5556, + "step": 66950 + }, + { + "epoch": 1.4903846153846154, + "grad_norm": 0.6757798790931702, + "learning_rate": 0.00020205134235202064, + "loss": 0.5413, + "step": 66960 + }, + { + "epoch": 1.4906071937321936, + "grad_norm": 0.5411709547042847, + "learning_rate": 0.00020200472251454427, + "loss": 0.5122, + "step": 66970 + }, + { + "epoch": 1.490829772079772, + "grad_norm": 0.6982552409172058, + "learning_rate": 0.00020195810256812968, + "loss": 0.5668, + "step": 66980 + }, + { + "epoch": 1.4910523504273505, + "grad_norm": 0.6113097667694092, + "learning_rate": 0.00020191148251531016, + "loss": 0.5333, + "step": 66990 + }, + { + "epoch": 1.4912749287749287, + "grad_norm": 0.5838876366615295, + "learning_rate": 0.00020186486235861914, + "loss": 0.4169, + "step": 67000 + }, + { + "epoch": 1.4914975071225072, + "grad_norm": 0.48443251848220825, + "learning_rate": 0.00020181824210058994, + "loss": 0.5466, + "step": 67010 + }, + { + "epoch": 1.4917200854700854, + "grad_norm": 0.5497033596038818, + "learning_rate": 0.00020177162174375596, + "loss": 0.5244, + "step": 67020 + }, + { + "epoch": 1.4919426638176638, + "grad_norm": 0.6074718832969666, + "learning_rate": 0.00020172500129065065, + "loss": 0.5937, + "step": 67030 + }, + { + "epoch": 1.492165242165242, + "grad_norm": 0.36284077167510986, + "learning_rate": 0.00020167838074380736, + "loss": 0.5153, + "step": 67040 + }, + { + "epoch": 1.4923878205128205, + "grad_norm": 0.6307108998298645, + "learning_rate": 0.00020163176010575947, + "loss": 0.6369, + "step": 67050 + }, + { + "epoch": 1.492610398860399, + "grad_norm": 0.8147817254066467, + "learning_rate": 0.00020158513937904035, + "loss": 0.5483, + "step": 67060 + }, + { + "epoch": 1.4928329772079771, + "grad_norm": 0.4598945677280426, + "learning_rate": 0.00020153851856618356, + "loss": 0.567, + "step": 67070 + }, + { + "epoch": 1.4930555555555556, + "grad_norm": 0.5549848079681396, + "learning_rate": 0.00020149189766972234, + "loss": 0.6136, + "step": 67080 + }, + { + "epoch": 1.493278133903134, + "grad_norm": 0.8051429986953735, + "learning_rate": 0.00020144527669219015, + "loss": 0.6486, + "step": 67090 + }, + { + "epoch": 1.4935007122507122, + "grad_norm": 0.6233551502227783, + "learning_rate": 0.00020139865563612052, + "loss": 0.4419, + "step": 67100 + }, + { + "epoch": 1.4937232905982907, + "grad_norm": 0.513957679271698, + "learning_rate": 0.0002013520345040468, + "loss": 0.6169, + "step": 67110 + }, + { + "epoch": 1.493945868945869, + "grad_norm": 0.5634671449661255, + "learning_rate": 0.0002013054132985024, + "loss": 0.5219, + "step": 67120 + }, + { + "epoch": 1.4941684472934473, + "grad_norm": 0.7170486450195312, + "learning_rate": 0.00020125879202202073, + "loss": 0.5853, + "step": 67130 + }, + { + "epoch": 1.4943910256410255, + "grad_norm": 0.7605602145195007, + "learning_rate": 0.0002012121706771353, + "loss": 0.5669, + "step": 67140 + }, + { + "epoch": 1.494613603988604, + "grad_norm": 0.39489638805389404, + "learning_rate": 0.0002011655492663795, + "loss": 0.5545, + "step": 67150 + }, + { + "epoch": 1.4948361823361824, + "grad_norm": 0.759689211845398, + "learning_rate": 0.00020111892779228679, + "loss": 0.6603, + "step": 67160 + }, + { + "epoch": 1.4950587606837606, + "grad_norm": 0.5082457661628723, + "learning_rate": 0.0002010723062573907, + "loss": 0.5883, + "step": 67170 + }, + { + "epoch": 1.495281339031339, + "grad_norm": 0.8190949559211731, + "learning_rate": 0.0002010256846642246, + "loss": 0.6132, + "step": 67180 + }, + { + "epoch": 1.4955039173789173, + "grad_norm": 0.600661039352417, + "learning_rate": 0.00020097906301532188, + "loss": 0.3774, + "step": 67190 + }, + { + "epoch": 1.4957264957264957, + "grad_norm": 0.44674062728881836, + "learning_rate": 0.00020093244131321608, + "loss": 0.4768, + "step": 67200 + }, + { + "epoch": 1.495949074074074, + "grad_norm": 0.5484546422958374, + "learning_rate": 0.00020088581956044074, + "loss": 0.5206, + "step": 67210 + }, + { + "epoch": 1.4961716524216524, + "grad_norm": 0.6583910584449768, + "learning_rate": 0.0002008391977595292, + "loss": 0.5034, + "step": 67220 + }, + { + "epoch": 1.4963942307692308, + "grad_norm": 0.6517849564552307, + "learning_rate": 0.00020079257591301493, + "loss": 0.6159, + "step": 67230 + }, + { + "epoch": 1.496616809116809, + "grad_norm": 0.5481684803962708, + "learning_rate": 0.00020074595402343147, + "loss": 0.65, + "step": 67240 + }, + { + "epoch": 1.4968393874643875, + "grad_norm": 0.6224780082702637, + "learning_rate": 0.00020069933209331228, + "loss": 0.4955, + "step": 67250 + }, + { + "epoch": 1.497061965811966, + "grad_norm": 0.956333339214325, + "learning_rate": 0.00020065271012519075, + "loss": 0.524, + "step": 67260 + }, + { + "epoch": 1.4972845441595442, + "grad_norm": 0.7989146113395691, + "learning_rate": 0.00020060608812160044, + "loss": 0.5822, + "step": 67270 + }, + { + "epoch": 1.4975071225071226, + "grad_norm": 0.8631839156150818, + "learning_rate": 0.0002005594660850749, + "loss": 0.5416, + "step": 67280 + }, + { + "epoch": 1.4977297008547008, + "grad_norm": 0.46611177921295166, + "learning_rate": 0.00020051284401814736, + "loss": 0.5857, + "step": 67290 + }, + { + "epoch": 1.4979522792022792, + "grad_norm": 0.5473828315734863, + "learning_rate": 0.00020046622192335152, + "loss": 0.4891, + "step": 67300 + }, + { + "epoch": 1.4981748575498575, + "grad_norm": 0.6410860419273376, + "learning_rate": 0.00020041959980322084, + "loss": 0.6214, + "step": 67310 + }, + { + "epoch": 1.498397435897436, + "grad_norm": 0.5481650829315186, + "learning_rate": 0.00020037297766028878, + "loss": 0.6417, + "step": 67320 + }, + { + "epoch": 1.4986200142450143, + "grad_norm": 0.9253783822059631, + "learning_rate": 0.0002003263554970887, + "loss": 0.4641, + "step": 67330 + }, + { + "epoch": 1.4988425925925926, + "grad_norm": 0.5475835800170898, + "learning_rate": 0.00020027973331615426, + "loss": 0.5779, + "step": 67340 + }, + { + "epoch": 1.499065170940171, + "grad_norm": 0.6916240453720093, + "learning_rate": 0.0002002331111200189, + "loss": 0.4992, + "step": 67350 + }, + { + "epoch": 1.4992877492877492, + "grad_norm": 0.5551814436912537, + "learning_rate": 0.00020018648891121602, + "loss": 0.5891, + "step": 67360 + }, + { + "epoch": 1.4995103276353277, + "grad_norm": 0.5963566303253174, + "learning_rate": 0.00020013986669227925, + "loss": 0.4562, + "step": 67370 + }, + { + "epoch": 1.4997329059829059, + "grad_norm": 0.7776322960853577, + "learning_rate": 0.000200093244465742, + "loss": 0.7268, + "step": 67380 + }, + { + "epoch": 1.4999554843304843, + "grad_norm": 0.4523514211177826, + "learning_rate": 0.00020004662223413778, + "loss": 0.4972, + "step": 67390 + }, + { + "epoch": 1.5001780626780628, + "grad_norm": 0.4121207296848297, + "learning_rate": 0.0002, + "loss": 0.5202, + "step": 67400 + }, + { + "epoch": 1.5001780626780628, + "eval_loss": 0.5678955912590027, + "eval_runtime": 337.2407, + "eval_samples_per_second": 7.013, + "eval_steps_per_second": 7.013, + "step": 67400 + }, + { + "epoch": 1.500400641025641, + "grad_norm": 0.5370911955833435, + "learning_rate": 0.00019995337776586227, + "loss": 0.4783, + "step": 67410 + }, + { + "epoch": 1.5006232193732194, + "grad_norm": 0.3378683924674988, + "learning_rate": 0.00019990675553425806, + "loss": 0.5036, + "step": 67420 + }, + { + "epoch": 1.5008457977207978, + "grad_norm": 0.48726844787597656, + "learning_rate": 0.00019986013330772077, + "loss": 0.4933, + "step": 67430 + }, + { + "epoch": 1.501068376068376, + "grad_norm": 0.5741036534309387, + "learning_rate": 0.00019981351108878397, + "loss": 0.6323, + "step": 67440 + }, + { + "epoch": 1.5012909544159543, + "grad_norm": 0.4806444048881531, + "learning_rate": 0.00019976688887998116, + "loss": 0.5906, + "step": 67450 + }, + { + "epoch": 1.5015135327635327, + "grad_norm": 0.5801340937614441, + "learning_rate": 0.00019972026668384576, + "loss": 0.5764, + "step": 67460 + }, + { + "epoch": 1.5017361111111112, + "grad_norm": 0.4316343367099762, + "learning_rate": 0.00019967364450291136, + "loss": 0.5098, + "step": 67470 + }, + { + "epoch": 1.5019586894586894, + "grad_norm": 0.37861862778663635, + "learning_rate": 0.00019962702233971134, + "loss": 0.6163, + "step": 67480 + }, + { + "epoch": 1.5021812678062678, + "grad_norm": 0.6555972695350647, + "learning_rate": 0.0001995804001967792, + "loss": 0.516, + "step": 67490 + }, + { + "epoch": 1.5024038461538463, + "grad_norm": 0.44705936312675476, + "learning_rate": 0.00019953377807664852, + "loss": 0.5441, + "step": 67500 + }, + { + "epoch": 1.5026264245014245, + "grad_norm": 0.6999156475067139, + "learning_rate": 0.0001994871559818527, + "loss": 0.6709, + "step": 67510 + }, + { + "epoch": 1.5028490028490027, + "grad_norm": 0.5605092644691467, + "learning_rate": 0.00019944053391492519, + "loss": 0.5358, + "step": 67520 + }, + { + "epoch": 1.5030715811965814, + "grad_norm": 0.7177448272705078, + "learning_rate": 0.0001993939118783996, + "loss": 0.517, + "step": 67530 + }, + { + "epoch": 1.5032941595441596, + "grad_norm": 0.5700393319129944, + "learning_rate": 0.00019934728987480927, + "loss": 0.6628, + "step": 67540 + }, + { + "epoch": 1.5035167378917378, + "grad_norm": 0.5855696797370911, + "learning_rate": 0.00019930066790668777, + "loss": 0.5235, + "step": 67550 + }, + { + "epoch": 1.5037393162393162, + "grad_norm": 0.53740394115448, + "learning_rate": 0.00019925404597656855, + "loss": 0.5231, + "step": 67560 + }, + { + "epoch": 1.5039618945868947, + "grad_norm": 0.5468155145645142, + "learning_rate": 0.00019920742408698508, + "loss": 0.5379, + "step": 67570 + }, + { + "epoch": 1.5041844729344729, + "grad_norm": 0.6854047179222107, + "learning_rate": 0.00019916080224047082, + "loss": 0.5855, + "step": 67580 + }, + { + "epoch": 1.5044070512820513, + "grad_norm": 0.6865229606628418, + "learning_rate": 0.00019911418043955928, + "loss": 0.5998, + "step": 67590 + }, + { + "epoch": 1.5046296296296298, + "grad_norm": 0.6859182715415955, + "learning_rate": 0.00019906755868678394, + "loss": 0.6969, + "step": 67600 + }, + { + "epoch": 1.504852207977208, + "grad_norm": 0.6807510852813721, + "learning_rate": 0.00019902093698467822, + "loss": 0.5432, + "step": 67610 + }, + { + "epoch": 1.5050747863247862, + "grad_norm": 0.6587150692939758, + "learning_rate": 0.0001989743153357755, + "loss": 0.4533, + "step": 67620 + }, + { + "epoch": 1.5052973646723646, + "grad_norm": 0.4862242639064789, + "learning_rate": 0.00019892769374260937, + "loss": 0.626, + "step": 67630 + }, + { + "epoch": 1.505519943019943, + "grad_norm": 0.4418836236000061, + "learning_rate": 0.00019888107220771323, + "loss": 0.447, + "step": 67640 + }, + { + "epoch": 1.5057425213675213, + "grad_norm": 0.675957977771759, + "learning_rate": 0.00019883445073362054, + "loss": 0.5451, + "step": 67650 + }, + { + "epoch": 1.5059650997150997, + "grad_norm": 0.4350026249885559, + "learning_rate": 0.00019878782932286474, + "loss": 0.6169, + "step": 67660 + }, + { + "epoch": 1.5061876780626782, + "grad_norm": 0.5768257975578308, + "learning_rate": 0.00019874120797797935, + "loss": 0.5835, + "step": 67670 + }, + { + "epoch": 1.5064102564102564, + "grad_norm": 0.8592005968093872, + "learning_rate": 0.00019869458670149768, + "loss": 0.5191, + "step": 67680 + }, + { + "epoch": 1.5066328347578346, + "grad_norm": 0.3954167068004608, + "learning_rate": 0.00019864796549595324, + "loss": 0.5395, + "step": 67690 + }, + { + "epoch": 1.5068554131054133, + "grad_norm": 0.5921634435653687, + "learning_rate": 0.00019860134436387953, + "loss": 0.4337, + "step": 67700 + }, + { + "epoch": 1.5070779914529915, + "grad_norm": 0.7694410681724548, + "learning_rate": 0.00019855472330780982, + "loss": 0.5281, + "step": 67710 + }, + { + "epoch": 1.5073005698005697, + "grad_norm": 0.8094078898429871, + "learning_rate": 0.00019850810233027768, + "loss": 0.6113, + "step": 67720 + }, + { + "epoch": 1.5075231481481481, + "grad_norm": 1.537431240081787, + "learning_rate": 0.00019846148143381654, + "loss": 0.5638, + "step": 67730 + }, + { + "epoch": 1.5077457264957266, + "grad_norm": 0.5468326807022095, + "learning_rate": 0.0001984148606209597, + "loss": 0.6547, + "step": 67740 + }, + { + "epoch": 1.5079683048433048, + "grad_norm": 0.5707191824913025, + "learning_rate": 0.00019836823989424063, + "loss": 0.6599, + "step": 67750 + }, + { + "epoch": 1.5081908831908832, + "grad_norm": 0.7331326603889465, + "learning_rate": 0.0001983216192561927, + "loss": 0.4641, + "step": 67760 + }, + { + "epoch": 1.5084134615384617, + "grad_norm": 0.5310831665992737, + "learning_rate": 0.00019827499870934937, + "loss": 0.5002, + "step": 67770 + }, + { + "epoch": 1.50863603988604, + "grad_norm": 0.500921368598938, + "learning_rate": 0.00019822837825624406, + "loss": 0.5765, + "step": 67780 + }, + { + "epoch": 1.508858618233618, + "grad_norm": 0.5608747005462646, + "learning_rate": 0.0001981817578994101, + "loss": 0.455, + "step": 67790 + }, + { + "epoch": 1.5090811965811965, + "grad_norm": 0.5071516036987305, + "learning_rate": 0.0001981351376413809, + "loss": 0.4695, + "step": 67800 + }, + { + "epoch": 1.509303774928775, + "grad_norm": 0.7218843698501587, + "learning_rate": 0.00019808851748468988, + "loss": 0.6235, + "step": 67810 + }, + { + "epoch": 1.5095263532763532, + "grad_norm": 0.6392359733581543, + "learning_rate": 0.00019804189743187036, + "loss": 0.5179, + "step": 67820 + }, + { + "epoch": 1.5097489316239316, + "grad_norm": 0.6713154911994934, + "learning_rate": 0.00019799527748545572, + "loss": 0.5209, + "step": 67830 + }, + { + "epoch": 1.50997150997151, + "grad_norm": 0.7304174304008484, + "learning_rate": 0.00019794865764797938, + "loss": 0.464, + "step": 67840 + }, + { + "epoch": 1.5101940883190883, + "grad_norm": 0.5477138161659241, + "learning_rate": 0.00019790203792197463, + "loss": 0.5502, + "step": 67850 + }, + { + "epoch": 1.5104166666666665, + "grad_norm": 0.43742215633392334, + "learning_rate": 0.00019785541830997494, + "loss": 0.6281, + "step": 67860 + }, + { + "epoch": 1.5106392450142452, + "grad_norm": 0.5618810057640076, + "learning_rate": 0.0001978087988145135, + "loss": 0.5577, + "step": 67870 + }, + { + "epoch": 1.5108618233618234, + "grad_norm": 0.47301146388053894, + "learning_rate": 0.00019776217943812376, + "loss": 0.5327, + "step": 67880 + }, + { + "epoch": 1.5110844017094016, + "grad_norm": 0.4628704786300659, + "learning_rate": 0.00019771556018333898, + "loss": 0.6488, + "step": 67890 + }, + { + "epoch": 1.51130698005698, + "grad_norm": 0.49233415722846985, + "learning_rate": 0.00019766894105269252, + "loss": 0.4458, + "step": 67900 + }, + { + "epoch": 1.5115295584045585, + "grad_norm": 0.4551265239715576, + "learning_rate": 0.00019762232204871772, + "loss": 0.5321, + "step": 67910 + }, + { + "epoch": 1.5117521367521367, + "grad_norm": 1.0391403436660767, + "learning_rate": 0.00019757570317394793, + "loss": 0.5385, + "step": 67920 + }, + { + "epoch": 1.5119747150997151, + "grad_norm": 0.7271586656570435, + "learning_rate": 0.00019752908443091636, + "loss": 0.5177, + "step": 67930 + }, + { + "epoch": 1.5121972934472936, + "grad_norm": 0.6301023960113525, + "learning_rate": 0.00019748246582215636, + "loss": 0.6633, + "step": 67940 + }, + { + "epoch": 1.5124198717948718, + "grad_norm": 0.5653015375137329, + "learning_rate": 0.0001974358473502013, + "loss": 0.4245, + "step": 67950 + }, + { + "epoch": 1.51264245014245, + "grad_norm": 0.9529626965522766, + "learning_rate": 0.00019738922901758435, + "loss": 0.4955, + "step": 67960 + }, + { + "epoch": 1.5128650284900285, + "grad_norm": 0.5109310746192932, + "learning_rate": 0.0001973426108268388, + "loss": 0.5948, + "step": 67970 + }, + { + "epoch": 1.513087606837607, + "grad_norm": 0.5099339485168457, + "learning_rate": 0.00019729599278049803, + "loss": 0.3919, + "step": 67980 + }, + { + "epoch": 1.5133101851851851, + "grad_norm": 0.6454190015792847, + "learning_rate": 0.00019724937488109517, + "loss": 0.5759, + "step": 67990 + }, + { + "epoch": 1.5135327635327636, + "grad_norm": 0.6826174259185791, + "learning_rate": 0.00019720275713116362, + "loss": 0.6977, + "step": 68000 + }, + { + "epoch": 1.513755341880342, + "grad_norm": 0.3470633924007416, + "learning_rate": 0.00019715613953323643, + "loss": 0.5286, + "step": 68010 + }, + { + "epoch": 1.5139779202279202, + "grad_norm": 0.4481032192707062, + "learning_rate": 0.00019710952208984702, + "loss": 0.545, + "step": 68020 + }, + { + "epoch": 1.5142004985754984, + "grad_norm": 0.6486870050430298, + "learning_rate": 0.00019706290480352848, + "loss": 0.4998, + "step": 68030 + }, + { + "epoch": 1.5144230769230769, + "grad_norm": 0.7330490350723267, + "learning_rate": 0.0001970162876768141, + "loss": 0.6018, + "step": 68040 + }, + { + "epoch": 1.5146456552706553, + "grad_norm": 0.5175281763076782, + "learning_rate": 0.00019696967071223712, + "loss": 0.5314, + "step": 68050 + }, + { + "epoch": 1.5148682336182335, + "grad_norm": 0.6107418537139893, + "learning_rate": 0.0001969230539123307, + "loss": 0.5814, + "step": 68060 + }, + { + "epoch": 1.515090811965812, + "grad_norm": 0.5667102336883545, + "learning_rate": 0.00019687643727962802, + "loss": 0.5239, + "step": 68070 + }, + { + "epoch": 1.5153133903133904, + "grad_norm": 0.6517727971076965, + "learning_rate": 0.00019682982081666234, + "loss": 0.5387, + "step": 68080 + }, + { + "epoch": 1.5155359686609686, + "grad_norm": 0.44955015182495117, + "learning_rate": 0.00019678320452596682, + "loss": 0.5079, + "step": 68090 + }, + { + "epoch": 1.515758547008547, + "grad_norm": 0.379574179649353, + "learning_rate": 0.00019673658841007455, + "loss": 0.5597, + "step": 68100 + }, + { + "epoch": 1.5159811253561255, + "grad_norm": 0.7952883839607239, + "learning_rate": 0.00019668997247151873, + "loss": 0.5436, + "step": 68110 + }, + { + "epoch": 1.5162037037037037, + "grad_norm": 0.5392372012138367, + "learning_rate": 0.00019664335671283254, + "loss": 0.5144, + "step": 68120 + }, + { + "epoch": 1.516426282051282, + "grad_norm": 0.3944644033908844, + "learning_rate": 0.00019659674113654916, + "loss": 0.477, + "step": 68130 + }, + { + "epoch": 1.5166488603988604, + "grad_norm": 0.44683077931404114, + "learning_rate": 0.00019655012574520158, + "loss": 0.4392, + "step": 68140 + }, + { + "epoch": 1.5168714387464388, + "grad_norm": 0.5390403866767883, + "learning_rate": 0.00019650351054132298, + "loss": 0.4811, + "step": 68150 + }, + { + "epoch": 1.517094017094017, + "grad_norm": 0.7489467859268188, + "learning_rate": 0.00019645689552744651, + "loss": 0.6996, + "step": 68160 + }, + { + "epoch": 1.5173165954415955, + "grad_norm": 0.5423558950424194, + "learning_rate": 0.00019641028070610522, + "loss": 0.4971, + "step": 68170 + }, + { + "epoch": 1.517539173789174, + "grad_norm": 0.7267695069313049, + "learning_rate": 0.00019636366607983218, + "loss": 0.7174, + "step": 68180 + }, + { + "epoch": 1.5177617521367521, + "grad_norm": 0.6742967963218689, + "learning_rate": 0.00019631705165116056, + "loss": 0.5926, + "step": 68190 + }, + { + "epoch": 1.5179843304843303, + "grad_norm": 0.6137351989746094, + "learning_rate": 0.0001962704374226233, + "loss": 0.5389, + "step": 68200 + }, + { + "epoch": 1.5182069088319088, + "grad_norm": 0.5949881076812744, + "learning_rate": 0.0001962238233967536, + "loss": 0.502, + "step": 68210 + }, + { + "epoch": 1.5184294871794872, + "grad_norm": 0.5518471002578735, + "learning_rate": 0.00019617720957608437, + "loss": 0.4403, + "step": 68220 + }, + { + "epoch": 1.5186520655270654, + "grad_norm": 0.6174115538597107, + "learning_rate": 0.00019613059596314877, + "loss": 0.6888, + "step": 68230 + }, + { + "epoch": 1.5188746438746439, + "grad_norm": 0.4383307695388794, + "learning_rate": 0.00019608398256047967, + "loss": 0.4862, + "step": 68240 + }, + { + "epoch": 1.5190972222222223, + "grad_norm": 0.598969578742981, + "learning_rate": 0.0001960373693706102, + "loss": 0.5032, + "step": 68250 + }, + { + "epoch": 1.5193198005698005, + "grad_norm": 0.4754716455936432, + "learning_rate": 0.00019599075639607338, + "loss": 0.5774, + "step": 68260 + }, + { + "epoch": 1.5195423789173788, + "grad_norm": 0.6664845943450928, + "learning_rate": 0.0001959441436394021, + "loss": 0.7143, + "step": 68270 + }, + { + "epoch": 1.5197649572649574, + "grad_norm": 0.6848062872886658, + "learning_rate": 0.00019589753110312936, + "loss": 0.4915, + "step": 68280 + }, + { + "epoch": 1.5199875356125356, + "grad_norm": 0.7403508424758911, + "learning_rate": 0.0001958509187897881, + "loss": 0.4748, + "step": 68290 + }, + { + "epoch": 1.5202101139601139, + "grad_norm": 0.5345255732536316, + "learning_rate": 0.00019580430670191142, + "loss": 0.4903, + "step": 68300 + }, + { + "epoch": 1.5204326923076923, + "grad_norm": 0.5372764468193054, + "learning_rate": 0.00019575769484203205, + "loss": 0.6183, + "step": 68310 + }, + { + "epoch": 1.5206552706552707, + "grad_norm": 0.6304016709327698, + "learning_rate": 0.000195711083212683, + "loss": 0.5801, + "step": 68320 + }, + { + "epoch": 1.520877849002849, + "grad_norm": 0.8080713748931885, + "learning_rate": 0.0001956644718163973, + "loss": 0.5597, + "step": 68330 + }, + { + "epoch": 1.5211004273504274, + "grad_norm": 0.40685272216796875, + "learning_rate": 0.00019561786065570765, + "loss": 0.5366, + "step": 68340 + }, + { + "epoch": 1.5213230056980058, + "grad_norm": 0.5551041960716248, + "learning_rate": 0.0001955712497331471, + "loss": 0.5362, + "step": 68350 + }, + { + "epoch": 1.521545584045584, + "grad_norm": 0.659697949886322, + "learning_rate": 0.0001955246390512484, + "loss": 0.5348, + "step": 68360 + }, + { + "epoch": 1.5217681623931623, + "grad_norm": 0.5986943244934082, + "learning_rate": 0.00019547802861254456, + "loss": 0.5273, + "step": 68370 + }, + { + "epoch": 1.5219907407407407, + "grad_norm": 0.4209531247615814, + "learning_rate": 0.0001954314184195683, + "loss": 0.5386, + "step": 68380 + }, + { + "epoch": 1.5222133190883191, + "grad_norm": 0.6699190139770508, + "learning_rate": 0.00019538480847485257, + "loss": 0.6869, + "step": 68390 + }, + { + "epoch": 1.5224358974358974, + "grad_norm": 0.7325060963630676, + "learning_rate": 0.00019533819878093006, + "loss": 0.4593, + "step": 68400 + }, + { + "epoch": 1.5226584757834758, + "grad_norm": 0.5187638401985168, + "learning_rate": 0.0001952915893403337, + "loss": 0.4833, + "step": 68410 + }, + { + "epoch": 1.5228810541310542, + "grad_norm": 0.640946090221405, + "learning_rate": 0.00019524498015559616, + "loss": 0.5063, + "step": 68420 + }, + { + "epoch": 1.5231036324786325, + "grad_norm": 0.6378102898597717, + "learning_rate": 0.0001951983712292503, + "loss": 0.5305, + "step": 68430 + }, + { + "epoch": 1.5233262108262107, + "grad_norm": 0.8082734942436218, + "learning_rate": 0.0001951517625638289, + "loss": 0.5905, + "step": 68440 + }, + { + "epoch": 1.5235487891737893, + "grad_norm": 0.6392168998718262, + "learning_rate": 0.0001951051541618647, + "loss": 0.4997, + "step": 68450 + }, + { + "epoch": 1.5237713675213675, + "grad_norm": 0.5380043983459473, + "learning_rate": 0.0001950585460258904, + "loss": 0.4804, + "step": 68460 + }, + { + "epoch": 1.5239939458689458, + "grad_norm": 0.7026430368423462, + "learning_rate": 0.0001950119381584388, + "loss": 0.5851, + "step": 68470 + }, + { + "epoch": 1.5242165242165242, + "grad_norm": 0.4975331723690033, + "learning_rate": 0.0001949653305620425, + "loss": 0.5668, + "step": 68480 + }, + { + "epoch": 1.5244391025641026, + "grad_norm": 0.5272009372711182, + "learning_rate": 0.00019491872323923427, + "loss": 0.5599, + "step": 68490 + }, + { + "epoch": 1.5246616809116809, + "grad_norm": 0.47747164964675903, + "learning_rate": 0.00019487211619254684, + "loss": 0.4989, + "step": 68500 + }, + { + "epoch": 1.5248842592592593, + "grad_norm": 0.8575394153594971, + "learning_rate": 0.00019482550942451275, + "loss": 0.4727, + "step": 68510 + }, + { + "epoch": 1.5251068376068377, + "grad_norm": 0.37729412317276, + "learning_rate": 0.00019477890293766482, + "loss": 0.5011, + "step": 68520 + }, + { + "epoch": 1.525329415954416, + "grad_norm": 0.4902661144733429, + "learning_rate": 0.00019473229673453547, + "loss": 0.4458, + "step": 68530 + }, + { + "epoch": 1.5255519943019942, + "grad_norm": 0.45972684025764465, + "learning_rate": 0.00019468569081765744, + "loss": 0.603, + "step": 68540 + }, + { + "epoch": 1.5257745726495726, + "grad_norm": 0.5463181138038635, + "learning_rate": 0.00019463908518956336, + "loss": 0.56, + "step": 68550 + }, + { + "epoch": 1.525997150997151, + "grad_norm": 0.4450381100177765, + "learning_rate": 0.00019459247985278576, + "loss": 0.67, + "step": 68560 + }, + { + "epoch": 1.5262197293447293, + "grad_norm": 0.6549626588821411, + "learning_rate": 0.0001945458748098572, + "loss": 0.6309, + "step": 68570 + }, + { + "epoch": 1.5264423076923077, + "grad_norm": 0.6377149820327759, + "learning_rate": 0.00019449927006331033, + "loss": 0.431, + "step": 68580 + }, + { + "epoch": 1.5266648860398861, + "grad_norm": 0.38826367259025574, + "learning_rate": 0.00019445266561567755, + "loss": 0.5232, + "step": 68590 + }, + { + "epoch": 1.5268874643874644, + "grad_norm": 0.5607532262802124, + "learning_rate": 0.0001944060614694915, + "loss": 0.663, + "step": 68600 + }, + { + "epoch": 1.5271100427350426, + "grad_norm": 0.3576250374317169, + "learning_rate": 0.00019435945762728469, + "loss": 0.5527, + "step": 68610 + }, + { + "epoch": 1.5273326210826212, + "grad_norm": 0.8259841203689575, + "learning_rate": 0.00019431285409158953, + "loss": 0.4382, + "step": 68620 + }, + { + "epoch": 1.5275551994301995, + "grad_norm": 0.5837790966033936, + "learning_rate": 0.0001942662508649385, + "loss": 0.4992, + "step": 68630 + }, + { + "epoch": 1.5277777777777777, + "grad_norm": 0.6744682192802429, + "learning_rate": 0.00019421964794986415, + "loss": 0.5334, + "step": 68640 + }, + { + "epoch": 1.5280003561253561, + "grad_norm": 0.599739134311676, + "learning_rate": 0.00019417304534889888, + "loss": 0.5101, + "step": 68650 + }, + { + "epoch": 1.5282229344729346, + "grad_norm": 0.42003190517425537, + "learning_rate": 0.0001941264430645751, + "loss": 0.4881, + "step": 68660 + }, + { + "epoch": 1.5284455128205128, + "grad_norm": 0.5991514325141907, + "learning_rate": 0.00019407984109942513, + "loss": 0.6502, + "step": 68670 + }, + { + "epoch": 1.5286680911680912, + "grad_norm": 0.7176218628883362, + "learning_rate": 0.0001940332394559815, + "loss": 0.5538, + "step": 68680 + }, + { + "epoch": 1.5288906695156697, + "grad_norm": 0.46449580788612366, + "learning_rate": 0.00019398663813677652, + "loss": 0.55, + "step": 68690 + }, + { + "epoch": 1.5291132478632479, + "grad_norm": 0.5694019794464111, + "learning_rate": 0.0001939400371443425, + "loss": 0.5211, + "step": 68700 + }, + { + "epoch": 1.529335826210826, + "grad_norm": 0.7176836729049683, + "learning_rate": 0.00019389343648121185, + "loss": 0.5964, + "step": 68710 + }, + { + "epoch": 1.5295584045584045, + "grad_norm": 0.5984424948692322, + "learning_rate": 0.0001938468361499169, + "loss": 0.4925, + "step": 68720 + }, + { + "epoch": 1.529780982905983, + "grad_norm": 0.6754959225654602, + "learning_rate": 0.00019380023615298984, + "loss": 0.5136, + "step": 68730 + }, + { + "epoch": 1.5300035612535612, + "grad_norm": 0.38003575801849365, + "learning_rate": 0.00019375363649296306, + "loss": 0.5454, + "step": 68740 + }, + { + "epoch": 1.5302261396011396, + "grad_norm": 0.5140901803970337, + "learning_rate": 0.0001937070371723688, + "loss": 0.6238, + "step": 68750 + }, + { + "epoch": 1.530448717948718, + "grad_norm": 0.6815766096115112, + "learning_rate": 0.00019366043819373928, + "loss": 0.5283, + "step": 68760 + }, + { + "epoch": 1.5306712962962963, + "grad_norm": 0.8449596166610718, + "learning_rate": 0.0001936138395596067, + "loss": 0.6162, + "step": 68770 + }, + { + "epoch": 1.5308938746438745, + "grad_norm": 0.4473537504673004, + "learning_rate": 0.0001935672412725034, + "loss": 0.5368, + "step": 68780 + }, + { + "epoch": 1.531116452991453, + "grad_norm": 0.6608182191848755, + "learning_rate": 0.0001935206433349614, + "loss": 0.6517, + "step": 68790 + }, + { + "epoch": 1.5313390313390314, + "grad_norm": 1.1109671592712402, + "learning_rate": 0.000193474045749513, + "loss": 0.5815, + "step": 68800 + }, + { + "epoch": 1.5315616096866096, + "grad_norm": 0.6018622517585754, + "learning_rate": 0.00019342744851869024, + "loss": 0.4961, + "step": 68810 + }, + { + "epoch": 1.531784188034188, + "grad_norm": 0.4928274154663086, + "learning_rate": 0.0001933808516450253, + "loss": 0.515, + "step": 68820 + }, + { + "epoch": 1.5320067663817665, + "grad_norm": 0.6179516911506653, + "learning_rate": 0.00019333425513105038, + "loss": 0.6854, + "step": 68830 + }, + { + "epoch": 1.5322293447293447, + "grad_norm": 0.7085897922515869, + "learning_rate": 0.00019328765897929742, + "loss": 0.6374, + "step": 68840 + }, + { + "epoch": 1.5324519230769231, + "grad_norm": 0.874418318271637, + "learning_rate": 0.00019324106319229856, + "loss": 0.569, + "step": 68850 + }, + { + "epoch": 1.5326745014245016, + "grad_norm": 0.7772431969642639, + "learning_rate": 0.00019319446777258593, + "loss": 0.4234, + "step": 68860 + }, + { + "epoch": 1.5328970797720798, + "grad_norm": 0.5707485675811768, + "learning_rate": 0.0001931478727226914, + "loss": 0.5065, + "step": 68870 + }, + { + "epoch": 1.533119658119658, + "grad_norm": 0.4485999643802643, + "learning_rate": 0.0001931012780451471, + "loss": 0.5181, + "step": 68880 + }, + { + "epoch": 1.5333422364672364, + "grad_norm": 0.5521811842918396, + "learning_rate": 0.00019305468374248506, + "loss": 0.6346, + "step": 68890 + }, + { + "epoch": 1.5335648148148149, + "grad_norm": 0.7295104265213013, + "learning_rate": 0.00019300808981723714, + "loss": 0.6556, + "step": 68900 + }, + { + "epoch": 1.533787393162393, + "grad_norm": 0.5098261833190918, + "learning_rate": 0.00019296149627193542, + "loss": 0.6277, + "step": 68910 + }, + { + "epoch": 1.5340099715099715, + "grad_norm": 0.4031944274902344, + "learning_rate": 0.0001929149031091117, + "loss": 0.5626, + "step": 68920 + }, + { + "epoch": 1.53423254985755, + "grad_norm": 0.7615691423416138, + "learning_rate": 0.00019286831033129791, + "loss": 0.5249, + "step": 68930 + }, + { + "epoch": 1.5344551282051282, + "grad_norm": 0.8737319707870483, + "learning_rate": 0.00019282171794102602, + "loss": 0.5877, + "step": 68940 + }, + { + "epoch": 1.5346777065527064, + "grad_norm": 0.5554376244544983, + "learning_rate": 0.00019277512594082782, + "loss": 0.3718, + "step": 68950 + }, + { + "epoch": 1.5349002849002849, + "grad_norm": 0.3122378885746002, + "learning_rate": 0.00019272853433323519, + "loss": 0.4532, + "step": 68960 + }, + { + "epoch": 1.5351228632478633, + "grad_norm": 0.4163728952407837, + "learning_rate": 0.00019268194312077998, + "loss": 0.5449, + "step": 68970 + }, + { + "epoch": 1.5353454415954415, + "grad_norm": 0.7278553247451782, + "learning_rate": 0.00019263535230599398, + "loss": 0.5452, + "step": 68980 + }, + { + "epoch": 1.53556801994302, + "grad_norm": 0.5134122371673584, + "learning_rate": 0.0001925887618914089, + "loss": 0.6467, + "step": 68990 + }, + { + "epoch": 1.5357905982905984, + "grad_norm": 0.5455583333969116, + "learning_rate": 0.00019254217187955665, + "loss": 0.5355, + "step": 69000 + }, + { + "epoch": 1.5360131766381766, + "grad_norm": 0.47929325699806213, + "learning_rate": 0.00019249558227296885, + "loss": 0.5137, + "step": 69010 + }, + { + "epoch": 1.5362357549857548, + "grad_norm": 0.7038000822067261, + "learning_rate": 0.00019244899307417724, + "loss": 0.5802, + "step": 69020 + }, + { + "epoch": 1.5364583333333335, + "grad_norm": 0.5524876117706299, + "learning_rate": 0.00019240240428571354, + "loss": 0.6186, + "step": 69030 + }, + { + "epoch": 1.5366809116809117, + "grad_norm": 0.6715249419212341, + "learning_rate": 0.0001923558159101095, + "loss": 0.4595, + "step": 69040 + }, + { + "epoch": 1.53690349002849, + "grad_norm": 0.6825011372566223, + "learning_rate": 0.00019230922794989655, + "loss": 0.528, + "step": 69050 + }, + { + "epoch": 1.5371260683760684, + "grad_norm": 0.8443838953971863, + "learning_rate": 0.00019226264040760649, + "loss": 0.5717, + "step": 69060 + }, + { + "epoch": 1.5373486467236468, + "grad_norm": 0.4754297137260437, + "learning_rate": 0.0001922160532857709, + "loss": 0.6914, + "step": 69070 + }, + { + "epoch": 1.537571225071225, + "grad_norm": 0.7522182464599609, + "learning_rate": 0.00019216946658692132, + "loss": 0.6039, + "step": 69080 + }, + { + "epoch": 1.5377938034188035, + "grad_norm": 0.6504005789756775, + "learning_rate": 0.00019212288031358933, + "loss": 0.5819, + "step": 69090 + }, + { + "epoch": 1.538016381766382, + "grad_norm": 0.479922890663147, + "learning_rate": 0.0001920762944683065, + "loss": 0.6508, + "step": 69100 + }, + { + "epoch": 1.53823896011396, + "grad_norm": 0.6442951560020447, + "learning_rate": 0.00019202970905360432, + "loss": 0.5327, + "step": 69110 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.4072447121143341, + "learning_rate": 0.00019198312407201425, + "loss": 0.4809, + "step": 69120 + }, + { + "epoch": 1.5386841168091168, + "grad_norm": 0.5883573293685913, + "learning_rate": 0.00019193653952606776, + "loss": 0.5166, + "step": 69130 + }, + { + "epoch": 1.5389066951566952, + "grad_norm": 0.4536867141723633, + "learning_rate": 0.00019188995541829636, + "loss": 0.4637, + "step": 69140 + }, + { + "epoch": 1.5391292735042734, + "grad_norm": 0.5501317381858826, + "learning_rate": 0.00019184337175123141, + "loss": 0.5643, + "step": 69150 + }, + { + "epoch": 1.5393518518518519, + "grad_norm": 0.5903235673904419, + "learning_rate": 0.0001917967885274043, + "loss": 0.5332, + "step": 69160 + }, + { + "epoch": 1.5395744301994303, + "grad_norm": 0.6028198599815369, + "learning_rate": 0.00019175020574934646, + "loss": 0.502, + "step": 69170 + }, + { + "epoch": 1.5397970085470085, + "grad_norm": 0.39726942777633667, + "learning_rate": 0.00019170362341958922, + "loss": 0.5645, + "step": 69180 + }, + { + "epoch": 1.5400195868945867, + "grad_norm": 0.5767681002616882, + "learning_rate": 0.0001916570415406638, + "loss": 0.5459, + "step": 69190 + }, + { + "epoch": 1.5402421652421654, + "grad_norm": 0.48167499899864197, + "learning_rate": 0.00019161046011510158, + "loss": 0.4562, + "step": 69200 + }, + { + "epoch": 1.5404647435897436, + "grad_norm": 0.5863054394721985, + "learning_rate": 0.00019156387914543382, + "loss": 0.6115, + "step": 69210 + }, + { + "epoch": 1.5406873219373218, + "grad_norm": 0.5131021738052368, + "learning_rate": 0.0001915172986341918, + "loss": 0.5833, + "step": 69220 + }, + { + "epoch": 1.5409099002849003, + "grad_norm": 0.4436676502227783, + "learning_rate": 0.00019147071858390671, + "loss": 0.4423, + "step": 69230 + }, + { + "epoch": 1.5411324786324787, + "grad_norm": 0.7246342897415161, + "learning_rate": 0.00019142413899710974, + "loss": 0.5924, + "step": 69240 + }, + { + "epoch": 1.541355056980057, + "grad_norm": 0.8733262419700623, + "learning_rate": 0.0001913775598763321, + "loss": 0.5369, + "step": 69250 + }, + { + "epoch": 1.5415776353276354, + "grad_norm": 0.5569106340408325, + "learning_rate": 0.0001913309812241049, + "loss": 0.5274, + "step": 69260 + }, + { + "epoch": 1.5418002136752138, + "grad_norm": 0.6295933127403259, + "learning_rate": 0.00019128440304295926, + "loss": 0.5813, + "step": 69270 + }, + { + "epoch": 1.542022792022792, + "grad_norm": 0.9999169707298279, + "learning_rate": 0.00019123782533542633, + "loss": 0.5836, + "step": 69280 + }, + { + "epoch": 1.5422453703703702, + "grad_norm": 0.44312646985054016, + "learning_rate": 0.00019119124810403713, + "loss": 0.6223, + "step": 69290 + }, + { + "epoch": 1.5424679487179487, + "grad_norm": 0.5661088228225708, + "learning_rate": 0.00019114467135132268, + "loss": 0.4703, + "step": 69300 + }, + { + "epoch": 1.5426905270655271, + "grad_norm": 0.6291427612304688, + "learning_rate": 0.00019109809507981414, + "loss": 0.6484, + "step": 69310 + }, + { + "epoch": 1.5429131054131053, + "grad_norm": 0.624117910861969, + "learning_rate": 0.00019105151929204236, + "loss": 0.4465, + "step": 69320 + }, + { + "epoch": 1.5431356837606838, + "grad_norm": 0.5193196535110474, + "learning_rate": 0.00019100494399053832, + "loss": 0.6065, + "step": 69330 + }, + { + "epoch": 1.5433582621082622, + "grad_norm": 0.5729727149009705, + "learning_rate": 0.000190958369177833, + "loss": 0.6657, + "step": 69340 + }, + { + "epoch": 1.5435808404558404, + "grad_norm": 0.7318823933601379, + "learning_rate": 0.00019091179485645728, + "loss": 0.5558, + "step": 69350 + }, + { + "epoch": 1.5438034188034186, + "grad_norm": 0.6170826554298401, + "learning_rate": 0.00019086522102894208, + "loss": 0.5517, + "step": 69360 + }, + { + "epoch": 1.5440259971509973, + "grad_norm": 0.5891059637069702, + "learning_rate": 0.00019081864769781822, + "loss": 0.5764, + "step": 69370 + }, + { + "epoch": 1.5442485754985755, + "grad_norm": 0.7347457408905029, + "learning_rate": 0.00019077207486561658, + "loss": 0.522, + "step": 69380 + }, + { + "epoch": 1.5444711538461537, + "grad_norm": 0.6764406561851501, + "learning_rate": 0.00019072550253486798, + "loss": 0.6101, + "step": 69390 + }, + { + "epoch": 1.5446937321937322, + "grad_norm": 0.34939131140708923, + "learning_rate": 0.00019067893070810312, + "loss": 0.5842, + "step": 69400 + }, + { + "epoch": 1.5449163105413106, + "grad_norm": 0.5678504705429077, + "learning_rate": 0.0001906323593878528, + "loss": 0.6085, + "step": 69410 + }, + { + "epoch": 1.5451388888888888, + "grad_norm": 0.4918629229068756, + "learning_rate": 0.00019058578857664778, + "loss": 0.6456, + "step": 69420 + }, + { + "epoch": 1.5453614672364673, + "grad_norm": 0.636618435382843, + "learning_rate": 0.00019053921827701865, + "loss": 0.5643, + "step": 69430 + }, + { + "epoch": 1.5455840455840457, + "grad_norm": 0.4326227605342865, + "learning_rate": 0.00019049264849149627, + "loss": 0.5054, + "step": 69440 + }, + { + "epoch": 1.545806623931624, + "grad_norm": 0.6106733083724976, + "learning_rate": 0.00019044607922261104, + "loss": 0.5299, + "step": 69450 + }, + { + "epoch": 1.5460292022792022, + "grad_norm": 0.5556911826133728, + "learning_rate": 0.00019039951047289375, + "loss": 0.5381, + "step": 69460 + }, + { + "epoch": 1.5462517806267806, + "grad_norm": 0.3829481303691864, + "learning_rate": 0.00019035294224487487, + "loss": 0.4977, + "step": 69470 + }, + { + "epoch": 1.546474358974359, + "grad_norm": 0.35006266832351685, + "learning_rate": 0.000190306374541085, + "loss": 0.5947, + "step": 69480 + }, + { + "epoch": 1.5466969373219372, + "grad_norm": 0.6910932660102844, + "learning_rate": 0.0001902598073640547, + "loss": 0.5648, + "step": 69490 + }, + { + "epoch": 1.5469195156695157, + "grad_norm": 0.5231965184211731, + "learning_rate": 0.00019021324071631442, + "loss": 0.5198, + "step": 69500 + }, + { + "epoch": 1.5471420940170941, + "grad_norm": 0.5858936309814453, + "learning_rate": 0.00019016667460039466, + "loss": 0.6829, + "step": 69510 + }, + { + "epoch": 1.5473646723646723, + "grad_norm": 0.5553900003433228, + "learning_rate": 0.00019012010901882584, + "loss": 0.5341, + "step": 69520 + }, + { + "epoch": 1.5475872507122506, + "grad_norm": 0.7034667134284973, + "learning_rate": 0.0001900735439741384, + "loss": 0.5864, + "step": 69530 + }, + { + "epoch": 1.5478098290598292, + "grad_norm": 0.6792713403701782, + "learning_rate": 0.00019002697946886272, + "loss": 0.6046, + "step": 69540 + }, + { + "epoch": 1.5480324074074074, + "grad_norm": 0.8406374454498291, + "learning_rate": 0.0001899804155055291, + "loss": 0.5032, + "step": 69550 + }, + { + "epoch": 1.5482549857549857, + "grad_norm": 0.5789881944656372, + "learning_rate": 0.00018993385208666797, + "loss": 0.5651, + "step": 69560 + }, + { + "epoch": 1.548477564102564, + "grad_norm": 0.732040286064148, + "learning_rate": 0.0001898872892148096, + "loss": 0.5107, + "step": 69570 + }, + { + "epoch": 1.5487001424501425, + "grad_norm": 0.7222851514816284, + "learning_rate": 0.0001898407268924841, + "loss": 0.5173, + "step": 69580 + }, + { + "epoch": 1.5489227207977208, + "grad_norm": 1.1726871728897095, + "learning_rate": 0.00018979416512222182, + "loss": 0.5663, + "step": 69590 + }, + { + "epoch": 1.5491452991452992, + "grad_norm": 0.6467028260231018, + "learning_rate": 0.000189747603906553, + "loss": 0.5539, + "step": 69600 + }, + { + "epoch": 1.5493678774928776, + "grad_norm": 0.4506545662879944, + "learning_rate": 0.00018970104324800776, + "loss": 0.6448, + "step": 69610 + }, + { + "epoch": 1.5495904558404558, + "grad_norm": 0.4832093417644501, + "learning_rate": 0.00018965448314911627, + "loss": 0.5298, + "step": 69620 + }, + { + "epoch": 1.549813034188034, + "grad_norm": 0.6158397197723389, + "learning_rate": 0.00018960792361240867, + "loss": 0.5429, + "step": 69630 + }, + { + "epoch": 1.5500356125356125, + "grad_norm": 0.4832749664783478, + "learning_rate": 0.00018956136464041493, + "loss": 0.5784, + "step": 69640 + }, + { + "epoch": 1.550258190883191, + "grad_norm": 0.7306379079818726, + "learning_rate": 0.00018951480623566523, + "loss": 0.6367, + "step": 69650 + }, + { + "epoch": 1.5504807692307692, + "grad_norm": 0.563369631767273, + "learning_rate": 0.0001894682484006895, + "loss": 0.4721, + "step": 69660 + }, + { + "epoch": 1.5507033475783476, + "grad_norm": 0.4590865671634674, + "learning_rate": 0.00018942169113801783, + "loss": 0.5756, + "step": 69670 + }, + { + "epoch": 1.550925925925926, + "grad_norm": 0.8247950673103333, + "learning_rate": 0.00018937513445018008, + "loss": 0.5036, + "step": 69680 + }, + { + "epoch": 1.5511485042735043, + "grad_norm": 0.4580720365047455, + "learning_rate": 0.00018932857833970626, + "loss": 0.5799, + "step": 69690 + }, + { + "epoch": 1.5513710826210825, + "grad_norm": 0.6762135028839111, + "learning_rate": 0.0001892820228091263, + "loss": 0.489, + "step": 69700 + }, + { + "epoch": 1.551593660968661, + "grad_norm": 0.6307932734489441, + "learning_rate": 0.0001892354678609699, + "loss": 0.6051, + "step": 69710 + }, + { + "epoch": 1.5518162393162394, + "grad_norm": 0.6217736601829529, + "learning_rate": 0.000189188913497767, + "loss": 0.5964, + "step": 69720 + }, + { + "epoch": 1.5520388176638176, + "grad_norm": 0.47461748123168945, + "learning_rate": 0.00018914235972204737, + "loss": 0.6002, + "step": 69730 + }, + { + "epoch": 1.552261396011396, + "grad_norm": 0.5183067917823792, + "learning_rate": 0.00018909580653634085, + "loss": 0.5517, + "step": 69740 + }, + { + "epoch": 1.5524839743589745, + "grad_norm": 0.5647053122520447, + "learning_rate": 0.00018904925394317709, + "loss": 0.4627, + "step": 69750 + }, + { + "epoch": 1.5527065527065527, + "grad_norm": 0.5931477546691895, + "learning_rate": 0.00018900270194508581, + "loss": 0.5838, + "step": 69760 + }, + { + "epoch": 1.552929131054131, + "grad_norm": 0.74869304895401, + "learning_rate": 0.00018895615054459678, + "loss": 0.572, + "step": 69770 + }, + { + "epoch": 1.5531517094017095, + "grad_norm": 0.47858738899230957, + "learning_rate": 0.0001889095997442395, + "loss": 0.5936, + "step": 69780 + }, + { + "epoch": 1.5533742877492878, + "grad_norm": 0.5192180871963501, + "learning_rate": 0.00018886304954654365, + "loss": 0.5538, + "step": 69790 + }, + { + "epoch": 1.553596866096866, + "grad_norm": 0.5217307806015015, + "learning_rate": 0.00018881649995403888, + "loss": 0.5716, + "step": 69800 + }, + { + "epoch": 1.5538194444444444, + "grad_norm": 0.4526553452014923, + "learning_rate": 0.0001887699509692546, + "loss": 0.5187, + "step": 69810 + }, + { + "epoch": 1.5540420227920229, + "grad_norm": 0.4228460192680359, + "learning_rate": 0.00018872340259472035, + "loss": 0.6612, + "step": 69820 + }, + { + "epoch": 1.554264601139601, + "grad_norm": 0.46403852105140686, + "learning_rate": 0.0001886768548329658, + "loss": 0.5794, + "step": 69830 + }, + { + "epoch": 1.5544871794871795, + "grad_norm": 0.6431500911712646, + "learning_rate": 0.00018863030768652005, + "loss": 0.5492, + "step": 69840 + }, + { + "epoch": 1.554709757834758, + "grad_norm": 0.5244579911231995, + "learning_rate": 0.00018858376115791277, + "loss": 0.5452, + "step": 69850 + }, + { + "epoch": 1.5549323361823362, + "grad_norm": 0.5913880467414856, + "learning_rate": 0.00018853721524967322, + "loss": 0.5058, + "step": 69860 + }, + { + "epoch": 1.5551549145299144, + "grad_norm": 0.7027416825294495, + "learning_rate": 0.0001884906699643308, + "loss": 0.5089, + "step": 69870 + }, + { + "epoch": 1.5553774928774928, + "grad_norm": 0.7109727263450623, + "learning_rate": 0.00018844412530441478, + "loss": 0.533, + "step": 69880 + }, + { + "epoch": 1.5556000712250713, + "grad_norm": 0.5859348177909851, + "learning_rate": 0.00018839758127245444, + "loss": 0.5419, + "step": 69890 + }, + { + "epoch": 1.5558226495726495, + "grad_norm": 0.7551231980323792, + "learning_rate": 0.00018835103787097902, + "loss": 0.6146, + "step": 69900 + }, + { + "epoch": 1.556045227920228, + "grad_norm": 0.548856258392334, + "learning_rate": 0.00018830449510251777, + "loss": 0.6349, + "step": 69910 + }, + { + "epoch": 1.5562678062678064, + "grad_norm": 0.5823180079460144, + "learning_rate": 0.00018825795296959982, + "loss": 0.6391, + "step": 69920 + }, + { + "epoch": 1.5564903846153846, + "grad_norm": 0.6126546859741211, + "learning_rate": 0.00018821141147475428, + "loss": 0.433, + "step": 69930 + }, + { + "epoch": 1.5567129629629628, + "grad_norm": 0.38551953434944153, + "learning_rate": 0.0001881648706205104, + "loss": 0.5385, + "step": 69940 + }, + { + "epoch": 1.5569355413105415, + "grad_norm": 0.48852142691612244, + "learning_rate": 0.00018811833040939703, + "loss": 0.5742, + "step": 69950 + }, + { + "epoch": 1.5571581196581197, + "grad_norm": 0.6741325855255127, + "learning_rate": 0.00018807179084394344, + "loss": 0.5228, + "step": 69960 + }, + { + "epoch": 1.557380698005698, + "grad_norm": 0.4975391924381256, + "learning_rate": 0.0001880252519266784, + "loss": 0.5911, + "step": 69970 + }, + { + "epoch": 1.5576032763532763, + "grad_norm": 0.868884265422821, + "learning_rate": 0.00018797871366013096, + "loss": 0.6094, + "step": 69980 + }, + { + "epoch": 1.5578258547008548, + "grad_norm": 0.5068603157997131, + "learning_rate": 0.0001879321760468301, + "loss": 0.5399, + "step": 69990 + }, + { + "epoch": 1.558048433048433, + "grad_norm": 0.4866209924221039, + "learning_rate": 0.00018788563908930466, + "loss": 0.656, + "step": 70000 + }, + { + "epoch": 1.5582710113960114, + "grad_norm": 0.5103034377098083, + "learning_rate": 0.0001878391027900835, + "loss": 0.5125, + "step": 70010 + }, + { + "epoch": 1.5584935897435899, + "grad_norm": 0.5046936869621277, + "learning_rate": 0.00018779256715169547, + "loss": 0.6247, + "step": 70020 + }, + { + "epoch": 1.558716168091168, + "grad_norm": 0.9371962547302246, + "learning_rate": 0.00018774603217666932, + "loss": 0.6255, + "step": 70030 + }, + { + "epoch": 1.5589387464387463, + "grad_norm": 0.5806150436401367, + "learning_rate": 0.00018769949786753381, + "loss": 0.5952, + "step": 70040 + }, + { + "epoch": 1.5591613247863247, + "grad_norm": 0.5372239947319031, + "learning_rate": 0.00018765296422681765, + "loss": 0.5557, + "step": 70050 + }, + { + "epoch": 1.5593839031339032, + "grad_norm": 0.42751508951187134, + "learning_rate": 0.00018760643125704954, + "loss": 0.6372, + "step": 70060 + }, + { + "epoch": 1.5596064814814814, + "grad_norm": 0.3532927632331848, + "learning_rate": 0.00018755989896075809, + "loss": 0.5215, + "step": 70070 + }, + { + "epoch": 1.5598290598290598, + "grad_norm": 0.6899681687355042, + "learning_rate": 0.00018751336734047194, + "loss": 0.5297, + "step": 70080 + }, + { + "epoch": 1.5600516381766383, + "grad_norm": 0.4909971356391907, + "learning_rate": 0.00018746683639871964, + "loss": 0.5418, + "step": 70090 + }, + { + "epoch": 1.5601851851851851, + "eval_loss": 0.564594566822052, + "eval_runtime": 337.0867, + "eval_samples_per_second": 7.016, + "eval_steps_per_second": 7.016, + "step": 70096 + }, + { + "epoch": 1.5602742165242165, + "grad_norm": 0.5869930982589722, + "learning_rate": 0.0001874203061380297, + "loss": 0.6224, + "step": 70100 + }, + { + "epoch": 1.5604967948717947, + "grad_norm": 0.653296709060669, + "learning_rate": 0.0001873737765609306, + "loss": 0.5368, + "step": 70110 + }, + { + "epoch": 1.5607193732193734, + "grad_norm": 0.6351401209831238, + "learning_rate": 0.0001873272476699508, + "loss": 0.6583, + "step": 70120 + }, + { + "epoch": 1.5609419515669516, + "grad_norm": 0.6833685040473938, + "learning_rate": 0.0001872807194676188, + "loss": 0.502, + "step": 70130 + }, + { + "epoch": 1.5611645299145298, + "grad_norm": 0.5794316530227661, + "learning_rate": 0.00018723419195646284, + "loss": 0.5486, + "step": 70140 + }, + { + "epoch": 1.5613871082621082, + "grad_norm": 0.6334303021430969, + "learning_rate": 0.00018718766513901134, + "loss": 0.5893, + "step": 70150 + }, + { + "epoch": 1.5616096866096867, + "grad_norm": 0.3508698642253876, + "learning_rate": 0.00018714113901779266, + "loss": 0.4961, + "step": 70160 + }, + { + "epoch": 1.561832264957265, + "grad_norm": 0.44076502323150635, + "learning_rate": 0.000187094613595335, + "loss": 0.5079, + "step": 70170 + }, + { + "epoch": 1.5620548433048433, + "grad_norm": 0.5710964202880859, + "learning_rate": 0.00018704808887416656, + "loss": 0.6312, + "step": 70180 + }, + { + "epoch": 1.5622774216524218, + "grad_norm": 0.34635406732559204, + "learning_rate": 0.00018700156485681563, + "loss": 0.5937, + "step": 70190 + }, + { + "epoch": 1.5625, + "grad_norm": 0.46365317702293396, + "learning_rate": 0.00018695504154581026, + "loss": 0.4601, + "step": 70200 + }, + { + "epoch": 1.5627225783475782, + "grad_norm": 0.7181133031845093, + "learning_rate": 0.00018690851894367864, + "loss": 0.5682, + "step": 70210 + }, + { + "epoch": 1.5629451566951567, + "grad_norm": 1.1888110637664795, + "learning_rate": 0.00018686199705294883, + "loss": 0.5758, + "step": 70220 + }, + { + "epoch": 1.563167735042735, + "grad_norm": 0.5635021924972534, + "learning_rate": 0.00018681547587614888, + "loss": 0.4943, + "step": 70230 + }, + { + "epoch": 1.5633903133903133, + "grad_norm": 0.7435877323150635, + "learning_rate": 0.00018676895541580674, + "loss": 0.4574, + "step": 70240 + }, + { + "epoch": 1.5636128917378918, + "grad_norm": 0.5918731093406677, + "learning_rate": 0.00018672243567445035, + "loss": 0.5346, + "step": 70250 + }, + { + "epoch": 1.5638354700854702, + "grad_norm": 0.5541383624076843, + "learning_rate": 0.00018667591665460769, + "loss": 0.5451, + "step": 70260 + }, + { + "epoch": 1.5640580484330484, + "grad_norm": 0.8829107880592346, + "learning_rate": 0.00018662939835880667, + "loss": 0.6155, + "step": 70270 + }, + { + "epoch": 1.5642806267806266, + "grad_norm": 0.5331526398658752, + "learning_rate": 0.00018658288078957503, + "loss": 0.492, + "step": 70280 + }, + { + "epoch": 1.5645032051282053, + "grad_norm": 0.47868871688842773, + "learning_rate": 0.00018653636394944066, + "loss": 0.5991, + "step": 70290 + }, + { + "epoch": 1.5647257834757835, + "grad_norm": 0.918210506439209, + "learning_rate": 0.0001864898478409313, + "loss": 0.5491, + "step": 70300 + }, + { + "epoch": 1.5649483618233617, + "grad_norm": 0.48229533433914185, + "learning_rate": 0.00018644333246657467, + "loss": 0.4871, + "step": 70310 + }, + { + "epoch": 1.5651709401709402, + "grad_norm": 0.6398565769195557, + "learning_rate": 0.00018639681782889843, + "loss": 0.7005, + "step": 70320 + }, + { + "epoch": 1.5653935185185186, + "grad_norm": 0.43786656856536865, + "learning_rate": 0.00018635030393043028, + "loss": 0.6143, + "step": 70330 + }, + { + "epoch": 1.5656160968660968, + "grad_norm": 0.5263013243675232, + "learning_rate": 0.00018630379077369774, + "loss": 0.5106, + "step": 70340 + }, + { + "epoch": 1.5658386752136753, + "grad_norm": 0.6509763598442078, + "learning_rate": 0.00018625727836122844, + "loss": 0.6045, + "step": 70350 + }, + { + "epoch": 1.5660612535612537, + "grad_norm": 0.45926612615585327, + "learning_rate": 0.00018621076669554995, + "loss": 0.5444, + "step": 70360 + }, + { + "epoch": 1.566283831908832, + "grad_norm": 0.8301941752433777, + "learning_rate": 0.0001861642557791896, + "loss": 0.6015, + "step": 70370 + }, + { + "epoch": 1.5665064102564101, + "grad_norm": 0.5289263129234314, + "learning_rate": 0.00018611774561467498, + "loss": 0.5817, + "step": 70380 + }, + { + "epoch": 1.5667289886039886, + "grad_norm": 0.49125346541404724, + "learning_rate": 0.0001860712362045334, + "loss": 0.5645, + "step": 70390 + }, + { + "epoch": 1.566951566951567, + "grad_norm": 0.6305233240127563, + "learning_rate": 0.0001860247275512922, + "loss": 0.6307, + "step": 70400 + }, + { + "epoch": 1.5671741452991452, + "grad_norm": 0.7199904322624207, + "learning_rate": 0.00018597821965747884, + "loss": 0.4794, + "step": 70410 + }, + { + "epoch": 1.5673967236467237, + "grad_norm": 0.45911091566085815, + "learning_rate": 0.00018593171252562042, + "loss": 0.6006, + "step": 70420 + }, + { + "epoch": 1.567619301994302, + "grad_norm": 0.6156550049781799, + "learning_rate": 0.00018588520615824428, + "loss": 0.5031, + "step": 70430 + }, + { + "epoch": 1.5678418803418803, + "grad_norm": 0.8003026247024536, + "learning_rate": 0.00018583870055787762, + "loss": 0.4507, + "step": 70440 + }, + { + "epoch": 1.5680644586894585, + "grad_norm": 0.6622931361198425, + "learning_rate": 0.00018579219572704752, + "loss": 0.472, + "step": 70450 + }, + { + "epoch": 1.5682870370370372, + "grad_norm": 0.588738739490509, + "learning_rate": 0.00018574569166828115, + "loss": 0.489, + "step": 70460 + }, + { + "epoch": 1.5685096153846154, + "grad_norm": 0.652209997177124, + "learning_rate": 0.0001856991883841056, + "loss": 0.6046, + "step": 70470 + }, + { + "epoch": 1.5687321937321936, + "grad_norm": 0.5961380004882812, + "learning_rate": 0.00018565268587704783, + "loss": 0.5215, + "step": 70480 + }, + { + "epoch": 1.568954772079772, + "grad_norm": 0.48777759075164795, + "learning_rate": 0.0001856061841496349, + "loss": 0.5625, + "step": 70490 + }, + { + "epoch": 1.5691773504273505, + "grad_norm": 0.4242284297943115, + "learning_rate": 0.00018555968320439368, + "loss": 0.4548, + "step": 70500 + }, + { + "epoch": 1.5693999287749287, + "grad_norm": 0.49864986538887024, + "learning_rate": 0.00018551318304385107, + "loss": 0.5304, + "step": 70510 + }, + { + "epoch": 1.5696225071225072, + "grad_norm": 0.8613521456718445, + "learning_rate": 0.00018546668367053397, + "loss": 0.6085, + "step": 70520 + }, + { + "epoch": 1.5698450854700856, + "grad_norm": 0.49887996912002563, + "learning_rate": 0.00018542018508696916, + "loss": 0.6019, + "step": 70530 + }, + { + "epoch": 1.5700676638176638, + "grad_norm": 0.48798030614852905, + "learning_rate": 0.00018537368729568343, + "loss": 0.5913, + "step": 70540 + }, + { + "epoch": 1.570290242165242, + "grad_norm": 0.3880743384361267, + "learning_rate": 0.00018532719029920353, + "loss": 0.4696, + "step": 70550 + }, + { + "epoch": 1.5705128205128205, + "grad_norm": 0.5356256365776062, + "learning_rate": 0.0001852806941000561, + "loss": 0.5902, + "step": 70560 + }, + { + "epoch": 1.570735398860399, + "grad_norm": 0.8185734748840332, + "learning_rate": 0.0001852341987007678, + "loss": 0.5621, + "step": 70570 + }, + { + "epoch": 1.5709579772079771, + "grad_norm": 0.8998014330863953, + "learning_rate": 0.00018518770410386526, + "loss": 0.4523, + "step": 70580 + }, + { + "epoch": 1.5711805555555556, + "grad_norm": 0.6867089867591858, + "learning_rate": 0.00018514121031187498, + "loss": 0.6557, + "step": 70590 + }, + { + "epoch": 1.571403133903134, + "grad_norm": 0.6047399640083313, + "learning_rate": 0.00018509471732732348, + "loss": 0.5615, + "step": 70600 + }, + { + "epoch": 1.5716257122507122, + "grad_norm": 0.5918436050415039, + "learning_rate": 0.00018504822515273727, + "loss": 0.6941, + "step": 70610 + }, + { + "epoch": 1.5718482905982905, + "grad_norm": 0.673238217830658, + "learning_rate": 0.00018500173379064282, + "loss": 0.5057, + "step": 70620 + }, + { + "epoch": 1.572070868945869, + "grad_norm": 0.3770233988761902, + "learning_rate": 0.00018495524324356631, + "loss": 0.4909, + "step": 70630 + }, + { + "epoch": 1.5722934472934473, + "grad_norm": 0.6354002356529236, + "learning_rate": 0.00018490875351403425, + "loss": 0.5185, + "step": 70640 + }, + { + "epoch": 1.5725160256410255, + "grad_norm": 0.3865412175655365, + "learning_rate": 0.00018486226460457286, + "loss": 0.4186, + "step": 70650 + }, + { + "epoch": 1.572738603988604, + "grad_norm": 0.7606604099273682, + "learning_rate": 0.00018481577651770844, + "loss": 0.522, + "step": 70660 + }, + { + "epoch": 1.5729611823361824, + "grad_norm": 0.45895153284072876, + "learning_rate": 0.0001847692892559671, + "loss": 0.5963, + "step": 70670 + }, + { + "epoch": 1.5731837606837606, + "grad_norm": 0.4174489378929138, + "learning_rate": 0.00018472280282187504, + "loss": 0.4792, + "step": 70680 + }, + { + "epoch": 1.573406339031339, + "grad_norm": 1.0084757804870605, + "learning_rate": 0.00018467631721795846, + "loss": 0.619, + "step": 70690 + }, + { + "epoch": 1.5736289173789175, + "grad_norm": 0.5571333169937134, + "learning_rate": 0.00018462983244674328, + "loss": 0.4807, + "step": 70700 + }, + { + "epoch": 1.5738514957264957, + "grad_norm": 0.8347536325454712, + "learning_rate": 0.0001845833485107556, + "loss": 0.5284, + "step": 70710 + }, + { + "epoch": 1.574074074074074, + "grad_norm": 0.9292858839035034, + "learning_rate": 0.0001845368654125214, + "loss": 0.5157, + "step": 70720 + }, + { + "epoch": 1.5742966524216524, + "grad_norm": 0.639552652835846, + "learning_rate": 0.0001844903831545666, + "loss": 0.567, + "step": 70730 + }, + { + "epoch": 1.5745192307692308, + "grad_norm": 0.7539328336715698, + "learning_rate": 0.00018444390173941703, + "loss": 0.5415, + "step": 70740 + }, + { + "epoch": 1.574741809116809, + "grad_norm": 0.6498465538024902, + "learning_rate": 0.00018439742116959867, + "loss": 0.5269, + "step": 70750 + }, + { + "epoch": 1.5749643874643875, + "grad_norm": 0.511473536491394, + "learning_rate": 0.0001843509414476372, + "loss": 0.5868, + "step": 70760 + }, + { + "epoch": 1.575186965811966, + "grad_norm": 0.4643702805042267, + "learning_rate": 0.0001843044625760583, + "loss": 0.5276, + "step": 70770 + }, + { + "epoch": 1.5754095441595442, + "grad_norm": 0.6674240231513977, + "learning_rate": 0.00018425798455738777, + "loss": 0.4897, + "step": 70780 + }, + { + "epoch": 1.5756321225071224, + "grad_norm": 0.827182948589325, + "learning_rate": 0.00018421150739415131, + "loss": 0.5322, + "step": 70790 + }, + { + "epoch": 1.5758547008547008, + "grad_norm": 0.6789349317550659, + "learning_rate": 0.0001841650310888744, + "loss": 0.5365, + "step": 70800 + }, + { + "epoch": 1.5760772792022792, + "grad_norm": 0.4275471270084381, + "learning_rate": 0.00018411855564408268, + "loss": 0.5983, + "step": 70810 + }, + { + "epoch": 1.5762998575498575, + "grad_norm": 0.5810036659240723, + "learning_rate": 0.00018407208106230161, + "loss": 0.5801, + "step": 70820 + }, + { + "epoch": 1.576522435897436, + "grad_norm": 0.589556872844696, + "learning_rate": 0.0001840256073460568, + "loss": 0.6111, + "step": 70830 + }, + { + "epoch": 1.5767450142450143, + "grad_norm": 0.8313420414924622, + "learning_rate": 0.00018397913449787345, + "loss": 0.5872, + "step": 70840 + }, + { + "epoch": 1.5769675925925926, + "grad_norm": 0.4779322147369385, + "learning_rate": 0.00018393266252027707, + "loss": 0.6764, + "step": 70850 + }, + { + "epoch": 1.5771901709401708, + "grad_norm": 0.6745999455451965, + "learning_rate": 0.00018388619141579301, + "loss": 0.4457, + "step": 70860 + }, + { + "epoch": 1.5774127492877494, + "grad_norm": 0.5358150005340576, + "learning_rate": 0.0001838397211869465, + "loss": 0.5312, + "step": 70870 + }, + { + "epoch": 1.5776353276353277, + "grad_norm": 0.5253196954727173, + "learning_rate": 0.0001837932518362628, + "loss": 0.6305, + "step": 70880 + }, + { + "epoch": 1.5778579059829059, + "grad_norm": 0.592937707901001, + "learning_rate": 0.000183746783366267, + "loss": 0.4918, + "step": 70890 + }, + { + "epoch": 1.5780804843304843, + "grad_norm": 0.5754040479660034, + "learning_rate": 0.00018370031577948431, + "loss": 0.478, + "step": 70900 + }, + { + "epoch": 1.5783030626780628, + "grad_norm": 0.8871728181838989, + "learning_rate": 0.00018365384907843978, + "loss": 0.5147, + "step": 70910 + }, + { + "epoch": 1.578525641025641, + "grad_norm": 0.7244358062744141, + "learning_rate": 0.0001836073832656585, + "loss": 0.5109, + "step": 70920 + }, + { + "epoch": 1.5787482193732194, + "grad_norm": 0.6952163577079773, + "learning_rate": 0.00018356091834366545, + "loss": 0.5046, + "step": 70930 + }, + { + "epoch": 1.5789707977207978, + "grad_norm": 0.5666680335998535, + "learning_rate": 0.0001835144543149855, + "loss": 0.6313, + "step": 70940 + }, + { + "epoch": 1.579193376068376, + "grad_norm": 0.6511092782020569, + "learning_rate": 0.00018346799118214361, + "loss": 0.5232, + "step": 70950 + }, + { + "epoch": 1.5794159544159543, + "grad_norm": 0.4347042739391327, + "learning_rate": 0.00018342152894766463, + "loss": 0.4945, + "step": 70960 + }, + { + "epoch": 1.5796385327635327, + "grad_norm": 0.8085306286811829, + "learning_rate": 0.00018337506761407338, + "loss": 0.5116, + "step": 70970 + }, + { + "epoch": 1.5798611111111112, + "grad_norm": 0.8530517816543579, + "learning_rate": 0.0001833286071838945, + "loss": 0.7389, + "step": 70980 + }, + { + "epoch": 1.5800836894586894, + "grad_norm": 0.6170468926429749, + "learning_rate": 0.00018328214765965276, + "loss": 0.4751, + "step": 70990 + }, + { + "epoch": 1.5803062678062678, + "grad_norm": 0.8308414220809937, + "learning_rate": 0.00018323568904387284, + "loss": 0.6135, + "step": 71000 + }, + { + "epoch": 1.5805288461538463, + "grad_norm": 0.5918954610824585, + "learning_rate": 0.00018318923133907937, + "loss": 0.5327, + "step": 71010 + }, + { + "epoch": 1.5807514245014245, + "grad_norm": 0.7379618287086487, + "learning_rate": 0.0001831427745477967, + "loss": 0.4964, + "step": 71020 + }, + { + "epoch": 1.5809740028490027, + "grad_norm": 0.6630150675773621, + "learning_rate": 0.0001830963186725495, + "loss": 0.5815, + "step": 71030 + }, + { + "epoch": 1.5811965811965814, + "grad_norm": 0.5696947574615479, + "learning_rate": 0.00018304986371586225, + "loss": 0.4795, + "step": 71040 + }, + { + "epoch": 1.5814191595441596, + "grad_norm": 0.6283588409423828, + "learning_rate": 0.00018300340968025917, + "loss": 0.4752, + "step": 71050 + }, + { + "epoch": 1.5816417378917378, + "grad_norm": 0.7472113370895386, + "learning_rate": 0.00018295695656826477, + "loss": 0.4727, + "step": 71060 + }, + { + "epoch": 1.5818643162393162, + "grad_norm": 0.5877878665924072, + "learning_rate": 0.00018291050438240332, + "loss": 0.5458, + "step": 71070 + }, + { + "epoch": 1.5820868945868947, + "grad_norm": 0.9098490476608276, + "learning_rate": 0.000182864053125199, + "loss": 0.6001, + "step": 71080 + }, + { + "epoch": 1.5823094729344729, + "grad_norm": 0.5156393647193909, + "learning_rate": 0.0001828176027991761, + "loss": 0.6244, + "step": 71090 + }, + { + "epoch": 1.5825320512820513, + "grad_norm": 0.6438578963279724, + "learning_rate": 0.00018277115340685875, + "loss": 0.5452, + "step": 71100 + }, + { + "epoch": 1.5827546296296298, + "grad_norm": 0.6225441098213196, + "learning_rate": 0.00018272470495077098, + "loss": 0.5292, + "step": 71110 + }, + { + "epoch": 1.582977207977208, + "grad_norm": 0.5010538101196289, + "learning_rate": 0.0001826782574334369, + "loss": 0.6081, + "step": 71120 + }, + { + "epoch": 1.5831997863247862, + "grad_norm": 0.4037385582923889, + "learning_rate": 0.00018263181085738047, + "loss": 0.5426, + "step": 71130 + }, + { + "epoch": 1.5834223646723646, + "grad_norm": 0.6718563437461853, + "learning_rate": 0.00018258536522512578, + "loss": 0.6698, + "step": 71140 + }, + { + "epoch": 1.583644943019943, + "grad_norm": 0.5091724395751953, + "learning_rate": 0.0001825389205391965, + "loss": 0.6267, + "step": 71150 + }, + { + "epoch": 1.5838675213675213, + "grad_norm": 0.5664081573486328, + "learning_rate": 0.0001824924768021166, + "loss": 0.6177, + "step": 71160 + }, + { + "epoch": 1.5840900997150997, + "grad_norm": 1.1884257793426514, + "learning_rate": 0.00018244603401640981, + "loss": 0.4706, + "step": 71170 + }, + { + "epoch": 1.5843126780626782, + "grad_norm": 0.4916132986545563, + "learning_rate": 0.0001823995921845999, + "loss": 0.7023, + "step": 71180 + }, + { + "epoch": 1.5845352564102564, + "grad_norm": 0.45757535099983215, + "learning_rate": 0.00018235315130921058, + "loss": 0.545, + "step": 71190 + }, + { + "epoch": 1.5847578347578346, + "grad_norm": 0.5892700552940369, + "learning_rate": 0.00018230671139276544, + "loss": 0.585, + "step": 71200 + }, + { + "epoch": 1.5849804131054133, + "grad_norm": 0.5792036652565002, + "learning_rate": 0.00018226027243778813, + "loss": 0.5685, + "step": 71210 + }, + { + "epoch": 1.5852029914529915, + "grad_norm": 0.6208937168121338, + "learning_rate": 0.0001822138344468021, + "loss": 0.6743, + "step": 71220 + }, + { + "epoch": 1.5854255698005697, + "grad_norm": 0.4228981137275696, + "learning_rate": 0.00018216739742233086, + "loss": 0.4699, + "step": 71230 + }, + { + "epoch": 1.5856481481481481, + "grad_norm": 0.3311798870563507, + "learning_rate": 0.0001821209613668979, + "loss": 0.5585, + "step": 71240 + }, + { + "epoch": 1.5858707264957266, + "grad_norm": 0.617067813873291, + "learning_rate": 0.00018207452628302647, + "loss": 0.5973, + "step": 71250 + }, + { + "epoch": 1.5860933048433048, + "grad_norm": 0.8205387592315674, + "learning_rate": 0.00018202809217323997, + "loss": 0.4705, + "step": 71260 + }, + { + "epoch": 1.5863158831908832, + "grad_norm": 0.6504186391830444, + "learning_rate": 0.00018198165904006165, + "loss": 0.5848, + "step": 71270 + }, + { + "epoch": 1.5865384615384617, + "grad_norm": 0.5408328771591187, + "learning_rate": 0.00018193522688601482, + "loss": 0.4718, + "step": 71280 + }, + { + "epoch": 1.58676103988604, + "grad_norm": 0.5376555919647217, + "learning_rate": 0.0001818887957136225, + "loss": 0.6386, + "step": 71290 + }, + { + "epoch": 1.586983618233618, + "grad_norm": 0.5056243538856506, + "learning_rate": 0.0001818423655254078, + "loss": 0.5108, + "step": 71300 + }, + { + "epoch": 1.5872061965811965, + "grad_norm": 1.2390053272247314, + "learning_rate": 0.00018179593632389383, + "loss": 0.6817, + "step": 71310 + }, + { + "epoch": 1.587428774928775, + "grad_norm": 0.614452600479126, + "learning_rate": 0.00018174950811160364, + "loss": 0.4678, + "step": 71320 + }, + { + "epoch": 1.5876513532763532, + "grad_norm": 0.682161271572113, + "learning_rate": 0.00018170308089106006, + "loss": 0.5338, + "step": 71330 + }, + { + "epoch": 1.5878739316239316, + "grad_norm": 0.5767924785614014, + "learning_rate": 0.00018165665466478604, + "loss": 0.6269, + "step": 71340 + }, + { + "epoch": 1.58809650997151, + "grad_norm": 0.4488545060157776, + "learning_rate": 0.00018161022943530446, + "loss": 0.6213, + "step": 71350 + }, + { + "epoch": 1.5883190883190883, + "grad_norm": 0.7338999509811401, + "learning_rate": 0.00018156380520513804, + "loss": 0.4092, + "step": 71360 + }, + { + "epoch": 1.5885416666666665, + "grad_norm": 0.536692202091217, + "learning_rate": 0.00018151738197680952, + "loss": 0.4023, + "step": 71370 + }, + { + "epoch": 1.5887642450142452, + "grad_norm": 0.6144670248031616, + "learning_rate": 0.00018147095975284166, + "loss": 0.6507, + "step": 71380 + }, + { + "epoch": 1.5889868233618234, + "grad_norm": 0.8293641209602356, + "learning_rate": 0.00018142453853575697, + "loss": 0.4912, + "step": 71390 + }, + { + "epoch": 1.5892094017094016, + "grad_norm": 0.41700026392936707, + "learning_rate": 0.00018137811832807802, + "loss": 0.5167, + "step": 71400 + }, + { + "epoch": 1.58943198005698, + "grad_norm": 0.6886595487594604, + "learning_rate": 0.0001813316991323275, + "loss": 0.4783, + "step": 71410 + }, + { + "epoch": 1.5896545584045585, + "grad_norm": 0.5606061816215515, + "learning_rate": 0.00018128528095102763, + "loss": 0.535, + "step": 71420 + }, + { + "epoch": 1.5898771367521367, + "grad_norm": 0.47022682428359985, + "learning_rate": 0.00018123886378670095, + "loss": 0.4686, + "step": 71430 + }, + { + "epoch": 1.5900997150997151, + "grad_norm": 0.7232200503349304, + "learning_rate": 0.00018119244764186972, + "loss": 0.6632, + "step": 71440 + }, + { + "epoch": 1.5903222934472936, + "grad_norm": 0.6137561798095703, + "learning_rate": 0.0001811460325190563, + "loss": 0.4522, + "step": 71450 + }, + { + "epoch": 1.5905448717948718, + "grad_norm": 0.586788535118103, + "learning_rate": 0.00018109961842078295, + "loss": 0.4802, + "step": 71460 + }, + { + "epoch": 1.59076745014245, + "grad_norm": 0.7412348985671997, + "learning_rate": 0.00018105320534957177, + "loss": 0.5579, + "step": 71470 + }, + { + "epoch": 1.5909900284900285, + "grad_norm": 0.5436372756958008, + "learning_rate": 0.0001810067933079449, + "loss": 0.6088, + "step": 71480 + }, + { + "epoch": 1.591212606837607, + "grad_norm": 0.3423854410648346, + "learning_rate": 0.0001809603822984245, + "loss": 0.3988, + "step": 71490 + }, + { + "epoch": 1.5914351851851851, + "grad_norm": 0.4644746780395508, + "learning_rate": 0.00018091397232353245, + "loss": 0.5718, + "step": 71500 + }, + { + "epoch": 1.5916577635327636, + "grad_norm": 0.6078989505767822, + "learning_rate": 0.00018086756338579077, + "loss": 0.4827, + "step": 71510 + }, + { + "epoch": 1.591880341880342, + "grad_norm": 0.419260174036026, + "learning_rate": 0.0001808211554877214, + "loss": 0.4581, + "step": 71520 + }, + { + "epoch": 1.5921029202279202, + "grad_norm": 0.4638487994670868, + "learning_rate": 0.0001807747486318461, + "loss": 0.5443, + "step": 71530 + }, + { + "epoch": 1.5923254985754984, + "grad_norm": 0.49142155051231384, + "learning_rate": 0.0001807283428206868, + "loss": 0.4928, + "step": 71540 + }, + { + "epoch": 1.5925480769230769, + "grad_norm": 0.7733557224273682, + "learning_rate": 0.00018068193805676502, + "loss": 0.6753, + "step": 71550 + }, + { + "epoch": 1.5927706552706553, + "grad_norm": 0.6128353476524353, + "learning_rate": 0.00018063553434260254, + "loss": 0.7242, + "step": 71560 + }, + { + "epoch": 1.5929932336182335, + "grad_norm": 0.9399389028549194, + "learning_rate": 0.00018058913168072106, + "loss": 0.5613, + "step": 71570 + }, + { + "epoch": 1.593215811965812, + "grad_norm": 0.47446173429489136, + "learning_rate": 0.00018054273007364196, + "loss": 0.5745, + "step": 71580 + }, + { + "epoch": 1.5934383903133904, + "grad_norm": 0.6724503636360168, + "learning_rate": 0.00018049632952388684, + "loss": 0.6099, + "step": 71590 + }, + { + "epoch": 1.5936609686609686, + "grad_norm": 0.5252869725227356, + "learning_rate": 0.00018044993003397722, + "loss": 0.5341, + "step": 71600 + }, + { + "epoch": 1.593883547008547, + "grad_norm": 0.6296842098236084, + "learning_rate": 0.00018040353160643432, + "loss": 0.4828, + "step": 71610 + }, + { + "epoch": 1.5941061253561255, + "grad_norm": 0.8079186081886292, + "learning_rate": 0.0001803571342437796, + "loss": 0.6052, + "step": 71620 + }, + { + "epoch": 1.5943287037037037, + "grad_norm": 0.4828781485557556, + "learning_rate": 0.0001803107379485343, + "loss": 0.424, + "step": 71630 + }, + { + "epoch": 1.594551282051282, + "grad_norm": 0.5179343819618225, + "learning_rate": 0.00018026434272321956, + "loss": 0.4261, + "step": 71640 + }, + { + "epoch": 1.5947738603988604, + "grad_norm": 0.6875476837158203, + "learning_rate": 0.00018021794857035663, + "loss": 0.51, + "step": 71650 + }, + { + "epoch": 1.5949964387464388, + "grad_norm": 0.7260758876800537, + "learning_rate": 0.0001801715554924666, + "loss": 0.5178, + "step": 71660 + }, + { + "epoch": 1.595219017094017, + "grad_norm": 0.5954149961471558, + "learning_rate": 0.00018012516349207047, + "loss": 0.437, + "step": 71670 + }, + { + "epoch": 1.5954415954415955, + "grad_norm": 0.6847278475761414, + "learning_rate": 0.00018007877257168923, + "loss": 0.5978, + "step": 71680 + }, + { + "epoch": 1.595664173789174, + "grad_norm": 0.5501822233200073, + "learning_rate": 0.00018003238273384375, + "loss": 0.3647, + "step": 71690 + }, + { + "epoch": 1.5958867521367521, + "grad_norm": 0.4372483789920807, + "learning_rate": 0.00017998599398105495, + "loss": 0.4032, + "step": 71700 + }, + { + "epoch": 1.5961093304843303, + "grad_norm": 0.4264819920063019, + "learning_rate": 0.00017993960631584367, + "loss": 0.4546, + "step": 71710 + }, + { + "epoch": 1.5963319088319088, + "grad_norm": 0.4155157506465912, + "learning_rate": 0.00017989321974073053, + "loss": 0.638, + "step": 71720 + }, + { + "epoch": 1.5965544871794872, + "grad_norm": 0.45495685935020447, + "learning_rate": 0.00017984683425823633, + "loss": 0.5307, + "step": 71730 + }, + { + "epoch": 1.5967770655270654, + "grad_norm": 0.6772878766059875, + "learning_rate": 0.00017980044987088166, + "loss": 0.6329, + "step": 71740 + }, + { + "epoch": 1.5969996438746439, + "grad_norm": 0.5602226257324219, + "learning_rate": 0.00017975406658118706, + "loss": 0.63, + "step": 71750 + }, + { + "epoch": 1.5972222222222223, + "grad_norm": 0.5901429653167725, + "learning_rate": 0.00017970768439167303, + "loss": 0.5602, + "step": 71760 + }, + { + "epoch": 1.5974448005698005, + "grad_norm": 0.5109543800354004, + "learning_rate": 0.0001796613033048601, + "loss": 0.5425, + "step": 71770 + }, + { + "epoch": 1.5976673789173788, + "grad_norm": 0.7157694101333618, + "learning_rate": 0.00017961492332326857, + "loss": 0.5909, + "step": 71780 + }, + { + "epoch": 1.5978899572649574, + "grad_norm": 0.5277893543243408, + "learning_rate": 0.00017956854444941878, + "loss": 0.542, + "step": 71790 + }, + { + "epoch": 1.5981125356125356, + "grad_norm": 0.5802013278007507, + "learning_rate": 0.0001795221666858311, + "loss": 0.556, + "step": 71800 + }, + { + "epoch": 1.5983351139601139, + "grad_norm": 0.5859088897705078, + "learning_rate": 0.00017947579003502556, + "loss": 0.6438, + "step": 71810 + }, + { + "epoch": 1.5985576923076923, + "grad_norm": 0.6160123944282532, + "learning_rate": 0.00017942941449952245, + "loss": 0.6975, + "step": 71820 + }, + { + "epoch": 1.5987802706552707, + "grad_norm": 0.6131763458251953, + "learning_rate": 0.00017938304008184174, + "loss": 0.5347, + "step": 71830 + }, + { + "epoch": 1.599002849002849, + "grad_norm": 0.5505862236022949, + "learning_rate": 0.0001793366667845035, + "loss": 0.6364, + "step": 71840 + }, + { + "epoch": 1.5992254273504274, + "grad_norm": 0.5568742752075195, + "learning_rate": 0.00017929029461002772, + "loss": 0.5858, + "step": 71850 + }, + { + "epoch": 1.5994480056980058, + "grad_norm": 0.6193007230758667, + "learning_rate": 0.0001792439235609343, + "loss": 0.4796, + "step": 71860 + }, + { + "epoch": 1.599670584045584, + "grad_norm": 0.5258501172065735, + "learning_rate": 0.00017919755363974304, + "loss": 0.5053, + "step": 71870 + }, + { + "epoch": 1.5998931623931623, + "grad_norm": 0.6486078500747681, + "learning_rate": 0.00017915118484897378, + "loss": 0.5535, + "step": 71880 + }, + { + "epoch": 1.6001157407407407, + "grad_norm": 0.8163578510284424, + "learning_rate": 0.00017910481719114616, + "loss": 0.5581, + "step": 71890 + }, + { + "epoch": 1.6003383190883191, + "grad_norm": 0.42560261487960815, + "learning_rate": 0.0001790584506687799, + "loss": 0.3899, + "step": 71900 + }, + { + "epoch": 1.6005608974358974, + "grad_norm": 0.571181058883667, + "learning_rate": 0.0001790120852843946, + "loss": 0.5681, + "step": 71910 + }, + { + "epoch": 1.6007834757834758, + "grad_norm": 0.5772104859352112, + "learning_rate": 0.00017896572104050972, + "loss": 0.5197, + "step": 71920 + }, + { + "epoch": 1.6010060541310542, + "grad_norm": 0.4699803292751312, + "learning_rate": 0.00017891935793964487, + "loss": 0.475, + "step": 71930 + }, + { + "epoch": 1.6012286324786325, + "grad_norm": 0.6732143759727478, + "learning_rate": 0.0001788729959843193, + "loss": 0.4731, + "step": 71940 + }, + { + "epoch": 1.6014512108262107, + "grad_norm": 0.6661916971206665, + "learning_rate": 0.00017882663517705246, + "loss": 0.4658, + "step": 71950 + }, + { + "epoch": 1.6016737891737893, + "grad_norm": 0.6001116037368774, + "learning_rate": 0.00017878027552036354, + "loss": 0.5499, + "step": 71960 + }, + { + "epoch": 1.6018963675213675, + "grad_norm": 0.6547945141792297, + "learning_rate": 0.00017873391701677183, + "loss": 0.5831, + "step": 71970 + }, + { + "epoch": 1.6021189458689458, + "grad_norm": 0.7915700674057007, + "learning_rate": 0.00017868755966879646, + "loss": 0.604, + "step": 71980 + }, + { + "epoch": 1.6023415242165242, + "grad_norm": 0.28278616070747375, + "learning_rate": 0.00017864120347895658, + "loss": 0.4522, + "step": 71990 + }, + { + "epoch": 1.6025641025641026, + "grad_norm": 0.4669821262359619, + "learning_rate": 0.00017859484844977118, + "loss": 0.5869, + "step": 72000 + }, + { + "epoch": 1.6027866809116809, + "grad_norm": 0.907658040523529, + "learning_rate": 0.00017854849458375923, + "loss": 0.5388, + "step": 72010 + }, + { + "epoch": 1.6030092592592593, + "grad_norm": 0.5259823203086853, + "learning_rate": 0.00017850214188343968, + "loss": 0.6287, + "step": 72020 + }, + { + "epoch": 1.6032318376068377, + "grad_norm": 0.5709260702133179, + "learning_rate": 0.0001784557903513313, + "loss": 0.5868, + "step": 72030 + }, + { + "epoch": 1.603454415954416, + "grad_norm": 0.5346744060516357, + "learning_rate": 0.0001784094399899529, + "loss": 0.5212, + "step": 72040 + }, + { + "epoch": 1.6036769943019942, + "grad_norm": 0.5017918348312378, + "learning_rate": 0.00017836309080182327, + "loss": 0.5202, + "step": 72050 + }, + { + "epoch": 1.6038995726495726, + "grad_norm": 0.520685076713562, + "learning_rate": 0.00017831674278946102, + "loss": 0.5132, + "step": 72060 + }, + { + "epoch": 1.604122150997151, + "grad_norm": 0.6095485687255859, + "learning_rate": 0.00017827039595538465, + "loss": 0.6865, + "step": 72070 + }, + { + "epoch": 1.6043447293447293, + "grad_norm": 0.47989124059677124, + "learning_rate": 0.00017822405030211273, + "loss": 0.4015, + "step": 72080 + }, + { + "epoch": 1.6045673076923077, + "grad_norm": 0.5909743309020996, + "learning_rate": 0.0001781777058321638, + "loss": 0.4716, + "step": 72090 + }, + { + "epoch": 1.6047898860398861, + "grad_norm": 0.5345268249511719, + "learning_rate": 0.00017813136254805616, + "loss": 0.5581, + "step": 72100 + }, + { + "epoch": 1.6050124643874644, + "grad_norm": 0.5118623375892639, + "learning_rate": 0.00017808502045230817, + "loss": 0.5295, + "step": 72110 + }, + { + "epoch": 1.6052350427350426, + "grad_norm": 0.5770447850227356, + "learning_rate": 0.00017803867954743811, + "loss": 0.4923, + "step": 72120 + }, + { + "epoch": 1.6054576210826212, + "grad_norm": 0.49001508951187134, + "learning_rate": 0.00017799233983596425, + "loss": 0.5184, + "step": 72130 + }, + { + "epoch": 1.6056801994301995, + "grad_norm": 0.6325069665908813, + "learning_rate": 0.00017794600132040457, + "loss": 0.5174, + "step": 72140 + }, + { + "epoch": 1.6059027777777777, + "grad_norm": 0.36604610085487366, + "learning_rate": 0.00017789966400327727, + "loss": 0.5228, + "step": 72150 + }, + { + "epoch": 1.6061253561253561, + "grad_norm": 0.5231015682220459, + "learning_rate": 0.00017785332788710037, + "loss": 0.5275, + "step": 72160 + }, + { + "epoch": 1.6063479344729346, + "grad_norm": 0.8676664233207703, + "learning_rate": 0.0001778069929743917, + "loss": 0.6281, + "step": 72170 + }, + { + "epoch": 1.6065705128205128, + "grad_norm": 0.4106631875038147, + "learning_rate": 0.0001777606592676692, + "loss": 0.4852, + "step": 72180 + }, + { + "epoch": 1.6067930911680912, + "grad_norm": 0.4075709581375122, + "learning_rate": 0.00017771432676945076, + "loss": 0.4958, + "step": 72190 + }, + { + "epoch": 1.6070156695156697, + "grad_norm": 0.6719409823417664, + "learning_rate": 0.000177667995482254, + "loss": 0.5834, + "step": 72200 + }, + { + "epoch": 1.6072382478632479, + "grad_norm": 0.6902719736099243, + "learning_rate": 0.00017762166540859664, + "loss": 0.4113, + "step": 72210 + }, + { + "epoch": 1.607460826210826, + "grad_norm": 0.5260211229324341, + "learning_rate": 0.00017757533655099627, + "loss": 0.5081, + "step": 72220 + }, + { + "epoch": 1.6076834045584045, + "grad_norm": 0.4960169792175293, + "learning_rate": 0.00017752900891197053, + "loss": 0.6683, + "step": 72230 + }, + { + "epoch": 1.607905982905983, + "grad_norm": 0.5500181913375854, + "learning_rate": 0.00017748268249403678, + "loss": 0.5539, + "step": 72240 + }, + { + "epoch": 1.6081285612535612, + "grad_norm": 0.5022486448287964, + "learning_rate": 0.0001774363572997125, + "loss": 0.5495, + "step": 72250 + }, + { + "epoch": 1.6083511396011396, + "grad_norm": 0.4881563186645508, + "learning_rate": 0.00017739003333151505, + "loss": 0.5351, + "step": 72260 + }, + { + "epoch": 1.608573717948718, + "grad_norm": 0.5786612033843994, + "learning_rate": 0.00017734371059196176, + "loss": 0.5741, + "step": 72270 + }, + { + "epoch": 1.6087962962962963, + "grad_norm": 0.5145733952522278, + "learning_rate": 0.0001772973890835697, + "loss": 0.5077, + "step": 72280 + }, + { + "epoch": 1.6090188746438745, + "grad_norm": 0.532596230506897, + "learning_rate": 0.00017725106880885612, + "loss": 0.5594, + "step": 72290 + }, + { + "epoch": 1.609241452991453, + "grad_norm": 0.4536045491695404, + "learning_rate": 0.0001772047497703381, + "loss": 0.5199, + "step": 72300 + }, + { + "epoch": 1.6094640313390314, + "grad_norm": 0.7692566514015198, + "learning_rate": 0.00017715843197053263, + "loss": 0.4523, + "step": 72310 + }, + { + "epoch": 1.6096866096866096, + "grad_norm": 0.7244535684585571, + "learning_rate": 0.00017711211541195676, + "loss": 0.5721, + "step": 72320 + }, + { + "epoch": 1.609909188034188, + "grad_norm": 0.477071076631546, + "learning_rate": 0.00017706580009712716, + "loss": 0.5792, + "step": 72330 + }, + { + "epoch": 1.6101317663817665, + "grad_norm": 1.0131349563598633, + "learning_rate": 0.00017701948602856082, + "loss": 0.5588, + "step": 72340 + }, + { + "epoch": 1.6103543447293447, + "grad_norm": 0.5438540577888489, + "learning_rate": 0.00017697317320877436, + "loss": 0.4861, + "step": 72350 + }, + { + "epoch": 1.6105769230769231, + "grad_norm": 0.4957207143306732, + "learning_rate": 0.00017692686164028453, + "loss": 0.4832, + "step": 72360 + }, + { + "epoch": 1.6107995014245016, + "grad_norm": 0.5664975643157959, + "learning_rate": 0.00017688055132560797, + "loss": 0.5985, + "step": 72370 + }, + { + "epoch": 1.6110220797720798, + "grad_norm": 0.5937734246253967, + "learning_rate": 0.00017683424226726114, + "loss": 0.5806, + "step": 72380 + }, + { + "epoch": 1.611244658119658, + "grad_norm": 0.48959553241729736, + "learning_rate": 0.00017678793446776055, + "loss": 0.5822, + "step": 72390 + }, + { + "epoch": 1.6114672364672364, + "grad_norm": 0.7238348126411438, + "learning_rate": 0.00017674162792962264, + "loss": 0.5435, + "step": 72400 + }, + { + "epoch": 1.6116898148148149, + "grad_norm": 0.5640783309936523, + "learning_rate": 0.00017669532265536365, + "loss": 0.5744, + "step": 72410 + }, + { + "epoch": 1.611912393162393, + "grad_norm": 0.44172927737236023, + "learning_rate": 0.0001766490186474999, + "loss": 0.6866, + "step": 72420 + }, + { + "epoch": 1.6121349715099715, + "grad_norm": 0.8031690120697021, + "learning_rate": 0.00017660271590854765, + "loss": 0.6749, + "step": 72430 + }, + { + "epoch": 1.61235754985755, + "grad_norm": 0.6939573287963867, + "learning_rate": 0.00017655641444102294, + "loss": 0.5717, + "step": 72440 + }, + { + "epoch": 1.6125801282051282, + "grad_norm": 0.3727976679801941, + "learning_rate": 0.00017651011424744189, + "loss": 0.4701, + "step": 72450 + }, + { + "epoch": 1.6128027065527064, + "grad_norm": 0.43301764130592346, + "learning_rate": 0.0001764638153303205, + "loss": 0.5043, + "step": 72460 + }, + { + "epoch": 1.6130252849002849, + "grad_norm": 0.5547754168510437, + "learning_rate": 0.00017641751769217459, + "loss": 0.5573, + "step": 72470 + }, + { + "epoch": 1.6132478632478633, + "grad_norm": 0.5771608352661133, + "learning_rate": 0.0001763712213355201, + "loss": 0.4549, + "step": 72480 + }, + { + "epoch": 1.6134704415954415, + "grad_norm": 0.7360827922821045, + "learning_rate": 0.00017632492626287275, + "loss": 0.5214, + "step": 72490 + }, + { + "epoch": 1.61369301994302, + "grad_norm": 0.5349162220954895, + "learning_rate": 0.00017627863247674828, + "loss": 0.4989, + "step": 72500 + }, + { + "epoch": 1.6139155982905984, + "grad_norm": 0.3698183000087738, + "learning_rate": 0.00017623233997966239, + "loss": 0.4294, + "step": 72510 + }, + { + "epoch": 1.6141381766381766, + "grad_norm": 0.4332517683506012, + "learning_rate": 0.00017618604877413056, + "loss": 0.55, + "step": 72520 + }, + { + "epoch": 1.6143607549857548, + "grad_norm": 0.4140152037143707, + "learning_rate": 0.00017613975886266833, + "loss": 0.5283, + "step": 72530 + }, + { + "epoch": 1.6145833333333335, + "grad_norm": 0.7178450226783752, + "learning_rate": 0.0001760934702477912, + "loss": 0.6523, + "step": 72540 + }, + { + "epoch": 1.6148059116809117, + "grad_norm": 0.5415472984313965, + "learning_rate": 0.0001760471829320144, + "loss": 0.5014, + "step": 72550 + }, + { + "epoch": 1.61502849002849, + "grad_norm": 0.8831973075866699, + "learning_rate": 0.0001760008969178533, + "loss": 0.7235, + "step": 72560 + }, + { + "epoch": 1.6152510683760684, + "grad_norm": 0.6190544962882996, + "learning_rate": 0.0001759546122078231, + "loss": 0.595, + "step": 72570 + }, + { + "epoch": 1.6154736467236468, + "grad_norm": 0.5446276068687439, + "learning_rate": 0.000175908328804439, + "loss": 0.5364, + "step": 72580 + }, + { + "epoch": 1.615696225071225, + "grad_norm": 0.500360369682312, + "learning_rate": 0.00017586204671021612, + "loss": 0.6368, + "step": 72590 + }, + { + "epoch": 1.6159188034188035, + "grad_norm": 0.6088921427726746, + "learning_rate": 0.00017581576592766925, + "loss": 0.4953, + "step": 72600 + }, + { + "epoch": 1.616141381766382, + "grad_norm": 0.7649800777435303, + "learning_rate": 0.00017576948645931347, + "loss": 0.6165, + "step": 72610 + }, + { + "epoch": 1.61636396011396, + "grad_norm": 0.7882862091064453, + "learning_rate": 0.00017572320830766368, + "loss": 0.4718, + "step": 72620 + }, + { + "epoch": 1.6165865384615383, + "grad_norm": 0.7119544744491577, + "learning_rate": 0.00017567693147523455, + "loss": 0.5845, + "step": 72630 + }, + { + "epoch": 1.6168091168091168, + "grad_norm": 0.5975489020347595, + "learning_rate": 0.0001756306559645409, + "loss": 0.5229, + "step": 72640 + }, + { + "epoch": 1.6170316951566952, + "grad_norm": 0.389389306306839, + "learning_rate": 0.0001755843817780974, + "loss": 0.4661, + "step": 72650 + }, + { + "epoch": 1.6172542735042734, + "grad_norm": 0.4852727949619293, + "learning_rate": 0.00017553810891841854, + "loss": 0.5355, + "step": 72660 + }, + { + "epoch": 1.6174768518518519, + "grad_norm": 0.48424190282821655, + "learning_rate": 0.00017549183738801887, + "loss": 0.522, + "step": 72670 + }, + { + "epoch": 1.6176994301994303, + "grad_norm": 0.6597685217857361, + "learning_rate": 0.00017544556718941285, + "loss": 0.5024, + "step": 72680 + }, + { + "epoch": 1.6179220085470085, + "grad_norm": 0.724755048751831, + "learning_rate": 0.00017539929832511477, + "loss": 0.5394, + "step": 72690 + }, + { + "epoch": 1.6181445868945867, + "grad_norm": 0.6331667900085449, + "learning_rate": 0.00017535303079763893, + "loss": 0.5192, + "step": 72700 + }, + { + "epoch": 1.6183671652421654, + "grad_norm": 0.4962537884712219, + "learning_rate": 0.00017530676460949968, + "loss": 0.5735, + "step": 72710 + }, + { + "epoch": 1.6185897435897436, + "grad_norm": 0.3787330090999603, + "learning_rate": 0.000175260499763211, + "loss": 0.4621, + "step": 72720 + }, + { + "epoch": 1.6188123219373218, + "grad_norm": 0.543692946434021, + "learning_rate": 0.00017521423626128704, + "loss": 0.6384, + "step": 72730 + }, + { + "epoch": 1.6190349002849003, + "grad_norm": 0.3744504749774933, + "learning_rate": 0.00017516797410624174, + "loss": 0.6521, + "step": 72740 + }, + { + "epoch": 1.6192574786324787, + "grad_norm": 0.5380321145057678, + "learning_rate": 0.00017512171330058904, + "loss": 0.5083, + "step": 72750 + }, + { + "epoch": 1.619480056980057, + "grad_norm": 0.6366248726844788, + "learning_rate": 0.00017507545384684285, + "loss": 0.7177, + "step": 72760 + }, + { + "epoch": 1.6197026353276354, + "grad_norm": 0.3844091296195984, + "learning_rate": 0.00017502919574751685, + "loss": 0.4953, + "step": 72770 + }, + { + "epoch": 1.6199252136752138, + "grad_norm": 0.645712673664093, + "learning_rate": 0.0001749829390051248, + "loss": 0.5604, + "step": 72780 + }, + { + "epoch": 1.620147792022792, + "grad_norm": 0.4898901581764221, + "learning_rate": 0.0001749366836221804, + "loss": 0.745, + "step": 72790 + }, + { + "epoch": 1.6201923076923077, + "eval_loss": 0.5608183145523071, + "eval_runtime": 338.6161, + "eval_samples_per_second": 6.984, + "eval_steps_per_second": 6.984, + "step": 72792 + }, + { + "epoch": 1.6203703703703702, + "grad_norm": 0.5302802324295044, + "learning_rate": 0.00017489042960119707, + "loss": 0.5637, + "step": 72800 + }, + { + "epoch": 1.6205929487179487, + "grad_norm": 0.45127832889556885, + "learning_rate": 0.00017484417694468832, + "loss": 0.5059, + "step": 72810 + }, + { + "epoch": 1.6208155270655271, + "grad_norm": 0.46210169792175293, + "learning_rate": 0.00017479792565516764, + "loss": 0.4899, + "step": 72820 + }, + { + "epoch": 1.6210381054131053, + "grad_norm": 0.6143471002578735, + "learning_rate": 0.0001747516757351483, + "loss": 0.4319, + "step": 72830 + }, + { + "epoch": 1.6212606837606838, + "grad_norm": 0.6548733115196228, + "learning_rate": 0.00017470542718714353, + "loss": 0.4886, + "step": 72840 + }, + { + "epoch": 1.6214832621082622, + "grad_norm": 0.6141097545623779, + "learning_rate": 0.00017465918001366668, + "loss": 0.5904, + "step": 72850 + }, + { + "epoch": 1.6217058404558404, + "grad_norm": 0.6009771823883057, + "learning_rate": 0.0001746129342172306, + "loss": 0.6371, + "step": 72860 + }, + { + "epoch": 1.6219284188034186, + "grad_norm": 0.45862460136413574, + "learning_rate": 0.00017456668980034854, + "loss": 0.5505, + "step": 72870 + }, + { + "epoch": 1.6221509971509973, + "grad_norm": 0.550284743309021, + "learning_rate": 0.00017452044676553332, + "loss": 0.4793, + "step": 72880 + }, + { + "epoch": 1.6223735754985755, + "grad_norm": 0.35887646675109863, + "learning_rate": 0.00017447420511529789, + "loss": 0.4587, + "step": 72890 + }, + { + "epoch": 1.6225961538461537, + "grad_norm": 0.5029733180999756, + "learning_rate": 0.0001744279648521551, + "loss": 0.447, + "step": 72900 + }, + { + "epoch": 1.6228187321937322, + "grad_norm": 0.621500551700592, + "learning_rate": 0.00017438172597861754, + "loss": 0.4205, + "step": 72910 + }, + { + "epoch": 1.6230413105413106, + "grad_norm": 0.8010604381561279, + "learning_rate": 0.00017433548849719803, + "loss": 0.7382, + "step": 72920 + }, + { + "epoch": 1.6232638888888888, + "grad_norm": 0.5936803817749023, + "learning_rate": 0.0001742892524104091, + "loss": 0.4996, + "step": 72930 + }, + { + "epoch": 1.6234864672364673, + "grad_norm": 0.589506208896637, + "learning_rate": 0.0001742430177207632, + "loss": 0.5262, + "step": 72940 + }, + { + "epoch": 1.6237090455840457, + "grad_norm": 0.6230021119117737, + "learning_rate": 0.00017419678443077278, + "loss": 0.5251, + "step": 72950 + }, + { + "epoch": 1.623931623931624, + "grad_norm": 0.5615085363388062, + "learning_rate": 0.00017415055254295028, + "loss": 0.4211, + "step": 72960 + }, + { + "epoch": 1.6241542022792022, + "grad_norm": 0.7193329334259033, + "learning_rate": 0.00017410432205980787, + "loss": 0.4324, + "step": 72970 + }, + { + "epoch": 1.6243767806267806, + "grad_norm": 0.5282472968101501, + "learning_rate": 0.00017405809298385788, + "loss": 0.4229, + "step": 72980 + }, + { + "epoch": 1.624599358974359, + "grad_norm": 0.2699781358242035, + "learning_rate": 0.00017401186531761228, + "loss": 0.5352, + "step": 72990 + }, + { + "epoch": 1.6248219373219372, + "grad_norm": 0.6161572933197021, + "learning_rate": 0.00017396563906358319, + "loss": 0.5319, + "step": 73000 + }, + { + "epoch": 1.6250445156695157, + "grad_norm": 0.4191792607307434, + "learning_rate": 0.00017391941422428264, + "loss": 0.5876, + "step": 73010 + }, + { + "epoch": 1.6252670940170941, + "grad_norm": 0.5169584155082703, + "learning_rate": 0.00017387319080222242, + "loss": 0.606, + "step": 73020 + }, + { + "epoch": 1.6254896723646723, + "grad_norm": 0.510918915271759, + "learning_rate": 0.00017382696879991437, + "loss": 0.5738, + "step": 73030 + }, + { + "epoch": 1.6257122507122506, + "grad_norm": 0.49762460589408875, + "learning_rate": 0.00017378074821987035, + "loss": 0.5594, + "step": 73040 + }, + { + "epoch": 1.6259348290598292, + "grad_norm": 0.5228525400161743, + "learning_rate": 0.00017373452906460188, + "loss": 0.4566, + "step": 73050 + }, + { + "epoch": 1.6261574074074074, + "grad_norm": 0.46729880571365356, + "learning_rate": 0.00017368831133662057, + "loss": 0.4876, + "step": 73060 + }, + { + "epoch": 1.6263799857549857, + "grad_norm": 0.7242257595062256, + "learning_rate": 0.00017364209503843806, + "loss": 0.6487, + "step": 73070 + }, + { + "epoch": 1.626602564102564, + "grad_norm": 0.4142407774925232, + "learning_rate": 0.0001735958801725656, + "loss": 0.4958, + "step": 73080 + }, + { + "epoch": 1.6268251424501425, + "grad_norm": 0.48071005940437317, + "learning_rate": 0.00017354966674151462, + "loss": 0.4204, + "step": 73090 + }, + { + "epoch": 1.6270477207977208, + "grad_norm": 0.44896841049194336, + "learning_rate": 0.00017350345474779648, + "loss": 0.4794, + "step": 73100 + }, + { + "epoch": 1.6272702991452992, + "grad_norm": 0.5316339731216431, + "learning_rate": 0.0001734572441939223, + "loss": 0.6107, + "step": 73110 + }, + { + "epoch": 1.6274928774928776, + "grad_norm": 0.9510208964347839, + "learning_rate": 0.00017341103508240318, + "loss": 0.6287, + "step": 73120 + }, + { + "epoch": 1.6277154558404558, + "grad_norm": 0.5629388093948364, + "learning_rate": 0.00017336482741575013, + "loss": 0.5422, + "step": 73130 + }, + { + "epoch": 1.627938034188034, + "grad_norm": 0.377602219581604, + "learning_rate": 0.00017331862119647417, + "loss": 0.631, + "step": 73140 + }, + { + "epoch": 1.6281606125356125, + "grad_norm": 0.4106764495372772, + "learning_rate": 0.00017327241642708623, + "loss": 0.4257, + "step": 73150 + }, + { + "epoch": 1.628383190883191, + "grad_norm": 0.3863488435745239, + "learning_rate": 0.00017322621311009702, + "loss": 0.4273, + "step": 73160 + }, + { + "epoch": 1.6286057692307692, + "grad_norm": 0.6114441156387329, + "learning_rate": 0.0001731800112480173, + "loss": 0.4727, + "step": 73170 + }, + { + "epoch": 1.6288283475783476, + "grad_norm": 0.8081455230712891, + "learning_rate": 0.00017313381084335775, + "loss": 0.6697, + "step": 73180 + }, + { + "epoch": 1.629050925925926, + "grad_norm": 0.5485579967498779, + "learning_rate": 0.0001730876118986289, + "loss": 0.486, + "step": 73190 + }, + { + "epoch": 1.6292735042735043, + "grad_norm": 0.6874799132347107, + "learning_rate": 0.00017304141441634122, + "loss": 0.4315, + "step": 73200 + }, + { + "epoch": 1.6294960826210825, + "grad_norm": 0.5704666376113892, + "learning_rate": 0.0001729952183990052, + "loss": 0.4986, + "step": 73210 + }, + { + "epoch": 1.629718660968661, + "grad_norm": 0.4740930199623108, + "learning_rate": 0.0001729490238491311, + "loss": 0.4556, + "step": 73220 + }, + { + "epoch": 1.6299412393162394, + "grad_norm": 0.5930688381195068, + "learning_rate": 0.00017290283076922919, + "loss": 0.6498, + "step": 73230 + }, + { + "epoch": 1.6301638176638176, + "grad_norm": 0.5056358575820923, + "learning_rate": 0.0001728566391618097, + "loss": 0.5727, + "step": 73240 + }, + { + "epoch": 1.630386396011396, + "grad_norm": 0.5368411540985107, + "learning_rate": 0.00017281044902938263, + "loss": 0.3919, + "step": 73250 + }, + { + "epoch": 1.6306089743589745, + "grad_norm": 0.7721600532531738, + "learning_rate": 0.00017276426037445797, + "loss": 0.4666, + "step": 73260 + }, + { + "epoch": 1.6308315527065527, + "grad_norm": 0.7881020307540894, + "learning_rate": 0.0001727180731995457, + "loss": 0.548, + "step": 73270 + }, + { + "epoch": 1.631054131054131, + "grad_norm": 0.5035044550895691, + "learning_rate": 0.00017267188750715566, + "loss": 0.5716, + "step": 73280 + }, + { + "epoch": 1.6312767094017095, + "grad_norm": 0.6801565289497375, + "learning_rate": 0.0001726257032997977, + "loss": 0.4976, + "step": 73290 + }, + { + "epoch": 1.6314992877492878, + "grad_norm": 0.41549575328826904, + "learning_rate": 0.00017257952057998135, + "loss": 0.5138, + "step": 73300 + }, + { + "epoch": 1.631721866096866, + "grad_norm": 0.6814451217651367, + "learning_rate": 0.0001725333393502163, + "loss": 0.5514, + "step": 73310 + }, + { + "epoch": 1.6319444444444444, + "grad_norm": 0.6054021120071411, + "learning_rate": 0.00017248715961301213, + "loss": 0.5926, + "step": 73320 + }, + { + "epoch": 1.6321670227920229, + "grad_norm": 0.788329005241394, + "learning_rate": 0.0001724409813708782, + "loss": 0.5818, + "step": 73330 + }, + { + "epoch": 1.632389601139601, + "grad_norm": 0.7401924729347229, + "learning_rate": 0.00017239480462632388, + "loss": 0.5906, + "step": 73340 + }, + { + "epoch": 1.6326121794871795, + "grad_norm": 0.8028993606567383, + "learning_rate": 0.00017234862938185852, + "loss": 0.5838, + "step": 73350 + }, + { + "epoch": 1.632834757834758, + "grad_norm": 0.5766822099685669, + "learning_rate": 0.00017230245563999122, + "loss": 0.5865, + "step": 73360 + }, + { + "epoch": 1.6330573361823362, + "grad_norm": 0.7944523096084595, + "learning_rate": 0.00017225628340323127, + "loss": 0.636, + "step": 73370 + }, + { + "epoch": 1.6332799145299144, + "grad_norm": 0.5362275242805481, + "learning_rate": 0.00017221011267408751, + "loss": 0.6232, + "step": 73380 + }, + { + "epoch": 1.6335024928774928, + "grad_norm": 0.4361020028591156, + "learning_rate": 0.000172163943455069, + "loss": 0.4872, + "step": 73390 + }, + { + "epoch": 1.6337250712250713, + "grad_norm": 0.6074864268302917, + "learning_rate": 0.00017211777574868456, + "loss": 0.485, + "step": 73400 + }, + { + "epoch": 1.6339476495726495, + "grad_norm": 0.7449727654457092, + "learning_rate": 0.00017207160955744303, + "loss": 0.4808, + "step": 73410 + }, + { + "epoch": 1.634170227920228, + "grad_norm": 0.830277681350708, + "learning_rate": 0.00017202544488385306, + "loss": 0.564, + "step": 73420 + }, + { + "epoch": 1.6343928062678064, + "grad_norm": 0.505328893661499, + "learning_rate": 0.00017197928173042336, + "loss": 0.4169, + "step": 73430 + }, + { + "epoch": 1.6346153846153846, + "grad_norm": 0.4163178503513336, + "learning_rate": 0.0001719331200996624, + "loss": 0.3699, + "step": 73440 + }, + { + "epoch": 1.6348379629629628, + "grad_norm": 0.6493053436279297, + "learning_rate": 0.00017188695999407867, + "loss": 0.4961, + "step": 73450 + }, + { + "epoch": 1.6350605413105415, + "grad_norm": 0.3108616769313812, + "learning_rate": 0.0001718408014161806, + "loss": 0.5145, + "step": 73460 + }, + { + "epoch": 1.6352831196581197, + "grad_norm": 0.4791993498802185, + "learning_rate": 0.0001717946443684764, + "loss": 0.5821, + "step": 73470 + }, + { + "epoch": 1.635505698005698, + "grad_norm": 0.48567846417427063, + "learning_rate": 0.00017174848885347427, + "loss": 0.4676, + "step": 73480 + }, + { + "epoch": 1.6357282763532763, + "grad_norm": 0.5976247191429138, + "learning_rate": 0.00017170233487368247, + "loss": 0.5768, + "step": 73490 + }, + { + "epoch": 1.6359508547008548, + "grad_norm": 0.534724771976471, + "learning_rate": 0.0001716561824316089, + "loss": 0.5824, + "step": 73500 + }, + { + "epoch": 1.636173433048433, + "grad_norm": 0.593966543674469, + "learning_rate": 0.00017161003152976165, + "loss": 0.5155, + "step": 73510 + }, + { + "epoch": 1.6363960113960114, + "grad_norm": 0.675961971282959, + "learning_rate": 0.00017156388217064847, + "loss": 0.512, + "step": 73520 + }, + { + "epoch": 1.6366185897435899, + "grad_norm": 0.67283034324646, + "learning_rate": 0.00017151773435677726, + "loss": 0.6103, + "step": 73530 + }, + { + "epoch": 1.636841168091168, + "grad_norm": 0.6470919251441956, + "learning_rate": 0.0001714715880906556, + "loss": 0.5908, + "step": 73540 + }, + { + "epoch": 1.6370637464387463, + "grad_norm": 0.39839234948158264, + "learning_rate": 0.00017142544337479123, + "loss": 0.5409, + "step": 73550 + }, + { + "epoch": 1.6372863247863247, + "grad_norm": 0.6064070463180542, + "learning_rate": 0.00017137930021169165, + "loss": 0.4618, + "step": 73560 + }, + { + "epoch": 1.6375089031339032, + "grad_norm": 0.46118393540382385, + "learning_rate": 0.00017133315860386436, + "loss": 0.5061, + "step": 73570 + }, + { + "epoch": 1.6377314814814814, + "grad_norm": 0.5544940829277039, + "learning_rate": 0.00017128701855381666, + "loss": 0.5178, + "step": 73580 + }, + { + "epoch": 1.6379540598290598, + "grad_norm": 0.5723961591720581, + "learning_rate": 0.00017124088006405588, + "loss": 0.6169, + "step": 73590 + }, + { + "epoch": 1.6381766381766383, + "grad_norm": 0.6090016961097717, + "learning_rate": 0.00017119474313708927, + "loss": 0.4677, + "step": 73600 + }, + { + "epoch": 1.6383992165242165, + "grad_norm": 0.6423790454864502, + "learning_rate": 0.00017114860777542386, + "loss": 0.6065, + "step": 73610 + }, + { + "epoch": 1.6386217948717947, + "grad_norm": 0.6712021231651306, + "learning_rate": 0.0001711024739815667, + "loss": 0.5973, + "step": 73620 + }, + { + "epoch": 1.6388443732193734, + "grad_norm": 0.4111859202384949, + "learning_rate": 0.00017105634175802482, + "loss": 0.5508, + "step": 73630 + }, + { + "epoch": 1.6390669515669516, + "grad_norm": 0.571266233921051, + "learning_rate": 0.00017101021110730506, + "loss": 0.5742, + "step": 73640 + }, + { + "epoch": 1.6392895299145298, + "grad_norm": 0.658608615398407, + "learning_rate": 0.00017096408203191407, + "loss": 0.5593, + "step": 73650 + }, + { + "epoch": 1.6395121082621082, + "grad_norm": 0.6612945199012756, + "learning_rate": 0.00017091795453435865, + "loss": 0.6158, + "step": 73660 + }, + { + "epoch": 1.6397346866096867, + "grad_norm": 0.504089891910553, + "learning_rate": 0.00017087182861714542, + "loss": 0.4897, + "step": 73670 + }, + { + "epoch": 1.639957264957265, + "grad_norm": 0.63788902759552, + "learning_rate": 0.00017082570428278082, + "loss": 0.4936, + "step": 73680 + }, + { + "epoch": 1.6401798433048433, + "grad_norm": 0.7329126596450806, + "learning_rate": 0.00017077958153377133, + "loss": 0.608, + "step": 73690 + }, + { + "epoch": 1.6404024216524218, + "grad_norm": 0.5291252732276917, + "learning_rate": 0.00017073346037262336, + "loss": 0.4443, + "step": 73700 + }, + { + "epoch": 1.640625, + "grad_norm": 0.4839900732040405, + "learning_rate": 0.00017068734080184306, + "loss": 0.5896, + "step": 73710 + }, + { + "epoch": 1.6408475783475782, + "grad_norm": 0.6721231937408447, + "learning_rate": 0.00017064122282393665, + "loss": 0.6759, + "step": 73720 + }, + { + "epoch": 1.6410701566951567, + "grad_norm": 0.7454251646995544, + "learning_rate": 0.00017059510644141027, + "loss": 0.56, + "step": 73730 + }, + { + "epoch": 1.641292735042735, + "grad_norm": 0.5054280757904053, + "learning_rate": 0.00017054899165676988, + "loss": 0.4465, + "step": 73740 + }, + { + "epoch": 1.6415153133903133, + "grad_norm": 0.6631110906600952, + "learning_rate": 0.00017050287847252134, + "loss": 0.5212, + "step": 73750 + }, + { + "epoch": 1.6417378917378918, + "grad_norm": 0.5210461020469666, + "learning_rate": 0.00017045676689117056, + "loss": 0.5512, + "step": 73760 + }, + { + "epoch": 1.6419604700854702, + "grad_norm": 0.504281759262085, + "learning_rate": 0.0001704106569152233, + "loss": 0.5389, + "step": 73770 + }, + { + "epoch": 1.6421830484330484, + "grad_norm": 0.8250054121017456, + "learning_rate": 0.00017036454854718516, + "loss": 0.5495, + "step": 73780 + }, + { + "epoch": 1.6424056267806266, + "grad_norm": 0.4285098910331726, + "learning_rate": 0.00017031844178956167, + "loss": 0.5417, + "step": 73790 + }, + { + "epoch": 1.6426282051282053, + "grad_norm": 0.6083645820617676, + "learning_rate": 0.00017027233664485832, + "loss": 0.5558, + "step": 73800 + }, + { + "epoch": 1.6428507834757835, + "grad_norm": 0.4949245750904083, + "learning_rate": 0.00017022623311558062, + "loss": 0.4727, + "step": 73810 + }, + { + "epoch": 1.6430733618233617, + "grad_norm": 0.9863992929458618, + "learning_rate": 0.0001701801312042337, + "loss": 0.6826, + "step": 73820 + }, + { + "epoch": 1.6432959401709402, + "grad_norm": 0.45118269324302673, + "learning_rate": 0.0001701340309133229, + "loss": 0.5129, + "step": 73830 + }, + { + "epoch": 1.6435185185185186, + "grad_norm": 0.5702125430107117, + "learning_rate": 0.00017008793224535334, + "loss": 0.5916, + "step": 73840 + }, + { + "epoch": 1.6437410968660968, + "grad_norm": 0.4908457100391388, + "learning_rate": 0.00017004183520282995, + "loss": 0.5519, + "step": 73850 + }, + { + "epoch": 1.6439636752136753, + "grad_norm": 0.6474966406822205, + "learning_rate": 0.0001699957397882578, + "loss": 0.5739, + "step": 73860 + }, + { + "epoch": 1.6441862535612537, + "grad_norm": 0.5858306288719177, + "learning_rate": 0.0001699496460041417, + "loss": 0.5725, + "step": 73870 + }, + { + "epoch": 1.644408831908832, + "grad_norm": 0.655928373336792, + "learning_rate": 0.00016990355385298648, + "loss": 0.6545, + "step": 73880 + }, + { + "epoch": 1.6446314102564101, + "grad_norm": 0.4970945417881012, + "learning_rate": 0.00016985746333729674, + "loss": 0.3836, + "step": 73890 + }, + { + "epoch": 1.6448539886039886, + "grad_norm": 0.5481718182563782, + "learning_rate": 0.0001698113744595772, + "loss": 0.4997, + "step": 73900 + }, + { + "epoch": 1.645076566951567, + "grad_norm": 0.7615852355957031, + "learning_rate": 0.0001697652872223322, + "loss": 0.6299, + "step": 73910 + }, + { + "epoch": 1.6452991452991452, + "grad_norm": 0.6412912011146545, + "learning_rate": 0.0001697192016280663, + "loss": 0.7039, + "step": 73920 + }, + { + "epoch": 1.6455217236467237, + "grad_norm": 0.4065113663673401, + "learning_rate": 0.0001696731176792837, + "loss": 0.4645, + "step": 73930 + }, + { + "epoch": 1.645744301994302, + "grad_norm": 0.6548492908477783, + "learning_rate": 0.00016962703537848873, + "loss": 0.5546, + "step": 73940 + }, + { + "epoch": 1.6459668803418803, + "grad_norm": 0.7567201852798462, + "learning_rate": 0.00016958095472818557, + "loss": 0.6382, + "step": 73950 + }, + { + "epoch": 1.6461894586894585, + "grad_norm": 0.9115142226219177, + "learning_rate": 0.00016953487573087821, + "loss": 0.555, + "step": 73960 + }, + { + "epoch": 1.6464120370370372, + "grad_norm": 0.5288968682289124, + "learning_rate": 0.00016948879838907062, + "loss": 0.52, + "step": 73970 + }, + { + "epoch": 1.6466346153846154, + "grad_norm": 0.3623640835285187, + "learning_rate": 0.00016944272270526673, + "loss": 0.4835, + "step": 73980 + }, + { + "epoch": 1.6468571937321936, + "grad_norm": 0.5390410423278809, + "learning_rate": 0.0001693966486819703, + "loss": 0.6114, + "step": 73990 + }, + { + "epoch": 1.647079772079772, + "grad_norm": 0.5235438942909241, + "learning_rate": 0.00016935057632168498, + "loss": 0.5105, + "step": 74000 + }, + { + "epoch": 1.6473023504273505, + "grad_norm": 0.36156705021858215, + "learning_rate": 0.0001693045056269145, + "loss": 0.5099, + "step": 74010 + }, + { + "epoch": 1.6475249287749287, + "grad_norm": 0.601723313331604, + "learning_rate": 0.00016925843660016229, + "loss": 0.5058, + "step": 74020 + }, + { + "epoch": 1.6477475071225072, + "grad_norm": 0.5614992380142212, + "learning_rate": 0.0001692123692439319, + "loss": 0.6299, + "step": 74030 + }, + { + "epoch": 1.6479700854700856, + "grad_norm": 0.42595812678337097, + "learning_rate": 0.00016916630356072644, + "loss": 0.5607, + "step": 74040 + }, + { + "epoch": 1.6481926638176638, + "grad_norm": 0.5589383840560913, + "learning_rate": 0.0001691202395530493, + "loss": 0.542, + "step": 74050 + }, + { + "epoch": 1.648415242165242, + "grad_norm": 0.7026439905166626, + "learning_rate": 0.00016907417722340366, + "loss": 0.535, + "step": 74060 + }, + { + "epoch": 1.6486378205128205, + "grad_norm": 0.38493791222572327, + "learning_rate": 0.0001690281165742925, + "loss": 0.4701, + "step": 74070 + }, + { + "epoch": 1.648860398860399, + "grad_norm": 0.5316752791404724, + "learning_rate": 0.00016898205760821887, + "loss": 0.5313, + "step": 74080 + }, + { + "epoch": 1.6490829772079771, + "grad_norm": 0.6451575756072998, + "learning_rate": 0.00016893600032768564, + "loss": 0.5871, + "step": 74090 + }, + { + "epoch": 1.6493055555555556, + "grad_norm": 0.3266368806362152, + "learning_rate": 0.00016888994473519554, + "loss": 0.6027, + "step": 74100 + }, + { + "epoch": 1.649528133903134, + "grad_norm": 0.9386067986488342, + "learning_rate": 0.0001688438908332513, + "loss": 0.5026, + "step": 74110 + }, + { + "epoch": 1.6497507122507122, + "grad_norm": 0.418160617351532, + "learning_rate": 0.0001687978386243556, + "loss": 0.6328, + "step": 74120 + }, + { + "epoch": 1.6499732905982905, + "grad_norm": 0.5406116843223572, + "learning_rate": 0.00016875178811101086, + "loss": 0.5237, + "step": 74130 + }, + { + "epoch": 1.650195868945869, + "grad_norm": 0.5164955258369446, + "learning_rate": 0.00016870573929571954, + "loss": 0.5915, + "step": 74140 + }, + { + "epoch": 1.6504184472934473, + "grad_norm": 0.401613712310791, + "learning_rate": 0.00016865969218098398, + "loss": 0.6145, + "step": 74150 + }, + { + "epoch": 1.6506410256410255, + "grad_norm": 0.5824975371360779, + "learning_rate": 0.00016861364676930643, + "loss": 0.5881, + "step": 74160 + }, + { + "epoch": 1.650863603988604, + "grad_norm": 0.596259593963623, + "learning_rate": 0.00016856760306318896, + "loss": 0.6278, + "step": 74170 + }, + { + "epoch": 1.6510861823361824, + "grad_norm": 0.698535144329071, + "learning_rate": 0.00016852156106513368, + "loss": 0.5502, + "step": 74180 + }, + { + "epoch": 1.6513087606837606, + "grad_norm": 0.6913437843322754, + "learning_rate": 0.00016847552077764252, + "loss": 0.5445, + "step": 74190 + }, + { + "epoch": 1.651531339031339, + "grad_norm": 0.3660818040370941, + "learning_rate": 0.0001684294822032174, + "loss": 0.5501, + "step": 74200 + }, + { + "epoch": 1.6517539173789175, + "grad_norm": 0.6315298676490784, + "learning_rate": 0.00016838344534436003, + "loss": 0.5986, + "step": 74210 + }, + { + "epoch": 1.6519764957264957, + "grad_norm": 0.3987181782722473, + "learning_rate": 0.00016833741020357213, + "loss": 0.5797, + "step": 74220 + }, + { + "epoch": 1.652199074074074, + "grad_norm": 0.49659889936447144, + "learning_rate": 0.00016829137678335532, + "loss": 0.4949, + "step": 74230 + }, + { + "epoch": 1.6524216524216524, + "grad_norm": 0.5157960057258606, + "learning_rate": 0.00016824534508621102, + "loss": 0.6625, + "step": 74240 + }, + { + "epoch": 1.6526442307692308, + "grad_norm": 0.5533856153488159, + "learning_rate": 0.00016819931511464063, + "loss": 0.5254, + "step": 74250 + }, + { + "epoch": 1.652866809116809, + "grad_norm": 0.4741371273994446, + "learning_rate": 0.00016815328687114556, + "loss": 0.5132, + "step": 74260 + }, + { + "epoch": 1.6530893874643875, + "grad_norm": 0.4546084403991699, + "learning_rate": 0.00016810726035822692, + "loss": 0.4191, + "step": 74270 + }, + { + "epoch": 1.653311965811966, + "grad_norm": 0.5832876563072205, + "learning_rate": 0.00016806123557838582, + "loss": 0.5897, + "step": 74280 + }, + { + "epoch": 1.6535345441595442, + "grad_norm": 0.3638170063495636, + "learning_rate": 0.00016801521253412345, + "loss": 0.4935, + "step": 74290 + }, + { + "epoch": 1.6537571225071224, + "grad_norm": 0.5617278218269348, + "learning_rate": 0.0001679691912279405, + "loss": 0.6729, + "step": 74300 + }, + { + "epoch": 1.6539797008547008, + "grad_norm": 0.5840321779251099, + "learning_rate": 0.000167923171662338, + "loss": 0.6152, + "step": 74310 + }, + { + "epoch": 1.6542022792022792, + "grad_norm": 0.7067362070083618, + "learning_rate": 0.00016787715383981652, + "loss": 0.5307, + "step": 74320 + }, + { + "epoch": 1.6544248575498575, + "grad_norm": 0.7421824336051941, + "learning_rate": 0.00016783113776287683, + "loss": 0.6672, + "step": 74330 + }, + { + "epoch": 1.654647435897436, + "grad_norm": 0.7069873213768005, + "learning_rate": 0.00016778512343401948, + "loss": 0.553, + "step": 74340 + }, + { + "epoch": 1.6548700142450143, + "grad_norm": 0.5692541003227234, + "learning_rate": 0.00016773911085574486, + "loss": 0.6681, + "step": 74350 + }, + { + "epoch": 1.6550925925925926, + "grad_norm": 0.5314786434173584, + "learning_rate": 0.00016769310003055338, + "loss": 0.4982, + "step": 74360 + }, + { + "epoch": 1.6553151709401708, + "grad_norm": 0.5425453186035156, + "learning_rate": 0.00016764709096094528, + "loss": 0.5796, + "step": 74370 + }, + { + "epoch": 1.6555377492877494, + "grad_norm": 0.6942946314811707, + "learning_rate": 0.00016760108364942075, + "loss": 0.5619, + "step": 74380 + }, + { + "epoch": 1.6557603276353277, + "grad_norm": 0.48159825801849365, + "learning_rate": 0.00016755507809847986, + "loss": 0.4793, + "step": 74390 + }, + { + "epoch": 1.6559829059829059, + "grad_norm": 0.4759584963321686, + "learning_rate": 0.00016750907431062258, + "loss": 0.4454, + "step": 74400 + }, + { + "epoch": 1.6562054843304843, + "grad_norm": 0.6936882138252258, + "learning_rate": 0.0001674630722883488, + "loss": 0.5632, + "step": 74410 + }, + { + "epoch": 1.6564280626780628, + "grad_norm": 0.812346875667572, + "learning_rate": 0.0001674170720341584, + "loss": 0.5834, + "step": 74420 + }, + { + "epoch": 1.656650641025641, + "grad_norm": 0.8212725520133972, + "learning_rate": 0.00016737107355055088, + "loss": 0.5352, + "step": 74430 + }, + { + "epoch": 1.6568732193732194, + "grad_norm": 0.7028321623802185, + "learning_rate": 0.00016732507684002595, + "loss": 0.4907, + "step": 74440 + }, + { + "epoch": 1.6570957977207978, + "grad_norm": 0.4733540415763855, + "learning_rate": 0.0001672790819050831, + "loss": 0.4795, + "step": 74450 + }, + { + "epoch": 1.657318376068376, + "grad_norm": 0.6220640540122986, + "learning_rate": 0.0001672330887482217, + "loss": 0.5511, + "step": 74460 + }, + { + "epoch": 1.6575409544159543, + "grad_norm": 0.5480875372886658, + "learning_rate": 0.00016718709737194106, + "loss": 0.4772, + "step": 74470 + }, + { + "epoch": 1.6577635327635327, + "grad_norm": 0.6275351643562317, + "learning_rate": 0.0001671411077787405, + "loss": 0.445, + "step": 74480 + }, + { + "epoch": 1.6579861111111112, + "grad_norm": 0.5668492317199707, + "learning_rate": 0.00016709511997111898, + "loss": 0.5913, + "step": 74490 + }, + { + "epoch": 1.6582086894586894, + "grad_norm": 0.9715355634689331, + "learning_rate": 0.00016704913395157559, + "loss": 0.5269, + "step": 74500 + }, + { + "epoch": 1.6584312678062678, + "grad_norm": 0.36021795868873596, + "learning_rate": 0.00016700314972260924, + "loss": 0.4903, + "step": 74510 + }, + { + "epoch": 1.6586538461538463, + "grad_norm": 0.6244007349014282, + "learning_rate": 0.00016695716728671873, + "loss": 0.5046, + "step": 74520 + }, + { + "epoch": 1.6588764245014245, + "grad_norm": 0.5061987638473511, + "learning_rate": 0.0001669111866464028, + "loss": 0.5313, + "step": 74530 + }, + { + "epoch": 1.6590990028490027, + "grad_norm": 0.6868669390678406, + "learning_rate": 0.00016686520780416012, + "loss": 0.679, + "step": 74540 + }, + { + "epoch": 1.6593215811965814, + "grad_norm": 0.5997483730316162, + "learning_rate": 0.00016681923076248913, + "loss": 0.5782, + "step": 74550 + }, + { + "epoch": 1.6595441595441596, + "grad_norm": 0.3696061968803406, + "learning_rate": 0.0001667732555238884, + "loss": 0.4457, + "step": 74560 + }, + { + "epoch": 1.6597667378917378, + "grad_norm": 0.4042704403400421, + "learning_rate": 0.00016672728209085605, + "loss": 0.4871, + "step": 74570 + }, + { + "epoch": 1.6599893162393162, + "grad_norm": 0.5580457448959351, + "learning_rate": 0.00016668131046589045, + "loss": 0.4977, + "step": 74580 + }, + { + "epoch": 1.6602118945868947, + "grad_norm": 0.617868185043335, + "learning_rate": 0.00016663534065148973, + "loss": 0.6508, + "step": 74590 + }, + { + "epoch": 1.6604344729344729, + "grad_norm": 0.458039790391922, + "learning_rate": 0.00016658937265015192, + "loss": 0.5306, + "step": 74600 + }, + { + "epoch": 1.6606570512820513, + "grad_norm": 0.8516250848770142, + "learning_rate": 0.0001665434064643749, + "loss": 0.5773, + "step": 74610 + }, + { + "epoch": 1.6608796296296298, + "grad_norm": 0.6809437870979309, + "learning_rate": 0.00016649744209665663, + "loss": 0.5101, + "step": 74620 + }, + { + "epoch": 1.661102207977208, + "grad_norm": 0.4385990798473358, + "learning_rate": 0.00016645147954949473, + "loss": 0.5177, + "step": 74630 + }, + { + "epoch": 1.6613247863247862, + "grad_norm": 0.38819900155067444, + "learning_rate": 0.00016640551882538693, + "loss": 0.4473, + "step": 74640 + }, + { + "epoch": 1.6615473646723646, + "grad_norm": 0.5712143778800964, + "learning_rate": 0.00016635955992683074, + "loss": 0.4045, + "step": 74650 + }, + { + "epoch": 1.661769943019943, + "grad_norm": 0.4798211455345154, + "learning_rate": 0.00016631360285632358, + "loss": 0.5643, + "step": 74660 + }, + { + "epoch": 1.6619925213675213, + "grad_norm": 0.5165737271308899, + "learning_rate": 0.0001662676476163628, + "loss": 0.6715, + "step": 74670 + }, + { + "epoch": 1.6622150997150997, + "grad_norm": 0.6299379467964172, + "learning_rate": 0.00016622169420944574, + "loss": 0.5062, + "step": 74680 + }, + { + "epoch": 1.6624376780626782, + "grad_norm": 0.5905649662017822, + "learning_rate": 0.0001661757426380695, + "loss": 0.5199, + "step": 74690 + }, + { + "epoch": 1.6626602564102564, + "grad_norm": 0.5792425274848938, + "learning_rate": 0.000166129792904731, + "loss": 0.6112, + "step": 74700 + }, + { + "epoch": 1.6628828347578346, + "grad_norm": 0.6446602940559387, + "learning_rate": 0.00016608384501192727, + "loss": 0.5768, + "step": 74710 + }, + { + "epoch": 1.6631054131054133, + "grad_norm": 0.4840184450149536, + "learning_rate": 0.00016603789896215516, + "loss": 0.4455, + "step": 74720 + }, + { + "epoch": 1.6633279914529915, + "grad_norm": 0.7535210251808167, + "learning_rate": 0.0001659919547579115, + "loss": 0.6043, + "step": 74730 + }, + { + "epoch": 1.6635505698005697, + "grad_norm": 0.552081823348999, + "learning_rate": 0.00016594601240169278, + "loss": 0.447, + "step": 74740 + }, + { + "epoch": 1.6637731481481481, + "grad_norm": 0.5401560664176941, + "learning_rate": 0.00016590007189599566, + "loss": 0.5122, + "step": 74750 + }, + { + "epoch": 1.6639957264957266, + "grad_norm": 0.6353166699409485, + "learning_rate": 0.00016585413324331658, + "loss": 0.4692, + "step": 74760 + }, + { + "epoch": 1.6642183048433048, + "grad_norm": 0.6277305483818054, + "learning_rate": 0.0001658081964461518, + "loss": 0.4863, + "step": 74770 + }, + { + "epoch": 1.6644408831908832, + "grad_norm": 0.4086846113204956, + "learning_rate": 0.00016576226150699763, + "loss": 0.4235, + "step": 74780 + }, + { + "epoch": 1.6646634615384617, + "grad_norm": 0.574285626411438, + "learning_rate": 0.00016571632842835024, + "loss": 0.6429, + "step": 74790 + }, + { + "epoch": 1.66488603988604, + "grad_norm": 0.39552202820777893, + "learning_rate": 0.00016567039721270558, + "loss": 0.4631, + "step": 74800 + }, + { + "epoch": 1.665108618233618, + "grad_norm": 0.5707929730415344, + "learning_rate": 0.0001656244678625596, + "loss": 0.5358, + "step": 74810 + }, + { + "epoch": 1.6653311965811965, + "grad_norm": 0.5050995945930481, + "learning_rate": 0.00016557854038040833, + "loss": 0.4214, + "step": 74820 + }, + { + "epoch": 1.665553774928775, + "grad_norm": 0.49576276540756226, + "learning_rate": 0.00016553261476874728, + "loss": 0.6185, + "step": 74830 + }, + { + "epoch": 1.6657763532763532, + "grad_norm": 0.5018727779388428, + "learning_rate": 0.0001654866910300721, + "loss": 0.4222, + "step": 74840 + }, + { + "epoch": 1.6659989316239316, + "grad_norm": 0.8848727941513062, + "learning_rate": 0.0001654407691668784, + "loss": 0.6535, + "step": 74850 + }, + { + "epoch": 1.66622150997151, + "grad_norm": 0.5684048533439636, + "learning_rate": 0.00016539484918166167, + "loss": 0.6015, + "step": 74860 + }, + { + "epoch": 1.6664440883190883, + "grad_norm": 0.7550521492958069, + "learning_rate": 0.00016534893107691707, + "loss": 0.607, + "step": 74870 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.5899578928947449, + "learning_rate": 0.00016530301485513996, + "loss": 0.5685, + "step": 74880 + }, + { + "epoch": 1.6668892450142452, + "grad_norm": 0.7316328883171082, + "learning_rate": 0.0001652571005188254, + "loss": 0.5741, + "step": 74890 + }, + { + "epoch": 1.6671118233618234, + "grad_norm": 0.5651301741600037, + "learning_rate": 0.0001652111880704685, + "loss": 0.5463, + "step": 74900 + }, + { + "epoch": 1.6673344017094016, + "grad_norm": 0.5351511240005493, + "learning_rate": 0.00016516527751256406, + "loss": 0.4856, + "step": 74910 + }, + { + "epoch": 1.66755698005698, + "grad_norm": 0.5877324938774109, + "learning_rate": 0.00016511936884760697, + "loss": 0.5068, + "step": 74920 + }, + { + "epoch": 1.6677795584045585, + "grad_norm": 0.40048742294311523, + "learning_rate": 0.00016507346207809195, + "loss": 0.5428, + "step": 74930 + }, + { + "epoch": 1.6680021367521367, + "grad_norm": 0.34698426723480225, + "learning_rate": 0.00016502755720651359, + "loss": 0.5051, + "step": 74940 + }, + { + "epoch": 1.6682247150997151, + "grad_norm": 0.6042134761810303, + "learning_rate": 0.00016498165423536645, + "loss": 0.5602, + "step": 74950 + }, + { + "epoch": 1.6684472934472936, + "grad_norm": 0.4832810163497925, + "learning_rate": 0.00016493575316714484, + "loss": 0.4747, + "step": 74960 + }, + { + "epoch": 1.6686698717948718, + "grad_norm": 0.5508270263671875, + "learning_rate": 0.00016488985400434314, + "loss": 0.5238, + "step": 74970 + }, + { + "epoch": 1.66889245014245, + "grad_norm": 0.4627018868923187, + "learning_rate": 0.00016484395674945545, + "loss": 0.6073, + "step": 74980 + }, + { + "epoch": 1.6691150284900285, + "grad_norm": 0.6229496002197266, + "learning_rate": 0.00016479806140497593, + "loss": 0.6461, + "step": 74990 + }, + { + "epoch": 1.669337606837607, + "grad_norm": 0.9775029420852661, + "learning_rate": 0.00016475216797339864, + "loss": 0.6489, + "step": 75000 + }, + { + "epoch": 1.6695601851851851, + "grad_norm": 0.38404151797294617, + "learning_rate": 0.0001647062764572173, + "loss": 0.5041, + "step": 75010 + }, + { + "epoch": 1.6697827635327636, + "grad_norm": 0.5900865793228149, + "learning_rate": 0.00016466038685892587, + "loss": 0.5822, + "step": 75020 + }, + { + "epoch": 1.670005341880342, + "grad_norm": 0.5986582636833191, + "learning_rate": 0.0001646144991810179, + "loss": 0.4817, + "step": 75030 + }, + { + "epoch": 1.6702279202279202, + "grad_norm": 0.6265101432800293, + "learning_rate": 0.0001645686134259871, + "loss": 0.588, + "step": 75040 + }, + { + "epoch": 1.6704504985754984, + "grad_norm": 0.46558502316474915, + "learning_rate": 0.00016452272959632675, + "loss": 0.4984, + "step": 75050 + }, + { + "epoch": 1.6706730769230769, + "grad_norm": 0.33297625184059143, + "learning_rate": 0.00016447684769453034, + "loss": 0.5982, + "step": 75060 + }, + { + "epoch": 1.6708956552706553, + "grad_norm": 0.6559587121009827, + "learning_rate": 0.00016443096772309114, + "loss": 0.5793, + "step": 75070 + }, + { + "epoch": 1.6711182336182335, + "grad_norm": 0.43765076994895935, + "learning_rate": 0.00016438508968450233, + "loss": 0.464, + "step": 75080 + }, + { + "epoch": 1.671340811965812, + "grad_norm": 0.46449562907218933, + "learning_rate": 0.00016433921358125682, + "loss": 0.5096, + "step": 75090 + }, + { + "epoch": 1.6715633903133904, + "grad_norm": 0.4868076741695404, + "learning_rate": 0.00016429333941584765, + "loss": 0.4879, + "step": 75100 + }, + { + "epoch": 1.6717859686609686, + "grad_norm": 0.5010735392570496, + "learning_rate": 0.00016424746719076764, + "loss": 0.4999, + "step": 75110 + }, + { + "epoch": 1.672008547008547, + "grad_norm": 0.5864977240562439, + "learning_rate": 0.0001642015969085095, + "loss": 0.6716, + "step": 75120 + }, + { + "epoch": 1.6722311253561255, + "grad_norm": 0.4027913808822632, + "learning_rate": 0.00016415572857156593, + "loss": 0.5468, + "step": 75130 + }, + { + "epoch": 1.6724537037037037, + "grad_norm": 0.789313793182373, + "learning_rate": 0.00016410986218242944, + "loss": 0.5943, + "step": 75140 + }, + { + "epoch": 1.672676282051282, + "grad_norm": 0.6468246579170227, + "learning_rate": 0.00016406399774359235, + "loss": 0.5485, + "step": 75150 + }, + { + "epoch": 1.6728988603988604, + "grad_norm": 0.46155524253845215, + "learning_rate": 0.00016401813525754707, + "loss": 0.4953, + "step": 75160 + }, + { + "epoch": 1.6731214387464388, + "grad_norm": 0.907846212387085, + "learning_rate": 0.00016397227472678578, + "loss": 0.5333, + "step": 75170 + }, + { + "epoch": 1.673344017094017, + "grad_norm": 0.44893550872802734, + "learning_rate": 0.0001639264161538006, + "loss": 0.5691, + "step": 75180 + }, + { + "epoch": 1.6735665954415955, + "grad_norm": 0.6367380619049072, + "learning_rate": 0.00016388055954108345, + "loss": 0.6064, + "step": 75190 + }, + { + "epoch": 1.673789173789174, + "grad_norm": 0.6882296204566956, + "learning_rate": 0.00016383470489112628, + "loss": 0.6168, + "step": 75200 + }, + { + "epoch": 1.6740117521367521, + "grad_norm": 0.6169039607048035, + "learning_rate": 0.00016378885220642093, + "loss": 0.5464, + "step": 75210 + }, + { + "epoch": 1.6742343304843303, + "grad_norm": 0.36328089237213135, + "learning_rate": 0.00016374300148945897, + "loss": 0.6335, + "step": 75220 + }, + { + "epoch": 1.6744569088319088, + "grad_norm": 0.5436282753944397, + "learning_rate": 0.00016369715274273193, + "loss": 0.5793, + "step": 75230 + }, + { + "epoch": 1.6746794871794872, + "grad_norm": 0.43563759326934814, + "learning_rate": 0.00016365130596873132, + "loss": 0.4787, + "step": 75240 + }, + { + "epoch": 1.6749020655270654, + "grad_norm": 0.603461742401123, + "learning_rate": 0.00016360546116994855, + "loss": 0.6215, + "step": 75250 + }, + { + "epoch": 1.6751246438746439, + "grad_norm": 0.5234988927841187, + "learning_rate": 0.00016355961834887479, + "loss": 0.5816, + "step": 75260 + }, + { + "epoch": 1.6753472222222223, + "grad_norm": 0.536532461643219, + "learning_rate": 0.00016351377750800116, + "loss": 0.5119, + "step": 75270 + }, + { + "epoch": 1.6755698005698005, + "grad_norm": 0.5434236526489258, + "learning_rate": 0.0001634679386498188, + "loss": 0.6079, + "step": 75280 + }, + { + "epoch": 1.6757923789173788, + "grad_norm": 0.48780888319015503, + "learning_rate": 0.0001634221017768185, + "loss": 0.5606, + "step": 75290 + }, + { + "epoch": 1.6760149572649574, + "grad_norm": 0.6546382308006287, + "learning_rate": 0.00016337626689149118, + "loss": 0.6589, + "step": 75300 + }, + { + "epoch": 1.6762375356125356, + "grad_norm": 0.37422406673431396, + "learning_rate": 0.0001633304339963275, + "loss": 0.4647, + "step": 75310 + }, + { + "epoch": 1.6764601139601139, + "grad_norm": 0.7309848666191101, + "learning_rate": 0.000163284603093818, + "loss": 0.5384, + "step": 75320 + }, + { + "epoch": 1.6766826923076923, + "grad_norm": 0.4979402422904968, + "learning_rate": 0.00016323877418645327, + "loss": 0.5439, + "step": 75330 + }, + { + "epoch": 1.6769052706552707, + "grad_norm": 0.48228442668914795, + "learning_rate": 0.0001631929472767237, + "loss": 0.642, + "step": 75340 + }, + { + "epoch": 1.677127849002849, + "grad_norm": 0.7119601368904114, + "learning_rate": 0.00016314712236711944, + "loss": 0.5524, + "step": 75350 + }, + { + "epoch": 1.6773504273504274, + "grad_norm": 0.8681598901748657, + "learning_rate": 0.00016310129946013073, + "loss": 0.7036, + "step": 75360 + }, + { + "epoch": 1.6775730056980058, + "grad_norm": 0.754706859588623, + "learning_rate": 0.0001630554785582476, + "loss": 0.5865, + "step": 75370 + }, + { + "epoch": 1.677795584045584, + "grad_norm": 0.41640523076057434, + "learning_rate": 0.00016300965966396, + "loss": 0.8015, + "step": 75380 + }, + { + "epoch": 1.6780181623931623, + "grad_norm": 0.6275506615638733, + "learning_rate": 0.00016296384277975784, + "loss": 0.5445, + "step": 75390 + }, + { + "epoch": 1.6782407407407407, + "grad_norm": 0.5284397602081299, + "learning_rate": 0.0001629180279081307, + "loss": 0.6416, + "step": 75400 + }, + { + "epoch": 1.6784633190883191, + "grad_norm": 0.42982757091522217, + "learning_rate": 0.00016287221505156832, + "loss": 0.6049, + "step": 75410 + }, + { + "epoch": 1.6786858974358974, + "grad_norm": 0.4871768355369568, + "learning_rate": 0.00016282640421256018, + "loss": 0.4916, + "step": 75420 + }, + { + "epoch": 1.6789084757834758, + "grad_norm": 0.6214765310287476, + "learning_rate": 0.00016278059539359563, + "loss": 0.4929, + "step": 75430 + }, + { + "epoch": 1.6791310541310542, + "grad_norm": 0.7499983310699463, + "learning_rate": 0.000162734788597164, + "loss": 0.5846, + "step": 75440 + }, + { + "epoch": 1.6793536324786325, + "grad_norm": 0.6454204320907593, + "learning_rate": 0.0001626889838257545, + "loss": 0.532, + "step": 75450 + }, + { + "epoch": 1.6795762108262107, + "grad_norm": 0.7106652855873108, + "learning_rate": 0.00016264318108185616, + "loss": 0.476, + "step": 75460 + }, + { + "epoch": 1.6797987891737893, + "grad_norm": 0.6261777281761169, + "learning_rate": 0.00016259738036795797, + "loss": 0.6637, + "step": 75470 + }, + { + "epoch": 1.6800213675213675, + "grad_norm": 0.4820646643638611, + "learning_rate": 0.00016255158168654868, + "loss": 0.5294, + "step": 75480 + }, + { + "epoch": 1.6801994301994303, + "eval_loss": 0.5559062361717224, + "eval_runtime": 337.5129, + "eval_samples_per_second": 7.007, + "eval_steps_per_second": 7.007, + "step": 75488 + }, + { + "epoch": 1.6802439458689458, + "grad_norm": 0.5208863615989685, + "learning_rate": 0.00016250578504011713, + "loss": 0.4434, + "step": 75490 + }, + { + "epoch": 1.6804665242165242, + "grad_norm": 0.3657657504081726, + "learning_rate": 0.0001624599904311519, + "loss": 0.4016, + "step": 75500 + }, + { + "epoch": 1.6806891025641026, + "grad_norm": 0.837756335735321, + "learning_rate": 0.00016241419786214154, + "loss": 0.5181, + "step": 75510 + }, + { + "epoch": 1.6809116809116809, + "grad_norm": 0.5618343353271484, + "learning_rate": 0.00016236840733557442, + "loss": 0.6338, + "step": 75520 + }, + { + "epoch": 1.6811342592592593, + "grad_norm": 0.44379135966300964, + "learning_rate": 0.00016232261885393886, + "loss": 0.573, + "step": 75530 + }, + { + "epoch": 1.6813568376068377, + "grad_norm": 0.4937328100204468, + "learning_rate": 0.000162276832419723, + "loss": 0.4523, + "step": 75540 + }, + { + "epoch": 1.681579415954416, + "grad_norm": 0.6280141472816467, + "learning_rate": 0.00016223104803541501, + "loss": 0.5038, + "step": 75550 + }, + { + "epoch": 1.6818019943019942, + "grad_norm": 0.43941378593444824, + "learning_rate": 0.00016218526570350279, + "loss": 0.638, + "step": 75560 + }, + { + "epoch": 1.6820245726495726, + "grad_norm": 0.5391653776168823, + "learning_rate": 0.00016213948542647414, + "loss": 0.5817, + "step": 75570 + }, + { + "epoch": 1.682247150997151, + "grad_norm": 0.3680667579174042, + "learning_rate": 0.00016209370720681685, + "loss": 0.5353, + "step": 75580 + }, + { + "epoch": 1.6824697293447293, + "grad_norm": 0.386249303817749, + "learning_rate": 0.00016204793104701858, + "loss": 0.4681, + "step": 75590 + }, + { + "epoch": 1.6826923076923077, + "grad_norm": 0.4776840806007385, + "learning_rate": 0.0001620021569495668, + "loss": 0.5674, + "step": 75600 + }, + { + "epoch": 1.6829148860398861, + "grad_norm": 0.6500211358070374, + "learning_rate": 0.00016195638491694892, + "loss": 0.4516, + "step": 75610 + }, + { + "epoch": 1.6831374643874644, + "grad_norm": 0.4960470199584961, + "learning_rate": 0.0001619106149516522, + "loss": 0.5763, + "step": 75620 + }, + { + "epoch": 1.6833600427350426, + "grad_norm": 0.4113209545612335, + "learning_rate": 0.00016186484705616385, + "loss": 0.4562, + "step": 75630 + }, + { + "epoch": 1.6835826210826212, + "grad_norm": 0.5733873844146729, + "learning_rate": 0.00016181908123297096, + "loss": 0.5481, + "step": 75640 + }, + { + "epoch": 1.6838051994301995, + "grad_norm": 0.592670738697052, + "learning_rate": 0.0001617733174845604, + "loss": 0.5587, + "step": 75650 + }, + { + "epoch": 1.6840277777777777, + "grad_norm": 0.5716107487678528, + "learning_rate": 0.00016172755581341907, + "loss": 0.5366, + "step": 75660 + }, + { + "epoch": 1.6842503561253561, + "grad_norm": 0.635908305644989, + "learning_rate": 0.00016168179622203375, + "loss": 0.4558, + "step": 75670 + }, + { + "epoch": 1.6844729344729346, + "grad_norm": 0.29668745398521423, + "learning_rate": 0.00016163603871289093, + "loss": 0.5495, + "step": 75680 + }, + { + "epoch": 1.6846955128205128, + "grad_norm": 0.34001338481903076, + "learning_rate": 0.00016159028328847715, + "loss": 0.4745, + "step": 75690 + }, + { + "epoch": 1.6849180911680912, + "grad_norm": 1.0909297466278076, + "learning_rate": 0.00016154452995127892, + "loss": 0.6351, + "step": 75700 + }, + { + "epoch": 1.6851406695156697, + "grad_norm": 0.8510547280311584, + "learning_rate": 0.00016149877870378233, + "loss": 0.578, + "step": 75710 + }, + { + "epoch": 1.6853632478632479, + "grad_norm": 0.5867220163345337, + "learning_rate": 0.00016145302954847363, + "loss": 0.5132, + "step": 75720 + }, + { + "epoch": 1.685585826210826, + "grad_norm": 0.5249475836753845, + "learning_rate": 0.0001614072824878389, + "loss": 0.6319, + "step": 75730 + }, + { + "epoch": 1.6858084045584045, + "grad_norm": 0.6304720640182495, + "learning_rate": 0.00016136153752436405, + "loss": 0.5996, + "step": 75740 + }, + { + "epoch": 1.686030982905983, + "grad_norm": 0.4601406753063202, + "learning_rate": 0.00016131579466053487, + "loss": 0.4823, + "step": 75750 + }, + { + "epoch": 1.6862535612535612, + "grad_norm": 0.3753646910190582, + "learning_rate": 0.00016127005389883703, + "loss": 0.6404, + "step": 75760 + }, + { + "epoch": 1.6864761396011396, + "grad_norm": 0.5258237719535828, + "learning_rate": 0.00016122431524175616, + "loss": 0.6586, + "step": 75770 + }, + { + "epoch": 1.686698717948718, + "grad_norm": 0.4938367009162903, + "learning_rate": 0.00016117857869177781, + "loss": 0.5113, + "step": 75780 + }, + { + "epoch": 1.6869212962962963, + "grad_norm": 0.674801230430603, + "learning_rate": 0.00016113284425138723, + "loss": 0.4846, + "step": 75790 + }, + { + "epoch": 1.6871438746438745, + "grad_norm": 0.606334924697876, + "learning_rate": 0.0001610871119230697, + "loss": 0.4629, + "step": 75800 + }, + { + "epoch": 1.687366452991453, + "grad_norm": 1.3602806329727173, + "learning_rate": 0.00016104138170931038, + "loss": 0.5608, + "step": 75810 + }, + { + "epoch": 1.6875890313390314, + "grad_norm": 0.6526398062705994, + "learning_rate": 0.00016099565361259426, + "loss": 0.7189, + "step": 75820 + }, + { + "epoch": 1.6878116096866096, + "grad_norm": 0.7846774458885193, + "learning_rate": 0.00016094992763540625, + "loss": 0.6107, + "step": 75830 + }, + { + "epoch": 1.688034188034188, + "grad_norm": 0.6672149896621704, + "learning_rate": 0.00016090420378023118, + "loss": 0.4595, + "step": 75840 + }, + { + "epoch": 1.6882567663817665, + "grad_norm": 1.1179325580596924, + "learning_rate": 0.00016085848204955365, + "loss": 0.4434, + "step": 75850 + }, + { + "epoch": 1.6884793447293447, + "grad_norm": 0.6847007870674133, + "learning_rate": 0.00016081276244585824, + "loss": 0.6622, + "step": 75860 + }, + { + "epoch": 1.6887019230769231, + "grad_norm": 0.596674382686615, + "learning_rate": 0.00016076704497162948, + "loss": 0.4638, + "step": 75870 + }, + { + "epoch": 1.6889245014245016, + "grad_norm": 0.7307976484298706, + "learning_rate": 0.00016072132962935155, + "loss": 0.5899, + "step": 75880 + }, + { + "epoch": 1.6891470797720798, + "grad_norm": 0.5637958645820618, + "learning_rate": 0.00016067561642150871, + "loss": 0.6062, + "step": 75890 + }, + { + "epoch": 1.689369658119658, + "grad_norm": 0.528197169303894, + "learning_rate": 0.00016062990535058505, + "loss": 0.5993, + "step": 75900 + }, + { + "epoch": 1.6895922364672364, + "grad_norm": 0.5327113270759583, + "learning_rate": 0.0001605841964190646, + "loss": 0.5893, + "step": 75910 + }, + { + "epoch": 1.6898148148148149, + "grad_norm": 0.5732520818710327, + "learning_rate": 0.00016053848962943118, + "loss": 0.556, + "step": 75920 + }, + { + "epoch": 1.690037393162393, + "grad_norm": 0.40062829852104187, + "learning_rate": 0.00016049278498416852, + "loss": 0.5818, + "step": 75930 + }, + { + "epoch": 1.6902599715099715, + "grad_norm": 0.4862927198410034, + "learning_rate": 0.00016044708248576028, + "loss": 0.5753, + "step": 75940 + }, + { + "epoch": 1.69048254985755, + "grad_norm": 0.5803338289260864, + "learning_rate": 0.00016040138213668995, + "loss": 0.6473, + "step": 75950 + }, + { + "epoch": 1.6907051282051282, + "grad_norm": 0.47554415464401245, + "learning_rate": 0.00016035568393944094, + "loss": 0.4034, + "step": 75960 + }, + { + "epoch": 1.6909277065527064, + "grad_norm": 0.5601959824562073, + "learning_rate": 0.00016030998789649649, + "loss": 0.6176, + "step": 75970 + }, + { + "epoch": 1.6911502849002849, + "grad_norm": 0.5545170903205872, + "learning_rate": 0.00016026429401033983, + "loss": 0.4686, + "step": 75980 + }, + { + "epoch": 1.6913728632478633, + "grad_norm": 0.45020928978919983, + "learning_rate": 0.00016021860228345396, + "loss": 0.4481, + "step": 75990 + }, + { + "epoch": 1.6915954415954415, + "grad_norm": 0.35779350996017456, + "learning_rate": 0.00016017291271832183, + "loss": 0.4946, + "step": 76000 + }, + { + "epoch": 1.69181801994302, + "grad_norm": 0.7461297512054443, + "learning_rate": 0.0001601272253174262, + "loss": 0.5929, + "step": 76010 + }, + { + "epoch": 1.6920405982905984, + "grad_norm": 0.39829355478286743, + "learning_rate": 0.00016008154008324976, + "loss": 0.5274, + "step": 76020 + }, + { + "epoch": 1.6922631766381766, + "grad_norm": 0.4320659935474396, + "learning_rate": 0.00016003585701827516, + "loss": 0.594, + "step": 76030 + }, + { + "epoch": 1.6924857549857548, + "grad_norm": 0.6035652756690979, + "learning_rate": 0.00015999017612498476, + "loss": 0.4655, + "step": 76040 + }, + { + "epoch": 1.6927083333333335, + "grad_norm": 0.4487076997756958, + "learning_rate": 0.00015994449740586094, + "loss": 0.5315, + "step": 76050 + }, + { + "epoch": 1.6929309116809117, + "grad_norm": 0.5057926774024963, + "learning_rate": 0.00015989882086338598, + "loss": 0.5373, + "step": 76060 + }, + { + "epoch": 1.69315349002849, + "grad_norm": 0.5225903987884521, + "learning_rate": 0.00015985314650004186, + "loss": 0.4748, + "step": 76070 + }, + { + "epoch": 1.6933760683760684, + "grad_norm": 0.7291542291641235, + "learning_rate": 0.00015980747431831063, + "loss": 0.5516, + "step": 76080 + }, + { + "epoch": 1.6935986467236468, + "grad_norm": 0.642415463924408, + "learning_rate": 0.0001597618043206742, + "loss": 0.442, + "step": 76090 + }, + { + "epoch": 1.693821225071225, + "grad_norm": 0.45231255888938904, + "learning_rate": 0.0001597161365096142, + "loss": 0.5974, + "step": 76100 + }, + { + "epoch": 1.6940438034188035, + "grad_norm": 0.7588939070701599, + "learning_rate": 0.00015967047088761234, + "loss": 0.6961, + "step": 76110 + }, + { + "epoch": 1.694266381766382, + "grad_norm": 0.6613054871559143, + "learning_rate": 0.00015962480745715016, + "loss": 0.6047, + "step": 76120 + }, + { + "epoch": 1.69448896011396, + "grad_norm": 0.7823227643966675, + "learning_rate": 0.00015957914622070901, + "loss": 0.5886, + "step": 76130 + }, + { + "epoch": 1.6947115384615383, + "grad_norm": 0.7325944304466248, + "learning_rate": 0.00015953348718077007, + "loss": 0.6105, + "step": 76140 + }, + { + "epoch": 1.6949341168091168, + "grad_norm": 0.5065510869026184, + "learning_rate": 0.00015948783033981458, + "loss": 0.4539, + "step": 76150 + }, + { + "epoch": 1.6951566951566952, + "grad_norm": 0.5566450953483582, + "learning_rate": 0.00015944217570032358, + "loss": 0.6209, + "step": 76160 + }, + { + "epoch": 1.6953792735042734, + "grad_norm": 0.5529851317405701, + "learning_rate": 0.00015939652326477793, + "loss": 0.5442, + "step": 76170 + }, + { + "epoch": 1.6956018518518519, + "grad_norm": 0.6417269706726074, + "learning_rate": 0.0001593508730356584, + "loss": 0.5299, + "step": 76180 + }, + { + "epoch": 1.6958244301994303, + "grad_norm": 0.5870069265365601, + "learning_rate": 0.00015930522501544575, + "loss": 0.5943, + "step": 76190 + }, + { + "epoch": 1.6960470085470085, + "grad_norm": 0.8581626415252686, + "learning_rate": 0.00015925957920662052, + "loss": 0.5854, + "step": 76200 + }, + { + "epoch": 1.6962695868945867, + "grad_norm": 0.5439938306808472, + "learning_rate": 0.00015921393561166308, + "loss": 0.4965, + "step": 76210 + }, + { + "epoch": 1.6964921652421654, + "grad_norm": 0.6037572026252747, + "learning_rate": 0.00015916829423305372, + "loss": 0.4593, + "step": 76220 + }, + { + "epoch": 1.6967147435897436, + "grad_norm": 0.40113234519958496, + "learning_rate": 0.00015912265507327277, + "loss": 0.3858, + "step": 76230 + }, + { + "epoch": 1.6969373219373218, + "grad_norm": 0.29176080226898193, + "learning_rate": 0.00015907701813480015, + "loss": 0.4355, + "step": 76240 + }, + { + "epoch": 1.6971599002849003, + "grad_norm": 0.5197509527206421, + "learning_rate": 0.00015903138342011585, + "loss": 0.5374, + "step": 76250 + }, + { + "epoch": 1.6973824786324787, + "grad_norm": 0.6022807955741882, + "learning_rate": 0.0001589857509316998, + "loss": 0.6799, + "step": 76260 + }, + { + "epoch": 1.697605056980057, + "grad_norm": 0.781623125076294, + "learning_rate": 0.00015894012067203158, + "loss": 0.514, + "step": 76270 + }, + { + "epoch": 1.6978276353276354, + "grad_norm": 0.4291946291923523, + "learning_rate": 0.00015889449264359077, + "loss": 0.6303, + "step": 76280 + }, + { + "epoch": 1.6980502136752138, + "grad_norm": 0.49008262157440186, + "learning_rate": 0.0001588488668488569, + "loss": 0.5911, + "step": 76290 + }, + { + "epoch": 1.698272792022792, + "grad_norm": 0.6763221621513367, + "learning_rate": 0.00015880324329030934, + "loss": 0.5334, + "step": 76300 + }, + { + "epoch": 1.6984953703703702, + "grad_norm": 0.4447299838066101, + "learning_rate": 0.0001587576219704272, + "loss": 0.4779, + "step": 76310 + }, + { + "epoch": 1.6987179487179487, + "grad_norm": 0.8476657271385193, + "learning_rate": 0.00015871200289168966, + "loss": 0.491, + "step": 76320 + }, + { + "epoch": 1.6989405270655271, + "grad_norm": 0.5236039161682129, + "learning_rate": 0.00015866638605657567, + "loss": 0.5248, + "step": 76330 + }, + { + "epoch": 1.6991631054131053, + "grad_norm": 0.9949755072593689, + "learning_rate": 0.00015862077146756414, + "loss": 0.5585, + "step": 76340 + }, + { + "epoch": 1.6993856837606838, + "grad_norm": 0.5810121297836304, + "learning_rate": 0.00015857515912713372, + "loss": 0.4416, + "step": 76350 + }, + { + "epoch": 1.6996082621082622, + "grad_norm": 0.47213947772979736, + "learning_rate": 0.0001585295490377631, + "loss": 0.5593, + "step": 76360 + }, + { + "epoch": 1.6998308404558404, + "grad_norm": 0.5209132432937622, + "learning_rate": 0.00015848394120193072, + "loss": 0.5851, + "step": 76370 + }, + { + "epoch": 1.7000534188034186, + "grad_norm": 0.42960062623023987, + "learning_rate": 0.00015843833562211496, + "loss": 0.4306, + "step": 76380 + }, + { + "epoch": 1.7002759971509973, + "grad_norm": 0.7299292683601379, + "learning_rate": 0.00015839273230079415, + "loss": 0.4564, + "step": 76390 + }, + { + "epoch": 1.7004985754985755, + "grad_norm": 0.5951300263404846, + "learning_rate": 0.00015834713124044622, + "loss": 0.6025, + "step": 76400 + }, + { + "epoch": 1.7007211538461537, + "grad_norm": 0.7156624794006348, + "learning_rate": 0.00015830153244354933, + "loss": 0.5785, + "step": 76410 + }, + { + "epoch": 1.7009437321937322, + "grad_norm": 0.3453843295574188, + "learning_rate": 0.00015825593591258126, + "loss": 0.5587, + "step": 76420 + }, + { + "epoch": 1.7011663105413106, + "grad_norm": 0.5409964323043823, + "learning_rate": 0.0001582103416500198, + "loss": 0.4189, + "step": 76430 + }, + { + "epoch": 1.7013888888888888, + "grad_norm": 0.484016478061676, + "learning_rate": 0.00015816474965834264, + "loss": 0.5967, + "step": 76440 + }, + { + "epoch": 1.7016114672364673, + "grad_norm": 0.6141249537467957, + "learning_rate": 0.0001581191599400272, + "loss": 0.5393, + "step": 76450 + }, + { + "epoch": 1.7018340455840457, + "grad_norm": 0.36540478467941284, + "learning_rate": 0.0001580735724975509, + "loss": 0.5607, + "step": 76460 + }, + { + "epoch": 1.702056623931624, + "grad_norm": 0.7449799180030823, + "learning_rate": 0.00015802798733339094, + "loss": 0.5529, + "step": 76470 + }, + { + "epoch": 1.7022792022792022, + "grad_norm": 0.609114944934845, + "learning_rate": 0.00015798240445002458, + "loss": 0.5248, + "step": 76480 + }, + { + "epoch": 1.7025017806267806, + "grad_norm": 0.5341273546218872, + "learning_rate": 0.00015793682384992872, + "loss": 0.4829, + "step": 76490 + }, + { + "epoch": 1.702724358974359, + "grad_norm": 0.546101450920105, + "learning_rate": 0.0001578912455355803, + "loss": 0.563, + "step": 76500 + }, + { + "epoch": 1.7029469373219372, + "grad_norm": 0.4574396014213562, + "learning_rate": 0.00015784566950945608, + "loss": 0.4483, + "step": 76510 + }, + { + "epoch": 1.7031695156695157, + "grad_norm": 0.3588087260723114, + "learning_rate": 0.00015780009577403276, + "loss": 0.5059, + "step": 76520 + }, + { + "epoch": 1.7033920940170941, + "grad_norm": 0.6983396410942078, + "learning_rate": 0.0001577545243317867, + "loss": 0.5204, + "step": 76530 + }, + { + "epoch": 1.7036146723646723, + "grad_norm": 0.6309730410575867, + "learning_rate": 0.00015770895518519434, + "loss": 0.5282, + "step": 76540 + }, + { + "epoch": 1.7038372507122506, + "grad_norm": 0.6660280227661133, + "learning_rate": 0.00015766338833673205, + "loss": 0.4507, + "step": 76550 + }, + { + "epoch": 1.7040598290598292, + "grad_norm": 0.7245699167251587, + "learning_rate": 0.00015761782378887585, + "loss": 0.3934, + "step": 76560 + }, + { + "epoch": 1.7042824074074074, + "grad_norm": 0.48217499256134033, + "learning_rate": 0.00015757226154410175, + "loss": 0.6225, + "step": 76570 + }, + { + "epoch": 1.7045049857549857, + "grad_norm": 0.5108203887939453, + "learning_rate": 0.0001575267016048858, + "loss": 0.6156, + "step": 76580 + }, + { + "epoch": 1.704727564102564, + "grad_norm": 0.4934636354446411, + "learning_rate": 0.0001574811439737036, + "loss": 0.4839, + "step": 76590 + }, + { + "epoch": 1.7049501424501425, + "grad_norm": 0.4846111834049225, + "learning_rate": 0.00015743558865303082, + "loss": 0.5209, + "step": 76600 + }, + { + "epoch": 1.7051727207977208, + "grad_norm": 0.8412664532661438, + "learning_rate": 0.00015739003564534305, + "loss": 0.6843, + "step": 76610 + }, + { + "epoch": 1.7053952991452992, + "grad_norm": 0.5684376955032349, + "learning_rate": 0.00015734448495311558, + "loss": 0.4732, + "step": 76620 + }, + { + "epoch": 1.7056178774928776, + "grad_norm": 2.491905689239502, + "learning_rate": 0.00015729893657882374, + "loss": 0.4859, + "step": 76630 + }, + { + "epoch": 1.7058404558404558, + "grad_norm": 0.6867818236351013, + "learning_rate": 0.00015725339052494262, + "loss": 0.653, + "step": 76640 + }, + { + "epoch": 1.706063034188034, + "grad_norm": 0.5544010996818542, + "learning_rate": 0.00015720784679394733, + "loss": 0.5295, + "step": 76650 + }, + { + "epoch": 1.7062856125356125, + "grad_norm": 0.4254001975059509, + "learning_rate": 0.00015716230538831264, + "loss": 0.6488, + "step": 76660 + }, + { + "epoch": 1.706508190883191, + "grad_norm": 0.7363816499710083, + "learning_rate": 0.00015711676631051331, + "loss": 0.5307, + "step": 76670 + }, + { + "epoch": 1.7067307692307692, + "grad_norm": 0.7330361008644104, + "learning_rate": 0.000157071229563024, + "loss": 0.5654, + "step": 76680 + }, + { + "epoch": 1.7069533475783476, + "grad_norm": 0.5299323201179504, + "learning_rate": 0.00015702569514831926, + "loss": 0.5194, + "step": 76690 + }, + { + "epoch": 1.707175925925926, + "grad_norm": 0.5326554179191589, + "learning_rate": 0.00015698016306887338, + "loss": 0.6502, + "step": 76700 + }, + { + "epoch": 1.7073985042735043, + "grad_norm": 0.5645008087158203, + "learning_rate": 0.00015693463332716067, + "loss": 0.5256, + "step": 76710 + }, + { + "epoch": 1.7076210826210825, + "grad_norm": 0.574365496635437, + "learning_rate": 0.0001568891059256553, + "loss": 0.4167, + "step": 76720 + }, + { + "epoch": 1.707843660968661, + "grad_norm": 0.759273886680603, + "learning_rate": 0.0001568435808668311, + "loss": 0.57, + "step": 76730 + }, + { + "epoch": 1.7080662393162394, + "grad_norm": 0.6679409146308899, + "learning_rate": 0.00015679805815316212, + "loss": 0.5119, + "step": 76740 + }, + { + "epoch": 1.7082888176638176, + "grad_norm": 0.4140261113643646, + "learning_rate": 0.000156752537787122, + "loss": 0.5053, + "step": 76750 + }, + { + "epoch": 1.708511396011396, + "grad_norm": 0.6662968397140503, + "learning_rate": 0.00015670701977118438, + "loss": 0.4888, + "step": 76760 + }, + { + "epoch": 1.7087339743589745, + "grad_norm": 0.47943583130836487, + "learning_rate": 0.00015666150410782276, + "loss": 0.4513, + "step": 76770 + }, + { + "epoch": 1.7089565527065527, + "grad_norm": 0.6128782629966736, + "learning_rate": 0.00015661599079951045, + "loss": 0.5287, + "step": 76780 + }, + { + "epoch": 1.709179131054131, + "grad_norm": 0.6315615773200989, + "learning_rate": 0.00015657047984872082, + "loss": 0.4315, + "step": 76790 + }, + { + "epoch": 1.7094017094017095, + "grad_norm": 0.5509540438652039, + "learning_rate": 0.0001565249712579268, + "loss": 0.5383, + "step": 76800 + }, + { + "epoch": 1.7096242877492878, + "grad_norm": 0.4812536835670471, + "learning_rate": 0.00015647946502960142, + "loss": 0.5343, + "step": 76810 + }, + { + "epoch": 1.709846866096866, + "grad_norm": 0.6319760680198669, + "learning_rate": 0.0001564339611662175, + "loss": 0.4889, + "step": 76820 + }, + { + "epoch": 1.7100694444444444, + "grad_norm": 0.7118447422981262, + "learning_rate": 0.0001563884596702479, + "loss": 0.5909, + "step": 76830 + }, + { + "epoch": 1.7102920227920229, + "grad_norm": 0.6964597702026367, + "learning_rate": 0.00015634296054416503, + "loss": 0.4947, + "step": 76840 + }, + { + "epoch": 1.710514601139601, + "grad_norm": 0.5916907787322998, + "learning_rate": 0.0001562974637904414, + "loss": 0.5048, + "step": 76850 + }, + { + "epoch": 1.7107371794871795, + "grad_norm": 0.5401000380516052, + "learning_rate": 0.00015625196941154943, + "loss": 0.6055, + "step": 76860 + }, + { + "epoch": 1.710959757834758, + "grad_norm": 0.6075836420059204, + "learning_rate": 0.0001562064774099612, + "loss": 0.5337, + "step": 76870 + }, + { + "epoch": 1.7111823361823362, + "grad_norm": 0.6079210638999939, + "learning_rate": 0.00015616098778814885, + "loss": 0.5166, + "step": 76880 + }, + { + "epoch": 1.7114049145299144, + "grad_norm": 0.48606622219085693, + "learning_rate": 0.00015611550054858437, + "loss": 0.5265, + "step": 76890 + }, + { + "epoch": 1.7116274928774928, + "grad_norm": 0.6630349159240723, + "learning_rate": 0.00015607001569373945, + "loss": 0.5691, + "step": 76900 + }, + { + "epoch": 1.7118500712250713, + "grad_norm": 0.5666025280952454, + "learning_rate": 0.00015602453322608584, + "loss": 0.5278, + "step": 76910 + }, + { + "epoch": 1.7120726495726495, + "grad_norm": 0.6412245035171509, + "learning_rate": 0.00015597905314809518, + "loss": 0.5876, + "step": 76920 + }, + { + "epoch": 1.712295227920228, + "grad_norm": 0.5317904353141785, + "learning_rate": 0.00015593357546223873, + "loss": 0.5432, + "step": 76930 + }, + { + "epoch": 1.7125178062678064, + "grad_norm": 0.5867559909820557, + "learning_rate": 0.00015588810017098793, + "loss": 0.5655, + "step": 76940 + }, + { + "epoch": 1.7127403846153846, + "grad_norm": 0.5823548436164856, + "learning_rate": 0.00015584262727681377, + "loss": 0.4988, + "step": 76950 + }, + { + "epoch": 1.7129629629629628, + "grad_norm": 0.8102551102638245, + "learning_rate": 0.00015579715678218744, + "loss": 0.6317, + "step": 76960 + }, + { + "epoch": 1.7131855413105415, + "grad_norm": 0.5633259415626526, + "learning_rate": 0.00015575168868957984, + "loss": 0.6443, + "step": 76970 + }, + { + "epoch": 1.7134081196581197, + "grad_norm": 0.4515515863895416, + "learning_rate": 0.00015570622300146165, + "loss": 0.4327, + "step": 76980 + }, + { + "epoch": 1.713630698005698, + "grad_norm": 0.5940386652946472, + "learning_rate": 0.00015566075972030355, + "loss": 0.5207, + "step": 76990 + }, + { + "epoch": 1.7138532763532763, + "grad_norm": 0.8212336301803589, + "learning_rate": 0.00015561529884857613, + "loss": 0.5504, + "step": 77000 + }, + { + "epoch": 1.7140758547008548, + "grad_norm": 0.6525058150291443, + "learning_rate": 0.00015556984038874965, + "loss": 0.4645, + "step": 77010 + }, + { + "epoch": 1.714298433048433, + "grad_norm": 0.8459281325340271, + "learning_rate": 0.00015552438434329445, + "loss": 0.6111, + "step": 77020 + }, + { + "epoch": 1.7145210113960114, + "grad_norm": 0.7160471677780151, + "learning_rate": 0.0001554789307146806, + "loss": 0.527, + "step": 77030 + }, + { + "epoch": 1.7147435897435899, + "grad_norm": 0.557977020740509, + "learning_rate": 0.0001554334795053781, + "loss": 0.6238, + "step": 77040 + }, + { + "epoch": 1.714966168091168, + "grad_norm": 0.45475080609321594, + "learning_rate": 0.00015538803071785687, + "loss": 0.5713, + "step": 77050 + }, + { + "epoch": 1.7151887464387463, + "grad_norm": 0.795819103717804, + "learning_rate": 0.00015534258435458652, + "loss": 0.5447, + "step": 77060 + }, + { + "epoch": 1.7154113247863247, + "grad_norm": 0.5533003211021423, + "learning_rate": 0.00015529714041803664, + "loss": 0.5369, + "step": 77070 + }, + { + "epoch": 1.7156339031339032, + "grad_norm": 0.7124785780906677, + "learning_rate": 0.0001552516989106768, + "loss": 0.5782, + "step": 77080 + }, + { + "epoch": 1.7158564814814814, + "grad_norm": 0.4880826771259308, + "learning_rate": 0.00015520625983497628, + "loss": 0.3975, + "step": 77090 + }, + { + "epoch": 1.7160790598290598, + "grad_norm": 0.6201895475387573, + "learning_rate": 0.0001551608231934042, + "loss": 0.5766, + "step": 77100 + }, + { + "epoch": 1.7163016381766383, + "grad_norm": 0.6869056820869446, + "learning_rate": 0.0001551153889884298, + "loss": 0.4937, + "step": 77110 + }, + { + "epoch": 1.7165242165242165, + "grad_norm": 0.6806768774986267, + "learning_rate": 0.00015506995722252184, + "loss": 0.5843, + "step": 77120 + }, + { + "epoch": 1.7167467948717947, + "grad_norm": 0.47910743951797485, + "learning_rate": 0.00015502452789814918, + "loss": 0.4628, + "step": 77130 + }, + { + "epoch": 1.7169693732193734, + "grad_norm": 0.6489134430885315, + "learning_rate": 0.00015497910101778056, + "loss": 0.5903, + "step": 77140 + }, + { + "epoch": 1.7171919515669516, + "grad_norm": 0.654880702495575, + "learning_rate": 0.00015493367658388438, + "loss": 0.6535, + "step": 77150 + }, + { + "epoch": 1.7174145299145298, + "grad_norm": 0.4657239317893982, + "learning_rate": 0.0001548882545989291, + "loss": 0.536, + "step": 77160 + }, + { + "epoch": 1.7176371082621082, + "grad_norm": 0.4619198739528656, + "learning_rate": 0.00015484283506538303, + "loss": 0.6001, + "step": 77170 + }, + { + "epoch": 1.7178596866096867, + "grad_norm": 0.8681321144104004, + "learning_rate": 0.0001547974179857143, + "loss": 0.5092, + "step": 77180 + }, + { + "epoch": 1.718082264957265, + "grad_norm": 0.408426433801651, + "learning_rate": 0.00015475200336239088, + "loss": 0.6088, + "step": 77190 + }, + { + "epoch": 1.7183048433048433, + "grad_norm": 0.3688236474990845, + "learning_rate": 0.0001547065911978806, + "loss": 0.5477, + "step": 77200 + }, + { + "epoch": 1.7185274216524218, + "grad_norm": 0.5153622627258301, + "learning_rate": 0.0001546611814946512, + "loss": 0.4702, + "step": 77210 + }, + { + "epoch": 1.71875, + "grad_norm": 0.40111953020095825, + "learning_rate": 0.00015461577425517038, + "loss": 0.4189, + "step": 77220 + }, + { + "epoch": 1.7189725783475782, + "grad_norm": 0.7101040482521057, + "learning_rate": 0.0001545703694819055, + "loss": 0.5583, + "step": 77230 + }, + { + "epoch": 1.7191951566951567, + "grad_norm": 0.6449742317199707, + "learning_rate": 0.00015452496717732392, + "loss": 0.5873, + "step": 77240 + }, + { + "epoch": 1.719417735042735, + "grad_norm": 0.4655434489250183, + "learning_rate": 0.0001544795673438929, + "loss": 0.5781, + "step": 77250 + }, + { + "epoch": 1.7196403133903133, + "grad_norm": 0.32710781693458557, + "learning_rate": 0.00015443416998407942, + "loss": 0.4204, + "step": 77260 + }, + { + "epoch": 1.7198628917378918, + "grad_norm": 0.6005839705467224, + "learning_rate": 0.00015438877510035043, + "loss": 0.5271, + "step": 77270 + }, + { + "epoch": 1.7200854700854702, + "grad_norm": 0.4636354148387909, + "learning_rate": 0.0001543433826951728, + "loss": 0.596, + "step": 77280 + }, + { + "epoch": 1.7203080484330484, + "grad_norm": 0.47903260588645935, + "learning_rate": 0.0001542979927710131, + "loss": 0.6041, + "step": 77290 + }, + { + "epoch": 1.7205306267806266, + "grad_norm": 0.5806395411491394, + "learning_rate": 0.00015425260533033784, + "loss": 0.5057, + "step": 77300 + }, + { + "epoch": 1.7207532051282053, + "grad_norm": 0.5809629559516907, + "learning_rate": 0.00015420722037561357, + "loss": 0.5285, + "step": 77310 + }, + { + "epoch": 1.7209757834757835, + "grad_norm": 0.5405491590499878, + "learning_rate": 0.00015416183790930633, + "loss": 0.493, + "step": 77320 + }, + { + "epoch": 1.7211983618233617, + "grad_norm": 0.6415594816207886, + "learning_rate": 0.00015411645793388242, + "loss": 0.501, + "step": 77330 + }, + { + "epoch": 1.7214209401709402, + "grad_norm": 0.6232026815414429, + "learning_rate": 0.0001540710804518077, + "loss": 0.5539, + "step": 77340 + }, + { + "epoch": 1.7216435185185186, + "grad_norm": 0.417968213558197, + "learning_rate": 0.00015402570546554803, + "loss": 0.4794, + "step": 77350 + }, + { + "epoch": 1.7218660968660968, + "grad_norm": 0.6860714554786682, + "learning_rate": 0.00015398033297756925, + "loss": 0.6022, + "step": 77360 + }, + { + "epoch": 1.7220886752136753, + "grad_norm": 0.6967795491218567, + "learning_rate": 0.00015393496299033677, + "loss": 0.674, + "step": 77370 + }, + { + "epoch": 1.7223112535612537, + "grad_norm": 1.5055320262908936, + "learning_rate": 0.00015388959550631612, + "loss": 0.5298, + "step": 77380 + }, + { + "epoch": 1.722533831908832, + "grad_norm": 0.6755040884017944, + "learning_rate": 0.00015384423052797265, + "loss": 0.504, + "step": 77390 + }, + { + "epoch": 1.7227564102564101, + "grad_norm": 0.45495426654815674, + "learning_rate": 0.00015379886805777144, + "loss": 0.5517, + "step": 77400 + }, + { + "epoch": 1.7229789886039886, + "grad_norm": 0.7500666975975037, + "learning_rate": 0.00015375350809817754, + "loss": 0.515, + "step": 77410 + }, + { + "epoch": 1.723201566951567, + "grad_norm": 0.6109678745269775, + "learning_rate": 0.00015370815065165593, + "loss": 0.4122, + "step": 77420 + }, + { + "epoch": 1.7234241452991452, + "grad_norm": 0.7749100923538208, + "learning_rate": 0.00015366279572067126, + "loss": 0.6301, + "step": 77430 + }, + { + "epoch": 1.7236467236467237, + "grad_norm": 0.6647700667381287, + "learning_rate": 0.0001536174433076883, + "loss": 0.6777, + "step": 77440 + }, + { + "epoch": 1.723869301994302, + "grad_norm": 0.4303279519081116, + "learning_rate": 0.00015357209341517134, + "loss": 0.6068, + "step": 77450 + }, + { + "epoch": 1.7240918803418803, + "grad_norm": 0.34944385290145874, + "learning_rate": 0.00015352674604558487, + "loss": 0.5226, + "step": 77460 + }, + { + "epoch": 1.7243144586894585, + "grad_norm": 0.5302987098693848, + "learning_rate": 0.00015348140120139306, + "loss": 0.5667, + "step": 77470 + }, + { + "epoch": 1.7245370370370372, + "grad_norm": 0.5831298828125, + "learning_rate": 0.00015343605888505995, + "loss": 0.4849, + "step": 77480 + }, + { + "epoch": 1.7247596153846154, + "grad_norm": 0.8023634552955627, + "learning_rate": 0.00015339071909904953, + "loss": 0.5295, + "step": 77490 + }, + { + "epoch": 1.7249821937321936, + "grad_norm": 0.6617007255554199, + "learning_rate": 0.00015334538184582565, + "loss": 0.7097, + "step": 77500 + }, + { + "epoch": 1.725204772079772, + "grad_norm": 0.43473944067955017, + "learning_rate": 0.00015330004712785185, + "loss": 0.5037, + "step": 77510 + }, + { + "epoch": 1.7254273504273505, + "grad_norm": 0.3872452676296234, + "learning_rate": 0.0001532547149475917, + "loss": 0.4662, + "step": 77520 + }, + { + "epoch": 1.7256499287749287, + "grad_norm": 0.4788249731063843, + "learning_rate": 0.0001532093853075087, + "loss": 0.5742, + "step": 77530 + }, + { + "epoch": 1.7258725071225072, + "grad_norm": 0.6486528515815735, + "learning_rate": 0.00015316405821006593, + "loss": 0.6181, + "step": 77540 + }, + { + "epoch": 1.7260950854700856, + "grad_norm": 0.5705780386924744, + "learning_rate": 0.0001531187336577266, + "loss": 0.5193, + "step": 77550 + }, + { + "epoch": 1.7263176638176638, + "grad_norm": 0.7377183437347412, + "learning_rate": 0.0001530734116529537, + "loss": 0.5357, + "step": 77560 + }, + { + "epoch": 1.726540242165242, + "grad_norm": 0.4787918031215668, + "learning_rate": 0.00015302809219821007, + "loss": 0.5632, + "step": 77570 + }, + { + "epoch": 1.7267628205128205, + "grad_norm": 0.5801129937171936, + "learning_rate": 0.00015298277529595826, + "loss": 0.4936, + "step": 77580 + }, + { + "epoch": 1.726985398860399, + "grad_norm": 0.5748696327209473, + "learning_rate": 0.00015293746094866096, + "loss": 0.6502, + "step": 77590 + }, + { + "epoch": 1.7272079772079771, + "grad_norm": 0.6041720509529114, + "learning_rate": 0.00015289214915878055, + "loss": 0.5518, + "step": 77600 + }, + { + "epoch": 1.7274305555555556, + "grad_norm": 0.55303955078125, + "learning_rate": 0.00015284683992877933, + "loss": 0.5525, + "step": 77610 + }, + { + "epoch": 1.727653133903134, + "grad_norm": 0.36641934514045715, + "learning_rate": 0.00015280153326111941, + "loss": 0.5056, + "step": 77620 + }, + { + "epoch": 1.7278757122507122, + "grad_norm": 0.6180765628814697, + "learning_rate": 0.0001527562291582628, + "loss": 0.6146, + "step": 77630 + }, + { + "epoch": 1.7280982905982905, + "grad_norm": 0.8334943652153015, + "learning_rate": 0.00015271092762267143, + "loss": 0.5016, + "step": 77640 + }, + { + "epoch": 1.728320868945869, + "grad_norm": 0.3000129163265228, + "learning_rate": 0.0001526656286568069, + "loss": 0.4684, + "step": 77650 + }, + { + "epoch": 1.7285434472934473, + "grad_norm": 0.6617578864097595, + "learning_rate": 0.0001526203322631309, + "loss": 0.4794, + "step": 77660 + }, + { + "epoch": 1.7287660256410255, + "grad_norm": 0.6590235829353333, + "learning_rate": 0.00015257503844410487, + "loss": 0.4775, + "step": 77670 + }, + { + "epoch": 1.728988603988604, + "grad_norm": 0.6377565264701843, + "learning_rate": 0.00015252974720219, + "loss": 0.6553, + "step": 77680 + }, + { + "epoch": 1.7292111823361824, + "grad_norm": 0.6191003322601318, + "learning_rate": 0.00015248445853984754, + "loss": 0.4988, + "step": 77690 + }, + { + "epoch": 1.7294337606837606, + "grad_norm": 0.408372700214386, + "learning_rate": 0.00015243917245953857, + "loss": 0.4995, + "step": 77700 + }, + { + "epoch": 1.729656339031339, + "grad_norm": 0.5942658185958862, + "learning_rate": 0.00015239388896372388, + "loss": 0.6144, + "step": 77710 + }, + { + "epoch": 1.7298789173789175, + "grad_norm": 0.6212291717529297, + "learning_rate": 0.00015234860805486423, + "loss": 0.6215, + "step": 77720 + }, + { + "epoch": 1.7301014957264957, + "grad_norm": 0.5465625524520874, + "learning_rate": 0.00015230332973542016, + "loss": 0.5146, + "step": 77730 + }, + { + "epoch": 1.730324074074074, + "grad_norm": 0.4212600290775299, + "learning_rate": 0.00015225805400785226, + "loss": 0.5602, + "step": 77740 + }, + { + "epoch": 1.7305466524216524, + "grad_norm": 0.7564339637756348, + "learning_rate": 0.00015221278087462076, + "loss": 0.5131, + "step": 77750 + }, + { + "epoch": 1.7307692307692308, + "grad_norm": 0.6511266231536865, + "learning_rate": 0.0001521675103381859, + "loss": 0.5001, + "step": 77760 + }, + { + "epoch": 1.730991809116809, + "grad_norm": 0.534263014793396, + "learning_rate": 0.00015212224240100764, + "loss": 0.4023, + "step": 77770 + }, + { + "epoch": 1.7312143874643875, + "grad_norm": 0.4892014265060425, + "learning_rate": 0.00015207697706554597, + "loss": 0.5147, + "step": 77780 + }, + { + "epoch": 1.731436965811966, + "grad_norm": 0.437839537858963, + "learning_rate": 0.00015203171433426056, + "loss": 0.4898, + "step": 77790 + }, + { + "epoch": 1.7316595441595442, + "grad_norm": 0.33123520016670227, + "learning_rate": 0.00015198645420961106, + "loss": 0.4448, + "step": 77800 + }, + { + "epoch": 1.7318821225071224, + "grad_norm": 0.4915980398654938, + "learning_rate": 0.00015194119669405698, + "loss": 0.5647, + "step": 77810 + }, + { + "epoch": 1.7321047008547008, + "grad_norm": 0.46801048517227173, + "learning_rate": 0.0001518959417900576, + "loss": 0.4096, + "step": 77820 + }, + { + "epoch": 1.7323272792022792, + "grad_norm": 0.6970308423042297, + "learning_rate": 0.0001518506895000721, + "loss": 0.66, + "step": 77830 + }, + { + "epoch": 1.7325498575498575, + "grad_norm": 0.3197179138660431, + "learning_rate": 0.00015180543982655964, + "loss": 0.4739, + "step": 77840 + }, + { + "epoch": 1.732772435897436, + "grad_norm": 0.4276377856731415, + "learning_rate": 0.000151760192771979, + "loss": 0.4646, + "step": 77850 + }, + { + "epoch": 1.7329950142450143, + "grad_norm": 0.5552869439125061, + "learning_rate": 0.0001517149483387889, + "loss": 0.531, + "step": 77860 + }, + { + "epoch": 1.7332175925925926, + "grad_norm": 0.6214910745620728, + "learning_rate": 0.00015166970652944807, + "loss": 0.5939, + "step": 77870 + }, + { + "epoch": 1.7334401709401708, + "grad_norm": 0.6200249791145325, + "learning_rate": 0.00015162446734641498, + "loss": 0.6064, + "step": 77880 + }, + { + "epoch": 1.7336627492877494, + "grad_norm": 0.5559808611869812, + "learning_rate": 0.0001515792307921479, + "loss": 0.4792, + "step": 77890 + }, + { + "epoch": 1.7338853276353277, + "grad_norm": 0.8645526766777039, + "learning_rate": 0.00015153399686910506, + "loss": 0.4616, + "step": 77900 + }, + { + "epoch": 1.7341079059829059, + "grad_norm": 0.6816758513450623, + "learning_rate": 0.0001514887655797445, + "loss": 0.5304, + "step": 77910 + }, + { + "epoch": 1.7343304843304843, + "grad_norm": 0.5790347456932068, + "learning_rate": 0.00015144353692652415, + "loss": 0.5979, + "step": 77920 + }, + { + "epoch": 1.7345530626780628, + "grad_norm": 0.59865802526474, + "learning_rate": 0.00015139831091190176, + "loss": 0.6378, + "step": 77930 + }, + { + "epoch": 1.734775641025641, + "grad_norm": 0.45446011424064636, + "learning_rate": 0.0001513530875383349, + "loss": 0.5979, + "step": 77940 + }, + { + "epoch": 1.7349982193732194, + "grad_norm": 0.4167000949382782, + "learning_rate": 0.00015130786680828115, + "loss": 0.4618, + "step": 77950 + }, + { + "epoch": 1.7352207977207978, + "grad_norm": 0.3784964084625244, + "learning_rate": 0.00015126264872419772, + "loss": 0.478, + "step": 77960 + }, + { + "epoch": 1.735443376068376, + "grad_norm": 0.4573151469230652, + "learning_rate": 0.00015121743328854197, + "loss": 0.4937, + "step": 77970 + }, + { + "epoch": 1.7356659544159543, + "grad_norm": 0.4052749574184418, + "learning_rate": 0.00015117222050377074, + "loss": 0.646, + "step": 77980 + }, + { + "epoch": 1.7358885327635327, + "grad_norm": 0.48372966051101685, + "learning_rate": 0.00015112701037234104, + "loss": 0.5491, + "step": 77990 + }, + { + "epoch": 1.7361111111111112, + "grad_norm": 0.7810246348381042, + "learning_rate": 0.00015108180289670958, + "loss": 0.451, + "step": 78000 + }, + { + "epoch": 1.7363336894586894, + "grad_norm": 0.6991286873817444, + "learning_rate": 0.000151036598079333, + "loss": 0.479, + "step": 78010 + }, + { + "epoch": 1.7365562678062678, + "grad_norm": 0.47509098052978516, + "learning_rate": 0.0001509913959226678, + "loss": 0.5591, + "step": 78020 + }, + { + "epoch": 1.7367788461538463, + "grad_norm": 0.5491872429847717, + "learning_rate": 0.00015094619642917024, + "loss": 0.4718, + "step": 78030 + }, + { + "epoch": 1.7370014245014245, + "grad_norm": 0.504374623298645, + "learning_rate": 0.0001509009996012965, + "loss": 0.4848, + "step": 78040 + }, + { + "epoch": 1.7372240028490027, + "grad_norm": 0.7922311425209045, + "learning_rate": 0.0001508558054415027, + "loss": 0.6024, + "step": 78050 + }, + { + "epoch": 1.7374465811965814, + "grad_norm": 0.6239007711410522, + "learning_rate": 0.0001508106139522446, + "loss": 0.6778, + "step": 78060 + }, + { + "epoch": 1.7376691595441596, + "grad_norm": 0.7354421615600586, + "learning_rate": 0.000150765425135978, + "loss": 0.6422, + "step": 78070 + }, + { + "epoch": 1.7378917378917378, + "grad_norm": 0.8253692388534546, + "learning_rate": 0.00015072023899515854, + "loss": 0.4777, + "step": 78080 + }, + { + "epoch": 1.7381143162393162, + "grad_norm": 0.5854160189628601, + "learning_rate": 0.00015067505553224164, + "loss": 0.4572, + "step": 78090 + }, + { + "epoch": 1.7383368945868947, + "grad_norm": 0.43329954147338867, + "learning_rate": 0.00015062987474968265, + "loss": 0.4732, + "step": 78100 + }, + { + "epoch": 1.7385594729344729, + "grad_norm": 0.5088170766830444, + "learning_rate": 0.0001505846966499366, + "loss": 0.4901, + "step": 78110 + }, + { + "epoch": 1.7387820512820513, + "grad_norm": 0.5127506852149963, + "learning_rate": 0.0001505395212354586, + "loss": 0.4453, + "step": 78120 + }, + { + "epoch": 1.7390046296296298, + "grad_norm": 0.6326592564582825, + "learning_rate": 0.00015049434850870354, + "loss": 0.5379, + "step": 78130 + }, + { + "epoch": 1.739227207977208, + "grad_norm": 0.616950273513794, + "learning_rate": 0.00015044917847212608, + "loss": 0.5545, + "step": 78140 + }, + { + "epoch": 1.7394497863247862, + "grad_norm": 0.5827544331550598, + "learning_rate": 0.00015040401112818082, + "loss": 0.5862, + "step": 78150 + }, + { + "epoch": 1.7396723646723646, + "grad_norm": 0.5349555611610413, + "learning_rate": 0.00015035884647932222, + "loss": 0.4486, + "step": 78160 + }, + { + "epoch": 1.739894943019943, + "grad_norm": 0.5661860704421997, + "learning_rate": 0.0001503136845280045, + "loss": 0.4761, + "step": 78170 + }, + { + "epoch": 1.7401175213675213, + "grad_norm": 0.4856835901737213, + "learning_rate": 0.00015026852527668186, + "loss": 0.407, + "step": 78180 + }, + { + "epoch": 1.7402065527065527, + "eval_loss": 0.5531619787216187, + "eval_runtime": 337.3034, + "eval_samples_per_second": 7.011, + "eval_steps_per_second": 7.011, + "step": 78184 + }, + { + "epoch": 1.7403400997150997, + "grad_norm": 0.5013774633407593, + "learning_rate": 0.0001502233687278083, + "loss": 0.5465, + "step": 78190 + }, + { + "epoch": 1.7405626780626782, + "grad_norm": 0.6066476702690125, + "learning_rate": 0.00015017821488383758, + "loss": 0.5875, + "step": 78200 + }, + { + "epoch": 1.7407852564102564, + "grad_norm": 0.4210889935493469, + "learning_rate": 0.00015013306374722348, + "loss": 0.3845, + "step": 78210 + }, + { + "epoch": 1.7410078347578346, + "grad_norm": 0.6543701887130737, + "learning_rate": 0.00015008791532041953, + "loss": 0.5046, + "step": 78220 + }, + { + "epoch": 1.7412304131054133, + "grad_norm": 0.6810021996498108, + "learning_rate": 0.00015004276960587913, + "loss": 0.447, + "step": 78230 + }, + { + "epoch": 1.7414529914529915, + "grad_norm": 0.5493144392967224, + "learning_rate": 0.00014999762660605553, + "loss": 0.6239, + "step": 78240 + }, + { + "epoch": 1.7416755698005697, + "grad_norm": 0.4917827546596527, + "learning_rate": 0.0001499524863234018, + "loss": 0.6435, + "step": 78250 + }, + { + "epoch": 1.7418981481481481, + "grad_norm": 0.592804491519928, + "learning_rate": 0.0001499073487603709, + "loss": 0.4306, + "step": 78260 + }, + { + "epoch": 1.7421207264957266, + "grad_norm": 0.43577784299850464, + "learning_rate": 0.00014986221391941575, + "loss": 0.504, + "step": 78270 + }, + { + "epoch": 1.7423433048433048, + "grad_norm": 0.2997153401374817, + "learning_rate": 0.00014981708180298887, + "loss": 0.5224, + "step": 78280 + }, + { + "epoch": 1.7425658831908832, + "grad_norm": 0.3560454845428467, + "learning_rate": 0.0001497719524135429, + "loss": 0.5944, + "step": 78290 + }, + { + "epoch": 1.7427884615384617, + "grad_norm": 0.6725266575813293, + "learning_rate": 0.00014972682575353015, + "loss": 0.5343, + "step": 78300 + }, + { + "epoch": 1.74301103988604, + "grad_norm": 0.5585955381393433, + "learning_rate": 0.0001496817018254028, + "loss": 0.6153, + "step": 78310 + }, + { + "epoch": 1.743233618233618, + "grad_norm": 0.5645360350608826, + "learning_rate": 0.000149636580631613, + "loss": 0.4829, + "step": 78320 + }, + { + "epoch": 1.7434561965811965, + "grad_norm": 0.6441385746002197, + "learning_rate": 0.00014959146217461265, + "loss": 0.5087, + "step": 78330 + }, + { + "epoch": 1.743678774928775, + "grad_norm": 0.5112342834472656, + "learning_rate": 0.0001495463464568535, + "loss": 0.3337, + "step": 78340 + }, + { + "epoch": 1.7439013532763532, + "grad_norm": 0.834675669670105, + "learning_rate": 0.00014950123348078716, + "loss": 0.5751, + "step": 78350 + }, + { + "epoch": 1.7441239316239316, + "grad_norm": 0.6172758936882019, + "learning_rate": 0.00014945612324886523, + "loss": 0.6955, + "step": 78360 + }, + { + "epoch": 1.74434650997151, + "grad_norm": 0.5868250727653503, + "learning_rate": 0.00014941101576353884, + "loss": 0.4277, + "step": 78370 + }, + { + "epoch": 1.7445690883190883, + "grad_norm": 0.7029470205307007, + "learning_rate": 0.00014936591102725932, + "loss": 0.5665, + "step": 78380 + }, + { + "epoch": 1.7447916666666665, + "grad_norm": 0.31776389479637146, + "learning_rate": 0.0001493208090424776, + "loss": 0.496, + "step": 78390 + }, + { + "epoch": 1.7450142450142452, + "grad_norm": 0.584300696849823, + "learning_rate": 0.00014927570981164464, + "loss": 0.5566, + "step": 78400 + }, + { + "epoch": 1.7452368233618234, + "grad_norm": 0.5410353541374207, + "learning_rate": 0.0001492306133372111, + "loss": 0.5756, + "step": 78410 + }, + { + "epoch": 1.7454594017094016, + "grad_norm": 0.5734127759933472, + "learning_rate": 0.0001491855196216276, + "loss": 0.6147, + "step": 78420 + }, + { + "epoch": 1.74568198005698, + "grad_norm": 0.6105397343635559, + "learning_rate": 0.00014914042866734457, + "loss": 0.5736, + "step": 78430 + }, + { + "epoch": 1.7459045584045585, + "grad_norm": 0.5667615532875061, + "learning_rate": 0.0001490953404768123, + "loss": 0.5383, + "step": 78440 + }, + { + "epoch": 1.7461271367521367, + "grad_norm": 0.7542938590049744, + "learning_rate": 0.00014905025505248086, + "loss": 0.5947, + "step": 78450 + }, + { + "epoch": 1.7463497150997151, + "grad_norm": 0.39676132798194885, + "learning_rate": 0.0001490051723968003, + "loss": 0.4752, + "step": 78460 + }, + { + "epoch": 1.7465722934472936, + "grad_norm": 0.6052358150482178, + "learning_rate": 0.00014896009251222044, + "loss": 0.5905, + "step": 78470 + }, + { + "epoch": 1.7467948717948718, + "grad_norm": 0.36085426807403564, + "learning_rate": 0.0001489150154011909, + "loss": 0.589, + "step": 78480 + }, + { + "epoch": 1.74701745014245, + "grad_norm": 0.4671715497970581, + "learning_rate": 0.00014886994106616132, + "loss": 0.5841, + "step": 78490 + }, + { + "epoch": 1.7472400284900285, + "grad_norm": 0.569623589515686, + "learning_rate": 0.0001488248695095809, + "loss": 0.584, + "step": 78500 + }, + { + "epoch": 1.747462606837607, + "grad_norm": 0.5761276483535767, + "learning_rate": 0.00014877980073389898, + "loss": 0.5222, + "step": 78510 + }, + { + "epoch": 1.7476851851851851, + "grad_norm": 0.779426097869873, + "learning_rate": 0.00014873473474156468, + "loss": 0.5492, + "step": 78520 + }, + { + "epoch": 1.7479077635327636, + "grad_norm": 0.7712996006011963, + "learning_rate": 0.0001486896715350268, + "loss": 0.4714, + "step": 78530 + }, + { + "epoch": 1.748130341880342, + "grad_norm": 0.6011484265327454, + "learning_rate": 0.00014864461111673417, + "loss": 0.4811, + "step": 78540 + }, + { + "epoch": 1.7483529202279202, + "grad_norm": 0.7905614376068115, + "learning_rate": 0.00014859955348913548, + "loss": 0.5346, + "step": 78550 + }, + { + "epoch": 1.7485754985754984, + "grad_norm": 0.5179427862167358, + "learning_rate": 0.00014855449865467906, + "loss": 0.5522, + "step": 78560 + }, + { + "epoch": 1.7487980769230769, + "grad_norm": 0.9913924336433411, + "learning_rate": 0.00014850944661581333, + "loss": 0.6609, + "step": 78570 + }, + { + "epoch": 1.7490206552706553, + "grad_norm": 0.7358751893043518, + "learning_rate": 0.0001484643973749864, + "loss": 0.5785, + "step": 78580 + }, + { + "epoch": 1.7492432336182335, + "grad_norm": 0.46974995732307434, + "learning_rate": 0.0001484193509346463, + "loss": 0.432, + "step": 78590 + }, + { + "epoch": 1.749465811965812, + "grad_norm": 0.665675163269043, + "learning_rate": 0.00014837430729724093, + "loss": 0.5735, + "step": 78600 + }, + { + "epoch": 1.7496883903133904, + "grad_norm": 0.44517970085144043, + "learning_rate": 0.00014832926646521797, + "loss": 0.541, + "step": 78610 + }, + { + "epoch": 1.7499109686609686, + "grad_norm": 0.3566705584526062, + "learning_rate": 0.00014828422844102499, + "loss": 0.6136, + "step": 78620 + }, + { + "epoch": 1.750133547008547, + "grad_norm": 0.7967696785926819, + "learning_rate": 0.00014823919322710935, + "loss": 0.6009, + "step": 78630 + }, + { + "epoch": 1.7503561253561255, + "grad_norm": 0.687789797782898, + "learning_rate": 0.0001481941608259183, + "loss": 0.4748, + "step": 78640 + }, + { + "epoch": 1.7505787037037037, + "grad_norm": 0.5504183173179626, + "learning_rate": 0.00014814913123989895, + "loss": 0.424, + "step": 78650 + }, + { + "epoch": 1.750801282051282, + "grad_norm": 0.617791473865509, + "learning_rate": 0.00014810410447149832, + "loss": 0.579, + "step": 78660 + }, + { + "epoch": 1.7510238603988604, + "grad_norm": 0.6422916054725647, + "learning_rate": 0.00014805908052316306, + "loss": 0.5115, + "step": 78670 + }, + { + "epoch": 1.7512464387464388, + "grad_norm": 0.5082630515098572, + "learning_rate": 0.00014801405939733993, + "loss": 0.6194, + "step": 78680 + }, + { + "epoch": 1.751469017094017, + "grad_norm": 0.5182098150253296, + "learning_rate": 0.00014796904109647536, + "loss": 0.5075, + "step": 78690 + }, + { + "epoch": 1.7516915954415955, + "grad_norm": 0.5061643719673157, + "learning_rate": 0.0001479240256230157, + "loss": 0.6505, + "step": 78700 + }, + { + "epoch": 1.751914173789174, + "grad_norm": 0.5079662203788757, + "learning_rate": 0.00014787901297940708, + "loss": 0.4824, + "step": 78710 + }, + { + "epoch": 1.7521367521367521, + "grad_norm": 0.43398627638816833, + "learning_rate": 0.00014783400316809565, + "loss": 0.5377, + "step": 78720 + }, + { + "epoch": 1.7523593304843303, + "grad_norm": 0.5941019058227539, + "learning_rate": 0.00014778899619152707, + "loss": 0.5567, + "step": 78730 + }, + { + "epoch": 1.7525819088319088, + "grad_norm": 0.6169543266296387, + "learning_rate": 0.00014774399205214723, + "loss": 0.5531, + "step": 78740 + }, + { + "epoch": 1.7528044871794872, + "grad_norm": 0.5469086766242981, + "learning_rate": 0.00014769899075240176, + "loss": 0.627, + "step": 78750 + }, + { + "epoch": 1.7530270655270654, + "grad_norm": 0.5565235614776611, + "learning_rate": 0.00014765399229473583, + "loss": 0.488, + "step": 78760 + }, + { + "epoch": 1.7532496438746439, + "grad_norm": 0.4693734049797058, + "learning_rate": 0.00014760899668159481, + "loss": 0.5234, + "step": 78770 + }, + { + "epoch": 1.7534722222222223, + "grad_norm": 0.7753030061721802, + "learning_rate": 0.00014756400391542382, + "loss": 0.6111, + "step": 78780 + }, + { + "epoch": 1.7536948005698005, + "grad_norm": 0.5935645699501038, + "learning_rate": 0.00014751901399866774, + "loss": 0.5248, + "step": 78790 + }, + { + "epoch": 1.7539173789173788, + "grad_norm": 0.6888484358787537, + "learning_rate": 0.0001474740269337715, + "loss": 0.4863, + "step": 78800 + }, + { + "epoch": 1.7541399572649574, + "grad_norm": 0.701227605342865, + "learning_rate": 0.00014742904272317954, + "loss": 0.6862, + "step": 78810 + }, + { + "epoch": 1.7543625356125356, + "grad_norm": 0.3563346564769745, + "learning_rate": 0.00014738406136933648, + "loss": 0.5245, + "step": 78820 + }, + { + "epoch": 1.7545851139601139, + "grad_norm": 0.42991700768470764, + "learning_rate": 0.0001473390828746866, + "loss": 0.4954, + "step": 78830 + }, + { + "epoch": 1.7548076923076923, + "grad_norm": 0.6053158640861511, + "learning_rate": 0.00014729410724167403, + "loss": 0.4704, + "step": 78840 + }, + { + "epoch": 1.7550302706552707, + "grad_norm": 0.4515514671802521, + "learning_rate": 0.00014724913447274282, + "loss": 0.5426, + "step": 78850 + }, + { + "epoch": 1.755252849002849, + "grad_norm": 0.4333280622959137, + "learning_rate": 0.0001472041645703369, + "loss": 0.526, + "step": 78860 + }, + { + "epoch": 1.7554754273504274, + "grad_norm": 0.8473355174064636, + "learning_rate": 0.00014715919753689982, + "loss": 0.6062, + "step": 78870 + }, + { + "epoch": 1.7556980056980058, + "grad_norm": 0.5016404986381531, + "learning_rate": 0.0001471142333748753, + "loss": 0.5792, + "step": 78880 + }, + { + "epoch": 1.755920584045584, + "grad_norm": 0.668297290802002, + "learning_rate": 0.00014706927208670654, + "loss": 0.5197, + "step": 78890 + }, + { + "epoch": 1.7561431623931623, + "grad_norm": 0.6492266654968262, + "learning_rate": 0.00014702431367483694, + "loss": 0.6069, + "step": 78900 + }, + { + "epoch": 1.7563657407407407, + "grad_norm": 0.5919086933135986, + "learning_rate": 0.00014697935814170942, + "loss": 0.47, + "step": 78910 + }, + { + "epoch": 1.7565883190883191, + "grad_norm": 0.7258105874061584, + "learning_rate": 0.000146934405489767, + "loss": 0.6531, + "step": 78920 + }, + { + "epoch": 1.7568108974358974, + "grad_norm": 0.39832577109336853, + "learning_rate": 0.00014688945572145245, + "loss": 0.5133, + "step": 78930 + }, + { + "epoch": 1.7570334757834758, + "grad_norm": 0.6426448225975037, + "learning_rate": 0.00014684450883920838, + "loss": 0.5626, + "step": 78940 + }, + { + "epoch": 1.7572560541310542, + "grad_norm": 0.6589502096176147, + "learning_rate": 0.00014679956484547714, + "loss": 0.5481, + "step": 78950 + }, + { + "epoch": 1.7574786324786325, + "grad_norm": 0.7137744426727295, + "learning_rate": 0.00014675462374270115, + "loss": 0.5171, + "step": 78960 + }, + { + "epoch": 1.7577012108262107, + "grad_norm": 0.4286314845085144, + "learning_rate": 0.00014670968553332249, + "loss": 0.4934, + "step": 78970 + }, + { + "epoch": 1.7579237891737893, + "grad_norm": 0.41813647747039795, + "learning_rate": 0.00014666475021978315, + "loss": 0.5078, + "step": 78980 + }, + { + "epoch": 1.7581463675213675, + "grad_norm": 0.5532824397087097, + "learning_rate": 0.0001466198178045249, + "loss": 0.5654, + "step": 78990 + }, + { + "epoch": 1.7583689458689458, + "grad_norm": 0.6196248531341553, + "learning_rate": 0.0001465748882899895, + "loss": 0.6976, + "step": 79000 + }, + { + "epoch": 1.7585915242165242, + "grad_norm": 0.5199751257896423, + "learning_rate": 0.0001465299616786184, + "loss": 0.5639, + "step": 79010 + }, + { + "epoch": 1.7588141025641026, + "grad_norm": 0.7880491018295288, + "learning_rate": 0.00014648503797285302, + "loss": 0.639, + "step": 79020 + }, + { + "epoch": 1.7590366809116809, + "grad_norm": 0.45788395404815674, + "learning_rate": 0.00014644011717513438, + "loss": 0.4218, + "step": 79030 + }, + { + "epoch": 1.7592592592592593, + "grad_norm": 0.4737023413181305, + "learning_rate": 0.00014639519928790368, + "loss": 0.6488, + "step": 79040 + }, + { + "epoch": 1.7594818376068377, + "grad_norm": 0.5662586688995361, + "learning_rate": 0.0001463502843136017, + "loss": 0.5559, + "step": 79050 + }, + { + "epoch": 1.759704415954416, + "grad_norm": 0.996168315410614, + "learning_rate": 0.00014630537225466918, + "loss": 0.4669, + "step": 79060 + }, + { + "epoch": 1.7599269943019942, + "grad_norm": 0.5135176181793213, + "learning_rate": 0.0001462604631135467, + "loss": 0.5703, + "step": 79070 + }, + { + "epoch": 1.7601495726495726, + "grad_norm": 0.3148840069770813, + "learning_rate": 0.00014621555689267468, + "loss": 0.5434, + "step": 79080 + }, + { + "epoch": 1.760372150997151, + "grad_norm": 0.5665394067764282, + "learning_rate": 0.00014617065359449327, + "loss": 0.5154, + "step": 79090 + }, + { + "epoch": 1.7605947293447293, + "grad_norm": 0.4433667063713074, + "learning_rate": 0.00014612575322144262, + "loss": 0.5047, + "step": 79100 + }, + { + "epoch": 1.7608173076923077, + "grad_norm": 0.692866325378418, + "learning_rate": 0.0001460808557759627, + "loss": 0.625, + "step": 79110 + }, + { + "epoch": 1.7610398860398861, + "grad_norm": 0.6210705041885376, + "learning_rate": 0.00014603596126049314, + "loss": 0.5122, + "step": 79120 + }, + { + "epoch": 1.7612624643874644, + "grad_norm": 0.484747052192688, + "learning_rate": 0.00014599106967747364, + "loss": 0.4343, + "step": 79130 + }, + { + "epoch": 1.7614850427350426, + "grad_norm": 0.46711021661758423, + "learning_rate": 0.0001459461810293437, + "loss": 0.4887, + "step": 79140 + }, + { + "epoch": 1.7617076210826212, + "grad_norm": 0.6077610850334167, + "learning_rate": 0.00014590129531854255, + "loss": 0.6496, + "step": 79150 + }, + { + "epoch": 1.7619301994301995, + "grad_norm": 0.35435062646865845, + "learning_rate": 0.0001458564125475092, + "loss": 0.4541, + "step": 79160 + }, + { + "epoch": 1.7621527777777777, + "grad_norm": 0.8237841129302979, + "learning_rate": 0.00014581153271868276, + "loss": 0.5728, + "step": 79170 + }, + { + "epoch": 1.7623753561253561, + "grad_norm": 0.5024449229240417, + "learning_rate": 0.000145766655834502, + "loss": 0.5759, + "step": 79180 + }, + { + "epoch": 1.7625979344729346, + "grad_norm": 0.4281274974346161, + "learning_rate": 0.00014572178189740554, + "loss": 0.5587, + "step": 79190 + }, + { + "epoch": 1.7628205128205128, + "grad_norm": 0.9404083490371704, + "learning_rate": 0.00014567691090983185, + "loss": 0.5896, + "step": 79200 + }, + { + "epoch": 1.7630430911680912, + "grad_norm": 0.5195593237876892, + "learning_rate": 0.00014563204287421937, + "loss": 0.5516, + "step": 79210 + }, + { + "epoch": 1.7632656695156697, + "grad_norm": 0.6367482542991638, + "learning_rate": 0.00014558717779300612, + "loss": 0.5328, + "step": 79220 + }, + { + "epoch": 1.7634882478632479, + "grad_norm": 0.7262487411499023, + "learning_rate": 0.0001455423156686302, + "loss": 0.548, + "step": 79230 + }, + { + "epoch": 1.763710826210826, + "grad_norm": 0.7013747096061707, + "learning_rate": 0.00014549745650352942, + "loss": 0.5405, + "step": 79240 + }, + { + "epoch": 1.7639334045584045, + "grad_norm": 0.808320164680481, + "learning_rate": 0.00014545260030014148, + "loss": 0.6022, + "step": 79250 + }, + { + "epoch": 1.764155982905983, + "grad_norm": 0.5514160394668579, + "learning_rate": 0.00014540774706090387, + "loss": 0.7491, + "step": 79260 + }, + { + "epoch": 1.7643785612535612, + "grad_norm": 0.40290629863739014, + "learning_rate": 0.00014536289678825402, + "loss": 0.513, + "step": 79270 + }, + { + "epoch": 1.7646011396011396, + "grad_norm": 0.656762421131134, + "learning_rate": 0.00014531804948462912, + "loss": 0.6651, + "step": 79280 + }, + { + "epoch": 1.764823717948718, + "grad_norm": 0.9429783225059509, + "learning_rate": 0.00014527320515246613, + "loss": 0.6955, + "step": 79290 + }, + { + "epoch": 1.7650462962962963, + "grad_norm": 0.5018649697303772, + "learning_rate": 0.00014522836379420197, + "loss": 0.4788, + "step": 79300 + }, + { + "epoch": 1.7652688746438745, + "grad_norm": 0.6236805319786072, + "learning_rate": 0.00014518352541227332, + "loss": 0.5814, + "step": 79310 + }, + { + "epoch": 1.765491452991453, + "grad_norm": 0.40159523487091064, + "learning_rate": 0.00014513869000911685, + "loss": 0.5379, + "step": 79320 + }, + { + "epoch": 1.7657140313390314, + "grad_norm": 0.4981936514377594, + "learning_rate": 0.00014509385758716881, + "loss": 0.4977, + "step": 79330 + }, + { + "epoch": 1.7659366096866096, + "grad_norm": 0.4534938931465149, + "learning_rate": 0.00014504902814886552, + "loss": 0.419, + "step": 79340 + }, + { + "epoch": 1.766159188034188, + "grad_norm": 0.5050660967826843, + "learning_rate": 0.00014500420169664304, + "loss": 0.492, + "step": 79350 + }, + { + "epoch": 1.7663817663817665, + "grad_norm": 0.7194937467575073, + "learning_rate": 0.00014495937823293727, + "loss": 0.5557, + "step": 79360 + }, + { + "epoch": 1.7666043447293447, + "grad_norm": 0.7035837769508362, + "learning_rate": 0.0001449145577601839, + "loss": 0.5342, + "step": 79370 + }, + { + "epoch": 1.7668269230769231, + "grad_norm": 0.7265353202819824, + "learning_rate": 0.00014486974028081855, + "loss": 0.554, + "step": 79380 + }, + { + "epoch": 1.7670495014245016, + "grad_norm": 0.6127051711082458, + "learning_rate": 0.0001448249257972767, + "loss": 0.6079, + "step": 79390 + }, + { + "epoch": 1.7672720797720798, + "grad_norm": 0.5281509160995483, + "learning_rate": 0.00014478011431199353, + "loss": 0.5042, + "step": 79400 + }, + { + "epoch": 1.767494658119658, + "grad_norm": 0.4901590943336487, + "learning_rate": 0.0001447353058274042, + "loss": 0.5553, + "step": 79410 + }, + { + "epoch": 1.7677172364672364, + "grad_norm": 0.3638477921485901, + "learning_rate": 0.00014469050034594352, + "loss": 0.5171, + "step": 79420 + }, + { + "epoch": 1.7679398148148149, + "grad_norm": 0.7034321427345276, + "learning_rate": 0.0001446456978700464, + "loss": 0.5711, + "step": 79430 + }, + { + "epoch": 1.768162393162393, + "grad_norm": 0.7615830302238464, + "learning_rate": 0.0001446008984021473, + "loss": 0.5898, + "step": 79440 + }, + { + "epoch": 1.7683849715099715, + "grad_norm": 0.43075403571128845, + "learning_rate": 0.00014455610194468075, + "loss": 0.5373, + "step": 79450 + }, + { + "epoch": 1.76860754985755, + "grad_norm": 0.552230954170227, + "learning_rate": 0.00014451130850008103, + "loss": 0.6413, + "step": 79460 + }, + { + "epoch": 1.7688301282051282, + "grad_norm": 0.41900429129600525, + "learning_rate": 0.00014446651807078223, + "loss": 0.6078, + "step": 79470 + }, + { + "epoch": 1.7690527065527064, + "grad_norm": 0.5278525948524475, + "learning_rate": 0.00014442173065921823, + "loss": 0.4369, + "step": 79480 + }, + { + "epoch": 1.7692752849002849, + "grad_norm": 0.47469159960746765, + "learning_rate": 0.00014437694626782298, + "loss": 0.5996, + "step": 79490 + }, + { + "epoch": 1.7694978632478633, + "grad_norm": 0.6462084054946899, + "learning_rate": 0.00014433216489902998, + "loss": 0.5545, + "step": 79500 + }, + { + "epoch": 1.7697204415954415, + "grad_norm": 0.5856205224990845, + "learning_rate": 0.00014428738655527265, + "loss": 0.5303, + "step": 79510 + }, + { + "epoch": 1.76994301994302, + "grad_norm": 0.5695719122886658, + "learning_rate": 0.00014424261123898444, + "loss": 0.5023, + "step": 79520 + }, + { + "epoch": 1.7701655982905984, + "grad_norm": 0.651113748550415, + "learning_rate": 0.0001441978389525983, + "loss": 0.5535, + "step": 79530 + }, + { + "epoch": 1.7703881766381766, + "grad_norm": 0.6896957755088806, + "learning_rate": 0.0001441530696985474, + "loss": 0.537, + "step": 79540 + }, + { + "epoch": 1.7706107549857548, + "grad_norm": 0.34784117341041565, + "learning_rate": 0.00014410830347926428, + "loss": 0.4425, + "step": 79550 + }, + { + "epoch": 1.7708333333333335, + "grad_norm": 0.4808754026889801, + "learning_rate": 0.00014406354029718172, + "loss": 0.4823, + "step": 79560 + }, + { + "epoch": 1.7710559116809117, + "grad_norm": 0.49746209383010864, + "learning_rate": 0.00014401878015473223, + "loss": 0.4782, + "step": 79570 + }, + { + "epoch": 1.77127849002849, + "grad_norm": 0.45266780257225037, + "learning_rate": 0.00014397402305434798, + "loss": 0.4927, + "step": 79580 + }, + { + "epoch": 1.7715010683760684, + "grad_norm": 0.5584794878959656, + "learning_rate": 0.00014392926899846123, + "loss": 0.5003, + "step": 79590 + }, + { + "epoch": 1.7717236467236468, + "grad_norm": 0.5668531060218811, + "learning_rate": 0.0001438845179895039, + "loss": 0.6955, + "step": 79600 + }, + { + "epoch": 1.771946225071225, + "grad_norm": 0.9424641728401184, + "learning_rate": 0.0001438397700299078, + "loss": 0.4932, + "step": 79610 + }, + { + "epoch": 1.7721688034188035, + "grad_norm": 0.526470422744751, + "learning_rate": 0.00014379502512210454, + "loss": 0.4848, + "step": 79620 + }, + { + "epoch": 1.772391381766382, + "grad_norm": 0.6422226428985596, + "learning_rate": 0.00014375028326852566, + "loss": 0.6079, + "step": 79630 + }, + { + "epoch": 1.77261396011396, + "grad_norm": 0.6118940711021423, + "learning_rate": 0.00014370554447160243, + "loss": 0.6929, + "step": 79640 + }, + { + "epoch": 1.7728365384615383, + "grad_norm": 0.4747575521469116, + "learning_rate": 0.000143660808733766, + "loss": 0.4412, + "step": 79650 + }, + { + "epoch": 1.7730591168091168, + "grad_norm": 0.5872943997383118, + "learning_rate": 0.00014361607605744734, + "loss": 0.5547, + "step": 79660 + }, + { + "epoch": 1.7732816951566952, + "grad_norm": 0.5565270781517029, + "learning_rate": 0.0001435713464450773, + "loss": 0.5267, + "step": 79670 + }, + { + "epoch": 1.7735042735042734, + "grad_norm": 0.7619370222091675, + "learning_rate": 0.00014352661989908646, + "loss": 0.5872, + "step": 79680 + }, + { + "epoch": 1.7737268518518519, + "grad_norm": 0.8954969048500061, + "learning_rate": 0.00014348189642190534, + "loss": 0.5826, + "step": 79690 + }, + { + "epoch": 1.7739494301994303, + "grad_norm": 0.538362979888916, + "learning_rate": 0.00014343717601596418, + "loss": 0.5234, + "step": 79700 + }, + { + "epoch": 1.7741720085470085, + "grad_norm": 0.7379159331321716, + "learning_rate": 0.00014339245868369324, + "loss": 0.5972, + "step": 79710 + }, + { + "epoch": 1.7743945868945867, + "grad_norm": 0.5484915971755981, + "learning_rate": 0.00014334774442752241, + "loss": 0.5816, + "step": 79720 + }, + { + "epoch": 1.7746171652421654, + "grad_norm": 0.5934858322143555, + "learning_rate": 0.0001433030332498815, + "loss": 0.6715, + "step": 79730 + }, + { + "epoch": 1.7748397435897436, + "grad_norm": 0.6535338163375854, + "learning_rate": 0.00014325832515320024, + "loss": 0.4886, + "step": 79740 + }, + { + "epoch": 1.7750623219373218, + "grad_norm": 0.3943345844745636, + "learning_rate": 0.000143213620139908, + "loss": 0.5029, + "step": 79750 + }, + { + "epoch": 1.7752849002849003, + "grad_norm": 0.3244159519672394, + "learning_rate": 0.0001431689182124341, + "loss": 0.4258, + "step": 79760 + }, + { + "epoch": 1.7755074786324787, + "grad_norm": 0.6063029766082764, + "learning_rate": 0.00014312421937320776, + "loss": 0.4813, + "step": 79770 + }, + { + "epoch": 1.775730056980057, + "grad_norm": 0.5160526633262634, + "learning_rate": 0.0001430795236246579, + "loss": 0.5442, + "step": 79780 + }, + { + "epoch": 1.7759526353276354, + "grad_norm": 0.4923551082611084, + "learning_rate": 0.00014303483096921328, + "loss": 0.6922, + "step": 79790 + }, + { + "epoch": 1.7761752136752138, + "grad_norm": 0.42703601717948914, + "learning_rate": 0.00014299014140930268, + "loss": 0.5636, + "step": 79800 + }, + { + "epoch": 1.776397792022792, + "grad_norm": 0.6603934168815613, + "learning_rate": 0.00014294545494735437, + "loss": 0.5109, + "step": 79810 + }, + { + "epoch": 1.7766203703703702, + "grad_norm": 0.4960465431213379, + "learning_rate": 0.0001429007715857968, + "loss": 0.6324, + "step": 79820 + }, + { + "epoch": 1.7768429487179487, + "grad_norm": 0.4624791741371155, + "learning_rate": 0.00014285609132705802, + "loss": 0.5882, + "step": 79830 + }, + { + "epoch": 1.7770655270655271, + "grad_norm": 0.38224509358406067, + "learning_rate": 0.00014281141417356598, + "loss": 0.5201, + "step": 79840 + }, + { + "epoch": 1.7772881054131053, + "grad_norm": 0.571273684501648, + "learning_rate": 0.00014276674012774857, + "loss": 0.4605, + "step": 79850 + }, + { + "epoch": 1.7775106837606838, + "grad_norm": 0.45023903250694275, + "learning_rate": 0.00014272206919203334, + "loss": 0.4811, + "step": 79860 + }, + { + "epoch": 1.7777332621082622, + "grad_norm": 0.43059608340263367, + "learning_rate": 0.0001426774013688477, + "loss": 0.4993, + "step": 79870 + }, + { + "epoch": 1.7779558404558404, + "grad_norm": 0.844609797000885, + "learning_rate": 0.00014263273666061907, + "loss": 0.5858, + "step": 79880 + }, + { + "epoch": 1.7781784188034186, + "grad_norm": 0.5263960361480713, + "learning_rate": 0.00014258807506977445, + "loss": 0.5184, + "step": 79890 + }, + { + "epoch": 1.7784009971509973, + "grad_norm": 0.6724256873130798, + "learning_rate": 0.00014254341659874085, + "loss": 0.5282, + "step": 79900 + }, + { + "epoch": 1.7786235754985755, + "grad_norm": 0.709722638130188, + "learning_rate": 0.00014249876124994504, + "loss": 0.5087, + "step": 79910 + }, + { + "epoch": 1.7788461538461537, + "grad_norm": 0.599476158618927, + "learning_rate": 0.00014245410902581358, + "loss": 0.6163, + "step": 79920 + }, + { + "epoch": 1.7790687321937322, + "grad_norm": 0.6736618876457214, + "learning_rate": 0.00014240945992877304, + "loss": 0.4809, + "step": 79930 + }, + { + "epoch": 1.7792913105413106, + "grad_norm": 0.7935197353363037, + "learning_rate": 0.0001423648139612495, + "loss": 0.5017, + "step": 79940 + }, + { + "epoch": 1.7795138888888888, + "grad_norm": 0.5399793982505798, + "learning_rate": 0.00014232017112566914, + "loss": 0.5457, + "step": 79950 + }, + { + "epoch": 1.7797364672364673, + "grad_norm": 0.7083317041397095, + "learning_rate": 0.00014227553142445796, + "loss": 0.65, + "step": 79960 + }, + { + "epoch": 1.7799590455840457, + "grad_norm": 0.6122748255729675, + "learning_rate": 0.00014223089486004162, + "loss": 0.5145, + "step": 79970 + }, + { + "epoch": 1.780181623931624, + "grad_norm": 0.4464356601238251, + "learning_rate": 0.00014218626143484573, + "loss": 0.596, + "step": 79980 + }, + { + "epoch": 1.7804042022792022, + "grad_norm": 0.5562244057655334, + "learning_rate": 0.00014214163115129578, + "loss": 0.5743, + "step": 79990 + }, + { + "epoch": 1.7806267806267806, + "grad_norm": 0.6051118969917297, + "learning_rate": 0.0001420970040118169, + "loss": 0.4303, + "step": 80000 + }, + { + "epoch": 1.780849358974359, + "grad_norm": 0.47445961833000183, + "learning_rate": 0.00014205238001883426, + "loss": 0.4666, + "step": 80010 + }, + { + "epoch": 1.7810719373219372, + "grad_norm": 0.6427887678146362, + "learning_rate": 0.00014200775917477273, + "loss": 0.5215, + "step": 80020 + }, + { + "epoch": 1.7812945156695157, + "grad_norm": 0.6066806316375732, + "learning_rate": 0.00014196314148205702, + "loss": 0.5826, + "step": 80030 + }, + { + "epoch": 1.7815170940170941, + "grad_norm": 0.6234745979309082, + "learning_rate": 0.0001419185269431117, + "loss": 0.4631, + "step": 80040 + }, + { + "epoch": 1.7817396723646723, + "grad_norm": 0.6324895024299622, + "learning_rate": 0.00014187391556036124, + "loss": 0.618, + "step": 80050 + }, + { + "epoch": 1.7819622507122506, + "grad_norm": 0.6726653575897217, + "learning_rate": 0.0001418293073362297, + "loss": 0.5663, + "step": 80060 + }, + { + "epoch": 1.7821848290598292, + "grad_norm": 0.5067920088768005, + "learning_rate": 0.00014178470227314133, + "loss": 0.5297, + "step": 80070 + }, + { + "epoch": 1.7824074074074074, + "grad_norm": 0.6199058890342712, + "learning_rate": 0.00014174010037351983, + "loss": 0.569, + "step": 80080 + }, + { + "epoch": 1.7826299857549857, + "grad_norm": 0.4189685881137848, + "learning_rate": 0.00014169550163978896, + "loss": 0.377, + "step": 80090 + }, + { + "epoch": 1.782852564102564, + "grad_norm": 0.3900449275970459, + "learning_rate": 0.0001416509060743723, + "loss": 0.5089, + "step": 80100 + }, + { + "epoch": 1.7830751424501425, + "grad_norm": 0.6637765765190125, + "learning_rate": 0.00014160631367969313, + "loss": 0.5911, + "step": 80110 + }, + { + "epoch": 1.7832977207977208, + "grad_norm": 0.9584378004074097, + "learning_rate": 0.00014156172445817467, + "loss": 0.4981, + "step": 80120 + }, + { + "epoch": 1.7835202991452992, + "grad_norm": 0.7218378186225891, + "learning_rate": 0.00014151713841224, + "loss": 0.5753, + "step": 80130 + }, + { + "epoch": 1.7837428774928776, + "grad_norm": 0.5870119333267212, + "learning_rate": 0.00014147255554431185, + "loss": 0.5396, + "step": 80140 + }, + { + "epoch": 1.7839654558404558, + "grad_norm": 0.4775555431842804, + "learning_rate": 0.00014142797585681293, + "loss": 0.5523, + "step": 80150 + }, + { + "epoch": 1.784188034188034, + "grad_norm": 0.4715177118778229, + "learning_rate": 0.00014138339935216584, + "loss": 0.4691, + "step": 80160 + }, + { + "epoch": 1.7844106125356125, + "grad_norm": 0.4988359808921814, + "learning_rate": 0.00014133882603279273, + "loss": 0.451, + "step": 80170 + }, + { + "epoch": 1.784633190883191, + "grad_norm": 0.5942032933235168, + "learning_rate": 0.00014129425590111584, + "loss": 0.625, + "step": 80180 + }, + { + "epoch": 1.7848557692307692, + "grad_norm": 0.40297701954841614, + "learning_rate": 0.00014124968895955719, + "loss": 0.5377, + "step": 80190 + }, + { + "epoch": 1.7850783475783476, + "grad_norm": 0.5702888369560242, + "learning_rate": 0.0001412051252105386, + "loss": 0.5224, + "step": 80200 + }, + { + "epoch": 1.785300925925926, + "grad_norm": 0.5079876780509949, + "learning_rate": 0.00014116056465648152, + "loss": 0.5006, + "step": 80210 + }, + { + "epoch": 1.7855235042735043, + "grad_norm": 0.8400496244430542, + "learning_rate": 0.00014111600729980756, + "loss": 0.5151, + "step": 80220 + }, + { + "epoch": 1.7857460826210825, + "grad_norm": 0.7045161724090576, + "learning_rate": 0.00014107145314293796, + "loss": 0.4752, + "step": 80230 + }, + { + "epoch": 1.785968660968661, + "grad_norm": 0.58415687084198, + "learning_rate": 0.00014102690218829387, + "loss": 0.4254, + "step": 80240 + }, + { + "epoch": 1.7861912393162394, + "grad_norm": 0.7725820541381836, + "learning_rate": 0.00014098235443829615, + "loss": 0.5807, + "step": 80250 + }, + { + "epoch": 1.7864138176638176, + "grad_norm": 0.5273394584655762, + "learning_rate": 0.0001409378098953656, + "loss": 0.6497, + "step": 80260 + }, + { + "epoch": 1.786636396011396, + "grad_norm": 0.32381510734558105, + "learning_rate": 0.00014089326856192287, + "loss": 0.5428, + "step": 80270 + }, + { + "epoch": 1.7868589743589745, + "grad_norm": 0.45702850818634033, + "learning_rate": 0.00014084873044038825, + "loss": 0.4426, + "step": 80280 + }, + { + "epoch": 1.7870815527065527, + "grad_norm": 0.5660591125488281, + "learning_rate": 0.00014080419553318206, + "loss": 0.4756, + "step": 80290 + }, + { + "epoch": 1.787304131054131, + "grad_norm": 0.4793355166912079, + "learning_rate": 0.00014075966384272437, + "loss": 0.5081, + "step": 80300 + }, + { + "epoch": 1.7875267094017095, + "grad_norm": 0.47678154706954956, + "learning_rate": 0.000140715135371435, + "loss": 0.5122, + "step": 80310 + }, + { + "epoch": 1.7877492877492878, + "grad_norm": 0.6590827703475952, + "learning_rate": 0.0001406706101217337, + "loss": 0.4629, + "step": 80320 + }, + { + "epoch": 1.787971866096866, + "grad_norm": 0.528451144695282, + "learning_rate": 0.00014062608809604013, + "loss": 0.5777, + "step": 80330 + }, + { + "epoch": 1.7881944444444444, + "grad_norm": 0.41150301694869995, + "learning_rate": 0.0001405815692967735, + "loss": 0.5946, + "step": 80340 + }, + { + "epoch": 1.7884170227920229, + "grad_norm": 0.46680572628974915, + "learning_rate": 0.00014053705372635297, + "loss": 0.4817, + "step": 80350 + }, + { + "epoch": 1.788639601139601, + "grad_norm": 0.4094049334526062, + "learning_rate": 0.00014049254138719764, + "loss": 0.4549, + "step": 80360 + }, + { + "epoch": 1.7888621794871795, + "grad_norm": 0.5676484107971191, + "learning_rate": 0.00014044803228172628, + "loss": 0.6292, + "step": 80370 + }, + { + "epoch": 1.789084757834758, + "grad_norm": 0.42871958017349243, + "learning_rate": 0.00014040352641235768, + "loss": 0.4641, + "step": 80380 + }, + { + "epoch": 1.7893073361823362, + "grad_norm": 0.4868099093437195, + "learning_rate": 0.00014035902378151018, + "loss": 0.5832, + "step": 80390 + }, + { + "epoch": 1.7895299145299144, + "grad_norm": 0.6399804353713989, + "learning_rate": 0.00014031452439160216, + "loss": 0.6641, + "step": 80400 + }, + { + "epoch": 1.7897524928774928, + "grad_norm": 0.36850976943969727, + "learning_rate": 0.0001402700282450518, + "loss": 0.5838, + "step": 80410 + }, + { + "epoch": 1.7899750712250713, + "grad_norm": 0.42611685395240784, + "learning_rate": 0.00014022553534427697, + "loss": 0.4689, + "step": 80420 + }, + { + "epoch": 1.7901976495726495, + "grad_norm": 0.5654397010803223, + "learning_rate": 0.00014018104569169543, + "loss": 0.45, + "step": 80430 + }, + { + "epoch": 1.790420227920228, + "grad_norm": 0.5701908469200134, + "learning_rate": 0.00014013655928972493, + "loss": 0.5506, + "step": 80440 + }, + { + "epoch": 1.7906428062678064, + "grad_norm": 0.28367915749549866, + "learning_rate": 0.00014009207614078272, + "loss": 0.592, + "step": 80450 + }, + { + "epoch": 1.7908653846153846, + "grad_norm": 0.5797436237335205, + "learning_rate": 0.00014004759624728624, + "loss": 0.511, + "step": 80460 + }, + { + "epoch": 1.7910879629629628, + "grad_norm": 0.4902498722076416, + "learning_rate": 0.00014000311961165243, + "loss": 0.4024, + "step": 80470 + }, + { + "epoch": 1.7913105413105415, + "grad_norm": 0.5887957215309143, + "learning_rate": 0.0001399586462362982, + "loss": 0.6255, + "step": 80480 + }, + { + "epoch": 1.7915331196581197, + "grad_norm": 0.4187609851360321, + "learning_rate": 0.00013991417612364024, + "loss": 0.425, + "step": 80490 + }, + { + "epoch": 1.791755698005698, + "grad_norm": 0.9703378677368164, + "learning_rate": 0.00013986970927609514, + "loss": 0.6751, + "step": 80500 + }, + { + "epoch": 1.7919782763532763, + "grad_norm": 0.46224939823150635, + "learning_rate": 0.0001398252456960793, + "loss": 0.496, + "step": 80510 + }, + { + "epoch": 1.7922008547008548, + "grad_norm": 0.6721695065498352, + "learning_rate": 0.00013978078538600888, + "loss": 0.5485, + "step": 80520 + }, + { + "epoch": 1.792423433048433, + "grad_norm": 0.5210494995117188, + "learning_rate": 0.00013973632834829984, + "loss": 0.613, + "step": 80530 + }, + { + "epoch": 1.7926460113960114, + "grad_norm": 0.6934546828269958, + "learning_rate": 0.00013969187458536805, + "loss": 0.6063, + "step": 80540 + }, + { + "epoch": 1.7928685897435899, + "grad_norm": 0.7616796493530273, + "learning_rate": 0.0001396474240996292, + "loss": 0.6277, + "step": 80550 + }, + { + "epoch": 1.793091168091168, + "grad_norm": 0.46376466751098633, + "learning_rate": 0.0001396029768934987, + "loss": 0.46, + "step": 80560 + }, + { + "epoch": 1.7933137464387463, + "grad_norm": 0.5162578821182251, + "learning_rate": 0.0001395585329693919, + "loss": 0.5241, + "step": 80570 + }, + { + "epoch": 1.7935363247863247, + "grad_norm": 0.9000981450080872, + "learning_rate": 0.00013951409232972392, + "loss": 0.5347, + "step": 80580 + }, + { + "epoch": 1.7937589031339032, + "grad_norm": 0.6056053042411804, + "learning_rate": 0.0001394696549769097, + "loss": 0.4556, + "step": 80590 + }, + { + "epoch": 1.7939814814814814, + "grad_norm": 0.6429079174995422, + "learning_rate": 0.00013942522091336392, + "loss": 0.5873, + "step": 80600 + }, + { + "epoch": 1.7942040598290598, + "grad_norm": 0.6571381092071533, + "learning_rate": 0.00013938079014150123, + "loss": 0.4782, + "step": 80610 + }, + { + "epoch": 1.7944266381766383, + "grad_norm": 0.3865581154823303, + "learning_rate": 0.00013933636266373606, + "loss": 0.5661, + "step": 80620 + }, + { + "epoch": 1.7946492165242165, + "grad_norm": 0.4678865373134613, + "learning_rate": 0.00013929193848248255, + "loss": 0.4541, + "step": 80630 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 0.4663577973842621, + "learning_rate": 0.00013924751760015482, + "loss": 0.5383, + "step": 80640 + }, + { + "epoch": 1.7950943732193734, + "grad_norm": 0.5332295894622803, + "learning_rate": 0.0001392031000191668, + "loss": 0.5486, + "step": 80650 + }, + { + "epoch": 1.7953169515669516, + "grad_norm": 0.5024805665016174, + "learning_rate": 0.00013915868574193197, + "loss": 0.5637, + "step": 80660 + }, + { + "epoch": 1.7955395299145298, + "grad_norm": 0.5011512637138367, + "learning_rate": 0.00013911427477086402, + "loss": 0.5621, + "step": 80670 + }, + { + "epoch": 1.7957621082621082, + "grad_norm": 0.50482577085495, + "learning_rate": 0.0001390698671083762, + "loss": 0.5181, + "step": 80680 + }, + { + "epoch": 1.7959846866096867, + "grad_norm": 0.4566592276096344, + "learning_rate": 0.00013902546275688173, + "loss": 0.5264, + "step": 80690 + }, + { + "epoch": 1.796207264957265, + "grad_norm": 0.6772483587265015, + "learning_rate": 0.00013898106171879348, + "loss": 0.5465, + "step": 80700 + }, + { + "epoch": 1.7964298433048433, + "grad_norm": 0.6919073462486267, + "learning_rate": 0.0001389366639965243, + "loss": 0.5014, + "step": 80710 + }, + { + "epoch": 1.7966524216524218, + "grad_norm": 1.044856071472168, + "learning_rate": 0.0001388922695924869, + "loss": 0.5896, + "step": 80720 + }, + { + "epoch": 1.796875, + "grad_norm": 0.691770613193512, + "learning_rate": 0.0001388478785090935, + "loss": 0.4374, + "step": 80730 + }, + { + "epoch": 1.7970975783475782, + "grad_norm": 0.6516551375389099, + "learning_rate": 0.00013880349074875642, + "loss": 0.5147, + "step": 80740 + }, + { + "epoch": 1.7973201566951567, + "grad_norm": 0.5089389681816101, + "learning_rate": 0.00013875910631388775, + "loss": 0.4736, + "step": 80750 + }, + { + "epoch": 1.797542735042735, + "grad_norm": 0.5509934425354004, + "learning_rate": 0.00013871472520689943, + "loss": 0.3939, + "step": 80760 + }, + { + "epoch": 1.7977653133903133, + "grad_norm": 0.7316688299179077, + "learning_rate": 0.00013867034743020304, + "loss": 0.5217, + "step": 80770 + }, + { + "epoch": 1.7979878917378918, + "grad_norm": 0.44678255915641785, + "learning_rate": 0.00013862597298621023, + "loss": 0.445, + "step": 80780 + }, + { + "epoch": 1.7982104700854702, + "grad_norm": 0.6435457468032837, + "learning_rate": 0.0001385816018773323, + "loss": 0.5519, + "step": 80790 + }, + { + "epoch": 1.7984330484330484, + "grad_norm": 0.6044391989707947, + "learning_rate": 0.00013853723410598033, + "loss": 0.6251, + "step": 80800 + }, + { + "epoch": 1.7986556267806266, + "grad_norm": 0.6454141139984131, + "learning_rate": 0.00013849286967456542, + "loss": 0.6475, + "step": 80810 + }, + { + "epoch": 1.7988782051282053, + "grad_norm": 0.596731960773468, + "learning_rate": 0.00013844850858549837, + "loss": 0.5181, + "step": 80820 + }, + { + "epoch": 1.7991007834757835, + "grad_norm": 0.5489130616188049, + "learning_rate": 0.0001384041508411897, + "loss": 0.6074, + "step": 80830 + }, + { + "epoch": 1.7993233618233617, + "grad_norm": 0.9071128964424133, + "learning_rate": 0.00013835979644404988, + "loss": 0.4769, + "step": 80840 + }, + { + "epoch": 1.7995459401709402, + "grad_norm": 0.5657101273536682, + "learning_rate": 0.0001383154453964893, + "loss": 0.4991, + "step": 80850 + }, + { + "epoch": 1.7997685185185186, + "grad_norm": 0.6993838548660278, + "learning_rate": 0.0001382710977009178, + "loss": 0.5707, + "step": 80860 + }, + { + "epoch": 1.7999910968660968, + "grad_norm": 0.5687394142150879, + "learning_rate": 0.0001382267533597454, + "loss": 0.4844, + "step": 80870 + }, + { + "epoch": 1.8002136752136753, + "grad_norm": 0.4542628228664398, + "learning_rate": 0.00013818241237538182, + "loss": 0.5811, + "step": 80880 + }, + { + "epoch": 1.8002136752136753, + "eval_loss": 0.5486223101615906, + "eval_runtime": 337.5274, + "eval_samples_per_second": 7.007, + "eval_steps_per_second": 7.007, + "step": 80880 + }, + { + "epoch": 1.8004362535612537, + "grad_norm": 0.5703278183937073, + "learning_rate": 0.0001381380747502365, + "loss": 0.457, + "step": 80890 + }, + { + "epoch": 1.800658831908832, + "grad_norm": 0.3732733428478241, + "learning_rate": 0.00013809374048671892, + "loss": 0.5358, + "step": 80900 + }, + { + "epoch": 1.8008814102564101, + "grad_norm": 0.49211806058883667, + "learning_rate": 0.0001380494095872381, + "loss": 0.5763, + "step": 80910 + }, + { + "epoch": 1.8011039886039886, + "grad_norm": 0.6078943014144897, + "learning_rate": 0.0001380050820542031, + "loss": 0.6499, + "step": 80920 + }, + { + "epoch": 1.801326566951567, + "grad_norm": 0.3904814124107361, + "learning_rate": 0.0001379607578900227, + "loss": 0.5125, + "step": 80930 + }, + { + "epoch": 1.8015491452991452, + "grad_norm": 0.43924203515052795, + "learning_rate": 0.0001379164370971055, + "loss": 0.5847, + "step": 80940 + }, + { + "epoch": 1.8017717236467237, + "grad_norm": 0.5108456015586853, + "learning_rate": 0.0001378721196778599, + "loss": 0.5818, + "step": 80950 + }, + { + "epoch": 1.801994301994302, + "grad_norm": 0.5348002314567566, + "learning_rate": 0.00013782780563469422, + "loss": 0.5172, + "step": 80960 + }, + { + "epoch": 1.8022168803418803, + "grad_norm": 0.6139683127403259, + "learning_rate": 0.00013778349497001646, + "loss": 0.4641, + "step": 80970 + }, + { + "epoch": 1.8024394586894585, + "grad_norm": 0.5597288608551025, + "learning_rate": 0.00013773918768623456, + "loss": 0.4787, + "step": 80980 + }, + { + "epoch": 1.8026620370370372, + "grad_norm": 0.6026448607444763, + "learning_rate": 0.0001376948837857561, + "loss": 0.5414, + "step": 80990 + }, + { + "epoch": 1.8028846153846154, + "grad_norm": 0.9139476418495178, + "learning_rate": 0.00013765058327098873, + "loss": 0.6162, + "step": 81000 + }, + { + "epoch": 1.8031071937321936, + "grad_norm": 0.35381126403808594, + "learning_rate": 0.0001376062861443397, + "loss": 0.4206, + "step": 81010 + }, + { + "epoch": 1.803329772079772, + "grad_norm": 0.5708917379379272, + "learning_rate": 0.0001375619924082161, + "loss": 0.5939, + "step": 81020 + }, + { + "epoch": 1.8035523504273505, + "grad_norm": 0.7910793423652649, + "learning_rate": 0.00013751770206502494, + "loss": 0.396, + "step": 81030 + }, + { + "epoch": 1.8037749287749287, + "grad_norm": 0.4806177020072937, + "learning_rate": 0.00013747341511717305, + "loss": 0.5387, + "step": 81040 + }, + { + "epoch": 1.8039975071225072, + "grad_norm": 0.5476149320602417, + "learning_rate": 0.00013742913156706695, + "loss": 0.5224, + "step": 81050 + }, + { + "epoch": 1.8042200854700856, + "grad_norm": 0.6252416372299194, + "learning_rate": 0.00013738485141711303, + "loss": 0.4524, + "step": 81060 + }, + { + "epoch": 1.8044426638176638, + "grad_norm": 0.6658093333244324, + "learning_rate": 0.0001373405746697176, + "loss": 0.531, + "step": 81070 + }, + { + "epoch": 1.804665242165242, + "grad_norm": 0.5023934245109558, + "learning_rate": 0.0001372963013272866, + "loss": 0.5644, + "step": 81080 + }, + { + "epoch": 1.8048878205128205, + "grad_norm": 0.6625211238861084, + "learning_rate": 0.00013725203139222593, + "loss": 0.6344, + "step": 81090 + }, + { + "epoch": 1.805110398860399, + "grad_norm": 0.3682522475719452, + "learning_rate": 0.0001372077648669413, + "loss": 0.5456, + "step": 81100 + }, + { + "epoch": 1.8053329772079771, + "grad_norm": 0.5537610650062561, + "learning_rate": 0.00013716350175383806, + "loss": 0.5544, + "step": 81110 + }, + { + "epoch": 1.8055555555555556, + "grad_norm": 0.6162594556808472, + "learning_rate": 0.00013711924205532164, + "loss": 0.5939, + "step": 81120 + }, + { + "epoch": 1.805778133903134, + "grad_norm": 0.49779462814331055, + "learning_rate": 0.000137074985773797, + "loss": 0.6856, + "step": 81130 + }, + { + "epoch": 1.8060007122507122, + "grad_norm": 0.8165244460105896, + "learning_rate": 0.00013703073291166916, + "loss": 0.5924, + "step": 81140 + }, + { + "epoch": 1.8062232905982905, + "grad_norm": 0.6566153764724731, + "learning_rate": 0.0001369864834713429, + "loss": 0.5529, + "step": 81150 + }, + { + "epoch": 1.806445868945869, + "grad_norm": 0.6264129281044006, + "learning_rate": 0.00013694223745522267, + "loss": 0.5535, + "step": 81160 + }, + { + "epoch": 1.8066684472934473, + "grad_norm": 0.44103971123695374, + "learning_rate": 0.0001368979948657129, + "loss": 0.4672, + "step": 81170 + }, + { + "epoch": 1.8068910256410255, + "grad_norm": 0.4332234263420105, + "learning_rate": 0.00013685375570521774, + "loss": 0.5054, + "step": 81180 + }, + { + "epoch": 1.807113603988604, + "grad_norm": 0.3906761407852173, + "learning_rate": 0.00013680951997614116, + "loss": 0.4347, + "step": 81190 + }, + { + "epoch": 1.8073361823361824, + "grad_norm": 0.4635500907897949, + "learning_rate": 0.00013676528768088702, + "loss": 0.522, + "step": 81200 + }, + { + "epoch": 1.8075587606837606, + "grad_norm": 0.4586428701877594, + "learning_rate": 0.00013672105882185892, + "loss": 0.4188, + "step": 81210 + }, + { + "epoch": 1.807781339031339, + "grad_norm": 0.8698287010192871, + "learning_rate": 0.00013667683340146025, + "loss": 0.554, + "step": 81220 + }, + { + "epoch": 1.8080039173789175, + "grad_norm": 0.6498838663101196, + "learning_rate": 0.0001366326114220943, + "loss": 0.6529, + "step": 81230 + }, + { + "epoch": 1.8082264957264957, + "grad_norm": 0.5619851350784302, + "learning_rate": 0.00013658839288616415, + "loss": 0.5616, + "step": 81240 + }, + { + "epoch": 1.808449074074074, + "grad_norm": 0.5245595574378967, + "learning_rate": 0.00013654417779607268, + "loss": 0.4498, + "step": 81250 + }, + { + "epoch": 1.8086716524216524, + "grad_norm": 0.5840174555778503, + "learning_rate": 0.0001364999661542225, + "loss": 0.4625, + "step": 81260 + }, + { + "epoch": 1.8088942307692308, + "grad_norm": 0.5450571179389954, + "learning_rate": 0.00013645575796301612, + "loss": 0.5568, + "step": 81270 + }, + { + "epoch": 1.809116809116809, + "grad_norm": 0.46242818236351013, + "learning_rate": 0.00013641155322485586, + "loss": 0.583, + "step": 81280 + }, + { + "epoch": 1.8093393874643875, + "grad_norm": 0.6096475720405579, + "learning_rate": 0.0001363673519421439, + "loss": 0.5224, + "step": 81290 + }, + { + "epoch": 1.809561965811966, + "grad_norm": 0.6651652455329895, + "learning_rate": 0.00013632315411728208, + "loss": 0.5991, + "step": 81300 + }, + { + "epoch": 1.8097845441595442, + "grad_norm": 0.45310983061790466, + "learning_rate": 0.0001362789597526722, + "loss": 0.4822, + "step": 81310 + }, + { + "epoch": 1.8100071225071224, + "grad_norm": 0.5300917625427246, + "learning_rate": 0.00013623476885071586, + "loss": 0.47, + "step": 81320 + }, + { + "epoch": 1.8102297008547008, + "grad_norm": 0.6232476830482483, + "learning_rate": 0.00013619058141381435, + "loss": 0.5301, + "step": 81330 + }, + { + "epoch": 1.8104522792022792, + "grad_norm": 0.8016089200973511, + "learning_rate": 0.00013614639744436883, + "loss": 0.6121, + "step": 81340 + }, + { + "epoch": 1.8106748575498575, + "grad_norm": 0.526396632194519, + "learning_rate": 0.00013610221694478042, + "loss": 0.5645, + "step": 81350 + }, + { + "epoch": 1.810897435897436, + "grad_norm": 0.43031802773475647, + "learning_rate": 0.00013605803991744982, + "loss": 0.6062, + "step": 81360 + }, + { + "epoch": 1.8111200142450143, + "grad_norm": 0.6098934412002563, + "learning_rate": 0.00013601386636477768, + "loss": 0.6972, + "step": 81370 + }, + { + "epoch": 1.8113425925925926, + "grad_norm": 0.6424472332000732, + "learning_rate": 0.00013596969628916448, + "loss": 0.6723, + "step": 81380 + }, + { + "epoch": 1.8115651709401708, + "grad_norm": 0.4741729497909546, + "learning_rate": 0.00013592552969301033, + "loss": 0.5176, + "step": 81390 + }, + { + "epoch": 1.8117877492877494, + "grad_norm": 0.4829888641834259, + "learning_rate": 0.0001358813665787154, + "loss": 0.5935, + "step": 81400 + }, + { + "epoch": 1.8120103276353277, + "grad_norm": 0.5409865379333496, + "learning_rate": 0.00013583720694867942, + "loss": 0.5528, + "step": 81410 + }, + { + "epoch": 1.8122329059829059, + "grad_norm": 0.6868401169776917, + "learning_rate": 0.0001357930508053022, + "loss": 0.5798, + "step": 81420 + }, + { + "epoch": 1.8124554843304843, + "grad_norm": 0.6660972833633423, + "learning_rate": 0.00013574889815098318, + "loss": 0.6272, + "step": 81430 + }, + { + "epoch": 1.8126780626780628, + "grad_norm": 0.5210102200508118, + "learning_rate": 0.00013570474898812158, + "loss": 0.5527, + "step": 81440 + }, + { + "epoch": 1.812900641025641, + "grad_norm": 0.6258218884468079, + "learning_rate": 0.00013566060331911657, + "loss": 0.6019, + "step": 81450 + }, + { + "epoch": 1.8131232193732194, + "grad_norm": 0.5742608904838562, + "learning_rate": 0.00013561646114636705, + "loss": 0.5425, + "step": 81460 + }, + { + "epoch": 1.8133457977207978, + "grad_norm": 0.848927915096283, + "learning_rate": 0.00013557232247227174, + "loss": 0.582, + "step": 81470 + }, + { + "epoch": 1.813568376068376, + "grad_norm": 0.4526084363460541, + "learning_rate": 0.00013552818729922915, + "loss": 0.4612, + "step": 81480 + }, + { + "epoch": 1.8137909544159543, + "grad_norm": 0.8034663796424866, + "learning_rate": 0.00013548405562963768, + "loss": 0.6144, + "step": 81490 + }, + { + "epoch": 1.8140135327635327, + "grad_norm": 0.3783120810985565, + "learning_rate": 0.00013543992746589543, + "loss": 0.4725, + "step": 81500 + }, + { + "epoch": 1.8142361111111112, + "grad_norm": 0.6843919157981873, + "learning_rate": 0.00013539580281040042, + "loss": 0.533, + "step": 81510 + }, + { + "epoch": 1.8144586894586894, + "grad_norm": 0.5809860825538635, + "learning_rate": 0.0001353516816655503, + "loss": 0.4795, + "step": 81520 + }, + { + "epoch": 1.8146812678062678, + "grad_norm": 0.5024617314338684, + "learning_rate": 0.00013530756403374274, + "loss": 0.5093, + "step": 81530 + }, + { + "epoch": 1.8149038461538463, + "grad_norm": 0.7193806171417236, + "learning_rate": 0.00013526344991737513, + "loss": 0.6808, + "step": 81540 + }, + { + "epoch": 1.8151264245014245, + "grad_norm": 0.6356869339942932, + "learning_rate": 0.00013521933931884462, + "loss": 0.5459, + "step": 81550 + }, + { + "epoch": 1.8153490028490027, + "grad_norm": 0.4601297676563263, + "learning_rate": 0.00013517523224054824, + "loss": 0.6181, + "step": 81560 + }, + { + "epoch": 1.8155715811965814, + "grad_norm": 0.42826223373413086, + "learning_rate": 0.00013513112868488285, + "loss": 0.5506, + "step": 81570 + }, + { + "epoch": 1.8157941595441596, + "grad_norm": 0.5442543625831604, + "learning_rate": 0.00013508702865424498, + "loss": 0.5424, + "step": 81580 + }, + { + "epoch": 1.8160167378917378, + "grad_norm": 0.772955060005188, + "learning_rate": 0.00013504293215103113, + "loss": 0.6139, + "step": 81590 + }, + { + "epoch": 1.8162393162393162, + "grad_norm": 0.5679219961166382, + "learning_rate": 0.00013499883917763756, + "loss": 0.5283, + "step": 81600 + }, + { + "epoch": 1.8164618945868947, + "grad_norm": 0.5960325002670288, + "learning_rate": 0.00013495474973646023, + "loss": 0.6502, + "step": 81610 + }, + { + "epoch": 1.8166844729344729, + "grad_norm": 0.376598060131073, + "learning_rate": 0.00013491066382989505, + "loss": 0.5098, + "step": 81620 + }, + { + "epoch": 1.8169070512820513, + "grad_norm": 0.4028322398662567, + "learning_rate": 0.00013486658146033773, + "loss": 0.5193, + "step": 81630 + }, + { + "epoch": 1.8171296296296298, + "grad_norm": 0.4078958034515381, + "learning_rate": 0.00013482250263018372, + "loss": 0.5282, + "step": 81640 + }, + { + "epoch": 1.817352207977208, + "grad_norm": 0.5620450973510742, + "learning_rate": 0.00013477842734182821, + "loss": 0.6942, + "step": 81650 + }, + { + "epoch": 1.8175747863247862, + "grad_norm": 0.6660060882568359, + "learning_rate": 0.00013473435559766634, + "loss": 0.4954, + "step": 81660 + }, + { + "epoch": 1.8177973646723646, + "grad_norm": 0.7021716833114624, + "learning_rate": 0.00013469028740009306, + "loss": 0.634, + "step": 81670 + }, + { + "epoch": 1.818019943019943, + "grad_norm": 0.45811787247657776, + "learning_rate": 0.000134646222751503, + "loss": 0.5591, + "step": 81680 + }, + { + "epoch": 1.8182425213675213, + "grad_norm": 0.8208920955657959, + "learning_rate": 0.00013460216165429067, + "loss": 0.5028, + "step": 81690 + }, + { + "epoch": 1.8184650997150997, + "grad_norm": 0.4805906414985657, + "learning_rate": 0.00013455810411085043, + "loss": 0.5842, + "step": 81700 + }, + { + "epoch": 1.8186876780626782, + "grad_norm": 0.3821577727794647, + "learning_rate": 0.00013451405012357643, + "loss": 0.6637, + "step": 81710 + }, + { + "epoch": 1.8189102564102564, + "grad_norm": 0.64554363489151, + "learning_rate": 0.0001344699996948625, + "loss": 0.4815, + "step": 81720 + }, + { + "epoch": 1.8191328347578346, + "grad_norm": 0.7134438157081604, + "learning_rate": 0.00013442595282710243, + "loss": 0.4481, + "step": 81730 + }, + { + "epoch": 1.8193554131054133, + "grad_norm": 0.6993748545646667, + "learning_rate": 0.00013438190952268984, + "loss": 0.6328, + "step": 81740 + }, + { + "epoch": 1.8195779914529915, + "grad_norm": 0.6261551380157471, + "learning_rate": 0.00013433786978401792, + "loss": 0.5424, + "step": 81750 + }, + { + "epoch": 1.8198005698005697, + "grad_norm": 0.5403935313224792, + "learning_rate": 0.00013429383361347993, + "loss": 0.4991, + "step": 81760 + }, + { + "epoch": 1.8200231481481481, + "grad_norm": 0.45395776629447937, + "learning_rate": 0.0001342498010134689, + "loss": 0.4634, + "step": 81770 + }, + { + "epoch": 1.8202457264957266, + "grad_norm": 0.5860028266906738, + "learning_rate": 0.00013420577198637745, + "loss": 0.5032, + "step": 81780 + }, + { + "epoch": 1.8204683048433048, + "grad_norm": 0.6436797976493835, + "learning_rate": 0.00013416174653459818, + "loss": 0.5357, + "step": 81790 + }, + { + "epoch": 1.8206908831908832, + "grad_norm": 0.7727674245834351, + "learning_rate": 0.0001341177246605235, + "loss": 0.5628, + "step": 81800 + }, + { + "epoch": 1.8209134615384617, + "grad_norm": 0.852611780166626, + "learning_rate": 0.00013407370636654565, + "loss": 0.5068, + "step": 81810 + }, + { + "epoch": 1.82113603988604, + "grad_norm": 0.5791258215904236, + "learning_rate": 0.0001340296916550565, + "loss": 0.4513, + "step": 81820 + }, + { + "epoch": 1.821358618233618, + "grad_norm": 0.6760326027870178, + "learning_rate": 0.00013398568052844792, + "loss": 0.5396, + "step": 81830 + }, + { + "epoch": 1.8215811965811965, + "grad_norm": 0.7038044333457947, + "learning_rate": 0.0001339416729891115, + "loss": 0.6021, + "step": 81840 + }, + { + "epoch": 1.821803774928775, + "grad_norm": 0.35636159777641296, + "learning_rate": 0.00013389766903943871, + "loss": 0.4685, + "step": 81850 + }, + { + "epoch": 1.8220263532763532, + "grad_norm": 0.47771090269088745, + "learning_rate": 0.00013385366868182063, + "loss": 0.687, + "step": 81860 + }, + { + "epoch": 1.8222489316239316, + "grad_norm": 0.6764662265777588, + "learning_rate": 0.00013380967191864836, + "loss": 0.5052, + "step": 81870 + }, + { + "epoch": 1.82247150997151, + "grad_norm": 0.42575374245643616, + "learning_rate": 0.00013376567875231273, + "loss": 0.4958, + "step": 81880 + }, + { + "epoch": 1.8226940883190883, + "grad_norm": 0.423395037651062, + "learning_rate": 0.00013372168918520432, + "loss": 0.4892, + "step": 81890 + }, + { + "epoch": 1.8229166666666665, + "grad_norm": 0.3987009823322296, + "learning_rate": 0.00013367770321971365, + "loss": 0.5223, + "step": 81900 + }, + { + "epoch": 1.8231392450142452, + "grad_norm": 0.736409604549408, + "learning_rate": 0.00013363372085823077, + "loss": 0.473, + "step": 81910 + }, + { + "epoch": 1.8233618233618234, + "grad_norm": 0.522287905216217, + "learning_rate": 0.00013358974210314593, + "loss": 0.5072, + "step": 81920 + }, + { + "epoch": 1.8235844017094016, + "grad_norm": 0.5287613272666931, + "learning_rate": 0.00013354576695684877, + "loss": 0.4859, + "step": 81930 + }, + { + "epoch": 1.82380698005698, + "grad_norm": 0.9009698033332825, + "learning_rate": 0.00013350179542172906, + "loss": 0.5714, + "step": 81940 + }, + { + "epoch": 1.8240295584045585, + "grad_norm": 0.5715592503547668, + "learning_rate": 0.00013345782750017628, + "loss": 0.5878, + "step": 81950 + }, + { + "epoch": 1.8242521367521367, + "grad_norm": 0.6704865097999573, + "learning_rate": 0.00013341386319457957, + "loss": 0.6115, + "step": 81960 + }, + { + "epoch": 1.8244747150997151, + "grad_norm": 0.4145774841308594, + "learning_rate": 0.00013336990250732806, + "loss": 0.429, + "step": 81970 + }, + { + "epoch": 1.8246972934472936, + "grad_norm": 0.5823487043380737, + "learning_rate": 0.00013332594544081058, + "loss": 0.5456, + "step": 81980 + }, + { + "epoch": 1.8249198717948718, + "grad_norm": 0.6845299005508423, + "learning_rate": 0.00013328199199741584, + "loss": 0.4696, + "step": 81990 + }, + { + "epoch": 1.82514245014245, + "grad_norm": 0.4744720160961151, + "learning_rate": 0.00013323804217953223, + "loss": 0.5328, + "step": 82000 + }, + { + "epoch": 1.8253650284900285, + "grad_norm": 0.4606737494468689, + "learning_rate": 0.0001331940959895481, + "loss": 0.3736, + "step": 82010 + }, + { + "epoch": 1.825587606837607, + "grad_norm": 0.28650131821632385, + "learning_rate": 0.00013315015342985146, + "loss": 0.5077, + "step": 82020 + }, + { + "epoch": 1.8258101851851851, + "grad_norm": 0.37976205348968506, + "learning_rate": 0.0001331062145028303, + "loss": 0.5773, + "step": 82030 + }, + { + "epoch": 1.8260327635327636, + "grad_norm": 0.3755210041999817, + "learning_rate": 0.00013306227921087212, + "loss": 0.509, + "step": 82040 + }, + { + "epoch": 1.826255341880342, + "grad_norm": 0.4783906638622284, + "learning_rate": 0.00013301834755636446, + "loss": 0.4079, + "step": 82050 + }, + { + "epoch": 1.8264779202279202, + "grad_norm": 0.7430179119110107, + "learning_rate": 0.00013297441954169468, + "loss": 0.6565, + "step": 82060 + }, + { + "epoch": 1.8267004985754984, + "grad_norm": 0.43394824862480164, + "learning_rate": 0.00013293049516924978, + "loss": 0.6075, + "step": 82070 + }, + { + "epoch": 1.8269230769230769, + "grad_norm": 0.6090715527534485, + "learning_rate": 0.0001328865744414167, + "loss": 0.4517, + "step": 82080 + }, + { + "epoch": 1.8271456552706553, + "grad_norm": 0.7434033751487732, + "learning_rate": 0.0001328426573605821, + "loss": 0.6067, + "step": 82090 + }, + { + "epoch": 1.8273682336182335, + "grad_norm": 0.5030858516693115, + "learning_rate": 0.00013279874392913246, + "loss": 0.4758, + "step": 82100 + }, + { + "epoch": 1.827590811965812, + "grad_norm": 0.47948139905929565, + "learning_rate": 0.00013275483414945408, + "loss": 0.566, + "step": 82110 + }, + { + "epoch": 1.8278133903133904, + "grad_norm": 0.6807217001914978, + "learning_rate": 0.0001327109280239331, + "loss": 0.517, + "step": 82120 + }, + { + "epoch": 1.8280359686609686, + "grad_norm": 0.6474286317825317, + "learning_rate": 0.00013266702555495536, + "loss": 0.5654, + "step": 82130 + }, + { + "epoch": 1.828258547008547, + "grad_norm": 0.6399319767951965, + "learning_rate": 0.00013262312674490657, + "loss": 0.3978, + "step": 82140 + }, + { + "epoch": 1.8284811253561255, + "grad_norm": 0.5145894289016724, + "learning_rate": 0.0001325792315961722, + "loss": 0.5827, + "step": 82150 + }, + { + "epoch": 1.8287037037037037, + "grad_norm": 0.8448268175125122, + "learning_rate": 0.00013253534011113766, + "loss": 0.6026, + "step": 82160 + }, + { + "epoch": 1.828926282051282, + "grad_norm": 0.6144095659255981, + "learning_rate": 0.00013249145229218793, + "loss": 0.4726, + "step": 82170 + }, + { + "epoch": 1.8291488603988604, + "grad_norm": 0.3744725286960602, + "learning_rate": 0.00013244756814170794, + "loss": 0.4769, + "step": 82180 + }, + { + "epoch": 1.8293714387464388, + "grad_norm": 0.4054800570011139, + "learning_rate": 0.00013240368766208235, + "loss": 0.4378, + "step": 82190 + }, + { + "epoch": 1.829594017094017, + "grad_norm": 0.5873180627822876, + "learning_rate": 0.0001323598108556958, + "loss": 0.624, + "step": 82200 + }, + { + "epoch": 1.8298165954415955, + "grad_norm": 0.3750841021537781, + "learning_rate": 0.00013231593772493244, + "loss": 0.5533, + "step": 82210 + }, + { + "epoch": 1.830039173789174, + "grad_norm": 0.7284649014472961, + "learning_rate": 0.00013227206827217642, + "loss": 0.6229, + "step": 82220 + }, + { + "epoch": 1.8302617521367521, + "grad_norm": 0.43646135926246643, + "learning_rate": 0.0001322282024998117, + "loss": 0.4732, + "step": 82230 + }, + { + "epoch": 1.8304843304843303, + "grad_norm": 0.5836132168769836, + "learning_rate": 0.0001321843404102219, + "loss": 0.5659, + "step": 82240 + }, + { + "epoch": 1.8307069088319088, + "grad_norm": 0.5374044179916382, + "learning_rate": 0.00013214048200579058, + "loss": 0.4631, + "step": 82250 + }, + { + "epoch": 1.8309294871794872, + "grad_norm": 0.84559166431427, + "learning_rate": 0.00013209662728890103, + "loss": 0.6409, + "step": 82260 + }, + { + "epoch": 1.8311520655270654, + "grad_norm": 0.5360352993011475, + "learning_rate": 0.0001320527762619363, + "loss": 0.6815, + "step": 82270 + }, + { + "epoch": 1.8313746438746439, + "grad_norm": 0.5297994613647461, + "learning_rate": 0.00013200892892727936, + "loss": 0.5423, + "step": 82280 + }, + { + "epoch": 1.8315972222222223, + "grad_norm": 0.6134427189826965, + "learning_rate": 0.00013196508528731286, + "loss": 0.5763, + "step": 82290 + }, + { + "epoch": 1.8318198005698005, + "grad_norm": 0.5154188871383667, + "learning_rate": 0.0001319212453444194, + "loss": 0.5902, + "step": 82300 + }, + { + "epoch": 1.8320423789173788, + "grad_norm": 0.47101709246635437, + "learning_rate": 0.00013187740910098117, + "loss": 0.5594, + "step": 82310 + }, + { + "epoch": 1.8322649572649574, + "grad_norm": 0.6749807000160217, + "learning_rate": 0.00013183357655938025, + "loss": 0.5638, + "step": 82320 + }, + { + "epoch": 1.8324875356125356, + "grad_norm": 0.6740586757659912, + "learning_rate": 0.00013178974772199862, + "loss": 0.5373, + "step": 82330 + }, + { + "epoch": 1.8327101139601139, + "grad_norm": 0.47263529896736145, + "learning_rate": 0.00013174592259121792, + "loss": 0.6147, + "step": 82340 + }, + { + "epoch": 1.8329326923076923, + "grad_norm": 0.5166828632354736, + "learning_rate": 0.00013170210116941966, + "loss": 0.5655, + "step": 82350 + }, + { + "epoch": 1.8331552706552707, + "grad_norm": 0.6054937243461609, + "learning_rate": 0.0001316582834589851, + "loss": 0.5334, + "step": 82360 + }, + { + "epoch": 1.833377849002849, + "grad_norm": 0.7472933530807495, + "learning_rate": 0.00013161446946229543, + "loss": 0.4991, + "step": 82370 + }, + { + "epoch": 1.8336004273504274, + "grad_norm": 0.5342981219291687, + "learning_rate": 0.00013157065918173143, + "loss": 0.4818, + "step": 82380 + }, + { + "epoch": 1.8338230056980058, + "grad_norm": 0.42199358344078064, + "learning_rate": 0.00013152685261967384, + "loss": 0.4548, + "step": 82390 + }, + { + "epoch": 1.834045584045584, + "grad_norm": 0.9192740321159363, + "learning_rate": 0.00013148304977850315, + "loss": 0.4891, + "step": 82400 + }, + { + "epoch": 1.8342681623931623, + "grad_norm": 0.6426318287849426, + "learning_rate": 0.0001314392506605996, + "loss": 0.4976, + "step": 82410 + }, + { + "epoch": 1.8344907407407407, + "grad_norm": 0.8605575561523438, + "learning_rate": 0.00013139545526834329, + "loss": 0.4526, + "step": 82420 + }, + { + "epoch": 1.8347133190883191, + "grad_norm": 0.3284328579902649, + "learning_rate": 0.0001313516636041142, + "loss": 0.481, + "step": 82430 + }, + { + "epoch": 1.8349358974358974, + "grad_norm": 0.8184496760368347, + "learning_rate": 0.0001313078756702918, + "loss": 0.5942, + "step": 82440 + }, + { + "epoch": 1.8351584757834758, + "grad_norm": 0.725690484046936, + "learning_rate": 0.00013126409146925572, + "loss": 0.5737, + "step": 82450 + }, + { + "epoch": 1.8353810541310542, + "grad_norm": 0.9030028581619263, + "learning_rate": 0.00013122031100338513, + "loss": 0.5774, + "step": 82460 + }, + { + "epoch": 1.8356036324786325, + "grad_norm": 0.635688304901123, + "learning_rate": 0.0001311765342750592, + "loss": 0.5737, + "step": 82470 + }, + { + "epoch": 1.8358262108262107, + "grad_norm": 0.5312469601631165, + "learning_rate": 0.0001311327612866567, + "loss": 0.5476, + "step": 82480 + }, + { + "epoch": 1.8360487891737893, + "grad_norm": 0.6173189878463745, + "learning_rate": 0.00013108899204055633, + "loss": 0.5478, + "step": 82490 + }, + { + "epoch": 1.8362713675213675, + "grad_norm": 0.44116851687431335, + "learning_rate": 0.00013104522653913656, + "loss": 0.4731, + "step": 82500 + }, + { + "epoch": 1.8364939458689458, + "grad_norm": 0.6289439797401428, + "learning_rate": 0.0001310014647847756, + "loss": 0.5242, + "step": 82510 + }, + { + "epoch": 1.8367165242165242, + "grad_norm": 0.7131869792938232, + "learning_rate": 0.00013095770677985156, + "loss": 0.5566, + "step": 82520 + }, + { + "epoch": 1.8369391025641026, + "grad_norm": 0.8671508431434631, + "learning_rate": 0.00013091395252674227, + "loss": 0.6793, + "step": 82530 + }, + { + "epoch": 1.8371616809116809, + "grad_norm": 0.45645245909690857, + "learning_rate": 0.00013087020202782532, + "loss": 0.5947, + "step": 82540 + }, + { + "epoch": 1.8373842592592593, + "grad_norm": 0.6090142726898193, + "learning_rate": 0.0001308264552854782, + "loss": 0.485, + "step": 82550 + }, + { + "epoch": 1.8376068376068377, + "grad_norm": 0.666921079158783, + "learning_rate": 0.00013078271230207818, + "loss": 0.5573, + "step": 82560 + }, + { + "epoch": 1.837829415954416, + "grad_norm": 0.5578299164772034, + "learning_rate": 0.00013073897308000216, + "loss": 0.4586, + "step": 82570 + }, + { + "epoch": 1.8380519943019942, + "grad_norm": 0.48516398668289185, + "learning_rate": 0.00013069523762162705, + "loss": 0.5203, + "step": 82580 + }, + { + "epoch": 1.8382745726495726, + "grad_norm": 0.47454801201820374, + "learning_rate": 0.00013065150592932954, + "loss": 0.6178, + "step": 82590 + }, + { + "epoch": 1.838497150997151, + "grad_norm": 0.5279563069343567, + "learning_rate": 0.00013060777800548586, + "loss": 0.4358, + "step": 82600 + }, + { + "epoch": 1.8387197293447293, + "grad_norm": 0.554388165473938, + "learning_rate": 0.0001305640538524724, + "loss": 0.471, + "step": 82610 + }, + { + "epoch": 1.8389423076923077, + "grad_norm": 0.7549745440483093, + "learning_rate": 0.00013052033347266508, + "loss": 0.4871, + "step": 82620 + }, + { + "epoch": 1.8391648860398861, + "grad_norm": 0.5078638195991516, + "learning_rate": 0.0001304766168684397, + "loss": 0.552, + "step": 82630 + }, + { + "epoch": 1.8393874643874644, + "grad_norm": 0.6136375665664673, + "learning_rate": 0.00013043290404217187, + "loss": 0.4959, + "step": 82640 + }, + { + "epoch": 1.8396100427350426, + "grad_norm": 0.8161970376968384, + "learning_rate": 0.00013038919499623703, + "loss": 0.625, + "step": 82650 + }, + { + "epoch": 1.8398326210826212, + "grad_norm": 0.5410851240158081, + "learning_rate": 0.00013034548973301028, + "loss": 0.6344, + "step": 82660 + }, + { + "epoch": 1.8400551994301995, + "grad_norm": 0.6233614087104797, + "learning_rate": 0.00013030178825486664, + "loss": 0.5575, + "step": 82670 + }, + { + "epoch": 1.8402777777777777, + "grad_norm": 0.5429202318191528, + "learning_rate": 0.0001302580905641809, + "loss": 0.6325, + "step": 82680 + }, + { + "epoch": 1.8405003561253561, + "grad_norm": 0.594391405582428, + "learning_rate": 0.00013021439666332762, + "loss": 0.5302, + "step": 82690 + }, + { + "epoch": 1.8407229344729346, + "grad_norm": 0.5456381440162659, + "learning_rate": 0.0001301707065546812, + "loss": 0.6353, + "step": 82700 + }, + { + "epoch": 1.8409455128205128, + "grad_norm": 0.524573802947998, + "learning_rate": 0.00013012702024061564, + "loss": 0.633, + "step": 82710 + }, + { + "epoch": 1.8411680911680912, + "grad_norm": 0.3950157165527344, + "learning_rate": 0.00013008333772350508, + "loss": 0.4898, + "step": 82720 + }, + { + "epoch": 1.8413906695156697, + "grad_norm": 0.6695838570594788, + "learning_rate": 0.00013003965900572318, + "loss": 0.5071, + "step": 82730 + }, + { + "epoch": 1.8416132478632479, + "grad_norm": 0.701891303062439, + "learning_rate": 0.00012999598408964342, + "loss": 0.5664, + "step": 82740 + }, + { + "epoch": 1.841835826210826, + "grad_norm": 0.5936694741249084, + "learning_rate": 0.0001299523129776392, + "loss": 0.5612, + "step": 82750 + }, + { + "epoch": 1.8420584045584045, + "grad_norm": 0.4507628381252289, + "learning_rate": 0.00012990864567208373, + "loss": 0.5959, + "step": 82760 + }, + { + "epoch": 1.842280982905983, + "grad_norm": 0.6318559050559998, + "learning_rate": 0.00012986498217534975, + "loss": 0.4675, + "step": 82770 + }, + { + "epoch": 1.8425035612535612, + "grad_norm": 0.45048779249191284, + "learning_rate": 0.00012982132248981005, + "loss": 0.4773, + "step": 82780 + }, + { + "epoch": 1.8427261396011396, + "grad_norm": 0.41977986693382263, + "learning_rate": 0.00012977766661783718, + "loss": 0.4448, + "step": 82790 + }, + { + "epoch": 1.842948717948718, + "grad_norm": 0.42034912109375, + "learning_rate": 0.00012973401456180337, + "loss": 0.3574, + "step": 82800 + }, + { + "epoch": 1.8431712962962963, + "grad_norm": 0.5503208041191101, + "learning_rate": 0.0001296903663240807, + "loss": 0.6054, + "step": 82810 + }, + { + "epoch": 1.8433938746438745, + "grad_norm": 0.5614815354347229, + "learning_rate": 0.0001296467219070412, + "loss": 0.5596, + "step": 82820 + }, + { + "epoch": 1.843616452991453, + "grad_norm": 0.5360153913497925, + "learning_rate": 0.00012960308131305633, + "loss": 0.5477, + "step": 82830 + }, + { + "epoch": 1.8438390313390314, + "grad_norm": 0.5727353096008301, + "learning_rate": 0.0001295594445444977, + "loss": 0.4641, + "step": 82840 + }, + { + "epoch": 1.8440616096866096, + "grad_norm": 0.48841726779937744, + "learning_rate": 0.00012951581160373644, + "loss": 0.5742, + "step": 82850 + }, + { + "epoch": 1.844284188034188, + "grad_norm": 0.6567955017089844, + "learning_rate": 0.00012947218249314372, + "loss": 0.6564, + "step": 82860 + }, + { + "epoch": 1.8445067663817665, + "grad_norm": 0.3863896429538727, + "learning_rate": 0.00012942855721509037, + "loss": 0.3935, + "step": 82870 + }, + { + "epoch": 1.8447293447293447, + "grad_norm": 0.4754053056240082, + "learning_rate": 0.00012938493577194696, + "loss": 0.5936, + "step": 82880 + }, + { + "epoch": 1.8449519230769231, + "grad_norm": 0.6173115968704224, + "learning_rate": 0.00012934131816608394, + "loss": 0.5695, + "step": 82890 + }, + { + "epoch": 1.8451745014245016, + "grad_norm": 0.45684781670570374, + "learning_rate": 0.00012929770439987155, + "loss": 0.4782, + "step": 82900 + }, + { + "epoch": 1.8453970797720798, + "grad_norm": 0.5181066393852234, + "learning_rate": 0.0001292540944756798, + "loss": 0.4371, + "step": 82910 + }, + { + "epoch": 1.845619658119658, + "grad_norm": 0.6220055818557739, + "learning_rate": 0.00012921048839587842, + "loss": 0.7226, + "step": 82920 + }, + { + "epoch": 1.8458422364672364, + "grad_norm": 0.7300344109535217, + "learning_rate": 0.0001291668861628371, + "loss": 0.5493, + "step": 82930 + }, + { + "epoch": 1.8460648148148149, + "grad_norm": 0.3150061070919037, + "learning_rate": 0.00012912328777892515, + "loss": 0.486, + "step": 82940 + }, + { + "epoch": 1.846287393162393, + "grad_norm": 0.5863127708435059, + "learning_rate": 0.00012907969324651183, + "loss": 0.5338, + "step": 82950 + }, + { + "epoch": 1.8465099715099715, + "grad_norm": 0.7054229974746704, + "learning_rate": 0.00012903610256796596, + "loss": 0.4996, + "step": 82960 + }, + { + "epoch": 1.84673254985755, + "grad_norm": 0.6429014205932617, + "learning_rate": 0.00012899251574565642, + "loss": 0.7624, + "step": 82970 + }, + { + "epoch": 1.8469551282051282, + "grad_norm": 0.5570936799049377, + "learning_rate": 0.00012894893278195165, + "loss": 0.5169, + "step": 82980 + }, + { + "epoch": 1.8471777065527064, + "grad_norm": 0.5311518907546997, + "learning_rate": 0.00012890535367922007, + "loss": 0.4844, + "step": 82990 + }, + { + "epoch": 1.8474002849002849, + "grad_norm": 0.4749816954135895, + "learning_rate": 0.00012886177843982974, + "loss": 0.575, + "step": 83000 + }, + { + "epoch": 1.8476228632478633, + "grad_norm": 0.4664008617401123, + "learning_rate": 0.00012881820706614865, + "loss": 0.4627, + "step": 83010 + }, + { + "epoch": 1.8478454415954415, + "grad_norm": 0.5003736615180969, + "learning_rate": 0.00012877463956054443, + "loss": 0.6056, + "step": 83020 + }, + { + "epoch": 1.84806801994302, + "grad_norm": 0.45148807764053345, + "learning_rate": 0.0001287310759253846, + "loss": 0.4996, + "step": 83030 + }, + { + "epoch": 1.8482905982905984, + "grad_norm": 0.39340224862098694, + "learning_rate": 0.0001286875161630365, + "loss": 0.592, + "step": 83040 + }, + { + "epoch": 1.8485131766381766, + "grad_norm": 0.6800395250320435, + "learning_rate": 0.00012864396027586708, + "loss": 0.4621, + "step": 83050 + }, + { + "epoch": 1.8487357549857548, + "grad_norm": 0.4289543628692627, + "learning_rate": 0.00012860040826624328, + "loss": 0.5439, + "step": 83060 + }, + { + "epoch": 1.8489583333333335, + "grad_norm": 0.4997618496417999, + "learning_rate": 0.00012855686013653182, + "loss": 0.5645, + "step": 83070 + }, + { + "epoch": 1.8491809116809117, + "grad_norm": 0.423065721988678, + "learning_rate": 0.00012851331588909908, + "loss": 0.5694, + "step": 83080 + }, + { + "epoch": 1.84940349002849, + "grad_norm": 0.5687001347541809, + "learning_rate": 0.00012846977552631117, + "loss": 0.495, + "step": 83090 + }, + { + "epoch": 1.8496260683760684, + "grad_norm": 0.6185249090194702, + "learning_rate": 0.00012842623905053423, + "loss": 0.5242, + "step": 83100 + }, + { + "epoch": 1.8498486467236468, + "grad_norm": 0.5709801316261292, + "learning_rate": 0.00012838270646413412, + "loss": 0.471, + "step": 83110 + }, + { + "epoch": 1.850071225071225, + "grad_norm": 0.6331045031547546, + "learning_rate": 0.00012833917776947632, + "loss": 0.5447, + "step": 83120 + }, + { + "epoch": 1.8502938034188035, + "grad_norm": 0.5902771949768066, + "learning_rate": 0.00012829565296892625, + "loss": 0.4946, + "step": 83130 + }, + { + "epoch": 1.850516381766382, + "grad_norm": 0.5342350602149963, + "learning_rate": 0.0001282521320648491, + "loss": 0.6436, + "step": 83140 + }, + { + "epoch": 1.85073896011396, + "grad_norm": 0.49461767077445984, + "learning_rate": 0.0001282086150596099, + "loss": 0.5505, + "step": 83150 + }, + { + "epoch": 1.8509615384615383, + "grad_norm": 0.6072582602500916, + "learning_rate": 0.00012816510195557326, + "loss": 0.5282, + "step": 83160 + }, + { + "epoch": 1.8511841168091168, + "grad_norm": 0.6011079549789429, + "learning_rate": 0.0001281215927551038, + "loss": 0.4349, + "step": 83170 + }, + { + "epoch": 1.8514066951566952, + "grad_norm": 0.6158570051193237, + "learning_rate": 0.00012807808746056584, + "loss": 0.4527, + "step": 83180 + }, + { + "epoch": 1.8516292735042734, + "grad_norm": 0.4972705543041229, + "learning_rate": 0.00012803458607432347, + "loss": 0.503, + "step": 83190 + }, + { + "epoch": 1.8518518518518519, + "grad_norm": 0.7067302465438843, + "learning_rate": 0.0001279910885987406, + "loss": 0.5721, + "step": 83200 + }, + { + "epoch": 1.8520744301994303, + "grad_norm": 0.6685699224472046, + "learning_rate": 0.00012794759503618103, + "loss": 0.5895, + "step": 83210 + }, + { + "epoch": 1.8522970085470085, + "grad_norm": 0.4621349275112152, + "learning_rate": 0.00012790410538900807, + "loss": 0.4735, + "step": 83220 + }, + { + "epoch": 1.8525195868945867, + "grad_norm": 0.6665029525756836, + "learning_rate": 0.00012786061965958497, + "loss": 0.5579, + "step": 83230 + }, + { + "epoch": 1.8527421652421654, + "grad_norm": 0.9082955121994019, + "learning_rate": 0.0001278171378502749, + "loss": 0.5431, + "step": 83240 + }, + { + "epoch": 1.8529647435897436, + "grad_norm": 0.7993310689926147, + "learning_rate": 0.0001277736599634407, + "loss": 0.5488, + "step": 83250 + }, + { + "epoch": 1.8531873219373218, + "grad_norm": 0.7538060545921326, + "learning_rate": 0.00012773018600144485, + "loss": 0.511, + "step": 83260 + }, + { + "epoch": 1.8534099002849003, + "grad_norm": 0.6340315341949463, + "learning_rate": 0.00012768671596664988, + "loss": 0.5297, + "step": 83270 + }, + { + "epoch": 1.8536324786324787, + "grad_norm": 0.7901144027709961, + "learning_rate": 0.000127643249861418, + "loss": 0.514, + "step": 83280 + }, + { + "epoch": 1.853855056980057, + "grad_norm": 0.43684589862823486, + "learning_rate": 0.00012759978768811116, + "loss": 0.5556, + "step": 83290 + }, + { + "epoch": 1.8540776353276354, + "grad_norm": 0.6661888360977173, + "learning_rate": 0.0001275563294490911, + "loss": 0.5632, + "step": 83300 + }, + { + "epoch": 1.8543002136752138, + "grad_norm": 0.5754364728927612, + "learning_rate": 0.0001275128751467194, + "loss": 0.5782, + "step": 83310 + }, + { + "epoch": 1.854522792022792, + "grad_norm": 0.6119531989097595, + "learning_rate": 0.00012746942478335745, + "loss": 0.5541, + "step": 83320 + }, + { + "epoch": 1.8547453703703702, + "grad_norm": 0.38876596093177795, + "learning_rate": 0.00012742597836136628, + "loss": 0.5523, + "step": 83330 + }, + { + "epoch": 1.8549679487179487, + "grad_norm": 0.4897285997867584, + "learning_rate": 0.00012738253588310685, + "loss": 0.5975, + "step": 83340 + }, + { + "epoch": 1.8551905270655271, + "grad_norm": 0.6104738116264343, + "learning_rate": 0.00012733909735094, + "loss": 0.5249, + "step": 83350 + }, + { + "epoch": 1.8554131054131053, + "grad_norm": 0.5610323548316956, + "learning_rate": 0.00012729566276722598, + "loss": 0.6574, + "step": 83360 + }, + { + "epoch": 1.8556356837606838, + "grad_norm": 0.24581007659435272, + "learning_rate": 0.00012725223213432514, + "loss": 0.5365, + "step": 83370 + }, + { + "epoch": 1.8558582621082622, + "grad_norm": 0.5664445161819458, + "learning_rate": 0.0001272088054545976, + "loss": 0.4695, + "step": 83380 + }, + { + "epoch": 1.8560808404558404, + "grad_norm": 0.35660311579704285, + "learning_rate": 0.00012716538273040314, + "loss": 0.4945, + "step": 83390 + }, + { + "epoch": 1.8563034188034186, + "grad_norm": 0.7497749924659729, + "learning_rate": 0.0001271219639641014, + "loss": 0.6068, + "step": 83400 + }, + { + "epoch": 1.8565259971509973, + "grad_norm": 0.4711231291294098, + "learning_rate": 0.0001270785491580518, + "loss": 0.5364, + "step": 83410 + }, + { + "epoch": 1.8567485754985755, + "grad_norm": 0.5040110349655151, + "learning_rate": 0.00012703513831461357, + "loss": 0.5249, + "step": 83420 + }, + { + "epoch": 1.8569711538461537, + "grad_norm": 0.5002251267433167, + "learning_rate": 0.0001269917314361456, + "loss": 0.4847, + "step": 83430 + }, + { + "epoch": 1.8571937321937322, + "grad_norm": 0.466820627450943, + "learning_rate": 0.00012694832852500672, + "loss": 0.5891, + "step": 83440 + }, + { + "epoch": 1.8574163105413106, + "grad_norm": 0.7309194207191467, + "learning_rate": 0.0001269049295835555, + "loss": 0.7185, + "step": 83450 + }, + { + "epoch": 1.8576388888888888, + "grad_norm": 0.42788100242614746, + "learning_rate": 0.00012686153461415026, + "loss": 0.6125, + "step": 83460 + }, + { + "epoch": 1.8578614672364673, + "grad_norm": 0.5784748196601868, + "learning_rate": 0.00012681814361914908, + "loss": 0.5353, + "step": 83470 + }, + { + "epoch": 1.8580840455840457, + "grad_norm": 0.82913738489151, + "learning_rate": 0.00012677475660090995, + "loss": 0.5588, + "step": 83480 + }, + { + "epoch": 1.858306623931624, + "grad_norm": 0.7082958221435547, + "learning_rate": 0.0001267313735617904, + "loss": 0.6553, + "step": 83490 + }, + { + "epoch": 1.8585292022792022, + "grad_norm": 0.4604193866252899, + "learning_rate": 0.00012668799450414806, + "loss": 0.4829, + "step": 83500 + }, + { + "epoch": 1.8587517806267806, + "grad_norm": 0.4414191246032715, + "learning_rate": 0.00012664461943034006, + "loss": 0.5907, + "step": 83510 + }, + { + "epoch": 1.858974358974359, + "grad_norm": 0.44887977838516235, + "learning_rate": 0.00012660124834272348, + "loss": 0.5824, + "step": 83520 + }, + { + "epoch": 1.8591969373219372, + "grad_norm": 0.3785727322101593, + "learning_rate": 0.00012655788124365518, + "loss": 0.5708, + "step": 83530 + }, + { + "epoch": 1.8594195156695157, + "grad_norm": 0.5096050500869751, + "learning_rate": 0.00012651451813549172, + "loss": 0.5093, + "step": 83540 + }, + { + "epoch": 1.8596420940170941, + "grad_norm": 0.4787435829639435, + "learning_rate": 0.00012647115902058952, + "loss": 0.4097, + "step": 83550 + }, + { + "epoch": 1.8598646723646723, + "grad_norm": 0.5042093992233276, + "learning_rate": 0.00012642780390130476, + "loss": 0.6072, + "step": 83560 + }, + { + "epoch": 1.8600872507122506, + "grad_norm": 0.6546964049339294, + "learning_rate": 0.00012638445277999332, + "loss": 0.5195, + "step": 83570 + }, + { + "epoch": 1.8602207977207978, + "eval_loss": 0.5436797142028809, + "eval_runtime": 337.4885, + "eval_samples_per_second": 7.008, + "eval_steps_per_second": 7.008, + "step": 83576 + }, + { + "epoch": 1.8603098290598292, + "grad_norm": 0.5635590553283691, + "learning_rate": 0.00012634110565901095, + "loss": 0.6364, + "step": 83580 + }, + { + "epoch": 1.8605324074074074, + "grad_norm": 0.5586998462677002, + "learning_rate": 0.00012629776254071326, + "loss": 0.5486, + "step": 83590 + }, + { + "epoch": 1.8607549857549857, + "grad_norm": 0.5139333009719849, + "learning_rate": 0.0001262544234274555, + "loss": 0.5651, + "step": 83600 + }, + { + "epoch": 1.860977564102564, + "grad_norm": 0.49953770637512207, + "learning_rate": 0.00012621108832159276, + "loss": 0.5109, + "step": 83610 + }, + { + "epoch": 1.8612001424501425, + "grad_norm": 0.5323540568351746, + "learning_rate": 0.00012616775722547986, + "loss": 0.5728, + "step": 83620 + }, + { + "epoch": 1.8614227207977208, + "grad_norm": 0.571060061454773, + "learning_rate": 0.0001261244301414714, + "loss": 0.4647, + "step": 83630 + }, + { + "epoch": 1.8616452991452992, + "grad_norm": 0.585278332233429, + "learning_rate": 0.00012608110707192197, + "loss": 0.5556, + "step": 83640 + }, + { + "epoch": 1.8618678774928776, + "grad_norm": 0.35864341259002686, + "learning_rate": 0.00012603778801918564, + "loss": 0.4576, + "step": 83650 + }, + { + "epoch": 1.8620904558404558, + "grad_norm": 0.6041143536567688, + "learning_rate": 0.00012599447298561648, + "loss": 0.6435, + "step": 83660 + }, + { + "epoch": 1.862313034188034, + "grad_norm": 0.5151081681251526, + "learning_rate": 0.00012595116197356825, + "loss": 0.4269, + "step": 83670 + }, + { + "epoch": 1.8625356125356125, + "grad_norm": 0.515720784664154, + "learning_rate": 0.00012590785498539447, + "loss": 0.5999, + "step": 83680 + }, + { + "epoch": 1.862758190883191, + "grad_norm": 0.4865381717681885, + "learning_rate": 0.00012586455202344852, + "loss": 0.4351, + "step": 83690 + }, + { + "epoch": 1.8629807692307692, + "grad_norm": 0.7341486811637878, + "learning_rate": 0.00012582125309008353, + "loss": 0.5515, + "step": 83700 + }, + { + "epoch": 1.8632033475783476, + "grad_norm": 0.6000564098358154, + "learning_rate": 0.00012577795818765234, + "loss": 0.5063, + "step": 83710 + }, + { + "epoch": 1.863425925925926, + "grad_norm": 0.7252387404441833, + "learning_rate": 0.00012573466731850765, + "loss": 0.5685, + "step": 83720 + }, + { + "epoch": 1.8636485042735043, + "grad_norm": 0.7386507987976074, + "learning_rate": 0.00012569138048500194, + "loss": 0.6376, + "step": 83730 + }, + { + "epoch": 1.8638710826210825, + "grad_norm": 0.5711615681648254, + "learning_rate": 0.00012564809768948754, + "loss": 0.6018, + "step": 83740 + }, + { + "epoch": 1.864093660968661, + "grad_norm": 0.4323315918445587, + "learning_rate": 0.00012560481893431633, + "loss": 0.4106, + "step": 83750 + }, + { + "epoch": 1.8643162393162394, + "grad_norm": 0.5830208659172058, + "learning_rate": 0.00012556154422184012, + "loss": 0.5466, + "step": 83760 + }, + { + "epoch": 1.8645388176638176, + "grad_norm": 0.5899856090545654, + "learning_rate": 0.00012551827355441051, + "loss": 0.5519, + "step": 83770 + }, + { + "epoch": 1.864761396011396, + "grad_norm": 0.4912465512752533, + "learning_rate": 0.00012547500693437893, + "loss": 0.5965, + "step": 83780 + }, + { + "epoch": 1.8649839743589745, + "grad_norm": 0.7850707173347473, + "learning_rate": 0.00012543174436409647, + "loss": 0.5518, + "step": 83790 + }, + { + "epoch": 1.8652065527065527, + "grad_norm": 1.1132444143295288, + "learning_rate": 0.0001253884858459141, + "loss": 0.4395, + "step": 83800 + }, + { + "epoch": 1.865429131054131, + "grad_norm": 0.5983325242996216, + "learning_rate": 0.00012534523138218247, + "loss": 0.5475, + "step": 83810 + }, + { + "epoch": 1.8656517094017095, + "grad_norm": 0.5957872271537781, + "learning_rate": 0.00012530198097525206, + "loss": 0.5988, + "step": 83820 + }, + { + "epoch": 1.8658742877492878, + "grad_norm": 0.4412519335746765, + "learning_rate": 0.00012525873462747316, + "loss": 0.4865, + "step": 83830 + }, + { + "epoch": 1.866096866096866, + "grad_norm": 0.6750221252441406, + "learning_rate": 0.00012521549234119586, + "loss": 0.5586, + "step": 83840 + }, + { + "epoch": 1.8663194444444444, + "grad_norm": 0.5277356505393982, + "learning_rate": 0.00012517225411876984, + "loss": 0.5821, + "step": 83850 + }, + { + "epoch": 1.8665420227920229, + "grad_norm": 0.5814523100852966, + "learning_rate": 0.00012512901996254483, + "loss": 0.5374, + "step": 83860 + }, + { + "epoch": 1.866764601139601, + "grad_norm": 0.4394824802875519, + "learning_rate": 0.00012508578987487025, + "loss": 0.6425, + "step": 83870 + }, + { + "epoch": 1.8669871794871795, + "grad_norm": 0.49788931012153625, + "learning_rate": 0.00012504256385809508, + "loss": 0.5151, + "step": 83880 + }, + { + "epoch": 1.867209757834758, + "grad_norm": 0.5545089840888977, + "learning_rate": 0.0001249993419145684, + "loss": 0.5008, + "step": 83890 + }, + { + "epoch": 1.8674323361823362, + "grad_norm": 0.5493986010551453, + "learning_rate": 0.0001249561240466388, + "loss": 0.5409, + "step": 83900 + }, + { + "epoch": 1.8676549145299144, + "grad_norm": 0.5672900080680847, + "learning_rate": 0.0001249129102566549, + "loss": 0.4545, + "step": 83910 + }, + { + "epoch": 1.8678774928774928, + "grad_norm": 0.41749370098114014, + "learning_rate": 0.00012486970054696497, + "loss": 0.5166, + "step": 83920 + }, + { + "epoch": 1.8681000712250713, + "grad_norm": 0.5397555828094482, + "learning_rate": 0.00012482649491991693, + "loss": 0.5845, + "step": 83930 + }, + { + "epoch": 1.8683226495726495, + "grad_norm": 0.48144885897636414, + "learning_rate": 0.0001247832933778587, + "loss": 0.5117, + "step": 83940 + }, + { + "epoch": 1.868545227920228, + "grad_norm": 0.6149954795837402, + "learning_rate": 0.00012474009592313798, + "loss": 0.6119, + "step": 83950 + }, + { + "epoch": 1.8687678062678064, + "grad_norm": 0.6500161290168762, + "learning_rate": 0.000124696902558102, + "loss": 0.409, + "step": 83960 + }, + { + "epoch": 1.8689903846153846, + "grad_norm": 0.3311348855495453, + "learning_rate": 0.00012465371328509797, + "loss": 0.458, + "step": 83970 + }, + { + "epoch": 1.8692129629629628, + "grad_norm": 0.41198965907096863, + "learning_rate": 0.00012461052810647285, + "loss": 0.4788, + "step": 83980 + }, + { + "epoch": 1.8694355413105415, + "grad_norm": 0.5640448927879333, + "learning_rate": 0.00012456734702457335, + "loss": 0.5883, + "step": 83990 + }, + { + "epoch": 1.8696581196581197, + "grad_norm": 0.4277702569961548, + "learning_rate": 0.00012452417004174603, + "loss": 0.6195, + "step": 84000 + }, + { + "epoch": 1.869880698005698, + "grad_norm": 0.37998971343040466, + "learning_rate": 0.00012448099716033702, + "loss": 0.4813, + "step": 84010 + }, + { + "epoch": 1.8701032763532763, + "grad_norm": 0.8592267632484436, + "learning_rate": 0.00012443782838269244, + "loss": 0.5471, + "step": 84020 + }, + { + "epoch": 1.8703258547008548, + "grad_norm": 0.6263825297355652, + "learning_rate": 0.00012439466371115817, + "loss": 0.3867, + "step": 84030 + }, + { + "epoch": 1.870548433048433, + "grad_norm": 0.7152578234672546, + "learning_rate": 0.00012435150314807973, + "loss": 0.5774, + "step": 84040 + }, + { + "epoch": 1.8707710113960114, + "grad_norm": 0.7144100666046143, + "learning_rate": 0.00012430834669580252, + "loss": 0.6186, + "step": 84050 + }, + { + "epoch": 1.8709935897435899, + "grad_norm": 0.4694978892803192, + "learning_rate": 0.0001242651943566718, + "loss": 0.4516, + "step": 84060 + }, + { + "epoch": 1.871216168091168, + "grad_norm": 0.34822237491607666, + "learning_rate": 0.00012422204613303235, + "loss": 0.5519, + "step": 84070 + }, + { + "epoch": 1.8714387464387463, + "grad_norm": 0.6391999125480652, + "learning_rate": 0.00012417890202722891, + "loss": 0.5739, + "step": 84080 + }, + { + "epoch": 1.8716613247863247, + "grad_norm": 1.0606963634490967, + "learning_rate": 0.0001241357620416061, + "loss": 0.468, + "step": 84090 + }, + { + "epoch": 1.8718839031339032, + "grad_norm": 0.5437816977500916, + "learning_rate": 0.00012409262617850804, + "loss": 0.6617, + "step": 84100 + }, + { + "epoch": 1.8721064814814814, + "grad_norm": 0.5535063147544861, + "learning_rate": 0.0001240494944402788, + "loss": 0.5088, + "step": 84110 + }, + { + "epoch": 1.8723290598290598, + "grad_norm": 0.6068494915962219, + "learning_rate": 0.00012400636682926224, + "loss": 0.6898, + "step": 84120 + }, + { + "epoch": 1.8725516381766383, + "grad_norm": 0.3118143081665039, + "learning_rate": 0.00012396324334780192, + "loss": 0.5705, + "step": 84130 + }, + { + "epoch": 1.8727742165242165, + "grad_norm": 0.7818388342857361, + "learning_rate": 0.00012392012399824122, + "loss": 0.477, + "step": 84140 + }, + { + "epoch": 1.8729967948717947, + "grad_norm": 0.4516705572605133, + "learning_rate": 0.00012387700878292322, + "loss": 0.5883, + "step": 84150 + }, + { + "epoch": 1.8732193732193734, + "grad_norm": 0.7244430184364319, + "learning_rate": 0.00012383389770419085, + "loss": 0.5847, + "step": 84160 + }, + { + "epoch": 1.8734419515669516, + "grad_norm": 0.24914613366127014, + "learning_rate": 0.0001237907907643869, + "loss": 0.4618, + "step": 84170 + }, + { + "epoch": 1.8736645299145298, + "grad_norm": 0.646794855594635, + "learning_rate": 0.00012374768796585372, + "loss": 0.5783, + "step": 84180 + }, + { + "epoch": 1.8738871082621082, + "grad_norm": 0.8252887725830078, + "learning_rate": 0.0001237045893109336, + "loss": 0.5806, + "step": 84190 + }, + { + "epoch": 1.8741096866096867, + "grad_norm": 0.5684128999710083, + "learning_rate": 0.00012366149480196858, + "loss": 0.5365, + "step": 84200 + }, + { + "epoch": 1.874332264957265, + "grad_norm": 0.46364259719848633, + "learning_rate": 0.00012361840444130042, + "loss": 0.4656, + "step": 84210 + }, + { + "epoch": 1.8745548433048433, + "grad_norm": 0.45392075181007385, + "learning_rate": 0.00012357531823127066, + "loss": 0.5358, + "step": 84220 + }, + { + "epoch": 1.8747774216524218, + "grad_norm": 0.7082318067550659, + "learning_rate": 0.00012353223617422073, + "loss": 0.5213, + "step": 84230 + }, + { + "epoch": 1.875, + "grad_norm": 0.5095435976982117, + "learning_rate": 0.00012348915827249167, + "loss": 0.5009, + "step": 84240 + }, + { + "epoch": 1.8752225783475782, + "grad_norm": 0.50666344165802, + "learning_rate": 0.0001234460845284243, + "loss": 0.4844, + "step": 84250 + }, + { + "epoch": 1.8754451566951567, + "grad_norm": 0.5035162568092346, + "learning_rate": 0.00012340301494435954, + "loss": 0.5847, + "step": 84260 + }, + { + "epoch": 1.875667735042735, + "grad_norm": 0.7025172710418701, + "learning_rate": 0.00012335994952263757, + "loss": 0.5392, + "step": 84270 + }, + { + "epoch": 1.8758903133903133, + "grad_norm": 0.5457714200019836, + "learning_rate": 0.00012331688826559863, + "loss": 0.617, + "step": 84280 + }, + { + "epoch": 1.8761128917378918, + "grad_norm": 0.6207228899002075, + "learning_rate": 0.00012327383117558278, + "loss": 0.4245, + "step": 84290 + }, + { + "epoch": 1.8763354700854702, + "grad_norm": 0.7247689962387085, + "learning_rate": 0.00012323077825492974, + "loss": 0.64, + "step": 84300 + }, + { + "epoch": 1.8765580484330484, + "grad_norm": 0.6851694583892822, + "learning_rate": 0.00012318772950597911, + "loss": 0.4342, + "step": 84310 + }, + { + "epoch": 1.8767806267806266, + "grad_norm": 0.5423067212104797, + "learning_rate": 0.0001231446849310701, + "loss": 0.4896, + "step": 84320 + }, + { + "epoch": 1.8770032051282053, + "grad_norm": 0.6308934092521667, + "learning_rate": 0.00012310164453254184, + "loss": 0.6464, + "step": 84330 + }, + { + "epoch": 1.8772257834757835, + "grad_norm": 0.8702993392944336, + "learning_rate": 0.0001230586083127332, + "loss": 0.5344, + "step": 84340 + }, + { + "epoch": 1.8774483618233617, + "grad_norm": 0.5256815552711487, + "learning_rate": 0.0001230155762739827, + "loss": 0.4322, + "step": 84350 + }, + { + "epoch": 1.8776709401709402, + "grad_norm": 0.656808614730835, + "learning_rate": 0.00012297254841862882, + "loss": 0.5554, + "step": 84360 + }, + { + "epoch": 1.8778935185185186, + "grad_norm": 0.38146379590034485, + "learning_rate": 0.0001229295247490098, + "loss": 0.4928, + "step": 84370 + }, + { + "epoch": 1.8781160968660968, + "grad_norm": 0.6034631729125977, + "learning_rate": 0.0001228865052674634, + "loss": 0.5746, + "step": 84380 + }, + { + "epoch": 1.8783386752136753, + "grad_norm": 0.5881955623626709, + "learning_rate": 0.00012284348997632747, + "loss": 0.4958, + "step": 84390 + }, + { + "epoch": 1.8785612535612537, + "grad_norm": 0.5065589547157288, + "learning_rate": 0.0001228004788779395, + "loss": 0.5454, + "step": 84400 + }, + { + "epoch": 1.878783831908832, + "grad_norm": 0.6091739535331726, + "learning_rate": 0.0001227574719746367, + "loss": 0.657, + "step": 84410 + }, + { + "epoch": 1.8790064102564101, + "grad_norm": 0.3164239823818207, + "learning_rate": 0.00012271446926875607, + "loss": 0.4448, + "step": 84420 + }, + { + "epoch": 1.8792289886039886, + "grad_norm": 0.6602497100830078, + "learning_rate": 0.00012267147076263445, + "loss": 0.5011, + "step": 84430 + }, + { + "epoch": 1.879451566951567, + "grad_norm": 0.4034685790538788, + "learning_rate": 0.00012262847645860839, + "loss": 0.5238, + "step": 84440 + }, + { + "epoch": 1.8796741452991452, + "grad_norm": 0.5005523562431335, + "learning_rate": 0.0001225854863590143, + "loss": 0.5152, + "step": 84450 + }, + { + "epoch": 1.8798967236467237, + "grad_norm": 0.7055954933166504, + "learning_rate": 0.00012254250046618822, + "loss": 0.5278, + "step": 84460 + }, + { + "epoch": 1.880119301994302, + "grad_norm": 0.5619649887084961, + "learning_rate": 0.00012249951878246608, + "loss": 0.4446, + "step": 84470 + }, + { + "epoch": 1.8803418803418803, + "grad_norm": 0.657782793045044, + "learning_rate": 0.00012245654131018356, + "loss": 0.4602, + "step": 84480 + }, + { + "epoch": 1.8805644586894585, + "grad_norm": 0.799344539642334, + "learning_rate": 0.00012241356805167602, + "loss": 0.4666, + "step": 84490 + }, + { + "epoch": 1.8807870370370372, + "grad_norm": 0.5603383779525757, + "learning_rate": 0.00012237059900927877, + "loss": 0.4702, + "step": 84500 + }, + { + "epoch": 1.8810096153846154, + "grad_norm": 0.7844343781471252, + "learning_rate": 0.00012232763418532669, + "loss": 0.5809, + "step": 84510 + }, + { + "epoch": 1.8812321937321936, + "grad_norm": 0.5265217423439026, + "learning_rate": 0.00012228467358215453, + "loss": 0.5684, + "step": 84520 + }, + { + "epoch": 1.881454772079772, + "grad_norm": 0.578589916229248, + "learning_rate": 0.00012224171720209693, + "loss": 0.4439, + "step": 84530 + }, + { + "epoch": 1.8816773504273505, + "grad_norm": 0.4853005111217499, + "learning_rate": 0.00012219876504748796, + "loss": 0.5476, + "step": 84540 + }, + { + "epoch": 1.8818999287749287, + "grad_norm": 0.444774329662323, + "learning_rate": 0.00012215581712066186, + "loss": 0.4716, + "step": 84550 + }, + { + "epoch": 1.8821225071225072, + "grad_norm": 0.31034624576568604, + "learning_rate": 0.00012211287342395234, + "loss": 0.5194, + "step": 84560 + }, + { + "epoch": 1.8823450854700856, + "grad_norm": 0.799504816532135, + "learning_rate": 0.00012206993395969304, + "loss": 0.4748, + "step": 84570 + }, + { + "epoch": 1.8825676638176638, + "grad_norm": 0.45250698924064636, + "learning_rate": 0.0001220269987302173, + "loss": 0.6653, + "step": 84580 + }, + { + "epoch": 1.882790242165242, + "grad_norm": 0.5559919476509094, + "learning_rate": 0.00012198406773785835, + "loss": 0.5582, + "step": 84590 + }, + { + "epoch": 1.8830128205128205, + "grad_norm": 0.5852392315864563, + "learning_rate": 0.00012194114098494898, + "loss": 0.5398, + "step": 84600 + }, + { + "epoch": 1.883235398860399, + "grad_norm": 1.573107123374939, + "learning_rate": 0.0001218982184738219, + "loss": 0.5588, + "step": 84610 + }, + { + "epoch": 1.8834579772079771, + "grad_norm": 0.5588731169700623, + "learning_rate": 0.0001218553002068096, + "loss": 0.4883, + "step": 84620 + }, + { + "epoch": 1.8836805555555556, + "grad_norm": 0.5660615563392639, + "learning_rate": 0.00012181238618624422, + "loss": 0.4707, + "step": 84630 + }, + { + "epoch": 1.883903133903134, + "grad_norm": 0.6259063482284546, + "learning_rate": 0.00012176947641445782, + "loss": 0.4833, + "step": 84640 + }, + { + "epoch": 1.8841257122507122, + "grad_norm": 0.735929548740387, + "learning_rate": 0.00012172657089378212, + "loss": 0.4643, + "step": 84650 + }, + { + "epoch": 1.8843482905982905, + "grad_norm": 0.4203285574913025, + "learning_rate": 0.00012168366962654866, + "loss": 0.4736, + "step": 84660 + }, + { + "epoch": 1.884570868945869, + "grad_norm": 0.4741728603839874, + "learning_rate": 0.00012164077261508864, + "loss": 0.4674, + "step": 84670 + }, + { + "epoch": 1.8847934472934473, + "grad_norm": 0.3773883283138275, + "learning_rate": 0.00012159787986173322, + "loss": 0.4748, + "step": 84680 + }, + { + "epoch": 1.8850160256410255, + "grad_norm": 0.4112374484539032, + "learning_rate": 0.00012155499136881318, + "loss": 0.4899, + "step": 84690 + }, + { + "epoch": 1.885238603988604, + "grad_norm": 0.44202110171318054, + "learning_rate": 0.00012151210713865912, + "loss": 0.5883, + "step": 84700 + }, + { + "epoch": 1.8854611823361824, + "grad_norm": 0.3438667356967926, + "learning_rate": 0.00012146922717360138, + "loss": 0.4871, + "step": 84710 + }, + { + "epoch": 1.8856837606837606, + "grad_norm": 0.5432357788085938, + "learning_rate": 0.0001214263514759702, + "loss": 0.4654, + "step": 84720 + }, + { + "epoch": 1.885906339031339, + "grad_norm": 0.5579224824905396, + "learning_rate": 0.00012138348004809535, + "loss": 0.5805, + "step": 84730 + }, + { + "epoch": 1.8861289173789175, + "grad_norm": 0.5524855852127075, + "learning_rate": 0.00012134061289230654, + "loss": 0.4803, + "step": 84740 + }, + { + "epoch": 1.8863514957264957, + "grad_norm": 0.5937925577163696, + "learning_rate": 0.00012129775001093322, + "loss": 0.4385, + "step": 84750 + }, + { + "epoch": 1.886574074074074, + "grad_norm": 0.8141183257102966, + "learning_rate": 0.00012125489140630466, + "loss": 0.4975, + "step": 84760 + }, + { + "epoch": 1.8867966524216524, + "grad_norm": 0.3882113993167877, + "learning_rate": 0.00012121203708074969, + "loss": 0.5738, + "step": 84770 + }, + { + "epoch": 1.8870192307692308, + "grad_norm": 0.5651386380195618, + "learning_rate": 0.00012116918703659714, + "loss": 0.6125, + "step": 84780 + }, + { + "epoch": 1.887241809116809, + "grad_norm": 0.5414236187934875, + "learning_rate": 0.00012112634127617556, + "loss": 0.5745, + "step": 84790 + }, + { + "epoch": 1.8874643874643875, + "grad_norm": 0.6082583069801331, + "learning_rate": 0.00012108349980181317, + "loss": 0.5157, + "step": 84800 + }, + { + "epoch": 1.887686965811966, + "grad_norm": 0.6844538450241089, + "learning_rate": 0.00012104066261583792, + "loss": 0.5765, + "step": 84810 + }, + { + "epoch": 1.8879095441595442, + "grad_norm": 0.7233855128288269, + "learning_rate": 0.00012099782972057773, + "loss": 0.623, + "step": 84820 + }, + { + "epoch": 1.8881321225071224, + "grad_norm": 0.44369903206825256, + "learning_rate": 0.00012095500111836018, + "loss": 0.4765, + "step": 84830 + }, + { + "epoch": 1.8883547008547008, + "grad_norm": 0.5439565181732178, + "learning_rate": 0.00012091217681151254, + "loss": 0.4019, + "step": 84840 + }, + { + "epoch": 1.8885772792022792, + "grad_norm": 0.6787068247795105, + "learning_rate": 0.00012086935680236196, + "loss": 0.6234, + "step": 84850 + }, + { + "epoch": 1.8887998575498575, + "grad_norm": 0.47564569115638733, + "learning_rate": 0.00012082654109323535, + "loss": 0.504, + "step": 84860 + }, + { + "epoch": 1.889022435897436, + "grad_norm": 0.39963382482528687, + "learning_rate": 0.00012078372968645926, + "loss": 0.389, + "step": 84870 + }, + { + "epoch": 1.8892450142450143, + "grad_norm": 0.4792673885822296, + "learning_rate": 0.00012074092258436014, + "loss": 0.5531, + "step": 84880 + }, + { + "epoch": 1.8894675925925926, + "grad_norm": 0.49218714237213135, + "learning_rate": 0.0001206981197892642, + "loss": 0.5529, + "step": 84890 + }, + { + "epoch": 1.8896901709401708, + "grad_norm": 0.7303302884101868, + "learning_rate": 0.00012065532130349737, + "loss": 0.4026, + "step": 84900 + }, + { + "epoch": 1.8899127492877494, + "grad_norm": 0.6758183240890503, + "learning_rate": 0.00012061252712938528, + "loss": 0.5874, + "step": 84910 + }, + { + "epoch": 1.8901353276353277, + "grad_norm": 0.7041637897491455, + "learning_rate": 0.00012056973726925359, + "loss": 0.5742, + "step": 84920 + }, + { + "epoch": 1.8903579059829059, + "grad_norm": 0.7809052467346191, + "learning_rate": 0.00012052695172542727, + "loss": 0.5525, + "step": 84930 + }, + { + "epoch": 1.8905804843304843, + "grad_norm": 0.37585023045539856, + "learning_rate": 0.00012048417050023151, + "loss": 0.5036, + "step": 84940 + }, + { + "epoch": 1.8908030626780628, + "grad_norm": 0.5783548355102539, + "learning_rate": 0.00012044139359599099, + "loss": 0.5024, + "step": 84950 + }, + { + "epoch": 1.891025641025641, + "grad_norm": 0.5071015954017639, + "learning_rate": 0.00012039862101503026, + "loss": 0.4789, + "step": 84960 + }, + { + "epoch": 1.8912482193732194, + "grad_norm": 0.5445882678031921, + "learning_rate": 0.00012035585275967368, + "loss": 0.4893, + "step": 84970 + }, + { + "epoch": 1.8914707977207978, + "grad_norm": 0.3778285086154938, + "learning_rate": 0.00012031308883224519, + "loss": 0.4128, + "step": 84980 + }, + { + "epoch": 1.891693376068376, + "grad_norm": 0.5169997811317444, + "learning_rate": 0.00012027032923506872, + "loss": 0.512, + "step": 84990 + }, + { + "epoch": 1.8919159544159543, + "grad_norm": 0.5017030835151672, + "learning_rate": 0.00012022757397046786, + "loss": 0.4416, + "step": 85000 + }, + { + "epoch": 1.8921385327635327, + "grad_norm": 1.1345303058624268, + "learning_rate": 0.00012018482304076588, + "loss": 0.5782, + "step": 85010 + }, + { + "epoch": 1.8923611111111112, + "grad_norm": 0.4666755497455597, + "learning_rate": 0.00012014207644828596, + "loss": 0.6162, + "step": 85020 + }, + { + "epoch": 1.8925836894586894, + "grad_norm": 0.7333738803863525, + "learning_rate": 0.00012009933419535104, + "loss": 0.5684, + "step": 85030 + }, + { + "epoch": 1.8928062678062678, + "grad_norm": 0.5773189067840576, + "learning_rate": 0.00012005659628428367, + "loss": 0.4725, + "step": 85040 + }, + { + "epoch": 1.8930288461538463, + "grad_norm": 0.6351726651191711, + "learning_rate": 0.00012001386271740637, + "loss": 0.6178, + "step": 85050 + }, + { + "epoch": 1.8932514245014245, + "grad_norm": 0.49017852544784546, + "learning_rate": 0.00011997113349704116, + "loss": 0.5083, + "step": 85060 + }, + { + "epoch": 1.8934740028490027, + "grad_norm": 0.5200414061546326, + "learning_rate": 0.00011992840862551009, + "loss": 0.513, + "step": 85070 + }, + { + "epoch": 1.8936965811965814, + "grad_norm": 0.6539701223373413, + "learning_rate": 0.00011988568810513488, + "loss": 0.4864, + "step": 85080 + }, + { + "epoch": 1.8939191595441596, + "grad_norm": 0.568430483341217, + "learning_rate": 0.00011984297193823692, + "loss": 0.5922, + "step": 85090 + }, + { + "epoch": 1.8941417378917378, + "grad_norm": 0.7377212047576904, + "learning_rate": 0.00011980026012713748, + "loss": 0.5317, + "step": 85100 + }, + { + "epoch": 1.8943643162393162, + "grad_norm": 0.6099929213523865, + "learning_rate": 0.0001197575526741576, + "loss": 0.6454, + "step": 85110 + }, + { + "epoch": 1.8945868945868947, + "grad_norm": 0.5552646517753601, + "learning_rate": 0.00011971484958161796, + "loss": 0.4626, + "step": 85120 + }, + { + "epoch": 1.8948094729344729, + "grad_norm": 0.6018966436386108, + "learning_rate": 0.00011967215085183912, + "loss": 0.501, + "step": 85130 + }, + { + "epoch": 1.8950320512820513, + "grad_norm": 0.5927259922027588, + "learning_rate": 0.00011962945648714141, + "loss": 0.5232, + "step": 85140 + }, + { + "epoch": 1.8952546296296298, + "grad_norm": 0.5345644950866699, + "learning_rate": 0.00011958676648984477, + "loss": 0.6754, + "step": 85150 + }, + { + "epoch": 1.895477207977208, + "grad_norm": 0.47902974486351013, + "learning_rate": 0.00011954408086226908, + "loss": 0.4121, + "step": 85160 + }, + { + "epoch": 1.8956997863247862, + "grad_norm": 0.6269406676292419, + "learning_rate": 0.00011950139960673393, + "loss": 0.5845, + "step": 85170 + }, + { + "epoch": 1.8959223646723646, + "grad_norm": 0.6758217215538025, + "learning_rate": 0.00011945872272555862, + "loss": 0.7172, + "step": 85180 + }, + { + "epoch": 1.896144943019943, + "grad_norm": 0.6879309415817261, + "learning_rate": 0.00011941605022106228, + "loss": 0.6026, + "step": 85190 + }, + { + "epoch": 1.8963675213675213, + "grad_norm": 0.587652325630188, + "learning_rate": 0.00011937338209556368, + "loss": 0.5271, + "step": 85200 + }, + { + "epoch": 1.8965900997150997, + "grad_norm": 0.4308949410915375, + "learning_rate": 0.00011933071835138152, + "loss": 0.6274, + "step": 85210 + }, + { + "epoch": 1.8968126780626782, + "grad_norm": 0.4711028039455414, + "learning_rate": 0.00011928805899083418, + "loss": 0.5202, + "step": 85220 + }, + { + "epoch": 1.8970352564102564, + "grad_norm": 0.5150978565216064, + "learning_rate": 0.00011924540401623976, + "loss": 0.5259, + "step": 85230 + }, + { + "epoch": 1.8972578347578346, + "grad_norm": 0.5412598252296448, + "learning_rate": 0.00011920275342991618, + "loss": 0.553, + "step": 85240 + }, + { + "epoch": 1.8974804131054133, + "grad_norm": 0.5050225257873535, + "learning_rate": 0.00011916010723418116, + "loss": 0.4893, + "step": 85250 + }, + { + "epoch": 1.8977029914529915, + "grad_norm": 0.587312638759613, + "learning_rate": 0.00011911746543135209, + "loss": 0.676, + "step": 85260 + }, + { + "epoch": 1.8979255698005697, + "grad_norm": 0.5557898879051208, + "learning_rate": 0.00011907482802374615, + "loss": 0.5312, + "step": 85270 + }, + { + "epoch": 1.8981481481481481, + "grad_norm": 0.42806851863861084, + "learning_rate": 0.0001190321950136803, + "loss": 0.4447, + "step": 85280 + }, + { + "epoch": 1.8983707264957266, + "grad_norm": 0.5302470922470093, + "learning_rate": 0.00011898956640347125, + "loss": 0.4668, + "step": 85290 + }, + { + "epoch": 1.8985933048433048, + "grad_norm": 0.4515651762485504, + "learning_rate": 0.00011894694219543549, + "loss": 0.5503, + "step": 85300 + }, + { + "epoch": 1.8988158831908832, + "grad_norm": 0.5524975061416626, + "learning_rate": 0.0001189043223918893, + "loss": 0.4934, + "step": 85310 + }, + { + "epoch": 1.8990384615384617, + "grad_norm": 0.6137921214103699, + "learning_rate": 0.00011886170699514855, + "loss": 0.5975, + "step": 85320 + }, + { + "epoch": 1.89926103988604, + "grad_norm": 0.8360822796821594, + "learning_rate": 0.00011881909600752909, + "loss": 0.6624, + "step": 85330 + }, + { + "epoch": 1.899483618233618, + "grad_norm": 0.5582287907600403, + "learning_rate": 0.0001187764894313464, + "loss": 0.553, + "step": 85340 + }, + { + "epoch": 1.8997061965811965, + "grad_norm": 0.6215211153030396, + "learning_rate": 0.00011873388726891575, + "loss": 0.5912, + "step": 85350 + }, + { + "epoch": 1.899928774928775, + "grad_norm": 0.4308690130710602, + "learning_rate": 0.00011869128952255226, + "loss": 0.532, + "step": 85360 + }, + { + "epoch": 1.9001513532763532, + "grad_norm": 0.6136817932128906, + "learning_rate": 0.00011864869619457057, + "loss": 0.7096, + "step": 85370 + }, + { + "epoch": 1.9003739316239316, + "grad_norm": 0.33455830812454224, + "learning_rate": 0.00011860610728728536, + "loss": 0.7005, + "step": 85380 + }, + { + "epoch": 1.90059650997151, + "grad_norm": 0.776676595211029, + "learning_rate": 0.00011856352280301095, + "loss": 0.6113, + "step": 85390 + }, + { + "epoch": 1.9008190883190883, + "grad_norm": 0.570492684841156, + "learning_rate": 0.00011852094274406133, + "loss": 0.4769, + "step": 85400 + }, + { + "epoch": 1.9010416666666665, + "grad_norm": 0.3798914849758148, + "learning_rate": 0.00011847836711275038, + "loss": 0.4396, + "step": 85410 + }, + { + "epoch": 1.9012642450142452, + "grad_norm": 0.43583646416664124, + "learning_rate": 0.00011843579591139175, + "loss": 0.63, + "step": 85420 + }, + { + "epoch": 1.9014868233618234, + "grad_norm": 0.632577657699585, + "learning_rate": 0.00011839322914229869, + "loss": 0.6159, + "step": 85430 + }, + { + "epoch": 1.9017094017094016, + "grad_norm": 0.5504708886146545, + "learning_rate": 0.00011835066680778446, + "loss": 0.4517, + "step": 85440 + }, + { + "epoch": 1.90193198005698, + "grad_norm": 0.6885342597961426, + "learning_rate": 0.00011830810891016173, + "loss": 0.5899, + "step": 85450 + }, + { + "epoch": 1.9021545584045585, + "grad_norm": 0.6813510656356812, + "learning_rate": 0.00011826555545174324, + "loss": 0.4313, + "step": 85460 + }, + { + "epoch": 1.9023771367521367, + "grad_norm": 0.8150388598442078, + "learning_rate": 0.00011822300643484145, + "loss": 0.5397, + "step": 85470 + }, + { + "epoch": 1.9025997150997151, + "grad_norm": 0.5913096070289612, + "learning_rate": 0.00011818046186176839, + "loss": 0.5695, + "step": 85480 + }, + { + "epoch": 1.9028222934472936, + "grad_norm": 0.5552115440368652, + "learning_rate": 0.000118137921734836, + "loss": 0.4947, + "step": 85490 + }, + { + "epoch": 1.9030448717948718, + "grad_norm": 0.5598598122596741, + "learning_rate": 0.00011809538605635599, + "loss": 0.5299, + "step": 85500 + }, + { + "epoch": 1.90326745014245, + "grad_norm": 0.5184574127197266, + "learning_rate": 0.00011805285482863972, + "loss": 0.4939, + "step": 85510 + }, + { + "epoch": 1.9034900284900285, + "grad_norm": 0.4912760853767395, + "learning_rate": 0.00011801032805399841, + "loss": 0.5775, + "step": 85520 + }, + { + "epoch": 1.903712606837607, + "grad_norm": 0.6037229299545288, + "learning_rate": 0.00011796780573474304, + "loss": 0.5351, + "step": 85530 + }, + { + "epoch": 1.9039351851851851, + "grad_norm": 0.6881440281867981, + "learning_rate": 0.0001179252878731842, + "loss": 0.504, + "step": 85540 + }, + { + "epoch": 1.9041577635327636, + "grad_norm": 0.6429431438446045, + "learning_rate": 0.00011788277447163244, + "loss": 0.6366, + "step": 85550 + }, + { + "epoch": 1.904380341880342, + "grad_norm": 0.5920882225036621, + "learning_rate": 0.00011784026553239793, + "loss": 0.5859, + "step": 85560 + }, + { + "epoch": 1.9046029202279202, + "grad_norm": 0.7298809885978699, + "learning_rate": 0.00011779776105779064, + "loss": 0.4179, + "step": 85570 + }, + { + "epoch": 1.9048254985754984, + "grad_norm": 0.5162444114685059, + "learning_rate": 0.00011775526105012038, + "loss": 0.6221, + "step": 85580 + }, + { + "epoch": 1.9050480769230769, + "grad_norm": 0.5679960250854492, + "learning_rate": 0.00011771276551169647, + "loss": 0.5587, + "step": 85590 + }, + { + "epoch": 1.9052706552706553, + "grad_norm": 0.663221001625061, + "learning_rate": 0.00011767027444482828, + "loss": 0.5354, + "step": 85600 + }, + { + "epoch": 1.9054932336182335, + "grad_norm": 0.44698238372802734, + "learning_rate": 0.0001176277878518248, + "loss": 0.4357, + "step": 85610 + }, + { + "epoch": 1.905715811965812, + "grad_norm": 0.6691514253616333, + "learning_rate": 0.00011758530573499471, + "loss": 0.6166, + "step": 85620 + }, + { + "epoch": 1.9059383903133904, + "grad_norm": 0.4185422956943512, + "learning_rate": 0.00011754282809664659, + "loss": 0.4938, + "step": 85630 + }, + { + "epoch": 1.9061609686609686, + "grad_norm": 0.5682917237281799, + "learning_rate": 0.00011750035493908874, + "loss": 0.5895, + "step": 85640 + }, + { + "epoch": 1.906383547008547, + "grad_norm": 0.5465940833091736, + "learning_rate": 0.00011745788626462908, + "loss": 0.568, + "step": 85650 + }, + { + "epoch": 1.9066061253561255, + "grad_norm": 0.477728933095932, + "learning_rate": 0.00011741542207557546, + "loss": 0.5208, + "step": 85660 + }, + { + "epoch": 1.9068287037037037, + "grad_norm": 0.5027092099189758, + "learning_rate": 0.00011737296237423545, + "loss": 0.5392, + "step": 85670 + }, + { + "epoch": 1.907051282051282, + "grad_norm": 0.47351041436195374, + "learning_rate": 0.00011733050716291627, + "loss": 0.5045, + "step": 85680 + }, + { + "epoch": 1.9072738603988604, + "grad_norm": 0.43656596541404724, + "learning_rate": 0.00011728805644392498, + "loss": 0.3866, + "step": 85690 + }, + { + "epoch": 1.9074964387464388, + "grad_norm": 0.492007315158844, + "learning_rate": 0.00011724561021956849, + "loss": 0.4327, + "step": 85700 + }, + { + "epoch": 1.907719017094017, + "grad_norm": 0.6703928112983704, + "learning_rate": 0.00011720316849215332, + "loss": 0.507, + "step": 85710 + }, + { + "epoch": 1.9079415954415955, + "grad_norm": 0.49135464429855347, + "learning_rate": 0.00011716073126398565, + "loss": 0.5191, + "step": 85720 + }, + { + "epoch": 1.908164173789174, + "grad_norm": 0.46850427985191345, + "learning_rate": 0.00011711829853737169, + "loss": 0.5541, + "step": 85730 + }, + { + "epoch": 1.9083867521367521, + "grad_norm": 0.4342600107192993, + "learning_rate": 0.00011707587031461722, + "loss": 0.5583, + "step": 85740 + }, + { + "epoch": 1.9086093304843303, + "grad_norm": 0.5486937165260315, + "learning_rate": 0.00011703344659802789, + "loss": 0.4439, + "step": 85750 + }, + { + "epoch": 1.9088319088319088, + "grad_norm": 0.5380405187606812, + "learning_rate": 0.00011699102738990895, + "loss": 0.5932, + "step": 85760 + }, + { + "epoch": 1.9090544871794872, + "grad_norm": 0.386771023273468, + "learning_rate": 0.00011694861269256554, + "loss": 0.5885, + "step": 85770 + }, + { + "epoch": 1.9092770655270654, + "grad_norm": 0.5973242521286011, + "learning_rate": 0.00011690620250830253, + "loss": 0.4062, + "step": 85780 + }, + { + "epoch": 1.9094996438746439, + "grad_norm": 0.48295095562934875, + "learning_rate": 0.00011686379683942448, + "loss": 0.5558, + "step": 85790 + }, + { + "epoch": 1.9097222222222223, + "grad_norm": 0.46864092350006104, + "learning_rate": 0.00011682139568823576, + "loss": 0.5994, + "step": 85800 + }, + { + "epoch": 1.9099448005698005, + "grad_norm": 0.5002042055130005, + "learning_rate": 0.00011677899905704053, + "loss": 0.5547, + "step": 85810 + }, + { + "epoch": 1.9101673789173788, + "grad_norm": 0.4876922369003296, + "learning_rate": 0.00011673660694814259, + "loss": 0.4462, + "step": 85820 + }, + { + "epoch": 1.9103899572649574, + "grad_norm": 0.4872403144836426, + "learning_rate": 0.00011669421936384559, + "loss": 0.6234, + "step": 85830 + }, + { + "epoch": 1.9106125356125356, + "grad_norm": 0.5643612742424011, + "learning_rate": 0.00011665183630645298, + "loss": 0.5341, + "step": 85840 + }, + { + "epoch": 1.9108351139601139, + "grad_norm": 0.5881261825561523, + "learning_rate": 0.00011660945777826775, + "loss": 0.5138, + "step": 85850 + }, + { + "epoch": 1.9110576923076923, + "grad_norm": 0.4867708683013916, + "learning_rate": 0.00011656708378159283, + "loss": 0.5568, + "step": 85860 + }, + { + "epoch": 1.9112802706552707, + "grad_norm": 0.6191570162773132, + "learning_rate": 0.00011652471431873086, + "loss": 0.5563, + "step": 85870 + }, + { + "epoch": 1.911502849002849, + "grad_norm": 0.6064071655273438, + "learning_rate": 0.00011648234939198428, + "loss": 0.5772, + "step": 85880 + }, + { + "epoch": 1.9117254273504274, + "grad_norm": 0.4651621878147125, + "learning_rate": 0.0001164399890036552, + "loss": 0.5513, + "step": 85890 + }, + { + "epoch": 1.9119480056980058, + "grad_norm": 0.3724319040775299, + "learning_rate": 0.0001163976331560456, + "loss": 0.4405, + "step": 85900 + }, + { + "epoch": 1.912170584045584, + "grad_norm": 0.4384273886680603, + "learning_rate": 0.00011635528185145696, + "loss": 0.5506, + "step": 85910 + }, + { + "epoch": 1.9123931623931623, + "grad_norm": 0.7029179334640503, + "learning_rate": 0.0001163129350921908, + "loss": 0.5189, + "step": 85920 + }, + { + "epoch": 1.9126157407407407, + "grad_norm": 0.625603437423706, + "learning_rate": 0.00011627059288054827, + "loss": 0.5317, + "step": 85930 + }, + { + "epoch": 1.9128383190883191, + "grad_norm": 0.5059728026390076, + "learning_rate": 0.00011622825521883025, + "loss": 0.4053, + "step": 85940 + }, + { + "epoch": 1.9130608974358974, + "grad_norm": 0.5113756656646729, + "learning_rate": 0.00011618592210933741, + "loss": 0.5821, + "step": 85950 + }, + { + "epoch": 1.9132834757834758, + "grad_norm": 0.648999810218811, + "learning_rate": 0.00011614359355437026, + "loss": 0.6489, + "step": 85960 + }, + { + "epoch": 1.9135060541310542, + "grad_norm": 0.5037275552749634, + "learning_rate": 0.00011610126955622882, + "loss": 0.5174, + "step": 85970 + }, + { + "epoch": 1.9137286324786325, + "grad_norm": 0.5843381881713867, + "learning_rate": 0.00011605895011721313, + "loss": 0.5892, + "step": 85980 + }, + { + "epoch": 1.9139512108262107, + "grad_norm": 0.43946772813796997, + "learning_rate": 0.00011601663523962274, + "loss": 0.5989, + "step": 85990 + }, + { + "epoch": 1.9141737891737893, + "grad_norm": 0.7192827463150024, + "learning_rate": 0.00011597432492575713, + "loss": 0.609, + "step": 86000 + }, + { + "epoch": 1.9143963675213675, + "grad_norm": 0.583531379699707, + "learning_rate": 0.00011593201917791552, + "loss": 0.537, + "step": 86010 + }, + { + "epoch": 1.9146189458689458, + "grad_norm": 0.5226663947105408, + "learning_rate": 0.00011588971799839678, + "loss": 0.4969, + "step": 86020 + }, + { + "epoch": 1.9148415242165242, + "grad_norm": 0.808573305606842, + "learning_rate": 0.00011584742138949965, + "loss": 0.6544, + "step": 86030 + }, + { + "epoch": 1.9150641025641026, + "grad_norm": 0.6683142185211182, + "learning_rate": 0.0001158051293535226, + "loss": 0.6896, + "step": 86040 + }, + { + "epoch": 1.9152866809116809, + "grad_norm": 0.47932371497154236, + "learning_rate": 0.00011576284189276365, + "loss": 0.4986, + "step": 86050 + }, + { + "epoch": 1.9155092592592593, + "grad_norm": 0.6403408646583557, + "learning_rate": 0.00011572055900952085, + "loss": 0.5601, + "step": 86060 + }, + { + "epoch": 1.9157318376068377, + "grad_norm": 0.46542832255363464, + "learning_rate": 0.00011567828070609183, + "loss": 0.5125, + "step": 86070 + }, + { + "epoch": 1.915954415954416, + "grad_norm": 1.7472851276397705, + "learning_rate": 0.00011563600698477413, + "loss": 0.5651, + "step": 86080 + }, + { + "epoch": 1.9161769943019942, + "grad_norm": 0.4838975965976715, + "learning_rate": 0.00011559373784786483, + "loss": 0.5217, + "step": 86090 + }, + { + "epoch": 1.9163995726495726, + "grad_norm": 0.612494170665741, + "learning_rate": 0.00011555147329766098, + "loss": 0.5574, + "step": 86100 + }, + { + "epoch": 1.916622150997151, + "grad_norm": 0.2956370711326599, + "learning_rate": 0.00011550921333645917, + "loss": 0.5312, + "step": 86110 + }, + { + "epoch": 1.9168447293447293, + "grad_norm": 0.4158254861831665, + "learning_rate": 0.00011546695796655593, + "loss": 0.5194, + "step": 86120 + }, + { + "epoch": 1.9170673076923077, + "grad_norm": 0.9217372536659241, + "learning_rate": 0.00011542470719024732, + "loss": 0.5323, + "step": 86130 + }, + { + "epoch": 1.9172898860398861, + "grad_norm": 0.5834040641784668, + "learning_rate": 0.00011538246100982935, + "loss": 0.4904, + "step": 86140 + }, + { + "epoch": 1.9175124643874644, + "grad_norm": 0.6396893858909607, + "learning_rate": 0.00011534021942759775, + "loss": 0.4483, + "step": 86150 + }, + { + "epoch": 1.9177350427350426, + "grad_norm": 0.41121169924736023, + "learning_rate": 0.00011529798244584789, + "loss": 0.4859, + "step": 86160 + }, + { + "epoch": 1.9179576210826212, + "grad_norm": 0.6772044897079468, + "learning_rate": 0.00011525575006687502, + "loss": 0.5019, + "step": 86170 + }, + { + "epoch": 1.9181801994301995, + "grad_norm": 0.5235520601272583, + "learning_rate": 0.00011521352229297412, + "loss": 0.6181, + "step": 86180 + }, + { + "epoch": 1.9184027777777777, + "grad_norm": 0.6111951470375061, + "learning_rate": 0.0001151712991264398, + "loss": 0.6011, + "step": 86190 + }, + { + "epoch": 1.9186253561253561, + "grad_norm": 0.5527158379554749, + "learning_rate": 0.00011512908056956651, + "loss": 0.5046, + "step": 86200 + }, + { + "epoch": 1.9188479344729346, + "grad_norm": 0.4959039092063904, + "learning_rate": 0.00011508686662464844, + "loss": 0.5589, + "step": 86210 + }, + { + "epoch": 1.9190705128205128, + "grad_norm": 0.3603544533252716, + "learning_rate": 0.00011504465729397957, + "loss": 0.4758, + "step": 86220 + }, + { + "epoch": 1.9192930911680912, + "grad_norm": 0.594570517539978, + "learning_rate": 0.00011500245257985365, + "loss": 0.5857, + "step": 86230 + }, + { + "epoch": 1.9195156695156697, + "grad_norm": 0.4193931221961975, + "learning_rate": 0.00011496025248456397, + "loss": 0.4426, + "step": 86240 + }, + { + "epoch": 1.9197382478632479, + "grad_norm": 0.47187313437461853, + "learning_rate": 0.00011491805701040376, + "loss": 0.4891, + "step": 86250 + }, + { + "epoch": 1.919960826210826, + "grad_norm": 0.5686632990837097, + "learning_rate": 0.00011487586615966607, + "loss": 0.6884, + "step": 86260 + }, + { + "epoch": 1.9201834045584045, + "grad_norm": 0.43781980872154236, + "learning_rate": 0.00011483367993464345, + "loss": 0.5434, + "step": 86270 + }, + { + "epoch": 1.9202279202279202, + "eval_loss": 0.5408384203910828, + "eval_runtime": 337.9859, + "eval_samples_per_second": 6.997, + "eval_steps_per_second": 6.997, + "step": 86272 + }, + { + "epoch": 1.920405982905983, + "grad_norm": 0.4699023962020874, + "learning_rate": 0.00011479149833762832, + "loss": 0.5495, + "step": 86280 + }, + { + "epoch": 1.9206285612535612, + "grad_norm": 0.7036943435668945, + "learning_rate": 0.00011474932137091299, + "loss": 0.5594, + "step": 86290 + }, + { + "epoch": 1.9208511396011396, + "grad_norm": 0.4120835065841675, + "learning_rate": 0.0001147071490367893, + "loss": 0.5549, + "step": 86300 + }, + { + "epoch": 1.921073717948718, + "grad_norm": 0.4701841175556183, + "learning_rate": 0.00011466498133754895, + "loss": 0.4743, + "step": 86310 + }, + { + "epoch": 1.9212962962962963, + "grad_norm": 0.5886813998222351, + "learning_rate": 0.00011462281827548347, + "loss": 0.5863, + "step": 86320 + }, + { + "epoch": 1.9215188746438745, + "grad_norm": 0.63768470287323, + "learning_rate": 0.00011458065985288382, + "loss": 0.5573, + "step": 86330 + }, + { + "epoch": 1.921741452991453, + "grad_norm": 0.42090120911598206, + "learning_rate": 0.00011453850607204106, + "loss": 0.4823, + "step": 86340 + }, + { + "epoch": 1.9219640313390314, + "grad_norm": 0.4522937834262848, + "learning_rate": 0.00011449635693524587, + "loss": 0.4412, + "step": 86350 + }, + { + "epoch": 1.9221866096866096, + "grad_norm": 0.7093581557273865, + "learning_rate": 0.00011445421244478869, + "loss": 0.5428, + "step": 86360 + }, + { + "epoch": 1.922409188034188, + "grad_norm": 0.6658710241317749, + "learning_rate": 0.00011441207260295956, + "loss": 0.5897, + "step": 86370 + }, + { + "epoch": 1.9226317663817665, + "grad_norm": 0.438462495803833, + "learning_rate": 0.00011436993741204847, + "loss": 0.4477, + "step": 86380 + }, + { + "epoch": 1.9228543447293447, + "grad_norm": 0.5765256881713867, + "learning_rate": 0.0001143278068743451, + "loss": 0.5906, + "step": 86390 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 0.6065409183502197, + "learning_rate": 0.0001142856809921389, + "loss": 0.6645, + "step": 86400 + }, + { + "epoch": 1.9232995014245016, + "grad_norm": 0.3438239097595215, + "learning_rate": 0.0001142435597677189, + "loss": 0.5343, + "step": 86410 + }, + { + "epoch": 1.9235220797720798, + "grad_norm": 0.5237026810646057, + "learning_rate": 0.00011420144320337405, + "loss": 0.5707, + "step": 86420 + }, + { + "epoch": 1.923744658119658, + "grad_norm": 0.7502513527870178, + "learning_rate": 0.00011415933130139302, + "loss": 0.6223, + "step": 86430 + }, + { + "epoch": 1.9239672364672364, + "grad_norm": 0.5259876251220703, + "learning_rate": 0.0001141172240640642, + "loss": 0.5175, + "step": 86440 + }, + { + "epoch": 1.9241898148148149, + "grad_norm": 0.5037872195243835, + "learning_rate": 0.00011407512149367572, + "loss": 0.4996, + "step": 86450 + }, + { + "epoch": 1.924412393162393, + "grad_norm": 0.5434580445289612, + "learning_rate": 0.00011403302359251558, + "loss": 0.5908, + "step": 86460 + }, + { + "epoch": 1.9246349715099715, + "grad_norm": 0.38471388816833496, + "learning_rate": 0.00011399093036287123, + "loss": 0.5842, + "step": 86470 + }, + { + "epoch": 1.92485754985755, + "grad_norm": 0.6415163278579712, + "learning_rate": 0.00011394884180703012, + "loss": 0.6516, + "step": 86480 + }, + { + "epoch": 1.9250801282051282, + "grad_norm": 0.5737249255180359, + "learning_rate": 0.00011390675792727947, + "loss": 0.6385, + "step": 86490 + }, + { + "epoch": 1.9253027065527064, + "grad_norm": 0.5640256404876709, + "learning_rate": 0.00011386467872590601, + "loss": 0.5657, + "step": 86500 + }, + { + "epoch": 1.9255252849002849, + "grad_norm": 0.6238548159599304, + "learning_rate": 0.00011382260420519641, + "loss": 0.589, + "step": 86510 + }, + { + "epoch": 1.9257478632478633, + "grad_norm": 0.5096440315246582, + "learning_rate": 0.00011378053436743706, + "loss": 0.4942, + "step": 86520 + }, + { + "epoch": 1.9259704415954415, + "grad_norm": 0.49294623732566833, + "learning_rate": 0.00011373846921491402, + "loss": 0.4893, + "step": 86530 + }, + { + "epoch": 1.92619301994302, + "grad_norm": 0.5232724547386169, + "learning_rate": 0.0001136964087499133, + "loss": 0.6122, + "step": 86540 + }, + { + "epoch": 1.9264155982905984, + "grad_norm": 0.7509873509407043, + "learning_rate": 0.00011365435297472027, + "loss": 0.5612, + "step": 86550 + }, + { + "epoch": 1.9266381766381766, + "grad_norm": 0.4268711805343628, + "learning_rate": 0.00011361230189162042, + "loss": 0.521, + "step": 86560 + }, + { + "epoch": 1.9268607549857548, + "grad_norm": 0.5106337666511536, + "learning_rate": 0.0001135702555028988, + "loss": 0.5348, + "step": 86570 + }, + { + "epoch": 1.9270833333333335, + "grad_norm": 0.7293399572372437, + "learning_rate": 0.00011352821381084022, + "loss": 0.6527, + "step": 86580 + }, + { + "epoch": 1.9273059116809117, + "grad_norm": 0.6081629395484924, + "learning_rate": 0.00011348617681772931, + "loss": 0.5959, + "step": 86590 + }, + { + "epoch": 1.92752849002849, + "grad_norm": 0.3281137943267822, + "learning_rate": 0.00011344414452585044, + "loss": 0.4751, + "step": 86600 + }, + { + "epoch": 1.9277510683760684, + "grad_norm": 0.6834414005279541, + "learning_rate": 0.00011340211693748755, + "loss": 0.478, + "step": 86610 + }, + { + "epoch": 1.9279736467236468, + "grad_norm": 0.44042229652404785, + "learning_rate": 0.00011336009405492452, + "loss": 0.5395, + "step": 86620 + }, + { + "epoch": 1.928196225071225, + "grad_norm": 0.6206148862838745, + "learning_rate": 0.00011331807588044498, + "loss": 0.5037, + "step": 86630 + }, + { + "epoch": 1.9284188034188035, + "grad_norm": 0.4447237551212311, + "learning_rate": 0.00011327606241633209, + "loss": 0.5585, + "step": 86640 + }, + { + "epoch": 1.928641381766382, + "grad_norm": 0.60000079870224, + "learning_rate": 0.00011323405366486892, + "loss": 0.5136, + "step": 86650 + }, + { + "epoch": 1.92886396011396, + "grad_norm": 0.2710607945919037, + "learning_rate": 0.00011319204962833834, + "loss": 0.5419, + "step": 86660 + }, + { + "epoch": 1.9290865384615383, + "grad_norm": 0.5812567472457886, + "learning_rate": 0.00011315005030902292, + "loss": 0.5482, + "step": 86670 + }, + { + "epoch": 1.9293091168091168, + "grad_norm": 0.34236037731170654, + "learning_rate": 0.0001131080557092048, + "loss": 0.4468, + "step": 86680 + }, + { + "epoch": 1.9295316951566952, + "grad_norm": 0.7552074790000916, + "learning_rate": 0.00011306606583116606, + "loss": 0.5755, + "step": 86690 + }, + { + "epoch": 1.9297542735042734, + "grad_norm": 0.4339049458503723, + "learning_rate": 0.00011302408067718846, + "loss": 0.5557, + "step": 86700 + }, + { + "epoch": 1.9299768518518519, + "grad_norm": 0.529327392578125, + "learning_rate": 0.00011298210024955351, + "loss": 0.7157, + "step": 86710 + }, + { + "epoch": 1.9301994301994303, + "grad_norm": 0.5876535773277283, + "learning_rate": 0.00011294012455054249, + "loss": 0.5565, + "step": 86720 + }, + { + "epoch": 1.9304220085470085, + "grad_norm": 0.7795631289482117, + "learning_rate": 0.00011289815358243636, + "loss": 0.5131, + "step": 86730 + }, + { + "epoch": 1.9306445868945867, + "grad_norm": 0.5589414834976196, + "learning_rate": 0.00011285618734751595, + "loss": 0.5326, + "step": 86740 + }, + { + "epoch": 1.9308671652421654, + "grad_norm": 0.46770164370536804, + "learning_rate": 0.00011281422584806156, + "loss": 0.5333, + "step": 86750 + }, + { + "epoch": 1.9310897435897436, + "grad_norm": 0.6444997787475586, + "learning_rate": 0.00011277226908635363, + "loss": 0.5487, + "step": 86760 + }, + { + "epoch": 1.9313123219373218, + "grad_norm": 0.7315465211868286, + "learning_rate": 0.0001127303170646719, + "loss": 0.5377, + "step": 86770 + }, + { + "epoch": 1.9315349002849003, + "grad_norm": 0.4663032293319702, + "learning_rate": 0.0001126883697852962, + "loss": 0.5601, + "step": 86780 + }, + { + "epoch": 1.9317574786324787, + "grad_norm": 0.8682087063789368, + "learning_rate": 0.00011264642725050594, + "loss": 0.4918, + "step": 86790 + }, + { + "epoch": 1.931980056980057, + "grad_norm": 0.5688363313674927, + "learning_rate": 0.00011260448946258035, + "loss": 0.6255, + "step": 86800 + }, + { + "epoch": 1.9322026353276354, + "grad_norm": 0.6298112273216248, + "learning_rate": 0.00011256255642379843, + "loss": 0.4573, + "step": 86810 + }, + { + "epoch": 1.9324252136752138, + "grad_norm": 0.3815062940120697, + "learning_rate": 0.00011252062813643868, + "loss": 0.6459, + "step": 86820 + }, + { + "epoch": 1.932647792022792, + "grad_norm": 0.5226765275001526, + "learning_rate": 0.00011247870460277962, + "loss": 0.6222, + "step": 86830 + }, + { + "epoch": 1.9328703703703702, + "grad_norm": 0.6144941449165344, + "learning_rate": 0.0001124367858250994, + "loss": 0.4883, + "step": 86840 + }, + { + "epoch": 1.9330929487179487, + "grad_norm": 0.7446057200431824, + "learning_rate": 0.00011239487180567594, + "loss": 0.5385, + "step": 86850 + }, + { + "epoch": 1.9333155270655271, + "grad_norm": 0.5342898964881897, + "learning_rate": 0.00011235296254678682, + "loss": 0.5993, + "step": 86860 + }, + { + "epoch": 1.9335381054131053, + "grad_norm": 0.5611191987991333, + "learning_rate": 0.00011231105805070949, + "loss": 0.5582, + "step": 86870 + }, + { + "epoch": 1.9337606837606838, + "grad_norm": 0.6036495566368103, + "learning_rate": 0.00011226915831972114, + "loss": 0.6238, + "step": 86880 + }, + { + "epoch": 1.9339832621082622, + "grad_norm": 0.6017242670059204, + "learning_rate": 0.00011222726335609852, + "loss": 0.5185, + "step": 86890 + }, + { + "epoch": 1.9342058404558404, + "grad_norm": 0.3665631115436554, + "learning_rate": 0.00011218537316211821, + "loss": 0.5793, + "step": 86900 + }, + { + "epoch": 1.9344284188034186, + "grad_norm": 0.6043411493301392, + "learning_rate": 0.00011214348774005661, + "loss": 0.5855, + "step": 86910 + }, + { + "epoch": 1.9346509971509973, + "grad_norm": 0.5133441090583801, + "learning_rate": 0.0001121016070921898, + "loss": 0.4867, + "step": 86920 + }, + { + "epoch": 1.9348735754985755, + "grad_norm": 0.5243979692459106, + "learning_rate": 0.00011205973122079361, + "loss": 0.5063, + "step": 86930 + }, + { + "epoch": 1.9350961538461537, + "grad_norm": 0.5311282277107239, + "learning_rate": 0.00011201786012814363, + "loss": 0.6174, + "step": 86940 + }, + { + "epoch": 1.9353187321937322, + "grad_norm": 0.6318812370300293, + "learning_rate": 0.00011197599381651525, + "loss": 0.5249, + "step": 86950 + }, + { + "epoch": 1.9355413105413106, + "grad_norm": 1.000100016593933, + "learning_rate": 0.0001119341322881833, + "loss": 0.5078, + "step": 86960 + }, + { + "epoch": 1.9357638888888888, + "grad_norm": 0.5029283165931702, + "learning_rate": 0.00011189227554542272, + "loss": 0.4465, + "step": 86970 + }, + { + "epoch": 1.9359864672364673, + "grad_norm": 0.5905060768127441, + "learning_rate": 0.00011185042359050801, + "loss": 0.5005, + "step": 86980 + }, + { + "epoch": 1.9362090455840457, + "grad_norm": 0.6078103184700012, + "learning_rate": 0.00011180857642571347, + "loss": 0.5083, + "step": 86990 + }, + { + "epoch": 1.936431623931624, + "grad_norm": 0.48058727383613586, + "learning_rate": 0.00011176673405331306, + "loss": 0.6146, + "step": 87000 + }, + { + "epoch": 1.9366542022792022, + "grad_norm": 0.5735967755317688, + "learning_rate": 0.00011172489647558055, + "loss": 0.6516, + "step": 87010 + }, + { + "epoch": 1.9368767806267806, + "grad_norm": 0.3764326572418213, + "learning_rate": 0.00011168306369478954, + "loss": 0.6557, + "step": 87020 + }, + { + "epoch": 1.937099358974359, + "grad_norm": 0.46224406361579895, + "learning_rate": 0.00011164123571321312, + "loss": 0.4073, + "step": 87030 + }, + { + "epoch": 1.9373219373219372, + "grad_norm": 0.6236437559127808, + "learning_rate": 0.0001115994125331242, + "loss": 0.5778, + "step": 87040 + }, + { + "epoch": 1.9375445156695157, + "grad_norm": 0.4578424394130707, + "learning_rate": 0.00011155759415679558, + "loss": 0.5797, + "step": 87050 + }, + { + "epoch": 1.9377670940170941, + "grad_norm": 0.5358936786651611, + "learning_rate": 0.00011151578058649971, + "loss": 0.5618, + "step": 87060 + }, + { + "epoch": 1.9379896723646723, + "grad_norm": 0.7817856073379517, + "learning_rate": 0.00011147397182450876, + "loss": 0.5458, + "step": 87070 + }, + { + "epoch": 1.9382122507122506, + "grad_norm": 0.6386322975158691, + "learning_rate": 0.00011143216787309466, + "loss": 0.4807, + "step": 87080 + }, + { + "epoch": 1.9384348290598292, + "grad_norm": 0.4296022951602936, + "learning_rate": 0.0001113903687345291, + "loss": 0.5916, + "step": 87090 + }, + { + "epoch": 1.9386574074074074, + "grad_norm": 0.4393704831600189, + "learning_rate": 0.00011134857441108337, + "loss": 0.5215, + "step": 87100 + }, + { + "epoch": 1.9388799857549857, + "grad_norm": 0.47139036655426025, + "learning_rate": 0.0001113067849050287, + "loss": 0.5059, + "step": 87110 + }, + { + "epoch": 1.939102564102564, + "grad_norm": 0.5626164674758911, + "learning_rate": 0.00011126500021863595, + "loss": 0.4885, + "step": 87120 + }, + { + "epoch": 1.9393251424501425, + "grad_norm": 1.0058131217956543, + "learning_rate": 0.00011122322035417574, + "loss": 0.5409, + "step": 87130 + }, + { + "epoch": 1.9395477207977208, + "grad_norm": 0.5677262544631958, + "learning_rate": 0.00011118144531391838, + "loss": 0.5311, + "step": 87140 + }, + { + "epoch": 1.9397702991452992, + "grad_norm": 0.5968604683876038, + "learning_rate": 0.00011113967510013407, + "loss": 0.4733, + "step": 87150 + }, + { + "epoch": 1.9399928774928776, + "grad_norm": 0.5895420908927917, + "learning_rate": 0.0001110979097150925, + "loss": 0.5589, + "step": 87160 + }, + { + "epoch": 1.9402154558404558, + "grad_norm": 0.41556987166404724, + "learning_rate": 0.00011105614916106337, + "loss": 0.4725, + "step": 87170 + }, + { + "epoch": 1.940438034188034, + "grad_norm": 0.6243895888328552, + "learning_rate": 0.00011101439344031584, + "loss": 0.5698, + "step": 87180 + }, + { + "epoch": 1.9406606125356125, + "grad_norm": 0.452260285615921, + "learning_rate": 0.00011097264255511901, + "loss": 0.4175, + "step": 87190 + }, + { + "epoch": 1.940883190883191, + "grad_norm": 0.4803001284599304, + "learning_rate": 0.00011093089650774167, + "loss": 0.4819, + "step": 87200 + }, + { + "epoch": 1.9411057692307692, + "grad_norm": 0.5832453370094299, + "learning_rate": 0.0001108891553004523, + "loss": 0.6637, + "step": 87210 + }, + { + "epoch": 1.9413283475783476, + "grad_norm": 0.5619366765022278, + "learning_rate": 0.0001108474189355192, + "loss": 0.4702, + "step": 87220 + }, + { + "epoch": 1.941550925925926, + "grad_norm": 0.4576612114906311, + "learning_rate": 0.00011080568741521043, + "loss": 0.5133, + "step": 87230 + }, + { + "epoch": 1.9417735042735043, + "grad_norm": 0.6994546055793762, + "learning_rate": 0.00011076396074179352, + "loss": 0.5462, + "step": 87240 + }, + { + "epoch": 1.9419960826210825, + "grad_norm": 0.6044816374778748, + "learning_rate": 0.00011072223891753605, + "loss": 0.5178, + "step": 87250 + }, + { + "epoch": 1.942218660968661, + "grad_norm": 0.5070165395736694, + "learning_rate": 0.00011068052194470519, + "loss": 0.5199, + "step": 87260 + }, + { + "epoch": 1.9424412393162394, + "grad_norm": 0.7343992590904236, + "learning_rate": 0.00011063880982556791, + "loss": 0.4915, + "step": 87270 + }, + { + "epoch": 1.9426638176638176, + "grad_norm": 0.4487825632095337, + "learning_rate": 0.00011059710256239091, + "loss": 0.5069, + "step": 87280 + }, + { + "epoch": 1.942886396011396, + "grad_norm": 0.39010611176490784, + "learning_rate": 0.00011055540015744048, + "loss": 0.5441, + "step": 87290 + }, + { + "epoch": 1.9431089743589745, + "grad_norm": 0.39558878540992737, + "learning_rate": 0.00011051370261298287, + "loss": 0.5061, + "step": 87300 + }, + { + "epoch": 1.9433315527065527, + "grad_norm": 0.6780951619148254, + "learning_rate": 0.00011047200993128393, + "loss": 0.6518, + "step": 87310 + }, + { + "epoch": 1.943554131054131, + "grad_norm": 0.4943266212940216, + "learning_rate": 0.0001104303221146092, + "loss": 0.573, + "step": 87320 + }, + { + "epoch": 1.9437767094017095, + "grad_norm": 0.35894832015037537, + "learning_rate": 0.00011038863916522411, + "loss": 0.466, + "step": 87330 + }, + { + "epoch": 1.9439992877492878, + "grad_norm": 1.0896111726760864, + "learning_rate": 0.00011034696108539373, + "loss": 0.5727, + "step": 87340 + }, + { + "epoch": 1.944221866096866, + "grad_norm": 0.4004582166671753, + "learning_rate": 0.00011030528787738286, + "loss": 0.4915, + "step": 87350 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.5520352125167847, + "learning_rate": 0.00011026361954345607, + "loss": 0.4131, + "step": 87360 + }, + { + "epoch": 1.9446670227920229, + "grad_norm": 0.4198590815067291, + "learning_rate": 0.00011022195608587776, + "loss": 0.4725, + "step": 87370 + }, + { + "epoch": 1.944889601139601, + "grad_norm": 0.49883604049682617, + "learning_rate": 0.00011018029750691174, + "loss": 0.4958, + "step": 87380 + }, + { + "epoch": 1.9451121794871795, + "grad_norm": 0.470523864030838, + "learning_rate": 0.00011013864380882191, + "loss": 0.5394, + "step": 87390 + }, + { + "epoch": 1.945334757834758, + "grad_norm": 0.31142184138298035, + "learning_rate": 0.00011009699499387172, + "loss": 0.5702, + "step": 87400 + }, + { + "epoch": 1.9455573361823362, + "grad_norm": 0.753667950630188, + "learning_rate": 0.00011005535106432452, + "loss": 0.4705, + "step": 87410 + }, + { + "epoch": 1.9457799145299144, + "grad_norm": 0.6320822238922119, + "learning_rate": 0.00011001371202244309, + "loss": 0.5632, + "step": 87420 + }, + { + "epoch": 1.9460024928774928, + "grad_norm": 0.7997458577156067, + "learning_rate": 0.0001099720778704902, + "loss": 0.6188, + "step": 87430 + }, + { + "epoch": 1.9462250712250713, + "grad_norm": 0.5000240206718445, + "learning_rate": 0.0001099304486107283, + "loss": 0.4939, + "step": 87440 + }, + { + "epoch": 1.9464476495726495, + "grad_norm": 0.6512289047241211, + "learning_rate": 0.00010988882424541964, + "loss": 0.5788, + "step": 87450 + }, + { + "epoch": 1.946670227920228, + "grad_norm": 0.4772588014602661, + "learning_rate": 0.00010984720477682597, + "loss": 0.6477, + "step": 87460 + }, + { + "epoch": 1.9468928062678064, + "grad_norm": 0.7150284051895142, + "learning_rate": 0.000109805590207209, + "loss": 0.4929, + "step": 87470 + }, + { + "epoch": 1.9471153846153846, + "grad_norm": 0.4428633451461792, + "learning_rate": 0.00010976398053883008, + "loss": 0.5268, + "step": 87480 + }, + { + "epoch": 1.9473379629629628, + "grad_norm": 0.5598447322845459, + "learning_rate": 0.00010972237577395033, + "loss": 0.614, + "step": 87490 + }, + { + "epoch": 1.9475605413105415, + "grad_norm": 0.67003333568573, + "learning_rate": 0.00010968077591483059, + "loss": 0.5683, + "step": 87500 + }, + { + "epoch": 1.9477831196581197, + "grad_norm": 0.7352870106697083, + "learning_rate": 0.00010963918096373152, + "loss": 0.5054, + "step": 87510 + }, + { + "epoch": 1.948005698005698, + "grad_norm": 0.4697332978248596, + "learning_rate": 0.00010959759092291322, + "loss": 0.6146, + "step": 87520 + }, + { + "epoch": 1.9482282763532763, + "grad_norm": 0.544786810874939, + "learning_rate": 0.00010955600579463583, + "loss": 0.5645, + "step": 87530 + }, + { + "epoch": 1.9484508547008548, + "grad_norm": 0.5202828049659729, + "learning_rate": 0.00010951442558115923, + "loss": 0.573, + "step": 87540 + }, + { + "epoch": 1.948673433048433, + "grad_norm": 0.4407440423965454, + "learning_rate": 0.00010947285028474275, + "loss": 0.4154, + "step": 87550 + }, + { + "epoch": 1.9488960113960114, + "grad_norm": 0.4419044554233551, + "learning_rate": 0.00010943127990764566, + "loss": 0.5709, + "step": 87560 + }, + { + "epoch": 1.9491185897435899, + "grad_norm": 0.5056807398796082, + "learning_rate": 0.00010938971445212698, + "loss": 0.5888, + "step": 87570 + }, + { + "epoch": 1.949341168091168, + "grad_norm": 0.5175442695617676, + "learning_rate": 0.00010934815392044542, + "loss": 0.5497, + "step": 87580 + }, + { + "epoch": 1.9495637464387463, + "grad_norm": 0.7415947318077087, + "learning_rate": 0.00010930659831485944, + "loss": 0.5651, + "step": 87590 + }, + { + "epoch": 1.9497863247863247, + "grad_norm": 0.5029433965682983, + "learning_rate": 0.00010926504763762706, + "loss": 0.4877, + "step": 87600 + }, + { + "epoch": 1.9500089031339032, + "grad_norm": 0.47681859135627747, + "learning_rate": 0.0001092235018910063, + "loss": 0.5207, + "step": 87610 + }, + { + "epoch": 1.9502314814814814, + "grad_norm": 0.3940894305706024, + "learning_rate": 0.00010918196107725474, + "loss": 0.4589, + "step": 87620 + }, + { + "epoch": 1.9504540598290598, + "grad_norm": 0.4383629262447357, + "learning_rate": 0.00010914042519862975, + "loss": 0.5397, + "step": 87630 + }, + { + "epoch": 1.9506766381766383, + "grad_norm": 0.4269826412200928, + "learning_rate": 0.00010909889425738847, + "loss": 0.4881, + "step": 87640 + }, + { + "epoch": 1.9508992165242165, + "grad_norm": 0.725731611251831, + "learning_rate": 0.00010905736825578774, + "loss": 0.485, + "step": 87650 + }, + { + "epoch": 1.9511217948717947, + "grad_norm": 0.5675638914108276, + "learning_rate": 0.000109015847196084, + "loss": 0.4996, + "step": 87660 + }, + { + "epoch": 1.9513443732193734, + "grad_norm": 0.5646101236343384, + "learning_rate": 0.00010897433108053361, + "loss": 0.6168, + "step": 87670 + }, + { + "epoch": 1.9515669515669516, + "grad_norm": 0.6364313364028931, + "learning_rate": 0.00010893281991139268, + "loss": 0.6142, + "step": 87680 + }, + { + "epoch": 1.9517895299145298, + "grad_norm": 0.5038259625434875, + "learning_rate": 0.00010889131369091676, + "loss": 0.492, + "step": 87690 + }, + { + "epoch": 1.9520121082621082, + "grad_norm": 0.5601425766944885, + "learning_rate": 0.00010884981242136145, + "loss": 0.6074, + "step": 87700 + }, + { + "epoch": 1.9522346866096867, + "grad_norm": 0.9033480882644653, + "learning_rate": 0.00010880831610498194, + "loss": 0.5971, + "step": 87710 + }, + { + "epoch": 1.952457264957265, + "grad_norm": 0.6076274514198303, + "learning_rate": 0.00010876682474403321, + "loss": 0.5046, + "step": 87720 + }, + { + "epoch": 1.9526798433048433, + "grad_norm": 0.7191054821014404, + "learning_rate": 0.00010872533834076999, + "loss": 0.6105, + "step": 87730 + }, + { + "epoch": 1.9529024216524218, + "grad_norm": 0.3403964042663574, + "learning_rate": 0.00010868385689744652, + "loss": 0.4262, + "step": 87740 + }, + { + "epoch": 1.953125, + "grad_norm": 0.29592037200927734, + "learning_rate": 0.000108642380416317, + "loss": 0.6061, + "step": 87750 + }, + { + "epoch": 1.9533475783475782, + "grad_norm": 0.5361388921737671, + "learning_rate": 0.00010860090889963532, + "loss": 0.6893, + "step": 87760 + }, + { + "epoch": 1.9535701566951567, + "grad_norm": 0.48884057998657227, + "learning_rate": 0.00010855944234965512, + "loss": 0.4991, + "step": 87770 + }, + { + "epoch": 1.953792735042735, + "grad_norm": 0.47964853048324585, + "learning_rate": 0.00010851798076862962, + "loss": 0.6482, + "step": 87780 + }, + { + "epoch": 1.9540153133903133, + "grad_norm": 0.8488168716430664, + "learning_rate": 0.00010847652415881204, + "loss": 0.5503, + "step": 87790 + }, + { + "epoch": 1.9542378917378918, + "grad_norm": 0.5509916543960571, + "learning_rate": 0.00010843507252245499, + "loss": 0.5735, + "step": 87800 + }, + { + "epoch": 1.9544604700854702, + "grad_norm": 0.5244113802909851, + "learning_rate": 0.00010839362586181114, + "loss": 0.5345, + "step": 87810 + }, + { + "epoch": 1.9546830484330484, + "grad_norm": 0.5117759108543396, + "learning_rate": 0.00010835218417913255, + "loss": 0.5599, + "step": 87820 + }, + { + "epoch": 1.9549056267806266, + "grad_norm": 0.6230669021606445, + "learning_rate": 0.00010831074747667131, + "loss": 0.4652, + "step": 87830 + }, + { + "epoch": 1.9551282051282053, + "grad_norm": 0.6364635229110718, + "learning_rate": 0.00010826931575667911, + "loss": 0.4689, + "step": 87840 + }, + { + "epoch": 1.9553507834757835, + "grad_norm": 0.4438205659389496, + "learning_rate": 0.00010822788902140741, + "loss": 0.5304, + "step": 87850 + }, + { + "epoch": 1.9555733618233617, + "grad_norm": 0.48501142859458923, + "learning_rate": 0.0001081864672731073, + "loss": 0.4529, + "step": 87860 + }, + { + "epoch": 1.9557959401709402, + "grad_norm": 0.4966143071651459, + "learning_rate": 0.00010814505051402984, + "loss": 0.429, + "step": 87870 + }, + { + "epoch": 1.9560185185185186, + "grad_norm": 0.39442554116249084, + "learning_rate": 0.00010810363874642544, + "loss": 0.6, + "step": 87880 + }, + { + "epoch": 1.9562410968660968, + "grad_norm": 0.7312954664230347, + "learning_rate": 0.00010806223197254455, + "loss": 0.6283, + "step": 87890 + }, + { + "epoch": 1.9564636752136753, + "grad_norm": 0.6002002358436584, + "learning_rate": 0.00010802083019463722, + "loss": 0.5609, + "step": 87900 + }, + { + "epoch": 1.9566862535612537, + "grad_norm": 0.528228223323822, + "learning_rate": 0.00010797943341495329, + "loss": 0.5249, + "step": 87910 + }, + { + "epoch": 1.956908831908832, + "grad_norm": 0.7415391802787781, + "learning_rate": 0.00010793804163574228, + "loss": 0.8109, + "step": 87920 + }, + { + "epoch": 1.9571314102564101, + "grad_norm": 0.4688614010810852, + "learning_rate": 0.00010789665485925353, + "loss": 0.4535, + "step": 87930 + }, + { + "epoch": 1.9573539886039886, + "grad_norm": 0.5116744041442871, + "learning_rate": 0.00010785527308773593, + "loss": 0.6447, + "step": 87940 + }, + { + "epoch": 1.957576566951567, + "grad_norm": 0.487576961517334, + "learning_rate": 0.00010781389632343817, + "loss": 0.5942, + "step": 87950 + }, + { + "epoch": 1.9577991452991452, + "grad_norm": 0.5948076248168945, + "learning_rate": 0.00010777252456860876, + "loss": 0.501, + "step": 87960 + }, + { + "epoch": 1.9580217236467237, + "grad_norm": 0.5517193078994751, + "learning_rate": 0.00010773115782549585, + "loss": 0.5566, + "step": 87970 + }, + { + "epoch": 1.958244301994302, + "grad_norm": 0.5932909250259399, + "learning_rate": 0.00010768979609634735, + "loss": 0.669, + "step": 87980 + }, + { + "epoch": 1.9584668803418803, + "grad_norm": 0.9186346530914307, + "learning_rate": 0.00010764843938341092, + "loss": 0.6424, + "step": 87990 + }, + { + "epoch": 1.9586894586894585, + "grad_norm": 0.5425516366958618, + "learning_rate": 0.00010760708768893386, + "loss": 0.4626, + "step": 88000 + }, + { + "epoch": 1.9589120370370372, + "grad_norm": 0.3957112431526184, + "learning_rate": 0.00010756574101516342, + "loss": 0.5944, + "step": 88010 + }, + { + "epoch": 1.9591346153846154, + "grad_norm": 0.37178757786750793, + "learning_rate": 0.00010752439936434617, + "loss": 0.4518, + "step": 88020 + }, + { + "epoch": 1.9593571937321936, + "grad_norm": 0.48003068566322327, + "learning_rate": 0.00010748306273872876, + "loss": 0.5355, + "step": 88030 + }, + { + "epoch": 1.959579772079772, + "grad_norm": 0.510742723941803, + "learning_rate": 0.00010744173114055747, + "loss": 0.4429, + "step": 88040 + }, + { + "epoch": 1.9598023504273505, + "grad_norm": 0.4262735843658447, + "learning_rate": 0.0001074004045720783, + "loss": 0.4797, + "step": 88050 + }, + { + "epoch": 1.9600249287749287, + "grad_norm": 0.5816901326179504, + "learning_rate": 0.00010735908303553693, + "loss": 0.5103, + "step": 88060 + }, + { + "epoch": 1.9602475071225072, + "grad_norm": 0.5432152152061462, + "learning_rate": 0.0001073177665331789, + "loss": 0.4934, + "step": 88070 + }, + { + "epoch": 1.9604700854700856, + "grad_norm": 0.43575814366340637, + "learning_rate": 0.00010727645506724933, + "loss": 0.4969, + "step": 88080 + }, + { + "epoch": 1.9606926638176638, + "grad_norm": 0.441064715385437, + "learning_rate": 0.000107235148639993, + "loss": 0.5214, + "step": 88090 + }, + { + "epoch": 1.960915242165242, + "grad_norm": 0.5713571310043335, + "learning_rate": 0.00010719384725365465, + "loss": 0.4997, + "step": 88100 + }, + { + "epoch": 1.9611378205128205, + "grad_norm": 0.5397986769676208, + "learning_rate": 0.00010715255091047862, + "loss": 0.5765, + "step": 88110 + }, + { + "epoch": 1.961360398860399, + "grad_norm": 0.5189772844314575, + "learning_rate": 0.00010711125961270896, + "loss": 0.4562, + "step": 88120 + }, + { + "epoch": 1.9615829772079771, + "grad_norm": 0.554828405380249, + "learning_rate": 0.0001070699733625895, + "loss": 0.5483, + "step": 88130 + }, + { + "epoch": 1.9618055555555556, + "grad_norm": 0.28737306594848633, + "learning_rate": 0.00010702869216236378, + "loss": 0.5179, + "step": 88140 + }, + { + "epoch": 1.962028133903134, + "grad_norm": 0.5746546387672424, + "learning_rate": 0.00010698741601427511, + "loss": 0.505, + "step": 88150 + }, + { + "epoch": 1.9622507122507122, + "grad_norm": 0.5059083104133606, + "learning_rate": 0.0001069461449205663, + "loss": 0.4812, + "step": 88160 + }, + { + "epoch": 1.9624732905982905, + "grad_norm": 0.6395819783210754, + "learning_rate": 0.00010690487888348016, + "loss": 0.5105, + "step": 88170 + }, + { + "epoch": 1.962695868945869, + "grad_norm": 0.7052072286605835, + "learning_rate": 0.00010686361790525911, + "loss": 0.5763, + "step": 88180 + }, + { + "epoch": 1.9629184472934473, + "grad_norm": 0.42889729142189026, + "learning_rate": 0.00010682236198814533, + "loss": 0.4853, + "step": 88190 + }, + { + "epoch": 1.9631410256410255, + "grad_norm": 0.875728189945221, + "learning_rate": 0.00010678111113438074, + "loss": 0.6747, + "step": 88200 + }, + { + "epoch": 1.963363603988604, + "grad_norm": 0.6272012591362, + "learning_rate": 0.00010673986534620681, + "loss": 0.6072, + "step": 88210 + }, + { + "epoch": 1.9635861823361824, + "grad_norm": 0.46567538380622864, + "learning_rate": 0.00010669862462586502, + "loss": 0.4489, + "step": 88220 + }, + { + "epoch": 1.9638087606837606, + "grad_norm": 0.6730668544769287, + "learning_rate": 0.00010665738897559627, + "loss": 0.495, + "step": 88230 + }, + { + "epoch": 1.964031339031339, + "grad_norm": 0.6411313414573669, + "learning_rate": 0.00010661615839764141, + "loss": 0.5778, + "step": 88240 + }, + { + "epoch": 1.9642539173789175, + "grad_norm": 0.4434345066547394, + "learning_rate": 0.00010657493289424097, + "loss": 0.4711, + "step": 88250 + }, + { + "epoch": 1.9644764957264957, + "grad_norm": 0.49573907256126404, + "learning_rate": 0.00010653371246763515, + "loss": 0.5912, + "step": 88260 + }, + { + "epoch": 1.964699074074074, + "grad_norm": 0.3320474624633789, + "learning_rate": 0.0001064924971200639, + "loss": 0.5829, + "step": 88270 + }, + { + "epoch": 1.9649216524216524, + "grad_norm": 0.6052396893501282, + "learning_rate": 0.00010645128685376699, + "loss": 0.5362, + "step": 88280 + }, + { + "epoch": 1.9651442307692308, + "grad_norm": 0.3489750623703003, + "learning_rate": 0.00010641008167098365, + "loss": 0.6776, + "step": 88290 + }, + { + "epoch": 1.965366809116809, + "grad_norm": 0.5899769067764282, + "learning_rate": 0.0001063688815739531, + "loss": 0.5392, + "step": 88300 + }, + { + "epoch": 1.9655893874643875, + "grad_norm": 0.33934059739112854, + "learning_rate": 0.00010632768656491416, + "loss": 0.3809, + "step": 88310 + }, + { + "epoch": 1.965811965811966, + "grad_norm": 0.5670376420021057, + "learning_rate": 0.00010628649664610542, + "loss": 0.5409, + "step": 88320 + }, + { + "epoch": 1.9660345441595442, + "grad_norm": 0.7151758670806885, + "learning_rate": 0.00010624531181976526, + "loss": 0.4336, + "step": 88330 + }, + { + "epoch": 1.9662571225071224, + "grad_norm": 0.43221569061279297, + "learning_rate": 0.00010620413208813152, + "loss": 0.472, + "step": 88340 + }, + { + "epoch": 1.9664797008547008, + "grad_norm": 0.7412872314453125, + "learning_rate": 0.00010616295745344203, + "loss": 0.6344, + "step": 88350 + }, + { + "epoch": 1.9667022792022792, + "grad_norm": 0.42692792415618896, + "learning_rate": 0.00010612178791793432, + "loss": 0.4771, + "step": 88360 + }, + { + "epoch": 1.9669248575498575, + "grad_norm": 0.6319110989570618, + "learning_rate": 0.00010608062348384544, + "loss": 0.6329, + "step": 88370 + }, + { + "epoch": 1.967147435897436, + "grad_norm": 0.4742584526538849, + "learning_rate": 0.00010603946415341237, + "loss": 0.5043, + "step": 88380 + }, + { + "epoch": 1.9673700142450143, + "grad_norm": 0.43028610944747925, + "learning_rate": 0.00010599830992887172, + "loss": 0.4571, + "step": 88390 + }, + { + "epoch": 1.9675925925925926, + "grad_norm": 0.5484131574630737, + "learning_rate": 0.00010595716081245988, + "loss": 0.4252, + "step": 88400 + }, + { + "epoch": 1.9678151709401708, + "grad_norm": 0.6796470284461975, + "learning_rate": 0.00010591601680641288, + "loss": 0.4696, + "step": 88410 + }, + { + "epoch": 1.9680377492877494, + "grad_norm": 0.48466822504997253, + "learning_rate": 0.00010587487791296666, + "loss": 0.4373, + "step": 88420 + }, + { + "epoch": 1.9682603276353277, + "grad_norm": 0.6868494749069214, + "learning_rate": 0.00010583374413435651, + "loss": 0.4667, + "step": 88430 + }, + { + "epoch": 1.9684829059829059, + "grad_norm": 0.699004590511322, + "learning_rate": 0.00010579261547281785, + "loss": 0.5441, + "step": 88440 + }, + { + "epoch": 1.9687054843304843, + "grad_norm": 0.6828630566596985, + "learning_rate": 0.00010575149193058554, + "loss": 0.4365, + "step": 88450 + }, + { + "epoch": 1.9689280626780628, + "grad_norm": 0.45245376229286194, + "learning_rate": 0.00010571037350989442, + "loss": 0.4757, + "step": 88460 + }, + { + "epoch": 1.969150641025641, + "grad_norm": 0.5244765877723694, + "learning_rate": 0.0001056692602129787, + "loss": 0.4994, + "step": 88470 + }, + { + "epoch": 1.9693732193732194, + "grad_norm": 0.5580021739006042, + "learning_rate": 0.00010562815204207259, + "loss": 0.6172, + "step": 88480 + }, + { + "epoch": 1.9695957977207978, + "grad_norm": 0.7489878535270691, + "learning_rate": 0.00010558704899940996, + "loss": 0.6022, + "step": 88490 + }, + { + "epoch": 1.969818376068376, + "grad_norm": 0.5939428806304932, + "learning_rate": 0.00010554595108722445, + "loss": 0.5763, + "step": 88500 + }, + { + "epoch": 1.9700409544159543, + "grad_norm": 0.5634039640426636, + "learning_rate": 0.00010550485830774923, + "loss": 0.4778, + "step": 88510 + }, + { + "epoch": 1.9702635327635327, + "grad_norm": 0.5487877130508423, + "learning_rate": 0.00010546377066321735, + "loss": 0.5419, + "step": 88520 + }, + { + "epoch": 1.9704861111111112, + "grad_norm": 0.37066948413848877, + "learning_rate": 0.00010542268815586158, + "loss": 0.4689, + "step": 88530 + }, + { + "epoch": 1.9707086894586894, + "grad_norm": 0.473908007144928, + "learning_rate": 0.00010538161078791433, + "loss": 0.4853, + "step": 88540 + }, + { + "epoch": 1.9709312678062678, + "grad_norm": 0.5134755969047546, + "learning_rate": 0.00010534053856160784, + "loss": 0.5325, + "step": 88550 + }, + { + "epoch": 1.9711538461538463, + "grad_norm": 0.40971675515174866, + "learning_rate": 0.00010529947147917403, + "loss": 0.5162, + "step": 88560 + }, + { + "epoch": 1.9713764245014245, + "grad_norm": 0.6359806060791016, + "learning_rate": 0.00010525840954284438, + "loss": 0.7041, + "step": 88570 + }, + { + "epoch": 1.9715990028490027, + "grad_norm": 0.5774009227752686, + "learning_rate": 0.00010521735275485035, + "loss": 0.5622, + "step": 88580 + }, + { + "epoch": 1.9718215811965814, + "grad_norm": 0.758904218673706, + "learning_rate": 0.00010517630111742303, + "loss": 0.6136, + "step": 88590 + }, + { + "epoch": 1.9720441595441596, + "grad_norm": 0.40916797518730164, + "learning_rate": 0.00010513525463279306, + "loss": 0.5237, + "step": 88600 + }, + { + "epoch": 1.9722667378917378, + "grad_norm": 0.4158216118812561, + "learning_rate": 0.00010509421330319098, + "loss": 0.4971, + "step": 88610 + }, + { + "epoch": 1.9724893162393162, + "grad_norm": 0.7104225158691406, + "learning_rate": 0.00010505317713084708, + "loss": 0.5799, + "step": 88620 + }, + { + "epoch": 1.9727118945868947, + "grad_norm": 0.5312607288360596, + "learning_rate": 0.00010501214611799125, + "loss": 0.5532, + "step": 88630 + }, + { + "epoch": 1.9729344729344729, + "grad_norm": 0.6019620895385742, + "learning_rate": 0.00010497112026685321, + "loss": 0.5284, + "step": 88640 + }, + { + "epoch": 1.9731570512820513, + "grad_norm": 0.6432709693908691, + "learning_rate": 0.00010493009957966224, + "loss": 0.6361, + "step": 88650 + }, + { + "epoch": 1.9733796296296298, + "grad_norm": 0.6770408153533936, + "learning_rate": 0.00010488908405864749, + "loss": 0.5335, + "step": 88660 + }, + { + "epoch": 1.973602207977208, + "grad_norm": 0.681075930595398, + "learning_rate": 0.00010484807370603777, + "loss": 0.6111, + "step": 88670 + }, + { + "epoch": 1.9738247863247862, + "grad_norm": 0.5270264744758606, + "learning_rate": 0.00010480706852406159, + "loss": 0.5412, + "step": 88680 + }, + { + "epoch": 1.9740473646723646, + "grad_norm": 0.6909594535827637, + "learning_rate": 0.00010476606851494728, + "loss": 0.4828, + "step": 88690 + }, + { + "epoch": 1.974269943019943, + "grad_norm": 0.4906465411186218, + "learning_rate": 0.00010472507368092284, + "loss": 0.54, + "step": 88700 + }, + { + "epoch": 1.9744925213675213, + "grad_norm": 0.7198073267936707, + "learning_rate": 0.00010468408402421578, + "loss": 0.6627, + "step": 88710 + }, + { + "epoch": 1.9747150997150997, + "grad_norm": 0.8544343709945679, + "learning_rate": 0.00010464309954705371, + "loss": 0.6367, + "step": 88720 + }, + { + "epoch": 1.9749376780626782, + "grad_norm": 1.1126538515090942, + "learning_rate": 0.00010460212025166363, + "loss": 0.45, + "step": 88730 + }, + { + "epoch": 1.9751602564102564, + "grad_norm": 0.503483772277832, + "learning_rate": 0.00010456114614027241, + "loss": 0.5136, + "step": 88740 + }, + { + "epoch": 1.9753828347578346, + "grad_norm": 0.5431207418441772, + "learning_rate": 0.00010452017721510663, + "loss": 0.6539, + "step": 88750 + }, + { + "epoch": 1.9756054131054133, + "grad_norm": 0.724375307559967, + "learning_rate": 0.0001044792134783926, + "loss": 0.6294, + "step": 88760 + }, + { + "epoch": 1.9758279914529915, + "grad_norm": 0.7335283160209656, + "learning_rate": 0.00010443825493235628, + "loss": 0.4855, + "step": 88770 + }, + { + "epoch": 1.9760505698005697, + "grad_norm": 0.5082575678825378, + "learning_rate": 0.00010439730157922355, + "loss": 0.5777, + "step": 88780 + }, + { + "epoch": 1.9762731481481481, + "grad_norm": 0.34826645255088806, + "learning_rate": 0.00010435635342121959, + "loss": 0.5078, + "step": 88790 + }, + { + "epoch": 1.9764957264957266, + "grad_norm": 0.5735419392585754, + "learning_rate": 0.0001043154104605697, + "loss": 0.6271, + "step": 88800 + }, + { + "epoch": 1.9767183048433048, + "grad_norm": 0.8078730702400208, + "learning_rate": 0.00010427447269949872, + "loss": 0.6738, + "step": 88810 + }, + { + "epoch": 1.9769408831908832, + "grad_norm": 0.3965419828891754, + "learning_rate": 0.00010423354014023128, + "loss": 0.5354, + "step": 88820 + }, + { + "epoch": 1.9771634615384617, + "grad_norm": 0.37384480237960815, + "learning_rate": 0.00010419261278499166, + "loss": 0.3936, + "step": 88830 + }, + { + "epoch": 1.97738603988604, + "grad_norm": 0.7163903713226318, + "learning_rate": 0.00010415169063600397, + "loss": 0.4801, + "step": 88840 + }, + { + "epoch": 1.977608618233618, + "grad_norm": 0.6502336859703064, + "learning_rate": 0.00010411077369549178, + "loss": 0.4773, + "step": 88850 + }, + { + "epoch": 1.9778311965811965, + "grad_norm": 0.7422441840171814, + "learning_rate": 0.00010406986196567873, + "loss": 0.6388, + "step": 88860 + }, + { + "epoch": 1.978053774928775, + "grad_norm": 0.5089988112449646, + "learning_rate": 0.00010402895544878785, + "loss": 0.5556, + "step": 88870 + }, + { + "epoch": 1.9782763532763532, + "grad_norm": 0.4812432527542114, + "learning_rate": 0.0001039880541470421, + "loss": 0.3783, + "step": 88880 + }, + { + "epoch": 1.9784989316239316, + "grad_norm": 0.6941017508506775, + "learning_rate": 0.00010394715806266409, + "loss": 0.5007, + "step": 88890 + }, + { + "epoch": 1.97872150997151, + "grad_norm": 0.44974082708358765, + "learning_rate": 0.0001039062671978761, + "loss": 0.4807, + "step": 88900 + }, + { + "epoch": 1.9789440883190883, + "grad_norm": 0.43339797854423523, + "learning_rate": 0.00010386538155490026, + "loss": 0.5258, + "step": 88910 + }, + { + "epoch": 1.9791666666666665, + "grad_norm": 0.4669586420059204, + "learning_rate": 0.00010382450113595835, + "loss": 0.4959, + "step": 88920 + }, + { + "epoch": 1.9793892450142452, + "grad_norm": 0.4113246500492096, + "learning_rate": 0.00010378362594327171, + "loss": 0.5621, + "step": 88930 + }, + { + "epoch": 1.9796118233618234, + "grad_norm": 0.5443630814552307, + "learning_rate": 0.00010374275597906157, + "loss": 0.5251, + "step": 88940 + }, + { + "epoch": 1.9798344017094016, + "grad_norm": 0.5284280180931091, + "learning_rate": 0.00010370189124554892, + "loss": 0.6476, + "step": 88950 + }, + { + "epoch": 1.98005698005698, + "grad_norm": 0.584843635559082, + "learning_rate": 0.00010366103174495432, + "loss": 0.5103, + "step": 88960 + }, + { + "epoch": 1.9802350427350426, + "eval_loss": 0.5369717478752136, + "eval_runtime": 337.2246, + "eval_samples_per_second": 7.013, + "eval_steps_per_second": 7.013, + "step": 88968 + }, + { + "epoch": 1.9802795584045585, + "grad_norm": 0.5102009773254395, + "learning_rate": 0.0001036201774794981, + "loss": 0.5013, + "step": 88970 + }, + { + "epoch": 1.9805021367521367, + "grad_norm": 0.5352452397346497, + "learning_rate": 0.00010357932845140044, + "loss": 0.5947, + "step": 88980 + }, + { + "epoch": 1.9807247150997151, + "grad_norm": 0.42273956537246704, + "learning_rate": 0.0001035384846628809, + "loss": 0.4462, + "step": 88990 + }, + { + "epoch": 1.9809472934472936, + "grad_norm": 0.5700570940971375, + "learning_rate": 0.00010349764611615915, + "loss": 0.4705, + "step": 89000 + }, + { + "epoch": 1.9811698717948718, + "grad_norm": 0.487447053194046, + "learning_rate": 0.00010345681281345425, + "loss": 0.4994, + "step": 89010 + }, + { + "epoch": 1.98139245014245, + "grad_norm": 0.7617775797843933, + "learning_rate": 0.00010341598475698517, + "loss": 0.5634, + "step": 89020 + }, + { + "epoch": 1.9816150284900285, + "grad_norm": 0.528607964515686, + "learning_rate": 0.00010337516194897054, + "loss": 0.5352, + "step": 89030 + }, + { + "epoch": 1.981837606837607, + "grad_norm": 0.40189066529273987, + "learning_rate": 0.00010333434439162872, + "loss": 0.6469, + "step": 89040 + }, + { + "epoch": 1.9820601851851851, + "grad_norm": 0.4828200340270996, + "learning_rate": 0.00010329353208717776, + "loss": 0.4271, + "step": 89050 + }, + { + "epoch": 1.9822827635327636, + "grad_norm": 0.6710944771766663, + "learning_rate": 0.0001032527250378355, + "loss": 0.5488, + "step": 89060 + }, + { + "epoch": 1.982505341880342, + "grad_norm": 0.6126538515090942, + "learning_rate": 0.0001032119232458193, + "loss": 0.4838, + "step": 89070 + }, + { + "epoch": 1.9827279202279202, + "grad_norm": 0.5800119042396545, + "learning_rate": 0.00010317112671334643, + "loss": 0.5686, + "step": 89080 + }, + { + "epoch": 1.9829504985754984, + "grad_norm": 0.6056652665138245, + "learning_rate": 0.00010313033544263378, + "loss": 0.4074, + "step": 89090 + }, + { + "epoch": 1.9831730769230769, + "grad_norm": 0.7546206712722778, + "learning_rate": 0.00010308954943589801, + "loss": 0.5134, + "step": 89100 + }, + { + "epoch": 1.9833956552706553, + "grad_norm": 0.5002375245094299, + "learning_rate": 0.00010304876869535545, + "loss": 0.6756, + "step": 89110 + }, + { + "epoch": 1.9836182336182335, + "grad_norm": 0.4622044563293457, + "learning_rate": 0.00010300799322322228, + "loss": 0.5782, + "step": 89120 + }, + { + "epoch": 1.983840811965812, + "grad_norm": 0.5083198547363281, + "learning_rate": 0.00010296722302171411, + "loss": 0.5272, + "step": 89130 + }, + { + "epoch": 1.9840633903133904, + "grad_norm": 0.47521403431892395, + "learning_rate": 0.00010292645809304644, + "loss": 0.4775, + "step": 89140 + }, + { + "epoch": 1.9842859686609686, + "grad_norm": 0.48532363772392273, + "learning_rate": 0.00010288569843943449, + "loss": 0.4386, + "step": 89150 + }, + { + "epoch": 1.984508547008547, + "grad_norm": 0.5855087041854858, + "learning_rate": 0.0001028449440630932, + "loss": 0.5472, + "step": 89160 + }, + { + "epoch": 1.9847311253561255, + "grad_norm": 0.5583044290542603, + "learning_rate": 0.00010280419496623714, + "loss": 0.633, + "step": 89170 + }, + { + "epoch": 1.9849537037037037, + "grad_norm": 0.5322542190551758, + "learning_rate": 0.00010276345115108075, + "loss": 0.5051, + "step": 89180 + }, + { + "epoch": 1.985176282051282, + "grad_norm": 0.43916016817092896, + "learning_rate": 0.00010272271261983799, + "loss": 0.4588, + "step": 89190 + }, + { + "epoch": 1.9853988603988604, + "grad_norm": 0.7113112807273865, + "learning_rate": 0.00010268197937472275, + "loss": 0.5232, + "step": 89200 + }, + { + "epoch": 1.9856214387464388, + "grad_norm": 0.6803197860717773, + "learning_rate": 0.00010264125141794836, + "loss": 0.5176, + "step": 89210 + }, + { + "epoch": 1.985844017094017, + "grad_norm": 1.2475115060806274, + "learning_rate": 0.00010260052875172803, + "loss": 0.4694, + "step": 89220 + }, + { + "epoch": 1.9860665954415955, + "grad_norm": 0.585325300693512, + "learning_rate": 0.00010255981137827473, + "loss": 0.4763, + "step": 89230 + }, + { + "epoch": 1.986289173789174, + "grad_norm": 0.5921732783317566, + "learning_rate": 0.00010251909929980103, + "loss": 0.5698, + "step": 89240 + }, + { + "epoch": 1.9865117521367521, + "grad_norm": 0.3192692995071411, + "learning_rate": 0.00010247839251851936, + "loss": 0.636, + "step": 89250 + }, + { + "epoch": 1.9867343304843303, + "grad_norm": 0.445182204246521, + "learning_rate": 0.00010243769103664163, + "loss": 0.3865, + "step": 89260 + }, + { + "epoch": 1.9869569088319088, + "grad_norm": 0.5367228388786316, + "learning_rate": 0.00010239699485637966, + "loss": 0.4922, + "step": 89270 + }, + { + "epoch": 1.9871794871794872, + "grad_norm": 0.4065212905406952, + "learning_rate": 0.00010235630397994485, + "loss": 0.5156, + "step": 89280 + }, + { + "epoch": 1.9874020655270654, + "grad_norm": 0.538700520992279, + "learning_rate": 0.0001023156184095484, + "loss": 0.5277, + "step": 89290 + }, + { + "epoch": 1.9876246438746439, + "grad_norm": 0.6323667764663696, + "learning_rate": 0.00010227493814740124, + "loss": 0.6574, + "step": 89300 + }, + { + "epoch": 1.9878472222222223, + "grad_norm": 0.29169631004333496, + "learning_rate": 0.00010223426319571392, + "loss": 0.4746, + "step": 89310 + }, + { + "epoch": 1.9880698005698005, + "grad_norm": 0.47286543250083923, + "learning_rate": 0.00010219359355669677, + "loss": 0.4648, + "step": 89320 + }, + { + "epoch": 1.9882923789173788, + "grad_norm": 0.5874037146568298, + "learning_rate": 0.00010215292923255982, + "loss": 0.5053, + "step": 89330 + }, + { + "epoch": 1.9885149572649574, + "grad_norm": 0.49212488532066345, + "learning_rate": 0.00010211227022551288, + "loss": 0.575, + "step": 89340 + }, + { + "epoch": 1.9887375356125356, + "grad_norm": 0.6886469125747681, + "learning_rate": 0.00010207161653776522, + "loss": 0.3993, + "step": 89350 + }, + { + "epoch": 1.9889601139601139, + "grad_norm": 0.6136534810066223, + "learning_rate": 0.00010203096817152609, + "loss": 0.5185, + "step": 89360 + }, + { + "epoch": 1.9891826923076923, + "grad_norm": 0.6381192803382874, + "learning_rate": 0.00010199032512900437, + "loss": 0.5533, + "step": 89370 + }, + { + "epoch": 1.9894052706552707, + "grad_norm": 0.6178556680679321, + "learning_rate": 0.00010194968741240866, + "loss": 0.5602, + "step": 89380 + }, + { + "epoch": 1.989627849002849, + "grad_norm": 0.46962249279022217, + "learning_rate": 0.00010190905502394717, + "loss": 0.5227, + "step": 89390 + }, + { + "epoch": 1.9898504273504274, + "grad_norm": 0.4782501459121704, + "learning_rate": 0.00010186842796582789, + "loss": 0.4977, + "step": 89400 + }, + { + "epoch": 1.9900730056980058, + "grad_norm": 0.8166126608848572, + "learning_rate": 0.00010182780624025868, + "loss": 0.6993, + "step": 89410 + }, + { + "epoch": 1.990295584045584, + "grad_norm": 0.55714350938797, + "learning_rate": 0.00010178718984944673, + "loss": 0.5452, + "step": 89420 + }, + { + "epoch": 1.9905181623931623, + "grad_norm": 0.6192950010299683, + "learning_rate": 0.0001017465787955993, + "loss": 0.5868, + "step": 89430 + }, + { + "epoch": 1.9907407407407407, + "grad_norm": 0.5710587501525879, + "learning_rate": 0.00010170597308092324, + "loss": 0.5548, + "step": 89440 + }, + { + "epoch": 1.9909633190883191, + "grad_norm": 0.43953341245651245, + "learning_rate": 0.00010166537270762504, + "loss": 0.4858, + "step": 89450 + }, + { + "epoch": 1.9911858974358974, + "grad_norm": 0.43213924765586853, + "learning_rate": 0.00010162477767791099, + "loss": 0.4924, + "step": 89460 + }, + { + "epoch": 1.9914084757834758, + "grad_norm": 0.5746148228645325, + "learning_rate": 0.00010158418799398706, + "loss": 0.578, + "step": 89470 + }, + { + "epoch": 1.9916310541310542, + "grad_norm": 0.6764656901359558, + "learning_rate": 0.00010154360365805899, + "loss": 0.5371, + "step": 89480 + }, + { + "epoch": 1.9918536324786325, + "grad_norm": 0.5575287342071533, + "learning_rate": 0.00010150302467233204, + "loss": 0.6483, + "step": 89490 + }, + { + "epoch": 1.9920762108262107, + "grad_norm": 0.5395053625106812, + "learning_rate": 0.00010146245103901135, + "loss": 0.5266, + "step": 89500 + }, + { + "epoch": 1.9922987891737893, + "grad_norm": 0.4512389004230499, + "learning_rate": 0.00010142188276030182, + "loss": 0.4788, + "step": 89510 + }, + { + "epoch": 1.9925213675213675, + "grad_norm": 0.5930687189102173, + "learning_rate": 0.00010138131983840779, + "loss": 0.4624, + "step": 89520 + }, + { + "epoch": 1.9927439458689458, + "grad_norm": 0.32276391983032227, + "learning_rate": 0.00010134076227553358, + "loss": 0.4986, + "step": 89530 + }, + { + "epoch": 1.9929665242165242, + "grad_norm": 0.4842802882194519, + "learning_rate": 0.00010130021007388313, + "loss": 0.5604, + "step": 89540 + }, + { + "epoch": 1.9931891025641026, + "grad_norm": 0.7211653590202332, + "learning_rate": 0.00010125966323566012, + "loss": 0.5766, + "step": 89550 + }, + { + "epoch": 1.9934116809116809, + "grad_norm": 0.38424408435821533, + "learning_rate": 0.00010121912176306776, + "loss": 0.5793, + "step": 89560 + }, + { + "epoch": 1.9936342592592593, + "grad_norm": 0.5912850499153137, + "learning_rate": 0.00010117858565830922, + "loss": 0.5374, + "step": 89570 + }, + { + "epoch": 1.9938568376068377, + "grad_norm": 0.4847007691860199, + "learning_rate": 0.00010113805492358721, + "loss": 0.4387, + "step": 89580 + }, + { + "epoch": 1.994079415954416, + "grad_norm": 0.4593251645565033, + "learning_rate": 0.00010109752956110423, + "loss": 0.447, + "step": 89590 + }, + { + "epoch": 1.9943019943019942, + "grad_norm": 0.7256568074226379, + "learning_rate": 0.00010105700957306248, + "loss": 0.5952, + "step": 89600 + }, + { + "epoch": 1.9945245726495726, + "grad_norm": 0.4649495780467987, + "learning_rate": 0.00010101649496166379, + "loss": 0.5566, + "step": 89610 + }, + { + "epoch": 1.994747150997151, + "grad_norm": 0.4171789586544037, + "learning_rate": 0.00010097598572910988, + "loss": 0.5237, + "step": 89620 + }, + { + "epoch": 1.9949697293447293, + "grad_norm": 0.4397051930427551, + "learning_rate": 0.00010093548187760192, + "loss": 0.6384, + "step": 89630 + }, + { + "epoch": 1.9951923076923077, + "grad_norm": 0.5089512467384338, + "learning_rate": 0.00010089498340934102, + "loss": 0.4769, + "step": 89640 + }, + { + "epoch": 1.9954148860398861, + "grad_norm": 0.6652963757514954, + "learning_rate": 0.0001008544903265278, + "loss": 0.6014, + "step": 89650 + }, + { + "epoch": 1.9956374643874644, + "grad_norm": 0.4736979007720947, + "learning_rate": 0.00010081400263136274, + "loss": 0.4361, + "step": 89660 + }, + { + "epoch": 1.9958600427350426, + "grad_norm": 0.7009586095809937, + "learning_rate": 0.00010077352032604597, + "loss": 0.6858, + "step": 89670 + }, + { + "epoch": 1.9960826210826212, + "grad_norm": 0.7269137501716614, + "learning_rate": 0.00010073304341277733, + "loss": 0.5995, + "step": 89680 + }, + { + "epoch": 1.9963051994301995, + "grad_norm": 0.6158400774002075, + "learning_rate": 0.00010069257189375645, + "loss": 0.6985, + "step": 89690 + }, + { + "epoch": 1.9965277777777777, + "grad_norm": 0.9363667964935303, + "learning_rate": 0.00010065210577118245, + "loss": 0.4506, + "step": 89700 + }, + { + "epoch": 1.9967503561253561, + "grad_norm": 0.6668571829795837, + "learning_rate": 0.00010061164504725433, + "loss": 0.5016, + "step": 89710 + }, + { + "epoch": 1.9969729344729346, + "grad_norm": 0.5492924451828003, + "learning_rate": 0.00010057118972417079, + "loss": 0.4525, + "step": 89720 + }, + { + "epoch": 1.9971955128205128, + "grad_norm": 0.6436484456062317, + "learning_rate": 0.00010053073980413019, + "loss": 0.6489, + "step": 89730 + }, + { + "epoch": 1.9974180911680912, + "grad_norm": 0.6494245529174805, + "learning_rate": 0.00010049029528933065, + "loss": 0.6466, + "step": 89740 + }, + { + "epoch": 1.9976406695156697, + "grad_norm": 0.33054423332214355, + "learning_rate": 0.00010044985618196987, + "loss": 0.5338, + "step": 89750 + }, + { + "epoch": 1.9978632478632479, + "grad_norm": 0.4462552070617676, + "learning_rate": 0.00010040942248424553, + "loss": 0.4926, + "step": 89760 + }, + { + "epoch": 1.998085826210826, + "grad_norm": 0.5570842623710632, + "learning_rate": 0.00010036899419835468, + "loss": 0.4973, + "step": 89770 + }, + { + "epoch": 1.9983084045584045, + "grad_norm": 0.5469715595245361, + "learning_rate": 0.00010032857132649418, + "loss": 0.4614, + "step": 89780 + }, + { + "epoch": 1.998530982905983, + "grad_norm": 0.6691291928291321, + "learning_rate": 0.0001002881538708607, + "loss": 0.5004, + "step": 89790 + }, + { + "epoch": 1.9987535612535612, + "grad_norm": 0.6604383587837219, + "learning_rate": 0.00010024774183365056, + "loss": 0.4791, + "step": 89800 + }, + { + "epoch": 1.9989761396011396, + "grad_norm": 0.6781284809112549, + "learning_rate": 0.00010020733521705978, + "loss": 0.5646, + "step": 89810 + }, + { + "epoch": 1.999198717948718, + "grad_norm": 0.5180469751358032, + "learning_rate": 0.00010016693402328412, + "loss": 0.5442, + "step": 89820 + }, + { + "epoch": 1.9994212962962963, + "grad_norm": 0.647405743598938, + "learning_rate": 0.00010012653825451908, + "loss": 0.508, + "step": 89830 + }, + { + "epoch": 1.9996438746438745, + "grad_norm": 0.5377041101455688, + "learning_rate": 0.00010008614791295961, + "loss": 0.5258, + "step": 89840 + }, + { + "epoch": 1.999866452991453, + "grad_norm": 0.6276906132698059, + "learning_rate": 0.00010004576300080067, + "loss": 0.4638, + "step": 89850 + }, + { + "epoch": 2.0000890313390314, + "grad_norm": 0.3543466031551361, + "learning_rate": 0.00010000538352023676, + "loss": 0.4689, + "step": 89860 + }, + { + "epoch": 2.0003116096866096, + "grad_norm": 0.43617504835128784, + "learning_rate": 9.996500947346221e-05, + "loss": 0.3632, + "step": 89870 + }, + { + "epoch": 2.0005341880341883, + "grad_norm": 0.45962151885032654, + "learning_rate": 9.992464086267093e-05, + "loss": 0.4641, + "step": 89880 + }, + { + "epoch": 2.0007567663817665, + "grad_norm": 0.42497971653938293, + "learning_rate": 9.98842776900566e-05, + "loss": 0.5192, + "step": 89890 + }, + { + "epoch": 2.0009793447293447, + "grad_norm": 0.777499794960022, + "learning_rate": 9.984391995781262e-05, + "loss": 0.436, + "step": 89900 + }, + { + "epoch": 2.001201923076923, + "grad_norm": 0.5374550819396973, + "learning_rate": 9.980356766813206e-05, + "loss": 0.4228, + "step": 89910 + }, + { + "epoch": 2.0014245014245016, + "grad_norm": 0.6309812664985657, + "learning_rate": 9.976322082320755e-05, + "loss": 0.4356, + "step": 89920 + }, + { + "epoch": 2.00164707977208, + "grad_norm": 0.6194542646408081, + "learning_rate": 9.972287942523168e-05, + "loss": 0.5146, + "step": 89930 + }, + { + "epoch": 2.001869658119658, + "grad_norm": 0.5618504881858826, + "learning_rate": 9.968254347639666e-05, + "loss": 0.3839, + "step": 89940 + }, + { + "epoch": 2.0020922364672367, + "grad_norm": 0.41237515211105347, + "learning_rate": 9.964221297889433e-05, + "loss": 0.3925, + "step": 89950 + }, + { + "epoch": 2.002314814814815, + "grad_norm": 0.5048966407775879, + "learning_rate": 9.960188793491632e-05, + "loss": 0.5813, + "step": 89960 + }, + { + "epoch": 2.002537393162393, + "grad_norm": 0.5869396328926086, + "learning_rate": 9.956156834665398e-05, + "loss": 0.5925, + "step": 89970 + }, + { + "epoch": 2.0027599715099713, + "grad_norm": 0.598250150680542, + "learning_rate": 9.952125421629814e-05, + "loss": 0.4339, + "step": 89980 + }, + { + "epoch": 2.00298254985755, + "grad_norm": 0.7474868893623352, + "learning_rate": 9.948094554603962e-05, + "loss": 0.4814, + "step": 89990 + }, + { + "epoch": 2.003205128205128, + "grad_norm": 0.7503690719604492, + "learning_rate": 9.94406423380688e-05, + "loss": 0.4807, + "step": 90000 + }, + { + "epoch": 2.0034277065527064, + "grad_norm": 0.47324520349502563, + "learning_rate": 9.94003445945758e-05, + "loss": 0.4463, + "step": 90010 + }, + { + "epoch": 2.003650284900285, + "grad_norm": 0.586833119392395, + "learning_rate": 9.936005231775046e-05, + "loss": 0.5546, + "step": 90020 + }, + { + "epoch": 2.0038728632478633, + "grad_norm": 0.5987151265144348, + "learning_rate": 9.93197655097823e-05, + "loss": 0.473, + "step": 90030 + }, + { + "epoch": 2.0040954415954415, + "grad_norm": 0.4147493243217468, + "learning_rate": 9.927948417286044e-05, + "loss": 0.3984, + "step": 90040 + }, + { + "epoch": 2.0043180199430197, + "grad_norm": 0.34556564688682556, + "learning_rate": 9.923920830917395e-05, + "loss": 0.3464, + "step": 90050 + }, + { + "epoch": 2.0045405982905984, + "grad_norm": 0.4374103248119354, + "learning_rate": 9.919893792091129e-05, + "loss": 0.4862, + "step": 90060 + }, + { + "epoch": 2.0047631766381766, + "grad_norm": 0.5401944518089294, + "learning_rate": 9.915867301026083e-05, + "loss": 0.4721, + "step": 90070 + }, + { + "epoch": 2.004985754985755, + "grad_norm": 0.367075651884079, + "learning_rate": 9.911841357941068e-05, + "loss": 0.6188, + "step": 90080 + }, + { + "epoch": 2.0052083333333335, + "grad_norm": 0.6527010798454285, + "learning_rate": 9.907815963054847e-05, + "loss": 0.5411, + "step": 90090 + }, + { + "epoch": 2.0054309116809117, + "grad_norm": 0.45079076290130615, + "learning_rate": 9.903791116586172e-05, + "loss": 0.4643, + "step": 90100 + }, + { + "epoch": 2.00565349002849, + "grad_norm": 0.31943023204803467, + "learning_rate": 9.89976681875376e-05, + "loss": 0.3938, + "step": 90110 + }, + { + "epoch": 2.0058760683760686, + "grad_norm": 0.7696496844291687, + "learning_rate": 9.89574306977628e-05, + "loss": 0.5147, + "step": 90120 + }, + { + "epoch": 2.006098646723647, + "grad_norm": 0.5576694011688232, + "learning_rate": 9.891719869872391e-05, + "loss": 0.4113, + "step": 90130 + }, + { + "epoch": 2.006321225071225, + "grad_norm": 0.5383765697479248, + "learning_rate": 9.887697219260723e-05, + "loss": 0.4871, + "step": 90140 + }, + { + "epoch": 2.0065438034188032, + "grad_norm": 0.38253116607666016, + "learning_rate": 9.883675118159863e-05, + "loss": 0.4429, + "step": 90150 + }, + { + "epoch": 2.006766381766382, + "grad_norm": 0.6916284561157227, + "learning_rate": 9.879653566788381e-05, + "loss": 0.5345, + "step": 90160 + }, + { + "epoch": 2.00698896011396, + "grad_norm": 0.5268100500106812, + "learning_rate": 9.875632565364816e-05, + "loss": 0.3958, + "step": 90170 + }, + { + "epoch": 2.0072115384615383, + "grad_norm": 0.4431670606136322, + "learning_rate": 9.871612114107658e-05, + "loss": 0.4366, + "step": 90180 + }, + { + "epoch": 2.007434116809117, + "grad_norm": 0.4209796190261841, + "learning_rate": 9.867592213235397e-05, + "loss": 0.3836, + "step": 90190 + }, + { + "epoch": 2.007656695156695, + "grad_norm": 0.6848621964454651, + "learning_rate": 9.863572862966461e-05, + "loss": 0.5611, + "step": 90200 + }, + { + "epoch": 2.0078792735042734, + "grad_norm": 0.4434153735637665, + "learning_rate": 9.859554063519276e-05, + "loss": 0.4577, + "step": 90210 + }, + { + "epoch": 2.0081018518518516, + "grad_norm": 0.7151669263839722, + "learning_rate": 9.855535815112222e-05, + "loss": 0.5231, + "step": 90220 + }, + { + "epoch": 2.0083244301994303, + "grad_norm": 0.5325934290885925, + "learning_rate": 9.851518117963659e-05, + "loss": 0.491, + "step": 90230 + }, + { + "epoch": 2.0085470085470085, + "grad_norm": 0.4765765070915222, + "learning_rate": 9.847500972291908e-05, + "loss": 0.4732, + "step": 90240 + }, + { + "epoch": 2.0087695868945867, + "grad_norm": 0.6172223091125488, + "learning_rate": 9.843484378315274e-05, + "loss": 0.3766, + "step": 90250 + }, + { + "epoch": 2.0089921652421654, + "grad_norm": 0.7189575433731079, + "learning_rate": 9.839468336252002e-05, + "loss": 0.458, + "step": 90260 + }, + { + "epoch": 2.0092147435897436, + "grad_norm": 0.7340860366821289, + "learning_rate": 9.835452846320343e-05, + "loss": 0.4761, + "step": 90270 + }, + { + "epoch": 2.009437321937322, + "grad_norm": 0.7318997979164124, + "learning_rate": 9.831437908738494e-05, + "loss": 0.5299, + "step": 90280 + }, + { + "epoch": 2.0096599002849005, + "grad_norm": 0.4002963602542877, + "learning_rate": 9.827423523724636e-05, + "loss": 0.395, + "step": 90290 + }, + { + "epoch": 2.0098824786324787, + "grad_norm": 0.5723457932472229, + "learning_rate": 9.823409691496918e-05, + "loss": 0.53, + "step": 90300 + }, + { + "epoch": 2.010105056980057, + "grad_norm": 0.646827220916748, + "learning_rate": 9.819396412273441e-05, + "loss": 0.5054, + "step": 90310 + }, + { + "epoch": 2.010327635327635, + "grad_norm": 0.5630742311477661, + "learning_rate": 9.815383686272297e-05, + "loss": 0.5638, + "step": 90320 + }, + { + "epoch": 2.010550213675214, + "grad_norm": 0.6631711721420288, + "learning_rate": 9.811371513711549e-05, + "loss": 0.3844, + "step": 90330 + }, + { + "epoch": 2.010772792022792, + "grad_norm": 0.796441376209259, + "learning_rate": 9.807359894809205e-05, + "loss": 0.3774, + "step": 90340 + }, + { + "epoch": 2.0109953703703702, + "grad_norm": 0.4723573625087738, + "learning_rate": 9.80334882978327e-05, + "loss": 0.4761, + "step": 90350 + }, + { + "epoch": 2.011217948717949, + "grad_norm": 0.7912504076957703, + "learning_rate": 9.799338318851706e-05, + "loss": 0.4198, + "step": 90360 + }, + { + "epoch": 2.011440527065527, + "grad_norm": 0.5627785325050354, + "learning_rate": 9.795328362232448e-05, + "loss": 0.4511, + "step": 90370 + }, + { + "epoch": 2.0116631054131053, + "grad_norm": 0.7288662195205688, + "learning_rate": 9.791318960143401e-05, + "loss": 0.487, + "step": 90380 + }, + { + "epoch": 2.0118856837606836, + "grad_norm": 0.761756181716919, + "learning_rate": 9.787310112802448e-05, + "loss": 0.4703, + "step": 90390 + }, + { + "epoch": 2.012108262108262, + "grad_norm": 0.7722057700157166, + "learning_rate": 9.783301820427416e-05, + "loss": 0.4858, + "step": 90400 + }, + { + "epoch": 2.0123308404558404, + "grad_norm": 0.4718744456768036, + "learning_rate": 9.779294083236124e-05, + "loss": 0.4952, + "step": 90410 + }, + { + "epoch": 2.0125534188034186, + "grad_norm": 0.6757660508155823, + "learning_rate": 9.775286901446362e-05, + "loss": 0.4728, + "step": 90420 + }, + { + "epoch": 2.0127759971509973, + "grad_norm": 0.5467312932014465, + "learning_rate": 9.771280275275885e-05, + "loss": 0.4071, + "step": 90430 + }, + { + "epoch": 2.0129985754985755, + "grad_norm": 0.37198352813720703, + "learning_rate": 9.767274204942405e-05, + "loss": 0.4475, + "step": 90440 + }, + { + "epoch": 2.0132211538461537, + "grad_norm": 0.8231156468391418, + "learning_rate": 9.763268690663623e-05, + "loss": 0.4464, + "step": 90450 + }, + { + "epoch": 2.0134437321937324, + "grad_norm": 0.4113079309463501, + "learning_rate": 9.759263732657198e-05, + "loss": 0.4318, + "step": 90460 + }, + { + "epoch": 2.0136663105413106, + "grad_norm": 0.4778950810432434, + "learning_rate": 9.755259331140774e-05, + "loss": 0.4658, + "step": 90470 + }, + { + "epoch": 2.013888888888889, + "grad_norm": 0.6860994100570679, + "learning_rate": 9.751255486331938e-05, + "loss": 0.4329, + "step": 90480 + }, + { + "epoch": 2.014111467236467, + "grad_norm": 0.43397676944732666, + "learning_rate": 9.747252198448267e-05, + "loss": 0.4092, + "step": 90490 + }, + { + "epoch": 2.0143340455840457, + "grad_norm": 0.7068895101547241, + "learning_rate": 9.743249467707307e-05, + "loss": 0.4854, + "step": 90500 + }, + { + "epoch": 2.014556623931624, + "grad_norm": 0.4531174302101135, + "learning_rate": 9.739247294326565e-05, + "loss": 0.4063, + "step": 90510 + }, + { + "epoch": 2.014779202279202, + "grad_norm": 0.5841737985610962, + "learning_rate": 9.735245678523527e-05, + "loss": 0.5277, + "step": 90520 + }, + { + "epoch": 2.015001780626781, + "grad_norm": 0.5976734161376953, + "learning_rate": 9.731244620515649e-05, + "loss": 0.5151, + "step": 90530 + }, + { + "epoch": 2.015224358974359, + "grad_norm": 0.6347478628158569, + "learning_rate": 9.727244120520338e-05, + "loss": 0.3879, + "step": 90540 + }, + { + "epoch": 2.0154469373219372, + "grad_norm": 0.4365602433681488, + "learning_rate": 9.723244178754988e-05, + "loss": 0.5014, + "step": 90550 + }, + { + "epoch": 2.0156695156695155, + "grad_norm": 0.5678282380104065, + "learning_rate": 9.719244795436972e-05, + "loss": 0.4745, + "step": 90560 + }, + { + "epoch": 2.015892094017094, + "grad_norm": 0.6716097593307495, + "learning_rate": 9.715245970783604e-05, + "loss": 0.4533, + "step": 90570 + }, + { + "epoch": 2.0161146723646723, + "grad_norm": 0.5791259407997131, + "learning_rate": 9.711247705012187e-05, + "loss": 0.4712, + "step": 90580 + }, + { + "epoch": 2.0163372507122506, + "grad_norm": 0.6677980422973633, + "learning_rate": 9.707249998339993e-05, + "loss": 0.4188, + "step": 90590 + }, + { + "epoch": 2.0165598290598292, + "grad_norm": 0.580368161201477, + "learning_rate": 9.703252850984261e-05, + "loss": 0.4026, + "step": 90600 + }, + { + "epoch": 2.0167824074074074, + "grad_norm": 0.5054930448532104, + "learning_rate": 9.699256263162205e-05, + "loss": 0.472, + "step": 90610 + }, + { + "epoch": 2.0170049857549857, + "grad_norm": 0.5461148023605347, + "learning_rate": 9.695260235090988e-05, + "loss": 0.5155, + "step": 90620 + }, + { + "epoch": 2.0172275641025643, + "grad_norm": 0.6772049069404602, + "learning_rate": 9.691264766987769e-05, + "loss": 0.6472, + "step": 90630 + }, + { + "epoch": 2.0174501424501425, + "grad_norm": 0.4362126588821411, + "learning_rate": 9.687269859069659e-05, + "loss": 0.3806, + "step": 90640 + }, + { + "epoch": 2.0176727207977208, + "grad_norm": 0.42470234632492065, + "learning_rate": 9.683275511553746e-05, + "loss": 0.3881, + "step": 90650 + }, + { + "epoch": 2.017895299145299, + "grad_norm": 0.5822004079818726, + "learning_rate": 9.67928172465709e-05, + "loss": 0.5111, + "step": 90660 + }, + { + "epoch": 2.0181178774928776, + "grad_norm": 0.708109438419342, + "learning_rate": 9.67528849859672e-05, + "loss": 0.4816, + "step": 90670 + }, + { + "epoch": 2.018340455840456, + "grad_norm": 0.740749716758728, + "learning_rate": 9.67129583358962e-05, + "loss": 0.4824, + "step": 90680 + }, + { + "epoch": 2.018563034188034, + "grad_norm": 0.5704371333122253, + "learning_rate": 9.667303729852763e-05, + "loss": 0.4345, + "step": 90690 + }, + { + "epoch": 2.0187856125356127, + "grad_norm": 0.534059464931488, + "learning_rate": 9.663312187603077e-05, + "loss": 0.5181, + "step": 90700 + }, + { + "epoch": 2.019008190883191, + "grad_norm": 0.408011257648468, + "learning_rate": 9.659321207057466e-05, + "loss": 0.4758, + "step": 90710 + }, + { + "epoch": 2.019230769230769, + "grad_norm": 0.4999423325061798, + "learning_rate": 9.655330788432808e-05, + "loss": 0.4698, + "step": 90720 + }, + { + "epoch": 2.0194533475783474, + "grad_norm": 0.5350780487060547, + "learning_rate": 9.651340931945942e-05, + "loss": 0.4726, + "step": 90730 + }, + { + "epoch": 2.019675925925926, + "grad_norm": 0.4886220693588257, + "learning_rate": 9.647351637813682e-05, + "loss": 0.526, + "step": 90740 + }, + { + "epoch": 2.0198985042735043, + "grad_norm": 0.7873054146766663, + "learning_rate": 9.643362906252816e-05, + "loss": 0.37, + "step": 90750 + }, + { + "epoch": 2.0201210826210825, + "grad_norm": 0.60884690284729, + "learning_rate": 9.63937473748008e-05, + "loss": 0.5565, + "step": 90760 + }, + { + "epoch": 2.020343660968661, + "grad_norm": 0.4547918438911438, + "learning_rate": 9.635387131712204e-05, + "loss": 0.4058, + "step": 90770 + }, + { + "epoch": 2.0205662393162394, + "grad_norm": 0.410516619682312, + "learning_rate": 9.631400089165876e-05, + "loss": 0.3991, + "step": 90780 + }, + { + "epoch": 2.0207888176638176, + "grad_norm": 0.5903146862983704, + "learning_rate": 9.627413610057754e-05, + "loss": 0.4019, + "step": 90790 + }, + { + "epoch": 2.021011396011396, + "grad_norm": 0.7246853113174438, + "learning_rate": 9.623427694604467e-05, + "loss": 0.5818, + "step": 90800 + }, + { + "epoch": 2.0212339743589745, + "grad_norm": 0.5898091197013855, + "learning_rate": 9.619442343022625e-05, + "loss": 0.4375, + "step": 90810 + }, + { + "epoch": 2.0214565527065527, + "grad_norm": 0.5030173063278198, + "learning_rate": 9.615457555528782e-05, + "loss": 0.4817, + "step": 90820 + }, + { + "epoch": 2.021679131054131, + "grad_norm": 0.656521737575531, + "learning_rate": 9.611473332339469e-05, + "loss": 0.4512, + "step": 90830 + }, + { + "epoch": 2.0219017094017095, + "grad_norm": 0.5611187815666199, + "learning_rate": 9.607489673671199e-05, + "loss": 0.5406, + "step": 90840 + }, + { + "epoch": 2.0221242877492878, + "grad_norm": 0.5025830864906311, + "learning_rate": 9.603506579740447e-05, + "loss": 0.4433, + "step": 90850 + }, + { + "epoch": 2.022346866096866, + "grad_norm": 0.5679742693901062, + "learning_rate": 9.59952405076366e-05, + "loss": 0.5052, + "step": 90860 + }, + { + "epoch": 2.0225694444444446, + "grad_norm": 0.5760874152183533, + "learning_rate": 9.59554208695725e-05, + "loss": 0.4054, + "step": 90870 + }, + { + "epoch": 2.022792022792023, + "grad_norm": 0.8978238701820374, + "learning_rate": 9.591560688537609e-05, + "loss": 0.5044, + "step": 90880 + }, + { + "epoch": 2.023014601139601, + "grad_norm": 0.4812617301940918, + "learning_rate": 9.587579855721071e-05, + "loss": 0.38, + "step": 90890 + }, + { + "epoch": 2.0232371794871793, + "grad_norm": 0.41074270009994507, + "learning_rate": 9.58359958872397e-05, + "loss": 0.4779, + "step": 90900 + }, + { + "epoch": 2.023459757834758, + "grad_norm": 0.4377320408821106, + "learning_rate": 9.579619887762594e-05, + "loss": 0.4976, + "step": 90910 + }, + { + "epoch": 2.023682336182336, + "grad_norm": 0.610565185546875, + "learning_rate": 9.575640753053204e-05, + "loss": 0.4633, + "step": 90920 + }, + { + "epoch": 2.0239049145299144, + "grad_norm": 0.49709609150886536, + "learning_rate": 9.571662184812027e-05, + "loss": 0.4137, + "step": 90930 + }, + { + "epoch": 2.024127492877493, + "grad_norm": 0.6810767650604248, + "learning_rate": 9.567684183255265e-05, + "loss": 0.408, + "step": 90940 + }, + { + "epoch": 2.0243500712250713, + "grad_norm": 0.6302564144134521, + "learning_rate": 9.563706748599095e-05, + "loss": 0.5975, + "step": 90950 + }, + { + "epoch": 2.0245726495726495, + "grad_norm": 0.8101886510848999, + "learning_rate": 9.55972988105964e-05, + "loss": 0.539, + "step": 90960 + }, + { + "epoch": 2.0247952279202277, + "grad_norm": 0.5000439882278442, + "learning_rate": 9.555753580853007e-05, + "loss": 0.4115, + "step": 90970 + }, + { + "epoch": 2.0250178062678064, + "grad_norm": 0.5363843441009521, + "learning_rate": 9.551777848195269e-05, + "loss": 0.4885, + "step": 90980 + }, + { + "epoch": 2.0252403846153846, + "grad_norm": 0.5441843271255493, + "learning_rate": 9.54780268330248e-05, + "loss": 0.4076, + "step": 90990 + }, + { + "epoch": 2.025462962962963, + "grad_norm": 0.6781536936759949, + "learning_rate": 9.543828086390649e-05, + "loss": 0.3442, + "step": 91000 + }, + { + "epoch": 2.0256855413105415, + "grad_norm": 0.5577031970024109, + "learning_rate": 9.539854057675759e-05, + "loss": 0.4969, + "step": 91010 + }, + { + "epoch": 2.0259081196581197, + "grad_norm": 0.5493881702423096, + "learning_rate": 9.535880597373769e-05, + "loss": 0.3874, + "step": 91020 + }, + { + "epoch": 2.026130698005698, + "grad_norm": 0.5387133955955505, + "learning_rate": 9.531907705700589e-05, + "loss": 0.4539, + "step": 91030 + }, + { + "epoch": 2.0263532763532766, + "grad_norm": 0.5434364080429077, + "learning_rate": 9.52793538287211e-05, + "loss": 0.4671, + "step": 91040 + }, + { + "epoch": 2.0265758547008548, + "grad_norm": 0.595073401927948, + "learning_rate": 9.5239636291042e-05, + "loss": 0.5306, + "step": 91050 + }, + { + "epoch": 2.026798433048433, + "grad_norm": 0.7041422128677368, + "learning_rate": 9.51999244461268e-05, + "loss": 0.5565, + "step": 91060 + }, + { + "epoch": 2.027021011396011, + "grad_norm": 0.48130425810813904, + "learning_rate": 9.516021829613348e-05, + "loss": 0.4901, + "step": 91070 + }, + { + "epoch": 2.02724358974359, + "grad_norm": 0.7272732257843018, + "learning_rate": 9.512051784321976e-05, + "loss": 0.4513, + "step": 91080 + }, + { + "epoch": 2.027466168091168, + "grad_norm": 0.4732091724872589, + "learning_rate": 9.5080823089543e-05, + "loss": 0.4689, + "step": 91090 + }, + { + "epoch": 2.0276887464387463, + "grad_norm": 0.41249731183052063, + "learning_rate": 9.504113403726023e-05, + "loss": 0.425, + "step": 91100 + }, + { + "epoch": 2.027911324786325, + "grad_norm": 0.6525123119354248, + "learning_rate": 9.500145068852806e-05, + "loss": 0.4401, + "step": 91110 + }, + { + "epoch": 2.028133903133903, + "grad_norm": 0.5969793200492859, + "learning_rate": 9.496177304550305e-05, + "loss": 0.3355, + "step": 91120 + }, + { + "epoch": 2.0283564814814814, + "grad_norm": 0.8126296401023865, + "learning_rate": 9.492210111034126e-05, + "loss": 0.5219, + "step": 91130 + }, + { + "epoch": 2.0285790598290596, + "grad_norm": 0.4047560691833496, + "learning_rate": 9.48824348851985e-05, + "loss": 0.4808, + "step": 91140 + }, + { + "epoch": 2.0288016381766383, + "grad_norm": 0.3511456251144409, + "learning_rate": 9.48427743722303e-05, + "loss": 0.4441, + "step": 91150 + }, + { + "epoch": 2.0290242165242165, + "grad_norm": 0.7069734334945679, + "learning_rate": 9.480311957359192e-05, + "loss": 0.4695, + "step": 91160 + }, + { + "epoch": 2.0292467948717947, + "grad_norm": 0.5533146262168884, + "learning_rate": 9.476347049143803e-05, + "loss": 0.4474, + "step": 91170 + }, + { + "epoch": 2.0294693732193734, + "grad_norm": 0.6713184118270874, + "learning_rate": 9.472382712792332e-05, + "loss": 0.3889, + "step": 91180 + }, + { + "epoch": 2.0296919515669516, + "grad_norm": 0.6932535171508789, + "learning_rate": 9.468418948520204e-05, + "loss": 0.4224, + "step": 91190 + }, + { + "epoch": 2.02991452991453, + "grad_norm": 0.6654514074325562, + "learning_rate": 9.46445575654281e-05, + "loss": 0.4261, + "step": 91200 + }, + { + "epoch": 2.0301371082621085, + "grad_norm": 0.5747096538543701, + "learning_rate": 9.460493137075514e-05, + "loss": 0.572, + "step": 91210 + }, + { + "epoch": 2.0303596866096867, + "grad_norm": 0.5004822015762329, + "learning_rate": 9.456531090333658e-05, + "loss": 0.4607, + "step": 91220 + }, + { + "epoch": 2.030582264957265, + "grad_norm": 0.7012773156166077, + "learning_rate": 9.452569616532528e-05, + "loss": 0.4961, + "step": 91230 + }, + { + "epoch": 2.030804843304843, + "grad_norm": 0.5176010131835938, + "learning_rate": 9.448608715887403e-05, + "loss": 0.5134, + "step": 91240 + }, + { + "epoch": 2.031027421652422, + "grad_norm": 0.907871663570404, + "learning_rate": 9.444648388613515e-05, + "loss": 0.4613, + "step": 91250 + }, + { + "epoch": 2.03125, + "grad_norm": 0.36404740810394287, + "learning_rate": 9.440688634926071e-05, + "loss": 0.3384, + "step": 91260 + }, + { + "epoch": 2.031472578347578, + "grad_norm": 0.6608119606971741, + "learning_rate": 9.436729455040254e-05, + "loss": 0.3983, + "step": 91270 + }, + { + "epoch": 2.031695156695157, + "grad_norm": 0.8917686939239502, + "learning_rate": 9.432770849171204e-05, + "loss": 0.4951, + "step": 91280 + }, + { + "epoch": 2.031917735042735, + "grad_norm": 0.5580182075500488, + "learning_rate": 9.428812817534037e-05, + "loss": 0.4805, + "step": 91290 + }, + { + "epoch": 2.0321403133903133, + "grad_norm": 0.6238675713539124, + "learning_rate": 9.424855360343842e-05, + "loss": 0.5015, + "step": 91300 + }, + { + "epoch": 2.0323628917378915, + "grad_norm": 0.45403924584388733, + "learning_rate": 9.420898477815658e-05, + "loss": 0.4426, + "step": 91310 + }, + { + "epoch": 2.03258547008547, + "grad_norm": 0.5086132287979126, + "learning_rate": 9.41694217016451e-05, + "loss": 0.3683, + "step": 91320 + }, + { + "epoch": 2.0328080484330484, + "grad_norm": 0.6602432131767273, + "learning_rate": 9.412986437605391e-05, + "loss": 0.5435, + "step": 91330 + }, + { + "epoch": 2.0330306267806266, + "grad_norm": 0.4932442903518677, + "learning_rate": 9.409031280353254e-05, + "loss": 0.4734, + "step": 91340 + }, + { + "epoch": 2.0332532051282053, + "grad_norm": 0.4811466634273529, + "learning_rate": 9.405076698623034e-05, + "loss": 0.3959, + "step": 91350 + }, + { + "epoch": 2.0334757834757835, + "grad_norm": 0.6618918180465698, + "learning_rate": 9.401122692629613e-05, + "loss": 0.4753, + "step": 91360 + }, + { + "epoch": 2.0336983618233617, + "grad_norm": 0.5234821438789368, + "learning_rate": 9.397169262587862e-05, + "loss": 0.5293, + "step": 91370 + }, + { + "epoch": 2.0339209401709404, + "grad_norm": 0.5152151584625244, + "learning_rate": 9.393216408712619e-05, + "loss": 0.4318, + "step": 91380 + }, + { + "epoch": 2.0341435185185186, + "grad_norm": 0.3490557074546814, + "learning_rate": 9.389264131218673e-05, + "loss": 0.4209, + "step": 91390 + }, + { + "epoch": 2.034366096866097, + "grad_norm": 0.5929777026176453, + "learning_rate": 9.385312430320801e-05, + "loss": 0.5068, + "step": 91400 + }, + { + "epoch": 2.034588675213675, + "grad_norm": 0.630289614200592, + "learning_rate": 9.38136130623374e-05, + "loss": 0.5485, + "step": 91410 + }, + { + "epoch": 2.0348112535612537, + "grad_norm": 0.572494626045227, + "learning_rate": 9.377410759172198e-05, + "loss": 0.4738, + "step": 91420 + }, + { + "epoch": 2.035033831908832, + "grad_norm": 0.5333491563796997, + "learning_rate": 9.373460789350854e-05, + "loss": 0.5702, + "step": 91430 + }, + { + "epoch": 2.03525641025641, + "grad_norm": 0.6336487531661987, + "learning_rate": 9.369511396984356e-05, + "loss": 0.5368, + "step": 91440 + }, + { + "epoch": 2.035478988603989, + "grad_norm": 0.5200332999229431, + "learning_rate": 9.365562582287304e-05, + "loss": 0.5022, + "step": 91450 + }, + { + "epoch": 2.035701566951567, + "grad_norm": 0.31436365842819214, + "learning_rate": 9.361614345474286e-05, + "loss": 0.4842, + "step": 91460 + }, + { + "epoch": 2.0359241452991452, + "grad_norm": 0.5940852165222168, + "learning_rate": 9.357666686759854e-05, + "loss": 0.4173, + "step": 91470 + }, + { + "epoch": 2.0361467236467234, + "grad_norm": 0.43001100420951843, + "learning_rate": 9.353719606358533e-05, + "loss": 0.3914, + "step": 91480 + }, + { + "epoch": 2.036369301994302, + "grad_norm": 0.4594919681549072, + "learning_rate": 9.349773104484798e-05, + "loss": 0.4438, + "step": 91490 + }, + { + "epoch": 2.0365918803418803, + "grad_norm": 0.38712412118911743, + "learning_rate": 9.34582718135311e-05, + "loss": 0.4537, + "step": 91500 + }, + { + "epoch": 2.0368144586894585, + "grad_norm": 0.553755521774292, + "learning_rate": 9.341881837177897e-05, + "loss": 0.4922, + "step": 91510 + }, + { + "epoch": 2.037037037037037, + "grad_norm": 0.39810413122177124, + "learning_rate": 9.337937072173557e-05, + "loss": 0.451, + "step": 91520 + }, + { + "epoch": 2.0372596153846154, + "grad_norm": 0.5983201861381531, + "learning_rate": 9.333992886554437e-05, + "loss": 0.5033, + "step": 91530 + }, + { + "epoch": 2.0374821937321936, + "grad_norm": 0.45720791816711426, + "learning_rate": 9.330049280534874e-05, + "loss": 0.5107, + "step": 91540 + }, + { + "epoch": 2.0377047720797723, + "grad_norm": 0.48426130414009094, + "learning_rate": 9.326106254329167e-05, + "loss": 0.5196, + "step": 91550 + }, + { + "epoch": 2.0379273504273505, + "grad_norm": 0.679417073726654, + "learning_rate": 9.322163808151587e-05, + "loss": 0.503, + "step": 91560 + }, + { + "epoch": 2.0381499287749287, + "grad_norm": 0.6564821600914001, + "learning_rate": 9.318221942216366e-05, + "loss": 0.4255, + "step": 91570 + }, + { + "epoch": 2.038372507122507, + "grad_norm": 0.6791837215423584, + "learning_rate": 9.314280656737717e-05, + "loss": 0.3442, + "step": 91580 + }, + { + "epoch": 2.0385950854700856, + "grad_norm": 0.7403519153594971, + "learning_rate": 9.310339951929797e-05, + "loss": 0.3808, + "step": 91590 + }, + { + "epoch": 2.038817663817664, + "grad_norm": 0.7340866923332214, + "learning_rate": 9.306399828006754e-05, + "loss": 0.5083, + "step": 91600 + }, + { + "epoch": 2.039040242165242, + "grad_norm": 0.6276527643203735, + "learning_rate": 9.302460285182707e-05, + "loss": 0.5224, + "step": 91610 + }, + { + "epoch": 2.0392628205128207, + "grad_norm": 0.6082279086112976, + "learning_rate": 9.298521323671719e-05, + "loss": 0.5278, + "step": 91620 + }, + { + "epoch": 2.039485398860399, + "grad_norm": 0.5417894124984741, + "learning_rate": 9.294582943687842e-05, + "loss": 0.4473, + "step": 91630 + }, + { + "epoch": 2.039707977207977, + "grad_norm": 0.5575587749481201, + "learning_rate": 9.29064514544509e-05, + "loss": 0.5124, + "step": 91640 + }, + { + "epoch": 2.0399305555555554, + "grad_norm": 0.49067622423171997, + "learning_rate": 9.28670792915745e-05, + "loss": 0.44, + "step": 91650 + }, + { + "epoch": 2.040153133903134, + "grad_norm": 0.6434421539306641, + "learning_rate": 9.282771295038877e-05, + "loss": 0.4494, + "step": 91660 + }, + { + "epoch": 2.0402421652421654, + "eval_loss": 0.5388351082801819, + "eval_runtime": 337.1739, + "eval_samples_per_second": 7.014, + "eval_steps_per_second": 7.014, + "step": 91664 + }, + { + "epoch": 2.0403757122507122, + "grad_norm": 0.6126685738563538, + "learning_rate": 9.278835243303281e-05, + "loss": 0.4354, + "step": 91670 + }, + { + "epoch": 2.0405982905982905, + "grad_norm": 0.6492798924446106, + "learning_rate": 9.274899774164552e-05, + "loss": 0.4865, + "step": 91680 + }, + { + "epoch": 2.040820868945869, + "grad_norm": 0.4895659387111664, + "learning_rate": 9.27096488783655e-05, + "loss": 0.4222, + "step": 91690 + }, + { + "epoch": 2.0410434472934473, + "grad_norm": 0.7228186130523682, + "learning_rate": 9.267030584533099e-05, + "loss": 0.5693, + "step": 91700 + }, + { + "epoch": 2.0412660256410255, + "grad_norm": 0.3963758647441864, + "learning_rate": 9.263096864467993e-05, + "loss": 0.4665, + "step": 91710 + }, + { + "epoch": 2.041488603988604, + "grad_norm": 0.4847806394100189, + "learning_rate": 9.259163727855001e-05, + "loss": 0.472, + "step": 91720 + }, + { + "epoch": 2.0417111823361824, + "grad_norm": 0.5810156464576721, + "learning_rate": 9.255231174907835e-05, + "loss": 0.4409, + "step": 91730 + }, + { + "epoch": 2.0419337606837606, + "grad_norm": 0.4100762903690338, + "learning_rate": 9.251299205840214e-05, + "loss": 0.5712, + "step": 91740 + }, + { + "epoch": 2.042156339031339, + "grad_norm": 0.6675921082496643, + "learning_rate": 9.247367820865782e-05, + "loss": 0.4256, + "step": 91750 + }, + { + "epoch": 2.0423789173789175, + "grad_norm": 0.4220803678035736, + "learning_rate": 9.243437020198189e-05, + "loss": 0.5776, + "step": 91760 + }, + { + "epoch": 2.0426014957264957, + "grad_norm": 0.40601903200149536, + "learning_rate": 9.239506804051032e-05, + "loss": 0.4718, + "step": 91770 + }, + { + "epoch": 2.042824074074074, + "grad_norm": 0.719412088394165, + "learning_rate": 9.235577172637884e-05, + "loss": 0.6789, + "step": 91780 + }, + { + "epoch": 2.0430466524216526, + "grad_norm": 0.4384337365627289, + "learning_rate": 9.231648126172286e-05, + "loss": 0.4875, + "step": 91790 + }, + { + "epoch": 2.043269230769231, + "grad_norm": 0.5705899000167847, + "learning_rate": 9.227719664867748e-05, + "loss": 0.4742, + "step": 91800 + }, + { + "epoch": 2.043491809116809, + "grad_norm": 0.46044304966926575, + "learning_rate": 9.223791788937738e-05, + "loss": 0.3934, + "step": 91810 + }, + { + "epoch": 2.0437143874643873, + "grad_norm": 0.4466346204280853, + "learning_rate": 9.219864498595705e-05, + "loss": 0.4922, + "step": 91820 + }, + { + "epoch": 2.043936965811966, + "grad_norm": 0.3737199306488037, + "learning_rate": 9.215937794055058e-05, + "loss": 0.4456, + "step": 91830 + }, + { + "epoch": 2.044159544159544, + "grad_norm": 0.6909509897232056, + "learning_rate": 9.21201167552918e-05, + "loss": 0.4351, + "step": 91840 + }, + { + "epoch": 2.0443821225071224, + "grad_norm": 0.42793193459510803, + "learning_rate": 9.208086143231418e-05, + "loss": 0.3917, + "step": 91850 + }, + { + "epoch": 2.044604700854701, + "grad_norm": 0.5402208566665649, + "learning_rate": 9.204161197375098e-05, + "loss": 0.3512, + "step": 91860 + }, + { + "epoch": 2.0448272792022792, + "grad_norm": 0.7035687565803528, + "learning_rate": 9.200236838173497e-05, + "loss": 0.4324, + "step": 91870 + }, + { + "epoch": 2.0450498575498575, + "grad_norm": 0.6524356007575989, + "learning_rate": 9.196313065839861e-05, + "loss": 0.4678, + "step": 91880 + }, + { + "epoch": 2.0452724358974357, + "grad_norm": 0.573111891746521, + "learning_rate": 9.192389880587415e-05, + "loss": 0.4819, + "step": 91890 + }, + { + "epoch": 2.0454950142450143, + "grad_norm": 0.4412767291069031, + "learning_rate": 9.188467282629352e-05, + "loss": 0.392, + "step": 91900 + }, + { + "epoch": 2.0457175925925926, + "grad_norm": 0.2940179705619812, + "learning_rate": 9.184545272178827e-05, + "loss": 0.5119, + "step": 91910 + }, + { + "epoch": 2.0459401709401708, + "grad_norm": 1.0994153022766113, + "learning_rate": 9.180623849448964e-05, + "loss": 0.4068, + "step": 91920 + }, + { + "epoch": 2.0461627492877494, + "grad_norm": 0.6845361590385437, + "learning_rate": 9.176703014652862e-05, + "loss": 0.4443, + "step": 91930 + }, + { + "epoch": 2.0463853276353277, + "grad_norm": 0.5533000230789185, + "learning_rate": 9.172782768003582e-05, + "loss": 0.6033, + "step": 91940 + }, + { + "epoch": 2.046607905982906, + "grad_norm": 0.4258192479610443, + "learning_rate": 9.168863109714143e-05, + "loss": 0.3596, + "step": 91950 + }, + { + "epoch": 2.0468304843304845, + "grad_norm": 0.8160050511360168, + "learning_rate": 9.164944039997551e-05, + "loss": 0.5137, + "step": 91960 + }, + { + "epoch": 2.0470530626780628, + "grad_norm": 0.45495203137397766, + "learning_rate": 9.161025559066769e-05, + "loss": 0.512, + "step": 91970 + }, + { + "epoch": 2.047275641025641, + "grad_norm": 0.5231293439865112, + "learning_rate": 9.157107667134733e-05, + "loss": 0.4806, + "step": 91980 + }, + { + "epoch": 2.047498219373219, + "grad_norm": 0.7062931060791016, + "learning_rate": 9.153190364414341e-05, + "loss": 0.4642, + "step": 91990 + }, + { + "epoch": 2.047720797720798, + "grad_norm": 0.7344730496406555, + "learning_rate": 9.149273651118473e-05, + "loss": 0.5388, + "step": 92000 + }, + { + "epoch": 2.047943376068376, + "grad_norm": 0.5181725025177002, + "learning_rate": 9.145357527459957e-05, + "loss": 0.4403, + "step": 92010 + }, + { + "epoch": 2.0481659544159543, + "grad_norm": 0.5806347727775574, + "learning_rate": 9.141441993651592e-05, + "loss": 0.437, + "step": 92020 + }, + { + "epoch": 2.048388532763533, + "grad_norm": 0.9183672070503235, + "learning_rate": 9.137527049906156e-05, + "loss": 0.5308, + "step": 92030 + }, + { + "epoch": 2.048611111111111, + "grad_norm": 0.6128705143928528, + "learning_rate": 9.133612696436396e-05, + "loss": 0.4582, + "step": 92040 + }, + { + "epoch": 2.0488336894586894, + "grad_norm": 0.5914685726165771, + "learning_rate": 9.129698933455016e-05, + "loss": 0.3601, + "step": 92050 + }, + { + "epoch": 2.0490562678062676, + "grad_norm": 0.5341922640800476, + "learning_rate": 9.125785761174694e-05, + "loss": 0.5504, + "step": 92060 + }, + { + "epoch": 2.0492788461538463, + "grad_norm": 0.6216781139373779, + "learning_rate": 9.121873179808076e-05, + "loss": 0.4186, + "step": 92070 + }, + { + "epoch": 2.0495014245014245, + "grad_norm": 0.4968711733818054, + "learning_rate": 9.117961189567783e-05, + "loss": 0.3992, + "step": 92080 + }, + { + "epoch": 2.0497240028490027, + "grad_norm": 0.634192943572998, + "learning_rate": 9.114049790666379e-05, + "loss": 0.4502, + "step": 92090 + }, + { + "epoch": 2.0499465811965814, + "grad_norm": 0.6684777736663818, + "learning_rate": 9.110138983316422e-05, + "loss": 0.5756, + "step": 92100 + }, + { + "epoch": 2.0501691595441596, + "grad_norm": 0.29836568236351013, + "learning_rate": 9.106228767730426e-05, + "loss": 0.4609, + "step": 92110 + }, + { + "epoch": 2.050391737891738, + "grad_norm": 0.7051525115966797, + "learning_rate": 9.102319144120879e-05, + "loss": 0.383, + "step": 92120 + }, + { + "epoch": 2.0506143162393164, + "grad_norm": 0.7818118333816528, + "learning_rate": 9.09841011270023e-05, + "loss": 0.5358, + "step": 92130 + }, + { + "epoch": 2.0508368945868947, + "grad_norm": 0.6384372711181641, + "learning_rate": 9.094501673680909e-05, + "loss": 0.3883, + "step": 92140 + }, + { + "epoch": 2.051059472934473, + "grad_norm": 0.4258003830909729, + "learning_rate": 9.090593827275291e-05, + "loss": 0.4494, + "step": 92150 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 0.5207109451293945, + "learning_rate": 9.086686573695731e-05, + "loss": 0.4022, + "step": 92160 + }, + { + "epoch": 2.0515046296296298, + "grad_norm": 0.3599660098552704, + "learning_rate": 9.082779913154555e-05, + "loss": 0.3917, + "step": 92170 + }, + { + "epoch": 2.051727207977208, + "grad_norm": 0.5726381540298462, + "learning_rate": 9.078873845864055e-05, + "loss": 0.4969, + "step": 92180 + }, + { + "epoch": 2.051949786324786, + "grad_norm": 0.5060173273086548, + "learning_rate": 9.074968372036492e-05, + "loss": 0.5765, + "step": 92190 + }, + { + "epoch": 2.052172364672365, + "grad_norm": 0.6169702410697937, + "learning_rate": 9.071063491884094e-05, + "loss": 0.4987, + "step": 92200 + }, + { + "epoch": 2.052394943019943, + "grad_norm": 0.5102764368057251, + "learning_rate": 9.067159205619049e-05, + "loss": 0.3631, + "step": 92210 + }, + { + "epoch": 2.0526175213675213, + "grad_norm": 0.4469665288925171, + "learning_rate": 9.06325551345353e-05, + "loss": 0.457, + "step": 92220 + }, + { + "epoch": 2.0528400997150995, + "grad_norm": 0.5031200051307678, + "learning_rate": 9.059352415599654e-05, + "loss": 0.4391, + "step": 92230 + }, + { + "epoch": 2.053062678062678, + "grad_norm": 0.5435706377029419, + "learning_rate": 9.055449912269523e-05, + "loss": 0.3567, + "step": 92240 + }, + { + "epoch": 2.0532852564102564, + "grad_norm": 0.5851424932479858, + "learning_rate": 9.051548003675203e-05, + "loss": 0.3947, + "step": 92250 + }, + { + "epoch": 2.0535078347578346, + "grad_norm": 0.5932340621948242, + "learning_rate": 9.04764669002873e-05, + "loss": 0.4843, + "step": 92260 + }, + { + "epoch": 2.0537304131054133, + "grad_norm": 0.6475874781608582, + "learning_rate": 9.043745971542107e-05, + "loss": 0.3962, + "step": 92270 + }, + { + "epoch": 2.0539529914529915, + "grad_norm": 0.5898043513298035, + "learning_rate": 9.039845848427291e-05, + "loss": 0.3851, + "step": 92280 + }, + { + "epoch": 2.0541755698005697, + "grad_norm": 0.6884317398071289, + "learning_rate": 9.03594632089623e-05, + "loss": 0.4134, + "step": 92290 + }, + { + "epoch": 2.0543981481481484, + "grad_norm": 0.4127469062805176, + "learning_rate": 9.032047389160814e-05, + "loss": 0.4321, + "step": 92300 + }, + { + "epoch": 2.0546207264957266, + "grad_norm": 0.5552111268043518, + "learning_rate": 9.028149053432923e-05, + "loss": 0.5608, + "step": 92310 + }, + { + "epoch": 2.054843304843305, + "grad_norm": 0.6372430920600891, + "learning_rate": 9.024251313924394e-05, + "loss": 0.56, + "step": 92320 + }, + { + "epoch": 2.055065883190883, + "grad_norm": 0.4178203046321869, + "learning_rate": 9.020354170847033e-05, + "loss": 0.4222, + "step": 92330 + }, + { + "epoch": 2.0552884615384617, + "grad_norm": 0.6235283613204956, + "learning_rate": 9.016457624412616e-05, + "loss": 0.542, + "step": 92340 + }, + { + "epoch": 2.05551103988604, + "grad_norm": 0.5176951885223389, + "learning_rate": 9.012561674832882e-05, + "loss": 0.5165, + "step": 92350 + }, + { + "epoch": 2.055733618233618, + "grad_norm": 0.7084895968437195, + "learning_rate": 9.00866632231955e-05, + "loss": 0.5354, + "step": 92360 + }, + { + "epoch": 2.0559561965811968, + "grad_norm": 0.5494484305381775, + "learning_rate": 9.00477156708428e-05, + "loss": 0.4844, + "step": 92370 + }, + { + "epoch": 2.056178774928775, + "grad_norm": 0.6145066618919373, + "learning_rate": 9.000877409338723e-05, + "loss": 0.4319, + "step": 92380 + }, + { + "epoch": 2.056401353276353, + "grad_norm": 0.5414249897003174, + "learning_rate": 8.996983849294494e-05, + "loss": 0.4177, + "step": 92390 + }, + { + "epoch": 2.0566239316239314, + "grad_norm": 0.5711957812309265, + "learning_rate": 8.993090887163176e-05, + "loss": 0.4718, + "step": 92400 + }, + { + "epoch": 2.05684650997151, + "grad_norm": 0.5843347907066345, + "learning_rate": 8.989198523156301e-05, + "loss": 0.5651, + "step": 92410 + }, + { + "epoch": 2.0570690883190883, + "grad_norm": 0.5972548723220825, + "learning_rate": 8.985306757485394e-05, + "loss": 0.4103, + "step": 92420 + }, + { + "epoch": 2.0572916666666665, + "grad_norm": 0.6618757247924805, + "learning_rate": 8.981415590361943e-05, + "loss": 0.5238, + "step": 92430 + }, + { + "epoch": 2.057514245014245, + "grad_norm": 0.577944278717041, + "learning_rate": 8.977525021997381e-05, + "loss": 0.3938, + "step": 92440 + }, + { + "epoch": 2.0577368233618234, + "grad_norm": 0.56364905834198, + "learning_rate": 8.973635052603133e-05, + "loss": 0.4253, + "step": 92450 + }, + { + "epoch": 2.0579594017094016, + "grad_norm": 0.6828592419624329, + "learning_rate": 8.969745682390583e-05, + "loss": 0.3796, + "step": 92460 + }, + { + "epoch": 2.05818198005698, + "grad_norm": 0.5902596712112427, + "learning_rate": 8.96585691157108e-05, + "loss": 0.4654, + "step": 92470 + }, + { + "epoch": 2.0584045584045585, + "grad_norm": 0.7088204026222229, + "learning_rate": 8.961968740355949e-05, + "loss": 0.3455, + "step": 92480 + }, + { + "epoch": 2.0586271367521367, + "grad_norm": 0.44851240515708923, + "learning_rate": 8.958081168956478e-05, + "loss": 0.5068, + "step": 92490 + }, + { + "epoch": 2.058849715099715, + "grad_norm": 0.6315305829048157, + "learning_rate": 8.954194197583908e-05, + "loss": 0.4933, + "step": 92500 + }, + { + "epoch": 2.0590722934472936, + "grad_norm": 0.5136936902999878, + "learning_rate": 8.950307826449468e-05, + "loss": 0.4816, + "step": 92510 + }, + { + "epoch": 2.059294871794872, + "grad_norm": 0.42075827717781067, + "learning_rate": 8.946422055764347e-05, + "loss": 0.3074, + "step": 92520 + }, + { + "epoch": 2.05951745014245, + "grad_norm": 0.64700847864151, + "learning_rate": 8.942536885739708e-05, + "loss": 0.498, + "step": 92530 + }, + { + "epoch": 2.0597400284900287, + "grad_norm": 0.4763801097869873, + "learning_rate": 8.93865231658666e-05, + "loss": 0.4241, + "step": 92540 + }, + { + "epoch": 2.059962606837607, + "grad_norm": 0.5297418236732483, + "learning_rate": 8.934768348516298e-05, + "loss": 0.5568, + "step": 92550 + }, + { + "epoch": 2.060185185185185, + "grad_norm": 0.4956209063529968, + "learning_rate": 8.930884981739684e-05, + "loss": 0.4939, + "step": 92560 + }, + { + "epoch": 2.0604077635327633, + "grad_norm": 0.5499415397644043, + "learning_rate": 8.927002216467848e-05, + "loss": 0.5963, + "step": 92570 + }, + { + "epoch": 2.060630341880342, + "grad_norm": 0.696087658405304, + "learning_rate": 8.923120052911771e-05, + "loss": 0.5168, + "step": 92580 + }, + { + "epoch": 2.06085292022792, + "grad_norm": 0.5364202857017517, + "learning_rate": 8.919238491282416e-05, + "loss": 0.4864, + "step": 92590 + }, + { + "epoch": 2.0610754985754984, + "grad_norm": 0.49946531653404236, + "learning_rate": 8.915357531790713e-05, + "loss": 0.541, + "step": 92600 + }, + { + "epoch": 2.061298076923077, + "grad_norm": 0.7604655623435974, + "learning_rate": 8.911477174647558e-05, + "loss": 0.3397, + "step": 92610 + }, + { + "epoch": 2.0615206552706553, + "grad_norm": 0.49020248651504517, + "learning_rate": 8.907597420063808e-05, + "loss": 0.449, + "step": 92620 + }, + { + "epoch": 2.0617432336182335, + "grad_norm": 0.7224427461624146, + "learning_rate": 8.903718268250304e-05, + "loss": 0.4455, + "step": 92630 + }, + { + "epoch": 2.0619658119658117, + "grad_norm": 0.5862017273902893, + "learning_rate": 8.899839719417827e-05, + "loss": 0.3779, + "step": 92640 + }, + { + "epoch": 2.0621883903133904, + "grad_norm": 0.42739906907081604, + "learning_rate": 8.895961773777144e-05, + "loss": 0.4847, + "step": 92650 + }, + { + "epoch": 2.0624109686609686, + "grad_norm": 0.4330075979232788, + "learning_rate": 8.892084431538996e-05, + "loss": 0.4962, + "step": 92660 + }, + { + "epoch": 2.062633547008547, + "grad_norm": 0.6505460143089294, + "learning_rate": 8.888207692914065e-05, + "loss": 0.5212, + "step": 92670 + }, + { + "epoch": 2.0628561253561255, + "grad_norm": 0.5942981243133545, + "learning_rate": 8.884331558113028e-05, + "loss": 0.4443, + "step": 92680 + }, + { + "epoch": 2.0630787037037037, + "grad_norm": 0.4336341321468353, + "learning_rate": 8.880456027346511e-05, + "loss": 0.3539, + "step": 92690 + }, + { + "epoch": 2.063301282051282, + "grad_norm": 0.38467174768447876, + "learning_rate": 8.876581100825119e-05, + "loss": 0.4438, + "step": 92700 + }, + { + "epoch": 2.0635238603988606, + "grad_norm": 0.4787411093711853, + "learning_rate": 8.872706778759422e-05, + "loss": 0.4363, + "step": 92710 + }, + { + "epoch": 2.063746438746439, + "grad_norm": 0.3388282358646393, + "learning_rate": 8.86883306135994e-05, + "loss": 0.3916, + "step": 92720 + }, + { + "epoch": 2.063969017094017, + "grad_norm": 0.43503648042678833, + "learning_rate": 8.864959948837181e-05, + "loss": 0.4058, + "step": 92730 + }, + { + "epoch": 2.0641915954415953, + "grad_norm": 0.5398057103157043, + "learning_rate": 8.861087441401616e-05, + "loss": 0.491, + "step": 92740 + }, + { + "epoch": 2.064414173789174, + "grad_norm": 0.42972442507743835, + "learning_rate": 8.857215539263677e-05, + "loss": 0.3559, + "step": 92750 + }, + { + "epoch": 2.064636752136752, + "grad_norm": 0.6799263954162598, + "learning_rate": 8.853344242633767e-05, + "loss": 0.4094, + "step": 92760 + }, + { + "epoch": 2.0648593304843303, + "grad_norm": 0.4752724766731262, + "learning_rate": 8.849473551722265e-05, + "loss": 0.4443, + "step": 92770 + }, + { + "epoch": 2.065081908831909, + "grad_norm": 0.5786859393119812, + "learning_rate": 8.84560346673949e-05, + "loss": 0.4233, + "step": 92780 + }, + { + "epoch": 2.0653044871794872, + "grad_norm": 0.5894894003868103, + "learning_rate": 8.841733987895761e-05, + "loss": 0.4082, + "step": 92790 + }, + { + "epoch": 2.0655270655270654, + "grad_norm": 0.584894061088562, + "learning_rate": 8.837865115401336e-05, + "loss": 0.5259, + "step": 92800 + }, + { + "epoch": 2.0657496438746437, + "grad_norm": 0.5054792761802673, + "learning_rate": 8.833996849466458e-05, + "loss": 0.5733, + "step": 92810 + }, + { + "epoch": 2.0659722222222223, + "grad_norm": 0.7498967051506042, + "learning_rate": 8.830129190301331e-05, + "loss": 0.4991, + "step": 92820 + }, + { + "epoch": 2.0661948005698005, + "grad_norm": 0.492639422416687, + "learning_rate": 8.826262138116128e-05, + "loss": 0.5089, + "step": 92830 + }, + { + "epoch": 2.0664173789173788, + "grad_norm": 0.5578546524047852, + "learning_rate": 8.822395693120989e-05, + "loss": 0.4406, + "step": 92840 + }, + { + "epoch": 2.0666399572649574, + "grad_norm": 0.767248809337616, + "learning_rate": 8.818529855526024e-05, + "loss": 0.4079, + "step": 92850 + }, + { + "epoch": 2.0668625356125356, + "grad_norm": 0.6224132776260376, + "learning_rate": 8.814664625541293e-05, + "loss": 0.4362, + "step": 92860 + }, + { + "epoch": 2.067085113960114, + "grad_norm": 0.5436907410621643, + "learning_rate": 8.810800003376843e-05, + "loss": 0.4614, + "step": 92870 + }, + { + "epoch": 2.0673076923076925, + "grad_norm": 0.6860635280609131, + "learning_rate": 8.806935989242681e-05, + "loss": 0.5936, + "step": 92880 + }, + { + "epoch": 2.0675302706552707, + "grad_norm": 0.7654950022697449, + "learning_rate": 8.803072583348782e-05, + "loss": 0.5193, + "step": 92890 + }, + { + "epoch": 2.067752849002849, + "grad_norm": 0.4014761447906494, + "learning_rate": 8.799209785905083e-05, + "loss": 0.5032, + "step": 92900 + }, + { + "epoch": 2.067975427350427, + "grad_norm": 0.799392819404602, + "learning_rate": 8.795347597121501e-05, + "loss": 0.4786, + "step": 92910 + }, + { + "epoch": 2.068198005698006, + "grad_norm": 0.5604988932609558, + "learning_rate": 8.791486017207898e-05, + "loss": 0.4582, + "step": 92920 + }, + { + "epoch": 2.068420584045584, + "grad_norm": 0.7473601698875427, + "learning_rate": 8.787625046374126e-05, + "loss": 0.4344, + "step": 92930 + }, + { + "epoch": 2.0686431623931623, + "grad_norm": 0.5701202750205994, + "learning_rate": 8.783764684829981e-05, + "loss": 0.4618, + "step": 92940 + }, + { + "epoch": 2.068865740740741, + "grad_norm": 0.5223069787025452, + "learning_rate": 8.779904932785246e-05, + "loss": 0.5245, + "step": 92950 + }, + { + "epoch": 2.069088319088319, + "grad_norm": 0.4085731506347656, + "learning_rate": 8.77604579044966e-05, + "loss": 0.4249, + "step": 92960 + }, + { + "epoch": 2.0693108974358974, + "grad_norm": 0.4149361550807953, + "learning_rate": 8.772187258032936e-05, + "loss": 0.4093, + "step": 92970 + }, + { + "epoch": 2.0695334757834756, + "grad_norm": 0.7283080220222473, + "learning_rate": 8.768329335744747e-05, + "loss": 0.531, + "step": 92980 + }, + { + "epoch": 2.0697560541310542, + "grad_norm": 0.44233766198158264, + "learning_rate": 8.764472023794742e-05, + "loss": 0.4348, + "step": 92990 + }, + { + "epoch": 2.0699786324786325, + "grad_norm": 0.42627280950546265, + "learning_rate": 8.760615322392521e-05, + "loss": 0.5617, + "step": 93000 + }, + { + "epoch": 2.0702012108262107, + "grad_norm": 0.534911036491394, + "learning_rate": 8.756759231747664e-05, + "loss": 0.5039, + "step": 93010 + }, + { + "epoch": 2.0704237891737893, + "grad_norm": 0.29487407207489014, + "learning_rate": 8.752903752069712e-05, + "loss": 0.4299, + "step": 93020 + }, + { + "epoch": 2.0706463675213675, + "grad_norm": 0.2687482237815857, + "learning_rate": 8.74904888356818e-05, + "loss": 0.4787, + "step": 93030 + }, + { + "epoch": 2.0708689458689458, + "grad_norm": 0.5471023917198181, + "learning_rate": 8.745194626452542e-05, + "loss": 0.4017, + "step": 93040 + }, + { + "epoch": 2.0710915242165244, + "grad_norm": 0.6511717438697815, + "learning_rate": 8.741340980932246e-05, + "loss": 0.5364, + "step": 93050 + }, + { + "epoch": 2.0713141025641026, + "grad_norm": 0.36137768626213074, + "learning_rate": 8.737487947216693e-05, + "loss": 0.453, + "step": 93060 + }, + { + "epoch": 2.071536680911681, + "grad_norm": 0.5700806379318237, + "learning_rate": 8.733635525515273e-05, + "loss": 0.4989, + "step": 93070 + }, + { + "epoch": 2.071759259259259, + "grad_norm": 0.823119580745697, + "learning_rate": 8.729783716037312e-05, + "loss": 0.4331, + "step": 93080 + }, + { + "epoch": 2.0719818376068377, + "grad_norm": 0.4080774188041687, + "learning_rate": 8.725932518992132e-05, + "loss": 0.3784, + "step": 93090 + }, + { + "epoch": 2.072204415954416, + "grad_norm": 0.38573476672172546, + "learning_rate": 8.722081934589008e-05, + "loss": 0.3619, + "step": 93100 + }, + { + "epoch": 2.072426994301994, + "grad_norm": 0.4866933524608612, + "learning_rate": 8.718231963037185e-05, + "loss": 0.5084, + "step": 93110 + }, + { + "epoch": 2.072649572649573, + "grad_norm": 0.6777507066726685, + "learning_rate": 8.71438260454587e-05, + "loss": 0.4189, + "step": 93120 + }, + { + "epoch": 2.072872150997151, + "grad_norm": 0.5231661796569824, + "learning_rate": 8.710533859324253e-05, + "loss": 0.4679, + "step": 93130 + }, + { + "epoch": 2.0730947293447293, + "grad_norm": 0.5415060520172119, + "learning_rate": 8.706685727581458e-05, + "loss": 0.4766, + "step": 93140 + }, + { + "epoch": 2.0733173076923075, + "grad_norm": 0.3524067997932434, + "learning_rate": 8.702838209526609e-05, + "loss": 0.4928, + "step": 93150 + }, + { + "epoch": 2.073539886039886, + "grad_norm": 0.2378246933221817, + "learning_rate": 8.698991305368778e-05, + "loss": 0.3772, + "step": 93160 + }, + { + "epoch": 2.0737624643874644, + "grad_norm": 0.6241315603256226, + "learning_rate": 8.69514501531701e-05, + "loss": 0.4476, + "step": 93170 + }, + { + "epoch": 2.0739850427350426, + "grad_norm": 0.32090499997138977, + "learning_rate": 8.691299339580318e-05, + "loss": 0.4351, + "step": 93180 + }, + { + "epoch": 2.0742076210826212, + "grad_norm": 0.696263313293457, + "learning_rate": 8.687454278367686e-05, + "loss": 0.5288, + "step": 93190 + }, + { + "epoch": 2.0744301994301995, + "grad_norm": 0.7352275252342224, + "learning_rate": 8.68360983188804e-05, + "loss": 0.4412, + "step": 93200 + }, + { + "epoch": 2.0746527777777777, + "grad_norm": 0.3914749324321747, + "learning_rate": 8.679766000350309e-05, + "loss": 0.4483, + "step": 93210 + }, + { + "epoch": 2.0748753561253563, + "grad_norm": 0.4394104480743408, + "learning_rate": 8.67592278396335e-05, + "loss": 0.4899, + "step": 93220 + }, + { + "epoch": 2.0750979344729346, + "grad_norm": 0.3059239089488983, + "learning_rate": 8.67208018293602e-05, + "loss": 0.4111, + "step": 93230 + }, + { + "epoch": 2.0753205128205128, + "grad_norm": 0.5648074150085449, + "learning_rate": 8.668238197477128e-05, + "loss": 0.5929, + "step": 93240 + }, + { + "epoch": 2.075543091168091, + "grad_norm": 0.5991377830505371, + "learning_rate": 8.664396827795444e-05, + "loss": 0.4856, + "step": 93250 + }, + { + "epoch": 2.0757656695156697, + "grad_norm": 0.5345048308372498, + "learning_rate": 8.660556074099722e-05, + "loss": 0.4525, + "step": 93260 + }, + { + "epoch": 2.075988247863248, + "grad_norm": 0.3742232322692871, + "learning_rate": 8.656715936598669e-05, + "loss": 0.4495, + "step": 93270 + }, + { + "epoch": 2.076210826210826, + "grad_norm": 0.5081925988197327, + "learning_rate": 8.652876415500953e-05, + "loss": 0.4513, + "step": 93280 + }, + { + "epoch": 2.0764334045584047, + "grad_norm": 0.3067689538002014, + "learning_rate": 8.649037511015219e-05, + "loss": 0.4551, + "step": 93290 + }, + { + "epoch": 2.076655982905983, + "grad_norm": 0.3174913227558136, + "learning_rate": 8.64519922335008e-05, + "loss": 0.473, + "step": 93300 + }, + { + "epoch": 2.076878561253561, + "grad_norm": 0.3483714163303375, + "learning_rate": 8.641361552714111e-05, + "loss": 0.4364, + "step": 93310 + }, + { + "epoch": 2.0771011396011394, + "grad_norm": 0.42381465435028076, + "learning_rate": 8.637524499315864e-05, + "loss": 0.3395, + "step": 93320 + }, + { + "epoch": 2.077323717948718, + "grad_norm": 0.4798165261745453, + "learning_rate": 8.633688063363824e-05, + "loss": 0.4197, + "step": 93330 + }, + { + "epoch": 2.0775462962962963, + "grad_norm": 0.6045119166374207, + "learning_rate": 8.629852245066485e-05, + "loss": 0.4756, + "step": 93340 + }, + { + "epoch": 2.0777688746438745, + "grad_norm": 0.5622422099113464, + "learning_rate": 8.626017044632289e-05, + "loss": 0.4412, + "step": 93350 + }, + { + "epoch": 2.077991452991453, + "grad_norm": 0.5713974833488464, + "learning_rate": 8.62218246226963e-05, + "loss": 0.3372, + "step": 93360 + }, + { + "epoch": 2.0782140313390314, + "grad_norm": 0.32925736904144287, + "learning_rate": 8.61834849818689e-05, + "loss": 0.4533, + "step": 93370 + }, + { + "epoch": 2.0784366096866096, + "grad_norm": 0.5514950752258301, + "learning_rate": 8.61451515259241e-05, + "loss": 0.3857, + "step": 93380 + }, + { + "epoch": 2.0786591880341883, + "grad_norm": 0.5712468028068542, + "learning_rate": 8.610682425694498e-05, + "loss": 0.457, + "step": 93390 + }, + { + "epoch": 2.0788817663817665, + "grad_norm": 0.5968846082687378, + "learning_rate": 8.606850317701427e-05, + "loss": 0.4946, + "step": 93400 + }, + { + "epoch": 2.0791043447293447, + "grad_norm": 0.31459808349609375, + "learning_rate": 8.603018828821443e-05, + "loss": 0.4765, + "step": 93410 + }, + { + "epoch": 2.079326923076923, + "grad_norm": 0.6099420189857483, + "learning_rate": 8.599187959262738e-05, + "loss": 0.5371, + "step": 93420 + }, + { + "epoch": 2.0795495014245016, + "grad_norm": 0.38431429862976074, + "learning_rate": 8.595357709233493e-05, + "loss": 0.4074, + "step": 93430 + }, + { + "epoch": 2.07977207977208, + "grad_norm": 0.48925134539604187, + "learning_rate": 8.591528078941846e-05, + "loss": 0.4671, + "step": 93440 + }, + { + "epoch": 2.079994658119658, + "grad_norm": 0.4454767405986786, + "learning_rate": 8.587699068595912e-05, + "loss": 0.4699, + "step": 93450 + }, + { + "epoch": 2.0802172364672367, + "grad_norm": 0.5479843616485596, + "learning_rate": 8.583870678403745e-05, + "loss": 0.4729, + "step": 93460 + }, + { + "epoch": 2.080439814814815, + "grad_norm": 0.5257598757743835, + "learning_rate": 8.580042908573389e-05, + "loss": 0.3654, + "step": 93470 + }, + { + "epoch": 2.080662393162393, + "grad_norm": 0.7161563634872437, + "learning_rate": 8.57621575931286e-05, + "loss": 0.4546, + "step": 93480 + }, + { + "epoch": 2.0808849715099713, + "grad_norm": 0.6287294626235962, + "learning_rate": 8.572389230830109e-05, + "loss": 0.4629, + "step": 93490 + }, + { + "epoch": 2.08110754985755, + "grad_norm": 0.6539040207862854, + "learning_rate": 8.568563323333083e-05, + "loss": 0.5137, + "step": 93500 + }, + { + "epoch": 2.081330128205128, + "grad_norm": 0.6592426896095276, + "learning_rate": 8.564738037029685e-05, + "loss": 0.4433, + "step": 93510 + }, + { + "epoch": 2.0815527065527064, + "grad_norm": 0.5749611258506775, + "learning_rate": 8.560913372127784e-05, + "loss": 0.385, + "step": 93520 + }, + { + "epoch": 2.081775284900285, + "grad_norm": 0.8078531622886658, + "learning_rate": 8.557089328835212e-05, + "loss": 0.5508, + "step": 93530 + }, + { + "epoch": 2.0819978632478633, + "grad_norm": 0.5725900530815125, + "learning_rate": 8.553265907359777e-05, + "loss": 0.4822, + "step": 93540 + }, + { + "epoch": 2.0822204415954415, + "grad_norm": 0.6212359070777893, + "learning_rate": 8.54944310790925e-05, + "loss": 0.3839, + "step": 93550 + }, + { + "epoch": 2.08244301994302, + "grad_norm": 0.7050454020500183, + "learning_rate": 8.545620930691349e-05, + "loss": 0.5479, + "step": 93560 + }, + { + "epoch": 2.0826655982905984, + "grad_norm": 0.5940646529197693, + "learning_rate": 8.541799375913783e-05, + "loss": 0.492, + "step": 93570 + }, + { + "epoch": 2.0828881766381766, + "grad_norm": 0.5924594402313232, + "learning_rate": 8.537978443784229e-05, + "loss": 0.494, + "step": 93580 + }, + { + "epoch": 2.083110754985755, + "grad_norm": 0.748340904712677, + "learning_rate": 8.534158134510302e-05, + "loss": 0.5395, + "step": 93590 + }, + { + "epoch": 2.0833333333333335, + "grad_norm": 0.7004445195198059, + "learning_rate": 8.530338448299607e-05, + "loss": 0.5362, + "step": 93600 + }, + { + "epoch": 2.0835559116809117, + "grad_norm": 0.6023527979850769, + "learning_rate": 8.52651938535971e-05, + "loss": 0.4703, + "step": 93610 + }, + { + "epoch": 2.08377849002849, + "grad_norm": 0.4019067585468292, + "learning_rate": 8.522700945898152e-05, + "loss": 0.4851, + "step": 93620 + }, + { + "epoch": 2.0840010683760686, + "grad_norm": 0.4958648085594177, + "learning_rate": 8.51888313012241e-05, + "loss": 0.4182, + "step": 93630 + }, + { + "epoch": 2.084223646723647, + "grad_norm": 0.7258164286613464, + "learning_rate": 8.515065938239959e-05, + "loss": 0.4581, + "step": 93640 + }, + { + "epoch": 2.084446225071225, + "grad_norm": 0.48704367876052856, + "learning_rate": 8.511249370458227e-05, + "loss": 0.4772, + "step": 93650 + }, + { + "epoch": 2.0846688034188032, + "grad_norm": 0.6313557028770447, + "learning_rate": 8.50743342698461e-05, + "loss": 0.4946, + "step": 93660 + }, + { + "epoch": 2.084891381766382, + "grad_norm": 0.5559951066970825, + "learning_rate": 8.503618108026468e-05, + "loss": 0.5083, + "step": 93670 + }, + { + "epoch": 2.08511396011396, + "grad_norm": 0.6134077310562134, + "learning_rate": 8.49980341379113e-05, + "loss": 0.3626, + "step": 93680 + }, + { + "epoch": 2.0853365384615383, + "grad_norm": 0.558841347694397, + "learning_rate": 8.495989344485895e-05, + "loss": 0.4417, + "step": 93690 + }, + { + "epoch": 2.085559116809117, + "grad_norm": 0.6272107362747192, + "learning_rate": 8.492175900318011e-05, + "loss": 0.5813, + "step": 93700 + }, + { + "epoch": 2.085781695156695, + "grad_norm": 0.6320427060127258, + "learning_rate": 8.488363081494715e-05, + "loss": 0.4615, + "step": 93710 + }, + { + "epoch": 2.0860042735042734, + "grad_norm": 0.5066922903060913, + "learning_rate": 8.484550888223186e-05, + "loss": 0.5195, + "step": 93720 + }, + { + "epoch": 2.0862268518518516, + "grad_norm": 0.6472083330154419, + "learning_rate": 8.480739320710592e-05, + "loss": 0.5324, + "step": 93730 + }, + { + "epoch": 2.0864494301994303, + "grad_norm": 0.5297220945358276, + "learning_rate": 8.476928379164048e-05, + "loss": 0.4975, + "step": 93740 + }, + { + "epoch": 2.0866720085470085, + "grad_norm": 0.5604262948036194, + "learning_rate": 8.473118063790653e-05, + "loss": 0.4071, + "step": 93750 + }, + { + "epoch": 2.0868945868945867, + "grad_norm": 0.5125129222869873, + "learning_rate": 8.469308374797464e-05, + "loss": 0.4617, + "step": 93760 + }, + { + "epoch": 2.0871171652421654, + "grad_norm": 0.3355007767677307, + "learning_rate": 8.465499312391491e-05, + "loss": 0.4668, + "step": 93770 + }, + { + "epoch": 2.0873397435897436, + "grad_norm": 0.6570688486099243, + "learning_rate": 8.461690876779729e-05, + "loss": 0.3485, + "step": 93780 + }, + { + "epoch": 2.087562321937322, + "grad_norm": 0.44765180349349976, + "learning_rate": 8.457883068169128e-05, + "loss": 0.4467, + "step": 93790 + }, + { + "epoch": 2.0877849002849005, + "grad_norm": 0.4833996295928955, + "learning_rate": 8.454075886766612e-05, + "loss": 0.3855, + "step": 93800 + }, + { + "epoch": 2.0880074786324787, + "grad_norm": 0.8714666962623596, + "learning_rate": 8.450269332779065e-05, + "loss": 0.4644, + "step": 93810 + }, + { + "epoch": 2.088230056980057, + "grad_norm": 0.5705962777137756, + "learning_rate": 8.446463406413335e-05, + "loss": 0.4754, + "step": 93820 + }, + { + "epoch": 2.088452635327635, + "grad_norm": 0.6698535680770874, + "learning_rate": 8.44265810787625e-05, + "loss": 0.4284, + "step": 93830 + }, + { + "epoch": 2.088675213675214, + "grad_norm": 0.3349551856517792, + "learning_rate": 8.438853437374583e-05, + "loss": 0.4488, + "step": 93840 + }, + { + "epoch": 2.088897792022792, + "grad_norm": 0.44186100363731384, + "learning_rate": 8.43504939511508e-05, + "loss": 0.3784, + "step": 93850 + }, + { + "epoch": 2.0891203703703702, + "grad_norm": 0.8291239142417908, + "learning_rate": 8.431245981304459e-05, + "loss": 0.4066, + "step": 93860 + }, + { + "epoch": 2.089342948717949, + "grad_norm": 0.5814853310585022, + "learning_rate": 8.4274431961494e-05, + "loss": 0.4305, + "step": 93870 + }, + { + "epoch": 2.089565527065527, + "grad_norm": 0.670498251914978, + "learning_rate": 8.423641039856555e-05, + "loss": 0.3973, + "step": 93880 + }, + { + "epoch": 2.0897881054131053, + "grad_norm": 0.833396852016449, + "learning_rate": 8.419839512632532e-05, + "loss": 0.3523, + "step": 93890 + }, + { + "epoch": 2.0900106837606836, + "grad_norm": 0.70555180311203, + "learning_rate": 8.416038614683916e-05, + "loss": 0.4674, + "step": 93900 + }, + { + "epoch": 2.090233262108262, + "grad_norm": 0.8045915961265564, + "learning_rate": 8.412238346217238e-05, + "loss": 0.4466, + "step": 93910 + }, + { + "epoch": 2.0904558404558404, + "grad_norm": 0.593986988067627, + "learning_rate": 8.408438707439015e-05, + "loss": 0.5011, + "step": 93920 + }, + { + "epoch": 2.0906784188034186, + "grad_norm": 0.5474157929420471, + "learning_rate": 8.404639698555721e-05, + "loss": 0.5583, + "step": 93930 + }, + { + "epoch": 2.0909009971509973, + "grad_norm": 0.7892537117004395, + "learning_rate": 8.400841319773797e-05, + "loss": 0.5914, + "step": 93940 + }, + { + "epoch": 2.0911235754985755, + "grad_norm": 0.5392142534255981, + "learning_rate": 8.397043571299654e-05, + "loss": 0.5828, + "step": 93950 + }, + { + "epoch": 2.0913461538461537, + "grad_norm": 0.7543501257896423, + "learning_rate": 8.393246453339661e-05, + "loss": 0.401, + "step": 93960 + }, + { + "epoch": 2.0915687321937324, + "grad_norm": 0.7711642384529114, + "learning_rate": 8.389449966100164e-05, + "loss": 0.5597, + "step": 93970 + }, + { + "epoch": 2.0917913105413106, + "grad_norm": 0.570942759513855, + "learning_rate": 8.385654109787461e-05, + "loss": 0.396, + "step": 93980 + }, + { + "epoch": 2.092013888888889, + "grad_norm": 0.42397984862327576, + "learning_rate": 8.381858884607816e-05, + "loss": 0.4265, + "step": 93990 + }, + { + "epoch": 2.092236467236467, + "grad_norm": 0.4155064821243286, + "learning_rate": 8.378064290767469e-05, + "loss": 0.4785, + "step": 94000 + }, + { + "epoch": 2.0924590455840457, + "grad_norm": 0.53016597032547, + "learning_rate": 8.374270328472622e-05, + "loss": 0.4579, + "step": 94010 + }, + { + "epoch": 2.092681623931624, + "grad_norm": 0.46546798944473267, + "learning_rate": 8.370476997929442e-05, + "loss": 0.395, + "step": 94020 + }, + { + "epoch": 2.092904202279202, + "grad_norm": 1.021172285079956, + "learning_rate": 8.366684299344063e-05, + "loss": 0.4434, + "step": 94030 + }, + { + "epoch": 2.093126780626781, + "grad_norm": 0.331666499376297, + "learning_rate": 8.36289223292259e-05, + "loss": 0.4571, + "step": 94040 + }, + { + "epoch": 2.093349358974359, + "grad_norm": 0.3523954451084137, + "learning_rate": 8.359100798871073e-05, + "loss": 0.4707, + "step": 94050 + }, + { + "epoch": 2.0935719373219372, + "grad_norm": 0.6340324282646179, + "learning_rate": 8.355309997395548e-05, + "loss": 0.464, + "step": 94060 + }, + { + "epoch": 2.0937945156695155, + "grad_norm": 0.6942493915557861, + "learning_rate": 8.35151982870201e-05, + "loss": 0.5229, + "step": 94070 + }, + { + "epoch": 2.094017094017094, + "grad_norm": 0.5725482106208801, + "learning_rate": 8.347730292996421e-05, + "loss": 0.3971, + "step": 94080 + }, + { + "epoch": 2.0942396723646723, + "grad_norm": 0.4753534495830536, + "learning_rate": 8.343941390484707e-05, + "loss": 0.4699, + "step": 94090 + }, + { + "epoch": 2.0944622507122506, + "grad_norm": 0.7122089862823486, + "learning_rate": 8.340153121372767e-05, + "loss": 0.4823, + "step": 94100 + }, + { + "epoch": 2.0946848290598292, + "grad_norm": 0.8469312191009521, + "learning_rate": 8.336365485866444e-05, + "loss": 0.4615, + "step": 94110 + }, + { + "epoch": 2.0949074074074074, + "grad_norm": 0.6163572669029236, + "learning_rate": 8.332578484171575e-05, + "loss": 0.4688, + "step": 94120 + }, + { + "epoch": 2.0951299857549857, + "grad_norm": 0.5812152028083801, + "learning_rate": 8.328792116493937e-05, + "loss": 0.5371, + "step": 94130 + }, + { + "epoch": 2.095352564102564, + "grad_norm": 0.46097496151924133, + "learning_rate": 8.325006383039291e-05, + "loss": 0.4029, + "step": 94140 + }, + { + "epoch": 2.0955751424501425, + "grad_norm": 0.8459033966064453, + "learning_rate": 8.321221284013354e-05, + "loss": 0.4604, + "step": 94150 + }, + { + "epoch": 2.0957977207977208, + "grad_norm": 0.5037481784820557, + "learning_rate": 8.317436819621813e-05, + "loss": 0.4163, + "step": 94160 + }, + { + "epoch": 2.096020299145299, + "grad_norm": 0.4708966314792633, + "learning_rate": 8.31365299007032e-05, + "loss": 0.458, + "step": 94170 + }, + { + "epoch": 2.0962428774928776, + "grad_norm": 0.457185834646225, + "learning_rate": 8.309869795564495e-05, + "loss": 0.5437, + "step": 94180 + }, + { + "epoch": 2.096465455840456, + "grad_norm": 0.5290883183479309, + "learning_rate": 8.306087236309912e-05, + "loss": 0.5182, + "step": 94190 + }, + { + "epoch": 2.096688034188034, + "grad_norm": 0.4177439212799072, + "learning_rate": 8.30230531251212e-05, + "loss": 0.3714, + "step": 94200 + }, + { + "epoch": 2.0969106125356127, + "grad_norm": 0.5624401569366455, + "learning_rate": 8.298524024376632e-05, + "loss": 0.4405, + "step": 94210 + }, + { + "epoch": 2.097133190883191, + "grad_norm": 0.330450177192688, + "learning_rate": 8.294743372108928e-05, + "loss": 0.4666, + "step": 94220 + }, + { + "epoch": 2.097355769230769, + "grad_norm": 0.5528355240821838, + "learning_rate": 8.290963355914453e-05, + "loss": 0.5391, + "step": 94230 + }, + { + "epoch": 2.0975783475783474, + "grad_norm": 0.47880885004997253, + "learning_rate": 8.287183975998623e-05, + "loss": 0.4698, + "step": 94240 + }, + { + "epoch": 2.097800925925926, + "grad_norm": 0.7645977139472961, + "learning_rate": 8.283405232566794e-05, + "loss": 0.4942, + "step": 94250 + }, + { + "epoch": 2.0980235042735043, + "grad_norm": 0.5995956063270569, + "learning_rate": 8.279627125824326e-05, + "loss": 0.4539, + "step": 94260 + }, + { + "epoch": 2.0982460826210825, + "grad_norm": 0.48524004220962524, + "learning_rate": 8.275849655976506e-05, + "loss": 0.4036, + "step": 94270 + }, + { + "epoch": 2.098468660968661, + "grad_norm": 0.5934380292892456, + "learning_rate": 8.272072823228614e-05, + "loss": 0.482, + "step": 94280 + }, + { + "epoch": 2.0986912393162394, + "grad_norm": 0.7825711369514465, + "learning_rate": 8.268296627785885e-05, + "loss": 0.5104, + "step": 94290 + }, + { + "epoch": 2.0989138176638176, + "grad_norm": 0.6093398332595825, + "learning_rate": 8.264521069853523e-05, + "loss": 0.3482, + "step": 94300 + }, + { + "epoch": 2.099136396011396, + "grad_norm": 0.5677360892295837, + "learning_rate": 8.260746149636691e-05, + "loss": 0.4758, + "step": 94310 + }, + { + "epoch": 2.0993589743589745, + "grad_norm": 0.4341944456100464, + "learning_rate": 8.256971867340532e-05, + "loss": 0.451, + "step": 94320 + }, + { + "epoch": 2.0995815527065527, + "grad_norm": 0.7116082906723022, + "learning_rate": 8.253198223170129e-05, + "loss": 0.4402, + "step": 94330 + }, + { + "epoch": 2.099804131054131, + "grad_norm": 0.40924692153930664, + "learning_rate": 8.249425217330548e-05, + "loss": 0.4372, + "step": 94340 + }, + { + "epoch": 2.1000267094017095, + "grad_norm": 0.473922997713089, + "learning_rate": 8.245652850026823e-05, + "loss": 0.4199, + "step": 94350 + }, + { + "epoch": 2.1002492877492878, + "grad_norm": 0.5568352341651917, + "learning_rate": 8.241881121463943e-05, + "loss": 0.409, + "step": 94360 + }, + { + "epoch": 2.1002492877492878, + "eval_loss": 0.5367586016654968, + "eval_runtime": 337.1532, + "eval_samples_per_second": 7.015, + "eval_steps_per_second": 7.015, + "step": 94360 + }, + { + "epoch": 2.100471866096866, + "grad_norm": 0.5693296790122986, + "learning_rate": 8.238110031846878e-05, + "loss": 0.5006, + "step": 94370 + }, + { + "epoch": 2.1006944444444446, + "grad_norm": 0.40285006165504456, + "learning_rate": 8.234339581380532e-05, + "loss": 0.3592, + "step": 94380 + }, + { + "epoch": 2.100917022792023, + "grad_norm": 0.406491219997406, + "learning_rate": 8.230569770269807e-05, + "loss": 0.4356, + "step": 94390 + }, + { + "epoch": 2.101139601139601, + "grad_norm": 0.4366461932659149, + "learning_rate": 8.226800598719562e-05, + "loss": 0.499, + "step": 94400 + }, + { + "epoch": 2.1013621794871793, + "grad_norm": 0.5887917280197144, + "learning_rate": 8.223032066934603e-05, + "loss": 0.4306, + "step": 94410 + }, + { + "epoch": 2.101584757834758, + "grad_norm": 0.5526008605957031, + "learning_rate": 8.219264175119723e-05, + "loss": 0.5057, + "step": 94420 + }, + { + "epoch": 2.101807336182336, + "grad_norm": 0.43040841817855835, + "learning_rate": 8.215496923479672e-05, + "loss": 0.4997, + "step": 94430 + }, + { + "epoch": 2.1020299145299144, + "grad_norm": 0.5127760171890259, + "learning_rate": 8.211730312219165e-05, + "loss": 0.5375, + "step": 94440 + }, + { + "epoch": 2.102252492877493, + "grad_norm": 0.5958731770515442, + "learning_rate": 8.207964341542884e-05, + "loss": 0.3318, + "step": 94450 + }, + { + "epoch": 2.1024750712250713, + "grad_norm": 0.6060073375701904, + "learning_rate": 8.204199011655481e-05, + "loss": 0.4845, + "step": 94460 + }, + { + "epoch": 2.1026976495726495, + "grad_norm": 0.523822546005249, + "learning_rate": 8.200434322761551e-05, + "loss": 0.4344, + "step": 94470 + }, + { + "epoch": 2.1029202279202277, + "grad_norm": 0.5422360897064209, + "learning_rate": 8.196670275065683e-05, + "loss": 0.3397, + "step": 94480 + }, + { + "epoch": 2.1031428062678064, + "grad_norm": 0.4780917465686798, + "learning_rate": 8.192906868772414e-05, + "loss": 0.4053, + "step": 94490 + }, + { + "epoch": 2.1033653846153846, + "grad_norm": 0.7546813488006592, + "learning_rate": 8.189144104086257e-05, + "loss": 0.4096, + "step": 94500 + }, + { + "epoch": 2.103587962962963, + "grad_norm": 0.4494799077510834, + "learning_rate": 8.185381981211673e-05, + "loss": 0.4379, + "step": 94510 + }, + { + "epoch": 2.1038105413105415, + "grad_norm": 0.5556170344352722, + "learning_rate": 8.181620500353103e-05, + "loss": 0.5039, + "step": 94520 + }, + { + "epoch": 2.1040331196581197, + "grad_norm": 0.6032299399375916, + "learning_rate": 8.177859661714949e-05, + "loss": 0.4897, + "step": 94530 + }, + { + "epoch": 2.104255698005698, + "grad_norm": 0.6110396981239319, + "learning_rate": 8.174099465501588e-05, + "loss": 0.5793, + "step": 94540 + }, + { + "epoch": 2.1044782763532766, + "grad_norm": 0.8312065601348877, + "learning_rate": 8.170339911917335e-05, + "loss": 0.4654, + "step": 94550 + }, + { + "epoch": 2.1047008547008548, + "grad_norm": 0.8147444725036621, + "learning_rate": 8.166581001166496e-05, + "loss": 0.4232, + "step": 94560 + }, + { + "epoch": 2.104923433048433, + "grad_norm": 0.4960596263408661, + "learning_rate": 8.16282273345333e-05, + "loss": 0.4324, + "step": 94570 + }, + { + "epoch": 2.105146011396011, + "grad_norm": 0.5012384653091431, + "learning_rate": 8.159065108982071e-05, + "loss": 0.4532, + "step": 94580 + }, + { + "epoch": 2.10536858974359, + "grad_norm": 0.7035501599311829, + "learning_rate": 8.155308127956905e-05, + "loss": 0.5045, + "step": 94590 + }, + { + "epoch": 2.105591168091168, + "grad_norm": 0.6068798899650574, + "learning_rate": 8.151551790581999e-05, + "loss": 0.5179, + "step": 94600 + }, + { + "epoch": 2.1058137464387463, + "grad_norm": 0.6684674024581909, + "learning_rate": 8.147796097061463e-05, + "loss": 0.3775, + "step": 94610 + }, + { + "epoch": 2.106036324786325, + "grad_norm": 0.6053679585456848, + "learning_rate": 8.144041047599389e-05, + "loss": 0.4893, + "step": 94620 + }, + { + "epoch": 2.106258903133903, + "grad_norm": 0.602454662322998, + "learning_rate": 8.140286642399837e-05, + "loss": 0.3968, + "step": 94630 + }, + { + "epoch": 2.1064814814814814, + "grad_norm": 0.54142165184021, + "learning_rate": 8.13653288166681e-05, + "loss": 0.5053, + "step": 94640 + }, + { + "epoch": 2.1067040598290596, + "grad_norm": 0.4809923470020294, + "learning_rate": 8.1327797656043e-05, + "loss": 0.4167, + "step": 94650 + }, + { + "epoch": 2.1069266381766383, + "grad_norm": 0.7400349378585815, + "learning_rate": 8.12902729441625e-05, + "loss": 0.4062, + "step": 94660 + }, + { + "epoch": 2.1071492165242165, + "grad_norm": 0.5602110028266907, + "learning_rate": 8.125275468306574e-05, + "loss": 0.4817, + "step": 94670 + }, + { + "epoch": 2.1073717948717947, + "grad_norm": 0.5356965661048889, + "learning_rate": 8.121524287479161e-05, + "loss": 0.4635, + "step": 94680 + }, + { + "epoch": 2.1075943732193734, + "grad_norm": 0.42339104413986206, + "learning_rate": 8.117773752137833e-05, + "loss": 0.3797, + "step": 94690 + }, + { + "epoch": 2.1078169515669516, + "grad_norm": 0.6352277994155884, + "learning_rate": 8.114023862486406e-05, + "loss": 0.4707, + "step": 94700 + }, + { + "epoch": 2.10803952991453, + "grad_norm": 0.6159664392471313, + "learning_rate": 8.110274618728654e-05, + "loss": 0.4815, + "step": 94710 + }, + { + "epoch": 2.1082621082621085, + "grad_norm": 0.5292482376098633, + "learning_rate": 8.106526021068313e-05, + "loss": 0.5653, + "step": 94720 + }, + { + "epoch": 2.1084846866096867, + "grad_norm": 0.6296613216400146, + "learning_rate": 8.102778069709083e-05, + "loss": 0.4936, + "step": 94730 + }, + { + "epoch": 2.108707264957265, + "grad_norm": 0.4425913691520691, + "learning_rate": 8.09903076485464e-05, + "loss": 0.5033, + "step": 94740 + }, + { + "epoch": 2.108929843304843, + "grad_norm": 0.41408321261405945, + "learning_rate": 8.0952841067086e-05, + "loss": 0.4678, + "step": 94750 + }, + { + "epoch": 2.109152421652422, + "grad_norm": 0.7556164860725403, + "learning_rate": 8.091538095474576e-05, + "loss": 0.4469, + "step": 94760 + }, + { + "epoch": 2.109375, + "grad_norm": 0.6101809740066528, + "learning_rate": 8.087792731356112e-05, + "loss": 0.5185, + "step": 94770 + }, + { + "epoch": 2.109597578347578, + "grad_norm": 0.5169164538383484, + "learning_rate": 8.084048014556745e-05, + "loss": 0.3463, + "step": 94780 + }, + { + "epoch": 2.109820156695157, + "grad_norm": 0.6174333691596985, + "learning_rate": 8.080303945279961e-05, + "loss": 0.5064, + "step": 94790 + }, + { + "epoch": 2.110042735042735, + "grad_norm": 0.5151993036270142, + "learning_rate": 8.07656052372922e-05, + "loss": 0.4483, + "step": 94800 + }, + { + "epoch": 2.1102653133903133, + "grad_norm": 0.5747796893119812, + "learning_rate": 8.072817750107942e-05, + "loss": 0.4774, + "step": 94810 + }, + { + "epoch": 2.1104878917378915, + "grad_norm": 0.3656141459941864, + "learning_rate": 8.069075624619516e-05, + "loss": 0.4848, + "step": 94820 + }, + { + "epoch": 2.11071047008547, + "grad_norm": 0.4049152731895447, + "learning_rate": 8.065334147467283e-05, + "loss": 0.5153, + "step": 94830 + }, + { + "epoch": 2.1109330484330484, + "grad_norm": 0.5611526370048523, + "learning_rate": 8.061593318854562e-05, + "loss": 0.3793, + "step": 94840 + }, + { + "epoch": 2.1111556267806266, + "grad_norm": 0.4716905951499939, + "learning_rate": 8.057853138984632e-05, + "loss": 0.4409, + "step": 94850 + }, + { + "epoch": 2.1113782051282053, + "grad_norm": 0.42111822962760925, + "learning_rate": 8.054113608060738e-05, + "loss": 0.3701, + "step": 94860 + }, + { + "epoch": 2.1116007834757835, + "grad_norm": 0.6273981928825378, + "learning_rate": 8.050374726286092e-05, + "loss": 0.4781, + "step": 94870 + }, + { + "epoch": 2.1118233618233617, + "grad_norm": 0.49173009395599365, + "learning_rate": 8.046636493863873e-05, + "loss": 0.5226, + "step": 94880 + }, + { + "epoch": 2.1120459401709404, + "grad_norm": 0.49693116545677185, + "learning_rate": 8.042898910997212e-05, + "loss": 0.4516, + "step": 94890 + }, + { + "epoch": 2.1122685185185186, + "grad_norm": 0.43563681840896606, + "learning_rate": 8.039161977889205e-05, + "loss": 0.465, + "step": 94900 + }, + { + "epoch": 2.112491096866097, + "grad_norm": 0.879296064376831, + "learning_rate": 8.035425694742929e-05, + "loss": 0.4238, + "step": 94910 + }, + { + "epoch": 2.112713675213675, + "grad_norm": 0.763816773891449, + "learning_rate": 8.031690061761414e-05, + "loss": 0.4125, + "step": 94920 + }, + { + "epoch": 2.1129362535612537, + "grad_norm": 0.5743877291679382, + "learning_rate": 8.02795507914766e-05, + "loss": 0.4042, + "step": 94930 + }, + { + "epoch": 2.113158831908832, + "grad_norm": 0.4411097764968872, + "learning_rate": 8.024220747104627e-05, + "loss": 0.4687, + "step": 94940 + }, + { + "epoch": 2.11338141025641, + "grad_norm": 0.6350961923599243, + "learning_rate": 8.020487065835243e-05, + "loss": 0.4661, + "step": 94950 + }, + { + "epoch": 2.113603988603989, + "grad_norm": 0.5903646349906921, + "learning_rate": 8.016754035542404e-05, + "loss": 0.4633, + "step": 94960 + }, + { + "epoch": 2.113826566951567, + "grad_norm": 0.6953848600387573, + "learning_rate": 8.013021656428954e-05, + "loss": 0.5241, + "step": 94970 + }, + { + "epoch": 2.1140491452991452, + "grad_norm": 0.41217243671417236, + "learning_rate": 8.00928992869772e-05, + "loss": 0.4664, + "step": 94980 + }, + { + "epoch": 2.1142717236467234, + "grad_norm": 0.555772602558136, + "learning_rate": 8.00555885255149e-05, + "loss": 0.4651, + "step": 94990 + }, + { + "epoch": 2.114494301994302, + "grad_norm": 0.5230336785316467, + "learning_rate": 8.00182842819301e-05, + "loss": 0.5052, + "step": 95000 + }, + { + "epoch": 2.1147168803418803, + "grad_norm": 0.7198736667633057, + "learning_rate": 7.998098655824995e-05, + "loss": 0.516, + "step": 95010 + }, + { + "epoch": 2.1149394586894585, + "grad_norm": 0.45915162563323975, + "learning_rate": 7.99436953565013e-05, + "loss": 0.4227, + "step": 95020 + }, + { + "epoch": 2.115162037037037, + "grad_norm": 0.8346054553985596, + "learning_rate": 7.990641067871054e-05, + "loss": 0.4145, + "step": 95030 + }, + { + "epoch": 2.1153846153846154, + "grad_norm": 0.5007335543632507, + "learning_rate": 7.986913252690367e-05, + "loss": 0.3814, + "step": 95040 + }, + { + "epoch": 2.1156071937321936, + "grad_norm": 0.6236112117767334, + "learning_rate": 7.983186090310648e-05, + "loss": 0.474, + "step": 95050 + }, + { + "epoch": 2.1158297720797723, + "grad_norm": 0.7768984436988831, + "learning_rate": 7.979459580934434e-05, + "loss": 0.4494, + "step": 95060 + }, + { + "epoch": 2.1160523504273505, + "grad_norm": 0.5049422979354858, + "learning_rate": 7.975733724764225e-05, + "loss": 0.384, + "step": 95070 + }, + { + "epoch": 2.1162749287749287, + "grad_norm": 0.45497995615005493, + "learning_rate": 7.972008522002491e-05, + "loss": 0.352, + "step": 95080 + }, + { + "epoch": 2.116497507122507, + "grad_norm": 0.3762676417827606, + "learning_rate": 7.968283972851669e-05, + "loss": 0.5393, + "step": 95090 + }, + { + "epoch": 2.1167200854700856, + "grad_norm": 0.4134499430656433, + "learning_rate": 7.964560077514136e-05, + "loss": 0.5039, + "step": 95100 + }, + { + "epoch": 2.116942663817664, + "grad_norm": 0.7410092353820801, + "learning_rate": 7.960836836192263e-05, + "loss": 0.5567, + "step": 95110 + }, + { + "epoch": 2.117165242165242, + "grad_norm": 0.637007474899292, + "learning_rate": 7.957114249088369e-05, + "loss": 0.4832, + "step": 95120 + }, + { + "epoch": 2.1173878205128207, + "grad_norm": 0.7132471203804016, + "learning_rate": 7.953392316404748e-05, + "loss": 0.4884, + "step": 95130 + }, + { + "epoch": 2.117610398860399, + "grad_norm": 0.9175398945808411, + "learning_rate": 7.94967103834365e-05, + "loss": 0.5452, + "step": 95140 + }, + { + "epoch": 2.117832977207977, + "grad_norm": 0.4358776807785034, + "learning_rate": 7.945950415107299e-05, + "loss": 0.4458, + "step": 95150 + }, + { + "epoch": 2.1180555555555554, + "grad_norm": 0.8178651928901672, + "learning_rate": 7.942230446897862e-05, + "loss": 0.5678, + "step": 95160 + }, + { + "epoch": 2.118278133903134, + "grad_norm": 0.4699116349220276, + "learning_rate": 7.938511133917503e-05, + "loss": 0.3165, + "step": 95170 + }, + { + "epoch": 2.1185007122507122, + "grad_norm": 0.4907972812652588, + "learning_rate": 7.934792476368316e-05, + "loss": 0.4241, + "step": 95180 + }, + { + "epoch": 2.1187232905982905, + "grad_norm": 0.5656859278678894, + "learning_rate": 7.931074474452381e-05, + "loss": 0.4015, + "step": 95190 + }, + { + "epoch": 2.118945868945869, + "grad_norm": 0.4246733784675598, + "learning_rate": 7.927357128371739e-05, + "loss": 0.4926, + "step": 95200 + }, + { + "epoch": 2.1191684472934473, + "grad_norm": 0.8523581027984619, + "learning_rate": 7.923640438328396e-05, + "loss": 0.5256, + "step": 95210 + }, + { + "epoch": 2.1193910256410255, + "grad_norm": 0.37937796115875244, + "learning_rate": 7.919924404524317e-05, + "loss": 0.4751, + "step": 95220 + }, + { + "epoch": 2.119613603988604, + "grad_norm": 0.4993674159049988, + "learning_rate": 7.916209027161441e-05, + "loss": 0.4355, + "step": 95230 + }, + { + "epoch": 2.1198361823361824, + "grad_norm": 0.7296259999275208, + "learning_rate": 7.912494306441654e-05, + "loss": 0.5167, + "step": 95240 + }, + { + "epoch": 2.1200587606837606, + "grad_norm": 0.3281956911087036, + "learning_rate": 7.908780242566817e-05, + "loss": 0.5247, + "step": 95250 + }, + { + "epoch": 2.120281339031339, + "grad_norm": 0.38215774297714233, + "learning_rate": 7.905066835738763e-05, + "loss": 0.349, + "step": 95260 + }, + { + "epoch": 2.1205039173789175, + "grad_norm": 0.6754162907600403, + "learning_rate": 7.90135408615928e-05, + "loss": 0.4428, + "step": 95270 + }, + { + "epoch": 2.1207264957264957, + "grad_norm": 0.6413685083389282, + "learning_rate": 7.897641994030127e-05, + "loss": 0.4805, + "step": 95280 + }, + { + "epoch": 2.120949074074074, + "grad_norm": 0.633334755897522, + "learning_rate": 7.893930559553007e-05, + "loss": 0.4691, + "step": 95290 + }, + { + "epoch": 2.1211716524216526, + "grad_norm": 0.5734654664993286, + "learning_rate": 7.890219782929611e-05, + "loss": 0.4752, + "step": 95300 + }, + { + "epoch": 2.121394230769231, + "grad_norm": 0.6622852683067322, + "learning_rate": 7.886509664361592e-05, + "loss": 0.4182, + "step": 95310 + }, + { + "epoch": 2.121616809116809, + "grad_norm": 0.6082544922828674, + "learning_rate": 7.882800204050549e-05, + "loss": 0.5433, + "step": 95320 + }, + { + "epoch": 2.1218393874643873, + "grad_norm": 0.5464411377906799, + "learning_rate": 7.879091402198062e-05, + "loss": 0.4334, + "step": 95330 + }, + { + "epoch": 2.122061965811966, + "grad_norm": 0.47847822308540344, + "learning_rate": 7.875383259005671e-05, + "loss": 0.4479, + "step": 95340 + }, + { + "epoch": 2.122284544159544, + "grad_norm": 0.49448779225349426, + "learning_rate": 7.871675774674878e-05, + "loss": 0.4607, + "step": 95350 + }, + { + "epoch": 2.1225071225071224, + "grad_norm": 0.4804612398147583, + "learning_rate": 7.867968949407153e-05, + "loss": 0.4253, + "step": 95360 + }, + { + "epoch": 2.122729700854701, + "grad_norm": 0.599602997303009, + "learning_rate": 7.864262783403935e-05, + "loss": 0.3489, + "step": 95370 + }, + { + "epoch": 2.1229522792022792, + "grad_norm": 0.5012718439102173, + "learning_rate": 7.860557276866603e-05, + "loss": 0.6108, + "step": 95380 + }, + { + "epoch": 2.1231748575498575, + "grad_norm": 0.6502254605293274, + "learning_rate": 7.856852429996528e-05, + "loss": 0.5054, + "step": 95390 + }, + { + "epoch": 2.123397435897436, + "grad_norm": 0.5006417632102966, + "learning_rate": 7.853148242995031e-05, + "loss": 0.333, + "step": 95400 + }, + { + "epoch": 2.1236200142450143, + "grad_norm": 0.8475795388221741, + "learning_rate": 7.849444716063405e-05, + "loss": 0.5125, + "step": 95410 + }, + { + "epoch": 2.1238425925925926, + "grad_norm": 0.6902666091918945, + "learning_rate": 7.845741849402906e-05, + "loss": 0.4753, + "step": 95420 + }, + { + "epoch": 2.1240651709401708, + "grad_norm": 0.6792203783988953, + "learning_rate": 7.842039643214737e-05, + "loss": 0.4453, + "step": 95430 + }, + { + "epoch": 2.1242877492877494, + "grad_norm": 0.6841697096824646, + "learning_rate": 7.838338097700088e-05, + "loss": 0.4297, + "step": 95440 + }, + { + "epoch": 2.1245103276353277, + "grad_norm": 0.47156375646591187, + "learning_rate": 7.83463721306011e-05, + "loss": 0.5173, + "step": 95450 + }, + { + "epoch": 2.124732905982906, + "grad_norm": 0.6936251521110535, + "learning_rate": 7.830936989495897e-05, + "loss": 0.4983, + "step": 95460 + }, + { + "epoch": 2.1249554843304845, + "grad_norm": 0.3907637596130371, + "learning_rate": 7.827237427208529e-05, + "loss": 0.5749, + "step": 95470 + }, + { + "epoch": 2.1251780626780628, + "grad_norm": 0.6941818594932556, + "learning_rate": 7.823538526399045e-05, + "loss": 0.4437, + "step": 95480 + }, + { + "epoch": 2.125400641025641, + "grad_norm": 0.49002397060394287, + "learning_rate": 7.819840287268444e-05, + "loss": 0.3759, + "step": 95490 + }, + { + "epoch": 2.125623219373219, + "grad_norm": 0.8776218295097351, + "learning_rate": 7.816142710017697e-05, + "loss": 0.4317, + "step": 95500 + }, + { + "epoch": 2.125845797720798, + "grad_norm": 0.6318543553352356, + "learning_rate": 7.812445794847734e-05, + "loss": 0.4536, + "step": 95510 + }, + { + "epoch": 2.126068376068376, + "grad_norm": 0.39666521549224854, + "learning_rate": 7.808749541959437e-05, + "loss": 0.5282, + "step": 95520 + }, + { + "epoch": 2.1262909544159543, + "grad_norm": 0.6775786280632019, + "learning_rate": 7.80505395155367e-05, + "loss": 0.638, + "step": 95530 + }, + { + "epoch": 2.126513532763533, + "grad_norm": 0.3907429575920105, + "learning_rate": 7.801359023831254e-05, + "loss": 0.4733, + "step": 95540 + }, + { + "epoch": 2.126736111111111, + "grad_norm": 0.4724726974964142, + "learning_rate": 7.797664758992984e-05, + "loss": 0.455, + "step": 95550 + }, + { + "epoch": 2.1269586894586894, + "grad_norm": 0.5766648054122925, + "learning_rate": 7.79397115723959e-05, + "loss": 0.3844, + "step": 95560 + }, + { + "epoch": 2.127181267806268, + "grad_norm": 0.5338031053543091, + "learning_rate": 7.790278218771798e-05, + "loss": 0.4238, + "step": 95570 + }, + { + "epoch": 2.1274038461538463, + "grad_norm": 0.5047205090522766, + "learning_rate": 7.786585943790283e-05, + "loss": 0.4362, + "step": 95580 + }, + { + "epoch": 2.1276264245014245, + "grad_norm": 0.3229919970035553, + "learning_rate": 7.782894332495691e-05, + "loss": 0.4147, + "step": 95590 + }, + { + "epoch": 2.1278490028490027, + "grad_norm": 0.49135375022888184, + "learning_rate": 7.779203385088618e-05, + "loss": 0.4977, + "step": 95600 + }, + { + "epoch": 2.1280715811965814, + "grad_norm": 0.5660061836242676, + "learning_rate": 7.775513101769636e-05, + "loss": 0.4634, + "step": 95610 + }, + { + "epoch": 2.1282941595441596, + "grad_norm": 0.5993034243583679, + "learning_rate": 7.77182348273928e-05, + "loss": 0.4497, + "step": 95620 + }, + { + "epoch": 2.128516737891738, + "grad_norm": 0.5378540754318237, + "learning_rate": 7.768134528198046e-05, + "loss": 0.4574, + "step": 95630 + }, + { + "epoch": 2.128739316239316, + "grad_norm": 0.4644266366958618, + "learning_rate": 7.764446238346395e-05, + "loss": 0.5494, + "step": 95640 + }, + { + "epoch": 2.1289618945868947, + "grad_norm": 0.7540342211723328, + "learning_rate": 7.76075861338476e-05, + "loss": 0.4136, + "step": 95650 + }, + { + "epoch": 2.129184472934473, + "grad_norm": 0.6110064387321472, + "learning_rate": 7.757071653513512e-05, + "loss": 0.3748, + "step": 95660 + }, + { + "epoch": 2.129407051282051, + "grad_norm": 0.4860828220844269, + "learning_rate": 7.753385358933016e-05, + "loss": 0.4584, + "step": 95670 + }, + { + "epoch": 2.1296296296296298, + "grad_norm": 0.6992922425270081, + "learning_rate": 7.749699729843591e-05, + "loss": 0.4407, + "step": 95680 + }, + { + "epoch": 2.129852207977208, + "grad_norm": 0.46820303797721863, + "learning_rate": 7.746014766445504e-05, + "loss": 0.4352, + "step": 95690 + }, + { + "epoch": 2.130074786324786, + "grad_norm": 0.4686327874660492, + "learning_rate": 7.742330468939006e-05, + "loss": 0.4255, + "step": 95700 + }, + { + "epoch": 2.130297364672365, + "grad_norm": 0.7035104036331177, + "learning_rate": 7.738646837524306e-05, + "loss": 0.5257, + "step": 95710 + }, + { + "epoch": 2.130519943019943, + "grad_norm": 0.6425663232803345, + "learning_rate": 7.734963872401573e-05, + "loss": 0.5195, + "step": 95720 + }, + { + "epoch": 2.1307425213675213, + "grad_norm": 0.6731189489364624, + "learning_rate": 7.73128157377095e-05, + "loss": 0.4616, + "step": 95730 + }, + { + "epoch": 2.1309650997150995, + "grad_norm": 0.5955937504768372, + "learning_rate": 7.727599941832526e-05, + "loss": 0.5135, + "step": 95740 + }, + { + "epoch": 2.131187678062678, + "grad_norm": 0.4566531479358673, + "learning_rate": 7.723918976786366e-05, + "loss": 0.3921, + "step": 95750 + }, + { + "epoch": 2.1314102564102564, + "grad_norm": 0.5514445900917053, + "learning_rate": 7.720238678832498e-05, + "loss": 0.5411, + "step": 95760 + }, + { + "epoch": 2.1316328347578346, + "grad_norm": 0.4605773687362671, + "learning_rate": 7.716559048170913e-05, + "loss": 0.383, + "step": 95770 + }, + { + "epoch": 2.1318554131054133, + "grad_norm": 0.7322866320610046, + "learning_rate": 7.712880085001565e-05, + "loss": 0.5983, + "step": 95780 + }, + { + "epoch": 2.1320779914529915, + "grad_norm": 0.36871546506881714, + "learning_rate": 7.709201789524381e-05, + "loss": 0.3566, + "step": 95790 + }, + { + "epoch": 2.1323005698005697, + "grad_norm": 0.5514602661132812, + "learning_rate": 7.705524161939223e-05, + "loss": 0.469, + "step": 95800 + }, + { + "epoch": 2.132523148148148, + "grad_norm": 0.7098495960235596, + "learning_rate": 7.701847202445956e-05, + "loss": 0.486, + "step": 95810 + }, + { + "epoch": 2.1327457264957266, + "grad_norm": 0.5476725101470947, + "learning_rate": 7.698170911244373e-05, + "loss": 0.3667, + "step": 95820 + }, + { + "epoch": 2.132968304843305, + "grad_norm": 0.7701019048690796, + "learning_rate": 7.694495288534252e-05, + "loss": 0.472, + "step": 95830 + }, + { + "epoch": 2.133190883190883, + "grad_norm": 0.586618185043335, + "learning_rate": 7.690820334515331e-05, + "loss": 0.3609, + "step": 95840 + }, + { + "epoch": 2.1334134615384617, + "grad_norm": 0.35569578409194946, + "learning_rate": 7.68714604938731e-05, + "loss": 0.3996, + "step": 95850 + }, + { + "epoch": 2.13363603988604, + "grad_norm": 0.7099726796150208, + "learning_rate": 7.683472433349854e-05, + "loss": 0.4586, + "step": 95860 + }, + { + "epoch": 2.133858618233618, + "grad_norm": 0.6935610175132751, + "learning_rate": 7.679799486602595e-05, + "loss": 0.424, + "step": 95870 + }, + { + "epoch": 2.1340811965811968, + "grad_norm": 0.5956912040710449, + "learning_rate": 7.67612720934511e-05, + "loss": 0.4726, + "step": 95880 + }, + { + "epoch": 2.134303774928775, + "grad_norm": 0.6841393113136292, + "learning_rate": 7.67245560177696e-05, + "loss": 0.5007, + "step": 95890 + }, + { + "epoch": 2.134526353276353, + "grad_norm": 0.6714563965797424, + "learning_rate": 7.668784664097668e-05, + "loss": 0.4712, + "step": 95900 + }, + { + "epoch": 2.1347489316239314, + "grad_norm": 0.4370259642601013, + "learning_rate": 7.665114396506709e-05, + "loss": 0.5509, + "step": 95910 + }, + { + "epoch": 2.13497150997151, + "grad_norm": 0.7829974889755249, + "learning_rate": 7.661444799203532e-05, + "loss": 0.426, + "step": 95920 + }, + { + "epoch": 2.1351940883190883, + "grad_norm": 0.5304891467094421, + "learning_rate": 7.657775872387554e-05, + "loss": 0.5888, + "step": 95930 + }, + { + "epoch": 2.1354166666666665, + "grad_norm": 0.4999597370624542, + "learning_rate": 7.654107616258137e-05, + "loss": 0.4326, + "step": 95940 + }, + { + "epoch": 2.135639245014245, + "grad_norm": 0.5106223821640015, + "learning_rate": 7.650440031014611e-05, + "loss": 0.4832, + "step": 95950 + }, + { + "epoch": 2.1358618233618234, + "grad_norm": 0.5175594687461853, + "learning_rate": 7.646773116856287e-05, + "loss": 0.3942, + "step": 95960 + }, + { + "epoch": 2.1360844017094016, + "grad_norm": 0.6161689162254333, + "learning_rate": 7.643106873982422e-05, + "loss": 0.5379, + "step": 95970 + }, + { + "epoch": 2.13630698005698, + "grad_norm": 0.6075182557106018, + "learning_rate": 7.639441302592248e-05, + "loss": 0.4709, + "step": 95980 + }, + { + "epoch": 2.1365295584045585, + "grad_norm": 0.9312088489532471, + "learning_rate": 7.635776402884949e-05, + "loss": 0.5752, + "step": 95990 + }, + { + "epoch": 2.1367521367521367, + "grad_norm": 0.5864628553390503, + "learning_rate": 7.632112175059684e-05, + "loss": 0.5621, + "step": 96000 + }, + { + "epoch": 2.136974715099715, + "grad_norm": 0.46275755763053894, + "learning_rate": 7.628448619315575e-05, + "loss": 0.3262, + "step": 96010 + }, + { + "epoch": 2.1371972934472936, + "grad_norm": 0.5860275030136108, + "learning_rate": 7.62478573585169e-05, + "loss": 0.498, + "step": 96020 + }, + { + "epoch": 2.137419871794872, + "grad_norm": 0.3560199737548828, + "learning_rate": 7.621123524867077e-05, + "loss": 0.4276, + "step": 96030 + }, + { + "epoch": 2.13764245014245, + "grad_norm": 0.4286309778690338, + "learning_rate": 7.617461986560746e-05, + "loss": 0.4103, + "step": 96040 + }, + { + "epoch": 2.1378650284900287, + "grad_norm": 0.28722521662712097, + "learning_rate": 7.613801121131667e-05, + "loss": 0.3054, + "step": 96050 + }, + { + "epoch": 2.138087606837607, + "grad_norm": 0.766991376876831, + "learning_rate": 7.610140928778777e-05, + "loss": 0.5683, + "step": 96060 + }, + { + "epoch": 2.138310185185185, + "grad_norm": 0.7361512184143066, + "learning_rate": 7.606481409700976e-05, + "loss": 0.5906, + "step": 96070 + }, + { + "epoch": 2.1385327635327633, + "grad_norm": 0.46420538425445557, + "learning_rate": 7.602822564097122e-05, + "loss": 0.4366, + "step": 96080 + }, + { + "epoch": 2.138755341880342, + "grad_norm": 0.8343654870986938, + "learning_rate": 7.599164392166033e-05, + "loss": 0.4457, + "step": 96090 + }, + { + "epoch": 2.13897792022792, + "grad_norm": 0.6298092603683472, + "learning_rate": 7.595506894106503e-05, + "loss": 0.4655, + "step": 96100 + }, + { + "epoch": 2.1392004985754984, + "grad_norm": 0.6869631409645081, + "learning_rate": 7.591850070117281e-05, + "loss": 0.3374, + "step": 96110 + }, + { + "epoch": 2.139423076923077, + "grad_norm": 0.6529879570007324, + "learning_rate": 7.588193920397084e-05, + "loss": 0.3987, + "step": 96120 + }, + { + "epoch": 2.1396456552706553, + "grad_norm": 0.4423435628414154, + "learning_rate": 7.584538445144591e-05, + "loss": 0.401, + "step": 96130 + }, + { + "epoch": 2.1398682336182335, + "grad_norm": 0.6857985258102417, + "learning_rate": 7.580883644558443e-05, + "loss": 0.5029, + "step": 96140 + }, + { + "epoch": 2.1400908119658117, + "grad_norm": 0.4324056804180145, + "learning_rate": 7.577229518837252e-05, + "loss": 0.4247, + "step": 96150 + }, + { + "epoch": 2.1403133903133904, + "grad_norm": 0.5784803628921509, + "learning_rate": 7.57357606817957e-05, + "loss": 0.474, + "step": 96160 + }, + { + "epoch": 2.1405359686609686, + "grad_norm": 0.5325353741645813, + "learning_rate": 7.569923292783938e-05, + "loss": 0.4022, + "step": 96170 + }, + { + "epoch": 2.140758547008547, + "grad_norm": 0.35216307640075684, + "learning_rate": 7.56627119284885e-05, + "loss": 0.4482, + "step": 96180 + }, + { + "epoch": 2.1409811253561255, + "grad_norm": 0.8801172375679016, + "learning_rate": 7.562619768572765e-05, + "loss": 0.5553, + "step": 96190 + }, + { + "epoch": 2.1412037037037037, + "grad_norm": 0.6519230008125305, + "learning_rate": 7.55896902015411e-05, + "loss": 0.502, + "step": 96200 + }, + { + "epoch": 2.141426282051282, + "grad_norm": 0.5615265965461731, + "learning_rate": 7.555318947791257e-05, + "loss": 0.4937, + "step": 96210 + }, + { + "epoch": 2.1416488603988606, + "grad_norm": 0.49167194962501526, + "learning_rate": 7.551669551682565e-05, + "loss": 0.523, + "step": 96220 + }, + { + "epoch": 2.141871438746439, + "grad_norm": 0.48790618777275085, + "learning_rate": 7.548020832026335e-05, + "loss": 0.4501, + "step": 96230 + }, + { + "epoch": 2.142094017094017, + "grad_norm": 0.8500549793243408, + "learning_rate": 7.544372789020844e-05, + "loss": 0.4977, + "step": 96240 + }, + { + "epoch": 2.1423165954415953, + "grad_norm": 0.715935230255127, + "learning_rate": 7.540725422864334e-05, + "loss": 0.4873, + "step": 96250 + }, + { + "epoch": 2.142539173789174, + "grad_norm": 0.6317983865737915, + "learning_rate": 7.537078733755005e-05, + "loss": 0.4425, + "step": 96260 + }, + { + "epoch": 2.142761752136752, + "grad_norm": 1.0182199478149414, + "learning_rate": 7.53343272189102e-05, + "loss": 0.5664, + "step": 96270 + }, + { + "epoch": 2.1429843304843303, + "grad_norm": 0.7626780271530151, + "learning_rate": 7.529787387470506e-05, + "loss": 0.5206, + "step": 96280 + }, + { + "epoch": 2.143206908831909, + "grad_norm": 0.45802199840545654, + "learning_rate": 7.526142730691561e-05, + "loss": 0.4275, + "step": 96290 + }, + { + "epoch": 2.1434294871794872, + "grad_norm": 0.5165693759918213, + "learning_rate": 7.522498751752225e-05, + "loss": 0.5341, + "step": 96300 + }, + { + "epoch": 2.1436520655270654, + "grad_norm": 0.5957320928573608, + "learning_rate": 7.518855450850519e-05, + "loss": 0.3631, + "step": 96310 + }, + { + "epoch": 2.1438746438746437, + "grad_norm": 0.5179235339164734, + "learning_rate": 7.515212828184428e-05, + "loss": 0.449, + "step": 96320 + }, + { + "epoch": 2.1440972222222223, + "grad_norm": 0.4287464916706085, + "learning_rate": 7.5115708839519e-05, + "loss": 0.4, + "step": 96330 + }, + { + "epoch": 2.1443198005698005, + "grad_norm": 0.45676085352897644, + "learning_rate": 7.507929618350824e-05, + "loss": 0.4563, + "step": 96340 + }, + { + "epoch": 2.1445423789173788, + "grad_norm": 0.6015214323997498, + "learning_rate": 7.504289031579081e-05, + "loss": 0.4153, + "step": 96350 + }, + { + "epoch": 2.1447649572649574, + "grad_norm": 0.6132090091705322, + "learning_rate": 7.500649123834507e-05, + "loss": 0.4989, + "step": 96360 + }, + { + "epoch": 2.1449875356125356, + "grad_norm": 0.4717426598072052, + "learning_rate": 7.497009895314887e-05, + "loss": 0.5924, + "step": 96370 + }, + { + "epoch": 2.145210113960114, + "grad_norm": 0.48287132382392883, + "learning_rate": 7.493371346217983e-05, + "loss": 0.39, + "step": 96380 + }, + { + "epoch": 2.1454326923076925, + "grad_norm": 0.589698076248169, + "learning_rate": 7.489733476741519e-05, + "loss": 0.4849, + "step": 96390 + }, + { + "epoch": 2.1456552706552707, + "grad_norm": 0.4884866178035736, + "learning_rate": 7.48609628708318e-05, + "loss": 0.4765, + "step": 96400 + }, + { + "epoch": 2.145877849002849, + "grad_norm": 0.5568512082099915, + "learning_rate": 7.482459777440612e-05, + "loss": 0.4794, + "step": 96410 + }, + { + "epoch": 2.146100427350427, + "grad_norm": 0.4387529194355011, + "learning_rate": 7.478823948011429e-05, + "loss": 0.481, + "step": 96420 + }, + { + "epoch": 2.146323005698006, + "grad_norm": 0.5052660703659058, + "learning_rate": 7.475188798993206e-05, + "loss": 0.4859, + "step": 96430 + }, + { + "epoch": 2.146545584045584, + "grad_norm": 0.5194045305252075, + "learning_rate": 7.471554330583475e-05, + "loss": 0.3896, + "step": 96440 + }, + { + "epoch": 2.1467681623931623, + "grad_norm": 0.7493390440940857, + "learning_rate": 7.467920542979734e-05, + "loss": 0.5528, + "step": 96450 + }, + { + "epoch": 2.146990740740741, + "grad_norm": 0.48206827044487, + "learning_rate": 7.464287436379451e-05, + "loss": 0.5136, + "step": 96460 + }, + { + "epoch": 2.147213319088319, + "grad_norm": 0.5970621705055237, + "learning_rate": 7.460655010980058e-05, + "loss": 0.4574, + "step": 96470 + }, + { + "epoch": 2.1474358974358974, + "grad_norm": 0.4395906627178192, + "learning_rate": 7.45702326697893e-05, + "loss": 0.5257, + "step": 96480 + }, + { + "epoch": 2.1476584757834756, + "grad_norm": 0.6027551293373108, + "learning_rate": 7.453392204573426e-05, + "loss": 0.527, + "step": 96490 + }, + { + "epoch": 2.1478810541310542, + "grad_norm": 0.4774160087108612, + "learning_rate": 7.449761823960868e-05, + "loss": 0.4066, + "step": 96500 + }, + { + "epoch": 2.1481036324786325, + "grad_norm": 0.8844955563545227, + "learning_rate": 7.446132125338519e-05, + "loss": 0.536, + "step": 96510 + }, + { + "epoch": 2.1483262108262107, + "grad_norm": 0.5076330304145813, + "learning_rate": 7.442503108903629e-05, + "loss": 0.5438, + "step": 96520 + }, + { + "epoch": 2.1485487891737893, + "grad_norm": 0.6229117512702942, + "learning_rate": 7.438874774853397e-05, + "loss": 0.5332, + "step": 96530 + }, + { + "epoch": 2.1487713675213675, + "grad_norm": 0.4757481515407562, + "learning_rate": 7.435247123384996e-05, + "loss": 0.4961, + "step": 96540 + }, + { + "epoch": 2.1489939458689458, + "grad_norm": 0.5127527117729187, + "learning_rate": 7.431620154695551e-05, + "loss": 0.4622, + "step": 96550 + }, + { + "epoch": 2.1492165242165244, + "grad_norm": 0.8079750537872314, + "learning_rate": 7.427993868982155e-05, + "loss": 0.4647, + "step": 96560 + }, + { + "epoch": 2.1494391025641026, + "grad_norm": 0.656417965888977, + "learning_rate": 7.424368266441873e-05, + "loss": 0.4997, + "step": 96570 + }, + { + "epoch": 2.149661680911681, + "grad_norm": 0.46112340688705444, + "learning_rate": 7.420743347271703e-05, + "loss": 0.5201, + "step": 96580 + }, + { + "epoch": 2.149884259259259, + "grad_norm": 0.7423391938209534, + "learning_rate": 7.417119111668642e-05, + "loss": 0.5169, + "step": 96590 + }, + { + "epoch": 2.1501068376068377, + "grad_norm": 0.4680321514606476, + "learning_rate": 7.413495559829635e-05, + "loss": 0.3816, + "step": 96600 + }, + { + "epoch": 2.150329415954416, + "grad_norm": 0.6164966225624084, + "learning_rate": 7.409872691951573e-05, + "loss": 0.4939, + "step": 96610 + }, + { + "epoch": 2.150551994301994, + "grad_norm": 0.7925980687141418, + "learning_rate": 7.40625050823134e-05, + "loss": 0.4332, + "step": 96620 + }, + { + "epoch": 2.150774572649573, + "grad_norm": 0.48925745487213135, + "learning_rate": 7.402629008865763e-05, + "loss": 0.4911, + "step": 96630 + }, + { + "epoch": 2.150997150997151, + "grad_norm": 0.45277827978134155, + "learning_rate": 7.399008194051644e-05, + "loss": 0.5383, + "step": 96640 + }, + { + "epoch": 2.1512197293447293, + "grad_norm": 0.4843733310699463, + "learning_rate": 7.395388063985729e-05, + "loss": 0.3939, + "step": 96650 + }, + { + "epoch": 2.1514423076923075, + "grad_norm": 0.7864764928817749, + "learning_rate": 7.391768618864745e-05, + "loss": 0.4114, + "step": 96660 + }, + { + "epoch": 2.151664886039886, + "grad_norm": 0.6666766405105591, + "learning_rate": 7.388149858885378e-05, + "loss": 0.4779, + "step": 96670 + }, + { + "epoch": 2.1518874643874644, + "grad_norm": 0.6841953992843628, + "learning_rate": 7.384531784244271e-05, + "loss": 0.4747, + "step": 96680 + }, + { + "epoch": 2.1521100427350426, + "grad_norm": 0.7607446312904358, + "learning_rate": 7.380914395138033e-05, + "loss": 0.4095, + "step": 96690 + }, + { + "epoch": 2.1523326210826212, + "grad_norm": 0.6345654129981995, + "learning_rate": 7.377297691763239e-05, + "loss": 0.5492, + "step": 96700 + }, + { + "epoch": 2.1525551994301995, + "grad_norm": 0.5190750956535339, + "learning_rate": 7.373681674316426e-05, + "loss": 0.457, + "step": 96710 + }, + { + "epoch": 2.1527777777777777, + "grad_norm": 0.3715447783470154, + "learning_rate": 7.370066342994081e-05, + "loss": 0.4415, + "step": 96720 + }, + { + "epoch": 2.1530003561253563, + "grad_norm": 0.715945303440094, + "learning_rate": 7.36645169799268e-05, + "loss": 0.4283, + "step": 96730 + }, + { + "epoch": 2.1532229344729346, + "grad_norm": 0.6834224462509155, + "learning_rate": 7.362837739508629e-05, + "loss": 0.551, + "step": 96740 + }, + { + "epoch": 2.1534455128205128, + "grad_norm": 0.46780624985694885, + "learning_rate": 7.359224467738317e-05, + "loss": 0.383, + "step": 96750 + }, + { + "epoch": 2.153668091168091, + "grad_norm": 0.6412452459335327, + "learning_rate": 7.355611882878097e-05, + "loss": 0.4973, + "step": 96760 + }, + { + "epoch": 2.1538906695156697, + "grad_norm": 0.44822946190834045, + "learning_rate": 7.35199998512428e-05, + "loss": 0.4009, + "step": 96770 + }, + { + "epoch": 2.154113247863248, + "grad_norm": 0.4693109095096588, + "learning_rate": 7.348388774673143e-05, + "loss": 0.4893, + "step": 96780 + }, + { + "epoch": 2.154335826210826, + "grad_norm": 0.8119006156921387, + "learning_rate": 7.344778251720911e-05, + "loss": 0.4104, + "step": 96790 + }, + { + "epoch": 2.1545584045584047, + "grad_norm": 0.5076255798339844, + "learning_rate": 7.341168416463789e-05, + "loss": 0.5757, + "step": 96800 + }, + { + "epoch": 2.154780982905983, + "grad_norm": 0.5202703475952148, + "learning_rate": 7.337559269097938e-05, + "loss": 0.5638, + "step": 96810 + }, + { + "epoch": 2.155003561253561, + "grad_norm": 0.5872973203659058, + "learning_rate": 7.333950809819484e-05, + "loss": 0.4893, + "step": 96820 + }, + { + "epoch": 2.1552261396011394, + "grad_norm": 0.4134224057197571, + "learning_rate": 7.33034303882451e-05, + "loss": 0.5044, + "step": 96830 + }, + { + "epoch": 2.155448717948718, + "grad_norm": 0.48488810658454895, + "learning_rate": 7.326735956309074e-05, + "loss": 0.4659, + "step": 96840 + }, + { + "epoch": 2.1556712962962963, + "grad_norm": 0.5602866411209106, + "learning_rate": 7.323129562469174e-05, + "loss": 0.4519, + "step": 96850 + }, + { + "epoch": 2.1558938746438745, + "grad_norm": 0.3875146508216858, + "learning_rate": 7.319523857500798e-05, + "loss": 0.4158, + "step": 96860 + }, + { + "epoch": 2.156116452991453, + "grad_norm": 0.5519022345542908, + "learning_rate": 7.315918841599869e-05, + "loss": 0.4951, + "step": 96870 + }, + { + "epoch": 2.1563390313390314, + "grad_norm": 0.7909629344940186, + "learning_rate": 7.312314514962295e-05, + "loss": 0.5099, + "step": 96880 + }, + { + "epoch": 2.1565616096866096, + "grad_norm": 0.334187388420105, + "learning_rate": 7.308710877783937e-05, + "loss": 0.4609, + "step": 96890 + }, + { + "epoch": 2.1567841880341883, + "grad_norm": 0.3953896462917328, + "learning_rate": 7.305107930260619e-05, + "loss": 0.4981, + "step": 96900 + }, + { + "epoch": 2.1570067663817665, + "grad_norm": 0.5780625939369202, + "learning_rate": 7.30150567258813e-05, + "loss": 0.3871, + "step": 96910 + }, + { + "epoch": 2.1572293447293447, + "grad_norm": 0.7589643597602844, + "learning_rate": 7.297904104962223e-05, + "loss": 0.4304, + "step": 96920 + }, + { + "epoch": 2.157451923076923, + "grad_norm": 0.584696888923645, + "learning_rate": 7.2943032275786e-05, + "loss": 0.4656, + "step": 96930 + }, + { + "epoch": 2.1576745014245016, + "grad_norm": 0.6244634985923767, + "learning_rate": 7.29070304063294e-05, + "loss": 0.4773, + "step": 96940 + }, + { + "epoch": 2.15789707977208, + "grad_norm": 0.6585376858711243, + "learning_rate": 7.287103544320881e-05, + "loss": 0.4357, + "step": 96950 + }, + { + "epoch": 2.158119658119658, + "grad_norm": 0.605294942855835, + "learning_rate": 7.283504738838022e-05, + "loss": 0.5408, + "step": 96960 + }, + { + "epoch": 2.1583422364672367, + "grad_norm": 0.5022411346435547, + "learning_rate": 7.279906624379928e-05, + "loss": 0.5125, + "step": 96970 + }, + { + "epoch": 2.158564814814815, + "grad_norm": 0.6028233766555786, + "learning_rate": 7.276309201142129e-05, + "loss": 0.5237, + "step": 96980 + }, + { + "epoch": 2.158787393162393, + "grad_norm": 0.5513371229171753, + "learning_rate": 7.272712469320094e-05, + "loss": 0.4935, + "step": 96990 + }, + { + "epoch": 2.1590099715099713, + "grad_norm": 0.922234833240509, + "learning_rate": 7.269116429109291e-05, + "loss": 0.4227, + "step": 97000 + }, + { + "epoch": 2.15923254985755, + "grad_norm": 0.6376217007637024, + "learning_rate": 7.265521080705115e-05, + "loss": 0.4728, + "step": 97010 + }, + { + "epoch": 2.159455128205128, + "grad_norm": 0.5466942191123962, + "learning_rate": 7.261926424302949e-05, + "loss": 0.4958, + "step": 97020 + }, + { + "epoch": 2.1596777065527064, + "grad_norm": 0.4967202842235565, + "learning_rate": 7.25833246009813e-05, + "loss": 0.4079, + "step": 97030 + }, + { + "epoch": 2.159900284900285, + "grad_norm": 0.48792919516563416, + "learning_rate": 7.254739188285955e-05, + "loss": 0.419, + "step": 97040 + }, + { + "epoch": 2.1601228632478633, + "grad_norm": 0.6444426774978638, + "learning_rate": 7.251146609061685e-05, + "loss": 0.355, + "step": 97050 + }, + { + "epoch": 2.16025641025641, + "eval_loss": 0.5352727174758911, + "eval_runtime": 337.3252, + "eval_samples_per_second": 7.011, + "eval_steps_per_second": 7.011, + "step": 97056 + }, + { + "epoch": 2.1603454415954415, + "grad_norm": 0.597774088382721, + "learning_rate": 7.247554722620552e-05, + "loss": 0.5716, + "step": 97060 + }, + { + "epoch": 2.16056801994302, + "grad_norm": 0.6890891790390015, + "learning_rate": 7.243963529157731e-05, + "loss": 0.5159, + "step": 97070 + }, + { + "epoch": 2.1607905982905984, + "grad_norm": 0.5812705755233765, + "learning_rate": 7.240373028868372e-05, + "loss": 0.3916, + "step": 97080 + }, + { + "epoch": 2.1610131766381766, + "grad_norm": 0.6706557869911194, + "learning_rate": 7.236783221947589e-05, + "loss": 0.5548, + "step": 97090 + }, + { + "epoch": 2.161235754985755, + "grad_norm": 0.5046707391738892, + "learning_rate": 7.233194108590455e-05, + "loss": 0.3913, + "step": 97100 + }, + { + "epoch": 2.1614583333333335, + "grad_norm": 0.40967047214508057, + "learning_rate": 7.229605688992002e-05, + "loss": 0.4302, + "step": 97110 + }, + { + "epoch": 2.1616809116809117, + "grad_norm": 0.5742045640945435, + "learning_rate": 7.22601796334724e-05, + "loss": 0.4413, + "step": 97120 + }, + { + "epoch": 2.16190349002849, + "grad_norm": 0.6036415100097656, + "learning_rate": 7.222430931851109e-05, + "loss": 0.4734, + "step": 97130 + }, + { + "epoch": 2.1621260683760686, + "grad_norm": 0.7149750590324402, + "learning_rate": 7.218844594698552e-05, + "loss": 0.5379, + "step": 97140 + }, + { + "epoch": 2.162348646723647, + "grad_norm": 0.48467162251472473, + "learning_rate": 7.215258952084434e-05, + "loss": 0.4919, + "step": 97150 + }, + { + "epoch": 2.162571225071225, + "grad_norm": 0.588701605796814, + "learning_rate": 7.21167400420361e-05, + "loss": 0.3474, + "step": 97160 + }, + { + "epoch": 2.1627938034188032, + "grad_norm": 0.5003872513771057, + "learning_rate": 7.208089751250891e-05, + "loss": 0.5029, + "step": 97170 + }, + { + "epoch": 2.163016381766382, + "grad_norm": 0.8031962513923645, + "learning_rate": 7.204506193421045e-05, + "loss": 0.5116, + "step": 97180 + }, + { + "epoch": 2.16323896011396, + "grad_norm": 0.5941150188446045, + "learning_rate": 7.200923330908811e-05, + "loss": 0.5218, + "step": 97190 + }, + { + "epoch": 2.1634615384615383, + "grad_norm": 0.6272960901260376, + "learning_rate": 7.197341163908883e-05, + "loss": 0.3902, + "step": 97200 + }, + { + "epoch": 2.163684116809117, + "grad_norm": 0.6789999604225159, + "learning_rate": 7.193759692615914e-05, + "loss": 0.5537, + "step": 97210 + }, + { + "epoch": 2.163906695156695, + "grad_norm": 0.5254083871841431, + "learning_rate": 7.190178917224525e-05, + "loss": 0.4646, + "step": 97220 + }, + { + "epoch": 2.1641292735042734, + "grad_norm": 0.5182301998138428, + "learning_rate": 7.186598837929302e-05, + "loss": 0.484, + "step": 97230 + }, + { + "epoch": 2.164351851851852, + "grad_norm": 0.7297257781028748, + "learning_rate": 7.183019454924784e-05, + "loss": 0.5208, + "step": 97240 + }, + { + "epoch": 2.1645744301994303, + "grad_norm": 0.40275275707244873, + "learning_rate": 7.179440768405492e-05, + "loss": 0.487, + "step": 97250 + }, + { + "epoch": 2.1647970085470085, + "grad_norm": 0.6008617281913757, + "learning_rate": 7.175862778565876e-05, + "loss": 0.4279, + "step": 97260 + }, + { + "epoch": 2.1650195868945867, + "grad_norm": 0.6788070201873779, + "learning_rate": 7.172285485600374e-05, + "loss": 0.4198, + "step": 97270 + }, + { + "epoch": 2.1652421652421654, + "grad_norm": 0.4225766062736511, + "learning_rate": 7.168708889703388e-05, + "loss": 0.5602, + "step": 97280 + }, + { + "epoch": 2.1654647435897436, + "grad_norm": 0.7581738233566284, + "learning_rate": 7.165132991069256e-05, + "loss": 0.4849, + "step": 97290 + }, + { + "epoch": 2.165687321937322, + "grad_norm": 0.4646041691303253, + "learning_rate": 7.161557789892308e-05, + "loss": 0.5735, + "step": 97300 + }, + { + "epoch": 2.1659099002849005, + "grad_norm": 0.47280997037887573, + "learning_rate": 7.157983286366816e-05, + "loss": 0.3521, + "step": 97310 + }, + { + "epoch": 2.1661324786324787, + "grad_norm": 0.3988167345523834, + "learning_rate": 7.154409480687027e-05, + "loss": 0.4472, + "step": 97320 + }, + { + "epoch": 2.166355056980057, + "grad_norm": 0.4632010757923126, + "learning_rate": 7.150836373047145e-05, + "loss": 0.4917, + "step": 97330 + }, + { + "epoch": 2.166577635327635, + "grad_norm": 0.5823772549629211, + "learning_rate": 7.147263963641337e-05, + "loss": 0.6083, + "step": 97340 + }, + { + "epoch": 2.166800213675214, + "grad_norm": 0.7309155464172363, + "learning_rate": 7.14369225266372e-05, + "loss": 0.4442, + "step": 97350 + }, + { + "epoch": 2.167022792022792, + "grad_norm": 0.5929965972900391, + "learning_rate": 7.140121240308393e-05, + "loss": 0.4356, + "step": 97360 + }, + { + "epoch": 2.1672453703703702, + "grad_norm": 0.8357145190238953, + "learning_rate": 7.136550926769403e-05, + "loss": 0.3463, + "step": 97370 + }, + { + "epoch": 2.167467948717949, + "grad_norm": 0.86272794008255, + "learning_rate": 7.132981312240774e-05, + "loss": 0.4568, + "step": 97380 + }, + { + "epoch": 2.167690527065527, + "grad_norm": 0.656522274017334, + "learning_rate": 7.129412396916469e-05, + "loss": 0.4935, + "step": 97390 + }, + { + "epoch": 2.1679131054131053, + "grad_norm": 0.3058460056781769, + "learning_rate": 7.125844180990427e-05, + "loss": 0.4591, + "step": 97400 + }, + { + "epoch": 2.168135683760684, + "grad_norm": 0.6548540592193604, + "learning_rate": 7.122276664656553e-05, + "loss": 0.4842, + "step": 97410 + }, + { + "epoch": 2.168358262108262, + "grad_norm": 0.5646089315414429, + "learning_rate": 7.118709848108716e-05, + "loss": 0.448, + "step": 97420 + }, + { + "epoch": 2.1685808404558404, + "grad_norm": 0.5614109039306641, + "learning_rate": 7.11514373154072e-05, + "loss": 0.486, + "step": 97430 + }, + { + "epoch": 2.1688034188034186, + "grad_norm": 0.6399925351142883, + "learning_rate": 7.111578315146365e-05, + "loss": 0.4793, + "step": 97440 + }, + { + "epoch": 2.1690259971509973, + "grad_norm": 0.6552419662475586, + "learning_rate": 7.108013599119394e-05, + "loss": 0.4544, + "step": 97450 + }, + { + "epoch": 2.1692485754985755, + "grad_norm": 0.5547710061073303, + "learning_rate": 7.104449583653518e-05, + "loss": 0.4983, + "step": 97460 + }, + { + "epoch": 2.1694711538461537, + "grad_norm": 0.6505656242370605, + "learning_rate": 7.100886268942411e-05, + "loss": 0.4198, + "step": 97470 + }, + { + "epoch": 2.169693732193732, + "grad_norm": 0.761031985282898, + "learning_rate": 7.097323655179708e-05, + "loss": 0.5211, + "step": 97480 + }, + { + "epoch": 2.1699163105413106, + "grad_norm": 0.5829175710678101, + "learning_rate": 7.093761742558993e-05, + "loss": 0.5456, + "step": 97490 + }, + { + "epoch": 2.170138888888889, + "grad_norm": 0.6106030344963074, + "learning_rate": 7.090200531273832e-05, + "loss": 0.4444, + "step": 97500 + }, + { + "epoch": 2.170361467236467, + "grad_norm": 0.5242830514907837, + "learning_rate": 7.086640021517741e-05, + "loss": 0.4114, + "step": 97510 + }, + { + "epoch": 2.1705840455840457, + "grad_norm": 0.4403768479824066, + "learning_rate": 7.08308021348421e-05, + "loss": 0.4941, + "step": 97520 + }, + { + "epoch": 2.170806623931624, + "grad_norm": 0.6011183261871338, + "learning_rate": 7.079521107366669e-05, + "loss": 0.36, + "step": 97530 + }, + { + "epoch": 2.171029202279202, + "grad_norm": 0.6127752661705017, + "learning_rate": 7.075962703358527e-05, + "loss": 0.4552, + "step": 97540 + }, + { + "epoch": 2.171251780626781, + "grad_norm": 0.5007062554359436, + "learning_rate": 7.072405001653153e-05, + "loss": 0.4492, + "step": 97550 + }, + { + "epoch": 2.171474358974359, + "grad_norm": 0.5939716696739197, + "learning_rate": 7.06884800244388e-05, + "loss": 0.5014, + "step": 97560 + }, + { + "epoch": 2.1716969373219372, + "grad_norm": 0.6019341349601746, + "learning_rate": 7.065291705923984e-05, + "loss": 0.4862, + "step": 97570 + }, + { + "epoch": 2.1719195156695155, + "grad_norm": 0.5671701431274414, + "learning_rate": 7.061736112286728e-05, + "loss": 0.5061, + "step": 97580 + }, + { + "epoch": 2.172142094017094, + "grad_norm": 0.670473039150238, + "learning_rate": 7.058181221725322e-05, + "loss": 0.4381, + "step": 97590 + }, + { + "epoch": 2.1723646723646723, + "grad_norm": 0.33992794156074524, + "learning_rate": 7.054627034432944e-05, + "loss": 0.3826, + "step": 97600 + }, + { + "epoch": 2.1725872507122506, + "grad_norm": 0.4417734444141388, + "learning_rate": 7.051073550602731e-05, + "loss": 0.4528, + "step": 97610 + }, + { + "epoch": 2.1728098290598292, + "grad_norm": 0.40786826610565186, + "learning_rate": 7.047520770427787e-05, + "loss": 0.4451, + "step": 97620 + }, + { + "epoch": 2.1730324074074074, + "grad_norm": 0.3992997407913208, + "learning_rate": 7.043968694101162e-05, + "loss": 0.5139, + "step": 97630 + }, + { + "epoch": 2.1732549857549857, + "grad_norm": 0.45102357864379883, + "learning_rate": 7.040417321815884e-05, + "loss": 0.5253, + "step": 97640 + }, + { + "epoch": 2.173477564102564, + "grad_norm": 0.7903671264648438, + "learning_rate": 7.036866653764944e-05, + "loss": 0.4218, + "step": 97650 + }, + { + "epoch": 2.1737001424501425, + "grad_norm": 0.5459246039390564, + "learning_rate": 7.033316690141278e-05, + "loss": 0.4656, + "step": 97660 + }, + { + "epoch": 2.1739227207977208, + "grad_norm": 0.47902384400367737, + "learning_rate": 7.029767431137794e-05, + "loss": 0.5076, + "step": 97670 + }, + { + "epoch": 2.174145299145299, + "grad_norm": 0.7973486185073853, + "learning_rate": 7.02621887694737e-05, + "loss": 0.4914, + "step": 97680 + }, + { + "epoch": 2.1743678774928776, + "grad_norm": 0.6195580959320068, + "learning_rate": 7.022671027762837e-05, + "loss": 0.4633, + "step": 97690 + }, + { + "epoch": 2.174590455840456, + "grad_norm": 0.8144098520278931, + "learning_rate": 7.019123883776979e-05, + "loss": 0.4831, + "step": 97700 + }, + { + "epoch": 2.174813034188034, + "grad_norm": 0.47490057349205017, + "learning_rate": 7.015577445182555e-05, + "loss": 0.3452, + "step": 97710 + }, + { + "epoch": 2.1750356125356127, + "grad_norm": 0.6486740112304688, + "learning_rate": 7.012031712172283e-05, + "loss": 0.4742, + "step": 97720 + }, + { + "epoch": 2.175258190883191, + "grad_norm": 0.31936442852020264, + "learning_rate": 7.008486684938837e-05, + "loss": 0.467, + "step": 97730 + }, + { + "epoch": 2.175480769230769, + "grad_norm": 0.559054970741272, + "learning_rate": 7.004942363674864e-05, + "loss": 0.5024, + "step": 97740 + }, + { + "epoch": 2.1757033475783474, + "grad_norm": 0.48571863770484924, + "learning_rate": 7.001398748572958e-05, + "loss": 0.5012, + "step": 97750 + }, + { + "epoch": 2.175925925925926, + "grad_norm": 0.38457971811294556, + "learning_rate": 6.997855839825695e-05, + "loss": 0.4385, + "step": 97760 + }, + { + "epoch": 2.1761485042735043, + "grad_norm": 0.526547372341156, + "learning_rate": 6.99431363762558e-05, + "loss": 0.5034, + "step": 97770 + }, + { + "epoch": 2.1763710826210825, + "grad_norm": 0.4569105803966522, + "learning_rate": 6.990772142165118e-05, + "loss": 0.4015, + "step": 97780 + }, + { + "epoch": 2.176593660968661, + "grad_norm": 0.7775120139122009, + "learning_rate": 6.987231353636741e-05, + "loss": 0.4182, + "step": 97790 + }, + { + "epoch": 2.1768162393162394, + "grad_norm": 0.7812454104423523, + "learning_rate": 6.983691272232861e-05, + "loss": 0.3933, + "step": 97800 + }, + { + "epoch": 2.1770388176638176, + "grad_norm": 0.7553454041481018, + "learning_rate": 6.980151898145858e-05, + "loss": 0.4667, + "step": 97810 + }, + { + "epoch": 2.177261396011396, + "grad_norm": 0.45227760076522827, + "learning_rate": 6.976613231568057e-05, + "loss": 0.4502, + "step": 97820 + }, + { + "epoch": 2.1774839743589745, + "grad_norm": 0.9420391917228699, + "learning_rate": 6.97307527269176e-05, + "loss": 0.5162, + "step": 97830 + }, + { + "epoch": 2.1777065527065527, + "grad_norm": 0.39840957522392273, + "learning_rate": 6.969538021709212e-05, + "loss": 0.4783, + "step": 97840 + }, + { + "epoch": 2.177929131054131, + "grad_norm": 0.6985434889793396, + "learning_rate": 6.966001478812636e-05, + "loss": 0.4801, + "step": 97850 + }, + { + "epoch": 2.1781517094017095, + "grad_norm": 0.6731042265892029, + "learning_rate": 6.962465644194207e-05, + "loss": 0.5688, + "step": 97860 + }, + { + "epoch": 2.1783742877492878, + "grad_norm": 0.608867347240448, + "learning_rate": 6.95893051804607e-05, + "loss": 0.4455, + "step": 97870 + }, + { + "epoch": 2.178596866096866, + "grad_norm": 0.4440484046936035, + "learning_rate": 6.955396100560325e-05, + "loss": 0.4191, + "step": 97880 + }, + { + "epoch": 2.1788194444444446, + "grad_norm": 0.5435939431190491, + "learning_rate": 6.951862391929033e-05, + "loss": 0.4844, + "step": 97890 + }, + { + "epoch": 2.179042022792023, + "grad_norm": 0.7728957533836365, + "learning_rate": 6.948329392344228e-05, + "loss": 0.5298, + "step": 97900 + }, + { + "epoch": 2.179264601139601, + "grad_norm": 0.5529747605323792, + "learning_rate": 6.944797101997889e-05, + "loss": 0.5869, + "step": 97910 + }, + { + "epoch": 2.1794871794871793, + "grad_norm": 0.5707988739013672, + "learning_rate": 6.941265521081954e-05, + "loss": 0.4759, + "step": 97920 + }, + { + "epoch": 2.179709757834758, + "grad_norm": 0.4855974614620209, + "learning_rate": 6.937734649788343e-05, + "loss": 0.4444, + "step": 97930 + }, + { + "epoch": 2.179932336182336, + "grad_norm": 0.46950584650039673, + "learning_rate": 6.934204488308924e-05, + "loss": 0.4483, + "step": 97940 + }, + { + "epoch": 2.1801549145299144, + "grad_norm": 0.7412520051002502, + "learning_rate": 6.930675036835528e-05, + "loss": 0.5823, + "step": 97950 + }, + { + "epoch": 2.180377492877493, + "grad_norm": 0.4875216484069824, + "learning_rate": 6.927146295559952e-05, + "loss": 0.4769, + "step": 97960 + }, + { + "epoch": 2.1806000712250713, + "grad_norm": 0.5370801687240601, + "learning_rate": 6.923618264673953e-05, + "loss": 0.5906, + "step": 97970 + }, + { + "epoch": 2.1808226495726495, + "grad_norm": 0.42827099561691284, + "learning_rate": 6.920090944369235e-05, + "loss": 0.5098, + "step": 97980 + }, + { + "epoch": 2.1810452279202277, + "grad_norm": 0.4902758002281189, + "learning_rate": 6.916564334837485e-05, + "loss": 0.4485, + "step": 97990 + }, + { + "epoch": 2.1812678062678064, + "grad_norm": 0.3472527861595154, + "learning_rate": 6.913038436270338e-05, + "loss": 0.3637, + "step": 98000 + }, + { + "epoch": 2.1814903846153846, + "grad_norm": 0.45957064628601074, + "learning_rate": 6.909513248859396e-05, + "loss": 0.4666, + "step": 98010 + }, + { + "epoch": 2.181712962962963, + "grad_norm": 0.7909951210021973, + "learning_rate": 6.905988772796222e-05, + "loss": 0.4711, + "step": 98020 + }, + { + "epoch": 2.1819355413105415, + "grad_norm": 0.5959575772285461, + "learning_rate": 6.902465008272337e-05, + "loss": 0.4394, + "step": 98030 + }, + { + "epoch": 2.1821581196581197, + "grad_norm": 0.5832464098930359, + "learning_rate": 6.89894195547923e-05, + "loss": 0.4472, + "step": 98040 + }, + { + "epoch": 2.182380698005698, + "grad_norm": 0.5050378441810608, + "learning_rate": 6.895419614608346e-05, + "loss": 0.4717, + "step": 98050 + }, + { + "epoch": 2.1826032763532766, + "grad_norm": 0.3996773958206177, + "learning_rate": 6.891897985851077e-05, + "loss": 0.4428, + "step": 98060 + }, + { + "epoch": 2.1828258547008548, + "grad_norm": 0.8722583055496216, + "learning_rate": 6.888377069398804e-05, + "loss": 0.5293, + "step": 98070 + }, + { + "epoch": 2.183048433048433, + "grad_norm": 0.47751516103744507, + "learning_rate": 6.884856865442855e-05, + "loss": 0.4147, + "step": 98080 + }, + { + "epoch": 2.183271011396011, + "grad_norm": 0.5492531061172485, + "learning_rate": 6.881337374174521e-05, + "loss": 0.4396, + "step": 98090 + }, + { + "epoch": 2.18349358974359, + "grad_norm": 0.4488508403301239, + "learning_rate": 6.877818595785053e-05, + "loss": 0.5729, + "step": 98100 + }, + { + "epoch": 2.183716168091168, + "grad_norm": 0.5802498459815979, + "learning_rate": 6.874300530465671e-05, + "loss": 0.468, + "step": 98110 + }, + { + "epoch": 2.1839387464387463, + "grad_norm": 0.6385955214500427, + "learning_rate": 6.870783178407538e-05, + "loss": 0.5316, + "step": 98120 + }, + { + "epoch": 2.184161324786325, + "grad_norm": 0.6732540726661682, + "learning_rate": 6.867266539801796e-05, + "loss": 0.5995, + "step": 98130 + }, + { + "epoch": 2.184383903133903, + "grad_norm": 0.8565722703933716, + "learning_rate": 6.863750614839537e-05, + "loss": 0.4553, + "step": 98140 + }, + { + "epoch": 2.1846064814814814, + "grad_norm": 0.8222180604934692, + "learning_rate": 6.860235403711827e-05, + "loss": 0.5389, + "step": 98150 + }, + { + "epoch": 2.1848290598290596, + "grad_norm": 0.47665002942085266, + "learning_rate": 6.856720906609681e-05, + "loss": 0.5462, + "step": 98160 + }, + { + "epoch": 2.1850516381766383, + "grad_norm": 0.5633779168128967, + "learning_rate": 6.853207123724085e-05, + "loss": 0.4064, + "step": 98170 + }, + { + "epoch": 2.1852742165242165, + "grad_norm": 0.7037586569786072, + "learning_rate": 6.849694055245974e-05, + "loss": 0.4568, + "step": 98180 + }, + { + "epoch": 2.1854967948717947, + "grad_norm": 0.6584839224815369, + "learning_rate": 6.846181701366257e-05, + "loss": 0.4715, + "step": 98190 + }, + { + "epoch": 2.1857193732193734, + "grad_norm": 0.4015384912490845, + "learning_rate": 6.842670062275789e-05, + "loss": 0.4596, + "step": 98200 + }, + { + "epoch": 2.1859419515669516, + "grad_norm": 0.5888437628746033, + "learning_rate": 6.8391591381654e-05, + "loss": 0.4623, + "step": 98210 + }, + { + "epoch": 2.18616452991453, + "grad_norm": 0.5876869559288025, + "learning_rate": 6.835648929225879e-05, + "loss": 0.6175, + "step": 98220 + }, + { + "epoch": 2.1863871082621085, + "grad_norm": 0.43403884768486023, + "learning_rate": 6.832139435647971e-05, + "loss": 0.4205, + "step": 98230 + }, + { + "epoch": 2.1866096866096867, + "grad_norm": 0.6193419694900513, + "learning_rate": 6.828630657622386e-05, + "loss": 0.4591, + "step": 98240 + }, + { + "epoch": 2.186832264957265, + "grad_norm": 0.6113179922103882, + "learning_rate": 6.8251225953398e-05, + "loss": 0.5874, + "step": 98250 + }, + { + "epoch": 2.187054843304843, + "grad_norm": 0.33731210231781006, + "learning_rate": 6.821615248990831e-05, + "loss": 0.4017, + "step": 98260 + }, + { + "epoch": 2.187277421652422, + "grad_norm": 0.556659460067749, + "learning_rate": 6.818108618766077e-05, + "loss": 0.5342, + "step": 98270 + }, + { + "epoch": 2.1875, + "grad_norm": 0.5533732771873474, + "learning_rate": 6.814602704856092e-05, + "loss": 0.4999, + "step": 98280 + }, + { + "epoch": 2.187722578347578, + "grad_norm": 0.6558281779289246, + "learning_rate": 6.811097507451391e-05, + "loss": 0.4326, + "step": 98290 + }, + { + "epoch": 2.187945156695157, + "grad_norm": 0.44423213601112366, + "learning_rate": 6.807593026742456e-05, + "loss": 0.4674, + "step": 98300 + }, + { + "epoch": 2.188167735042735, + "grad_norm": 0.5235666036605835, + "learning_rate": 6.804089262919706e-05, + "loss": 0.5688, + "step": 98310 + }, + { + "epoch": 2.1883903133903133, + "grad_norm": 0.5956048965454102, + "learning_rate": 6.80058621617355e-05, + "loss": 0.555, + "step": 98320 + }, + { + "epoch": 2.1886128917378915, + "grad_norm": 0.46237239241600037, + "learning_rate": 6.797083886694353e-05, + "loss": 0.5437, + "step": 98330 + }, + { + "epoch": 2.18883547008547, + "grad_norm": 0.4554048180580139, + "learning_rate": 6.793582274672416e-05, + "loss": 0.5219, + "step": 98340 + }, + { + "epoch": 2.1890580484330484, + "grad_norm": 0.6391506195068359, + "learning_rate": 6.790081380298032e-05, + "loss": 0.4697, + "step": 98350 + }, + { + "epoch": 2.1892806267806266, + "grad_norm": 0.4395703971385956, + "learning_rate": 6.78658120376144e-05, + "loss": 0.4557, + "step": 98360 + }, + { + "epoch": 2.1895032051282053, + "grad_norm": 0.6101352572441101, + "learning_rate": 6.783081745252839e-05, + "loss": 0.5763, + "step": 98370 + }, + { + "epoch": 2.1897257834757835, + "grad_norm": 0.37331920862197876, + "learning_rate": 6.7795830049624e-05, + "loss": 0.4456, + "step": 98380 + }, + { + "epoch": 2.1899483618233617, + "grad_norm": 0.6069123148918152, + "learning_rate": 6.776084983080247e-05, + "loss": 0.4583, + "step": 98390 + }, + { + "epoch": 2.1901709401709404, + "grad_norm": 0.6946260333061218, + "learning_rate": 6.772587679796456e-05, + "loss": 0.4695, + "step": 98400 + }, + { + "epoch": 2.1903935185185186, + "grad_norm": 0.44884660840034485, + "learning_rate": 6.769091095301079e-05, + "loss": 0.4295, + "step": 98410 + }, + { + "epoch": 2.190616096866097, + "grad_norm": 0.6085039973258972, + "learning_rate": 6.765595229784123e-05, + "loss": 0.4646, + "step": 98420 + }, + { + "epoch": 2.190838675213675, + "grad_norm": 0.300814688205719, + "learning_rate": 6.762100083435562e-05, + "loss": 0.4507, + "step": 98430 + }, + { + "epoch": 2.1910612535612537, + "grad_norm": 0.4522894024848938, + "learning_rate": 6.758605656445315e-05, + "loss": 0.4733, + "step": 98440 + }, + { + "epoch": 2.191283831908832, + "grad_norm": 0.6713006496429443, + "learning_rate": 6.755111949003277e-05, + "loss": 0.5671, + "step": 98450 + }, + { + "epoch": 2.19150641025641, + "grad_norm": 0.6300404667854309, + "learning_rate": 6.751618961299296e-05, + "loss": 0.5089, + "step": 98460 + }, + { + "epoch": 2.191728988603989, + "grad_norm": 0.613237738609314, + "learning_rate": 6.748126693523193e-05, + "loss": 0.4503, + "step": 98470 + }, + { + "epoch": 2.191951566951567, + "grad_norm": 0.6103219985961914, + "learning_rate": 6.74463514586473e-05, + "loss": 0.4693, + "step": 98480 + }, + { + "epoch": 2.1921741452991452, + "grad_norm": 0.7985731363296509, + "learning_rate": 6.741144318513641e-05, + "loss": 0.4554, + "step": 98490 + }, + { + "epoch": 2.1923967236467234, + "grad_norm": 0.6526599526405334, + "learning_rate": 6.737654211659627e-05, + "loss": 0.5476, + "step": 98500 + }, + { + "epoch": 2.192619301994302, + "grad_norm": 0.6740187406539917, + "learning_rate": 6.734164825492339e-05, + "loss": 0.487, + "step": 98510 + }, + { + "epoch": 2.1928418803418803, + "grad_norm": 0.703872561454773, + "learning_rate": 6.730676160201394e-05, + "loss": 0.4868, + "step": 98520 + }, + { + "epoch": 2.1930644586894585, + "grad_norm": 0.702970027923584, + "learning_rate": 6.727188215976376e-05, + "loss": 0.4743, + "step": 98530 + }, + { + "epoch": 2.193287037037037, + "grad_norm": 0.7146590948104858, + "learning_rate": 6.72370099300681e-05, + "loss": 0.3917, + "step": 98540 + }, + { + "epoch": 2.1935096153846154, + "grad_norm": 0.4664157032966614, + "learning_rate": 6.7202144914822e-05, + "loss": 0.4909, + "step": 98550 + }, + { + "epoch": 2.1937321937321936, + "grad_norm": 0.451846718788147, + "learning_rate": 6.716728711592013e-05, + "loss": 0.4031, + "step": 98560 + }, + { + "epoch": 2.1939547720797723, + "grad_norm": 0.9357752799987793, + "learning_rate": 6.713243653525653e-05, + "loss": 0.4001, + "step": 98570 + }, + { + "epoch": 2.1941773504273505, + "grad_norm": 0.5053947567939758, + "learning_rate": 6.709759317472513e-05, + "loss": 0.4501, + "step": 98580 + }, + { + "epoch": 2.1943999287749287, + "grad_norm": 0.5573664903640747, + "learning_rate": 6.706275703621932e-05, + "loss": 0.4847, + "step": 98590 + }, + { + "epoch": 2.194622507122507, + "grad_norm": 0.508842408657074, + "learning_rate": 6.70279281216321e-05, + "loss": 0.4954, + "step": 98600 + }, + { + "epoch": 2.1948450854700856, + "grad_norm": 0.5140504240989685, + "learning_rate": 6.69931064328562e-05, + "loss": 0.4875, + "step": 98610 + }, + { + "epoch": 2.195067663817664, + "grad_norm": 0.7873440384864807, + "learning_rate": 6.69582919717837e-05, + "loss": 0.4963, + "step": 98620 + }, + { + "epoch": 2.195290242165242, + "grad_norm": 0.8808897137641907, + "learning_rate": 6.692348474030652e-05, + "loss": 0.4906, + "step": 98630 + }, + { + "epoch": 2.1955128205128207, + "grad_norm": 0.5072993040084839, + "learning_rate": 6.688868474031614e-05, + "loss": 0.3668, + "step": 98640 + }, + { + "epoch": 2.195735398860399, + "grad_norm": 0.5444533228874207, + "learning_rate": 6.68538919737036e-05, + "loss": 0.4285, + "step": 98650 + }, + { + "epoch": 2.195957977207977, + "grad_norm": 0.4028169810771942, + "learning_rate": 6.681910644235956e-05, + "loss": 0.5166, + "step": 98660 + }, + { + "epoch": 2.1961805555555554, + "grad_norm": 0.655707061290741, + "learning_rate": 6.678432814817437e-05, + "loss": 0.5236, + "step": 98670 + }, + { + "epoch": 2.196403133903134, + "grad_norm": 0.45885083079338074, + "learning_rate": 6.674955709303778e-05, + "loss": 0.3951, + "step": 98680 + }, + { + "epoch": 2.1966257122507122, + "grad_norm": 0.7110733985900879, + "learning_rate": 6.671479327883934e-05, + "loss": 0.4361, + "step": 98690 + }, + { + "epoch": 2.1968482905982905, + "grad_norm": 0.5791854858398438, + "learning_rate": 6.668003670746823e-05, + "loss": 0.4113, + "step": 98700 + }, + { + "epoch": 2.197070868945869, + "grad_norm": 0.6926530599594116, + "learning_rate": 6.664528738081298e-05, + "loss": 0.4975, + "step": 98710 + }, + { + "epoch": 2.1972934472934473, + "grad_norm": 0.48590734601020813, + "learning_rate": 6.661054530076198e-05, + "loss": 0.5481, + "step": 98720 + }, + { + "epoch": 2.1975160256410255, + "grad_norm": 0.49652108550071716, + "learning_rate": 6.657581046920316e-05, + "loss": 0.485, + "step": 98730 + }, + { + "epoch": 2.197738603988604, + "grad_norm": 0.8256610035896301, + "learning_rate": 6.654108288802401e-05, + "loss": 0.4494, + "step": 98740 + }, + { + "epoch": 2.1979611823361824, + "grad_norm": 0.3270527422428131, + "learning_rate": 6.650636255911175e-05, + "loss": 0.4132, + "step": 98750 + }, + { + "epoch": 2.1981837606837606, + "grad_norm": 0.5124560594558716, + "learning_rate": 6.647164948435296e-05, + "loss": 0.3695, + "step": 98760 + }, + { + "epoch": 2.198406339031339, + "grad_norm": 0.5017485022544861, + "learning_rate": 6.643694366563405e-05, + "loss": 0.3798, + "step": 98770 + }, + { + "epoch": 2.1986289173789175, + "grad_norm": 0.7371763586997986, + "learning_rate": 6.640224510484097e-05, + "loss": 0.4621, + "step": 98780 + }, + { + "epoch": 2.1988514957264957, + "grad_norm": 0.4819795489311218, + "learning_rate": 6.636755380385924e-05, + "loss": 0.5866, + "step": 98790 + }, + { + "epoch": 2.199074074074074, + "grad_norm": 0.3381766080856323, + "learning_rate": 6.633286976457404e-05, + "loss": 0.4564, + "step": 98800 + }, + { + "epoch": 2.1992966524216526, + "grad_norm": 0.44364458322525024, + "learning_rate": 6.629819298887019e-05, + "loss": 0.4222, + "step": 98810 + }, + { + "epoch": 2.199519230769231, + "grad_norm": 0.5250563621520996, + "learning_rate": 6.626352347863191e-05, + "loss": 0.4457, + "step": 98820 + }, + { + "epoch": 2.199741809116809, + "grad_norm": 1.1085268259048462, + "learning_rate": 6.622886123574333e-05, + "loss": 0.3828, + "step": 98830 + }, + { + "epoch": 2.1999643874643873, + "grad_norm": 0.5071104764938354, + "learning_rate": 6.619420626208788e-05, + "loss": 0.4507, + "step": 98840 + }, + { + "epoch": 2.200186965811966, + "grad_norm": 0.5599742531776428, + "learning_rate": 6.615955855954878e-05, + "loss": 0.5491, + "step": 98850 + }, + { + "epoch": 2.200409544159544, + "grad_norm": 0.5447997450828552, + "learning_rate": 6.612491813000883e-05, + "loss": 0.4793, + "step": 98860 + }, + { + "epoch": 2.2006321225071224, + "grad_norm": 0.5703315734863281, + "learning_rate": 6.609028497535043e-05, + "loss": 0.3342, + "step": 98870 + }, + { + "epoch": 2.200854700854701, + "grad_norm": 0.5179494619369507, + "learning_rate": 6.605565909745559e-05, + "loss": 0.402, + "step": 98880 + }, + { + "epoch": 2.2010772792022792, + "grad_norm": 0.6648085713386536, + "learning_rate": 6.602104049820594e-05, + "loss": 0.5129, + "step": 98890 + }, + { + "epoch": 2.2012998575498575, + "grad_norm": 0.4397805333137512, + "learning_rate": 6.598642917948255e-05, + "loss": 0.3512, + "step": 98900 + }, + { + "epoch": 2.201522435897436, + "grad_norm": 0.6573788523674011, + "learning_rate": 6.595182514316631e-05, + "loss": 0.5547, + "step": 98910 + }, + { + "epoch": 2.2017450142450143, + "grad_norm": 0.3666999042034149, + "learning_rate": 6.591722839113765e-05, + "loss": 0.4774, + "step": 98920 + }, + { + "epoch": 2.2019675925925926, + "grad_norm": 0.5320939421653748, + "learning_rate": 6.588263892527655e-05, + "loss": 0.4257, + "step": 98930 + }, + { + "epoch": 2.2021901709401708, + "grad_norm": 0.6304528117179871, + "learning_rate": 6.584805674746264e-05, + "loss": 0.464, + "step": 98940 + }, + { + "epoch": 2.2024127492877494, + "grad_norm": 0.6009019017219543, + "learning_rate": 6.581348185957523e-05, + "loss": 0.4468, + "step": 98950 + }, + { + "epoch": 2.2026353276353277, + "grad_norm": 0.4979470372200012, + "learning_rate": 6.577891426349306e-05, + "loss": 0.3993, + "step": 98960 + }, + { + "epoch": 2.202857905982906, + "grad_norm": 0.5306423902511597, + "learning_rate": 6.574435396109448e-05, + "loss": 0.512, + "step": 98970 + }, + { + "epoch": 2.2030804843304845, + "grad_norm": 0.4831714928150177, + "learning_rate": 6.570980095425763e-05, + "loss": 0.4364, + "step": 98980 + }, + { + "epoch": 2.2033030626780628, + "grad_norm": 0.5222032070159912, + "learning_rate": 6.567525524486013e-05, + "loss": 0.4322, + "step": 98990 + }, + { + "epoch": 2.203525641025641, + "grad_norm": 0.742304265499115, + "learning_rate": 6.564071683477924e-05, + "loss": 0.4608, + "step": 99000 + }, + { + "epoch": 2.203748219373219, + "grad_norm": 0.4952321946620941, + "learning_rate": 6.560618572589177e-05, + "loss": 0.3382, + "step": 99010 + }, + { + "epoch": 2.203970797720798, + "grad_norm": 0.4795549213886261, + "learning_rate": 6.557166192007418e-05, + "loss": 0.5629, + "step": 99020 + }, + { + "epoch": 2.204193376068376, + "grad_norm": 0.6933590173721313, + "learning_rate": 6.553714541920259e-05, + "loss": 0.4518, + "step": 99030 + }, + { + "epoch": 2.2044159544159543, + "grad_norm": 0.4522525370121002, + "learning_rate": 6.550263622515256e-05, + "loss": 0.4033, + "step": 99040 + }, + { + "epoch": 2.204638532763533, + "grad_norm": 0.8026001453399658, + "learning_rate": 6.546813433979937e-05, + "loss": 0.4414, + "step": 99050 + }, + { + "epoch": 2.204861111111111, + "grad_norm": 0.5256423950195312, + "learning_rate": 6.543363976501788e-05, + "loss": 0.381, + "step": 99060 + }, + { + "epoch": 2.2050836894586894, + "grad_norm": 0.5207923054695129, + "learning_rate": 6.539915250268258e-05, + "loss": 0.3722, + "step": 99070 + }, + { + "epoch": 2.205306267806268, + "grad_norm": 0.5042715072631836, + "learning_rate": 6.536467255466752e-05, + "loss": 0.4009, + "step": 99080 + }, + { + "epoch": 2.2055288461538463, + "grad_norm": 0.698853611946106, + "learning_rate": 6.533019992284644e-05, + "loss": 0.3718, + "step": 99090 + }, + { + "epoch": 2.2057514245014245, + "grad_norm": 0.4758215546607971, + "learning_rate": 6.529573460909253e-05, + "loss": 0.357, + "step": 99100 + }, + { + "epoch": 2.2059740028490027, + "grad_norm": 0.7511695027351379, + "learning_rate": 6.526127661527861e-05, + "loss": 0.6324, + "step": 99110 + }, + { + "epoch": 2.2061965811965814, + "grad_norm": 0.9775429964065552, + "learning_rate": 6.522682594327722e-05, + "loss": 0.5158, + "step": 99120 + }, + { + "epoch": 2.2064191595441596, + "grad_norm": 0.5471348166465759, + "learning_rate": 6.519238259496046e-05, + "loss": 0.5492, + "step": 99130 + }, + { + "epoch": 2.206641737891738, + "grad_norm": 0.5604672431945801, + "learning_rate": 6.515794657219996e-05, + "loss": 0.3754, + "step": 99140 + }, + { + "epoch": 2.206864316239316, + "grad_norm": 0.7262865304946899, + "learning_rate": 6.512351787686706e-05, + "loss": 0.439, + "step": 99150 + }, + { + "epoch": 2.2070868945868947, + "grad_norm": 0.599807858467102, + "learning_rate": 6.50890965108326e-05, + "loss": 0.487, + "step": 99160 + }, + { + "epoch": 2.207309472934473, + "grad_norm": 0.7784044742584229, + "learning_rate": 6.505468247596713e-05, + "loss": 0.4654, + "step": 99170 + }, + { + "epoch": 2.207532051282051, + "grad_norm": 0.5841115117073059, + "learning_rate": 6.502027577414062e-05, + "loss": 0.4926, + "step": 99180 + }, + { + "epoch": 2.2077546296296298, + "grad_norm": 0.38955599069595337, + "learning_rate": 6.498587640722285e-05, + "loss": 0.4819, + "step": 99190 + }, + { + "epoch": 2.207977207977208, + "grad_norm": 0.7164641618728638, + "learning_rate": 6.495148437708308e-05, + "loss": 0.4765, + "step": 99200 + }, + { + "epoch": 2.208199786324786, + "grad_norm": 0.44146817922592163, + "learning_rate": 6.491709968559019e-05, + "loss": 0.4453, + "step": 99210 + }, + { + "epoch": 2.208422364672365, + "grad_norm": 0.5845023393630981, + "learning_rate": 6.488272233461274e-05, + "loss": 0.5473, + "step": 99220 + }, + { + "epoch": 2.208644943019943, + "grad_norm": 0.40102317929267883, + "learning_rate": 6.484835232601873e-05, + "loss": 0.4975, + "step": 99230 + }, + { + "epoch": 2.2088675213675213, + "grad_norm": 0.5128397345542908, + "learning_rate": 6.481398966167595e-05, + "loss": 0.4143, + "step": 99240 + }, + { + "epoch": 2.2090900997150995, + "grad_norm": 0.6717884540557861, + "learning_rate": 6.477963434345158e-05, + "loss": 0.4679, + "step": 99250 + }, + { + "epoch": 2.209312678062678, + "grad_norm": 0.46634578704833984, + "learning_rate": 6.474528637321258e-05, + "loss": 0.4547, + "step": 99260 + }, + { + "epoch": 2.2095352564102564, + "grad_norm": 0.6119767427444458, + "learning_rate": 6.471094575282544e-05, + "loss": 0.4605, + "step": 99270 + }, + { + "epoch": 2.2097578347578346, + "grad_norm": 0.538398027420044, + "learning_rate": 6.467661248415624e-05, + "loss": 0.4892, + "step": 99280 + }, + { + "epoch": 2.2099804131054133, + "grad_norm": 0.5721553564071655, + "learning_rate": 6.464228656907071e-05, + "loss": 0.4595, + "step": 99290 + }, + { + "epoch": 2.2102029914529915, + "grad_norm": 0.43349581956863403, + "learning_rate": 6.460796800943423e-05, + "loss": 0.423, + "step": 99300 + }, + { + "epoch": 2.2104255698005697, + "grad_norm": 0.7425587773323059, + "learning_rate": 6.457365680711151e-05, + "loss": 0.4235, + "step": 99310 + }, + { + "epoch": 2.210648148148148, + "grad_norm": 0.4731992781162262, + "learning_rate": 6.453935296396716e-05, + "loss": 0.5229, + "step": 99320 + }, + { + "epoch": 2.2108707264957266, + "grad_norm": 0.6548227667808533, + "learning_rate": 6.450505648186524e-05, + "loss": 0.4649, + "step": 99330 + }, + { + "epoch": 2.211093304843305, + "grad_norm": 0.8874356746673584, + "learning_rate": 6.44707673626695e-05, + "loss": 0.5212, + "step": 99340 + }, + { + "epoch": 2.211315883190883, + "grad_norm": 0.5008230805397034, + "learning_rate": 6.443648560824326e-05, + "loss": 0.4533, + "step": 99350 + }, + { + "epoch": 2.2115384615384617, + "grad_norm": 0.4697886109352112, + "learning_rate": 6.440221122044932e-05, + "loss": 0.4913, + "step": 99360 + }, + { + "epoch": 2.21176103988604, + "grad_norm": 0.49064961075782776, + "learning_rate": 6.43679442011502e-05, + "loss": 0.4923, + "step": 99370 + }, + { + "epoch": 2.211983618233618, + "grad_norm": 0.4335954487323761, + "learning_rate": 6.433368455220811e-05, + "loss": 0.4019, + "step": 99380 + }, + { + "epoch": 2.2122061965811968, + "grad_norm": 0.6782045364379883, + "learning_rate": 6.42994322754846e-05, + "loss": 0.4978, + "step": 99390 + }, + { + "epoch": 2.212428774928775, + "grad_norm": 0.6394982933998108, + "learning_rate": 6.426518737284102e-05, + "loss": 0.4363, + "step": 99400 + }, + { + "epoch": 2.212651353276353, + "grad_norm": 0.620266854763031, + "learning_rate": 6.42309498461383e-05, + "loss": 0.4837, + "step": 99410 + }, + { + "epoch": 2.2128739316239314, + "grad_norm": 0.43421876430511475, + "learning_rate": 6.419671969723686e-05, + "loss": 0.5502, + "step": 99420 + }, + { + "epoch": 2.21309650997151, + "grad_norm": 0.6228753328323364, + "learning_rate": 6.416249692799689e-05, + "loss": 0.4654, + "step": 99430 + }, + { + "epoch": 2.2133190883190883, + "grad_norm": 0.5116588473320007, + "learning_rate": 6.41282815402781e-05, + "loss": 0.4352, + "step": 99440 + }, + { + "epoch": 2.2135416666666665, + "grad_norm": 0.34148287773132324, + "learning_rate": 6.409407353593964e-05, + "loss": 0.4115, + "step": 99450 + }, + { + "epoch": 2.213764245014245, + "grad_norm": 0.524929404258728, + "learning_rate": 6.405987291684049e-05, + "loss": 0.548, + "step": 99460 + }, + { + "epoch": 2.2139868233618234, + "grad_norm": 0.5598315000534058, + "learning_rate": 6.402567968483913e-05, + "loss": 0.4296, + "step": 99470 + }, + { + "epoch": 2.2142094017094016, + "grad_norm": 0.4203566908836365, + "learning_rate": 6.39914938417937e-05, + "loss": 0.3476, + "step": 99480 + }, + { + "epoch": 2.21443198005698, + "grad_norm": 0.6650808453559875, + "learning_rate": 6.39573153895618e-05, + "loss": 0.5173, + "step": 99490 + }, + { + "epoch": 2.2146545584045585, + "grad_norm": 0.5748631954193115, + "learning_rate": 6.392314433000073e-05, + "loss": 0.445, + "step": 99500 + }, + { + "epoch": 2.2148771367521367, + "grad_norm": 0.6141366958618164, + "learning_rate": 6.38889806649674e-05, + "loss": 0.4875, + "step": 99510 + }, + { + "epoch": 2.215099715099715, + "grad_norm": 0.6306594014167786, + "learning_rate": 6.385482439631836e-05, + "loss": 0.3793, + "step": 99520 + }, + { + "epoch": 2.2153222934472936, + "grad_norm": 0.9447283148765564, + "learning_rate": 6.382067552590954e-05, + "loss": 0.5865, + "step": 99530 + }, + { + "epoch": 2.215544871794872, + "grad_norm": 0.770289957523346, + "learning_rate": 6.378653405559669e-05, + "loss": 0.4947, + "step": 99540 + }, + { + "epoch": 2.21576745014245, + "grad_norm": 0.5178115367889404, + "learning_rate": 6.37523999872351e-05, + "loss": 0.4488, + "step": 99550 + }, + { + "epoch": 2.2159900284900287, + "grad_norm": 0.9453518986701965, + "learning_rate": 6.371827332267964e-05, + "loss": 0.5901, + "step": 99560 + }, + { + "epoch": 2.216212606837607, + "grad_norm": 0.7093714475631714, + "learning_rate": 6.368415406378476e-05, + "loss": 0.4739, + "step": 99570 + }, + { + "epoch": 2.216435185185185, + "grad_norm": 0.5546488165855408, + "learning_rate": 6.365004221240461e-05, + "loss": 0.511, + "step": 99580 + }, + { + "epoch": 2.2166577635327633, + "grad_norm": 0.3860114514827728, + "learning_rate": 6.361593777039272e-05, + "loss": 0.461, + "step": 99590 + }, + { + "epoch": 2.216880341880342, + "grad_norm": 0.4976522922515869, + "learning_rate": 6.358184073960241e-05, + "loss": 0.4717, + "step": 99600 + }, + { + "epoch": 2.21710292022792, + "grad_norm": 0.7040444612503052, + "learning_rate": 6.354775112188662e-05, + "loss": 0.3994, + "step": 99610 + }, + { + "epoch": 2.2173254985754984, + "grad_norm": 0.4434584379196167, + "learning_rate": 6.351366891909768e-05, + "loss": 0.4361, + "step": 99620 + }, + { + "epoch": 2.217548076923077, + "grad_norm": 0.6409199833869934, + "learning_rate": 6.347959413308773e-05, + "loss": 0.4323, + "step": 99630 + }, + { + "epoch": 2.2177706552706553, + "grad_norm": 0.7286927103996277, + "learning_rate": 6.344552676570836e-05, + "loss": 0.4481, + "step": 99640 + }, + { + "epoch": 2.2179932336182335, + "grad_norm": 0.8713966608047485, + "learning_rate": 6.341146681881087e-05, + "loss": 0.4698, + "step": 99650 + }, + { + "epoch": 2.2182158119658117, + "grad_norm": 0.7125070095062256, + "learning_rate": 6.337741429424615e-05, + "loss": 0.6052, + "step": 99660 + }, + { + "epoch": 2.2184383903133904, + "grad_norm": 0.6974532604217529, + "learning_rate": 6.334336919386452e-05, + "loss": 0.3992, + "step": 99670 + }, + { + "epoch": 2.2186609686609686, + "grad_norm": 0.709204912185669, + "learning_rate": 6.330933151951608e-05, + "loss": 0.5381, + "step": 99680 + }, + { + "epoch": 2.218883547008547, + "grad_norm": 0.6567760705947876, + "learning_rate": 6.327530127305046e-05, + "loss": 0.3979, + "step": 99690 + }, + { + "epoch": 2.2191061253561255, + "grad_norm": 0.5433497428894043, + "learning_rate": 6.324127845631688e-05, + "loss": 0.5281, + "step": 99700 + }, + { + "epoch": 2.2193287037037037, + "grad_norm": 0.6927756071090698, + "learning_rate": 6.320726307116422e-05, + "loss": 0.5071, + "step": 99710 + }, + { + "epoch": 2.219551282051282, + "grad_norm": 0.43783435225486755, + "learning_rate": 6.317325511944093e-05, + "loss": 0.4411, + "step": 99720 + }, + { + "epoch": 2.2197738603988606, + "grad_norm": 0.6120383739471436, + "learning_rate": 6.313925460299488e-05, + "loss": 0.3843, + "step": 99730 + }, + { + "epoch": 2.219996438746439, + "grad_norm": 0.6174899935722351, + "learning_rate": 6.310526152367377e-05, + "loss": 0.4308, + "step": 99740 + }, + { + "epoch": 2.220219017094017, + "grad_norm": 0.6677566170692444, + "learning_rate": 6.307127588332491e-05, + "loss": 0.4542, + "step": 99750 + }, + { + "epoch": 2.220263532763533, + "eval_loss": 0.5339695811271667, + "eval_runtime": 337.4487, + "eval_samples_per_second": 7.008, + "eval_steps_per_second": 7.008, + "step": 99752 + }, + { + "epoch": 2.2204415954415953, + "grad_norm": 0.6040011644363403, + "learning_rate": 6.303729768379493e-05, + "loss": 0.5667, + "step": 99760 + }, + { + "epoch": 2.220664173789174, + "grad_norm": 0.40889132022857666, + "learning_rate": 6.300332692693032e-05, + "loss": 0.4262, + "step": 99770 + }, + { + "epoch": 2.220886752136752, + "grad_norm": 0.6798517107963562, + "learning_rate": 6.296936361457709e-05, + "loss": 0.4203, + "step": 99780 + }, + { + "epoch": 2.2211093304843303, + "grad_norm": 0.3974006474018097, + "learning_rate": 6.29354077485808e-05, + "loss": 0.3396, + "step": 99790 + }, + { + "epoch": 2.221331908831909, + "grad_norm": 0.5051971077919006, + "learning_rate": 6.290145933078673e-05, + "loss": 0.4209, + "step": 99800 + }, + { + "epoch": 2.2215544871794872, + "grad_norm": 0.5820755958557129, + "learning_rate": 6.286751836303952e-05, + "loss": 0.4572, + "step": 99810 + }, + { + "epoch": 2.2217770655270654, + "grad_norm": 0.5005538463592529, + "learning_rate": 6.283358484718365e-05, + "loss": 0.4265, + "step": 99820 + }, + { + "epoch": 2.2219996438746437, + "grad_norm": 0.4250526428222656, + "learning_rate": 6.279965878506305e-05, + "loss": 0.4732, + "step": 99830 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.6704878807067871, + "learning_rate": 6.27657401785213e-05, + "loss": 0.4835, + "step": 99840 + }, + { + "epoch": 2.2224448005698005, + "grad_norm": 0.7315400838851929, + "learning_rate": 6.27318290294016e-05, + "loss": 0.4405, + "step": 99850 + }, + { + "epoch": 2.2226673789173788, + "grad_norm": 0.6555359363555908, + "learning_rate": 6.269792533954673e-05, + "loss": 0.3954, + "step": 99860 + }, + { + "epoch": 2.2228899572649574, + "grad_norm": 0.6594572067260742, + "learning_rate": 6.266402911079894e-05, + "loss": 0.4352, + "step": 99870 + }, + { + "epoch": 2.2231125356125356, + "grad_norm": 0.5686958432197571, + "learning_rate": 6.26301403450003e-05, + "loss": 0.4538, + "step": 99880 + }, + { + "epoch": 2.223335113960114, + "grad_norm": 0.5742239356040955, + "learning_rate": 6.259625904399223e-05, + "loss": 0.4345, + "step": 99890 + }, + { + "epoch": 2.2235576923076925, + "grad_norm": 0.5327342748641968, + "learning_rate": 6.25623852096159e-05, + "loss": 0.557, + "step": 99900 + }, + { + "epoch": 2.2237802706552707, + "grad_norm": 0.6104030013084412, + "learning_rate": 6.252851884371209e-05, + "loss": 0.4583, + "step": 99910 + }, + { + "epoch": 2.224002849002849, + "grad_norm": 0.6607154011726379, + "learning_rate": 6.249465994812111e-05, + "loss": 0.6308, + "step": 99920 + }, + { + "epoch": 2.224225427350427, + "grad_norm": 0.5360001921653748, + "learning_rate": 6.246080852468288e-05, + "loss": 0.4952, + "step": 99930 + }, + { + "epoch": 2.224448005698006, + "grad_norm": 0.7452919483184814, + "learning_rate": 6.242696457523696e-05, + "loss": 0.3986, + "step": 99940 + }, + { + "epoch": 2.224670584045584, + "grad_norm": 0.3966030776500702, + "learning_rate": 6.239312810162234e-05, + "loss": 0.4379, + "step": 99950 + }, + { + "epoch": 2.2248931623931623, + "grad_norm": 0.6134777069091797, + "learning_rate": 6.235929910567781e-05, + "loss": 0.4995, + "step": 99960 + }, + { + "epoch": 2.225115740740741, + "grad_norm": 0.5449444055557251, + "learning_rate": 6.232547758924163e-05, + "loss": 0.4427, + "step": 99970 + }, + { + "epoch": 2.225338319088319, + "grad_norm": 0.8455405831336975, + "learning_rate": 6.22916635541517e-05, + "loss": 0.4743, + "step": 99980 + }, + { + "epoch": 2.2255608974358974, + "grad_norm": 0.809389591217041, + "learning_rate": 6.22578570022455e-05, + "loss": 0.4629, + "step": 99990 + }, + { + "epoch": 2.2257834757834756, + "grad_norm": 0.720911979675293, + "learning_rate": 6.222405793536021e-05, + "loss": 0.5837, + "step": 100000 + }, + { + "epoch": 2.2260060541310542, + "grad_norm": 0.6490511298179626, + "learning_rate": 6.219026635533232e-05, + "loss": 0.4231, + "step": 100010 + }, + { + "epoch": 2.2262286324786325, + "grad_norm": 0.6139844059944153, + "learning_rate": 6.215648226399822e-05, + "loss": 0.3518, + "step": 100020 + }, + { + "epoch": 2.2264512108262107, + "grad_norm": 0.4793091118335724, + "learning_rate": 6.212270566319368e-05, + "loss": 0.4306, + "step": 100030 + }, + { + "epoch": 2.2266737891737893, + "grad_norm": 0.554528534412384, + "learning_rate": 6.208893655475417e-05, + "loss": 0.4464, + "step": 100040 + }, + { + "epoch": 2.2268963675213675, + "grad_norm": 0.4235374331474304, + "learning_rate": 6.205517494051474e-05, + "loss": 0.4374, + "step": 100050 + }, + { + "epoch": 2.2271189458689458, + "grad_norm": 0.6234670877456665, + "learning_rate": 6.202142082231002e-05, + "loss": 0.5593, + "step": 100060 + }, + { + "epoch": 2.2273415242165244, + "grad_norm": 0.31259241700172424, + "learning_rate": 6.198767420197427e-05, + "loss": 0.4652, + "step": 100070 + }, + { + "epoch": 2.2275641025641026, + "grad_norm": 0.3480241000652313, + "learning_rate": 6.195393508134136e-05, + "loss": 0.4259, + "step": 100080 + }, + { + "epoch": 2.227786680911681, + "grad_norm": 0.5502161979675293, + "learning_rate": 6.192020346224455e-05, + "loss": 0.3988, + "step": 100090 + }, + { + "epoch": 2.228009259259259, + "grad_norm": 0.5438657402992249, + "learning_rate": 6.18864793465169e-05, + "loss": 0.3868, + "step": 100100 + }, + { + "epoch": 2.2282318376068377, + "grad_norm": 0.5434377789497375, + "learning_rate": 6.185276273599106e-05, + "loss": 0.5102, + "step": 100110 + }, + { + "epoch": 2.228454415954416, + "grad_norm": 0.5145140886306763, + "learning_rate": 6.181905363249916e-05, + "loss": 0.3758, + "step": 100120 + }, + { + "epoch": 2.228676994301994, + "grad_norm": 0.4315755069255829, + "learning_rate": 6.1785352037873e-05, + "loss": 0.4181, + "step": 100130 + }, + { + "epoch": 2.228899572649573, + "grad_norm": 0.699902355670929, + "learning_rate": 6.175165795394407e-05, + "loss": 0.4775, + "step": 100140 + }, + { + "epoch": 2.229122150997151, + "grad_norm": 0.6445436477661133, + "learning_rate": 6.171797138254312e-05, + "loss": 0.4147, + "step": 100150 + }, + { + "epoch": 2.2293447293447293, + "grad_norm": 0.6389631032943726, + "learning_rate": 6.168429232550088e-05, + "loss": 0.3922, + "step": 100160 + }, + { + "epoch": 2.2295673076923075, + "grad_norm": 0.6590464115142822, + "learning_rate": 6.165062078464735e-05, + "loss": 0.5344, + "step": 100170 + }, + { + "epoch": 2.229789886039886, + "grad_norm": 0.5974990129470825, + "learning_rate": 6.161695676181236e-05, + "loss": 0.4084, + "step": 100180 + }, + { + "epoch": 2.2300124643874644, + "grad_norm": 0.48121482133865356, + "learning_rate": 6.158330025882522e-05, + "loss": 0.3951, + "step": 100190 + }, + { + "epoch": 2.2302350427350426, + "grad_norm": 0.6151335835456848, + "learning_rate": 6.154965127751486e-05, + "loss": 0.4861, + "step": 100200 + }, + { + "epoch": 2.2304576210826212, + "grad_norm": 0.6503200531005859, + "learning_rate": 6.15160098197098e-05, + "loss": 0.4966, + "step": 100210 + }, + { + "epoch": 2.2306801994301995, + "grad_norm": 0.555203914642334, + "learning_rate": 6.148237588723817e-05, + "loss": 0.4536, + "step": 100220 + }, + { + "epoch": 2.2309027777777777, + "grad_norm": 0.7991825938224792, + "learning_rate": 6.144874948192758e-05, + "loss": 0.5368, + "step": 100230 + }, + { + "epoch": 2.2311253561253563, + "grad_norm": 0.4899354875087738, + "learning_rate": 6.141513060560537e-05, + "loss": 0.4644, + "step": 100240 + }, + { + "epoch": 2.2313479344729346, + "grad_norm": 0.5462999939918518, + "learning_rate": 6.138151926009843e-05, + "loss": 0.3995, + "step": 100250 + }, + { + "epoch": 2.2315705128205128, + "grad_norm": 0.4971529245376587, + "learning_rate": 6.134791544723319e-05, + "loss": 0.4257, + "step": 100260 + }, + { + "epoch": 2.231793091168091, + "grad_norm": 0.66850745677948, + "learning_rate": 6.131431916883579e-05, + "loss": 0.4657, + "step": 100270 + }, + { + "epoch": 2.2320156695156697, + "grad_norm": 0.5258366465568542, + "learning_rate": 6.128073042673176e-05, + "loss": 0.4896, + "step": 100280 + }, + { + "epoch": 2.232238247863248, + "grad_norm": 0.7665773630142212, + "learning_rate": 6.124714922274648e-05, + "loss": 0.496, + "step": 100290 + }, + { + "epoch": 2.232460826210826, + "grad_norm": 0.4078090786933899, + "learning_rate": 6.121357555870462e-05, + "loss": 0.4215, + "step": 100300 + }, + { + "epoch": 2.2326834045584047, + "grad_norm": 0.5825769305229187, + "learning_rate": 6.11800094364307e-05, + "loss": 0.4048, + "step": 100310 + }, + { + "epoch": 2.232905982905983, + "grad_norm": 0.3658164441585541, + "learning_rate": 6.114645085774868e-05, + "loss": 0.4594, + "step": 100320 + }, + { + "epoch": 2.233128561253561, + "grad_norm": 0.6267021894454956, + "learning_rate": 6.111289982448221e-05, + "loss": 0.4313, + "step": 100330 + }, + { + "epoch": 2.2333511396011394, + "grad_norm": 0.7192836999893188, + "learning_rate": 6.107935633845445e-05, + "loss": 0.5962, + "step": 100340 + }, + { + "epoch": 2.233573717948718, + "grad_norm": 0.5349550247192383, + "learning_rate": 6.104582040148821e-05, + "loss": 0.4404, + "step": 100350 + }, + { + "epoch": 2.2337962962962963, + "grad_norm": 0.5640553832054138, + "learning_rate": 6.10122920154059e-05, + "loss": 0.5369, + "step": 100360 + }, + { + "epoch": 2.2340188746438745, + "grad_norm": 0.45412135124206543, + "learning_rate": 6.0978771182029346e-05, + "loss": 0.5089, + "step": 100370 + }, + { + "epoch": 2.234241452991453, + "grad_norm": 0.500693142414093, + "learning_rate": 6.0945257903180196e-05, + "loss": 0.4994, + "step": 100380 + }, + { + "epoch": 2.2344640313390314, + "grad_norm": 0.3699854910373688, + "learning_rate": 6.091175218067955e-05, + "loss": 0.3459, + "step": 100390 + }, + { + "epoch": 2.2346866096866096, + "grad_norm": 0.40656983852386475, + "learning_rate": 6.0878254016348214e-05, + "loss": 0.512, + "step": 100400 + }, + { + "epoch": 2.2349091880341883, + "grad_norm": 0.769273579120636, + "learning_rate": 6.08447634120064e-05, + "loss": 0.4603, + "step": 100410 + }, + { + "epoch": 2.2351317663817665, + "grad_norm": 0.7436431646347046, + "learning_rate": 6.0811280369474054e-05, + "loss": 0.4733, + "step": 100420 + }, + { + "epoch": 2.2353543447293447, + "grad_norm": 0.697196900844574, + "learning_rate": 6.077780489057072e-05, + "loss": 0.57, + "step": 100430 + }, + { + "epoch": 2.235576923076923, + "grad_norm": 0.48001793026924133, + "learning_rate": 6.0744336977115414e-05, + "loss": 0.5416, + "step": 100440 + }, + { + "epoch": 2.2357995014245016, + "grad_norm": 0.7711690664291382, + "learning_rate": 6.0710876630926805e-05, + "loss": 0.4299, + "step": 100450 + }, + { + "epoch": 2.23602207977208, + "grad_norm": 0.48359090089797974, + "learning_rate": 6.06774238538232e-05, + "loss": 0.4307, + "step": 100460 + }, + { + "epoch": 2.236244658119658, + "grad_norm": 0.7829921245574951, + "learning_rate": 6.0643978647622435e-05, + "loss": 0.4167, + "step": 100470 + }, + { + "epoch": 2.2364672364672367, + "grad_norm": 0.5604079365730286, + "learning_rate": 6.0610541014141944e-05, + "loss": 0.4173, + "step": 100480 + }, + { + "epoch": 2.236689814814815, + "grad_norm": 0.6749580502510071, + "learning_rate": 6.057711095519878e-05, + "loss": 0.5747, + "step": 100490 + }, + { + "epoch": 2.236912393162393, + "grad_norm": 0.7519851326942444, + "learning_rate": 6.0543688472609604e-05, + "loss": 0.4357, + "step": 100500 + }, + { + "epoch": 2.2371349715099713, + "grad_norm": 0.6080546975135803, + "learning_rate": 6.05102735681905e-05, + "loss": 0.5401, + "step": 100510 + }, + { + "epoch": 2.23735754985755, + "grad_norm": 0.5083986520767212, + "learning_rate": 6.04768662437573e-05, + "loss": 0.4284, + "step": 100520 + }, + { + "epoch": 2.237580128205128, + "grad_norm": 0.6632825136184692, + "learning_rate": 6.0443466501125486e-05, + "loss": 0.5337, + "step": 100530 + }, + { + "epoch": 2.2378027065527064, + "grad_norm": 0.6212065815925598, + "learning_rate": 6.04100743421099e-05, + "loss": 0.4282, + "step": 100540 + }, + { + "epoch": 2.238025284900285, + "grad_norm": 0.404258131980896, + "learning_rate": 6.037668976852513e-05, + "loss": 0.394, + "step": 100550 + }, + { + "epoch": 2.2382478632478633, + "grad_norm": 0.559452474117279, + "learning_rate": 6.0343312782185346e-05, + "loss": 0.5065, + "step": 100560 + }, + { + "epoch": 2.2384704415954415, + "grad_norm": 0.44808855652809143, + "learning_rate": 6.030994338490432e-05, + "loss": 0.4189, + "step": 100570 + }, + { + "epoch": 2.23869301994302, + "grad_norm": 0.48122096061706543, + "learning_rate": 6.027658157849529e-05, + "loss": 0.4731, + "step": 100580 + }, + { + "epoch": 2.2389155982905984, + "grad_norm": 0.5979316234588623, + "learning_rate": 6.024322736477117e-05, + "loss": 0.4881, + "step": 100590 + }, + { + "epoch": 2.2391381766381766, + "grad_norm": 0.40405070781707764, + "learning_rate": 6.020988074554452e-05, + "loss": 0.3934, + "step": 100600 + }, + { + "epoch": 2.239360754985755, + "grad_norm": 0.6947756409645081, + "learning_rate": 6.017654172262737e-05, + "loss": 0.4821, + "step": 100610 + }, + { + "epoch": 2.2395833333333335, + "grad_norm": 0.7525035738945007, + "learning_rate": 6.0143210297831385e-05, + "loss": 0.4503, + "step": 100620 + }, + { + "epoch": 2.2398059116809117, + "grad_norm": 0.4716576933860779, + "learning_rate": 6.010988647296787e-05, + "loss": 0.5023, + "step": 100630 + }, + { + "epoch": 2.24002849002849, + "grad_norm": 0.7538976669311523, + "learning_rate": 6.007657024984772e-05, + "loss": 0.4782, + "step": 100640 + }, + { + "epoch": 2.2402510683760686, + "grad_norm": 0.5350600481033325, + "learning_rate": 6.00432616302812e-05, + "loss": 0.4736, + "step": 100650 + }, + { + "epoch": 2.240473646723647, + "grad_norm": 0.894558846950531, + "learning_rate": 6.0009960616078484e-05, + "loss": 0.3729, + "step": 100660 + }, + { + "epoch": 2.240696225071225, + "grad_norm": 0.5190034508705139, + "learning_rate": 5.997666720904907e-05, + "loss": 0.4033, + "step": 100670 + }, + { + "epoch": 2.2409188034188032, + "grad_norm": 0.5922912955284119, + "learning_rate": 5.994338141100215e-05, + "loss": 0.3694, + "step": 100680 + }, + { + "epoch": 2.241141381766382, + "grad_norm": 0.42299923300743103, + "learning_rate": 5.9910103223746574e-05, + "loss": 0.5054, + "step": 100690 + }, + { + "epoch": 2.24136396011396, + "grad_norm": 0.7082996368408203, + "learning_rate": 5.9876832649090655e-05, + "loss": 0.5104, + "step": 100700 + }, + { + "epoch": 2.2415865384615383, + "grad_norm": 0.6098785996437073, + "learning_rate": 5.9843569688842415e-05, + "loss": 0.4679, + "step": 100710 + }, + { + "epoch": 2.241809116809117, + "grad_norm": 0.6080827116966248, + "learning_rate": 5.981031434480928e-05, + "loss": 0.3405, + "step": 100720 + }, + { + "epoch": 2.242031695156695, + "grad_norm": 0.55560702085495, + "learning_rate": 5.977706661879843e-05, + "loss": 0.4762, + "step": 100730 + }, + { + "epoch": 2.2422542735042734, + "grad_norm": 0.36331528425216675, + "learning_rate": 5.974382651261659e-05, + "loss": 0.4664, + "step": 100740 + }, + { + "epoch": 2.242476851851852, + "grad_norm": 0.5189181566238403, + "learning_rate": 5.971059402807e-05, + "loss": 0.3889, + "step": 100750 + }, + { + "epoch": 2.2426994301994303, + "grad_norm": 0.5288671851158142, + "learning_rate": 5.96773691669646e-05, + "loss": 0.5082, + "step": 100760 + }, + { + "epoch": 2.2429220085470085, + "grad_norm": 0.4157465100288391, + "learning_rate": 5.964415193110584e-05, + "loss": 0.5014, + "step": 100770 + }, + { + "epoch": 2.2431445868945867, + "grad_norm": 0.5020760893821716, + "learning_rate": 5.9610942322298805e-05, + "loss": 0.4667, + "step": 100780 + }, + { + "epoch": 2.2433671652421654, + "grad_norm": 0.4918922185897827, + "learning_rate": 5.9577740342348044e-05, + "loss": 0.5327, + "step": 100790 + }, + { + "epoch": 2.2435897435897436, + "grad_norm": 0.5720158815383911, + "learning_rate": 5.954454599305788e-05, + "loss": 0.5067, + "step": 100800 + }, + { + "epoch": 2.243812321937322, + "grad_norm": 0.6430893540382385, + "learning_rate": 5.9511359276232015e-05, + "loss": 0.5036, + "step": 100810 + }, + { + "epoch": 2.2440349002849005, + "grad_norm": 0.8744356036186218, + "learning_rate": 5.94781801936739e-05, + "loss": 0.542, + "step": 100820 + }, + { + "epoch": 2.2442574786324787, + "grad_norm": 0.4596138894557953, + "learning_rate": 5.9445008747186505e-05, + "loss": 0.4761, + "step": 100830 + }, + { + "epoch": 2.244480056980057, + "grad_norm": 0.5335553884506226, + "learning_rate": 5.9411844938572394e-05, + "loss": 0.518, + "step": 100840 + }, + { + "epoch": 2.244702635327635, + "grad_norm": 0.6477447748184204, + "learning_rate": 5.9378688769633796e-05, + "loss": 0.5032, + "step": 100850 + }, + { + "epoch": 2.244925213675214, + "grad_norm": 0.8028891682624817, + "learning_rate": 5.9345540242172295e-05, + "loss": 0.5746, + "step": 100860 + }, + { + "epoch": 2.245147792022792, + "grad_norm": 0.5045342445373535, + "learning_rate": 5.931239935798927e-05, + "loss": 0.3358, + "step": 100870 + }, + { + "epoch": 2.2453703703703702, + "grad_norm": 0.6941115260124207, + "learning_rate": 5.9279266118885655e-05, + "loss": 0.3895, + "step": 100880 + }, + { + "epoch": 2.245592948717949, + "grad_norm": 0.38937705755233765, + "learning_rate": 5.924614052666191e-05, + "loss": 0.411, + "step": 100890 + }, + { + "epoch": 2.245815527065527, + "grad_norm": 0.641523540019989, + "learning_rate": 5.921302258311812e-05, + "loss": 0.5199, + "step": 100900 + }, + { + "epoch": 2.2460381054131053, + "grad_norm": 0.3802975118160248, + "learning_rate": 5.917991229005393e-05, + "loss": 0.3925, + "step": 100910 + }, + { + "epoch": 2.246260683760684, + "grad_norm": 0.4550374150276184, + "learning_rate": 5.914680964926866e-05, + "loss": 0.4571, + "step": 100920 + }, + { + "epoch": 2.246483262108262, + "grad_norm": 0.8346225619316101, + "learning_rate": 5.911371466256106e-05, + "loss": 0.4954, + "step": 100930 + }, + { + "epoch": 2.2467058404558404, + "grad_norm": 0.4897273778915405, + "learning_rate": 5.9080627331729455e-05, + "loss": 0.4012, + "step": 100940 + }, + { + "epoch": 2.2469284188034186, + "grad_norm": 0.7520624995231628, + "learning_rate": 5.904754765857195e-05, + "loss": 0.5254, + "step": 100950 + }, + { + "epoch": 2.2471509971509973, + "grad_norm": 0.49854952096939087, + "learning_rate": 5.901447564488609e-05, + "loss": 0.5773, + "step": 100960 + }, + { + "epoch": 2.2473735754985755, + "grad_norm": 0.7178290486335754, + "learning_rate": 5.898141129246903e-05, + "loss": 0.4814, + "step": 100970 + }, + { + "epoch": 2.2475961538461537, + "grad_norm": 0.5157769918441772, + "learning_rate": 5.894835460311752e-05, + "loss": 0.5249, + "step": 100980 + }, + { + "epoch": 2.247818732193732, + "grad_norm": 0.5982409119606018, + "learning_rate": 5.891530557862797e-05, + "loss": 0.5009, + "step": 100990 + }, + { + "epoch": 2.2480413105413106, + "grad_norm": 0.700705349445343, + "learning_rate": 5.8882264220796145e-05, + "loss": 0.583, + "step": 101000 + }, + { + "epoch": 2.248263888888889, + "grad_norm": 0.7363760471343994, + "learning_rate": 5.8849230531417596e-05, + "loss": 0.3753, + "step": 101010 + }, + { + "epoch": 2.248486467236467, + "grad_norm": 0.7411749362945557, + "learning_rate": 5.881620451228742e-05, + "loss": 0.3801, + "step": 101020 + }, + { + "epoch": 2.2487090455840457, + "grad_norm": 0.5226131081581116, + "learning_rate": 5.878318616520026e-05, + "loss": 0.4723, + "step": 101030 + }, + { + "epoch": 2.248931623931624, + "grad_norm": 0.5838454961776733, + "learning_rate": 5.875017549195039e-05, + "loss": 0.6124, + "step": 101040 + }, + { + "epoch": 2.249154202279202, + "grad_norm": 0.3981051743030548, + "learning_rate": 5.8717172494331665e-05, + "loss": 0.4393, + "step": 101050 + }, + { + "epoch": 2.249376780626781, + "grad_norm": 0.6105630993843079, + "learning_rate": 5.868417717413737e-05, + "loss": 0.4758, + "step": 101060 + }, + { + "epoch": 2.249599358974359, + "grad_norm": 0.5280075669288635, + "learning_rate": 5.865118953316064e-05, + "loss": 0.378, + "step": 101070 + }, + { + "epoch": 2.2498219373219372, + "grad_norm": 0.7146009206771851, + "learning_rate": 5.861820957319395e-05, + "loss": 0.5024, + "step": 101080 + }, + { + "epoch": 2.250044515669516, + "grad_norm": 0.48275527358055115, + "learning_rate": 5.8585237296029474e-05, + "loss": 0.4036, + "step": 101090 + }, + { + "epoch": 2.250267094017094, + "grad_norm": 0.37731584906578064, + "learning_rate": 5.8552272703458974e-05, + "loss": 0.4652, + "step": 101100 + }, + { + "epoch": 2.2504896723646723, + "grad_norm": 0.594689667224884, + "learning_rate": 5.851931579727377e-05, + "loss": 0.5159, + "step": 101110 + }, + { + "epoch": 2.2507122507122506, + "grad_norm": 0.4745054543018341, + "learning_rate": 5.848636657926476e-05, + "loss": 0.4407, + "step": 101120 + }, + { + "epoch": 2.2509348290598292, + "grad_norm": 0.39887046813964844, + "learning_rate": 5.845342505122249e-05, + "loss": 0.399, + "step": 101130 + }, + { + "epoch": 2.2511574074074074, + "grad_norm": 0.349498450756073, + "learning_rate": 5.842049121493694e-05, + "loss": 0.4507, + "step": 101140 + }, + { + "epoch": 2.2513799857549857, + "grad_norm": 0.7075125575065613, + "learning_rate": 5.838756507219778e-05, + "loss": 0.4531, + "step": 101150 + }, + { + "epoch": 2.251602564102564, + "grad_norm": 0.6591486930847168, + "learning_rate": 5.835464662479428e-05, + "loss": 0.3924, + "step": 101160 + }, + { + "epoch": 2.2518251424501425, + "grad_norm": 0.5846258401870728, + "learning_rate": 5.832173587451524e-05, + "loss": 0.4983, + "step": 101170 + }, + { + "epoch": 2.2520477207977208, + "grad_norm": 0.7267735600471497, + "learning_rate": 5.8288832823149055e-05, + "loss": 0.4958, + "step": 101180 + }, + { + "epoch": 2.252270299145299, + "grad_norm": 0.7835147976875305, + "learning_rate": 5.825593747248375e-05, + "loss": 0.5087, + "step": 101190 + }, + { + "epoch": 2.2524928774928776, + "grad_norm": 0.6003299951553345, + "learning_rate": 5.82230498243068e-05, + "loss": 0.4145, + "step": 101200 + }, + { + "epoch": 2.252715455840456, + "grad_norm": 0.623741626739502, + "learning_rate": 5.819016988040542e-05, + "loss": 0.4989, + "step": 101210 + }, + { + "epoch": 2.252938034188034, + "grad_norm": 0.5692561864852905, + "learning_rate": 5.815729764256625e-05, + "loss": 0.365, + "step": 101220 + }, + { + "epoch": 2.2531606125356127, + "grad_norm": 0.5674599409103394, + "learning_rate": 5.812443311257565e-05, + "loss": 0.4341, + "step": 101230 + }, + { + "epoch": 2.253383190883191, + "grad_norm": 0.36465272307395935, + "learning_rate": 5.809157629221951e-05, + "loss": 0.3755, + "step": 101240 + }, + { + "epoch": 2.253605769230769, + "grad_norm": 0.4215715825557709, + "learning_rate": 5.805872718328327e-05, + "loss": 0.4074, + "step": 101250 + }, + { + "epoch": 2.253828347578348, + "grad_norm": 0.7996568083763123, + "learning_rate": 5.802588578755199e-05, + "loss": 0.4353, + "step": 101260 + }, + { + "epoch": 2.254050925925926, + "grad_norm": 0.6421694755554199, + "learning_rate": 5.799305210681036e-05, + "loss": 0.4425, + "step": 101270 + }, + { + "epoch": 2.2542735042735043, + "grad_norm": 0.4854976534843445, + "learning_rate": 5.796022614284249e-05, + "loss": 0.481, + "step": 101280 + }, + { + "epoch": 2.2544960826210825, + "grad_norm": 0.6315191984176636, + "learning_rate": 5.79274078974322e-05, + "loss": 0.5784, + "step": 101290 + }, + { + "epoch": 2.254718660968661, + "grad_norm": 0.5861208438873291, + "learning_rate": 5.789459737236287e-05, + "loss": 0.4458, + "step": 101300 + }, + { + "epoch": 2.2549412393162394, + "grad_norm": 0.6290010213851929, + "learning_rate": 5.786179456941745e-05, + "loss": 0.4419, + "step": 101310 + }, + { + "epoch": 2.2551638176638176, + "grad_norm": 0.6168012022972107, + "learning_rate": 5.7828999490378546e-05, + "loss": 0.4462, + "step": 101320 + }, + { + "epoch": 2.255386396011396, + "grad_norm": 0.5474677085876465, + "learning_rate": 5.7796212137028125e-05, + "loss": 0.4423, + "step": 101330 + }, + { + "epoch": 2.2556089743589745, + "grad_norm": 0.36434105038642883, + "learning_rate": 5.776343251114795e-05, + "loss": 0.4075, + "step": 101340 + }, + { + "epoch": 2.2558315527065527, + "grad_norm": 0.8574975728988647, + "learning_rate": 5.773066061451935e-05, + "loss": 0.469, + "step": 101350 + }, + { + "epoch": 2.256054131054131, + "grad_norm": 0.5685760974884033, + "learning_rate": 5.769789644892305e-05, + "loss": 0.4623, + "step": 101360 + }, + { + "epoch": 2.2562767094017095, + "grad_norm": 0.840263843536377, + "learning_rate": 5.766514001613956e-05, + "loss": 0.4403, + "step": 101370 + }, + { + "epoch": 2.2564992877492878, + "grad_norm": 0.47229859232902527, + "learning_rate": 5.763239131794891e-05, + "loss": 0.4218, + "step": 101380 + }, + { + "epoch": 2.256721866096866, + "grad_norm": 0.5112053751945496, + "learning_rate": 5.759965035613062e-05, + "loss": 0.4215, + "step": 101390 + }, + { + "epoch": 2.2569444444444446, + "grad_norm": 0.2744973599910736, + "learning_rate": 5.756691713246394e-05, + "loss": 0.3427, + "step": 101400 + }, + { + "epoch": 2.257167022792023, + "grad_norm": 0.46965116262435913, + "learning_rate": 5.753419164872762e-05, + "loss": 0.3961, + "step": 101410 + }, + { + "epoch": 2.257389601139601, + "grad_norm": 0.5640278458595276, + "learning_rate": 5.750147390669989e-05, + "loss": 0.539, + "step": 101420 + }, + { + "epoch": 2.2576121794871793, + "grad_norm": 0.5815955996513367, + "learning_rate": 5.746876390815874e-05, + "loss": 0.4581, + "step": 101430 + }, + { + "epoch": 2.257834757834758, + "grad_norm": 0.47417697310447693, + "learning_rate": 5.743606165488162e-05, + "loss": 0.4741, + "step": 101440 + }, + { + "epoch": 2.258057336182336, + "grad_norm": 0.5229243040084839, + "learning_rate": 5.740336714864571e-05, + "loss": 0.4679, + "step": 101450 + }, + { + "epoch": 2.2582799145299144, + "grad_norm": 0.5353955030441284, + "learning_rate": 5.7370680391227484e-05, + "loss": 0.4983, + "step": 101460 + }, + { + "epoch": 2.258502492877493, + "grad_norm": 0.562786340713501, + "learning_rate": 5.733800138440324e-05, + "loss": 0.5244, + "step": 101470 + }, + { + "epoch": 2.2587250712250713, + "grad_norm": 0.9685201644897461, + "learning_rate": 5.730533012994881e-05, + "loss": 0.4722, + "step": 101480 + }, + { + "epoch": 2.2589476495726495, + "grad_norm": 0.5229083299636841, + "learning_rate": 5.7272666629639615e-05, + "loss": 0.5052, + "step": 101490 + }, + { + "epoch": 2.2591702279202277, + "grad_norm": 0.9138157367706299, + "learning_rate": 5.7240010885250486e-05, + "loss": 0.5017, + "step": 101500 + }, + { + "epoch": 2.2593928062678064, + "grad_norm": 0.6066059470176697, + "learning_rate": 5.7207362898556017e-05, + "loss": 0.4791, + "step": 101510 + }, + { + "epoch": 2.2596153846153846, + "grad_norm": 0.5644805431365967, + "learning_rate": 5.717472267133037e-05, + "loss": 0.452, + "step": 101520 + }, + { + "epoch": 2.259837962962963, + "grad_norm": 0.555303692817688, + "learning_rate": 5.7142090205347196e-05, + "loss": 0.4632, + "step": 101530 + }, + { + "epoch": 2.2600605413105415, + "grad_norm": 0.5508922338485718, + "learning_rate": 5.710946550237981e-05, + "loss": 0.513, + "step": 101540 + }, + { + "epoch": 2.2602831196581197, + "grad_norm": 0.47135332226753235, + "learning_rate": 5.7076848564201104e-05, + "loss": 0.4393, + "step": 101550 + }, + { + "epoch": 2.260505698005698, + "grad_norm": 0.5292717814445496, + "learning_rate": 5.7044239392583364e-05, + "loss": 0.5473, + "step": 101560 + }, + { + "epoch": 2.2607282763532766, + "grad_norm": 0.5483565926551819, + "learning_rate": 5.701163798929871e-05, + "loss": 0.479, + "step": 101570 + }, + { + "epoch": 2.2609508547008548, + "grad_norm": 0.4322088360786438, + "learning_rate": 5.697904435611876e-05, + "loss": 0.456, + "step": 101580 + }, + { + "epoch": 2.261173433048433, + "grad_norm": 0.7418818473815918, + "learning_rate": 5.694645849481455e-05, + "loss": 0.4398, + "step": 101590 + }, + { + "epoch": 2.261396011396011, + "grad_norm": 0.3156370520591736, + "learning_rate": 5.69138804071569e-05, + "loss": 0.3302, + "step": 101600 + }, + { + "epoch": 2.26161858974359, + "grad_norm": 0.5656064748764038, + "learning_rate": 5.688131009491613e-05, + "loss": 0.511, + "step": 101610 + }, + { + "epoch": 2.261841168091168, + "grad_norm": 0.5305771827697754, + "learning_rate": 5.684874755986211e-05, + "loss": 0.4182, + "step": 101620 + }, + { + "epoch": 2.2620637464387463, + "grad_norm": 0.637701690196991, + "learning_rate": 5.681619280376442e-05, + "loss": 0.3758, + "step": 101630 + }, + { + "epoch": 2.262286324786325, + "grad_norm": 0.4770202338695526, + "learning_rate": 5.6783645828391974e-05, + "loss": 0.5329, + "step": 101640 + }, + { + "epoch": 2.262508903133903, + "grad_norm": 0.6516206860542297, + "learning_rate": 5.675110663551344e-05, + "loss": 0.5447, + "step": 101650 + }, + { + "epoch": 2.2627314814814814, + "grad_norm": 0.6704887747764587, + "learning_rate": 5.6718575226897054e-05, + "loss": 0.5387, + "step": 101660 + }, + { + "epoch": 2.2629540598290596, + "grad_norm": 0.5100089907646179, + "learning_rate": 5.668605160431057e-05, + "loss": 0.4309, + "step": 101670 + }, + { + "epoch": 2.2631766381766383, + "grad_norm": 0.485149085521698, + "learning_rate": 5.6653535769521374e-05, + "loss": 0.472, + "step": 101680 + }, + { + "epoch": 2.2633992165242165, + "grad_norm": 0.6081596612930298, + "learning_rate": 5.662102772429645e-05, + "loss": 0.4186, + "step": 101690 + }, + { + "epoch": 2.2636217948717947, + "grad_norm": 0.42370980978012085, + "learning_rate": 5.6588527470402196e-05, + "loss": 0.5188, + "step": 101700 + }, + { + "epoch": 2.2638443732193734, + "grad_norm": 0.6564342379570007, + "learning_rate": 5.65560350096048e-05, + "loss": 0.525, + "step": 101710 + }, + { + "epoch": 2.2640669515669516, + "grad_norm": 0.7005487680435181, + "learning_rate": 5.6523550343669875e-05, + "loss": 0.4555, + "step": 101720 + }, + { + "epoch": 2.26428952991453, + "grad_norm": 0.6535967588424683, + "learning_rate": 5.6491073474362647e-05, + "loss": 0.4316, + "step": 101730 + }, + { + "epoch": 2.2645121082621085, + "grad_norm": 0.4445608854293823, + "learning_rate": 5.6458604403447965e-05, + "loss": 0.5378, + "step": 101740 + }, + { + "epoch": 2.2647346866096867, + "grad_norm": 0.389424592256546, + "learning_rate": 5.6426143132690235e-05, + "loss": 0.434, + "step": 101750 + }, + { + "epoch": 2.264957264957265, + "grad_norm": 0.500011146068573, + "learning_rate": 5.639368966385343e-05, + "loss": 0.3236, + "step": 101760 + }, + { + "epoch": 2.265179843304843, + "grad_norm": 0.49770283699035645, + "learning_rate": 5.636124399870115e-05, + "loss": 0.4463, + "step": 101770 + }, + { + "epoch": 2.265402421652422, + "grad_norm": 0.6304476261138916, + "learning_rate": 5.6328806138996404e-05, + "loss": 0.5451, + "step": 101780 + }, + { + "epoch": 2.265625, + "grad_norm": 0.44962552189826965, + "learning_rate": 5.629637608650193e-05, + "loss": 0.4311, + "step": 101790 + }, + { + "epoch": 2.265847578347578, + "grad_norm": 0.5515990257263184, + "learning_rate": 5.6263953842980044e-05, + "loss": 0.4567, + "step": 101800 + }, + { + "epoch": 2.266070156695157, + "grad_norm": 0.3929078280925751, + "learning_rate": 5.623153941019255e-05, + "loss": 0.3712, + "step": 101810 + }, + { + "epoch": 2.266292735042735, + "grad_norm": 0.4479203522205353, + "learning_rate": 5.6199132789900896e-05, + "loss": 0.4379, + "step": 101820 + }, + { + "epoch": 2.2665153133903133, + "grad_norm": 0.6935744881629944, + "learning_rate": 5.616673398386618e-05, + "loss": 0.4509, + "step": 101830 + }, + { + "epoch": 2.2667378917378915, + "grad_norm": 0.7883388996124268, + "learning_rate": 5.613434299384885e-05, + "loss": 0.4422, + "step": 101840 + }, + { + "epoch": 2.26696047008547, + "grad_norm": 0.6511839628219604, + "learning_rate": 5.6101959821609064e-05, + "loss": 0.4903, + "step": 101850 + }, + { + "epoch": 2.2671830484330484, + "grad_norm": 0.720378041267395, + "learning_rate": 5.6069584468906576e-05, + "loss": 0.5358, + "step": 101860 + }, + { + "epoch": 2.2674056267806266, + "grad_norm": 0.3852686285972595, + "learning_rate": 5.6037216937500684e-05, + "loss": 0.42, + "step": 101870 + }, + { + "epoch": 2.2676282051282053, + "grad_norm": 0.5996120572090149, + "learning_rate": 5.6004857229150275e-05, + "loss": 0.4837, + "step": 101880 + }, + { + "epoch": 2.2678507834757835, + "grad_norm": 0.5900279879570007, + "learning_rate": 5.5972505345613824e-05, + "loss": 0.4205, + "step": 101890 + }, + { + "epoch": 2.2680733618233617, + "grad_norm": 0.5508142709732056, + "learning_rate": 5.5940161288649384e-05, + "loss": 0.4303, + "step": 101900 + }, + { + "epoch": 2.2682959401709404, + "grad_norm": 0.3930082321166992, + "learning_rate": 5.590782506001444e-05, + "loss": 0.381, + "step": 101910 + }, + { + "epoch": 2.2685185185185186, + "grad_norm": 0.3945516347885132, + "learning_rate": 5.5875496661466256e-05, + "loss": 0.3843, + "step": 101920 + }, + { + "epoch": 2.268741096866097, + "grad_norm": 0.6308930516242981, + "learning_rate": 5.5843176094761576e-05, + "loss": 0.4208, + "step": 101930 + }, + { + "epoch": 2.268963675213675, + "grad_norm": 0.5742242336273193, + "learning_rate": 5.5810863361656705e-05, + "loss": 0.4758, + "step": 101940 + }, + { + "epoch": 2.2691862535612537, + "grad_norm": 0.47501349449157715, + "learning_rate": 5.577855846390756e-05, + "loss": 0.4667, + "step": 101950 + }, + { + "epoch": 2.269408831908832, + "grad_norm": 0.7356695532798767, + "learning_rate": 5.574626140326962e-05, + "loss": 0.5717, + "step": 101960 + }, + { + "epoch": 2.26963141025641, + "grad_norm": 0.5006420016288757, + "learning_rate": 5.5713972181497984e-05, + "loss": 0.4647, + "step": 101970 + }, + { + "epoch": 2.269853988603989, + "grad_norm": 0.7696124911308289, + "learning_rate": 5.568169080034722e-05, + "loss": 0.3968, + "step": 101980 + }, + { + "epoch": 2.270076566951567, + "grad_norm": 0.5565013885498047, + "learning_rate": 5.564941726157147e-05, + "loss": 0.4999, + "step": 101990 + }, + { + "epoch": 2.2702991452991452, + "grad_norm": 0.5363160967826843, + "learning_rate": 5.561715156692455e-05, + "loss": 0.4392, + "step": 102000 + }, + { + "epoch": 2.2705217236467234, + "grad_norm": 0.6066251993179321, + "learning_rate": 5.55848937181598e-05, + "loss": 0.4533, + "step": 102010 + }, + { + "epoch": 2.270744301994302, + "grad_norm": 0.7414187788963318, + "learning_rate": 5.555264371703017e-05, + "loss": 0.5076, + "step": 102020 + }, + { + "epoch": 2.2709668803418803, + "grad_norm": 0.7134029865264893, + "learning_rate": 5.5520401565288106e-05, + "loss": 0.5363, + "step": 102030 + }, + { + "epoch": 2.2711894586894585, + "grad_norm": 0.4264451265335083, + "learning_rate": 5.548816726468577e-05, + "loss": 0.4643, + "step": 102040 + }, + { + "epoch": 2.271412037037037, + "grad_norm": 0.49752625823020935, + "learning_rate": 5.545594081697467e-05, + "loss": 0.4397, + "step": 102050 + }, + { + "epoch": 2.2716346153846154, + "grad_norm": 0.46924108266830444, + "learning_rate": 5.542372222390608e-05, + "loss": 0.4594, + "step": 102060 + }, + { + "epoch": 2.2718571937321936, + "grad_norm": 0.6802897453308105, + "learning_rate": 5.539151148723076e-05, + "loss": 0.5427, + "step": 102070 + }, + { + "epoch": 2.2720797720797723, + "grad_norm": 0.6026861071586609, + "learning_rate": 5.535930860869911e-05, + "loss": 0.3957, + "step": 102080 + }, + { + "epoch": 2.2723023504273505, + "grad_norm": 0.5114626288414001, + "learning_rate": 5.5327113590061044e-05, + "loss": 0.4284, + "step": 102090 + }, + { + "epoch": 2.2725249287749287, + "grad_norm": 0.5237139463424683, + "learning_rate": 5.529492643306604e-05, + "loss": 0.525, + "step": 102100 + }, + { + "epoch": 2.272747507122507, + "grad_norm": 0.4586896598339081, + "learning_rate": 5.526274713946325e-05, + "loss": 0.4378, + "step": 102110 + }, + { + "epoch": 2.2729700854700856, + "grad_norm": 0.4087303578853607, + "learning_rate": 5.5230575711001276e-05, + "loss": 0.5392, + "step": 102120 + }, + { + "epoch": 2.273192663817664, + "grad_norm": 0.6352576017379761, + "learning_rate": 5.519841214942829e-05, + "loss": 0.4601, + "step": 102130 + }, + { + "epoch": 2.273415242165242, + "grad_norm": 0.640564501285553, + "learning_rate": 5.51662564564921e-05, + "loss": 0.4784, + "step": 102140 + }, + { + "epoch": 2.2736378205128207, + "grad_norm": 0.572647750377655, + "learning_rate": 5.5134108633940105e-05, + "loss": 0.5141, + "step": 102150 + }, + { + "epoch": 2.273860398860399, + "grad_norm": 0.5198943614959717, + "learning_rate": 5.510196868351927e-05, + "loss": 0.5727, + "step": 102160 + }, + { + "epoch": 2.274082977207977, + "grad_norm": 0.3667494058609009, + "learning_rate": 5.506983660697604e-05, + "loss": 0.4296, + "step": 102170 + }, + { + "epoch": 2.2743055555555554, + "grad_norm": 0.43145737051963806, + "learning_rate": 5.503771240605659e-05, + "loss": 0.563, + "step": 102180 + }, + { + "epoch": 2.274528133903134, + "grad_norm": 0.5121556520462036, + "learning_rate": 5.500559608250648e-05, + "loss": 0.389, + "step": 102190 + }, + { + "epoch": 2.2747507122507122, + "grad_norm": 0.3813532590866089, + "learning_rate": 5.497348763807097e-05, + "loss": 0.47, + "step": 102200 + }, + { + "epoch": 2.2749732905982905, + "grad_norm": 0.36606770753860474, + "learning_rate": 5.4941387074494874e-05, + "loss": 0.4546, + "step": 102210 + }, + { + "epoch": 2.275195868945869, + "grad_norm": 0.5075564980506897, + "learning_rate": 5.490929439352257e-05, + "loss": 0.4803, + "step": 102220 + }, + { + "epoch": 2.2754184472934473, + "grad_norm": 0.4560779929161072, + "learning_rate": 5.4877209596897946e-05, + "loss": 0.4122, + "step": 102230 + }, + { + "epoch": 2.2756410256410255, + "grad_norm": 0.5539279580116272, + "learning_rate": 5.484513268636464e-05, + "loss": 0.4025, + "step": 102240 + }, + { + "epoch": 2.275863603988604, + "grad_norm": 0.47847601771354675, + "learning_rate": 5.4813063663665585e-05, + "loss": 0.4715, + "step": 102250 + }, + { + "epoch": 2.2760861823361824, + "grad_norm": 0.6673201322555542, + "learning_rate": 5.478100253054357e-05, + "loss": 0.3853, + "step": 102260 + }, + { + "epoch": 2.2763087606837606, + "grad_norm": 0.7573196291923523, + "learning_rate": 5.4748949288740705e-05, + "loss": 0.4458, + "step": 102270 + }, + { + "epoch": 2.276531339031339, + "grad_norm": 0.5507382750511169, + "learning_rate": 5.4716903939998844e-05, + "loss": 0.4435, + "step": 102280 + }, + { + "epoch": 2.2767539173789175, + "grad_norm": 0.635073184967041, + "learning_rate": 5.468486648605935e-05, + "loss": 0.4689, + "step": 102290 + }, + { + "epoch": 2.2769764957264957, + "grad_norm": 0.5143459439277649, + "learning_rate": 5.4652836928663164e-05, + "loss": 0.3888, + "step": 102300 + }, + { + "epoch": 2.277199074074074, + "grad_norm": 0.8898342847824097, + "learning_rate": 5.4620815269550805e-05, + "loss": 0.5532, + "step": 102310 + }, + { + "epoch": 2.277421652421652, + "grad_norm": 0.6367766857147217, + "learning_rate": 5.458880151046244e-05, + "loss": 0.4739, + "step": 102320 + }, + { + "epoch": 2.277644230769231, + "grad_norm": 0.5471299886703491, + "learning_rate": 5.4556795653137563e-05, + "loss": 0.4544, + "step": 102330 + }, + { + "epoch": 2.277866809116809, + "grad_norm": 0.483954519033432, + "learning_rate": 5.4524797699315485e-05, + "loss": 0.4725, + "step": 102340 + }, + { + "epoch": 2.2780893874643873, + "grad_norm": 0.5129729509353638, + "learning_rate": 5.4492807650734986e-05, + "loss": 0.3671, + "step": 102350 + }, + { + "epoch": 2.278311965811966, + "grad_norm": 0.4544709622859955, + "learning_rate": 5.446082550913443e-05, + "loss": 0.3694, + "step": 102360 + }, + { + "epoch": 2.278534544159544, + "grad_norm": 0.6704840660095215, + "learning_rate": 5.4428851276251836e-05, + "loss": 0.4251, + "step": 102370 + }, + { + "epoch": 2.2787571225071224, + "grad_norm": 0.8961855173110962, + "learning_rate": 5.4396884953824554e-05, + "loss": 0.3873, + "step": 102380 + }, + { + "epoch": 2.278979700854701, + "grad_norm": 0.5246909260749817, + "learning_rate": 5.436492654358975e-05, + "loss": 0.4699, + "step": 102390 + }, + { + "epoch": 2.2792022792022792, + "grad_norm": 0.5263209342956543, + "learning_rate": 5.4332976047284114e-05, + "loss": 0.4693, + "step": 102400 + }, + { + "epoch": 2.2794248575498575, + "grad_norm": 0.6012300848960876, + "learning_rate": 5.430103346664377e-05, + "loss": 0.4863, + "step": 102410 + }, + { + "epoch": 2.279647435897436, + "grad_norm": 0.7874319553375244, + "learning_rate": 5.4269098803404516e-05, + "loss": 0.4381, + "step": 102420 + }, + { + "epoch": 2.2798700142450143, + "grad_norm": 0.42439478635787964, + "learning_rate": 5.4237172059301745e-05, + "loss": 0.6481, + "step": 102430 + }, + { + "epoch": 2.2800925925925926, + "grad_norm": 0.3294517695903778, + "learning_rate": 5.420525323607037e-05, + "loss": 0.3813, + "step": 102440 + }, + { + "epoch": 2.2802706552706553, + "eval_loss": 0.5308219790458679, + "eval_runtime": 337.354, + "eval_samples_per_second": 7.01, + "eval_steps_per_second": 7.01, + "step": 102448 + }, + { + "epoch": 2.2803151709401708, + "grad_norm": 0.35001301765441895, + "learning_rate": 5.417334233544489e-05, + "loss": 0.5311, + "step": 102450 + }, + { + "epoch": 2.2805377492877494, + "grad_norm": 0.4265924394130707, + "learning_rate": 5.414143935915943e-05, + "loss": 0.3942, + "step": 102460 + }, + { + "epoch": 2.2807603276353277, + "grad_norm": 0.4556063413619995, + "learning_rate": 5.410954430894748e-05, + "loss": 0.4001, + "step": 102470 + }, + { + "epoch": 2.280982905982906, + "grad_norm": 0.5078181028366089, + "learning_rate": 5.407765718654234e-05, + "loss": 0.4825, + "step": 102480 + }, + { + "epoch": 2.281205484330484, + "grad_norm": 0.5195839405059814, + "learning_rate": 5.404577799367676e-05, + "loss": 0.3958, + "step": 102490 + }, + { + "epoch": 2.2814280626780628, + "grad_norm": 0.714531660079956, + "learning_rate": 5.4013906732083154e-05, + "loss": 0.5397, + "step": 102500 + }, + { + "epoch": 2.281650641025641, + "grad_norm": 0.40454044938087463, + "learning_rate": 5.398204340349331e-05, + "loss": 0.5091, + "step": 102510 + }, + { + "epoch": 2.281873219373219, + "grad_norm": 0.6262097954750061, + "learning_rate": 5.395018800963876e-05, + "loss": 0.6028, + "step": 102520 + }, + { + "epoch": 2.282095797720798, + "grad_norm": 0.44660210609436035, + "learning_rate": 5.391834055225056e-05, + "loss": 0.5524, + "step": 102530 + }, + { + "epoch": 2.282318376068376, + "grad_norm": 0.5187098383903503, + "learning_rate": 5.3886501033059367e-05, + "loss": 0.5016, + "step": 102540 + }, + { + "epoch": 2.2825409544159543, + "grad_norm": 0.5763646364212036, + "learning_rate": 5.385466945379527e-05, + "loss": 0.3882, + "step": 102550 + }, + { + "epoch": 2.282763532763533, + "grad_norm": 0.49286168813705444, + "learning_rate": 5.3822845816188085e-05, + "loss": 0.3818, + "step": 102560 + }, + { + "epoch": 2.282986111111111, + "grad_norm": 0.6143706440925598, + "learning_rate": 5.379103012196711e-05, + "loss": 0.5703, + "step": 102570 + }, + { + "epoch": 2.2832086894586894, + "grad_norm": 0.410530149936676, + "learning_rate": 5.375922237286126e-05, + "loss": 0.3684, + "step": 102580 + }, + { + "epoch": 2.283431267806268, + "grad_norm": 0.8411930203437805, + "learning_rate": 5.372742257059897e-05, + "loss": 0.5101, + "step": 102590 + }, + { + "epoch": 2.2836538461538463, + "grad_norm": 0.5050935745239258, + "learning_rate": 5.3695630716908354e-05, + "loss": 0.4048, + "step": 102600 + }, + { + "epoch": 2.2838764245014245, + "grad_norm": 0.4293699860572815, + "learning_rate": 5.366384681351688e-05, + "loss": 0.4361, + "step": 102610 + }, + { + "epoch": 2.2840990028490027, + "grad_norm": 0.622787594795227, + "learning_rate": 5.363207086215176e-05, + "loss": 0.4265, + "step": 102620 + }, + { + "epoch": 2.2843215811965814, + "grad_norm": 0.6479224562644958, + "learning_rate": 5.3600302864539785e-05, + "loss": 0.4123, + "step": 102630 + }, + { + "epoch": 2.2845441595441596, + "grad_norm": 0.502160906791687, + "learning_rate": 5.3568542822407154e-05, + "loss": 0.4438, + "step": 102640 + }, + { + "epoch": 2.284766737891738, + "grad_norm": 0.45600494742393494, + "learning_rate": 5.353679073747977e-05, + "loss": 0.4465, + "step": 102650 + }, + { + "epoch": 2.284989316239316, + "grad_norm": 0.6276834011077881, + "learning_rate": 5.3505046611483076e-05, + "loss": 0.5092, + "step": 102660 + }, + { + "epoch": 2.2852118945868947, + "grad_norm": 0.7201434373855591, + "learning_rate": 5.3473310446142075e-05, + "loss": 0.4117, + "step": 102670 + }, + { + "epoch": 2.285434472934473, + "grad_norm": 0.4016876518726349, + "learning_rate": 5.344158224318141e-05, + "loss": 0.4158, + "step": 102680 + }, + { + "epoch": 2.285657051282051, + "grad_norm": 0.5486331582069397, + "learning_rate": 5.340986200432507e-05, + "loss": 0.3995, + "step": 102690 + }, + { + "epoch": 2.2858796296296298, + "grad_norm": 0.6210474371910095, + "learning_rate": 5.337814973129684e-05, + "loss": 0.4866, + "step": 102700 + }, + { + "epoch": 2.286102207977208, + "grad_norm": 0.6425034999847412, + "learning_rate": 5.334644542581999e-05, + "loss": 0.4377, + "step": 102710 + }, + { + "epoch": 2.286324786324786, + "grad_norm": 0.5458213686943054, + "learning_rate": 5.331474908961733e-05, + "loss": 0.5359, + "step": 102720 + }, + { + "epoch": 2.286547364672365, + "grad_norm": 0.5335869193077087, + "learning_rate": 5.328306072441132e-05, + "loss": 0.4587, + "step": 102730 + }, + { + "epoch": 2.286769943019943, + "grad_norm": 0.6593013405799866, + "learning_rate": 5.3251380331923936e-05, + "loss": 0.4461, + "step": 102740 + }, + { + "epoch": 2.2869925213675213, + "grad_norm": 0.48755788803100586, + "learning_rate": 5.321970791387663e-05, + "loss": 0.5187, + "step": 102750 + }, + { + "epoch": 2.2872150997151, + "grad_norm": 0.4677642583847046, + "learning_rate": 5.31880434719906e-05, + "loss": 0.4189, + "step": 102760 + }, + { + "epoch": 2.287437678062678, + "grad_norm": 0.5250211954116821, + "learning_rate": 5.315638700798642e-05, + "loss": 0.4374, + "step": 102770 + }, + { + "epoch": 2.2876602564102564, + "grad_norm": 0.5852629542350769, + "learning_rate": 5.312473852358437e-05, + "loss": 0.525, + "step": 102780 + }, + { + "epoch": 2.2878828347578346, + "grad_norm": 0.6496941447257996, + "learning_rate": 5.3093098020504285e-05, + "loss": 0.4563, + "step": 102790 + }, + { + "epoch": 2.2881054131054133, + "grad_norm": 0.4138205051422119, + "learning_rate": 5.306146550046551e-05, + "loss": 0.4192, + "step": 102800 + }, + { + "epoch": 2.2883279914529915, + "grad_norm": 0.5685064196586609, + "learning_rate": 5.3029840965187e-05, + "loss": 0.5226, + "step": 102810 + }, + { + "epoch": 2.2885505698005697, + "grad_norm": 0.6383584141731262, + "learning_rate": 5.2998224416387284e-05, + "loss": 0.4088, + "step": 102820 + }, + { + "epoch": 2.288773148148148, + "grad_norm": 0.5327423214912415, + "learning_rate": 5.296661585578435e-05, + "loss": 0.4424, + "step": 102830 + }, + { + "epoch": 2.2889957264957266, + "grad_norm": 0.5009591579437256, + "learning_rate": 5.293501528509588e-05, + "loss": 0.5031, + "step": 102840 + }, + { + "epoch": 2.289218304843305, + "grad_norm": 0.6047536134719849, + "learning_rate": 5.2903422706039074e-05, + "loss": 0.4622, + "step": 102850 + }, + { + "epoch": 2.289440883190883, + "grad_norm": 0.39370644092559814, + "learning_rate": 5.2871838120330695e-05, + "loss": 0.338, + "step": 102860 + }, + { + "epoch": 2.2896634615384617, + "grad_norm": 0.7097088694572449, + "learning_rate": 5.284026152968706e-05, + "loss": 0.4214, + "step": 102870 + }, + { + "epoch": 2.28988603988604, + "grad_norm": 0.47650113701820374, + "learning_rate": 5.280869293582418e-05, + "loss": 0.3967, + "step": 102880 + }, + { + "epoch": 2.290108618233618, + "grad_norm": 0.6755890846252441, + "learning_rate": 5.27771323404574e-05, + "loss": 0.5204, + "step": 102890 + }, + { + "epoch": 2.2903311965811968, + "grad_norm": 0.4635171890258789, + "learning_rate": 5.2745579745301696e-05, + "loss": 0.4416, + "step": 102900 + }, + { + "epoch": 2.290553774928775, + "grad_norm": 0.7415624856948853, + "learning_rate": 5.271403515207176e-05, + "loss": 0.4874, + "step": 102910 + }, + { + "epoch": 2.290776353276353, + "grad_norm": 0.5703538060188293, + "learning_rate": 5.268249856248173e-05, + "loss": 0.5932, + "step": 102920 + }, + { + "epoch": 2.290998931623932, + "grad_norm": 0.4767812192440033, + "learning_rate": 5.26509699782453e-05, + "loss": 0.4217, + "step": 102930 + }, + { + "epoch": 2.29122150997151, + "grad_norm": 0.6134464144706726, + "learning_rate": 5.261944940107581e-05, + "loss": 0.5368, + "step": 102940 + }, + { + "epoch": 2.2914440883190883, + "grad_norm": 0.3436715006828308, + "learning_rate": 5.258793683268608e-05, + "loss": 0.4835, + "step": 102950 + }, + { + "epoch": 2.2916666666666665, + "grad_norm": 0.6039507985115051, + "learning_rate": 5.255643227478861e-05, + "loss": 0.3613, + "step": 102960 + }, + { + "epoch": 2.291889245014245, + "grad_norm": 0.7695244550704956, + "learning_rate": 5.252493572909525e-05, + "loss": 0.5058, + "step": 102970 + }, + { + "epoch": 2.2921118233618234, + "grad_norm": 0.6251684427261353, + "learning_rate": 5.2493447197317616e-05, + "loss": 0.5454, + "step": 102980 + }, + { + "epoch": 2.2923344017094016, + "grad_norm": 0.4886912405490875, + "learning_rate": 5.246196668116681e-05, + "loss": 0.4633, + "step": 102990 + }, + { + "epoch": 2.29255698005698, + "grad_norm": 0.7781798243522644, + "learning_rate": 5.2430494182353504e-05, + "loss": 0.5038, + "step": 103000 + }, + { + "epoch": 2.2927795584045585, + "grad_norm": 0.5966687798500061, + "learning_rate": 5.239902970258797e-05, + "loss": 0.5312, + "step": 103010 + }, + { + "epoch": 2.2930021367521367, + "grad_norm": 0.5091105699539185, + "learning_rate": 5.2367573243580034e-05, + "loss": 0.5261, + "step": 103020 + }, + { + "epoch": 2.293224715099715, + "grad_norm": 0.6099951267242432, + "learning_rate": 5.233612480703905e-05, + "loss": 0.4781, + "step": 103030 + }, + { + "epoch": 2.2934472934472936, + "grad_norm": 0.5651898384094238, + "learning_rate": 5.230468439467384e-05, + "loss": 0.4752, + "step": 103040 + }, + { + "epoch": 2.293669871794872, + "grad_norm": 0.6548570990562439, + "learning_rate": 5.2273252008193e-05, + "loss": 0.483, + "step": 103050 + }, + { + "epoch": 2.29389245014245, + "grad_norm": 0.48320871591567993, + "learning_rate": 5.2241827649304584e-05, + "loss": 0.4787, + "step": 103060 + }, + { + "epoch": 2.2941150284900287, + "grad_norm": 0.5880222320556641, + "learning_rate": 5.22104113197162e-05, + "loss": 0.3746, + "step": 103070 + }, + { + "epoch": 2.294337606837607, + "grad_norm": 0.41989943385124207, + "learning_rate": 5.2179003021135076e-05, + "loss": 0.3545, + "step": 103080 + }, + { + "epoch": 2.294560185185185, + "grad_norm": 0.580086350440979, + "learning_rate": 5.214760275526793e-05, + "loss": 0.4115, + "step": 103090 + }, + { + "epoch": 2.294782763532764, + "grad_norm": 0.4067051410675049, + "learning_rate": 5.211621052382114e-05, + "loss": 0.4841, + "step": 103100 + }, + { + "epoch": 2.295005341880342, + "grad_norm": 0.5266998410224915, + "learning_rate": 5.208482632850047e-05, + "loss": 0.4854, + "step": 103110 + }, + { + "epoch": 2.29522792022792, + "grad_norm": 0.4169633984565735, + "learning_rate": 5.205345017101144e-05, + "loss": 0.4697, + "step": 103120 + }, + { + "epoch": 2.2954504985754984, + "grad_norm": 0.4864277243614197, + "learning_rate": 5.2022082053059054e-05, + "loss": 0.5506, + "step": 103130 + }, + { + "epoch": 2.295673076923077, + "grad_norm": 0.5991712808609009, + "learning_rate": 5.199072197634784e-05, + "loss": 0.5265, + "step": 103140 + }, + { + "epoch": 2.2958956552706553, + "grad_norm": 0.5243403315544128, + "learning_rate": 5.195936994258197e-05, + "loss": 0.4581, + "step": 103150 + }, + { + "epoch": 2.2961182336182335, + "grad_norm": 0.5383456945419312, + "learning_rate": 5.1928025953465195e-05, + "loss": 0.4343, + "step": 103160 + }, + { + "epoch": 2.2963408119658117, + "grad_norm": 0.5923618078231812, + "learning_rate": 5.1896690010700696e-05, + "loss": 0.5134, + "step": 103170 + }, + { + "epoch": 2.2965633903133904, + "grad_norm": 0.9396385550498962, + "learning_rate": 5.1865362115991265e-05, + "loss": 0.5086, + "step": 103180 + }, + { + "epoch": 2.2967859686609686, + "grad_norm": 0.6137779355049133, + "learning_rate": 5.1834042271039295e-05, + "loss": 0.4525, + "step": 103190 + }, + { + "epoch": 2.297008547008547, + "grad_norm": 0.4180096387863159, + "learning_rate": 5.18027304775468e-05, + "loss": 0.4028, + "step": 103200 + }, + { + "epoch": 2.2972311253561255, + "grad_norm": 0.49826472997665405, + "learning_rate": 5.177142673721522e-05, + "loss": 0.5231, + "step": 103210 + }, + { + "epoch": 2.2974537037037037, + "grad_norm": 0.4427054822444916, + "learning_rate": 5.1740131051745664e-05, + "loss": 0.4184, + "step": 103220 + }, + { + "epoch": 2.297676282051282, + "grad_norm": 0.38813918828964233, + "learning_rate": 5.170884342283877e-05, + "loss": 0.4, + "step": 103230 + }, + { + "epoch": 2.2978988603988606, + "grad_norm": 0.4896891117095947, + "learning_rate": 5.167756385219478e-05, + "loss": 0.4271, + "step": 103240 + }, + { + "epoch": 2.298121438746439, + "grad_norm": 0.32303759455680847, + "learning_rate": 5.16462923415133e-05, + "loss": 0.4146, + "step": 103250 + }, + { + "epoch": 2.298344017094017, + "grad_norm": 0.6285502910614014, + "learning_rate": 5.161502889249379e-05, + "loss": 0.4145, + "step": 103260 + }, + { + "epoch": 2.2985665954415953, + "grad_norm": 0.6991376280784607, + "learning_rate": 5.158377350683505e-05, + "loss": 0.4111, + "step": 103270 + }, + { + "epoch": 2.298789173789174, + "grad_norm": 0.4952690601348877, + "learning_rate": 5.1552526186235565e-05, + "loss": 0.392, + "step": 103280 + }, + { + "epoch": 2.299011752136752, + "grad_norm": 0.5192189812660217, + "learning_rate": 5.1521286932393396e-05, + "loss": 0.4112, + "step": 103290 + }, + { + "epoch": 2.2992343304843303, + "grad_norm": 0.42775392532348633, + "learning_rate": 5.149005574700598e-05, + "loss": 0.4729, + "step": 103300 + }, + { + "epoch": 2.299456908831909, + "grad_norm": 0.6301408410072327, + "learning_rate": 5.145883263177058e-05, + "loss": 0.4336, + "step": 103310 + }, + { + "epoch": 2.2996794871794872, + "grad_norm": 0.6173430681228638, + "learning_rate": 5.142761758838375e-05, + "loss": 0.4586, + "step": 103320 + }, + { + "epoch": 2.2999020655270654, + "grad_norm": 0.8824524283409119, + "learning_rate": 5.1396410618541814e-05, + "loss": 0.4052, + "step": 103330 + }, + { + "epoch": 2.3001246438746437, + "grad_norm": 0.461791068315506, + "learning_rate": 5.136521172394055e-05, + "loss": 0.4431, + "step": 103340 + }, + { + "epoch": 2.3003472222222223, + "grad_norm": 0.4454747140407562, + "learning_rate": 5.1334020906275395e-05, + "loss": 0.4699, + "step": 103350 + }, + { + "epoch": 2.3005698005698005, + "grad_norm": 0.3216058313846588, + "learning_rate": 5.130283816724124e-05, + "loss": 0.4434, + "step": 103360 + }, + { + "epoch": 2.3007923789173788, + "grad_norm": 0.5915253758430481, + "learning_rate": 5.1271663508532565e-05, + "loss": 0.4242, + "step": 103370 + }, + { + "epoch": 2.3010149572649574, + "grad_norm": 0.6322838068008423, + "learning_rate": 5.124049693184354e-05, + "loss": 0.5511, + "step": 103380 + }, + { + "epoch": 2.3012375356125356, + "grad_norm": 0.46077054738998413, + "learning_rate": 5.120933843886761e-05, + "loss": 0.5324, + "step": 103390 + }, + { + "epoch": 2.301460113960114, + "grad_norm": 0.8137968182563782, + "learning_rate": 5.1178188031298035e-05, + "loss": 0.4624, + "step": 103400 + }, + { + "epoch": 2.3016826923076925, + "grad_norm": 0.9656932950019836, + "learning_rate": 5.1147045710827576e-05, + "loss": 0.4386, + "step": 103410 + }, + { + "epoch": 2.3019052706552707, + "grad_norm": 0.449002206325531, + "learning_rate": 5.111591147914856e-05, + "loss": 0.4588, + "step": 103420 + }, + { + "epoch": 2.302127849002849, + "grad_norm": 0.753855288028717, + "learning_rate": 5.108478533795271e-05, + "loss": 0.4022, + "step": 103430 + }, + { + "epoch": 2.302350427350427, + "grad_norm": 0.6333743333816528, + "learning_rate": 5.105366728893157e-05, + "loss": 0.3967, + "step": 103440 + }, + { + "epoch": 2.302573005698006, + "grad_norm": 1.017401099205017, + "learning_rate": 5.102255733377612e-05, + "loss": 0.4129, + "step": 103450 + }, + { + "epoch": 2.302795584045584, + "grad_norm": 0.49452051520347595, + "learning_rate": 5.099145547417681e-05, + "loss": 0.4, + "step": 103460 + }, + { + "epoch": 2.3030181623931623, + "grad_norm": 0.5165116786956787, + "learning_rate": 5.096036171182379e-05, + "loss": 0.5483, + "step": 103470 + }, + { + "epoch": 2.303240740740741, + "grad_norm": 0.5332070589065552, + "learning_rate": 5.0929276048406735e-05, + "loss": 0.5176, + "step": 103480 + }, + { + "epoch": 2.303463319088319, + "grad_norm": 0.5469018816947937, + "learning_rate": 5.089819848561483e-05, + "loss": 0.4091, + "step": 103490 + }, + { + "epoch": 2.3036858974358974, + "grad_norm": 0.6199618577957153, + "learning_rate": 5.086712902513691e-05, + "loss": 0.4496, + "step": 103500 + }, + { + "epoch": 2.3039084757834756, + "grad_norm": 0.43971481919288635, + "learning_rate": 5.083606766866127e-05, + "loss": 0.4389, + "step": 103510 + }, + { + "epoch": 2.3041310541310542, + "grad_norm": 0.5679343342781067, + "learning_rate": 5.080501441787588e-05, + "loss": 0.3612, + "step": 103520 + }, + { + "epoch": 2.3043536324786325, + "grad_norm": 0.4358706772327423, + "learning_rate": 5.077396927446809e-05, + "loss": 0.3968, + "step": 103530 + }, + { + "epoch": 2.3045762108262107, + "grad_norm": 0.36143189668655396, + "learning_rate": 5.074293224012498e-05, + "loss": 0.4191, + "step": 103540 + }, + { + "epoch": 2.3047987891737893, + "grad_norm": 0.5568950772285461, + "learning_rate": 5.071190331653317e-05, + "loss": 0.3489, + "step": 103550 + }, + { + "epoch": 2.3050213675213675, + "grad_norm": 0.7864968180656433, + "learning_rate": 5.068088250537868e-05, + "loss": 0.5666, + "step": 103560 + }, + { + "epoch": 2.3052439458689458, + "grad_norm": 0.5771081447601318, + "learning_rate": 5.064986980834727e-05, + "loss": 0.4724, + "step": 103570 + }, + { + "epoch": 2.3054665242165244, + "grad_norm": 0.849843442440033, + "learning_rate": 5.061886522712422e-05, + "loss": 0.4717, + "step": 103580 + }, + { + "epoch": 2.3056891025641026, + "grad_norm": 0.6978979110717773, + "learning_rate": 5.058786876339436e-05, + "loss": 0.4199, + "step": 103590 + }, + { + "epoch": 2.305911680911681, + "grad_norm": 0.5732024908065796, + "learning_rate": 5.055688041884199e-05, + "loss": 0.4802, + "step": 103600 + }, + { + "epoch": 2.306134259259259, + "grad_norm": 0.7335711717605591, + "learning_rate": 5.052590019515107e-05, + "loss": 0.3348, + "step": 103610 + }, + { + "epoch": 2.3063568376068377, + "grad_norm": 0.4181780219078064, + "learning_rate": 5.049492809400509e-05, + "loss": 0.4457, + "step": 103620 + }, + { + "epoch": 2.306579415954416, + "grad_norm": 0.43094080686569214, + "learning_rate": 5.046396411708709e-05, + "loss": 0.5484, + "step": 103630 + }, + { + "epoch": 2.306801994301994, + "grad_norm": 0.4213450849056244, + "learning_rate": 5.043300826607973e-05, + "loss": 0.4916, + "step": 103640 + }, + { + "epoch": 2.307024572649573, + "grad_norm": 0.4814653694629669, + "learning_rate": 5.0402060542665183e-05, + "loss": 0.4567, + "step": 103650 + }, + { + "epoch": 2.307247150997151, + "grad_norm": 0.41216182708740234, + "learning_rate": 5.0371120948525076e-05, + "loss": 0.6228, + "step": 103660 + }, + { + "epoch": 2.3074697293447293, + "grad_norm": 0.46462947130203247, + "learning_rate": 5.034018948534076e-05, + "loss": 0.3674, + "step": 103670 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.42045047879219055, + "learning_rate": 5.03092661547931e-05, + "loss": 0.3813, + "step": 103680 + }, + { + "epoch": 2.307914886039886, + "grad_norm": 0.5972529649734497, + "learning_rate": 5.027835095856241e-05, + "loss": 0.4336, + "step": 103690 + }, + { + "epoch": 2.3081374643874644, + "grad_norm": 0.6638756990432739, + "learning_rate": 5.0247443898328714e-05, + "loss": 0.4934, + "step": 103700 + }, + { + "epoch": 2.3083600427350426, + "grad_norm": 0.5444926619529724, + "learning_rate": 5.021654497577151e-05, + "loss": 0.5162, + "step": 103710 + }, + { + "epoch": 2.3085826210826212, + "grad_norm": 0.6065942049026489, + "learning_rate": 5.018565419256984e-05, + "loss": 0.525, + "step": 103720 + }, + { + "epoch": 2.3088051994301995, + "grad_norm": 0.7407015562057495, + "learning_rate": 5.0154771550402447e-05, + "loss": 0.4399, + "step": 103730 + }, + { + "epoch": 2.3090277777777777, + "grad_norm": 0.8703110814094543, + "learning_rate": 5.012389705094738e-05, + "loss": 0.3646, + "step": 103740 + }, + { + "epoch": 2.3092503561253563, + "grad_norm": 0.6362120509147644, + "learning_rate": 5.009303069588242e-05, + "loss": 0.4145, + "step": 103750 + }, + { + "epoch": 2.3094729344729346, + "grad_norm": 0.7072296142578125, + "learning_rate": 5.006217248688492e-05, + "loss": 0.4931, + "step": 103760 + }, + { + "epoch": 2.3096955128205128, + "grad_norm": 0.6555598378181458, + "learning_rate": 5.003132242563169e-05, + "loss": 0.4361, + "step": 103770 + }, + { + "epoch": 2.309918091168091, + "grad_norm": 0.6338368058204651, + "learning_rate": 5.0000480513799176e-05, + "loss": 0.411, + "step": 103780 + }, + { + "epoch": 2.3101406695156697, + "grad_norm": 0.45361435413360596, + "learning_rate": 4.996964675306341e-05, + "loss": 0.4087, + "step": 103790 + }, + { + "epoch": 2.310363247863248, + "grad_norm": 0.39669114351272583, + "learning_rate": 4.99388211450998e-05, + "loss": 0.4447, + "step": 103800 + }, + { + "epoch": 2.310585826210826, + "grad_norm": 0.4574336111545563, + "learning_rate": 4.990800369158355e-05, + "loss": 0.3971, + "step": 103810 + }, + { + "epoch": 2.3108084045584047, + "grad_norm": 0.5125351548194885, + "learning_rate": 4.9877194394189185e-05, + "loss": 0.429, + "step": 103820 + }, + { + "epoch": 2.311030982905983, + "grad_norm": 0.5527442693710327, + "learning_rate": 4.9846393254591e-05, + "loss": 0.4648, + "step": 103830 + }, + { + "epoch": 2.311253561253561, + "grad_norm": 0.7029348611831665, + "learning_rate": 4.98156002744627e-05, + "loss": 0.5263, + "step": 103840 + }, + { + "epoch": 2.3114761396011394, + "grad_norm": 0.52207350730896, + "learning_rate": 4.978481545547764e-05, + "loss": 0.4652, + "step": 103850 + }, + { + "epoch": 2.311698717948718, + "grad_norm": 0.609255850315094, + "learning_rate": 4.975403879930867e-05, + "loss": 0.4534, + "step": 103860 + }, + { + "epoch": 2.3119212962962963, + "grad_norm": 0.6262114644050598, + "learning_rate": 4.972327030762829e-05, + "loss": 0.4914, + "step": 103870 + }, + { + "epoch": 2.3121438746438745, + "grad_norm": 0.5444429516792297, + "learning_rate": 4.9692509982108373e-05, + "loss": 0.4856, + "step": 103880 + }, + { + "epoch": 2.312366452991453, + "grad_norm": 0.5377378463745117, + "learning_rate": 4.966175782442051e-05, + "loss": 0.4425, + "step": 103890 + }, + { + "epoch": 2.3125890313390314, + "grad_norm": 0.5324745774269104, + "learning_rate": 4.9631013836235786e-05, + "loss": 0.3724, + "step": 103900 + }, + { + "epoch": 2.3128116096866096, + "grad_norm": 1.1432093381881714, + "learning_rate": 4.960027801922486e-05, + "loss": 0.4219, + "step": 103910 + }, + { + "epoch": 2.3130341880341883, + "grad_norm": 0.48718583583831787, + "learning_rate": 4.9569550375057994e-05, + "loss": 0.4795, + "step": 103920 + }, + { + "epoch": 2.3132567663817665, + "grad_norm": 0.3967041075229645, + "learning_rate": 4.953883090540492e-05, + "loss": 0.4148, + "step": 103930 + }, + { + "epoch": 2.3134793447293447, + "grad_norm": 0.6293598413467407, + "learning_rate": 4.9508119611934914e-05, + "loss": 0.4807, + "step": 103940 + }, + { + "epoch": 2.313701923076923, + "grad_norm": 0.7447176575660706, + "learning_rate": 4.947741649631694e-05, + "loss": 0.5025, + "step": 103950 + }, + { + "epoch": 2.3139245014245016, + "grad_norm": 0.618916928768158, + "learning_rate": 4.9446721560219324e-05, + "loss": 0.5215, + "step": 103960 + }, + { + "epoch": 2.31414707977208, + "grad_norm": 0.7033261060714722, + "learning_rate": 4.94160348053101e-05, + "loss": 0.3964, + "step": 103970 + }, + { + "epoch": 2.314369658119658, + "grad_norm": 0.4891183376312256, + "learning_rate": 4.938535623325682e-05, + "loss": 0.4563, + "step": 103980 + }, + { + "epoch": 2.314592236467236, + "grad_norm": 0.5945625305175781, + "learning_rate": 4.935468584572658e-05, + "loss": 0.4716, + "step": 103990 + }, + { + "epoch": 2.314814814814815, + "grad_norm": 0.6280008554458618, + "learning_rate": 4.932402364438604e-05, + "loss": 0.5475, + "step": 104000 + }, + { + "epoch": 2.315037393162393, + "grad_norm": 0.535823404788971, + "learning_rate": 4.929336963090145e-05, + "loss": 0.4413, + "step": 104010 + }, + { + "epoch": 2.3152599715099713, + "grad_norm": 0.42797577381134033, + "learning_rate": 4.926272380693848e-05, + "loss": 0.4313, + "step": 104020 + }, + { + "epoch": 2.31548254985755, + "grad_norm": 0.6048837900161743, + "learning_rate": 4.9232086174162504e-05, + "loss": 0.4155, + "step": 104030 + }, + { + "epoch": 2.315705128205128, + "grad_norm": 0.49043428897857666, + "learning_rate": 4.9201456734238394e-05, + "loss": 0.5275, + "step": 104040 + }, + { + "epoch": 2.3159277065527064, + "grad_norm": 0.6840739846229553, + "learning_rate": 4.917083548883055e-05, + "loss": 0.3677, + "step": 104050 + }, + { + "epoch": 2.316150284900285, + "grad_norm": 0.4900062084197998, + "learning_rate": 4.914022243960299e-05, + "loss": 0.4514, + "step": 104060 + }, + { + "epoch": 2.3163728632478633, + "grad_norm": 0.45657631754875183, + "learning_rate": 4.91096175882193e-05, + "loss": 0.547, + "step": 104070 + }, + { + "epoch": 2.3165954415954415, + "grad_norm": 0.6857179403305054, + "learning_rate": 4.9079020936342465e-05, + "loss": 0.5541, + "step": 104080 + }, + { + "epoch": 2.31681801994302, + "grad_norm": 0.6056687235832214, + "learning_rate": 4.9048432485635245e-05, + "loss": 0.367, + "step": 104090 + }, + { + "epoch": 2.3170405982905984, + "grad_norm": 0.5658937096595764, + "learning_rate": 4.90178522377597e-05, + "loss": 0.429, + "step": 104100 + }, + { + "epoch": 2.3172631766381766, + "grad_norm": 0.3987172842025757, + "learning_rate": 4.898728019437768e-05, + "loss": 0.4943, + "step": 104110 + }, + { + "epoch": 2.317485754985755, + "grad_norm": 0.5849761962890625, + "learning_rate": 4.895671635715047e-05, + "loss": 0.3971, + "step": 104120 + }, + { + "epoch": 2.3177083333333335, + "grad_norm": 0.6185342073440552, + "learning_rate": 4.8926160727738944e-05, + "loss": 0.4785, + "step": 104130 + }, + { + "epoch": 2.3179309116809117, + "grad_norm": 0.6695090532302856, + "learning_rate": 4.889561330780352e-05, + "loss": 0.4147, + "step": 104140 + }, + { + "epoch": 2.31815349002849, + "grad_norm": 0.5411170721054077, + "learning_rate": 4.886507409900425e-05, + "loss": 0.5187, + "step": 104150 + }, + { + "epoch": 2.318376068376068, + "grad_norm": 0.7303962111473083, + "learning_rate": 4.8834543103000486e-05, + "loss": 0.3542, + "step": 104160 + }, + { + "epoch": 2.318598646723647, + "grad_norm": 0.42483285069465637, + "learning_rate": 4.880402032145141e-05, + "loss": 0.575, + "step": 104170 + }, + { + "epoch": 2.318821225071225, + "grad_norm": 0.7072681188583374, + "learning_rate": 4.877350575601565e-05, + "loss": 0.4142, + "step": 104180 + }, + { + "epoch": 2.3190438034188032, + "grad_norm": 0.6763079166412354, + "learning_rate": 4.874299940835139e-05, + "loss": 0.4266, + "step": 104190 + }, + { + "epoch": 2.319266381766382, + "grad_norm": 0.5402014851570129, + "learning_rate": 4.871250128011635e-05, + "loss": 0.4712, + "step": 104200 + }, + { + "epoch": 2.31948896011396, + "grad_norm": 0.4686580002307892, + "learning_rate": 4.868201137296791e-05, + "loss": 0.396, + "step": 104210 + }, + { + "epoch": 2.3197115384615383, + "grad_norm": 0.43557852506637573, + "learning_rate": 4.865152968856279e-05, + "loss": 0.4539, + "step": 104220 + }, + { + "epoch": 2.319934116809117, + "grad_norm": 0.535490870475769, + "learning_rate": 4.86210562285575e-05, + "loss": 0.3973, + "step": 104230 + }, + { + "epoch": 2.320156695156695, + "grad_norm": 0.27691832184791565, + "learning_rate": 4.8590590994607874e-05, + "loss": 0.4522, + "step": 104240 + }, + { + "epoch": 2.3203792735042734, + "grad_norm": 0.7170953154563904, + "learning_rate": 4.8560133988369495e-05, + "loss": 0.4457, + "step": 104250 + }, + { + "epoch": 2.320601851851852, + "grad_norm": 0.32499974966049194, + "learning_rate": 4.852968521149741e-05, + "loss": 0.4466, + "step": 104260 + }, + { + "epoch": 2.3208244301994303, + "grad_norm": 0.3497593104839325, + "learning_rate": 4.849924466564624e-05, + "loss": 0.3996, + "step": 104270 + }, + { + "epoch": 2.3210470085470085, + "grad_norm": 0.5175875425338745, + "learning_rate": 4.846881235247011e-05, + "loss": 0.419, + "step": 104280 + }, + { + "epoch": 2.3212695868945867, + "grad_norm": 0.5474359393119812, + "learning_rate": 4.8438388273622834e-05, + "loss": 0.4922, + "step": 104290 + }, + { + "epoch": 2.3214921652421654, + "grad_norm": 0.4498869478702545, + "learning_rate": 4.840797243075757e-05, + "loss": 0.3817, + "step": 104300 + }, + { + "epoch": 2.3217147435897436, + "grad_norm": 0.9331063628196716, + "learning_rate": 4.837756482552718e-05, + "loss": 0.4962, + "step": 104310 + }, + { + "epoch": 2.321937321937322, + "grad_norm": 0.5544161200523376, + "learning_rate": 4.834716545958402e-05, + "loss": 0.5001, + "step": 104320 + }, + { + "epoch": 2.3221599002849, + "grad_norm": 0.7270580530166626, + "learning_rate": 4.831677433458006e-05, + "loss": 0.5085, + "step": 104330 + }, + { + "epoch": 2.3223824786324787, + "grad_norm": 0.6028932929039001, + "learning_rate": 4.82863914521668e-05, + "loss": 0.4596, + "step": 104340 + }, + { + "epoch": 2.322605056980057, + "grad_norm": 0.6653586626052856, + "learning_rate": 4.825601681399518e-05, + "loss": 0.4395, + "step": 104350 + }, + { + "epoch": 2.322827635327635, + "grad_norm": 0.4627334177494049, + "learning_rate": 4.822565042171583e-05, + "loss": 0.4488, + "step": 104360 + }, + { + "epoch": 2.323050213675214, + "grad_norm": 0.502573549747467, + "learning_rate": 4.819529227697894e-05, + "loss": 0.3895, + "step": 104370 + }, + { + "epoch": 2.323272792022792, + "grad_norm": 0.5004000663757324, + "learning_rate": 4.816494238143407e-05, + "loss": 0.4488, + "step": 104380 + }, + { + "epoch": 2.3234953703703702, + "grad_norm": 0.5961329936981201, + "learning_rate": 4.813460073673055e-05, + "loss": 0.3853, + "step": 104390 + }, + { + "epoch": 2.323717948717949, + "grad_norm": 0.8065715432167053, + "learning_rate": 4.810426734451714e-05, + "loss": 0.3902, + "step": 104400 + }, + { + "epoch": 2.323940527065527, + "grad_norm": 0.6661033034324646, + "learning_rate": 4.807394220644219e-05, + "loss": 0.4262, + "step": 104410 + }, + { + "epoch": 2.3241631054131053, + "grad_norm": 0.6344094276428223, + "learning_rate": 4.804362532415358e-05, + "loss": 0.359, + "step": 104420 + }, + { + "epoch": 2.324385683760684, + "grad_norm": 0.4737999141216278, + "learning_rate": 4.801331669929885e-05, + "loss": 0.5379, + "step": 104430 + }, + { + "epoch": 2.324608262108262, + "grad_norm": 0.8009842038154602, + "learning_rate": 4.798301633352484e-05, + "loss": 0.5161, + "step": 104440 + }, + { + "epoch": 2.3248308404558404, + "grad_norm": 0.5520380139350891, + "learning_rate": 4.7952724228478164e-05, + "loss": 0.4088, + "step": 104450 + }, + { + "epoch": 2.3250534188034186, + "grad_norm": 0.4492236375808716, + "learning_rate": 4.792244038580494e-05, + "loss": 0.5594, + "step": 104460 + }, + { + "epoch": 2.3252759971509973, + "grad_norm": 0.6150158047676086, + "learning_rate": 4.789216480715084e-05, + "loss": 0.3366, + "step": 104470 + }, + { + "epoch": 2.3254985754985755, + "grad_norm": 0.5869104862213135, + "learning_rate": 4.786189749416099e-05, + "loss": 0.5082, + "step": 104480 + }, + { + "epoch": 2.3257211538461537, + "grad_norm": 0.5086728930473328, + "learning_rate": 4.7831638448480176e-05, + "loss": 0.4535, + "step": 104490 + }, + { + "epoch": 2.325943732193732, + "grad_norm": 1.0462831258773804, + "learning_rate": 4.780138767175273e-05, + "loss": 0.5004, + "step": 104500 + }, + { + "epoch": 2.3261663105413106, + "grad_norm": 0.4210734963417053, + "learning_rate": 4.777114516562244e-05, + "loss": 0.4724, + "step": 104510 + }, + { + "epoch": 2.326388888888889, + "grad_norm": 0.5561769008636475, + "learning_rate": 4.774091093173274e-05, + "loss": 0.499, + "step": 104520 + }, + { + "epoch": 2.326611467236467, + "grad_norm": 0.4487607777118683, + "learning_rate": 4.771068497172657e-05, + "loss": 0.4394, + "step": 104530 + }, + { + "epoch": 2.3268340455840457, + "grad_norm": 1.0502511262893677, + "learning_rate": 4.7680467287246465e-05, + "loss": 0.388, + "step": 104540 + }, + { + "epoch": 2.327056623931624, + "grad_norm": 0.5806379318237305, + "learning_rate": 4.765025787993445e-05, + "loss": 0.5453, + "step": 104550 + }, + { + "epoch": 2.327279202279202, + "grad_norm": 0.5412917137145996, + "learning_rate": 4.7620056751432155e-05, + "loss": 0.4893, + "step": 104560 + }, + { + "epoch": 2.327501780626781, + "grad_norm": 0.48680606484413147, + "learning_rate": 4.758986390338076e-05, + "loss": 0.497, + "step": 104570 + }, + { + "epoch": 2.327724358974359, + "grad_norm": 0.6453859806060791, + "learning_rate": 4.7559679337420894e-05, + "loss": 0.5302, + "step": 104580 + }, + { + "epoch": 2.3279469373219372, + "grad_norm": 0.5900819301605225, + "learning_rate": 4.752950305519282e-05, + "loss": 0.4153, + "step": 104590 + }, + { + "epoch": 2.328169515669516, + "grad_norm": 0.7351835370063782, + "learning_rate": 4.7499335058336437e-05, + "loss": 0.4622, + "step": 104600 + }, + { + "epoch": 2.328392094017094, + "grad_norm": 0.5503653287887573, + "learning_rate": 4.746917534849098e-05, + "loss": 0.4657, + "step": 104610 + }, + { + "epoch": 2.3286146723646723, + "grad_norm": 0.4437830150127411, + "learning_rate": 4.74390239272954e-05, + "loss": 0.4675, + "step": 104620 + }, + { + "epoch": 2.3288372507122506, + "grad_norm": 0.7353461384773254, + "learning_rate": 4.740888079638815e-05, + "loss": 0.3839, + "step": 104630 + }, + { + "epoch": 2.3290598290598292, + "grad_norm": 0.8295828700065613, + "learning_rate": 4.737874595740728e-05, + "loss": 0.4533, + "step": 104640 + }, + { + "epoch": 2.3292824074074074, + "grad_norm": 0.6678076386451721, + "learning_rate": 4.734861941199025e-05, + "loss": 0.4824, + "step": 104650 + }, + { + "epoch": 2.3295049857549857, + "grad_norm": 0.5148504972457886, + "learning_rate": 4.7318501161774206e-05, + "loss": 0.4704, + "step": 104660 + }, + { + "epoch": 2.329727564102564, + "grad_norm": 0.5634397268295288, + "learning_rate": 4.728839120839581e-05, + "loss": 0.475, + "step": 104670 + }, + { + "epoch": 2.3299501424501425, + "grad_norm": 0.5880442261695862, + "learning_rate": 4.725828955349123e-05, + "loss": 0.428, + "step": 104680 + }, + { + "epoch": 2.3301727207977208, + "grad_norm": 0.5846168994903564, + "learning_rate": 4.722819619869625e-05, + "loss": 0.3902, + "step": 104690 + }, + { + "epoch": 2.330395299145299, + "grad_norm": 0.7889565229415894, + "learning_rate": 4.7198111145646165e-05, + "loss": 0.5206, + "step": 104700 + }, + { + "epoch": 2.3306178774928776, + "grad_norm": 0.5713897943496704, + "learning_rate": 4.7168034395975834e-05, + "loss": 0.5043, + "step": 104710 + }, + { + "epoch": 2.330840455840456, + "grad_norm": 0.49295225739479065, + "learning_rate": 4.713796595131961e-05, + "loss": 0.3995, + "step": 104720 + }, + { + "epoch": 2.331063034188034, + "grad_norm": 0.7045540809631348, + "learning_rate": 4.710790581331148e-05, + "loss": 0.4757, + "step": 104730 + }, + { + "epoch": 2.3312856125356127, + "grad_norm": 0.5422503352165222, + "learning_rate": 4.707785398358486e-05, + "loss": 0.3957, + "step": 104740 + }, + { + "epoch": 2.331508190883191, + "grad_norm": 0.6096838116645813, + "learning_rate": 4.704781046377285e-05, + "loss": 0.4897, + "step": 104750 + }, + { + "epoch": 2.331730769230769, + "grad_norm": 0.6194683313369751, + "learning_rate": 4.701777525550803e-05, + "loss": 0.5267, + "step": 104760 + }, + { + "epoch": 2.331953347578348, + "grad_norm": 0.6207007169723511, + "learning_rate": 4.698774836042254e-05, + "loss": 0.4713, + "step": 104770 + }, + { + "epoch": 2.332175925925926, + "grad_norm": 0.6871147751808167, + "learning_rate": 4.695772978014812e-05, + "loss": 0.4667, + "step": 104780 + }, + { + "epoch": 2.3323985042735043, + "grad_norm": 0.5978109240531921, + "learning_rate": 4.692771951631589e-05, + "loss": 0.4706, + "step": 104790 + }, + { + "epoch": 2.3326210826210825, + "grad_norm": 0.7385008335113525, + "learning_rate": 4.689771757055672e-05, + "loss": 0.5148, + "step": 104800 + }, + { + "epoch": 2.332843660968661, + "grad_norm": 0.5949559211730957, + "learning_rate": 4.6867723944500874e-05, + "loss": 0.4718, + "step": 104810 + }, + { + "epoch": 2.3330662393162394, + "grad_norm": 0.529515266418457, + "learning_rate": 4.6837738639778294e-05, + "loss": 0.3999, + "step": 104820 + }, + { + "epoch": 2.3332888176638176, + "grad_norm": 0.654148519039154, + "learning_rate": 4.680776165801837e-05, + "loss": 0.4769, + "step": 104830 + }, + { + "epoch": 2.333511396011396, + "grad_norm": 0.6815111637115479, + "learning_rate": 4.677779300085008e-05, + "loss": 0.4645, + "step": 104840 + }, + { + "epoch": 2.3337339743589745, + "grad_norm": 0.5101546049118042, + "learning_rate": 4.6747832669902035e-05, + "loss": 0.4376, + "step": 104850 + }, + { + "epoch": 2.3339565527065527, + "grad_norm": 0.4611969292163849, + "learning_rate": 4.6717880666802206e-05, + "loss": 0.386, + "step": 104860 + }, + { + "epoch": 2.334179131054131, + "grad_norm": 0.6382802128791809, + "learning_rate": 4.668793699317815e-05, + "loss": 0.55, + "step": 104870 + }, + { + "epoch": 2.3344017094017095, + "grad_norm": 0.7732513546943665, + "learning_rate": 4.665800165065712e-05, + "loss": 0.5326, + "step": 104880 + }, + { + "epoch": 2.3346242877492878, + "grad_norm": 0.5758798122406006, + "learning_rate": 4.66280746408658e-05, + "loss": 0.4083, + "step": 104890 + }, + { + "epoch": 2.334846866096866, + "grad_norm": 0.5310105681419373, + "learning_rate": 4.659815596543049e-05, + "loss": 0.3584, + "step": 104900 + }, + { + "epoch": 2.3350694444444446, + "grad_norm": 0.657589316368103, + "learning_rate": 4.656824562597695e-05, + "loss": 0.5431, + "step": 104910 + }, + { + "epoch": 2.335292022792023, + "grad_norm": 0.5576604008674622, + "learning_rate": 4.653834362413059e-05, + "loss": 0.5104, + "step": 104920 + }, + { + "epoch": 2.335514601139601, + "grad_norm": 0.32664191722869873, + "learning_rate": 4.6508449961516224e-05, + "loss": 0.3872, + "step": 104930 + }, + { + "epoch": 2.3357371794871793, + "grad_norm": 0.6069530248641968, + "learning_rate": 4.647856463975835e-05, + "loss": 0.4422, + "step": 104940 + }, + { + "epoch": 2.335959757834758, + "grad_norm": 0.43368053436279297, + "learning_rate": 4.644868766048094e-05, + "loss": 0.5391, + "step": 104950 + }, + { + "epoch": 2.336182336182336, + "grad_norm": 0.7271152138710022, + "learning_rate": 4.641881902530754e-05, + "loss": 0.4033, + "step": 104960 + }, + { + "epoch": 2.3364049145299144, + "grad_norm": 0.587011456489563, + "learning_rate": 4.6388958735861246e-05, + "loss": 0.4844, + "step": 104970 + }, + { + "epoch": 2.336627492877493, + "grad_norm": 0.8586376905441284, + "learning_rate": 4.6359106793764676e-05, + "loss": 0.366, + "step": 104980 + }, + { + "epoch": 2.3368500712250713, + "grad_norm": 0.7252295613288879, + "learning_rate": 4.6329263200640105e-05, + "loss": 0.4419, + "step": 104990 + }, + { + "epoch": 2.3370726495726495, + "grad_norm": 0.5614856481552124, + "learning_rate": 4.6299427958109155e-05, + "loss": 0.5309, + "step": 105000 + }, + { + "epoch": 2.3372952279202277, + "grad_norm": 0.5738231539726257, + "learning_rate": 4.626960106779306e-05, + "loss": 0.4925, + "step": 105010 + }, + { + "epoch": 2.3375178062678064, + "grad_norm": 0.5994122624397278, + "learning_rate": 4.623978253131267e-05, + "loss": 0.4677, + "step": 105020 + }, + { + "epoch": 2.3377403846153846, + "grad_norm": 0.813988208770752, + "learning_rate": 4.620997235028841e-05, + "loss": 0.4449, + "step": 105030 + }, + { + "epoch": 2.337962962962963, + "grad_norm": 0.4093601107597351, + "learning_rate": 4.6180170526340114e-05, + "loss": 0.4821, + "step": 105040 + }, + { + "epoch": 2.3381855413105415, + "grad_norm": 0.6398217082023621, + "learning_rate": 4.615037706108731e-05, + "loss": 0.4613, + "step": 105050 + }, + { + "epoch": 2.3384081196581197, + "grad_norm": 0.669044554233551, + "learning_rate": 4.6120591956148994e-05, + "loss": 0.4632, + "step": 105060 + }, + { + "epoch": 2.338630698005698, + "grad_norm": 0.587679922580719, + "learning_rate": 4.609081521314365e-05, + "loss": 0.3624, + "step": 105070 + }, + { + "epoch": 2.3388532763532766, + "grad_norm": 0.6691391468048096, + "learning_rate": 4.606104683368937e-05, + "loss": 0.4374, + "step": 105080 + }, + { + "epoch": 2.3390758547008548, + "grad_norm": 0.5915713906288147, + "learning_rate": 4.603128681940385e-05, + "loss": 0.3768, + "step": 105090 + }, + { + "epoch": 2.339298433048433, + "grad_norm": 0.5158896446228027, + "learning_rate": 4.6001535171904245e-05, + "loss": 0.5167, + "step": 105100 + }, + { + "epoch": 2.339521011396011, + "grad_norm": 0.5442484617233276, + "learning_rate": 4.5971791892807293e-05, + "loss": 0.4577, + "step": 105110 + }, + { + "epoch": 2.33974358974359, + "grad_norm": 0.3447693884372711, + "learning_rate": 4.594205698372931e-05, + "loss": 0.4921, + "step": 105120 + }, + { + "epoch": 2.339966168091168, + "grad_norm": 0.6204404234886169, + "learning_rate": 4.5912330446286e-05, + "loss": 0.4168, + "step": 105130 + }, + { + "epoch": 2.3401887464387463, + "grad_norm": 0.5360138416290283, + "learning_rate": 4.588261228209287e-05, + "loss": 0.6229, + "step": 105140 + }, + { + "epoch": 2.3402777777777777, + "eval_loss": 0.5290852189064026, + "eval_runtime": 337.4554, + "eval_samples_per_second": 7.008, + "eval_steps_per_second": 7.008, + "step": 105144 + }, + { + "epoch": 2.340411324786325, + "grad_norm": 0.3526182770729065, + "learning_rate": 4.5852902492764705e-05, + "loss": 0.3732, + "step": 105150 + }, + { + "epoch": 2.340633903133903, + "grad_norm": 0.5401740074157715, + "learning_rate": 4.5823201079916e-05, + "loss": 0.5155, + "step": 105160 + }, + { + "epoch": 2.3408564814814814, + "grad_norm": 0.8644507527351379, + "learning_rate": 4.579350804516076e-05, + "loss": 0.5004, + "step": 105170 + }, + { + "epoch": 2.3410790598290596, + "grad_norm": 0.6037582159042358, + "learning_rate": 4.576382339011254e-05, + "loss": 0.5717, + "step": 105180 + }, + { + "epoch": 2.3413016381766383, + "grad_norm": 0.6918118000030518, + "learning_rate": 4.5734147116384395e-05, + "loss": 0.5744, + "step": 105190 + }, + { + "epoch": 2.3415242165242165, + "grad_norm": 0.5624716877937317, + "learning_rate": 4.570447922558907e-05, + "loss": 0.5157, + "step": 105200 + }, + { + "epoch": 2.3417467948717947, + "grad_norm": 0.6044760346412659, + "learning_rate": 4.567481971933858e-05, + "loss": 0.5491, + "step": 105210 + }, + { + "epoch": 2.3419693732193734, + "grad_norm": 0.6322526931762695, + "learning_rate": 4.564516859924475e-05, + "loss": 0.4647, + "step": 105220 + }, + { + "epoch": 2.3421919515669516, + "grad_norm": 0.7589077353477478, + "learning_rate": 4.561552586691879e-05, + "loss": 0.5316, + "step": 105230 + }, + { + "epoch": 2.34241452991453, + "grad_norm": 0.4485898017883301, + "learning_rate": 4.558589152397155e-05, + "loss": 0.4465, + "step": 105240 + }, + { + "epoch": 2.3426371082621085, + "grad_norm": 0.49135929346084595, + "learning_rate": 4.555626557201338e-05, + "loss": 0.4638, + "step": 105250 + }, + { + "epoch": 2.3428596866096867, + "grad_norm": 0.5671507716178894, + "learning_rate": 4.552664801265421e-05, + "loss": 0.3968, + "step": 105260 + }, + { + "epoch": 2.343082264957265, + "grad_norm": 0.6243501305580139, + "learning_rate": 4.54970388475034e-05, + "loss": 0.578, + "step": 105270 + }, + { + "epoch": 2.343304843304843, + "grad_norm": 0.43660518527030945, + "learning_rate": 4.546743807817004e-05, + "loss": 0.51, + "step": 105280 + }, + { + "epoch": 2.343527421652422, + "grad_norm": 0.7414266467094421, + "learning_rate": 4.5437845706262546e-05, + "loss": 0.5355, + "step": 105290 + }, + { + "epoch": 2.34375, + "grad_norm": 0.478127658367157, + "learning_rate": 4.5408261733389054e-05, + "loss": 0.4781, + "step": 105300 + }, + { + "epoch": 2.343972578347578, + "grad_norm": 0.7516529560089111, + "learning_rate": 4.537868616115717e-05, + "loss": 0.525, + "step": 105310 + }, + { + "epoch": 2.344195156695157, + "grad_norm": 0.5840153098106384, + "learning_rate": 4.534911899117405e-05, + "loss": 0.4286, + "step": 105320 + }, + { + "epoch": 2.344417735042735, + "grad_norm": 0.44853419065475464, + "learning_rate": 4.53195602250464e-05, + "loss": 0.5242, + "step": 105330 + }, + { + "epoch": 2.3446403133903133, + "grad_norm": 0.4717266857624054, + "learning_rate": 4.529000986438055e-05, + "loss": 0.3967, + "step": 105340 + }, + { + "epoch": 2.3448628917378915, + "grad_norm": 0.641845166683197, + "learning_rate": 4.526046791078216e-05, + "loss": 0.5133, + "step": 105350 + }, + { + "epoch": 2.34508547008547, + "grad_norm": 0.6157569885253906, + "learning_rate": 4.523093436585659e-05, + "loss": 0.448, + "step": 105360 + }, + { + "epoch": 2.3453080484330484, + "grad_norm": 0.57972252368927, + "learning_rate": 4.520140923120877e-05, + "loss": 0.4681, + "step": 105370 + }, + { + "epoch": 2.3455306267806266, + "grad_norm": 0.6386918425559998, + "learning_rate": 4.517189250844309e-05, + "loss": 0.4064, + "step": 105380 + }, + { + "epoch": 2.3457532051282053, + "grad_norm": 0.34322062134742737, + "learning_rate": 4.514238419916359e-05, + "loss": 0.3958, + "step": 105390 + }, + { + "epoch": 2.3459757834757835, + "grad_norm": 0.3749435842037201, + "learning_rate": 4.5112884304973626e-05, + "loss": 0.4942, + "step": 105400 + }, + { + "epoch": 2.3461983618233617, + "grad_norm": 0.7757494449615479, + "learning_rate": 4.508339282747633e-05, + "loss": 0.6077, + "step": 105410 + }, + { + "epoch": 2.3464209401709404, + "grad_norm": 0.6120235919952393, + "learning_rate": 4.505390976827437e-05, + "loss": 0.37, + "step": 105420 + }, + { + "epoch": 2.3466435185185186, + "grad_norm": 0.3525809049606323, + "learning_rate": 4.502443512896972e-05, + "loss": 0.4643, + "step": 105430 + }, + { + "epoch": 2.346866096866097, + "grad_norm": 0.5600558519363403, + "learning_rate": 4.499496891116413e-05, + "loss": 0.5139, + "step": 105440 + }, + { + "epoch": 2.347088675213675, + "grad_norm": 0.7498864531517029, + "learning_rate": 4.4965511116458836e-05, + "loss": 0.4153, + "step": 105450 + }, + { + "epoch": 2.3473112535612537, + "grad_norm": 0.5569818615913391, + "learning_rate": 4.493606174645457e-05, + "loss": 0.4323, + "step": 105460 + }, + { + "epoch": 2.347533831908832, + "grad_norm": 0.72532057762146, + "learning_rate": 4.490662080275165e-05, + "loss": 0.4834, + "step": 105470 + }, + { + "epoch": 2.34775641025641, + "grad_norm": 0.7324677109718323, + "learning_rate": 4.487718828695e-05, + "loss": 0.5242, + "step": 105480 + }, + { + "epoch": 2.347978988603989, + "grad_norm": 0.34302493929862976, + "learning_rate": 4.484776420064885e-05, + "loss": 0.3734, + "step": 105490 + }, + { + "epoch": 2.348201566951567, + "grad_norm": 1.0544486045837402, + "learning_rate": 4.481834854544722e-05, + "loss": 0.4965, + "step": 105500 + }, + { + "epoch": 2.3484241452991452, + "grad_norm": 0.44091662764549255, + "learning_rate": 4.4788941322943555e-05, + "loss": 0.4322, + "step": 105510 + }, + { + "epoch": 2.3486467236467234, + "grad_norm": 0.4288344383239746, + "learning_rate": 4.475954253473596e-05, + "loss": 0.405, + "step": 105520 + }, + { + "epoch": 2.348869301994302, + "grad_norm": 0.47448357939720154, + "learning_rate": 4.473015218242182e-05, + "loss": 0.4113, + "step": 105530 + }, + { + "epoch": 2.3490918803418803, + "grad_norm": 0.5844582319259644, + "learning_rate": 4.470077026759834e-05, + "loss": 0.4803, + "step": 105540 + }, + { + "epoch": 2.3493144586894585, + "grad_norm": 0.5927007794380188, + "learning_rate": 4.4671396791862144e-05, + "loss": 0.4546, + "step": 105550 + }, + { + "epoch": 2.349537037037037, + "grad_norm": 0.5077242851257324, + "learning_rate": 4.464203175680943e-05, + "loss": 0.4778, + "step": 105560 + }, + { + "epoch": 2.3497596153846154, + "grad_norm": 0.5459374785423279, + "learning_rate": 4.4612675164035864e-05, + "loss": 0.5227, + "step": 105570 + }, + { + "epoch": 2.3499821937321936, + "grad_norm": 0.9779196381568909, + "learning_rate": 4.458332701513672e-05, + "loss": 0.4266, + "step": 105580 + }, + { + "epoch": 2.3502047720797723, + "grad_norm": 0.4522073566913605, + "learning_rate": 4.4553987311706836e-05, + "loss": 0.3806, + "step": 105590 + }, + { + "epoch": 2.3504273504273505, + "grad_norm": 0.44010409712791443, + "learning_rate": 4.452465605534053e-05, + "loss": 0.434, + "step": 105600 + }, + { + "epoch": 2.3506499287749287, + "grad_norm": 0.44877657294273376, + "learning_rate": 4.4495333247631686e-05, + "loss": 0.4163, + "step": 105610 + }, + { + "epoch": 2.350872507122507, + "grad_norm": 0.5671922564506531, + "learning_rate": 4.4466018890173785e-05, + "loss": 0.4689, + "step": 105620 + }, + { + "epoch": 2.3510950854700856, + "grad_norm": 0.7625622153282166, + "learning_rate": 4.443671298455969e-05, + "loss": 0.3857, + "step": 105630 + }, + { + "epoch": 2.351317663817664, + "grad_norm": 0.4385626018047333, + "learning_rate": 4.440741553238197e-05, + "loss": 0.3915, + "step": 105640 + }, + { + "epoch": 2.351540242165242, + "grad_norm": 0.795430600643158, + "learning_rate": 4.4378126535232725e-05, + "loss": 0.4764, + "step": 105650 + }, + { + "epoch": 2.3517628205128207, + "grad_norm": 0.5816218852996826, + "learning_rate": 4.434884599470341e-05, + "loss": 0.5346, + "step": 105660 + }, + { + "epoch": 2.351985398860399, + "grad_norm": 0.8353447914123535, + "learning_rate": 4.431957391238521e-05, + "loss": 0.4902, + "step": 105670 + }, + { + "epoch": 2.352207977207977, + "grad_norm": 0.6605375409126282, + "learning_rate": 4.429031028986885e-05, + "loss": 0.4659, + "step": 105680 + }, + { + "epoch": 2.3524305555555554, + "grad_norm": 0.5393868684768677, + "learning_rate": 4.426105512874448e-05, + "loss": 0.47, + "step": 105690 + }, + { + "epoch": 2.352653133903134, + "grad_norm": 0.40643393993377686, + "learning_rate": 4.4231808430601925e-05, + "loss": 0.4405, + "step": 105700 + }, + { + "epoch": 2.3528757122507122, + "grad_norm": 0.7156741619110107, + "learning_rate": 4.420257019703036e-05, + "loss": 0.471, + "step": 105710 + }, + { + "epoch": 2.3530982905982905, + "grad_norm": 0.42996159195899963, + "learning_rate": 4.417334042961867e-05, + "loss": 0.3584, + "step": 105720 + }, + { + "epoch": 2.353320868945869, + "grad_norm": 0.6254279613494873, + "learning_rate": 4.414411912995522e-05, + "loss": 0.4682, + "step": 105730 + }, + { + "epoch": 2.3535434472934473, + "grad_norm": 0.4967123866081238, + "learning_rate": 4.4114906299627934e-05, + "loss": 0.427, + "step": 105740 + }, + { + "epoch": 2.3537660256410255, + "grad_norm": 0.47954392433166504, + "learning_rate": 4.408570194022426e-05, + "loss": 0.4278, + "step": 105750 + }, + { + "epoch": 2.353988603988604, + "grad_norm": 0.6673099994659424, + "learning_rate": 4.4056506053331224e-05, + "loss": 0.4725, + "step": 105760 + }, + { + "epoch": 2.3542111823361824, + "grad_norm": 0.7563617825508118, + "learning_rate": 4.4027318640535267e-05, + "loss": 0.4144, + "step": 105770 + }, + { + "epoch": 2.3544337606837606, + "grad_norm": 0.6754289269447327, + "learning_rate": 4.3998139703422544e-05, + "loss": 0.5307, + "step": 105780 + }, + { + "epoch": 2.354656339031339, + "grad_norm": 0.6166078448295593, + "learning_rate": 4.396896924357858e-05, + "loss": 0.4877, + "step": 105790 + }, + { + "epoch": 2.3548789173789175, + "grad_norm": 0.6370611190795898, + "learning_rate": 4.393980726258855e-05, + "loss": 0.5551, + "step": 105800 + }, + { + "epoch": 2.3551014957264957, + "grad_norm": 0.7064977288246155, + "learning_rate": 4.391065376203716e-05, + "loss": 0.5666, + "step": 105810 + }, + { + "epoch": 2.355324074074074, + "grad_norm": 0.5430741906166077, + "learning_rate": 4.3881508743508606e-05, + "loss": 0.4114, + "step": 105820 + }, + { + "epoch": 2.355546652421652, + "grad_norm": 0.6265538334846497, + "learning_rate": 4.3852372208586665e-05, + "loss": 0.574, + "step": 105830 + }, + { + "epoch": 2.355769230769231, + "grad_norm": 0.31288737058639526, + "learning_rate": 4.3823244158854725e-05, + "loss": 0.5192, + "step": 105840 + }, + { + "epoch": 2.355991809116809, + "grad_norm": 0.8239595293998718, + "learning_rate": 4.379412459589549e-05, + "loss": 0.5916, + "step": 105850 + }, + { + "epoch": 2.3562143874643873, + "grad_norm": 0.4982580840587616, + "learning_rate": 4.3765013521291385e-05, + "loss": 0.4481, + "step": 105860 + }, + { + "epoch": 2.356436965811966, + "grad_norm": 0.43735408782958984, + "learning_rate": 4.373591093662437e-05, + "loss": 0.5103, + "step": 105870 + }, + { + "epoch": 2.356659544159544, + "grad_norm": 0.774772047996521, + "learning_rate": 4.370681684347586e-05, + "loss": 0.4853, + "step": 105880 + }, + { + "epoch": 2.3568821225071224, + "grad_norm": 0.342276006937027, + "learning_rate": 4.367773124342689e-05, + "loss": 0.4671, + "step": 105890 + }, + { + "epoch": 2.357104700854701, + "grad_norm": 0.7277559638023376, + "learning_rate": 4.364865413805801e-05, + "loss": 0.4475, + "step": 105900 + }, + { + "epoch": 2.3573272792022792, + "grad_norm": 0.6659511923789978, + "learning_rate": 4.361958552894927e-05, + "loss": 0.4472, + "step": 105910 + }, + { + "epoch": 2.3575498575498575, + "grad_norm": 0.6491039395332336, + "learning_rate": 4.3590525417680204e-05, + "loss": 0.3717, + "step": 105920 + }, + { + "epoch": 2.357772435897436, + "grad_norm": 0.4370926320552826, + "learning_rate": 4.3561473805830045e-05, + "loss": 0.3737, + "step": 105930 + }, + { + "epoch": 2.3579950142450143, + "grad_norm": 0.7258709073066711, + "learning_rate": 4.3532430694977454e-05, + "loss": 0.4774, + "step": 105940 + }, + { + "epoch": 2.3582175925925926, + "grad_norm": 0.607122540473938, + "learning_rate": 4.350339608670066e-05, + "loss": 0.4137, + "step": 105950 + }, + { + "epoch": 2.3584401709401708, + "grad_norm": 0.6639084815979004, + "learning_rate": 4.347436998257746e-05, + "loss": 0.5365, + "step": 105960 + }, + { + "epoch": 2.3586627492877494, + "grad_norm": 0.5172412991523743, + "learning_rate": 4.344535238418512e-05, + "loss": 0.3785, + "step": 105970 + }, + { + "epoch": 2.3588853276353277, + "grad_norm": 0.5365101099014282, + "learning_rate": 4.3416343293100556e-05, + "loss": 0.5023, + "step": 105980 + }, + { + "epoch": 2.359107905982906, + "grad_norm": 0.5570228099822998, + "learning_rate": 4.338734271090001e-05, + "loss": 0.4382, + "step": 105990 + }, + { + "epoch": 2.359330484330484, + "grad_norm": 0.4990769624710083, + "learning_rate": 4.335835063915949e-05, + "loss": 0.3926, + "step": 106000 + }, + { + "epoch": 2.3595530626780628, + "grad_norm": 0.5243441462516785, + "learning_rate": 4.332936707945443e-05, + "loss": 0.5187, + "step": 106010 + }, + { + "epoch": 2.359775641025641, + "grad_norm": 0.4713497459888458, + "learning_rate": 4.3300392033359804e-05, + "loss": 0.5061, + "step": 106020 + }, + { + "epoch": 2.359998219373219, + "grad_norm": 0.6179198026657104, + "learning_rate": 4.327142550245018e-05, + "loss": 0.5219, + "step": 106030 + }, + { + "epoch": 2.360220797720798, + "grad_norm": 0.5947954058647156, + "learning_rate": 4.3242467488299635e-05, + "loss": 0.4931, + "step": 106040 + }, + { + "epoch": 2.360443376068376, + "grad_norm": 0.4120878279209137, + "learning_rate": 4.321351799248172e-05, + "loss": 0.5027, + "step": 106050 + }, + { + "epoch": 2.3606659544159543, + "grad_norm": 0.5536032319068909, + "learning_rate": 4.318457701656955e-05, + "loss": 0.5718, + "step": 106060 + }, + { + "epoch": 2.360888532763533, + "grad_norm": 0.7351379990577698, + "learning_rate": 4.315564456213585e-05, + "loss": 0.3884, + "step": 106070 + }, + { + "epoch": 2.361111111111111, + "grad_norm": 0.6175481677055359, + "learning_rate": 4.3126720630752804e-05, + "loss": 0.4066, + "step": 106080 + }, + { + "epoch": 2.3613336894586894, + "grad_norm": 0.5883055329322815, + "learning_rate": 4.3097805223992204e-05, + "loss": 0.3736, + "step": 106090 + }, + { + "epoch": 2.361556267806268, + "grad_norm": 0.5379427075386047, + "learning_rate": 4.306889834342529e-05, + "loss": 0.5544, + "step": 106100 + }, + { + "epoch": 2.3617788461538463, + "grad_norm": 0.461297869682312, + "learning_rate": 4.303999999062298e-05, + "loss": 0.4831, + "step": 106110 + }, + { + "epoch": 2.3620014245014245, + "grad_norm": 0.7035804986953735, + "learning_rate": 4.301111016715551e-05, + "loss": 0.4963, + "step": 106120 + }, + { + "epoch": 2.3622240028490027, + "grad_norm": 0.6022542715072632, + "learning_rate": 4.2982228874592824e-05, + "loss": 0.5043, + "step": 106130 + }, + { + "epoch": 2.3624465811965814, + "grad_norm": 0.6967162489891052, + "learning_rate": 4.295335611450435e-05, + "loss": 0.4758, + "step": 106140 + }, + { + "epoch": 2.3626691595441596, + "grad_norm": 0.9202632308006287, + "learning_rate": 4.2924491888459087e-05, + "loss": 0.4852, + "step": 106150 + }, + { + "epoch": 2.362891737891738, + "grad_norm": 0.605948269367218, + "learning_rate": 4.2895636198025524e-05, + "loss": 0.4538, + "step": 106160 + }, + { + "epoch": 2.363114316239316, + "grad_norm": 0.43158113956451416, + "learning_rate": 4.286678904477175e-05, + "loss": 0.4009, + "step": 106170 + }, + { + "epoch": 2.3633368945868947, + "grad_norm": 0.37621960043907166, + "learning_rate": 4.283795043026524e-05, + "loss": 0.301, + "step": 106180 + }, + { + "epoch": 2.363559472934473, + "grad_norm": 0.5921512246131897, + "learning_rate": 4.280912035607321e-05, + "loss": 0.4489, + "step": 106190 + }, + { + "epoch": 2.363782051282051, + "grad_norm": 0.7292236685752869, + "learning_rate": 4.2780298823762224e-05, + "loss": 0.5285, + "step": 106200 + }, + { + "epoch": 2.3640046296296298, + "grad_norm": 0.35154908895492554, + "learning_rate": 4.275148583489847e-05, + "loss": 0.3637, + "step": 106210 + }, + { + "epoch": 2.364227207977208, + "grad_norm": 0.6260533928871155, + "learning_rate": 4.2722681391047734e-05, + "loss": 0.4117, + "step": 106220 + }, + { + "epoch": 2.364449786324786, + "grad_norm": 0.5895659923553467, + "learning_rate": 4.269388549377524e-05, + "loss": 0.522, + "step": 106230 + }, + { + "epoch": 2.364672364672365, + "grad_norm": 0.7136719822883606, + "learning_rate": 4.266509814464581e-05, + "loss": 0.4461, + "step": 106240 + }, + { + "epoch": 2.364894943019943, + "grad_norm": 0.3097321093082428, + "learning_rate": 4.2636319345223764e-05, + "loss": 0.4203, + "step": 106250 + }, + { + "epoch": 2.3651175213675213, + "grad_norm": 0.8221575617790222, + "learning_rate": 4.260754909707292e-05, + "loss": 0.4898, + "step": 106260 + }, + { + "epoch": 2.3653400997151, + "grad_norm": 0.613925576210022, + "learning_rate": 4.257878740175669e-05, + "loss": 0.4007, + "step": 106270 + }, + { + "epoch": 2.365562678062678, + "grad_norm": 0.5688052773475647, + "learning_rate": 4.2550034260838033e-05, + "loss": 0.4512, + "step": 106280 + }, + { + "epoch": 2.3657852564102564, + "grad_norm": 0.7149258852005005, + "learning_rate": 4.252128967587941e-05, + "loss": 0.4367, + "step": 106290 + }, + { + "epoch": 2.3660078347578346, + "grad_norm": 0.5755921602249146, + "learning_rate": 4.24925536484428e-05, + "loss": 0.4022, + "step": 106300 + }, + { + "epoch": 2.3662304131054133, + "grad_norm": 0.9494009613990784, + "learning_rate": 4.246382618008984e-05, + "loss": 0.4637, + "step": 106310 + }, + { + "epoch": 2.3664529914529915, + "grad_norm": 0.6266781091690063, + "learning_rate": 4.243510727238147e-05, + "loss": 0.4328, + "step": 106320 + }, + { + "epoch": 2.3666755698005697, + "grad_norm": 0.9528439044952393, + "learning_rate": 4.2406396926878423e-05, + "loss": 0.5745, + "step": 106330 + }, + { + "epoch": 2.366898148148148, + "grad_norm": 0.5351330637931824, + "learning_rate": 4.2377695145140714e-05, + "loss": 0.5367, + "step": 106340 + }, + { + "epoch": 2.3671207264957266, + "grad_norm": 0.8411067128181458, + "learning_rate": 4.23490019287281e-05, + "loss": 0.4613, + "step": 106350 + }, + { + "epoch": 2.367343304843305, + "grad_norm": 0.5001861453056335, + "learning_rate": 4.232031727919978e-05, + "loss": 0.5383, + "step": 106360 + }, + { + "epoch": 2.367565883190883, + "grad_norm": 0.6162530779838562, + "learning_rate": 4.2291641198114487e-05, + "loss": 0.454, + "step": 106370 + }, + { + "epoch": 2.3677884615384617, + "grad_norm": 0.7169169187545776, + "learning_rate": 4.2262973687030536e-05, + "loss": 0.4225, + "step": 106380 + }, + { + "epoch": 2.36801103988604, + "grad_norm": 0.23822492361068726, + "learning_rate": 4.2234314747505764e-05, + "loss": 0.3515, + "step": 106390 + }, + { + "epoch": 2.368233618233618, + "grad_norm": 0.5095281004905701, + "learning_rate": 4.220566438109743e-05, + "loss": 0.5625, + "step": 106400 + }, + { + "epoch": 2.3684561965811968, + "grad_norm": 0.513222336769104, + "learning_rate": 4.217702258936247e-05, + "loss": 0.5068, + "step": 106410 + }, + { + "epoch": 2.368678774928775, + "grad_norm": 0.4777398705482483, + "learning_rate": 4.21483893738573e-05, + "loss": 0.4588, + "step": 106420 + }, + { + "epoch": 2.368901353276353, + "grad_norm": 0.765155553817749, + "learning_rate": 4.211976473613788e-05, + "loss": 0.528, + "step": 106430 + }, + { + "epoch": 2.369123931623932, + "grad_norm": 0.6391275525093079, + "learning_rate": 4.209114867775974e-05, + "loss": 0.3648, + "step": 106440 + }, + { + "epoch": 2.36934650997151, + "grad_norm": 0.44867634773254395, + "learning_rate": 4.2062541200277794e-05, + "loss": 0.5285, + "step": 106450 + }, + { + "epoch": 2.3695690883190883, + "grad_norm": 0.6665791869163513, + "learning_rate": 4.2033942305246665e-05, + "loss": 0.4858, + "step": 106460 + }, + { + "epoch": 2.3697916666666665, + "grad_norm": 0.4603506922721863, + "learning_rate": 4.200535199422049e-05, + "loss": 0.4666, + "step": 106470 + }, + { + "epoch": 2.370014245014245, + "grad_norm": 0.5786306858062744, + "learning_rate": 4.197677026875275e-05, + "loss": 0.5452, + "step": 106480 + }, + { + "epoch": 2.3702368233618234, + "grad_norm": 0.4147084355354309, + "learning_rate": 4.194819713039668e-05, + "loss": 0.467, + "step": 106490 + }, + { + "epoch": 2.3704594017094016, + "grad_norm": 0.5698099136352539, + "learning_rate": 4.191963258070497e-05, + "loss": 0.3849, + "step": 106500 + }, + { + "epoch": 2.37068198005698, + "grad_norm": 0.4112623333930969, + "learning_rate": 4.1891076621229844e-05, + "loss": 0.4808, + "step": 106510 + }, + { + "epoch": 2.3709045584045585, + "grad_norm": 0.45197534561157227, + "learning_rate": 4.1862529253523053e-05, + "loss": 0.5485, + "step": 106520 + }, + { + "epoch": 2.3711271367521367, + "grad_norm": 0.419298380613327, + "learning_rate": 4.183399047913592e-05, + "loss": 0.502, + "step": 106530 + }, + { + "epoch": 2.371349715099715, + "grad_norm": 0.5625624656677246, + "learning_rate": 4.1805460299619184e-05, + "loss": 0.4655, + "step": 106540 + }, + { + "epoch": 2.3715722934472936, + "grad_norm": 0.6050288677215576, + "learning_rate": 4.1776938716523216e-05, + "loss": 0.5006, + "step": 106550 + }, + { + "epoch": 2.371794871794872, + "grad_norm": 0.4963371157646179, + "learning_rate": 4.1748425731397944e-05, + "loss": 0.4433, + "step": 106560 + }, + { + "epoch": 2.37201745014245, + "grad_norm": 0.7071536183357239, + "learning_rate": 4.171992134579281e-05, + "loss": 0.4348, + "step": 106570 + }, + { + "epoch": 2.3722400284900287, + "grad_norm": 0.5232623815536499, + "learning_rate": 4.169142556125669e-05, + "loss": 0.4173, + "step": 106580 + }, + { + "epoch": 2.372462606837607, + "grad_norm": 0.4696999490261078, + "learning_rate": 4.166293837933808e-05, + "loss": 0.4365, + "step": 106590 + }, + { + "epoch": 2.372685185185185, + "grad_norm": 0.7387834787368774, + "learning_rate": 4.1634459801585046e-05, + "loss": 0.478, + "step": 106600 + }, + { + "epoch": 2.372907763532764, + "grad_norm": 0.600752055644989, + "learning_rate": 4.1605989829545137e-05, + "loss": 0.4636, + "step": 106610 + }, + { + "epoch": 2.373130341880342, + "grad_norm": 0.5822334885597229, + "learning_rate": 4.157752846476537e-05, + "loss": 0.4002, + "step": 106620 + }, + { + "epoch": 2.37335292022792, + "grad_norm": 0.557744562625885, + "learning_rate": 4.154907570879238e-05, + "loss": 0.4866, + "step": 106630 + }, + { + "epoch": 2.3735754985754984, + "grad_norm": 0.6427473425865173, + "learning_rate": 4.152063156317236e-05, + "loss": 0.4592, + "step": 106640 + }, + { + "epoch": 2.373798076923077, + "grad_norm": 0.4670778214931488, + "learning_rate": 4.1492196029450934e-05, + "loss": 0.4491, + "step": 106650 + }, + { + "epoch": 2.3740206552706553, + "grad_norm": 0.5701555013656616, + "learning_rate": 4.1463769109173354e-05, + "loss": 0.5698, + "step": 106660 + }, + { + "epoch": 2.3742432336182335, + "grad_norm": 0.34977465867996216, + "learning_rate": 4.143535080388439e-05, + "loss": 0.4207, + "step": 106670 + }, + { + "epoch": 2.3744658119658117, + "grad_norm": 0.3792286813259125, + "learning_rate": 4.1406941115128193e-05, + "loss": 0.4499, + "step": 106680 + }, + { + "epoch": 2.3746883903133904, + "grad_norm": 0.36511972546577454, + "learning_rate": 4.137854004444868e-05, + "loss": 0.4308, + "step": 106690 + }, + { + "epoch": 2.3749109686609686, + "grad_norm": 0.67777019739151, + "learning_rate": 4.13501475933892e-05, + "loss": 0.3832, + "step": 106700 + }, + { + "epoch": 2.375133547008547, + "grad_norm": 0.44513630867004395, + "learning_rate": 4.132176376349251e-05, + "loss": 0.5664, + "step": 106710 + }, + { + "epoch": 2.3753561253561255, + "grad_norm": 0.5617483258247375, + "learning_rate": 4.129338855630109e-05, + "loss": 0.4931, + "step": 106720 + }, + { + "epoch": 2.3755787037037037, + "grad_norm": 0.4769580066204071, + "learning_rate": 4.126502197335684e-05, + "loss": 0.4935, + "step": 106730 + }, + { + "epoch": 2.375801282051282, + "grad_norm": 0.8353335857391357, + "learning_rate": 4.123666401620127e-05, + "loss": 0.3768, + "step": 106740 + }, + { + "epoch": 2.3760238603988606, + "grad_norm": 0.6219033002853394, + "learning_rate": 4.120831468637538e-05, + "loss": 0.4713, + "step": 106750 + }, + { + "epoch": 2.376246438746439, + "grad_norm": 0.5097732543945312, + "learning_rate": 4.117997398541962e-05, + "loss": 0.4277, + "step": 106760 + }, + { + "epoch": 2.376469017094017, + "grad_norm": 0.5334221720695496, + "learning_rate": 4.1151641914874086e-05, + "loss": 0.4739, + "step": 106770 + }, + { + "epoch": 2.3766915954415953, + "grad_norm": 0.5444838404655457, + "learning_rate": 4.1123318476278375e-05, + "loss": 0.4664, + "step": 106780 + }, + { + "epoch": 2.376914173789174, + "grad_norm": 0.6150575280189514, + "learning_rate": 4.109500367117158e-05, + "loss": 0.4434, + "step": 106790 + }, + { + "epoch": 2.377136752136752, + "grad_norm": 0.44876211881637573, + "learning_rate": 4.10666975010924e-05, + "loss": 0.4829, + "step": 106800 + }, + { + "epoch": 2.3773593304843303, + "grad_norm": 0.614982008934021, + "learning_rate": 4.103839996757903e-05, + "loss": 0.4204, + "step": 106810 + }, + { + "epoch": 2.377581908831909, + "grad_norm": 0.5357826948165894, + "learning_rate": 4.1010111072169076e-05, + "loss": 0.4094, + "step": 106820 + }, + { + "epoch": 2.3778044871794872, + "grad_norm": 0.5355835556983948, + "learning_rate": 4.09818308163999e-05, + "loss": 0.4143, + "step": 106830 + }, + { + "epoch": 2.3780270655270654, + "grad_norm": 0.37973734736442566, + "learning_rate": 4.095355920180817e-05, + "loss": 0.4967, + "step": 106840 + }, + { + "epoch": 2.3782496438746437, + "grad_norm": 0.46821489930152893, + "learning_rate": 4.092529622993022e-05, + "loss": 0.5378, + "step": 106850 + }, + { + "epoch": 2.3784722222222223, + "grad_norm": 0.7494333386421204, + "learning_rate": 4.0897041902301905e-05, + "loss": 0.5393, + "step": 106860 + }, + { + "epoch": 2.3786948005698005, + "grad_norm": 0.9595987796783447, + "learning_rate": 4.086879622045858e-05, + "loss": 0.438, + "step": 106870 + }, + { + "epoch": 2.3789173789173788, + "grad_norm": 0.5590757131576538, + "learning_rate": 4.084055918593515e-05, + "loss": 0.424, + "step": 106880 + }, + { + "epoch": 2.3791399572649574, + "grad_norm": 0.7187809944152832, + "learning_rate": 4.0812330800266074e-05, + "loss": 0.5109, + "step": 106890 + }, + { + "epoch": 2.3793625356125356, + "grad_norm": 0.325730562210083, + "learning_rate": 4.0784111064985186e-05, + "loss": 0.3554, + "step": 106900 + }, + { + "epoch": 2.379585113960114, + "grad_norm": 0.5590636134147644, + "learning_rate": 4.075589998162608e-05, + "loss": 0.5206, + "step": 106910 + }, + { + "epoch": 2.3798076923076925, + "grad_norm": 0.49287861585617065, + "learning_rate": 4.07276975517217e-05, + "loss": 0.5341, + "step": 106920 + }, + { + "epoch": 2.3800302706552707, + "grad_norm": 0.5416540503501892, + "learning_rate": 4.0699503776804626e-05, + "loss": 0.535, + "step": 106930 + }, + { + "epoch": 2.380252849002849, + "grad_norm": 0.8340360522270203, + "learning_rate": 4.0671318658406944e-05, + "loss": 0.5092, + "step": 106940 + }, + { + "epoch": 2.380475427350427, + "grad_norm": 0.7989697456359863, + "learning_rate": 4.064314219806027e-05, + "loss": 0.4929, + "step": 106950 + }, + { + "epoch": 2.380698005698006, + "grad_norm": 0.6200940012931824, + "learning_rate": 4.0614974397295666e-05, + "loss": 0.3981, + "step": 106960 + }, + { + "epoch": 2.380920584045584, + "grad_norm": 0.4049985408782959, + "learning_rate": 4.058681525764389e-05, + "loss": 0.5013, + "step": 106970 + }, + { + "epoch": 2.3811431623931623, + "grad_norm": 0.6641939878463745, + "learning_rate": 4.0558664780635014e-05, + "loss": 0.4869, + "step": 106980 + }, + { + "epoch": 2.381365740740741, + "grad_norm": 0.5992461442947388, + "learning_rate": 4.053052296779882e-05, + "loss": 0.4807, + "step": 106990 + }, + { + "epoch": 2.381588319088319, + "grad_norm": 0.5013190507888794, + "learning_rate": 4.0502389820664544e-05, + "loss": 0.3836, + "step": 107000 + }, + { + "epoch": 2.3818108974358974, + "grad_norm": 0.46175822615623474, + "learning_rate": 4.0474265340761e-05, + "loss": 0.4792, + "step": 107010 + }, + { + "epoch": 2.3820334757834756, + "grad_norm": 0.5924819111824036, + "learning_rate": 4.044614952961645e-05, + "loss": 0.3602, + "step": 107020 + }, + { + "epoch": 2.3822560541310542, + "grad_norm": 0.6522736549377441, + "learning_rate": 4.0418042388758815e-05, + "loss": 0.4622, + "step": 107030 + }, + { + "epoch": 2.3824786324786325, + "grad_norm": 0.7011052370071411, + "learning_rate": 4.0389943919715335e-05, + "loss": 0.4267, + "step": 107040 + }, + { + "epoch": 2.3827012108262107, + "grad_norm": 0.6333714723587036, + "learning_rate": 4.036185412401297e-05, + "loss": 0.4716, + "step": 107050 + }, + { + "epoch": 2.3829237891737893, + "grad_norm": 0.469301700592041, + "learning_rate": 4.033377300317813e-05, + "loss": 0.3148, + "step": 107060 + }, + { + "epoch": 2.3831463675213675, + "grad_norm": 0.7111797332763672, + "learning_rate": 4.030570055873679e-05, + "loss": 0.51, + "step": 107070 + }, + { + "epoch": 2.3833689458689458, + "grad_norm": 0.7554060220718384, + "learning_rate": 4.027763679221441e-05, + "loss": 0.5962, + "step": 107080 + }, + { + "epoch": 2.3835915242165244, + "grad_norm": 0.5037725567817688, + "learning_rate": 4.024958170513604e-05, + "loss": 0.4771, + "step": 107090 + }, + { + "epoch": 2.3838141025641026, + "grad_norm": 0.4343712031841278, + "learning_rate": 4.0221535299026195e-05, + "loss": 0.4824, + "step": 107100 + }, + { + "epoch": 2.384036680911681, + "grad_norm": 0.4945704936981201, + "learning_rate": 4.019349757540887e-05, + "loss": 0.4191, + "step": 107110 + }, + { + "epoch": 2.384259259259259, + "grad_norm": 0.5049513578414917, + "learning_rate": 4.016546853580769e-05, + "loss": 0.5334, + "step": 107120 + }, + { + "epoch": 2.3844818376068377, + "grad_norm": 0.5844504833221436, + "learning_rate": 4.0137448181745804e-05, + "loss": 0.5095, + "step": 107130 + }, + { + "epoch": 2.384704415954416, + "grad_norm": 0.7790089845657349, + "learning_rate": 4.010943651474586e-05, + "loss": 0.5077, + "step": 107140 + }, + { + "epoch": 2.384926994301994, + "grad_norm": 0.3328200876712799, + "learning_rate": 4.008143353633003e-05, + "loss": 0.4946, + "step": 107150 + }, + { + "epoch": 2.385149572649573, + "grad_norm": 0.6973555088043213, + "learning_rate": 4.005343924802001e-05, + "loss": 0.4411, + "step": 107160 + }, + { + "epoch": 2.385372150997151, + "grad_norm": 0.5896797180175781, + "learning_rate": 4.0025453651337094e-05, + "loss": 0.3967, + "step": 107170 + }, + { + "epoch": 2.3855947293447293, + "grad_norm": 0.6309689283370972, + "learning_rate": 3.9997476747801945e-05, + "loss": 0.5664, + "step": 107180 + }, + { + "epoch": 2.3858173076923075, + "grad_norm": 0.6303597688674927, + "learning_rate": 3.996950853893488e-05, + "loss": 0.448, + "step": 107190 + }, + { + "epoch": 2.386039886039886, + "grad_norm": 0.5085480809211731, + "learning_rate": 3.994154902625573e-05, + "loss": 0.5309, + "step": 107200 + }, + { + "epoch": 2.3862624643874644, + "grad_norm": 0.5023293495178223, + "learning_rate": 3.991359821128384e-05, + "loss": 0.3437, + "step": 107210 + }, + { + "epoch": 2.3864850427350426, + "grad_norm": 0.4966486394405365, + "learning_rate": 3.9885656095538137e-05, + "loss": 0.4056, + "step": 107220 + }, + { + "epoch": 2.3867076210826212, + "grad_norm": 0.7862597703933716, + "learning_rate": 3.985772268053689e-05, + "loss": 0.4386, + "step": 107230 + }, + { + "epoch": 2.3869301994301995, + "grad_norm": 0.4529586136341095, + "learning_rate": 3.9829797967798156e-05, + "loss": 0.418, + "step": 107240 + }, + { + "epoch": 2.3871527777777777, + "grad_norm": 0.4913751780986786, + "learning_rate": 3.9801881958839274e-05, + "loss": 0.3807, + "step": 107250 + }, + { + "epoch": 2.3873753561253563, + "grad_norm": 0.6710889935493469, + "learning_rate": 3.977397465517725e-05, + "loss": 0.4312, + "step": 107260 + }, + { + "epoch": 2.3875979344729346, + "grad_norm": 0.5019149780273438, + "learning_rate": 3.974607605832863e-05, + "loss": 0.4696, + "step": 107270 + }, + { + "epoch": 2.3878205128205128, + "grad_norm": 0.6020123958587646, + "learning_rate": 3.97181861698094e-05, + "loss": 0.4775, + "step": 107280 + }, + { + "epoch": 2.388043091168091, + "grad_norm": 0.5125402808189392, + "learning_rate": 3.969030499113517e-05, + "loss": 0.461, + "step": 107290 + }, + { + "epoch": 2.3882656695156697, + "grad_norm": 0.562674343585968, + "learning_rate": 3.9662432523821e-05, + "loss": 0.3317, + "step": 107300 + }, + { + "epoch": 2.388488247863248, + "grad_norm": 0.4142382740974426, + "learning_rate": 3.963456876938154e-05, + "loss": 0.5279, + "step": 107310 + }, + { + "epoch": 2.388710826210826, + "grad_norm": 0.5948408246040344, + "learning_rate": 3.9606713729330865e-05, + "loss": 0.4231, + "step": 107320 + }, + { + "epoch": 2.3889334045584047, + "grad_norm": 0.4571043848991394, + "learning_rate": 3.957886740518266e-05, + "loss": 0.56, + "step": 107330 + }, + { + "epoch": 2.389155982905983, + "grad_norm": 0.641856849193573, + "learning_rate": 3.955102979845013e-05, + "loss": 0.4425, + "step": 107340 + }, + { + "epoch": 2.389378561253561, + "grad_norm": 0.6879642009735107, + "learning_rate": 3.9523200910645984e-05, + "loss": 0.4254, + "step": 107350 + }, + { + "epoch": 2.3896011396011394, + "grad_norm": 0.6293129920959473, + "learning_rate": 3.949538074328254e-05, + "loss": 0.4756, + "step": 107360 + }, + { + "epoch": 2.389823717948718, + "grad_norm": 0.43237632513046265, + "learning_rate": 3.946756929787143e-05, + "loss": 0.4297, + "step": 107370 + }, + { + "epoch": 2.3900462962962963, + "grad_norm": 0.6030610203742981, + "learning_rate": 3.9439766575924076e-05, + "loss": 0.4089, + "step": 107380 + }, + { + "epoch": 2.3902688746438745, + "grad_norm": 0.48837748169898987, + "learning_rate": 3.941197257895122e-05, + "loss": 0.5213, + "step": 107390 + }, + { + "epoch": 2.390491452991453, + "grad_norm": 0.8221680521965027, + "learning_rate": 3.938418730846321e-05, + "loss": 0.4774, + "step": 107400 + }, + { + "epoch": 2.3907140313390314, + "grad_norm": 0.4908098876476288, + "learning_rate": 3.9356410765969965e-05, + "loss": 0.4163, + "step": 107410 + }, + { + "epoch": 2.3909366096866096, + "grad_norm": 0.7985237836837769, + "learning_rate": 3.932864295298084e-05, + "loss": 0.5171, + "step": 107420 + }, + { + "epoch": 2.3911591880341883, + "grad_norm": 0.537944495677948, + "learning_rate": 3.9300883871004815e-05, + "loss": 0.3917, + "step": 107430 + }, + { + "epoch": 2.3913817663817665, + "grad_norm": 0.571877121925354, + "learning_rate": 3.927313352155031e-05, + "loss": 0.4366, + "step": 107440 + }, + { + "epoch": 2.3916043447293447, + "grad_norm": 0.6063857078552246, + "learning_rate": 3.924539190612537e-05, + "loss": 0.562, + "step": 107450 + }, + { + "epoch": 2.391826923076923, + "grad_norm": 0.6366663575172424, + "learning_rate": 3.921765902623735e-05, + "loss": 0.5283, + "step": 107460 + }, + { + "epoch": 2.3920495014245016, + "grad_norm": 0.6072849631309509, + "learning_rate": 3.91899348833934e-05, + "loss": 0.4876, + "step": 107470 + }, + { + "epoch": 2.39227207977208, + "grad_norm": 0.7319457530975342, + "learning_rate": 3.916221947909999e-05, + "loss": 0.5053, + "step": 107480 + }, + { + "epoch": 2.392494658119658, + "grad_norm": 0.5945011973381042, + "learning_rate": 3.9134512814863336e-05, + "loss": 0.4319, + "step": 107490 + }, + { + "epoch": 2.392717236467236, + "grad_norm": 0.47057005763053894, + "learning_rate": 3.910681489218888e-05, + "loss": 0.3741, + "step": 107500 + }, + { + "epoch": 2.392939814814815, + "grad_norm": 0.3048020601272583, + "learning_rate": 3.907912571258181e-05, + "loss": 0.4148, + "step": 107510 + }, + { + "epoch": 2.393162393162393, + "grad_norm": 0.3933880925178528, + "learning_rate": 3.9051445277546825e-05, + "loss": 0.48, + "step": 107520 + }, + { + "epoch": 2.3933849715099713, + "grad_norm": 0.6014164090156555, + "learning_rate": 3.902377358858802e-05, + "loss": 0.3565, + "step": 107530 + }, + { + "epoch": 2.39360754985755, + "grad_norm": 0.5383347272872925, + "learning_rate": 3.899611064720916e-05, + "loss": 0.6023, + "step": 107540 + }, + { + "epoch": 2.393830128205128, + "grad_norm": 0.6487884521484375, + "learning_rate": 3.896845645491343e-05, + "loss": 0.5128, + "step": 107550 + }, + { + "epoch": 2.3940527065527064, + "grad_norm": 0.5151840448379517, + "learning_rate": 3.894081101320359e-05, + "loss": 0.6098, + "step": 107560 + }, + { + "epoch": 2.394275284900285, + "grad_norm": 0.5944096446037292, + "learning_rate": 3.891317432358195e-05, + "loss": 0.4749, + "step": 107570 + }, + { + "epoch": 2.3944978632478633, + "grad_norm": 0.5349414944648743, + "learning_rate": 3.888554638755029e-05, + "loss": 0.4657, + "step": 107580 + }, + { + "epoch": 2.3947204415954415, + "grad_norm": 0.6621114611625671, + "learning_rate": 3.885792720660999e-05, + "loss": 0.388, + "step": 107590 + }, + { + "epoch": 2.39494301994302, + "grad_norm": 0.4630861282348633, + "learning_rate": 3.8830316782261765e-05, + "loss": 0.5104, + "step": 107600 + }, + { + "epoch": 2.3951655982905984, + "grad_norm": 0.3979974091053009, + "learning_rate": 3.880271511600608e-05, + "loss": 0.3779, + "step": 107610 + }, + { + "epoch": 2.3953881766381766, + "grad_norm": 0.5173378586769104, + "learning_rate": 3.877512220934287e-05, + "loss": 0.444, + "step": 107620 + }, + { + "epoch": 2.395610754985755, + "grad_norm": 0.4683818817138672, + "learning_rate": 3.874753806377147e-05, + "loss": 0.5606, + "step": 107630 + }, + { + "epoch": 2.3958333333333335, + "grad_norm": 0.4673824906349182, + "learning_rate": 3.871996268079083e-05, + "loss": 0.3646, + "step": 107640 + }, + { + "epoch": 2.3960559116809117, + "grad_norm": 0.5951061248779297, + "learning_rate": 3.869239606189947e-05, + "loss": 0.5753, + "step": 107650 + }, + { + "epoch": 2.39627849002849, + "grad_norm": 1.1411864757537842, + "learning_rate": 3.866483820859541e-05, + "loss": 0.4492, + "step": 107660 + }, + { + "epoch": 2.396501068376068, + "grad_norm": 0.7248709797859192, + "learning_rate": 3.8637289122376054e-05, + "loss": 0.4289, + "step": 107670 + }, + { + "epoch": 2.396723646723647, + "grad_norm": 0.5003977417945862, + "learning_rate": 3.860974880473851e-05, + "loss": 0.3993, + "step": 107680 + }, + { + "epoch": 2.396946225071225, + "grad_norm": 0.7477527260780334, + "learning_rate": 3.858221725717932e-05, + "loss": 0.426, + "step": 107690 + }, + { + "epoch": 2.3971688034188032, + "grad_norm": 0.5571163892745972, + "learning_rate": 3.855469448119462e-05, + "loss": 0.3665, + "step": 107700 + }, + { + "epoch": 2.397391381766382, + "grad_norm": 0.4843011498451233, + "learning_rate": 3.852718047827997e-05, + "loss": 0.4136, + "step": 107710 + }, + { + "epoch": 2.39761396011396, + "grad_norm": 0.8067624568939209, + "learning_rate": 3.84996752499305e-05, + "loss": 0.5301, + "step": 107720 + }, + { + "epoch": 2.3978365384615383, + "grad_norm": 0.5896735191345215, + "learning_rate": 3.847217879764098e-05, + "loss": 0.5473, + "step": 107730 + }, + { + "epoch": 2.398059116809117, + "grad_norm": 0.4599475562572479, + "learning_rate": 3.8444691122905406e-05, + "loss": 0.4437, + "step": 107740 + }, + { + "epoch": 2.398281695156695, + "grad_norm": 0.6119899749755859, + "learning_rate": 3.841721222721766e-05, + "loss": 0.4372, + "step": 107750 + }, + { + "epoch": 2.3985042735042734, + "grad_norm": 0.507407546043396, + "learning_rate": 3.8389742112070805e-05, + "loss": 0.4957, + "step": 107760 + }, + { + "epoch": 2.398726851851852, + "grad_norm": 0.5270352959632874, + "learning_rate": 3.836228077895765e-05, + "loss": 0.4259, + "step": 107770 + }, + { + "epoch": 2.3989494301994303, + "grad_norm": 0.6012681126594543, + "learning_rate": 3.833482822937051e-05, + "loss": 0.5108, + "step": 107780 + }, + { + "epoch": 2.3991720085470085, + "grad_norm": 0.6297083497047424, + "learning_rate": 3.830738446480113e-05, + "loss": 0.4953, + "step": 107790 + }, + { + "epoch": 2.3993945868945867, + "grad_norm": 0.4202558994293213, + "learning_rate": 3.827994948674092e-05, + "loss": 0.3991, + "step": 107800 + }, + { + "epoch": 2.3996171652421654, + "grad_norm": 0.43911489844322205, + "learning_rate": 3.8252523296680564e-05, + "loss": 0.4951, + "step": 107810 + }, + { + "epoch": 2.3998397435897436, + "grad_norm": 0.5340352058410645, + "learning_rate": 3.8225105896110525e-05, + "loss": 0.4703, + "step": 107820 + }, + { + "epoch": 2.400062321937322, + "grad_norm": 0.8210797905921936, + "learning_rate": 3.819769728652065e-05, + "loss": 0.466, + "step": 107830 + }, + { + "epoch": 2.4002849002849, + "grad_norm": 0.7431703805923462, + "learning_rate": 3.817029746940037e-05, + "loss": 0.5111, + "step": 107840 + }, + { + "epoch": 2.4002849002849, + "eval_loss": 0.5277159810066223, + "eval_runtime": 337.619, + "eval_samples_per_second": 7.005, + "eval_steps_per_second": 7.005, + "step": 107840 + }, + { + "epoch": 2.4005074786324787, + "grad_norm": 0.6892507672309875, + "learning_rate": 3.8142906446238614e-05, + "loss": 0.4169, + "step": 107850 + }, + { + "epoch": 2.400730056980057, + "grad_norm": 0.5865117311477661, + "learning_rate": 3.8115524218523865e-05, + "loss": 0.4058, + "step": 107860 + }, + { + "epoch": 2.400952635327635, + "grad_norm": 0.41831985116004944, + "learning_rate": 3.808815078774402e-05, + "loss": 0.4328, + "step": 107870 + }, + { + "epoch": 2.401175213675214, + "grad_norm": 0.36568325757980347, + "learning_rate": 3.8060786155386644e-05, + "loss": 0.5102, + "step": 107880 + }, + { + "epoch": 2.401397792022792, + "grad_norm": 0.7594850063323975, + "learning_rate": 3.8033430322938666e-05, + "loss": 0.4744, + "step": 107890 + }, + { + "epoch": 2.4016203703703702, + "grad_norm": 0.5580376386642456, + "learning_rate": 3.800608329188668e-05, + "loss": 0.578, + "step": 107900 + }, + { + "epoch": 2.401842948717949, + "grad_norm": 0.7795330882072449, + "learning_rate": 3.7978745063716745e-05, + "loss": 0.3751, + "step": 107910 + }, + { + "epoch": 2.402065527065527, + "grad_norm": 0.6691529750823975, + "learning_rate": 3.7951415639914443e-05, + "loss": 0.5008, + "step": 107920 + }, + { + "epoch": 2.4022881054131053, + "grad_norm": 0.6297852993011475, + "learning_rate": 3.792409502196488e-05, + "loss": 0.4381, + "step": 107930 + }, + { + "epoch": 2.402510683760684, + "grad_norm": 0.46467840671539307, + "learning_rate": 3.7896783211352704e-05, + "loss": 0.5051, + "step": 107940 + }, + { + "epoch": 2.402733262108262, + "grad_norm": 0.4547869563102722, + "learning_rate": 3.7869480209562e-05, + "loss": 0.4391, + "step": 107950 + }, + { + "epoch": 2.4029558404558404, + "grad_norm": 0.7416006326675415, + "learning_rate": 3.784218601807645e-05, + "loss": 0.5118, + "step": 107960 + }, + { + "epoch": 2.4031784188034186, + "grad_norm": 0.4923880696296692, + "learning_rate": 3.781490063837927e-05, + "loss": 0.4884, + "step": 107970 + }, + { + "epoch": 2.4034009971509973, + "grad_norm": 0.38714689016342163, + "learning_rate": 3.7787624071953175e-05, + "loss": 0.3763, + "step": 107980 + }, + { + "epoch": 2.4036235754985755, + "grad_norm": 0.5982674956321716, + "learning_rate": 3.776035632028037e-05, + "loss": 0.3924, + "step": 107990 + }, + { + "epoch": 2.4038461538461537, + "grad_norm": 0.4063017964363098, + "learning_rate": 3.7733097384842655e-05, + "loss": 0.3274, + "step": 108000 + }, + { + "epoch": 2.404068732193732, + "grad_norm": 0.34237897396087646, + "learning_rate": 3.770584726712123e-05, + "loss": 0.3434, + "step": 108010 + }, + { + "epoch": 2.4042913105413106, + "grad_norm": 1.1153937578201294, + "learning_rate": 3.767860596859696e-05, + "loss": 0.4604, + "step": 108020 + }, + { + "epoch": 2.404513888888889, + "grad_norm": 0.7363869547843933, + "learning_rate": 3.7651373490750096e-05, + "loss": 0.4872, + "step": 108030 + }, + { + "epoch": 2.404736467236467, + "grad_norm": 0.404489666223526, + "learning_rate": 3.762414983506049e-05, + "loss": 0.4847, + "step": 108040 + }, + { + "epoch": 2.4049590455840457, + "grad_norm": 0.38278475403785706, + "learning_rate": 3.75969350030075e-05, + "loss": 0.4051, + "step": 108050 + }, + { + "epoch": 2.405181623931624, + "grad_norm": 0.5841982364654541, + "learning_rate": 3.756972899607003e-05, + "loss": 0.4173, + "step": 108060 + }, + { + "epoch": 2.405404202279202, + "grad_norm": 0.40750545263290405, + "learning_rate": 3.754253181572645e-05, + "loss": 0.4454, + "step": 108070 + }, + { + "epoch": 2.405626780626781, + "grad_norm": 0.6064510345458984, + "learning_rate": 3.7515343463454735e-05, + "loss": 0.5399, + "step": 108080 + }, + { + "epoch": 2.405849358974359, + "grad_norm": 0.3883700966835022, + "learning_rate": 3.748816394073222e-05, + "loss": 0.4375, + "step": 108090 + }, + { + "epoch": 2.4060719373219372, + "grad_norm": 0.36801886558532715, + "learning_rate": 3.746099324903591e-05, + "loss": 0.3955, + "step": 108100 + }, + { + "epoch": 2.406294515669516, + "grad_norm": 0.743415355682373, + "learning_rate": 3.743383138984229e-05, + "loss": 0.5687, + "step": 108110 + }, + { + "epoch": 2.406517094017094, + "grad_norm": 0.5999566316604614, + "learning_rate": 3.740667836462737e-05, + "loss": 0.5204, + "step": 108120 + }, + { + "epoch": 2.4067396723646723, + "grad_norm": 0.5922669172286987, + "learning_rate": 3.7379534174866635e-05, + "loss": 0.3788, + "step": 108130 + }, + { + "epoch": 2.4069622507122506, + "grad_norm": 0.6521261930465698, + "learning_rate": 3.735239882203518e-05, + "loss": 0.415, + "step": 108140 + }, + { + "epoch": 2.4071848290598292, + "grad_norm": 0.6018521189689636, + "learning_rate": 3.732527230760749e-05, + "loss": 0.3698, + "step": 108150 + }, + { + "epoch": 2.4074074074074074, + "grad_norm": 0.7880030274391174, + "learning_rate": 3.729815463305772e-05, + "loss": 0.4128, + "step": 108160 + }, + { + "epoch": 2.4076299857549857, + "grad_norm": 0.6043182015419006, + "learning_rate": 3.7271045799859384e-05, + "loss": 0.4286, + "step": 108170 + }, + { + "epoch": 2.407852564102564, + "grad_norm": 0.6066815853118896, + "learning_rate": 3.7243945809485624e-05, + "loss": 0.3493, + "step": 108180 + }, + { + "epoch": 2.4080751424501425, + "grad_norm": 0.6289936304092407, + "learning_rate": 3.721685466340909e-05, + "loss": 0.542, + "step": 108190 + }, + { + "epoch": 2.4082977207977208, + "grad_norm": 0.6707183122634888, + "learning_rate": 3.718977236310195e-05, + "loss": 0.4285, + "step": 108200 + }, + { + "epoch": 2.408520299145299, + "grad_norm": 0.5168840289115906, + "learning_rate": 3.716269891003583e-05, + "loss": 0.4894, + "step": 108210 + }, + { + "epoch": 2.4087428774928776, + "grad_norm": 0.522630512714386, + "learning_rate": 3.713563430568203e-05, + "loss": 0.395, + "step": 108220 + }, + { + "epoch": 2.408965455840456, + "grad_norm": 0.5039717555046082, + "learning_rate": 3.710857855151113e-05, + "loss": 0.5266, + "step": 108230 + }, + { + "epoch": 2.409188034188034, + "grad_norm": 0.5010952353477478, + "learning_rate": 3.708153164899342e-05, + "loss": 0.473, + "step": 108240 + }, + { + "epoch": 2.4094106125356127, + "grad_norm": 0.7485960125923157, + "learning_rate": 3.705449359959865e-05, + "loss": 0.5431, + "step": 108250 + }, + { + "epoch": 2.409633190883191, + "grad_norm": 0.6157149076461792, + "learning_rate": 3.7027464404796096e-05, + "loss": 0.4947, + "step": 108260 + }, + { + "epoch": 2.409855769230769, + "grad_norm": 0.7727802991867065, + "learning_rate": 3.700044406605458e-05, + "loss": 0.4065, + "step": 108270 + }, + { + "epoch": 2.410078347578348, + "grad_norm": 0.48511838912963867, + "learning_rate": 3.6973432584842337e-05, + "loss": 0.3984, + "step": 108280 + }, + { + "epoch": 2.410300925925926, + "grad_norm": 0.7944969534873962, + "learning_rate": 3.6946429962627224e-05, + "loss": 0.5269, + "step": 108290 + }, + { + "epoch": 2.4105235042735043, + "grad_norm": 0.6611111760139465, + "learning_rate": 3.691943620087663e-05, + "loss": 0.4587, + "step": 108300 + }, + { + "epoch": 2.4107460826210825, + "grad_norm": 0.5456627607345581, + "learning_rate": 3.689245130105734e-05, + "loss": 0.3479, + "step": 108310 + }, + { + "epoch": 2.410968660968661, + "grad_norm": 0.5392328500747681, + "learning_rate": 3.686547526463575e-05, + "loss": 0.4299, + "step": 108320 + }, + { + "epoch": 2.4111912393162394, + "grad_norm": 0.3954230546951294, + "learning_rate": 3.6838508093077806e-05, + "loss": 0.4817, + "step": 108330 + }, + { + "epoch": 2.4114138176638176, + "grad_norm": 0.5684217810630798, + "learning_rate": 3.6811549787848884e-05, + "loss": 0.3802, + "step": 108340 + }, + { + "epoch": 2.411636396011396, + "grad_norm": 0.6959905624389648, + "learning_rate": 3.678460035041395e-05, + "loss": 0.5219, + "step": 108350 + }, + { + "epoch": 2.4118589743589745, + "grad_norm": 0.5525913238525391, + "learning_rate": 3.6757659782237505e-05, + "loss": 0.4986, + "step": 108360 + }, + { + "epoch": 2.4120815527065527, + "grad_norm": 0.643782913684845, + "learning_rate": 3.67307280847834e-05, + "loss": 0.419, + "step": 108370 + }, + { + "epoch": 2.412304131054131, + "grad_norm": 0.4591728448867798, + "learning_rate": 3.67038052595152e-05, + "loss": 0.3671, + "step": 108380 + }, + { + "epoch": 2.4125267094017095, + "grad_norm": 0.45649808645248413, + "learning_rate": 3.667689130789589e-05, + "loss": 0.496, + "step": 108390 + }, + { + "epoch": 2.4127492877492878, + "grad_norm": 0.49152353405952454, + "learning_rate": 3.664998623138807e-05, + "loss": 0.4469, + "step": 108400 + }, + { + "epoch": 2.412971866096866, + "grad_norm": 0.6421986222267151, + "learning_rate": 3.662309003145366e-05, + "loss": 0.5647, + "step": 108410 + }, + { + "epoch": 2.4131944444444446, + "grad_norm": 0.5374763607978821, + "learning_rate": 3.659620270955428e-05, + "loss": 0.3204, + "step": 108420 + }, + { + "epoch": 2.413417022792023, + "grad_norm": 0.5448712110519409, + "learning_rate": 3.656932426715103e-05, + "loss": 0.5383, + "step": 108430 + }, + { + "epoch": 2.413639601139601, + "grad_norm": 0.5975387692451477, + "learning_rate": 3.654245470570454e-05, + "loss": 0.3897, + "step": 108440 + }, + { + "epoch": 2.4138621794871793, + "grad_norm": 0.3369348347187042, + "learning_rate": 3.651559402667481e-05, + "loss": 0.5226, + "step": 108450 + }, + { + "epoch": 2.414084757834758, + "grad_norm": 0.6690844297409058, + "learning_rate": 3.6488742231521545e-05, + "loss": 0.5524, + "step": 108460 + }, + { + "epoch": 2.414307336182336, + "grad_norm": 0.6922747492790222, + "learning_rate": 3.6461899321703894e-05, + "loss": 0.3865, + "step": 108470 + }, + { + "epoch": 2.4145299145299144, + "grad_norm": 0.4084574580192566, + "learning_rate": 3.6435065298680504e-05, + "loss": 0.4244, + "step": 108480 + }, + { + "epoch": 2.414752492877493, + "grad_norm": 0.37243160605430603, + "learning_rate": 3.640824016390956e-05, + "loss": 0.4692, + "step": 108490 + }, + { + "epoch": 2.4149750712250713, + "grad_norm": 0.781517505645752, + "learning_rate": 3.6381423918848825e-05, + "loss": 0.5295, + "step": 108500 + }, + { + "epoch": 2.4151976495726495, + "grad_norm": 0.5566474795341492, + "learning_rate": 3.63546165649554e-05, + "loss": 0.475, + "step": 108510 + }, + { + "epoch": 2.4154202279202277, + "grad_norm": 0.45444151759147644, + "learning_rate": 3.6327818103686086e-05, + "loss": 0.4576, + "step": 108520 + }, + { + "epoch": 2.4156428062678064, + "grad_norm": 0.5889346599578857, + "learning_rate": 3.63010285364971e-05, + "loss": 0.3677, + "step": 108530 + }, + { + "epoch": 2.4158653846153846, + "grad_norm": 0.4560697078704834, + "learning_rate": 3.627424786484432e-05, + "loss": 0.3781, + "step": 108540 + }, + { + "epoch": 2.416087962962963, + "grad_norm": 0.7297571897506714, + "learning_rate": 3.624747609018289e-05, + "loss": 0.4507, + "step": 108550 + }, + { + "epoch": 2.4163105413105415, + "grad_norm": 0.6629948616027832, + "learning_rate": 3.622071321396763e-05, + "loss": 0.4697, + "step": 108560 + }, + { + "epoch": 2.4165331196581197, + "grad_norm": 0.45788052678108215, + "learning_rate": 3.619395923765292e-05, + "loss": 0.3583, + "step": 108570 + }, + { + "epoch": 2.416755698005698, + "grad_norm": 0.3835255801677704, + "learning_rate": 3.61672141626926e-05, + "loss": 0.473, + "step": 108580 + }, + { + "epoch": 2.4169782763532766, + "grad_norm": 0.5667798519134521, + "learning_rate": 3.614047799053995e-05, + "loss": 0.3157, + "step": 108590 + }, + { + "epoch": 2.4172008547008548, + "grad_norm": 0.37759262323379517, + "learning_rate": 3.611375072264784e-05, + "loss": 0.3444, + "step": 108600 + }, + { + "epoch": 2.417423433048433, + "grad_norm": 0.5107159614562988, + "learning_rate": 3.6087032360468684e-05, + "loss": 0.4706, + "step": 108610 + }, + { + "epoch": 2.417646011396011, + "grad_norm": 0.5266587734222412, + "learning_rate": 3.606032290545438e-05, + "loss": 0.4191, + "step": 108620 + }, + { + "epoch": 2.41786858974359, + "grad_norm": 0.6173784732818604, + "learning_rate": 3.603362235905634e-05, + "loss": 0.4651, + "step": 108630 + }, + { + "epoch": 2.418091168091168, + "grad_norm": 0.6883417367935181, + "learning_rate": 3.600693072272554e-05, + "loss": 0.3607, + "step": 108640 + }, + { + "epoch": 2.4183137464387463, + "grad_norm": 0.5021166205406189, + "learning_rate": 3.598024799791233e-05, + "loss": 0.4198, + "step": 108650 + }, + { + "epoch": 2.418536324786325, + "grad_norm": 0.5681913495063782, + "learning_rate": 3.595357418606671e-05, + "loss": 0.473, + "step": 108660 + }, + { + "epoch": 2.418758903133903, + "grad_norm": 0.53594571352005, + "learning_rate": 3.592690928863822e-05, + "loss": 0.4205, + "step": 108670 + }, + { + "epoch": 2.4189814814814814, + "grad_norm": 0.8201348185539246, + "learning_rate": 3.590025330707574e-05, + "loss": 0.3999, + "step": 108680 + }, + { + "epoch": 2.4192040598290596, + "grad_norm": 0.5095071196556091, + "learning_rate": 3.587360624282783e-05, + "loss": 0.4599, + "step": 108690 + }, + { + "epoch": 2.4194266381766383, + "grad_norm": 0.745123565196991, + "learning_rate": 3.5846968097342534e-05, + "loss": 0.5154, + "step": 108700 + }, + { + "epoch": 2.4196492165242165, + "grad_norm": 0.4678294062614441, + "learning_rate": 3.5820338872067417e-05, + "loss": 0.3351, + "step": 108710 + }, + { + "epoch": 2.4198717948717947, + "grad_norm": 0.7734829783439636, + "learning_rate": 3.579371856844942e-05, + "loss": 0.4579, + "step": 108720 + }, + { + "epoch": 2.4200943732193734, + "grad_norm": 0.5143256783485413, + "learning_rate": 3.576710718793519e-05, + "loss": 0.4507, + "step": 108730 + }, + { + "epoch": 2.4203169515669516, + "grad_norm": 0.39224982261657715, + "learning_rate": 3.574050473197081e-05, + "loss": 0.3424, + "step": 108740 + }, + { + "epoch": 2.42053952991453, + "grad_norm": 0.6342445611953735, + "learning_rate": 3.571391120200187e-05, + "loss": 0.4639, + "step": 108750 + }, + { + "epoch": 2.4207621082621085, + "grad_norm": 0.5755688548088074, + "learning_rate": 3.568732659947349e-05, + "loss": 0.3544, + "step": 108760 + }, + { + "epoch": 2.4209846866096867, + "grad_norm": 0.5118335485458374, + "learning_rate": 3.56607509258303e-05, + "loss": 0.4368, + "step": 108770 + }, + { + "epoch": 2.421207264957265, + "grad_norm": 0.6136965155601501, + "learning_rate": 3.563418418251647e-05, + "loss": 0.5165, + "step": 108780 + }, + { + "epoch": 2.421429843304843, + "grad_norm": 0.6362696290016174, + "learning_rate": 3.560762637097559e-05, + "loss": 0.4246, + "step": 108790 + }, + { + "epoch": 2.421652421652422, + "grad_norm": 0.6194811463356018, + "learning_rate": 3.558107749265092e-05, + "loss": 0.4304, + "step": 108800 + }, + { + "epoch": 2.421875, + "grad_norm": 0.6822486519813538, + "learning_rate": 3.555453754898506e-05, + "loss": 0.5694, + "step": 108810 + }, + { + "epoch": 2.422097578347578, + "grad_norm": 0.6611093878746033, + "learning_rate": 3.5528006541420233e-05, + "loss": 0.4672, + "step": 108820 + }, + { + "epoch": 2.422320156695157, + "grad_norm": 0.531535804271698, + "learning_rate": 3.5501484471398164e-05, + "loss": 0.5314, + "step": 108830 + }, + { + "epoch": 2.422542735042735, + "grad_norm": 0.7303971648216248, + "learning_rate": 3.547497134036011e-05, + "loss": 0.473, + "step": 108840 + }, + { + "epoch": 2.4227653133903133, + "grad_norm": 0.6678654551506042, + "learning_rate": 3.5448467149746854e-05, + "loss": 0.4744, + "step": 108850 + }, + { + "epoch": 2.4229878917378915, + "grad_norm": 0.6118271350860596, + "learning_rate": 3.542197190099854e-05, + "loss": 0.4503, + "step": 108860 + }, + { + "epoch": 2.42321047008547, + "grad_norm": 0.6433374881744385, + "learning_rate": 3.5395485595555014e-05, + "loss": 0.5033, + "step": 108870 + }, + { + "epoch": 2.4234330484330484, + "grad_norm": 0.802843451499939, + "learning_rate": 3.536900823485554e-05, + "loss": 0.5305, + "step": 108880 + }, + { + "epoch": 2.4236556267806266, + "grad_norm": 0.6826706528663635, + "learning_rate": 3.534253982033895e-05, + "loss": 0.5862, + "step": 108890 + }, + { + "epoch": 2.4238782051282053, + "grad_norm": 0.6310650706291199, + "learning_rate": 3.5316080353443516e-05, + "loss": 0.6473, + "step": 108900 + }, + { + "epoch": 2.4241007834757835, + "grad_norm": 0.7067840099334717, + "learning_rate": 3.528962983560711e-05, + "loss": 0.5158, + "step": 108910 + }, + { + "epoch": 2.4243233618233617, + "grad_norm": 0.642220139503479, + "learning_rate": 3.526318826826711e-05, + "loss": 0.5143, + "step": 108920 + }, + { + "epoch": 2.4245459401709404, + "grad_norm": 0.5776522755622864, + "learning_rate": 3.523675565286031e-05, + "loss": 0.4242, + "step": 108930 + }, + { + "epoch": 2.4247685185185186, + "grad_norm": 0.6654059886932373, + "learning_rate": 3.521033199082304e-05, + "loss": 0.5211, + "step": 108940 + }, + { + "epoch": 2.424991096866097, + "grad_norm": 0.7176336050033569, + "learning_rate": 3.5183917283591225e-05, + "loss": 0.4434, + "step": 108950 + }, + { + "epoch": 2.425213675213675, + "grad_norm": 0.5302131772041321, + "learning_rate": 3.515751153260027e-05, + "loss": 0.3175, + "step": 108960 + }, + { + "epoch": 2.4254362535612537, + "grad_norm": 0.6184340119361877, + "learning_rate": 3.5131114739285096e-05, + "loss": 0.442, + "step": 108970 + }, + { + "epoch": 2.425658831908832, + "grad_norm": 0.6138894557952881, + "learning_rate": 3.510472690508011e-05, + "loss": 0.5474, + "step": 108980 + }, + { + "epoch": 2.42588141025641, + "grad_norm": 0.6392364501953125, + "learning_rate": 3.5078348031419316e-05, + "loss": 0.421, + "step": 108990 + }, + { + "epoch": 2.426103988603989, + "grad_norm": 0.5919948816299438, + "learning_rate": 3.505197811973604e-05, + "loss": 0.3428, + "step": 109000 + }, + { + "epoch": 2.426326566951567, + "grad_norm": 0.6280859708786011, + "learning_rate": 3.502561717146331e-05, + "loss": 0.4399, + "step": 109010 + }, + { + "epoch": 2.4265491452991452, + "grad_norm": 0.610748291015625, + "learning_rate": 3.499926518803358e-05, + "loss": 0.4614, + "step": 109020 + }, + { + "epoch": 2.4267717236467234, + "grad_norm": 0.48234373331069946, + "learning_rate": 3.497292217087889e-05, + "loss": 0.4259, + "step": 109030 + }, + { + "epoch": 2.426994301994302, + "grad_norm": 0.7132697105407715, + "learning_rate": 3.494658812143068e-05, + "loss": 0.4475, + "step": 109040 + }, + { + "epoch": 2.4272168803418803, + "grad_norm": 0.5572307109832764, + "learning_rate": 3.492026304111999e-05, + "loss": 0.5108, + "step": 109050 + }, + { + "epoch": 2.4274394586894585, + "grad_norm": 0.7285090684890747, + "learning_rate": 3.48939469313774e-05, + "loss": 0.5296, + "step": 109060 + }, + { + "epoch": 2.427662037037037, + "grad_norm": 0.5368955731391907, + "learning_rate": 3.48676397936329e-05, + "loss": 0.4226, + "step": 109070 + }, + { + "epoch": 2.4278846153846154, + "grad_norm": 0.5352765321731567, + "learning_rate": 3.484134162931598e-05, + "loss": 0.3651, + "step": 109080 + }, + { + "epoch": 2.4281071937321936, + "grad_norm": 0.675270676612854, + "learning_rate": 3.4815052439855766e-05, + "loss": 0.4504, + "step": 109090 + }, + { + "epoch": 2.4283297720797723, + "grad_norm": 0.7123696208000183, + "learning_rate": 3.478877222668084e-05, + "loss": 0.5784, + "step": 109100 + }, + { + "epoch": 2.4285523504273505, + "grad_norm": 0.4823485314846039, + "learning_rate": 3.476250099121927e-05, + "loss": 0.3552, + "step": 109110 + }, + { + "epoch": 2.4287749287749287, + "grad_norm": 0.6309967637062073, + "learning_rate": 3.4736238734898665e-05, + "loss": 0.4613, + "step": 109120 + }, + { + "epoch": 2.428997507122507, + "grad_norm": 0.6412575840950012, + "learning_rate": 3.4709985459146186e-05, + "loss": 0.3747, + "step": 109130 + }, + { + "epoch": 2.4292200854700856, + "grad_norm": 0.6632343530654907, + "learning_rate": 3.4683741165388374e-05, + "loss": 0.4115, + "step": 109140 + }, + { + "epoch": 2.429442663817664, + "grad_norm": 0.5699517130851746, + "learning_rate": 3.4657505855051386e-05, + "loss": 0.4816, + "step": 109150 + }, + { + "epoch": 2.429665242165242, + "grad_norm": 0.7767205238342285, + "learning_rate": 3.463127952956089e-05, + "loss": 0.432, + "step": 109160 + }, + { + "epoch": 2.4298878205128207, + "grad_norm": 0.7094178795814514, + "learning_rate": 3.460506219034203e-05, + "loss": 0.4014, + "step": 109170 + }, + { + "epoch": 2.430110398860399, + "grad_norm": 0.5864173769950867, + "learning_rate": 3.457885383881949e-05, + "loss": 0.4635, + "step": 109180 + }, + { + "epoch": 2.430332977207977, + "grad_norm": 0.5907089114189148, + "learning_rate": 3.4552654476417536e-05, + "loss": 0.5184, + "step": 109190 + }, + { + "epoch": 2.4305555555555554, + "grad_norm": 0.677842915058136, + "learning_rate": 3.452646410455969e-05, + "loss": 0.4403, + "step": 109200 + }, + { + "epoch": 2.430778133903134, + "grad_norm": 0.3627990782260895, + "learning_rate": 3.450028272466932e-05, + "loss": 0.6292, + "step": 109210 + }, + { + "epoch": 2.4310007122507122, + "grad_norm": 0.6533386707305908, + "learning_rate": 3.447411033816901e-05, + "loss": 0.459, + "step": 109220 + }, + { + "epoch": 2.4312232905982905, + "grad_norm": 0.5333442687988281, + "learning_rate": 3.444794694648106e-05, + "loss": 0.4785, + "step": 109230 + }, + { + "epoch": 2.431445868945869, + "grad_norm": 1.0702968835830688, + "learning_rate": 3.44217925510272e-05, + "loss": 0.4997, + "step": 109240 + }, + { + "epoch": 2.4316684472934473, + "grad_norm": 0.6150128841400146, + "learning_rate": 3.439564715322867e-05, + "loss": 0.5079, + "step": 109250 + }, + { + "epoch": 2.4318910256410255, + "grad_norm": 0.8526206612586975, + "learning_rate": 3.436951075450625e-05, + "loss": 0.5689, + "step": 109260 + }, + { + "epoch": 2.432113603988604, + "grad_norm": 0.2673870921134949, + "learning_rate": 3.4343383356280246e-05, + "loss": 0.4312, + "step": 109270 + }, + { + "epoch": 2.4323361823361824, + "grad_norm": 0.4728868305683136, + "learning_rate": 3.431726495997036e-05, + "loss": 0.4545, + "step": 109280 + }, + { + "epoch": 2.4325587606837606, + "grad_norm": 0.5523426532745361, + "learning_rate": 3.429115556699594e-05, + "loss": 0.5015, + "step": 109290 + }, + { + "epoch": 2.432781339031339, + "grad_norm": 0.4013436734676361, + "learning_rate": 3.4265055178775785e-05, + "loss": 0.4636, + "step": 109300 + }, + { + "epoch": 2.4330039173789175, + "grad_norm": 0.41342243552207947, + "learning_rate": 3.42389637967282e-05, + "loss": 0.4356, + "step": 109310 + }, + { + "epoch": 2.4332264957264957, + "grad_norm": 0.5798662900924683, + "learning_rate": 3.421288142227106e-05, + "loss": 0.4689, + "step": 109320 + }, + { + "epoch": 2.433449074074074, + "grad_norm": 0.6059615015983582, + "learning_rate": 3.418680805682162e-05, + "loss": 0.4677, + "step": 109330 + }, + { + "epoch": 2.433671652421652, + "grad_norm": 0.2866314649581909, + "learning_rate": 3.416074370179678e-05, + "loss": 0.4174, + "step": 109340 + }, + { + "epoch": 2.433894230769231, + "grad_norm": 0.3867923319339752, + "learning_rate": 3.413468835861293e-05, + "loss": 0.3449, + "step": 109350 + }, + { + "epoch": 2.434116809116809, + "grad_norm": 0.5961967706680298, + "learning_rate": 3.4108642028685864e-05, + "loss": 0.4518, + "step": 109360 + }, + { + "epoch": 2.4343393874643873, + "grad_norm": 0.6358778476715088, + "learning_rate": 3.4082604713430985e-05, + "loss": 0.4875, + "step": 109370 + }, + { + "epoch": 2.434561965811966, + "grad_norm": 0.7603932619094849, + "learning_rate": 3.4056576414263184e-05, + "loss": 0.4939, + "step": 109380 + }, + { + "epoch": 2.434784544159544, + "grad_norm": 0.9518799781799316, + "learning_rate": 3.4030557132596884e-05, + "loss": 0.5118, + "step": 109390 + }, + { + "epoch": 2.4350071225071224, + "grad_norm": 0.8648528456687927, + "learning_rate": 3.400454686984595e-05, + "loss": 0.4375, + "step": 109400 + }, + { + "epoch": 2.435229700854701, + "grad_norm": 0.46407267451286316, + "learning_rate": 3.397854562742391e-05, + "loss": 0.4906, + "step": 109410 + }, + { + "epoch": 2.4354522792022792, + "grad_norm": 0.6812346577644348, + "learning_rate": 3.395255340674355e-05, + "loss": 0.3656, + "step": 109420 + }, + { + "epoch": 2.4356748575498575, + "grad_norm": 0.6520447731018066, + "learning_rate": 3.392657020921737e-05, + "loss": 0.5176, + "step": 109430 + }, + { + "epoch": 2.435897435897436, + "grad_norm": 0.6260280013084412, + "learning_rate": 3.390059603625733e-05, + "loss": 0.4418, + "step": 109440 + }, + { + "epoch": 2.4361200142450143, + "grad_norm": 0.4960933327674866, + "learning_rate": 3.387463088927492e-05, + "loss": 0.4674, + "step": 109450 + }, + { + "epoch": 2.4363425925925926, + "grad_norm": 0.5193760395050049, + "learning_rate": 3.384867476968101e-05, + "loss": 0.4425, + "step": 109460 + }, + { + "epoch": 2.4365651709401708, + "grad_norm": 0.5269632935523987, + "learning_rate": 3.3822727678886124e-05, + "loss": 0.4172, + "step": 109470 + }, + { + "epoch": 2.4367877492877494, + "grad_norm": 0.847545325756073, + "learning_rate": 3.379678961830026e-05, + "loss": 0.5111, + "step": 109480 + }, + { + "epoch": 2.4370103276353277, + "grad_norm": 0.7173821926116943, + "learning_rate": 3.377086058933297e-05, + "loss": 0.4615, + "step": 109490 + }, + { + "epoch": 2.437232905982906, + "grad_norm": 0.4638817608356476, + "learning_rate": 3.3744940593393125e-05, + "loss": 0.4021, + "step": 109500 + }, + { + "epoch": 2.437455484330484, + "grad_norm": 0.48426464200019836, + "learning_rate": 3.371902963188933e-05, + "loss": 0.4959, + "step": 109510 + }, + { + "epoch": 2.4376780626780628, + "grad_norm": 0.4158429205417633, + "learning_rate": 3.369312770622959e-05, + "loss": 0.5318, + "step": 109520 + }, + { + "epoch": 2.437900641025641, + "grad_norm": 0.6626630425453186, + "learning_rate": 3.366723481782141e-05, + "loss": 0.4656, + "step": 109530 + }, + { + "epoch": 2.438123219373219, + "grad_norm": 0.39689040184020996, + "learning_rate": 3.3641350968071885e-05, + "loss": 0.5204, + "step": 109540 + }, + { + "epoch": 2.438345797720798, + "grad_norm": 0.7641111016273499, + "learning_rate": 3.361547615838758e-05, + "loss": 0.5227, + "step": 109550 + }, + { + "epoch": 2.438568376068376, + "grad_norm": 0.48284536600112915, + "learning_rate": 3.358961039017445e-05, + "loss": 0.4339, + "step": 109560 + }, + { + "epoch": 2.4387909544159543, + "grad_norm": 0.7062486410140991, + "learning_rate": 3.356375366483813e-05, + "loss": 0.5128, + "step": 109570 + }, + { + "epoch": 2.439013532763533, + "grad_norm": 0.5849929451942444, + "learning_rate": 3.353790598378368e-05, + "loss": 0.4973, + "step": 109580 + }, + { + "epoch": 2.439236111111111, + "grad_norm": 0.5276649594306946, + "learning_rate": 3.3512067348415744e-05, + "loss": 0.3899, + "step": 109590 + }, + { + "epoch": 2.4394586894586894, + "grad_norm": 0.4740630090236664, + "learning_rate": 3.34862377601383e-05, + "loss": 0.3603, + "step": 109600 + }, + { + "epoch": 2.439681267806268, + "grad_norm": 0.6667687892913818, + "learning_rate": 3.346041722035502e-05, + "loss": 0.4622, + "step": 109610 + }, + { + "epoch": 2.4399038461538463, + "grad_norm": 0.6276548504829407, + "learning_rate": 3.343460573046902e-05, + "loss": 0.4644, + "step": 109620 + }, + { + "epoch": 2.4401264245014245, + "grad_norm": 0.5454540848731995, + "learning_rate": 3.340880329188294e-05, + "loss": 0.5998, + "step": 109630 + }, + { + "epoch": 2.4403490028490027, + "grad_norm": 0.6678075790405273, + "learning_rate": 3.338300990599881e-05, + "loss": 0.4705, + "step": 109640 + }, + { + "epoch": 2.4405715811965814, + "grad_norm": 0.7437434792518616, + "learning_rate": 3.335722557421832e-05, + "loss": 0.5217, + "step": 109650 + }, + { + "epoch": 2.4407941595441596, + "grad_norm": 0.7812333703041077, + "learning_rate": 3.333145029794262e-05, + "loss": 0.4913, + "step": 109660 + }, + { + "epoch": 2.441016737891738, + "grad_norm": 0.9800385236740112, + "learning_rate": 3.330568407857235e-05, + "loss": 0.4852, + "step": 109670 + }, + { + "epoch": 2.441239316239316, + "grad_norm": 0.47481921315193176, + "learning_rate": 3.327992691750768e-05, + "loss": 0.4849, + "step": 109680 + }, + { + "epoch": 2.4414618945868947, + "grad_norm": 0.6780035495758057, + "learning_rate": 3.3254178816148294e-05, + "loss": 0.5307, + "step": 109690 + }, + { + "epoch": 2.441684472934473, + "grad_norm": 0.5398222804069519, + "learning_rate": 3.3228439775893295e-05, + "loss": 0.3707, + "step": 109700 + }, + { + "epoch": 2.441907051282051, + "grad_norm": 0.38698190450668335, + "learning_rate": 3.320270979814142e-05, + "loss": 0.3428, + "step": 109710 + }, + { + "epoch": 2.4421296296296298, + "grad_norm": 0.5703821778297424, + "learning_rate": 3.317698888429086e-05, + "loss": 0.6002, + "step": 109720 + }, + { + "epoch": 2.442352207977208, + "grad_norm": 0.4589943587779999, + "learning_rate": 3.315127703573926e-05, + "loss": 0.4608, + "step": 109730 + }, + { + "epoch": 2.442574786324786, + "grad_norm": 0.5299825668334961, + "learning_rate": 3.312557425388385e-05, + "loss": 0.5301, + "step": 109740 + }, + { + "epoch": 2.442797364672365, + "grad_norm": 0.5066646933555603, + "learning_rate": 3.309988054012134e-05, + "loss": 0.4337, + "step": 109750 + }, + { + "epoch": 2.443019943019943, + "grad_norm": 0.4790757894515991, + "learning_rate": 3.307419589584797e-05, + "loss": 0.413, + "step": 109760 + }, + { + "epoch": 2.4432425213675213, + "grad_norm": 0.4142204225063324, + "learning_rate": 3.304852032245949e-05, + "loss": 0.4809, + "step": 109770 + }, + { + "epoch": 2.4434650997151, + "grad_norm": 0.6871556639671326, + "learning_rate": 3.302285382135104e-05, + "loss": 0.5838, + "step": 109780 + }, + { + "epoch": 2.443687678062678, + "grad_norm": 0.4145238697528839, + "learning_rate": 3.299719639391739e-05, + "loss": 0.4666, + "step": 109790 + }, + { + "epoch": 2.4439102564102564, + "grad_norm": 0.4546247124671936, + "learning_rate": 3.2971548041552826e-05, + "loss": 0.4192, + "step": 109800 + }, + { + "epoch": 2.4441328347578346, + "grad_norm": 0.6559073328971863, + "learning_rate": 3.2945908765651066e-05, + "loss": 0.4374, + "step": 109810 + }, + { + "epoch": 2.4443554131054133, + "grad_norm": 0.6892669200897217, + "learning_rate": 3.292027856760538e-05, + "loss": 0.3472, + "step": 109820 + }, + { + "epoch": 2.4445779914529915, + "grad_norm": 0.6675540208816528, + "learning_rate": 3.289465744880858e-05, + "loss": 0.5037, + "step": 109830 + }, + { + "epoch": 2.4448005698005697, + "grad_norm": 0.4565548896789551, + "learning_rate": 3.286904541065285e-05, + "loss": 0.4619, + "step": 109840 + }, + { + "epoch": 2.445023148148148, + "grad_norm": 0.7293078303337097, + "learning_rate": 3.284344245453006e-05, + "loss": 0.4634, + "step": 109850 + }, + { + "epoch": 2.4452457264957266, + "grad_norm": 0.47439637780189514, + "learning_rate": 3.281784858183139e-05, + "loss": 0.3893, + "step": 109860 + }, + { + "epoch": 2.445468304843305, + "grad_norm": 0.553246796131134, + "learning_rate": 3.2792263793947705e-05, + "loss": 0.4897, + "step": 109870 + }, + { + "epoch": 2.445690883190883, + "grad_norm": 0.7342193722724915, + "learning_rate": 3.2766688092269284e-05, + "loss": 0.4816, + "step": 109880 + }, + { + "epoch": 2.4459134615384617, + "grad_norm": 0.5039429068565369, + "learning_rate": 3.274112147818593e-05, + "loss": 0.4815, + "step": 109890 + }, + { + "epoch": 2.44613603988604, + "grad_norm": 0.5331164598464966, + "learning_rate": 3.271556395308695e-05, + "loss": 0.4651, + "step": 109900 + }, + { + "epoch": 2.446358618233618, + "grad_norm": 0.5317120552062988, + "learning_rate": 3.269001551836124e-05, + "loss": 0.5472, + "step": 109910 + }, + { + "epoch": 2.4465811965811968, + "grad_norm": 0.603212833404541, + "learning_rate": 3.266447617539698e-05, + "loss": 0.4639, + "step": 109920 + }, + { + "epoch": 2.446803774928775, + "grad_norm": 0.5109623074531555, + "learning_rate": 3.26389459255821e-05, + "loss": 0.4415, + "step": 109930 + }, + { + "epoch": 2.447026353276353, + "grad_norm": 0.7490630149841309, + "learning_rate": 3.261342477030389e-05, + "loss": 0.4875, + "step": 109940 + }, + { + "epoch": 2.447248931623932, + "grad_norm": 0.528417706489563, + "learning_rate": 3.258791271094921e-05, + "loss": 0.4238, + "step": 109950 + }, + { + "epoch": 2.44747150997151, + "grad_norm": 0.31074804067611694, + "learning_rate": 3.256240974890441e-05, + "loss": 0.3852, + "step": 109960 + }, + { + "epoch": 2.4476940883190883, + "grad_norm": 0.7868363261222839, + "learning_rate": 3.2536915885555367e-05, + "loss": 0.3732, + "step": 109970 + }, + { + "epoch": 2.4479166666666665, + "grad_norm": 0.4590838849544525, + "learning_rate": 3.251143112228743e-05, + "loss": 0.4183, + "step": 109980 + }, + { + "epoch": 2.448139245014245, + "grad_norm": 0.7592793703079224, + "learning_rate": 3.248595546048536e-05, + "loss": 0.4788, + "step": 109990 + }, + { + "epoch": 2.4483618233618234, + "grad_norm": 0.6533322930335999, + "learning_rate": 3.2460488901533635e-05, + "loss": 0.4313, + "step": 110000 + }, + { + "epoch": 2.4485844017094016, + "grad_norm": 0.8436979055404663, + "learning_rate": 3.2435031446816075e-05, + "loss": 0.5353, + "step": 110010 + }, + { + "epoch": 2.44880698005698, + "grad_norm": 0.4479171335697174, + "learning_rate": 3.240958309771609e-05, + "loss": 0.4168, + "step": 110020 + }, + { + "epoch": 2.4490295584045585, + "grad_norm": 0.9244013428688049, + "learning_rate": 3.238414385561655e-05, + "loss": 0.4576, + "step": 110030 + }, + { + "epoch": 2.4492521367521367, + "grad_norm": 0.49031445384025574, + "learning_rate": 3.235871372189985e-05, + "loss": 0.4534, + "step": 110040 + }, + { + "epoch": 2.449474715099715, + "grad_norm": 0.66366046667099, + "learning_rate": 3.2333292697947934e-05, + "loss": 0.4447, + "step": 110050 + }, + { + "epoch": 2.4496972934472936, + "grad_norm": 0.44934597611427307, + "learning_rate": 3.230788078514211e-05, + "loss": 0.3764, + "step": 110060 + }, + { + "epoch": 2.449919871794872, + "grad_norm": 0.25253328680992126, + "learning_rate": 3.2282477984863326e-05, + "loss": 0.4856, + "step": 110070 + }, + { + "epoch": 2.45014245014245, + "grad_norm": 0.6734674572944641, + "learning_rate": 3.225708429849197e-05, + "loss": 0.4353, + "step": 110080 + }, + { + "epoch": 2.4503650284900287, + "grad_norm": 0.5335591435432434, + "learning_rate": 3.2231699727407984e-05, + "loss": 0.4386, + "step": 110090 + }, + { + "epoch": 2.450587606837607, + "grad_norm": 1.137387990951538, + "learning_rate": 3.220632427299077e-05, + "loss": 0.4026, + "step": 110100 + }, + { + "epoch": 2.450810185185185, + "grad_norm": 0.748821496963501, + "learning_rate": 3.2180957936619324e-05, + "loss": 0.474, + "step": 110110 + }, + { + "epoch": 2.451032763532764, + "grad_norm": 0.47783905267715454, + "learning_rate": 3.215560071967198e-05, + "loss": 0.453, + "step": 110120 + }, + { + "epoch": 2.451255341880342, + "grad_norm": 0.4711649417877197, + "learning_rate": 3.213025262352667e-05, + "loss": 0.5357, + "step": 110130 + }, + { + "epoch": 2.45147792022792, + "grad_norm": 0.3978164494037628, + "learning_rate": 3.210491364956085e-05, + "loss": 0.4032, + "step": 110140 + }, + { + "epoch": 2.4517004985754984, + "grad_norm": 0.7222580909729004, + "learning_rate": 3.207958379915148e-05, + "loss": 0.5119, + "step": 110150 + }, + { + "epoch": 2.451923076923077, + "grad_norm": 0.4609293043613434, + "learning_rate": 3.205426307367498e-05, + "loss": 0.316, + "step": 110160 + }, + { + "epoch": 2.4521456552706553, + "grad_norm": 0.8023900985717773, + "learning_rate": 3.202895147450731e-05, + "loss": 0.4822, + "step": 110170 + }, + { + "epoch": 2.4523682336182335, + "grad_norm": 0.47974106669425964, + "learning_rate": 3.200364900302393e-05, + "loss": 0.4279, + "step": 110180 + }, + { + "epoch": 2.4525908119658117, + "grad_norm": 0.4707964062690735, + "learning_rate": 3.197835566059983e-05, + "loss": 0.367, + "step": 110190 + }, + { + "epoch": 2.4528133903133904, + "grad_norm": 0.6110070943832397, + "learning_rate": 3.1953071448609396e-05, + "loss": 0.4845, + "step": 110200 + }, + { + "epoch": 2.4530359686609686, + "grad_norm": 0.7285255789756775, + "learning_rate": 3.192779636842662e-05, + "loss": 0.508, + "step": 110210 + }, + { + "epoch": 2.453258547008547, + "grad_norm": 0.630529522895813, + "learning_rate": 3.190253042142499e-05, + "loss": 0.4566, + "step": 110220 + }, + { + "epoch": 2.4534811253561255, + "grad_norm": 0.5275912880897522, + "learning_rate": 3.1877273608977455e-05, + "loss": 0.4482, + "step": 110230 + }, + { + "epoch": 2.4537037037037037, + "grad_norm": 0.6072720885276794, + "learning_rate": 3.185202593245655e-05, + "loss": 0.5253, + "step": 110240 + }, + { + "epoch": 2.453926282051282, + "grad_norm": 0.5082536935806274, + "learning_rate": 3.182678739323417e-05, + "loss": 0.5118, + "step": 110250 + }, + { + "epoch": 2.4541488603988606, + "grad_norm": 0.5880118608474731, + "learning_rate": 3.1801557992681875e-05, + "loss": 0.4145, + "step": 110260 + }, + { + "epoch": 2.454371438746439, + "grad_norm": 0.32024118304252625, + "learning_rate": 3.177633773217057e-05, + "loss": 0.4723, + "step": 110270 + }, + { + "epoch": 2.454594017094017, + "grad_norm": 0.5531744360923767, + "learning_rate": 3.1751126613070805e-05, + "loss": 0.4664, + "step": 110280 + }, + { + "epoch": 2.4548165954415953, + "grad_norm": 0.49054989218711853, + "learning_rate": 3.1725924636752525e-05, + "loss": 0.5336, + "step": 110290 + }, + { + "epoch": 2.455039173789174, + "grad_norm": 0.5179944634437561, + "learning_rate": 3.1700731804585285e-05, + "loss": 0.4942, + "step": 110300 + }, + { + "epoch": 2.455261752136752, + "grad_norm": 0.7548027634620667, + "learning_rate": 3.1675548117938025e-05, + "loss": 0.4363, + "step": 110310 + }, + { + "epoch": 2.4554843304843303, + "grad_norm": 0.5132513046264648, + "learning_rate": 3.165037357817928e-05, + "loss": 0.4183, + "step": 110320 + }, + { + "epoch": 2.455706908831909, + "grad_norm": 0.6227317452430725, + "learning_rate": 3.1625208186677115e-05, + "loss": 0.4933, + "step": 110330 + }, + { + "epoch": 2.4559294871794872, + "grad_norm": 0.5882665514945984, + "learning_rate": 3.1600051944798935e-05, + "loss": 0.3687, + "step": 110340 + }, + { + "epoch": 2.4561520655270654, + "grad_norm": 0.707750678062439, + "learning_rate": 3.157490485391177e-05, + "loss": 0.4263, + "step": 110350 + }, + { + "epoch": 2.4563746438746437, + "grad_norm": 0.4505460560321808, + "learning_rate": 3.1549766915382165e-05, + "loss": 0.4801, + "step": 110360 + }, + { + "epoch": 2.4565972222222223, + "grad_norm": 0.5214482545852661, + "learning_rate": 3.152463813057618e-05, + "loss": 0.434, + "step": 110370 + }, + { + "epoch": 2.4568198005698005, + "grad_norm": 0.4563765823841095, + "learning_rate": 3.1499518500859216e-05, + "loss": 0.3846, + "step": 110380 + }, + { + "epoch": 2.4570423789173788, + "grad_norm": 0.3778112530708313, + "learning_rate": 3.147440802759636e-05, + "loss": 0.3905, + "step": 110390 + }, + { + "epoch": 2.4572649572649574, + "grad_norm": 0.6129999756813049, + "learning_rate": 3.144930671215218e-05, + "loss": 0.4428, + "step": 110400 + }, + { + "epoch": 2.4574875356125356, + "grad_norm": 0.5692477822303772, + "learning_rate": 3.142421455589062e-05, + "loss": 0.4818, + "step": 110410 + }, + { + "epoch": 2.457710113960114, + "grad_norm": 0.5394112467765808, + "learning_rate": 3.1399131560175245e-05, + "loss": 0.4291, + "step": 110420 + }, + { + "epoch": 2.4579326923076925, + "grad_norm": 0.6020023822784424, + "learning_rate": 3.1374057726369076e-05, + "loss": 0.4352, + "step": 110430 + }, + { + "epoch": 2.4581552706552707, + "grad_norm": 0.4298074245452881, + "learning_rate": 3.134899305583465e-05, + "loss": 0.4341, + "step": 110440 + }, + { + "epoch": 2.458377849002849, + "grad_norm": 0.42165303230285645, + "learning_rate": 3.1323937549934015e-05, + "loss": 0.4388, + "step": 110450 + }, + { + "epoch": 2.458600427350427, + "grad_norm": 1.0087652206420898, + "learning_rate": 3.129889121002873e-05, + "loss": 0.4093, + "step": 110460 + }, + { + "epoch": 2.458823005698006, + "grad_norm": 0.40058034658432007, + "learning_rate": 3.127385403747976e-05, + "loss": 0.4373, + "step": 110470 + }, + { + "epoch": 2.459045584045584, + "grad_norm": 0.3793736696243286, + "learning_rate": 3.1248826033647695e-05, + "loss": 0.4836, + "step": 110480 + }, + { + "epoch": 2.4592681623931623, + "grad_norm": 0.4240846633911133, + "learning_rate": 3.1223807199892576e-05, + "loss": 0.4503, + "step": 110490 + }, + { + "epoch": 2.459490740740741, + "grad_norm": 0.5919007658958435, + "learning_rate": 3.1198797537573975e-05, + "loss": 0.4513, + "step": 110500 + }, + { + "epoch": 2.459713319088319, + "grad_norm": 0.5737066268920898, + "learning_rate": 3.117379704805086e-05, + "loss": 0.4486, + "step": 110510 + }, + { + "epoch": 2.4599358974358974, + "grad_norm": 0.5145678520202637, + "learning_rate": 3.114880573268182e-05, + "loss": 0.4535, + "step": 110520 + }, + { + "epoch": 2.4601584757834756, + "grad_norm": 0.5854604840278625, + "learning_rate": 3.1123823592824887e-05, + "loss": 0.4105, + "step": 110530 + }, + { + "epoch": 2.460292022792023, + "eval_loss": 0.5268440842628479, + "eval_runtime": 337.3162, + "eval_samples_per_second": 7.011, + "eval_steps_per_second": 7.011, + "step": 110536 + }, + { + "epoch": 2.4603810541310542, + "grad_norm": 0.5479249954223633, + "learning_rate": 3.1098850629837705e-05, + "loss": 0.5216, + "step": 110540 + }, + { + "epoch": 2.4606036324786325, + "grad_norm": 0.2902457118034363, + "learning_rate": 3.107388684507717e-05, + "loss": 0.4611, + "step": 110550 + }, + { + "epoch": 2.4608262108262107, + "grad_norm": 0.4755493700504303, + "learning_rate": 3.104893223989995e-05, + "loss": 0.3868, + "step": 110560 + }, + { + "epoch": 2.4610487891737893, + "grad_norm": 0.5796645879745483, + "learning_rate": 3.102398681566203e-05, + "loss": 0.4204, + "step": 110570 + }, + { + "epoch": 2.4612713675213675, + "grad_norm": 1.0918166637420654, + "learning_rate": 3.099905057371901e-05, + "loss": 0.6028, + "step": 110580 + }, + { + "epoch": 2.4614939458689458, + "grad_norm": 0.4401234984397888, + "learning_rate": 3.097412351542595e-05, + "loss": 0.4773, + "step": 110590 + }, + { + "epoch": 2.4617165242165244, + "grad_norm": 0.5752748847007751, + "learning_rate": 3.094920564213741e-05, + "loss": 0.544, + "step": 110600 + }, + { + "epoch": 2.4619391025641026, + "grad_norm": 0.5700196027755737, + "learning_rate": 3.09242969552074e-05, + "loss": 0.4934, + "step": 110610 + }, + { + "epoch": 2.462161680911681, + "grad_norm": 0.6183973550796509, + "learning_rate": 3.089939745598949e-05, + "loss": 0.4989, + "step": 110620 + }, + { + "epoch": 2.462384259259259, + "grad_norm": 0.616034746170044, + "learning_rate": 3.087450714583675e-05, + "loss": 0.4945, + "step": 110630 + }, + { + "epoch": 2.4626068376068377, + "grad_norm": 0.5810309052467346, + "learning_rate": 3.0849626026101796e-05, + "loss": 0.3826, + "step": 110640 + }, + { + "epoch": 2.462829415954416, + "grad_norm": 0.6257836222648621, + "learning_rate": 3.082475409813659e-05, + "loss": 0.4761, + "step": 110650 + }, + { + "epoch": 2.463051994301994, + "grad_norm": 0.47654014825820923, + "learning_rate": 3.0799891363292755e-05, + "loss": 0.486, + "step": 110660 + }, + { + "epoch": 2.463274572649573, + "grad_norm": 0.37999096512794495, + "learning_rate": 3.0775037822921325e-05, + "loss": 0.3583, + "step": 110670 + }, + { + "epoch": 2.463497150997151, + "grad_norm": 0.6416193246841431, + "learning_rate": 3.075019347837291e-05, + "loss": 0.5002, + "step": 110680 + }, + { + "epoch": 2.4637197293447293, + "grad_norm": 0.6239182353019714, + "learning_rate": 3.07253583309975e-05, + "loss": 0.4129, + "step": 110690 + }, + { + "epoch": 2.4639423076923075, + "grad_norm": 0.7033472061157227, + "learning_rate": 3.07005323821447e-05, + "loss": 0.4989, + "step": 110700 + }, + { + "epoch": 2.464164886039886, + "grad_norm": 0.42417532205581665, + "learning_rate": 3.067571563316356e-05, + "loss": 0.4806, + "step": 110710 + }, + { + "epoch": 2.4643874643874644, + "grad_norm": 0.6155065298080444, + "learning_rate": 3.065090808540265e-05, + "loss": 0.4852, + "step": 110720 + }, + { + "epoch": 2.4646100427350426, + "grad_norm": 0.6713215112686157, + "learning_rate": 3.062610974021001e-05, + "loss": 0.4321, + "step": 110730 + }, + { + "epoch": 2.4648326210826212, + "grad_norm": 0.6614099740982056, + "learning_rate": 3.06013205989333e-05, + "loss": 0.4752, + "step": 110740 + }, + { + "epoch": 2.4650551994301995, + "grad_norm": 0.7085887789726257, + "learning_rate": 3.057654066291944e-05, + "loss": 0.5011, + "step": 110750 + }, + { + "epoch": 2.4652777777777777, + "grad_norm": 0.5104491114616394, + "learning_rate": 3.055176993351505e-05, + "loss": 0.3953, + "step": 110760 + }, + { + "epoch": 2.4655003561253563, + "grad_norm": 0.7498946785926819, + "learning_rate": 3.052700841206626e-05, + "loss": 0.4204, + "step": 110770 + }, + { + "epoch": 2.4657229344729346, + "grad_norm": 0.49787577986717224, + "learning_rate": 3.0502256099918524e-05, + "loss": 0.4701, + "step": 110780 + }, + { + "epoch": 2.4659455128205128, + "grad_norm": 0.3691440224647522, + "learning_rate": 3.0477512998416946e-05, + "loss": 0.4112, + "step": 110790 + }, + { + "epoch": 2.466168091168091, + "grad_norm": 0.4830651879310608, + "learning_rate": 3.0452779108906072e-05, + "loss": 0.4191, + "step": 110800 + }, + { + "epoch": 2.4663906695156697, + "grad_norm": 0.5980969667434692, + "learning_rate": 3.0428054432730002e-05, + "loss": 0.4378, + "step": 110810 + }, + { + "epoch": 2.466613247863248, + "grad_norm": 0.7120261788368225, + "learning_rate": 3.040333897123231e-05, + "loss": 0.5123, + "step": 110820 + }, + { + "epoch": 2.466835826210826, + "grad_norm": 0.5967547297477722, + "learning_rate": 3.037863272575596e-05, + "loss": 0.4578, + "step": 110830 + }, + { + "epoch": 2.4670584045584047, + "grad_norm": 0.5113248229026794, + "learning_rate": 3.03539356976436e-05, + "loss": 0.5512, + "step": 110840 + }, + { + "epoch": 2.467280982905983, + "grad_norm": 0.48821139335632324, + "learning_rate": 3.032924788823721e-05, + "loss": 0.5002, + "step": 110850 + }, + { + "epoch": 2.467503561253561, + "grad_norm": 0.6478132009506226, + "learning_rate": 3.0304569298878414e-05, + "loss": 0.4861, + "step": 110860 + }, + { + "epoch": 2.4677261396011394, + "grad_norm": 0.6745758056640625, + "learning_rate": 3.027989993090823e-05, + "loss": 0.5007, + "step": 110870 + }, + { + "epoch": 2.467948717948718, + "grad_norm": 0.6008769273757935, + "learning_rate": 3.025523978566729e-05, + "loss": 0.4701, + "step": 110880 + }, + { + "epoch": 2.4681712962962963, + "grad_norm": 0.6006442308425903, + "learning_rate": 3.0230588864495523e-05, + "loss": 0.5536, + "step": 110890 + }, + { + "epoch": 2.4683938746438745, + "grad_norm": 0.6266177892684937, + "learning_rate": 3.0205947168732575e-05, + "loss": 0.4486, + "step": 110900 + }, + { + "epoch": 2.468616452991453, + "grad_norm": 0.46426495909690857, + "learning_rate": 3.018131469971741e-05, + "loss": 0.3734, + "step": 110910 + }, + { + "epoch": 2.4688390313390314, + "grad_norm": 0.6160069704055786, + "learning_rate": 3.0156691458788634e-05, + "loss": 0.5496, + "step": 110920 + }, + { + "epoch": 2.4690616096866096, + "grad_norm": 0.433323472738266, + "learning_rate": 3.013207744728428e-05, + "loss": 0.4332, + "step": 110930 + }, + { + "epoch": 2.4692841880341883, + "grad_norm": 0.2941664755344391, + "learning_rate": 3.01074726665419e-05, + "loss": 0.4586, + "step": 110940 + }, + { + "epoch": 2.4695067663817665, + "grad_norm": 0.8551943898200989, + "learning_rate": 3.0082877117898523e-05, + "loss": 0.4004, + "step": 110950 + }, + { + "epoch": 2.4697293447293447, + "grad_norm": 0.7417336702346802, + "learning_rate": 3.0058290802690758e-05, + "loss": 0.5409, + "step": 110960 + }, + { + "epoch": 2.469951923076923, + "grad_norm": 0.47685909271240234, + "learning_rate": 3.0033713722254564e-05, + "loss": 0.3813, + "step": 110970 + }, + { + "epoch": 2.4701745014245016, + "grad_norm": 0.3092232942581177, + "learning_rate": 3.0009145877925472e-05, + "loss": 0.4626, + "step": 110980 + }, + { + "epoch": 2.47039707977208, + "grad_norm": 0.46632498502731323, + "learning_rate": 2.998458727103859e-05, + "loss": 0.4482, + "step": 110990 + }, + { + "epoch": 2.470619658119658, + "grad_norm": 0.5601755976676941, + "learning_rate": 2.9960037902928383e-05, + "loss": 0.449, + "step": 111000 + }, + { + "epoch": 2.470842236467236, + "grad_norm": 0.3991512656211853, + "learning_rate": 2.9935497774928946e-05, + "loss": 0.3801, + "step": 111010 + }, + { + "epoch": 2.471064814814815, + "grad_norm": 0.49260395765304565, + "learning_rate": 2.9910966888373802e-05, + "loss": 0.4444, + "step": 111020 + }, + { + "epoch": 2.471287393162393, + "grad_norm": 0.746168851852417, + "learning_rate": 2.988644524459594e-05, + "loss": 0.4011, + "step": 111030 + }, + { + "epoch": 2.4715099715099713, + "grad_norm": 0.5269076228141785, + "learning_rate": 2.9861932844927932e-05, + "loss": 0.3908, + "step": 111040 + }, + { + "epoch": 2.47173254985755, + "grad_norm": 0.4912148118019104, + "learning_rate": 2.9837429690701734e-05, + "loss": 0.4341, + "step": 111050 + }, + { + "epoch": 2.471955128205128, + "grad_norm": 0.4004683196544647, + "learning_rate": 2.9812935783248906e-05, + "loss": 0.3901, + "step": 111060 + }, + { + "epoch": 2.4721777065527064, + "grad_norm": 0.5287625193595886, + "learning_rate": 2.9788451123900473e-05, + "loss": 0.4416, + "step": 111070 + }, + { + "epoch": 2.472400284900285, + "grad_norm": 0.7422000169754028, + "learning_rate": 2.9763975713986948e-05, + "loss": 0.3797, + "step": 111080 + }, + { + "epoch": 2.4726228632478633, + "grad_norm": 0.8129696249961853, + "learning_rate": 2.973950955483835e-05, + "loss": 0.5297, + "step": 111090 + }, + { + "epoch": 2.4728454415954415, + "grad_norm": 0.4405764639377594, + "learning_rate": 2.9715052647784226e-05, + "loss": 0.4761, + "step": 111100 + }, + { + "epoch": 2.47306801994302, + "grad_norm": 0.8199268579483032, + "learning_rate": 2.969060499415348e-05, + "loss": 0.5153, + "step": 111110 + }, + { + "epoch": 2.4732905982905984, + "grad_norm": 0.6683918237686157, + "learning_rate": 2.9666166595274702e-05, + "loss": 0.3938, + "step": 111120 + }, + { + "epoch": 2.4735131766381766, + "grad_norm": 0.5031790733337402, + "learning_rate": 2.9641737452475872e-05, + "loss": 0.3214, + "step": 111130 + }, + { + "epoch": 2.473735754985755, + "grad_norm": 0.5017289519309998, + "learning_rate": 2.961731756708448e-05, + "loss": 0.4305, + "step": 111140 + }, + { + "epoch": 2.4739583333333335, + "grad_norm": 0.7404310703277588, + "learning_rate": 2.9592906940427534e-05, + "loss": 0.4257, + "step": 111150 + }, + { + "epoch": 2.4741809116809117, + "grad_norm": 0.6419523358345032, + "learning_rate": 2.9568505573831574e-05, + "loss": 0.5095, + "step": 111160 + }, + { + "epoch": 2.47440349002849, + "grad_norm": 0.380824476480484, + "learning_rate": 2.9544113468622492e-05, + "loss": 0.3854, + "step": 111170 + }, + { + "epoch": 2.474626068376068, + "grad_norm": 0.6494007706642151, + "learning_rate": 2.9519730626125874e-05, + "loss": 0.5319, + "step": 111180 + }, + { + "epoch": 2.474848646723647, + "grad_norm": 0.46472489833831787, + "learning_rate": 2.9495357047666618e-05, + "loss": 0.4741, + "step": 111190 + }, + { + "epoch": 2.475071225071225, + "grad_norm": 0.7664408087730408, + "learning_rate": 2.9470992734569236e-05, + "loss": 0.4839, + "step": 111200 + }, + { + "epoch": 2.4752938034188032, + "grad_norm": 0.5787988305091858, + "learning_rate": 2.944663768815772e-05, + "loss": 0.4816, + "step": 111210 + }, + { + "epoch": 2.475516381766382, + "grad_norm": 0.5572113394737244, + "learning_rate": 2.9422291909755517e-05, + "loss": 0.5709, + "step": 111220 + }, + { + "epoch": 2.47573896011396, + "grad_norm": 0.5527510046958923, + "learning_rate": 2.9397955400685618e-05, + "loss": 0.4282, + "step": 111230 + }, + { + "epoch": 2.4759615384615383, + "grad_norm": 0.43369776010513306, + "learning_rate": 2.937362816227054e-05, + "loss": 0.4293, + "step": 111240 + }, + { + "epoch": 2.476184116809117, + "grad_norm": 0.5547199845314026, + "learning_rate": 2.9349310195832135e-05, + "loss": 0.3511, + "step": 111250 + }, + { + "epoch": 2.476406695156695, + "grad_norm": 0.32809948921203613, + "learning_rate": 2.9325001502691907e-05, + "loss": 0.5122, + "step": 111260 + }, + { + "epoch": 2.4766292735042734, + "grad_norm": 0.6356037855148315, + "learning_rate": 2.930070208417084e-05, + "loss": 0.516, + "step": 111270 + }, + { + "epoch": 2.476851851851852, + "grad_norm": 0.548595666885376, + "learning_rate": 2.9276411941589342e-05, + "loss": 0.448, + "step": 111280 + }, + { + "epoch": 2.4770744301994303, + "grad_norm": 0.5238490700721741, + "learning_rate": 2.925213107626743e-05, + "loss": 0.4345, + "step": 111290 + }, + { + "epoch": 2.4772970085470085, + "grad_norm": 0.3655892014503479, + "learning_rate": 2.9227859489524467e-05, + "loss": 0.4731, + "step": 111300 + }, + { + "epoch": 2.4775195868945867, + "grad_norm": 0.7614578604698181, + "learning_rate": 2.9203597182679444e-05, + "loss": 0.4322, + "step": 111310 + }, + { + "epoch": 2.4777421652421654, + "grad_norm": 0.8040751814842224, + "learning_rate": 2.9179344157050724e-05, + "loss": 0.4649, + "step": 111320 + }, + { + "epoch": 2.4779647435897436, + "grad_norm": 0.7341752052307129, + "learning_rate": 2.9155100413956306e-05, + "loss": 0.4542, + "step": 111330 + }, + { + "epoch": 2.478187321937322, + "grad_norm": 0.5599677562713623, + "learning_rate": 2.913086595471357e-05, + "loss": 0.4664, + "step": 111340 + }, + { + "epoch": 2.4784099002849, + "grad_norm": 0.551198422908783, + "learning_rate": 2.9106640780639472e-05, + "loss": 0.5055, + "step": 111350 + }, + { + "epoch": 2.4786324786324787, + "grad_norm": 0.6239669322967529, + "learning_rate": 2.9082424893050398e-05, + "loss": 0.5534, + "step": 111360 + }, + { + "epoch": 2.478855056980057, + "grad_norm": 0.5712863802909851, + "learning_rate": 2.9058218293262297e-05, + "loss": 0.4345, + "step": 111370 + }, + { + "epoch": 2.479077635327635, + "grad_norm": 0.5392316579818726, + "learning_rate": 2.903402098259058e-05, + "loss": 0.3854, + "step": 111380 + }, + { + "epoch": 2.479300213675214, + "grad_norm": 0.6309967637062073, + "learning_rate": 2.9009832962350092e-05, + "loss": 0.4446, + "step": 111390 + }, + { + "epoch": 2.479522792022792, + "grad_norm": 0.667599081993103, + "learning_rate": 2.8985654233855243e-05, + "loss": 0.4203, + "step": 111400 + }, + { + "epoch": 2.4797453703703702, + "grad_norm": 0.7004653215408325, + "learning_rate": 2.8961484798419934e-05, + "loss": 0.4952, + "step": 111410 + }, + { + "epoch": 2.479967948717949, + "grad_norm": 0.4802926480770111, + "learning_rate": 2.8937324657357632e-05, + "loss": 0.5486, + "step": 111420 + }, + { + "epoch": 2.480190527065527, + "grad_norm": 0.5662277936935425, + "learning_rate": 2.8913173811981086e-05, + "loss": 0.5311, + "step": 111430 + }, + { + "epoch": 2.4804131054131053, + "grad_norm": 0.794284999370575, + "learning_rate": 2.8889032263602733e-05, + "loss": 0.4408, + "step": 111440 + }, + { + "epoch": 2.480635683760684, + "grad_norm": 0.5913161635398865, + "learning_rate": 2.88649000135345e-05, + "loss": 0.5315, + "step": 111450 + }, + { + "epoch": 2.480858262108262, + "grad_norm": 0.4098511040210724, + "learning_rate": 2.8840777063087655e-05, + "loss": 0.3523, + "step": 111460 + }, + { + "epoch": 2.4810808404558404, + "grad_norm": 0.6115807890892029, + "learning_rate": 2.8816663413573096e-05, + "loss": 0.3661, + "step": 111470 + }, + { + "epoch": 2.4813034188034186, + "grad_norm": 0.8771860599517822, + "learning_rate": 2.8792559066301183e-05, + "loss": 0.4146, + "step": 111480 + }, + { + "epoch": 2.4815259971509973, + "grad_norm": 0.7560129761695862, + "learning_rate": 2.8768464022581755e-05, + "loss": 0.5119, + "step": 111490 + }, + { + "epoch": 2.4817485754985755, + "grad_norm": 0.590459406375885, + "learning_rate": 2.8744378283724184e-05, + "loss": 0.4552, + "step": 111500 + }, + { + "epoch": 2.4819711538461537, + "grad_norm": 0.6007856726646423, + "learning_rate": 2.872030185103729e-05, + "loss": 0.4496, + "step": 111510 + }, + { + "epoch": 2.482193732193732, + "grad_norm": 0.7245687246322632, + "learning_rate": 2.8696234725829452e-05, + "loss": 0.4736, + "step": 111520 + }, + { + "epoch": 2.4824163105413106, + "grad_norm": 0.5625166893005371, + "learning_rate": 2.867217690940842e-05, + "loss": 0.4426, + "step": 111530 + }, + { + "epoch": 2.482638888888889, + "grad_norm": 0.4895360469818115, + "learning_rate": 2.864812840308153e-05, + "loss": 0.4357, + "step": 111540 + }, + { + "epoch": 2.482861467236467, + "grad_norm": 0.502686619758606, + "learning_rate": 2.862408920815567e-05, + "loss": 0.3162, + "step": 111550 + }, + { + "epoch": 2.4830840455840457, + "grad_norm": 0.5399205684661865, + "learning_rate": 2.8600059325937057e-05, + "loss": 0.4838, + "step": 111560 + }, + { + "epoch": 2.483306623931624, + "grad_norm": 0.9145728349685669, + "learning_rate": 2.8576038757731537e-05, + "loss": 0.523, + "step": 111570 + }, + { + "epoch": 2.483529202279202, + "grad_norm": 0.6872720122337341, + "learning_rate": 2.8552027504844404e-05, + "loss": 0.3491, + "step": 111580 + }, + { + "epoch": 2.483751780626781, + "grad_norm": 0.5079686045646667, + "learning_rate": 2.8528025568580495e-05, + "loss": 0.3177, + "step": 111590 + }, + { + "epoch": 2.483974358974359, + "grad_norm": 0.5998859405517578, + "learning_rate": 2.8504032950243998e-05, + "loss": 0.4102, + "step": 111600 + }, + { + "epoch": 2.4841969373219372, + "grad_norm": 0.7380645275115967, + "learning_rate": 2.8480049651138752e-05, + "loss": 0.4272, + "step": 111610 + }, + { + "epoch": 2.484419515669516, + "grad_norm": 0.4750724732875824, + "learning_rate": 2.8456075672568028e-05, + "loss": 0.4503, + "step": 111620 + }, + { + "epoch": 2.484642094017094, + "grad_norm": 0.7769150733947754, + "learning_rate": 2.843211101583456e-05, + "loss": 0.533, + "step": 111630 + }, + { + "epoch": 2.4848646723646723, + "grad_norm": 0.36780810356140137, + "learning_rate": 2.840815568224067e-05, + "loss": 0.3738, + "step": 111640 + }, + { + "epoch": 2.4850872507122506, + "grad_norm": 0.7026530504226685, + "learning_rate": 2.8384209673088036e-05, + "loss": 0.4269, + "step": 111650 + }, + { + "epoch": 2.4853098290598292, + "grad_norm": 0.7291935086250305, + "learning_rate": 2.8360272989678005e-05, + "loss": 0.5512, + "step": 111660 + }, + { + "epoch": 2.4855324074074074, + "grad_norm": 0.6416599750518799, + "learning_rate": 2.8336345633311178e-05, + "loss": 0.4808, + "step": 111670 + }, + { + "epoch": 2.4857549857549857, + "grad_norm": 0.607215940952301, + "learning_rate": 2.831242760528794e-05, + "loss": 0.3958, + "step": 111680 + }, + { + "epoch": 2.485977564102564, + "grad_norm": 0.557759165763855, + "learning_rate": 2.8288518906907868e-05, + "loss": 0.4252, + "step": 111690 + }, + { + "epoch": 2.4862001424501425, + "grad_norm": 0.5209227204322815, + "learning_rate": 2.8264619539470262e-05, + "loss": 0.4442, + "step": 111700 + }, + { + "epoch": 2.4864227207977208, + "grad_norm": 0.6094182133674622, + "learning_rate": 2.824072950427381e-05, + "loss": 0.4224, + "step": 111710 + }, + { + "epoch": 2.486645299145299, + "grad_norm": 0.6645049452781677, + "learning_rate": 2.8216848802616723e-05, + "loss": 0.4077, + "step": 111720 + }, + { + "epoch": 2.4868678774928776, + "grad_norm": 0.40797844529151917, + "learning_rate": 2.819297743579674e-05, + "loss": 0.4602, + "step": 111730 + }, + { + "epoch": 2.487090455840456, + "grad_norm": 0.42032089829444885, + "learning_rate": 2.816911540511098e-05, + "loss": 0.4549, + "step": 111740 + }, + { + "epoch": 2.487313034188034, + "grad_norm": 0.49241432547569275, + "learning_rate": 2.8145262711856158e-05, + "loss": 0.4144, + "step": 111750 + }, + { + "epoch": 2.4875356125356127, + "grad_norm": 0.42551571130752563, + "learning_rate": 2.812141935732844e-05, + "loss": 0.5501, + "step": 111760 + }, + { + "epoch": 2.487758190883191, + "grad_norm": 0.617946207523346, + "learning_rate": 2.8097585342823496e-05, + "loss": 0.4737, + "step": 111770 + }, + { + "epoch": 2.487980769230769, + "grad_norm": 0.48028650879859924, + "learning_rate": 2.8073760669636495e-05, + "loss": 0.4125, + "step": 111780 + }, + { + "epoch": 2.488203347578348, + "grad_norm": 0.6532488465309143, + "learning_rate": 2.804994533906209e-05, + "loss": 0.5161, + "step": 111790 + }, + { + "epoch": 2.488425925925926, + "grad_norm": 0.8077260255813599, + "learning_rate": 2.8026139352394464e-05, + "loss": 0.4786, + "step": 111800 + }, + { + "epoch": 2.4886485042735043, + "grad_norm": 0.628367006778717, + "learning_rate": 2.8002342710927166e-05, + "loss": 0.4582, + "step": 111810 + }, + { + "epoch": 2.4888710826210825, + "grad_norm": 0.6152153015136719, + "learning_rate": 2.79785554159534e-05, + "loss": 0.5696, + "step": 111820 + }, + { + "epoch": 2.489093660968661, + "grad_norm": 0.6126071214675903, + "learning_rate": 2.7954777468765735e-05, + "loss": 0.415, + "step": 111830 + }, + { + "epoch": 2.4893162393162394, + "grad_norm": 0.5737091302871704, + "learning_rate": 2.7931008870656272e-05, + "loss": 0.412, + "step": 111840 + }, + { + "epoch": 2.4895388176638176, + "grad_norm": 0.7487857341766357, + "learning_rate": 2.7907249622916686e-05, + "loss": 0.5441, + "step": 111850 + }, + { + "epoch": 2.489761396011396, + "grad_norm": 0.4528280794620514, + "learning_rate": 2.7883499726838015e-05, + "loss": 0.4189, + "step": 111860 + }, + { + "epoch": 2.4899839743589745, + "grad_norm": 0.5734400749206543, + "learning_rate": 2.785975918371091e-05, + "loss": 0.3899, + "step": 111870 + }, + { + "epoch": 2.4902065527065527, + "grad_norm": 0.4233279526233673, + "learning_rate": 2.7836027994825387e-05, + "loss": 0.3746, + "step": 111880 + }, + { + "epoch": 2.490429131054131, + "grad_norm": 0.511591911315918, + "learning_rate": 2.7812306161471013e-05, + "loss": 0.4438, + "step": 111890 + }, + { + "epoch": 2.4906517094017095, + "grad_norm": 0.7605999708175659, + "learning_rate": 2.7788593684936914e-05, + "loss": 0.4961, + "step": 111900 + }, + { + "epoch": 2.4908742877492878, + "grad_norm": 0.5704527497291565, + "learning_rate": 2.776489056651159e-05, + "loss": 0.3872, + "step": 111910 + }, + { + "epoch": 2.491096866096866, + "grad_norm": 0.5394191145896912, + "learning_rate": 2.7741196807483126e-05, + "loss": 0.3728, + "step": 111920 + }, + { + "epoch": 2.4913194444444446, + "grad_norm": 0.3802001476287842, + "learning_rate": 2.7717512409139044e-05, + "loss": 0.3585, + "step": 111930 + }, + { + "epoch": 2.491542022792023, + "grad_norm": 1.008232831954956, + "learning_rate": 2.7693837372766407e-05, + "loss": 0.5734, + "step": 111940 + }, + { + "epoch": 2.491764601139601, + "grad_norm": 0.5891938805580139, + "learning_rate": 2.7670171699651714e-05, + "loss": 0.5298, + "step": 111950 + }, + { + "epoch": 2.4919871794871793, + "grad_norm": 0.5905712842941284, + "learning_rate": 2.7646515391080917e-05, + "loss": 0.3535, + "step": 111960 + }, + { + "epoch": 2.492209757834758, + "grad_norm": 0.5723505616188049, + "learning_rate": 2.762286844833957e-05, + "loss": 0.4352, + "step": 111970 + }, + { + "epoch": 2.492432336182336, + "grad_norm": 0.5312645435333252, + "learning_rate": 2.7599230872712656e-05, + "loss": 0.4558, + "step": 111980 + }, + { + "epoch": 2.4926549145299144, + "grad_norm": 0.7636615037918091, + "learning_rate": 2.757560266548469e-05, + "loss": 0.4947, + "step": 111990 + }, + { + "epoch": 2.492877492877493, + "grad_norm": 0.5262453556060791, + "learning_rate": 2.7551983827939622e-05, + "loss": 0.5204, + "step": 112000 + }, + { + "epoch": 2.4931000712250713, + "grad_norm": 0.8180909156799316, + "learning_rate": 2.7528374361360953e-05, + "loss": 0.5831, + "step": 112010 + }, + { + "epoch": 2.4933226495726495, + "grad_norm": 0.5332491993904114, + "learning_rate": 2.7504774267031596e-05, + "loss": 0.4673, + "step": 112020 + }, + { + "epoch": 2.4935452279202277, + "grad_norm": 0.5418760776519775, + "learning_rate": 2.748118354623399e-05, + "loss": 0.3911, + "step": 112030 + }, + { + "epoch": 2.4937678062678064, + "grad_norm": 0.7037185430526733, + "learning_rate": 2.7457602200250134e-05, + "loss": 0.4271, + "step": 112040 + }, + { + "epoch": 2.4939903846153846, + "grad_norm": 0.42940235137939453, + "learning_rate": 2.7434030230361395e-05, + "loss": 0.3757, + "step": 112050 + }, + { + "epoch": 2.494212962962963, + "grad_norm": 0.7504114508628845, + "learning_rate": 2.7410467637848736e-05, + "loss": 0.4133, + "step": 112060 + }, + { + "epoch": 2.4944355413105415, + "grad_norm": 0.29727378487586975, + "learning_rate": 2.7386914423992593e-05, + "loss": 0.4416, + "step": 112070 + }, + { + "epoch": 2.4946581196581197, + "grad_norm": 0.5232381224632263, + "learning_rate": 2.7363370590072768e-05, + "loss": 0.4077, + "step": 112080 + }, + { + "epoch": 2.494880698005698, + "grad_norm": 0.46083950996398926, + "learning_rate": 2.7339836137368768e-05, + "loss": 0.3682, + "step": 112090 + }, + { + "epoch": 2.4951032763532766, + "grad_norm": 0.4653097987174988, + "learning_rate": 2.7316311067159394e-05, + "loss": 0.4621, + "step": 112100 + }, + { + "epoch": 2.4953258547008548, + "grad_norm": 0.6140937209129333, + "learning_rate": 2.7292795380723024e-05, + "loss": 0.5033, + "step": 112110 + }, + { + "epoch": 2.495548433048433, + "grad_norm": 0.9223384857177734, + "learning_rate": 2.7269289079337544e-05, + "loss": 0.476, + "step": 112120 + }, + { + "epoch": 2.495771011396011, + "grad_norm": 0.7549685835838318, + "learning_rate": 2.7245792164280293e-05, + "loss": 0.5107, + "step": 112130 + }, + { + "epoch": 2.49599358974359, + "grad_norm": 0.5670859813690186, + "learning_rate": 2.722230463682811e-05, + "loss": 0.4197, + "step": 112140 + }, + { + "epoch": 2.496216168091168, + "grad_norm": 0.6729944944381714, + "learning_rate": 2.7198826498257403e-05, + "loss": 0.4879, + "step": 112150 + }, + { + "epoch": 2.4964387464387463, + "grad_norm": 0.5611489415168762, + "learning_rate": 2.7175357749843855e-05, + "loss": 0.4378, + "step": 112160 + }, + { + "epoch": 2.496661324786325, + "grad_norm": 0.5496370792388916, + "learning_rate": 2.7151898392862874e-05, + "loss": 0.4153, + "step": 112170 + }, + { + "epoch": 2.496883903133903, + "grad_norm": 0.6167081594467163, + "learning_rate": 2.7128448428589216e-05, + "loss": 0.3725, + "step": 112180 + }, + { + "epoch": 2.4971064814814814, + "grad_norm": 0.6268320679664612, + "learning_rate": 2.7105007858297193e-05, + "loss": 0.4281, + "step": 112190 + }, + { + "epoch": 2.4973290598290596, + "grad_norm": 0.6511839628219604, + "learning_rate": 2.708157668326059e-05, + "loss": 0.4173, + "step": 112200 + }, + { + "epoch": 2.4975516381766383, + "grad_norm": 0.5909841656684875, + "learning_rate": 2.70581549047527e-05, + "loss": 0.5347, + "step": 112210 + }, + { + "epoch": 2.4977742165242165, + "grad_norm": 0.4672543406486511, + "learning_rate": 2.7034742524046232e-05, + "loss": 0.3949, + "step": 112220 + }, + { + "epoch": 2.4979967948717947, + "grad_norm": 0.6269485354423523, + "learning_rate": 2.7011339542413462e-05, + "loss": 0.3727, + "step": 112230 + }, + { + "epoch": 2.4982193732193734, + "grad_norm": 0.7204062342643738, + "learning_rate": 2.6987945961126082e-05, + "loss": 0.336, + "step": 112240 + }, + { + "epoch": 2.4984419515669516, + "grad_norm": 0.4992566406726837, + "learning_rate": 2.6964561781455368e-05, + "loss": 0.6063, + "step": 112250 + }, + { + "epoch": 2.49866452991453, + "grad_norm": 0.7157484292984009, + "learning_rate": 2.6941187004672007e-05, + "loss": 0.3249, + "step": 112260 + }, + { + "epoch": 2.4988871082621085, + "grad_norm": 0.912991464138031, + "learning_rate": 2.6917821632046213e-05, + "loss": 0.4232, + "step": 112270 + }, + { + "epoch": 2.4991096866096867, + "grad_norm": 0.6335121393203735, + "learning_rate": 2.689446566484768e-05, + "loss": 0.4938, + "step": 112280 + }, + { + "epoch": 2.499332264957265, + "grad_norm": 0.5526174902915955, + "learning_rate": 2.6871119104345653e-05, + "loss": 0.4716, + "step": 112290 + }, + { + "epoch": 2.499554843304843, + "grad_norm": 0.41126495599746704, + "learning_rate": 2.6847781951808682e-05, + "loss": 0.4587, + "step": 112300 + }, + { + "epoch": 2.499777421652422, + "grad_norm": 0.38075271248817444, + "learning_rate": 2.682445420850501e-05, + "loss": 0.4874, + "step": 112310 + }, + { + "epoch": 2.5, + "grad_norm": 0.48302462697029114, + "learning_rate": 2.6801135875702254e-05, + "loss": 0.3682, + "step": 112320 + }, + { + "epoch": 2.500222578347578, + "grad_norm": 0.46705126762390137, + "learning_rate": 2.6777826954667552e-05, + "loss": 0.4931, + "step": 112330 + }, + { + "epoch": 2.5004451566951564, + "grad_norm": 0.9624470472335815, + "learning_rate": 2.67545274466676e-05, + "loss": 0.5135, + "step": 112340 + }, + { + "epoch": 2.500667735042735, + "grad_norm": 0.4059145748615265, + "learning_rate": 2.6731237352968408e-05, + "loss": 0.4634, + "step": 112350 + }, + { + "epoch": 2.5008903133903133, + "grad_norm": 0.6012235283851624, + "learning_rate": 2.670795667483561e-05, + "loss": 0.5085, + "step": 112360 + }, + { + "epoch": 2.5011128917378915, + "grad_norm": 0.6443610191345215, + "learning_rate": 2.668468541353435e-05, + "loss": 0.4311, + "step": 112370 + }, + { + "epoch": 2.50133547008547, + "grad_norm": 0.4933191239833832, + "learning_rate": 2.6661423570329125e-05, + "loss": 0.5441, + "step": 112380 + }, + { + "epoch": 2.5015580484330484, + "grad_norm": 0.43826714158058167, + "learning_rate": 2.6638171146484058e-05, + "loss": 0.3718, + "step": 112390 + }, + { + "epoch": 2.5017806267806266, + "grad_norm": 0.7164596319198608, + "learning_rate": 2.6614928143262695e-05, + "loss": 0.5705, + "step": 112400 + }, + { + "epoch": 2.5020032051282053, + "grad_norm": 0.2963273227214813, + "learning_rate": 2.6591694561928073e-05, + "loss": 0.3864, + "step": 112410 + }, + { + "epoch": 2.5022257834757835, + "grad_norm": 0.7462092041969299, + "learning_rate": 2.6568470403742706e-05, + "loss": 0.4764, + "step": 112420 + }, + { + "epoch": 2.5024483618233617, + "grad_norm": 0.5694229006767273, + "learning_rate": 2.6545255669968704e-05, + "loss": 0.4772, + "step": 112430 + }, + { + "epoch": 2.5026709401709404, + "grad_norm": 0.5272653102874756, + "learning_rate": 2.6522050361867435e-05, + "loss": 0.4651, + "step": 112440 + }, + { + "epoch": 2.5028935185185186, + "grad_norm": 0.44499674439430237, + "learning_rate": 2.64988544807e-05, + "loss": 0.5382, + "step": 112450 + }, + { + "epoch": 2.503116096866097, + "grad_norm": 1.0100992918014526, + "learning_rate": 2.6475668027726807e-05, + "loss": 0.4666, + "step": 112460 + }, + { + "epoch": 2.5033386752136755, + "grad_norm": 0.39378368854522705, + "learning_rate": 2.6452491004207948e-05, + "loss": 0.5063, + "step": 112470 + }, + { + "epoch": 2.5035612535612537, + "grad_norm": 0.6302194595336914, + "learning_rate": 2.642932341140274e-05, + "loss": 0.3958, + "step": 112480 + }, + { + "epoch": 2.503783831908832, + "grad_norm": 0.4820767641067505, + "learning_rate": 2.640616525057018e-05, + "loss": 0.4808, + "step": 112490 + }, + { + "epoch": 2.50400641025641, + "grad_norm": 0.6411905288696289, + "learning_rate": 2.6383016522968728e-05, + "loss": 0.4497, + "step": 112500 + }, + { + "epoch": 2.5042289886039883, + "grad_norm": 0.6101178526878357, + "learning_rate": 2.6359877229856334e-05, + "loss": 0.4695, + "step": 112510 + }, + { + "epoch": 2.504451566951567, + "grad_norm": 0.4686170518398285, + "learning_rate": 2.63367473724903e-05, + "loss": 0.3935, + "step": 112520 + }, + { + "epoch": 2.5046741452991452, + "grad_norm": 0.3824845850467682, + "learning_rate": 2.631362695212758e-05, + "loss": 0.4467, + "step": 112530 + }, + { + "epoch": 2.5048967236467234, + "grad_norm": 0.5891847014427185, + "learning_rate": 2.6290515970024567e-05, + "loss": 0.4516, + "step": 112540 + }, + { + "epoch": 2.505119301994302, + "grad_norm": 0.6469375491142273, + "learning_rate": 2.6267414427437122e-05, + "loss": 0.4168, + "step": 112550 + }, + { + "epoch": 2.5053418803418803, + "grad_norm": 0.5130420923233032, + "learning_rate": 2.6244322325620596e-05, + "loss": 0.4113, + "step": 112560 + }, + { + "epoch": 2.5055644586894585, + "grad_norm": 0.6089246273040771, + "learning_rate": 2.6221239665829878e-05, + "loss": 0.4957, + "step": 112570 + }, + { + "epoch": 2.505787037037037, + "grad_norm": 0.39957547187805176, + "learning_rate": 2.6198166449319228e-05, + "loss": 0.4694, + "step": 112580 + }, + { + "epoch": 2.5060096153846154, + "grad_norm": 0.8839907646179199, + "learning_rate": 2.6175102677342488e-05, + "loss": 0.5822, + "step": 112590 + }, + { + "epoch": 2.5062321937321936, + "grad_norm": 0.5108381509780884, + "learning_rate": 2.6152048351153013e-05, + "loss": 0.5334, + "step": 112600 + }, + { + "epoch": 2.5064547720797723, + "grad_norm": 0.7718234062194824, + "learning_rate": 2.612900347200351e-05, + "loss": 0.5677, + "step": 112610 + }, + { + "epoch": 2.5066773504273505, + "grad_norm": 0.6024258732795715, + "learning_rate": 2.610596804114629e-05, + "loss": 0.519, + "step": 112620 + }, + { + "epoch": 2.5068999287749287, + "grad_norm": 0.6917393803596497, + "learning_rate": 2.6082942059833105e-05, + "loss": 0.5002, + "step": 112630 + }, + { + "epoch": 2.5071225071225074, + "grad_norm": 0.6841616034507751, + "learning_rate": 2.6059925529315242e-05, + "loss": 0.4769, + "step": 112640 + }, + { + "epoch": 2.5073450854700856, + "grad_norm": 0.5699818730354309, + "learning_rate": 2.603691845084346e-05, + "loss": 0.434, + "step": 112650 + }, + { + "epoch": 2.507567663817664, + "grad_norm": 0.5113489031791687, + "learning_rate": 2.6013920825667913e-05, + "loss": 0.4695, + "step": 112660 + }, + { + "epoch": 2.507790242165242, + "grad_norm": 0.553261935710907, + "learning_rate": 2.5990932655038313e-05, + "loss": 0.4824, + "step": 112670 + }, + { + "epoch": 2.5080128205128203, + "grad_norm": 0.5432833433151245, + "learning_rate": 2.5967953940203902e-05, + "loss": 0.4808, + "step": 112680 + }, + { + "epoch": 2.508235398860399, + "grad_norm": 0.8696192502975464, + "learning_rate": 2.594498468241333e-05, + "loss": 0.5923, + "step": 112690 + }, + { + "epoch": 2.508457977207977, + "grad_norm": 0.4584144353866577, + "learning_rate": 2.5922024882914797e-05, + "loss": 0.4772, + "step": 112700 + }, + { + "epoch": 2.5086805555555554, + "grad_norm": 0.653127908706665, + "learning_rate": 2.589907454295597e-05, + "loss": 0.4796, + "step": 112710 + }, + { + "epoch": 2.508903133903134, + "grad_norm": 0.6973347067832947, + "learning_rate": 2.587613366378392e-05, + "loss": 0.4021, + "step": 112720 + }, + { + "epoch": 2.5091257122507122, + "grad_norm": 0.6502978205680847, + "learning_rate": 2.585320224664536e-05, + "loss": 0.4891, + "step": 112730 + }, + { + "epoch": 2.5093482905982905, + "grad_norm": 0.4977133870124817, + "learning_rate": 2.583028029278629e-05, + "loss": 0.4121, + "step": 112740 + }, + { + "epoch": 2.509570868945869, + "grad_norm": 0.5418941378593445, + "learning_rate": 2.5807367803452387e-05, + "loss": 0.415, + "step": 112750 + }, + { + "epoch": 2.5097934472934473, + "grad_norm": 0.5099831819534302, + "learning_rate": 2.578446477988872e-05, + "loss": 0.3829, + "step": 112760 + }, + { + "epoch": 2.5100160256410255, + "grad_norm": 0.7447317838668823, + "learning_rate": 2.5761571223339842e-05, + "loss": 0.6159, + "step": 112770 + }, + { + "epoch": 2.510238603988604, + "grad_norm": 0.6790410876274109, + "learning_rate": 2.573868713504983e-05, + "loss": 0.4888, + "step": 112780 + }, + { + "epoch": 2.5104611823361824, + "grad_norm": 0.5578905344009399, + "learning_rate": 2.571581251626225e-05, + "loss": 0.4079, + "step": 112790 + }, + { + "epoch": 2.5106837606837606, + "grad_norm": 0.7169860005378723, + "learning_rate": 2.5692947368220057e-05, + "loss": 0.4423, + "step": 112800 + }, + { + "epoch": 2.5109063390313393, + "grad_norm": 0.6103445887565613, + "learning_rate": 2.567009169216581e-05, + "loss": 0.4139, + "step": 112810 + }, + { + "epoch": 2.5111289173789175, + "grad_norm": 0.5489679574966431, + "learning_rate": 2.5647245489341475e-05, + "loss": 0.4583, + "step": 112820 + }, + { + "epoch": 2.5113514957264957, + "grad_norm": 0.6748605370521545, + "learning_rate": 2.5624408760988572e-05, + "loss": 0.5856, + "step": 112830 + }, + { + "epoch": 2.511574074074074, + "grad_norm": 0.6140767931938171, + "learning_rate": 2.5601581508348037e-05, + "loss": 0.4323, + "step": 112840 + }, + { + "epoch": 2.511796652421652, + "grad_norm": 0.5786221623420715, + "learning_rate": 2.557876373266037e-05, + "loss": 0.4812, + "step": 112850 + }, + { + "epoch": 2.512019230769231, + "grad_norm": 0.7067697644233704, + "learning_rate": 2.5555955435165424e-05, + "loss": 0.5013, + "step": 112860 + }, + { + "epoch": 2.512241809116809, + "grad_norm": 0.4107857644557953, + "learning_rate": 2.5533156617102717e-05, + "loss": 0.4277, + "step": 112870 + }, + { + "epoch": 2.5124643874643873, + "grad_norm": 0.38731175661087036, + "learning_rate": 2.5510367279711057e-05, + "loss": 0.5341, + "step": 112880 + }, + { + "epoch": 2.512686965811966, + "grad_norm": 0.7229180335998535, + "learning_rate": 2.5487587424228897e-05, + "loss": 0.4351, + "step": 112890 + }, + { + "epoch": 2.512909544159544, + "grad_norm": 0.4220271408557892, + "learning_rate": 2.5464817051894097e-05, + "loss": 0.5107, + "step": 112900 + }, + { + "epoch": 2.5131321225071224, + "grad_norm": 0.7265833020210266, + "learning_rate": 2.5442056163943994e-05, + "loss": 0.5244, + "step": 112910 + }, + { + "epoch": 2.513354700854701, + "grad_norm": 0.5902624726295471, + "learning_rate": 2.5419304761615492e-05, + "loss": 0.4583, + "step": 112920 + }, + { + "epoch": 2.5135772792022792, + "grad_norm": 0.3740479350090027, + "learning_rate": 2.539656284614491e-05, + "loss": 0.4094, + "step": 112930 + }, + { + "epoch": 2.5137998575498575, + "grad_norm": 0.7416092157363892, + "learning_rate": 2.5373830418767996e-05, + "loss": 0.5095, + "step": 112940 + }, + { + "epoch": 2.514022435897436, + "grad_norm": 0.7496693134307861, + "learning_rate": 2.5351107480720093e-05, + "loss": 0.428, + "step": 112950 + }, + { + "epoch": 2.5142450142450143, + "grad_norm": 0.5957769155502319, + "learning_rate": 2.532839403323599e-05, + "loss": 0.3798, + "step": 112960 + }, + { + "epoch": 2.5144675925925926, + "grad_norm": 0.48463383316993713, + "learning_rate": 2.530569007754995e-05, + "loss": 0.4725, + "step": 112970 + }, + { + "epoch": 2.5146901709401708, + "grad_norm": 0.6827375292778015, + "learning_rate": 2.5282995614895733e-05, + "loss": 0.367, + "step": 112980 + }, + { + "epoch": 2.5149127492877494, + "grad_norm": 0.5751871466636658, + "learning_rate": 2.526031064650658e-05, + "loss": 0.3899, + "step": 112990 + }, + { + "epoch": 2.5151353276353277, + "grad_norm": 0.6092050671577454, + "learning_rate": 2.5237635173615214e-05, + "loss": 0.4741, + "step": 113000 + }, + { + "epoch": 2.515357905982906, + "grad_norm": 0.6803910732269287, + "learning_rate": 2.5214969197453765e-05, + "loss": 0.5417, + "step": 113010 + }, + { + "epoch": 2.515580484330484, + "grad_norm": 0.5000934600830078, + "learning_rate": 2.5192312719253997e-05, + "loss": 0.4847, + "step": 113020 + }, + { + "epoch": 2.5158030626780628, + "grad_norm": 0.47814443707466125, + "learning_rate": 2.5169665740247038e-05, + "loss": 0.3029, + "step": 113030 + }, + { + "epoch": 2.516025641025641, + "grad_norm": 0.5441074371337891, + "learning_rate": 2.5147028261663573e-05, + "loss": 0.4252, + "step": 113040 + }, + { + "epoch": 2.516248219373219, + "grad_norm": 0.5150730609893799, + "learning_rate": 2.512440028473373e-05, + "loss": 0.4169, + "step": 113050 + }, + { + "epoch": 2.516470797720798, + "grad_norm": 0.45782437920570374, + "learning_rate": 2.5101781810687186e-05, + "loss": 0.4274, + "step": 113060 + }, + { + "epoch": 2.516693376068376, + "grad_norm": 0.6092635989189148, + "learning_rate": 2.507917284075294e-05, + "loss": 0.4242, + "step": 113070 + }, + { + "epoch": 2.5169159544159543, + "grad_norm": 0.6076841354370117, + "learning_rate": 2.5056573376159654e-05, + "loss": 0.4427, + "step": 113080 + }, + { + "epoch": 2.517138532763533, + "grad_norm": 0.46662288904190063, + "learning_rate": 2.5033983418135386e-05, + "loss": 0.3541, + "step": 113090 + }, + { + "epoch": 2.517361111111111, + "grad_norm": 0.624786913394928, + "learning_rate": 2.501140296790767e-05, + "loss": 0.4764, + "step": 113100 + }, + { + "epoch": 2.5175836894586894, + "grad_norm": 0.31760746240615845, + "learning_rate": 2.498883202670359e-05, + "loss": 0.4431, + "step": 113110 + }, + { + "epoch": 2.517806267806268, + "grad_norm": 0.4739936888217926, + "learning_rate": 2.4966270595749652e-05, + "loss": 0.4813, + "step": 113120 + }, + { + "epoch": 2.5180288461538463, + "grad_norm": 0.6213451623916626, + "learning_rate": 2.49437186762719e-05, + "loss": 0.3937, + "step": 113130 + }, + { + "epoch": 2.5182514245014245, + "grad_norm": 0.5369846820831299, + "learning_rate": 2.4921176269495772e-05, + "loss": 0.5293, + "step": 113140 + }, + { + "epoch": 2.5184740028490027, + "grad_norm": 0.30878716707229614, + "learning_rate": 2.4898643376646204e-05, + "loss": 0.3833, + "step": 113150 + }, + { + "epoch": 2.5186965811965814, + "grad_norm": 0.6332671046257019, + "learning_rate": 2.4876119998947723e-05, + "loss": 0.4891, + "step": 113160 + }, + { + "epoch": 2.5189191595441596, + "grad_norm": 0.47033417224884033, + "learning_rate": 2.4853606137624218e-05, + "loss": 0.3578, + "step": 113170 + }, + { + "epoch": 2.519141737891738, + "grad_norm": 0.6995198130607605, + "learning_rate": 2.483110179389916e-05, + "loss": 0.4035, + "step": 113180 + }, + { + "epoch": 2.519364316239316, + "grad_norm": 0.5590694546699524, + "learning_rate": 2.480860696899543e-05, + "loss": 0.5847, + "step": 113190 + }, + { + "epoch": 2.5195868945868947, + "grad_norm": 0.7594995498657227, + "learning_rate": 2.478612166413543e-05, + "loss": 0.5278, + "step": 113200 + }, + { + "epoch": 2.519809472934473, + "grad_norm": 0.6225067973136902, + "learning_rate": 2.476364588054101e-05, + "loss": 0.3405, + "step": 113210 + }, + { + "epoch": 2.520032051282051, + "grad_norm": 0.7573350071907043, + "learning_rate": 2.4741179619433496e-05, + "loss": 0.4253, + "step": 113220 + }, + { + "epoch": 2.5202546296296298, + "grad_norm": 0.6507413387298584, + "learning_rate": 2.4718722882033763e-05, + "loss": 0.4175, + "step": 113230 + }, + { + "epoch": 2.5202991452991452, + "eval_loss": 0.5251317620277405, + "eval_runtime": 337.3492, + "eval_samples_per_second": 7.011, + "eval_steps_per_second": 7.011, + "step": 113232 + }, + { + "epoch": 2.520477207977208, + "grad_norm": 0.6276082992553711, + "learning_rate": 2.469627566956214e-05, + "loss": 0.4185, + "step": 113240 + }, + { + "epoch": 2.520699786324786, + "grad_norm": 0.5301204323768616, + "learning_rate": 2.4673837983238392e-05, + "loss": 0.3982, + "step": 113250 + }, + { + "epoch": 2.520922364672365, + "grad_norm": 0.5993652939796448, + "learning_rate": 2.465140982428187e-05, + "loss": 0.5212, + "step": 113260 + }, + { + "epoch": 2.521144943019943, + "grad_norm": 0.5609367489814758, + "learning_rate": 2.462899119391122e-05, + "loss": 0.4105, + "step": 113270 + }, + { + "epoch": 2.5213675213675213, + "grad_norm": 0.5710123777389526, + "learning_rate": 2.460658209334481e-05, + "loss": 0.3595, + "step": 113280 + }, + { + "epoch": 2.5215900997151, + "grad_norm": 0.4957076609134674, + "learning_rate": 2.458418252380028e-05, + "loss": 0.3283, + "step": 113290 + }, + { + "epoch": 2.521812678062678, + "grad_norm": 0.5248811841011047, + "learning_rate": 2.456179248649486e-05, + "loss": 0.3924, + "step": 113300 + }, + { + "epoch": 2.5220352564102564, + "grad_norm": 0.4309212267398834, + "learning_rate": 2.4539411982645268e-05, + "loss": 0.32, + "step": 113310 + }, + { + "epoch": 2.5222578347578346, + "grad_norm": 0.5939930081367493, + "learning_rate": 2.4517041013467656e-05, + "loss": 0.4909, + "step": 113320 + }, + { + "epoch": 2.5224804131054133, + "grad_norm": 0.725871741771698, + "learning_rate": 2.449467958017768e-05, + "loss": 0.4288, + "step": 113330 + }, + { + "epoch": 2.5227029914529915, + "grad_norm": 0.49962177872657776, + "learning_rate": 2.447232768399057e-05, + "loss": 0.3743, + "step": 113340 + }, + { + "epoch": 2.5229255698005697, + "grad_norm": 0.618010938167572, + "learning_rate": 2.4449985326120794e-05, + "loss": 0.5669, + "step": 113350 + }, + { + "epoch": 2.523148148148148, + "grad_norm": 0.7896900773048401, + "learning_rate": 2.442765250778254e-05, + "loss": 0.5632, + "step": 113360 + }, + { + "epoch": 2.5233707264957266, + "grad_norm": 0.5548889636993408, + "learning_rate": 2.4405329230189366e-05, + "loss": 0.4774, + "step": 113370 + }, + { + "epoch": 2.523593304843305, + "grad_norm": 0.43557047843933105, + "learning_rate": 2.4383015494554374e-05, + "loss": 0.3379, + "step": 113380 + }, + { + "epoch": 2.523815883190883, + "grad_norm": 0.47517427802085876, + "learning_rate": 2.4360711302090122e-05, + "loss": 0.4197, + "step": 113390 + }, + { + "epoch": 2.5240384615384617, + "grad_norm": 0.5873776078224182, + "learning_rate": 2.4338416654008556e-05, + "loss": 0.4637, + "step": 113400 + }, + { + "epoch": 2.52426103988604, + "grad_norm": 0.574032723903656, + "learning_rate": 2.4316131551521215e-05, + "loss": 0.4954, + "step": 113410 + }, + { + "epoch": 2.524483618233618, + "grad_norm": 0.32711052894592285, + "learning_rate": 2.4293855995839175e-05, + "loss": 0.3539, + "step": 113420 + }, + { + "epoch": 2.5247061965811968, + "grad_norm": 0.5761905908584595, + "learning_rate": 2.4271589988172783e-05, + "loss": 0.4722, + "step": 113430 + }, + { + "epoch": 2.524928774928775, + "grad_norm": 0.4207303524017334, + "learning_rate": 2.4249333529732066e-05, + "loss": 0.4692, + "step": 113440 + }, + { + "epoch": 2.525151353276353, + "grad_norm": 0.7163510322570801, + "learning_rate": 2.4227086621726414e-05, + "loss": 0.496, + "step": 113450 + }, + { + "epoch": 2.525373931623932, + "grad_norm": 0.6225937604904175, + "learning_rate": 2.4204849265364794e-05, + "loss": 0.5276, + "step": 113460 + }, + { + "epoch": 2.52559650997151, + "grad_norm": 0.4533245265483856, + "learning_rate": 2.418262146185557e-05, + "loss": 0.4885, + "step": 113470 + }, + { + "epoch": 2.5258190883190883, + "grad_norm": 0.5328688621520996, + "learning_rate": 2.416040321240667e-05, + "loss": 0.5348, + "step": 113480 + }, + { + "epoch": 2.5260416666666665, + "grad_norm": 0.4904916286468506, + "learning_rate": 2.4138194518225344e-05, + "loss": 0.4447, + "step": 113490 + }, + { + "epoch": 2.526264245014245, + "grad_norm": 0.6268056631088257, + "learning_rate": 2.411599538051852e-05, + "loss": 0.5405, + "step": 113500 + }, + { + "epoch": 2.5264868233618234, + "grad_norm": 0.5416222810745239, + "learning_rate": 2.40938058004925e-05, + "loss": 0.4533, + "step": 113510 + }, + { + "epoch": 2.5267094017094016, + "grad_norm": 0.505547285079956, + "learning_rate": 2.407162577935309e-05, + "loss": 0.3999, + "step": 113520 + }, + { + "epoch": 2.52693198005698, + "grad_norm": 0.6490024924278259, + "learning_rate": 2.4049455318305536e-05, + "loss": 0.4791, + "step": 113530 + }, + { + "epoch": 2.5271545584045585, + "grad_norm": 0.7141279578208923, + "learning_rate": 2.4027294418554602e-05, + "loss": 0.5274, + "step": 113540 + }, + { + "epoch": 2.5273771367521367, + "grad_norm": 0.6870497465133667, + "learning_rate": 2.400514308130457e-05, + "loss": 0.4472, + "step": 113550 + }, + { + "epoch": 2.527599715099715, + "grad_norm": 0.738166332244873, + "learning_rate": 2.398300130775917e-05, + "loss": 0.4365, + "step": 113560 + }, + { + "epoch": 2.5278222934472936, + "grad_norm": 0.334552526473999, + "learning_rate": 2.3960869099121542e-05, + "loss": 0.3378, + "step": 113570 + }, + { + "epoch": 2.528044871794872, + "grad_norm": 0.6833900213241577, + "learning_rate": 2.3938746456594375e-05, + "loss": 0.2975, + "step": 113580 + }, + { + "epoch": 2.52826745014245, + "grad_norm": 0.8802908658981323, + "learning_rate": 2.3916633381379862e-05, + "loss": 0.5202, + "step": 113590 + }, + { + "epoch": 2.5284900284900287, + "grad_norm": 0.641450047492981, + "learning_rate": 2.389452987467966e-05, + "loss": 0.4123, + "step": 113600 + }, + { + "epoch": 2.528712606837607, + "grad_norm": 0.7287760972976685, + "learning_rate": 2.387243593769486e-05, + "loss": 0.4331, + "step": 113610 + }, + { + "epoch": 2.528935185185185, + "grad_norm": 0.7095321416854858, + "learning_rate": 2.3850351571626118e-05, + "loss": 0.4529, + "step": 113620 + }, + { + "epoch": 2.529157763532764, + "grad_norm": 0.43919217586517334, + "learning_rate": 2.3828276777673432e-05, + "loss": 0.5158, + "step": 113630 + }, + { + "epoch": 2.529380341880342, + "grad_norm": 0.4373423755168915, + "learning_rate": 2.3806211557036394e-05, + "loss": 0.5182, + "step": 113640 + }, + { + "epoch": 2.52960292022792, + "grad_norm": 0.5198220610618591, + "learning_rate": 2.3784155910914118e-05, + "loss": 0.4346, + "step": 113650 + }, + { + "epoch": 2.5298254985754984, + "grad_norm": 0.5108519792556763, + "learning_rate": 2.376210984050502e-05, + "loss": 0.4769, + "step": 113660 + }, + { + "epoch": 2.5300480769230766, + "grad_norm": 0.5933377146720886, + "learning_rate": 2.374007334700714e-05, + "loss": 0.4075, + "step": 113670 + }, + { + "epoch": 2.5302706552706553, + "grad_norm": 0.5652526021003723, + "learning_rate": 2.371804643161797e-05, + "loss": 0.4435, + "step": 113680 + }, + { + "epoch": 2.5304932336182335, + "grad_norm": 0.3907783627510071, + "learning_rate": 2.369602909553448e-05, + "loss": 0.4308, + "step": 113690 + }, + { + "epoch": 2.5307158119658117, + "grad_norm": 0.49446383118629456, + "learning_rate": 2.3674021339953134e-05, + "loss": 0.3531, + "step": 113700 + }, + { + "epoch": 2.5309383903133904, + "grad_norm": 0.7018763422966003, + "learning_rate": 2.365202316606978e-05, + "loss": 0.4816, + "step": 113710 + }, + { + "epoch": 2.5311609686609686, + "grad_norm": 0.9009945392608643, + "learning_rate": 2.3630034575079842e-05, + "loss": 0.4595, + "step": 113720 + }, + { + "epoch": 2.531383547008547, + "grad_norm": 0.716633141040802, + "learning_rate": 2.36080555681782e-05, + "loss": 0.4499, + "step": 113730 + }, + { + "epoch": 2.5316061253561255, + "grad_norm": 0.6000513434410095, + "learning_rate": 2.3586086146559237e-05, + "loss": 0.4465, + "step": 113740 + }, + { + "epoch": 2.5318287037037037, + "grad_norm": 0.5333988666534424, + "learning_rate": 2.3564126311416757e-05, + "loss": 0.4665, + "step": 113750 + }, + { + "epoch": 2.532051282051282, + "grad_norm": 0.7265472412109375, + "learning_rate": 2.3542176063944154e-05, + "loss": 0.4582, + "step": 113760 + }, + { + "epoch": 2.5322738603988606, + "grad_norm": 0.3761736750602722, + "learning_rate": 2.35202354053341e-05, + "loss": 0.4536, + "step": 113770 + }, + { + "epoch": 2.532496438746439, + "grad_norm": 0.49985337257385254, + "learning_rate": 2.3498304336778974e-05, + "loss": 0.3787, + "step": 113780 + }, + { + "epoch": 2.532719017094017, + "grad_norm": 0.4030935764312744, + "learning_rate": 2.3476382859470445e-05, + "loss": 0.3677, + "step": 113790 + }, + { + "epoch": 2.5329415954415957, + "grad_norm": 0.5702828764915466, + "learning_rate": 2.3454470974599763e-05, + "loss": 0.4384, + "step": 113800 + }, + { + "epoch": 2.533164173789174, + "grad_norm": 0.6399329900741577, + "learning_rate": 2.343256868335768e-05, + "loss": 0.4785, + "step": 113810 + }, + { + "epoch": 2.533386752136752, + "grad_norm": 0.6407581567764282, + "learning_rate": 2.3410675986934338e-05, + "loss": 0.4976, + "step": 113820 + }, + { + "epoch": 2.5336093304843303, + "grad_norm": 0.6700713634490967, + "learning_rate": 2.3388792886519428e-05, + "loss": 0.4077, + "step": 113830 + }, + { + "epoch": 2.5338319088319086, + "grad_norm": 0.37148866057395935, + "learning_rate": 2.336691938330213e-05, + "loss": 0.4419, + "step": 113840 + }, + { + "epoch": 2.5340544871794872, + "grad_norm": 0.5929746627807617, + "learning_rate": 2.334505547847101e-05, + "loss": 0.4789, + "step": 113850 + }, + { + "epoch": 2.5342770655270654, + "grad_norm": 0.4155455529689789, + "learning_rate": 2.3323201173214182e-05, + "loss": 0.4486, + "step": 113860 + }, + { + "epoch": 2.5344996438746437, + "grad_norm": 0.6532012224197388, + "learning_rate": 2.3301356468719227e-05, + "loss": 0.5254, + "step": 113870 + }, + { + "epoch": 2.5347222222222223, + "grad_norm": 0.30867519974708557, + "learning_rate": 2.327952136617324e-05, + "loss": 0.4076, + "step": 113880 + }, + { + "epoch": 2.5349448005698005, + "grad_norm": 0.5477968454360962, + "learning_rate": 2.325769586676272e-05, + "loss": 0.3542, + "step": 113890 + }, + { + "epoch": 2.5351673789173788, + "grad_norm": 0.502474844455719, + "learning_rate": 2.323587997167371e-05, + "loss": 0.386, + "step": 113900 + }, + { + "epoch": 2.5353899572649574, + "grad_norm": 0.5856671333312988, + "learning_rate": 2.3214073682091676e-05, + "loss": 0.431, + "step": 113910 + }, + { + "epoch": 2.5356125356125356, + "grad_norm": 0.7741441130638123, + "learning_rate": 2.3192276999201633e-05, + "loss": 0.502, + "step": 113920 + }, + { + "epoch": 2.535835113960114, + "grad_norm": 0.46092262864112854, + "learning_rate": 2.3170489924187956e-05, + "loss": 0.4478, + "step": 113930 + }, + { + "epoch": 2.5360576923076925, + "grad_norm": 0.5415853261947632, + "learning_rate": 2.3148712458234623e-05, + "loss": 0.5132, + "step": 113940 + }, + { + "epoch": 2.5362802706552707, + "grad_norm": 0.6181982159614563, + "learning_rate": 2.3126944602525026e-05, + "loss": 0.3757, + "step": 113950 + }, + { + "epoch": 2.536502849002849, + "grad_norm": 0.5846037864685059, + "learning_rate": 2.310518635824206e-05, + "loss": 0.5341, + "step": 113960 + }, + { + "epoch": 2.5367254273504276, + "grad_norm": 0.8839473128318787, + "learning_rate": 2.3083437726568092e-05, + "loss": 0.5566, + "step": 113970 + }, + { + "epoch": 2.536948005698006, + "grad_norm": 0.7175643444061279, + "learning_rate": 2.3061698708684953e-05, + "loss": 0.4288, + "step": 113980 + }, + { + "epoch": 2.537170584045584, + "grad_norm": 0.6663437485694885, + "learning_rate": 2.3039969305773945e-05, + "loss": 0.4593, + "step": 113990 + }, + { + "epoch": 2.5373931623931623, + "grad_norm": 0.5830166339874268, + "learning_rate": 2.3018249519015854e-05, + "loss": 0.5306, + "step": 114000 + }, + { + "epoch": 2.5376157407407405, + "grad_norm": 0.4129011929035187, + "learning_rate": 2.2996539349590985e-05, + "loss": 0.3761, + "step": 114010 + }, + { + "epoch": 2.537838319088319, + "grad_norm": 0.42609769105911255, + "learning_rate": 2.2974838798679055e-05, + "loss": 0.4356, + "step": 114020 + }, + { + "epoch": 2.5380608974358974, + "grad_norm": 0.3722520172595978, + "learning_rate": 2.2953147867459324e-05, + "loss": 0.4635, + "step": 114030 + }, + { + "epoch": 2.5382834757834756, + "grad_norm": 0.5488028526306152, + "learning_rate": 2.2931466557110492e-05, + "loss": 0.3463, + "step": 114040 + }, + { + "epoch": 2.5385060541310542, + "grad_norm": 0.4920119345188141, + "learning_rate": 2.290979486881073e-05, + "loss": 0.5244, + "step": 114050 + }, + { + "epoch": 2.5387286324786325, + "grad_norm": 0.39283880591392517, + "learning_rate": 2.288813280373765e-05, + "loss": 0.4253, + "step": 114060 + }, + { + "epoch": 2.5389512108262107, + "grad_norm": 0.3846190571784973, + "learning_rate": 2.2866480363068422e-05, + "loss": 0.4048, + "step": 114070 + }, + { + "epoch": 2.5391737891737893, + "grad_norm": 0.4565149247646332, + "learning_rate": 2.2844837547979657e-05, + "loss": 0.3872, + "step": 114080 + }, + { + "epoch": 2.5393963675213675, + "grad_norm": 0.3994213044643402, + "learning_rate": 2.2823204359647445e-05, + "loss": 0.4686, + "step": 114090 + }, + { + "epoch": 2.5396189458689458, + "grad_norm": 0.523123025894165, + "learning_rate": 2.2801580799247367e-05, + "loss": 0.4059, + "step": 114100 + }, + { + "epoch": 2.5398415242165244, + "grad_norm": 0.5251439213752747, + "learning_rate": 2.2779966867954426e-05, + "loss": 0.3668, + "step": 114110 + }, + { + "epoch": 2.5400641025641026, + "grad_norm": 0.42427927255630493, + "learning_rate": 2.2758362566943236e-05, + "loss": 0.3829, + "step": 114120 + }, + { + "epoch": 2.540286680911681, + "grad_norm": 0.5992891788482666, + "learning_rate": 2.2736767897387655e-05, + "loss": 0.5138, + "step": 114130 + }, + { + "epoch": 2.5405092592592595, + "grad_norm": 0.9765251278877258, + "learning_rate": 2.2715182860461214e-05, + "loss": 0.503, + "step": 114140 + }, + { + "epoch": 2.5407318376068377, + "grad_norm": 0.6045100092887878, + "learning_rate": 2.2693607457336885e-05, + "loss": 0.4657, + "step": 114150 + }, + { + "epoch": 2.540954415954416, + "grad_norm": 0.5176226496696472, + "learning_rate": 2.2672041689187085e-05, + "loss": 0.5246, + "step": 114160 + }, + { + "epoch": 2.541176994301994, + "grad_norm": 0.5346884727478027, + "learning_rate": 2.26504855571837e-05, + "loss": 0.4133, + "step": 114170 + }, + { + "epoch": 2.5413995726495724, + "grad_norm": 0.42462992668151855, + "learning_rate": 2.2628939062498146e-05, + "loss": 0.416, + "step": 114180 + }, + { + "epoch": 2.541622150997151, + "grad_norm": 0.4547101557254791, + "learning_rate": 2.2607402206301243e-05, + "loss": 0.4994, + "step": 114190 + }, + { + "epoch": 2.5418447293447293, + "grad_norm": 0.6860995292663574, + "learning_rate": 2.25858749897633e-05, + "loss": 0.3547, + "step": 114200 + }, + { + "epoch": 2.5420673076923075, + "grad_norm": 0.29589366912841797, + "learning_rate": 2.256435741405414e-05, + "loss": 0.4633, + "step": 114210 + }, + { + "epoch": 2.542289886039886, + "grad_norm": 0.8256801962852478, + "learning_rate": 2.254284948034304e-05, + "loss": 0.3776, + "step": 114220 + }, + { + "epoch": 2.5425124643874644, + "grad_norm": 0.5243028998374939, + "learning_rate": 2.252135118979879e-05, + "loss": 0.3772, + "step": 114230 + }, + { + "epoch": 2.5427350427350426, + "grad_norm": 0.6656879186630249, + "learning_rate": 2.24998625435896e-05, + "loss": 0.5634, + "step": 114240 + }, + { + "epoch": 2.5429576210826212, + "grad_norm": 0.5538183450698853, + "learning_rate": 2.2478383542883208e-05, + "loss": 0.47, + "step": 114250 + }, + { + "epoch": 2.5431801994301995, + "grad_norm": 0.4275704324245453, + "learning_rate": 2.245691418884679e-05, + "loss": 0.4122, + "step": 114260 + }, + { + "epoch": 2.5434027777777777, + "grad_norm": 0.4692855179309845, + "learning_rate": 2.2435454482646966e-05, + "loss": 0.4622, + "step": 114270 + }, + { + "epoch": 2.5436253561253563, + "grad_norm": 0.4590589702129364, + "learning_rate": 2.2414004425449918e-05, + "loss": 0.3253, + "step": 114280 + }, + { + "epoch": 2.5438479344729346, + "grad_norm": 0.8088108897209167, + "learning_rate": 2.2392564018421247e-05, + "loss": 0.4271, + "step": 114290 + }, + { + "epoch": 2.5440705128205128, + "grad_norm": 0.3927932381629944, + "learning_rate": 2.237113326272604e-05, + "loss": 0.514, + "step": 114300 + }, + { + "epoch": 2.5442930911680914, + "grad_norm": 0.6496009230613708, + "learning_rate": 2.23497121595289e-05, + "loss": 0.6287, + "step": 114310 + }, + { + "epoch": 2.5445156695156697, + "grad_norm": 0.6508887410163879, + "learning_rate": 2.2328300709993788e-05, + "loss": 0.425, + "step": 114320 + }, + { + "epoch": 2.544738247863248, + "grad_norm": 0.8003069162368774, + "learning_rate": 2.2306898915284324e-05, + "loss": 0.4814, + "step": 114330 + }, + { + "epoch": 2.544960826210826, + "grad_norm": 0.6616761088371277, + "learning_rate": 2.2285506776563382e-05, + "loss": 0.6248, + "step": 114340 + }, + { + "epoch": 2.5451834045584043, + "grad_norm": 0.6189643144607544, + "learning_rate": 2.2264124294993493e-05, + "loss": 0.4891, + "step": 114350 + }, + { + "epoch": 2.545405982905983, + "grad_norm": 0.4615582227706909, + "learning_rate": 2.22427514717366e-05, + "loss": 0.4754, + "step": 114360 + }, + { + "epoch": 2.545628561253561, + "grad_norm": 0.44101470708847046, + "learning_rate": 2.2221388307954106e-05, + "loss": 0.4633, + "step": 114370 + }, + { + "epoch": 2.5458511396011394, + "grad_norm": 0.4261623024940491, + "learning_rate": 2.2200034804806902e-05, + "loss": 0.36, + "step": 114380 + }, + { + "epoch": 2.546073717948718, + "grad_norm": 0.910829484462738, + "learning_rate": 2.217869096345535e-05, + "loss": 0.4595, + "step": 114390 + }, + { + "epoch": 2.5462962962962963, + "grad_norm": 0.6673019528388977, + "learning_rate": 2.2157356785059347e-05, + "loss": 0.4857, + "step": 114400 + }, + { + "epoch": 2.5465188746438745, + "grad_norm": 0.7158443331718445, + "learning_rate": 2.213603227077814e-05, + "loss": 0.6056, + "step": 114410 + }, + { + "epoch": 2.546741452991453, + "grad_norm": 0.661539614200592, + "learning_rate": 2.2114717421770535e-05, + "loss": 0.5789, + "step": 114420 + }, + { + "epoch": 2.5469640313390314, + "grad_norm": 0.5644783973693848, + "learning_rate": 2.209341223919481e-05, + "loss": 0.4612, + "step": 114430 + }, + { + "epoch": 2.5471866096866096, + "grad_norm": 0.4332530200481415, + "learning_rate": 2.2072116724208747e-05, + "loss": 0.3601, + "step": 114440 + }, + { + "epoch": 2.5474091880341883, + "grad_norm": 0.47315365076065063, + "learning_rate": 2.2050830877969485e-05, + "loss": 0.4737, + "step": 114450 + }, + { + "epoch": 2.5476317663817665, + "grad_norm": 0.6797741651535034, + "learning_rate": 2.2029554701633725e-05, + "loss": 0.5661, + "step": 114460 + }, + { + "epoch": 2.5478543447293447, + "grad_norm": 0.6491184830665588, + "learning_rate": 2.200828819635772e-05, + "loss": 0.4086, + "step": 114470 + }, + { + "epoch": 2.5480769230769234, + "grad_norm": 0.5176466703414917, + "learning_rate": 2.1987031363297005e-05, + "loss": 0.406, + "step": 114480 + }, + { + "epoch": 2.5482995014245016, + "grad_norm": 0.6204827427864075, + "learning_rate": 2.1965784203606732e-05, + "loss": 0.4653, + "step": 114490 + }, + { + "epoch": 2.54852207977208, + "grad_norm": 0.6510611772537231, + "learning_rate": 2.1944546718441483e-05, + "loss": 0.511, + "step": 114500 + }, + { + "epoch": 2.548744658119658, + "grad_norm": 0.7228637337684631, + "learning_rate": 2.1923318908955338e-05, + "loss": 0.4948, + "step": 114510 + }, + { + "epoch": 2.548967236467236, + "grad_norm": 0.4623255729675293, + "learning_rate": 2.1902100776301815e-05, + "loss": 0.4478, + "step": 114520 + }, + { + "epoch": 2.549189814814815, + "grad_norm": 0.5250282883644104, + "learning_rate": 2.188089232163393e-05, + "loss": 0.3574, + "step": 114530 + }, + { + "epoch": 2.549412393162393, + "grad_norm": 0.5935671925544739, + "learning_rate": 2.185969354610422e-05, + "loss": 0.4522, + "step": 114540 + }, + { + "epoch": 2.5496349715099713, + "grad_norm": 0.4981185495853424, + "learning_rate": 2.183850445086455e-05, + "loss": 0.3517, + "step": 114550 + }, + { + "epoch": 2.54985754985755, + "grad_norm": 0.5429505109786987, + "learning_rate": 2.1817325037066393e-05, + "loss": 0.4767, + "step": 114560 + }, + { + "epoch": 2.550080128205128, + "grad_norm": 0.6192172765731812, + "learning_rate": 2.179615530586072e-05, + "loss": 0.5638, + "step": 114570 + }, + { + "epoch": 2.5503027065527064, + "grad_norm": 0.4617951512336731, + "learning_rate": 2.177499525839779e-05, + "loss": 0.4171, + "step": 114580 + }, + { + "epoch": 2.550525284900285, + "grad_norm": 0.5843008756637573, + "learning_rate": 2.1753844895827546e-05, + "loss": 0.5141, + "step": 114590 + }, + { + "epoch": 2.5507478632478633, + "grad_norm": 0.41912171244621277, + "learning_rate": 2.173270421929927e-05, + "loss": 0.4071, + "step": 114600 + }, + { + "epoch": 2.5509704415954415, + "grad_norm": 0.5310803055763245, + "learning_rate": 2.1711573229961822e-05, + "loss": 0.4288, + "step": 114610 + }, + { + "epoch": 2.55119301994302, + "grad_norm": 0.6280931234359741, + "learning_rate": 2.1690451928963396e-05, + "loss": 0.5028, + "step": 114620 + }, + { + "epoch": 2.5514155982905984, + "grad_norm": 0.42037948966026306, + "learning_rate": 2.1669340317451803e-05, + "loss": 0.4915, + "step": 114630 + }, + { + "epoch": 2.5516381766381766, + "grad_norm": 0.7464925646781921, + "learning_rate": 2.1648238396574237e-05, + "loss": 0.4069, + "step": 114640 + }, + { + "epoch": 2.551860754985755, + "grad_norm": 0.679133951663971, + "learning_rate": 2.1627146167477385e-05, + "loss": 0.4133, + "step": 114650 + }, + { + "epoch": 2.5520833333333335, + "grad_norm": 0.5979849100112915, + "learning_rate": 2.1606063631307437e-05, + "loss": 0.4781, + "step": 114660 + }, + { + "epoch": 2.5523059116809117, + "grad_norm": 0.5203874111175537, + "learning_rate": 2.15849907892101e-05, + "loss": 0.3303, + "step": 114670 + }, + { + "epoch": 2.55252849002849, + "grad_norm": 0.43817025423049927, + "learning_rate": 2.1563927642330352e-05, + "loss": 0.4806, + "step": 114680 + }, + { + "epoch": 2.552751068376068, + "grad_norm": 0.4426831603050232, + "learning_rate": 2.1542874191812866e-05, + "loss": 0.5032, + "step": 114690 + }, + { + "epoch": 2.552973646723647, + "grad_norm": 0.42767632007598877, + "learning_rate": 2.1521830438801715e-05, + "loss": 0.44, + "step": 114700 + }, + { + "epoch": 2.553196225071225, + "grad_norm": 0.3472227454185486, + "learning_rate": 2.150079638444038e-05, + "loss": 0.4662, + "step": 114710 + }, + { + "epoch": 2.5534188034188032, + "grad_norm": 0.464819073677063, + "learning_rate": 2.147977202987188e-05, + "loss": 0.4489, + "step": 114720 + }, + { + "epoch": 2.553641381766382, + "grad_norm": 0.6429574489593506, + "learning_rate": 2.1458757376238724e-05, + "loss": 0.42, + "step": 114730 + }, + { + "epoch": 2.55386396011396, + "grad_norm": 0.598283588886261, + "learning_rate": 2.1437752424682843e-05, + "loss": 0.5584, + "step": 114740 + }, + { + "epoch": 2.5540865384615383, + "grad_norm": 0.6327041983604431, + "learning_rate": 2.1416757176345724e-05, + "loss": 0.353, + "step": 114750 + }, + { + "epoch": 2.554309116809117, + "grad_norm": 0.5017842650413513, + "learning_rate": 2.1395771632368168e-05, + "loss": 0.4499, + "step": 114760 + }, + { + "epoch": 2.554531695156695, + "grad_norm": 0.8473617434501648, + "learning_rate": 2.1374795793890612e-05, + "loss": 0.5589, + "step": 114770 + }, + { + "epoch": 2.5547542735042734, + "grad_norm": 0.3383786678314209, + "learning_rate": 2.135382966205286e-05, + "loss": 0.5012, + "step": 114780 + }, + { + "epoch": 2.554976851851852, + "grad_norm": 0.48342782258987427, + "learning_rate": 2.1332873237994245e-05, + "loss": 0.3938, + "step": 114790 + }, + { + "epoch": 2.5551994301994303, + "grad_norm": 0.4264233410358429, + "learning_rate": 2.1311926522853587e-05, + "loss": 0.519, + "step": 114800 + }, + { + "epoch": 2.5554220085470085, + "grad_norm": 0.41103747487068176, + "learning_rate": 2.1290989517769133e-05, + "loss": 0.3774, + "step": 114810 + }, + { + "epoch": 2.5556445868945867, + "grad_norm": 0.5277079343795776, + "learning_rate": 2.1270062223878595e-05, + "loss": 0.4255, + "step": 114820 + }, + { + "epoch": 2.5558671652421654, + "grad_norm": 0.894993245601654, + "learning_rate": 2.124914464231922e-05, + "loss": 0.4757, + "step": 114830 + }, + { + "epoch": 2.5560897435897436, + "grad_norm": 0.5365956425666809, + "learning_rate": 2.1228236774227605e-05, + "loss": 0.497, + "step": 114840 + }, + { + "epoch": 2.556312321937322, + "grad_norm": 0.5834557414054871, + "learning_rate": 2.120733862073998e-05, + "loss": 0.4723, + "step": 114850 + }, + { + "epoch": 2.5565349002849, + "grad_norm": 0.7872776985168457, + "learning_rate": 2.1186450182991925e-05, + "loss": 0.5038, + "step": 114860 + }, + { + "epoch": 2.5567574786324787, + "grad_norm": 0.5317719578742981, + "learning_rate": 2.116557146211855e-05, + "loss": 0.4716, + "step": 114870 + }, + { + "epoch": 2.556980056980057, + "grad_norm": 0.6921097636222839, + "learning_rate": 2.1144702459254416e-05, + "loss": 0.5058, + "step": 114880 + }, + { + "epoch": 2.557202635327635, + "grad_norm": 0.5789677500724792, + "learning_rate": 2.112384317553362e-05, + "loss": 0.3581, + "step": 114890 + }, + { + "epoch": 2.557425213675214, + "grad_norm": 0.7021106481552124, + "learning_rate": 2.1102993612089584e-05, + "loss": 0.3649, + "step": 114900 + }, + { + "epoch": 2.557647792022792, + "grad_norm": 0.5997939705848694, + "learning_rate": 2.1082153770055312e-05, + "loss": 0.4336, + "step": 114910 + }, + { + "epoch": 2.5578703703703702, + "grad_norm": 0.663595974445343, + "learning_rate": 2.106132365056328e-05, + "loss": 0.4295, + "step": 114920 + }, + { + "epoch": 2.558092948717949, + "grad_norm": 0.7464686036109924, + "learning_rate": 2.1040503254745404e-05, + "loss": 0.4655, + "step": 114930 + }, + { + "epoch": 2.558315527065527, + "grad_norm": 0.5191732048988342, + "learning_rate": 2.1019692583733087e-05, + "loss": 0.4615, + "step": 114940 + }, + { + "epoch": 2.5585381054131053, + "grad_norm": 0.5464897155761719, + "learning_rate": 2.0998891638657247e-05, + "loss": 0.3702, + "step": 114950 + }, + { + "epoch": 2.558760683760684, + "grad_norm": 0.4492281973361969, + "learning_rate": 2.0978100420648117e-05, + "loss": 0.4609, + "step": 114960 + }, + { + "epoch": 2.558983262108262, + "grad_norm": 0.47617268562316895, + "learning_rate": 2.095731893083561e-05, + "loss": 0.3809, + "step": 114970 + }, + { + "epoch": 2.5592058404558404, + "grad_norm": 0.5587918758392334, + "learning_rate": 2.0936547170348917e-05, + "loss": 0.4479, + "step": 114980 + }, + { + "epoch": 2.5594284188034186, + "grad_norm": 0.5930055379867554, + "learning_rate": 2.0915785140316845e-05, + "loss": 0.5025, + "step": 114990 + }, + { + "epoch": 2.5596509971509973, + "grad_norm": 0.5976328253746033, + "learning_rate": 2.08950328418676e-05, + "loss": 0.4871, + "step": 115000 + }, + { + "epoch": 2.5598735754985755, + "grad_norm": 0.8843815326690674, + "learning_rate": 2.0874290276128906e-05, + "loss": 0.5638, + "step": 115010 + }, + { + "epoch": 2.5600961538461537, + "grad_norm": 0.5686395764350891, + "learning_rate": 2.0853557444227922e-05, + "loss": 0.4578, + "step": 115020 + }, + { + "epoch": 2.560318732193732, + "grad_norm": 0.4919164180755615, + "learning_rate": 2.083283434729131e-05, + "loss": 0.3756, + "step": 115030 + }, + { + "epoch": 2.5605413105413106, + "grad_norm": 0.5874354839324951, + "learning_rate": 2.0812120986445116e-05, + "loss": 0.4839, + "step": 115040 + }, + { + "epoch": 2.560763888888889, + "grad_norm": 0.6515677571296692, + "learning_rate": 2.079141736281498e-05, + "loss": 0.4587, + "step": 115050 + }, + { + "epoch": 2.560986467236467, + "grad_norm": 0.5897663831710815, + "learning_rate": 2.077072347752591e-05, + "loss": 0.387, + "step": 115060 + }, + { + "epoch": 2.5612090455840457, + "grad_norm": 0.4548929035663605, + "learning_rate": 2.0750039331702452e-05, + "loss": 0.4837, + "step": 115070 + }, + { + "epoch": 2.561431623931624, + "grad_norm": 0.7838677763938904, + "learning_rate": 2.0729364926468598e-05, + "loss": 0.4296, + "step": 115080 + }, + { + "epoch": 2.561654202279202, + "grad_norm": 0.34927403926849365, + "learning_rate": 2.0708700262947843e-05, + "loss": 0.4602, + "step": 115090 + }, + { + "epoch": 2.561876780626781, + "grad_norm": 0.5421298742294312, + "learning_rate": 2.0688045342263075e-05, + "loss": 0.437, + "step": 115100 + }, + { + "epoch": 2.562099358974359, + "grad_norm": 0.6211506724357605, + "learning_rate": 2.066740016553672e-05, + "loss": 0.4838, + "step": 115110 + }, + { + "epoch": 2.5623219373219372, + "grad_norm": 0.5304540395736694, + "learning_rate": 2.0646764733890645e-05, + "loss": 0.4963, + "step": 115120 + }, + { + "epoch": 2.562544515669516, + "grad_norm": 0.705946683883667, + "learning_rate": 2.062613904844617e-05, + "loss": 0.4476, + "step": 115130 + }, + { + "epoch": 2.562767094017094, + "grad_norm": 0.5351716876029968, + "learning_rate": 2.0605523110324155e-05, + "loss": 0.4003, + "step": 115140 + }, + { + "epoch": 2.5629896723646723, + "grad_norm": 0.47889044880867004, + "learning_rate": 2.058491692064488e-05, + "loss": 0.5015, + "step": 115150 + }, + { + "epoch": 2.5632122507122506, + "grad_norm": 0.7898067235946655, + "learning_rate": 2.0564320480528076e-05, + "loss": 0.4364, + "step": 115160 + }, + { + "epoch": 2.5634348290598292, + "grad_norm": 0.4348805248737335, + "learning_rate": 2.0543733791093046e-05, + "loss": 0.4275, + "step": 115170 + }, + { + "epoch": 2.5636574074074074, + "grad_norm": 0.3964775800704956, + "learning_rate": 2.052315685345838e-05, + "loss": 0.4295, + "step": 115180 + }, + { + "epoch": 2.5638799857549857, + "grad_norm": 0.6023658514022827, + "learning_rate": 2.05025896687423e-05, + "loss": 0.525, + "step": 115190 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 0.7158157825469971, + "learning_rate": 2.0482032238062464e-05, + "loss": 0.469, + "step": 115200 + }, + { + "epoch": 2.5643251424501425, + "grad_norm": 0.5601397752761841, + "learning_rate": 2.046148456253594e-05, + "loss": 0.5173, + "step": 115210 + }, + { + "epoch": 2.5645477207977208, + "grad_norm": 0.36717069149017334, + "learning_rate": 2.0440946643279313e-05, + "loss": 0.3301, + "step": 115220 + }, + { + "epoch": 2.564770299145299, + "grad_norm": 0.35347598791122437, + "learning_rate": 2.0420418481408676e-05, + "loss": 0.3723, + "step": 115230 + }, + { + "epoch": 2.5649928774928776, + "grad_norm": 0.30606773495674133, + "learning_rate": 2.039990007803949e-05, + "loss": 0.4165, + "step": 115240 + }, + { + "epoch": 2.565215455840456, + "grad_norm": 0.5062880516052246, + "learning_rate": 2.0379391434286778e-05, + "loss": 0.4515, + "step": 115250 + }, + { + "epoch": 2.565438034188034, + "grad_norm": 0.6843504309654236, + "learning_rate": 2.0358892551264953e-05, + "loss": 0.3867, + "step": 115260 + }, + { + "epoch": 2.5656606125356127, + "grad_norm": 0.5426856875419617, + "learning_rate": 2.0338403430087972e-05, + "loss": 0.3925, + "step": 115270 + }, + { + "epoch": 2.565883190883191, + "grad_norm": 0.5436319708824158, + "learning_rate": 2.031792407186921e-05, + "loss": 0.5195, + "step": 115280 + }, + { + "epoch": 2.566105769230769, + "grad_norm": 0.38191789388656616, + "learning_rate": 2.029745447772158e-05, + "loss": 0.4129, + "step": 115290 + }, + { + "epoch": 2.566328347578348, + "grad_norm": 0.48173078894615173, + "learning_rate": 2.0276994648757364e-05, + "loss": 0.3255, + "step": 115300 + }, + { + "epoch": 2.566550925925926, + "grad_norm": 0.6359217166900635, + "learning_rate": 2.025654458608841e-05, + "loss": 0.5472, + "step": 115310 + }, + { + "epoch": 2.5667735042735043, + "grad_norm": 0.5844587087631226, + "learning_rate": 2.0236104290825962e-05, + "loss": 0.4527, + "step": 115320 + }, + { + "epoch": 2.5669960826210825, + "grad_norm": 0.5271782875061035, + "learning_rate": 2.0215673764080734e-05, + "loss": 0.4227, + "step": 115330 + }, + { + "epoch": 2.5672186609686607, + "grad_norm": 0.7398836016654968, + "learning_rate": 2.019525300696301e-05, + "loss": 0.5061, + "step": 115340 + }, + { + "epoch": 2.5674412393162394, + "grad_norm": 0.44776272773742676, + "learning_rate": 2.0174842020582396e-05, + "loss": 0.4443, + "step": 115350 + }, + { + "epoch": 2.5676638176638176, + "grad_norm": 0.6261894702911377, + "learning_rate": 2.0154440806048137e-05, + "loss": 0.4071, + "step": 115360 + }, + { + "epoch": 2.567886396011396, + "grad_norm": 0.3250281810760498, + "learning_rate": 2.0134049364468765e-05, + "loss": 0.376, + "step": 115370 + }, + { + "epoch": 2.5681089743589745, + "grad_norm": 0.46543797850608826, + "learning_rate": 2.0113667696952377e-05, + "loss": 0.4741, + "step": 115380 + }, + { + "epoch": 2.5683315527065527, + "grad_norm": 0.4046928286552429, + "learning_rate": 2.0093295804606594e-05, + "loss": 0.4669, + "step": 115390 + }, + { + "epoch": 2.568554131054131, + "grad_norm": 0.6142890453338623, + "learning_rate": 2.0072933688538354e-05, + "loss": 0.3958, + "step": 115400 + }, + { + "epoch": 2.5687767094017095, + "grad_norm": 0.478249728679657, + "learning_rate": 2.0052581349854192e-05, + "loss": 0.558, + "step": 115410 + }, + { + "epoch": 2.5689992877492878, + "grad_norm": 0.5002285242080688, + "learning_rate": 2.0032238789660074e-05, + "loss": 0.3963, + "step": 115420 + }, + { + "epoch": 2.569221866096866, + "grad_norm": 0.5920494794845581, + "learning_rate": 2.001190600906142e-05, + "loss": 0.3864, + "step": 115430 + }, + { + "epoch": 2.5694444444444446, + "grad_norm": 0.5024237036705017, + "learning_rate": 1.9991583009163152e-05, + "loss": 0.5187, + "step": 115440 + }, + { + "epoch": 2.569667022792023, + "grad_norm": 0.46773669123649597, + "learning_rate": 1.9971269791069648e-05, + "loss": 0.3918, + "step": 115450 + }, + { + "epoch": 2.569889601139601, + "grad_norm": 0.5565283298492432, + "learning_rate": 1.9950966355884694e-05, + "loss": 0.4201, + "step": 115460 + }, + { + "epoch": 2.5701121794871797, + "grad_norm": 0.5695311427116394, + "learning_rate": 1.9930672704711628e-05, + "loss": 0.4914, + "step": 115470 + }, + { + "epoch": 2.570334757834758, + "grad_norm": 0.7093086242675781, + "learning_rate": 1.9910388838653215e-05, + "loss": 0.495, + "step": 115480 + }, + { + "epoch": 2.570557336182336, + "grad_norm": 0.5606511831283569, + "learning_rate": 1.9890114758811728e-05, + "loss": 0.3263, + "step": 115490 + }, + { + "epoch": 2.5707799145299144, + "grad_norm": 0.5101292729377747, + "learning_rate": 1.986985046628882e-05, + "loss": 0.4229, + "step": 115500 + }, + { + "epoch": 2.5710024928774926, + "grad_norm": 0.7715269327163696, + "learning_rate": 1.9849595962185698e-05, + "loss": 0.4277, + "step": 115510 + }, + { + "epoch": 2.5712250712250713, + "grad_norm": 0.469260573387146, + "learning_rate": 1.9829351247603058e-05, + "loss": 0.4764, + "step": 115520 + }, + { + "epoch": 2.5714476495726495, + "grad_norm": 0.5756816267967224, + "learning_rate": 1.9809116323640908e-05, + "loss": 0.3906, + "step": 115530 + }, + { + "epoch": 2.5716702279202277, + "grad_norm": 0.5862997174263, + "learning_rate": 1.9788891191398907e-05, + "loss": 0.5252, + "step": 115540 + }, + { + "epoch": 2.5718928062678064, + "grad_norm": 0.6412315964698792, + "learning_rate": 1.9768675851976083e-05, + "loss": 0.5215, + "step": 115550 + }, + { + "epoch": 2.5721153846153846, + "grad_norm": 0.6483317017555237, + "learning_rate": 1.9748470306470934e-05, + "loss": 0.4839, + "step": 115560 + }, + { + "epoch": 2.572337962962963, + "grad_norm": 0.4820649325847626, + "learning_rate": 1.9728274555981498e-05, + "loss": 0.4263, + "step": 115570 + }, + { + "epoch": 2.5725605413105415, + "grad_norm": 0.557349681854248, + "learning_rate": 1.970808860160518e-05, + "loss": 0.4429, + "step": 115580 + }, + { + "epoch": 2.5727831196581197, + "grad_norm": 0.5963757038116455, + "learning_rate": 1.968791244443897e-05, + "loss": 0.5557, + "step": 115590 + }, + { + "epoch": 2.573005698005698, + "grad_norm": 0.6934390664100647, + "learning_rate": 1.9667746085579175e-05, + "loss": 0.58, + "step": 115600 + }, + { + "epoch": 2.5732282763532766, + "grad_norm": 0.8542941808700562, + "learning_rate": 1.964758952612167e-05, + "loss": 0.4716, + "step": 115610 + }, + { + "epoch": 2.5734508547008548, + "grad_norm": 0.42587122321128845, + "learning_rate": 1.9627442767161843e-05, + "loss": 0.45, + "step": 115620 + }, + { + "epoch": 2.573673433048433, + "grad_norm": 0.704937219619751, + "learning_rate": 1.9607305809794395e-05, + "loss": 0.5379, + "step": 115630 + }, + { + "epoch": 2.5738960113960117, + "grad_norm": 0.6337405443191528, + "learning_rate": 1.9587178655113636e-05, + "loss": 0.4186, + "step": 115640 + }, + { + "epoch": 2.57411858974359, + "grad_norm": 0.6829156279563904, + "learning_rate": 1.9567061304213263e-05, + "loss": 0.4903, + "step": 115650 + }, + { + "epoch": 2.574341168091168, + "grad_norm": 0.597281277179718, + "learning_rate": 1.9546953758186536e-05, + "loss": 0.3835, + "step": 115660 + }, + { + "epoch": 2.5745637464387463, + "grad_norm": 0.32157281041145325, + "learning_rate": 1.9526856018126028e-05, + "loss": 0.3429, + "step": 115670 + }, + { + "epoch": 2.5747863247863245, + "grad_norm": 0.49477922916412354, + "learning_rate": 1.9506768085123904e-05, + "loss": 0.5195, + "step": 115680 + }, + { + "epoch": 2.575008903133903, + "grad_norm": 0.6303307414054871, + "learning_rate": 1.9486689960271763e-05, + "loss": 0.46, + "step": 115690 + }, + { + "epoch": 2.5752314814814814, + "grad_norm": 0.59251868724823, + "learning_rate": 1.9466621644660664e-05, + "loss": 0.5057, + "step": 115700 + }, + { + "epoch": 2.5754540598290596, + "grad_norm": 0.38228195905685425, + "learning_rate": 1.9446563139381135e-05, + "loss": 0.4508, + "step": 115710 + }, + { + "epoch": 2.5756766381766383, + "grad_norm": 0.7206634879112244, + "learning_rate": 1.9426514445523168e-05, + "loss": 0.5761, + "step": 115720 + }, + { + "epoch": 2.5758992165242165, + "grad_norm": 0.5861138701438904, + "learning_rate": 1.9406475564176276e-05, + "loss": 0.4115, + "step": 115730 + }, + { + "epoch": 2.5761217948717947, + "grad_norm": 0.6063085198402405, + "learning_rate": 1.938644649642931e-05, + "loss": 0.4398, + "step": 115740 + }, + { + "epoch": 2.5763443732193734, + "grad_norm": 0.3896350860595703, + "learning_rate": 1.936642724337072e-05, + "loss": 0.4346, + "step": 115750 + }, + { + "epoch": 2.5765669515669516, + "grad_norm": 0.3690728545188904, + "learning_rate": 1.9346417806088325e-05, + "loss": 0.421, + "step": 115760 + }, + { + "epoch": 2.57678952991453, + "grad_norm": 0.3766835331916809, + "learning_rate": 1.932641818566947e-05, + "loss": 0.3846, + "step": 115770 + }, + { + "epoch": 2.5770121082621085, + "grad_norm": 0.6418868899345398, + "learning_rate": 1.930642838320096e-05, + "loss": 0.5609, + "step": 115780 + }, + { + "epoch": 2.5772346866096867, + "grad_norm": 0.4302945137023926, + "learning_rate": 1.928644839976905e-05, + "loss": 0.4284, + "step": 115790 + }, + { + "epoch": 2.577457264957265, + "grad_norm": 0.4001218378543854, + "learning_rate": 1.9266478236459506e-05, + "loss": 0.4796, + "step": 115800 + }, + { + "epoch": 2.5776798433048436, + "grad_norm": 0.4011891782283783, + "learning_rate": 1.9246517894357474e-05, + "loss": 0.5099, + "step": 115810 + }, + { + "epoch": 2.577902421652422, + "grad_norm": 0.6663089394569397, + "learning_rate": 1.922656737454762e-05, + "loss": 0.4387, + "step": 115820 + }, + { + "epoch": 2.578125, + "grad_norm": 0.534116804599762, + "learning_rate": 1.920662667811408e-05, + "loss": 0.406, + "step": 115830 + }, + { + "epoch": 2.578347578347578, + "grad_norm": 0.40283769369125366, + "learning_rate": 1.9186695806140433e-05, + "loss": 0.474, + "step": 115840 + }, + { + "epoch": 2.5785701566951564, + "grad_norm": 0.8041699528694153, + "learning_rate": 1.9166774759709783e-05, + "loss": 0.4056, + "step": 115850 + }, + { + "epoch": 2.578792735042735, + "grad_norm": 0.6072338819503784, + "learning_rate": 1.9146863539904625e-05, + "loss": 0.4541, + "step": 115860 + }, + { + "epoch": 2.5790153133903133, + "grad_norm": 0.41169029474258423, + "learning_rate": 1.9126962147806983e-05, + "loss": 0.3805, + "step": 115870 + }, + { + "epoch": 2.5792378917378915, + "grad_norm": 0.5600550770759583, + "learning_rate": 1.9107070584498297e-05, + "loss": 0.4154, + "step": 115880 + }, + { + "epoch": 2.57946047008547, + "grad_norm": 0.4978649914264679, + "learning_rate": 1.9087188851059423e-05, + "loss": 0.4275, + "step": 115890 + }, + { + "epoch": 2.5796830484330484, + "grad_norm": 0.42050987482070923, + "learning_rate": 1.9067316948570825e-05, + "loss": 0.3358, + "step": 115900 + }, + { + "epoch": 2.5799056267806266, + "grad_norm": 0.46916189789772034, + "learning_rate": 1.904745487811235e-05, + "loss": 0.3698, + "step": 115910 + }, + { + "epoch": 2.5801282051282053, + "grad_norm": 0.4058810770511627, + "learning_rate": 1.902760264076331e-05, + "loss": 0.4384, + "step": 115920 + }, + { + "epoch": 2.580306267806268, + "eval_loss": 0.5236759781837463, + "eval_runtime": 337.2759, + "eval_samples_per_second": 7.012, + "eval_steps_per_second": 7.012, + "step": 115928 + }, + { + "epoch": 2.5803507834757835, + "grad_norm": 0.7314088940620422, + "learning_rate": 1.900776023760249e-05, + "loss": 0.4296, + "step": 115930 + }, + { + "epoch": 2.5805733618233617, + "grad_norm": 0.8279690742492676, + "learning_rate": 1.898792766970816e-05, + "loss": 0.3954, + "step": 115940 + }, + { + "epoch": 2.5807959401709404, + "grad_norm": 0.5086784958839417, + "learning_rate": 1.8968104938158015e-05, + "loss": 0.4982, + "step": 115950 + }, + { + "epoch": 2.5810185185185186, + "grad_norm": 0.5648263692855835, + "learning_rate": 1.8948292044029238e-05, + "loss": 0.4381, + "step": 115960 + }, + { + "epoch": 2.581241096866097, + "grad_norm": 0.7056523561477661, + "learning_rate": 1.8928488988398495e-05, + "loss": 0.4564, + "step": 115970 + }, + { + "epoch": 2.5814636752136755, + "grad_norm": 0.46383073925971985, + "learning_rate": 1.8908695772341887e-05, + "loss": 0.5456, + "step": 115980 + }, + { + "epoch": 2.5816862535612537, + "grad_norm": 0.5759848356246948, + "learning_rate": 1.8888912396935e-05, + "loss": 0.4889, + "step": 115990 + }, + { + "epoch": 2.581908831908832, + "grad_norm": 0.5033251047134399, + "learning_rate": 1.886913886325288e-05, + "loss": 0.5239, + "step": 116000 + }, + { + "epoch": 2.58213141025641, + "grad_norm": 0.5119370818138123, + "learning_rate": 1.8849375172370064e-05, + "loss": 0.3714, + "step": 116010 + }, + { + "epoch": 2.5823539886039883, + "grad_norm": 0.44290006160736084, + "learning_rate": 1.88296213253605e-05, + "loss": 0.516, + "step": 116020 + }, + { + "epoch": 2.582576566951567, + "grad_norm": 0.4895089566707611, + "learning_rate": 1.8809877323297576e-05, + "loss": 0.4879, + "step": 116030 + }, + { + "epoch": 2.5827991452991452, + "grad_norm": 0.7203282713890076, + "learning_rate": 1.879014316725427e-05, + "loss": 0.6404, + "step": 116040 + }, + { + "epoch": 2.5830217236467234, + "grad_norm": 0.3906666934490204, + "learning_rate": 1.877041885830293e-05, + "loss": 0.47, + "step": 116050 + }, + { + "epoch": 2.583244301994302, + "grad_norm": 0.6866021156311035, + "learning_rate": 1.875070439751536e-05, + "loss": 0.5185, + "step": 116060 + }, + { + "epoch": 2.5834668803418803, + "grad_norm": 0.7713746428489685, + "learning_rate": 1.8730999785962934e-05, + "loss": 0.4818, + "step": 116070 + }, + { + "epoch": 2.5836894586894585, + "grad_norm": 0.5962246060371399, + "learning_rate": 1.8711305024716386e-05, + "loss": 0.4263, + "step": 116080 + }, + { + "epoch": 2.583912037037037, + "grad_norm": 0.6574836373329163, + "learning_rate": 1.8691620114845888e-05, + "loss": 0.3811, + "step": 116090 + }, + { + "epoch": 2.5841346153846154, + "grad_norm": 0.6990258097648621, + "learning_rate": 1.8671945057421203e-05, + "loss": 0.4852, + "step": 116100 + }, + { + "epoch": 2.5843571937321936, + "grad_norm": 0.4760742485523224, + "learning_rate": 1.8652279853511435e-05, + "loss": 0.4472, + "step": 116110 + }, + { + "epoch": 2.5845797720797723, + "grad_norm": 0.7832134366035461, + "learning_rate": 1.863262450418526e-05, + "loss": 0.4873, + "step": 116120 + }, + { + "epoch": 2.5848023504273505, + "grad_norm": 0.45979663729667664, + "learning_rate": 1.8612979010510732e-05, + "loss": 0.3146, + "step": 116130 + }, + { + "epoch": 2.5850249287749287, + "grad_norm": 0.49060124158859253, + "learning_rate": 1.8593343373555426e-05, + "loss": 0.3877, + "step": 116140 + }, + { + "epoch": 2.5852475071225074, + "grad_norm": 0.680504560470581, + "learning_rate": 1.85737175943864e-05, + "loss": 0.4624, + "step": 116150 + }, + { + "epoch": 2.5854700854700856, + "grad_norm": 0.5428845286369324, + "learning_rate": 1.8554101674070058e-05, + "loss": 0.5332, + "step": 116160 + }, + { + "epoch": 2.585692663817664, + "grad_norm": 0.5724543929100037, + "learning_rate": 1.853449561367233e-05, + "loss": 0.4961, + "step": 116170 + }, + { + "epoch": 2.585915242165242, + "grad_norm": 0.6779829263687134, + "learning_rate": 1.851489941425868e-05, + "loss": 0.4369, + "step": 116180 + }, + { + "epoch": 2.5861378205128203, + "grad_norm": 0.4919106066226959, + "learning_rate": 1.849531307689394e-05, + "loss": 0.5544, + "step": 116190 + }, + { + "epoch": 2.586360398860399, + "grad_norm": 0.7075668573379517, + "learning_rate": 1.84757366026425e-05, + "loss": 0.5148, + "step": 116200 + }, + { + "epoch": 2.586582977207977, + "grad_norm": 0.4392474293708801, + "learning_rate": 1.845616999256814e-05, + "loss": 0.4088, + "step": 116210 + }, + { + "epoch": 2.5868055555555554, + "grad_norm": 0.5524398684501648, + "learning_rate": 1.8436613247734136e-05, + "loss": 0.3931, + "step": 116220 + }, + { + "epoch": 2.587028133903134, + "grad_norm": 0.6485047340393066, + "learning_rate": 1.8417066369203173e-05, + "loss": 0.5104, + "step": 116230 + }, + { + "epoch": 2.5872507122507122, + "grad_norm": 0.39343759417533875, + "learning_rate": 1.839752935803749e-05, + "loss": 0.4568, + "step": 116240 + }, + { + "epoch": 2.5874732905982905, + "grad_norm": 0.5676214694976807, + "learning_rate": 1.8378002215298707e-05, + "loss": 0.3959, + "step": 116250 + }, + { + "epoch": 2.587695868945869, + "grad_norm": 0.5716341733932495, + "learning_rate": 1.835848494204797e-05, + "loss": 0.4073, + "step": 116260 + }, + { + "epoch": 2.5879184472934473, + "grad_norm": 0.4870680868625641, + "learning_rate": 1.833897753934588e-05, + "loss": 0.4599, + "step": 116270 + }, + { + "epoch": 2.5881410256410255, + "grad_norm": 0.5869102478027344, + "learning_rate": 1.8319480008252478e-05, + "loss": 0.4154, + "step": 116280 + }, + { + "epoch": 2.588363603988604, + "grad_norm": 0.5533974170684814, + "learning_rate": 1.8299992349827246e-05, + "loss": 0.441, + "step": 116290 + }, + { + "epoch": 2.5885861823361824, + "grad_norm": 0.735202431678772, + "learning_rate": 1.8280514565129182e-05, + "loss": 0.3809, + "step": 116300 + }, + { + "epoch": 2.5888087606837606, + "grad_norm": 0.6630352735519409, + "learning_rate": 1.8261046655216708e-05, + "loss": 0.3626, + "step": 116310 + }, + { + "epoch": 2.5890313390313393, + "grad_norm": 0.549351692199707, + "learning_rate": 1.824158862114773e-05, + "loss": 0.5333, + "step": 116320 + }, + { + "epoch": 2.5892539173789175, + "grad_norm": 0.6176263093948364, + "learning_rate": 1.822214046397963e-05, + "loss": 0.4987, + "step": 116330 + }, + { + "epoch": 2.5894764957264957, + "grad_norm": 0.48720836639404297, + "learning_rate": 1.8202702184769226e-05, + "loss": 0.4306, + "step": 116340 + }, + { + "epoch": 2.589699074074074, + "grad_norm": 0.7484619617462158, + "learning_rate": 1.8183273784572808e-05, + "loss": 0.4462, + "step": 116350 + }, + { + "epoch": 2.589921652421652, + "grad_norm": 0.5729781985282898, + "learning_rate": 1.816385526444615e-05, + "loss": 0.4253, + "step": 116360 + }, + { + "epoch": 2.590144230769231, + "grad_norm": 0.5132777094841003, + "learning_rate": 1.8144446625444457e-05, + "loss": 0.3839, + "step": 116370 + }, + { + "epoch": 2.590366809116809, + "grad_norm": 0.5946417450904846, + "learning_rate": 1.812504786862237e-05, + "loss": 0.5092, + "step": 116380 + }, + { + "epoch": 2.5905893874643873, + "grad_norm": 0.6767193078994751, + "learning_rate": 1.8105658995034093e-05, + "loss": 0.5002, + "step": 116390 + }, + { + "epoch": 2.590811965811966, + "grad_norm": 0.7960422039031982, + "learning_rate": 1.8086280005733202e-05, + "loss": 0.4476, + "step": 116400 + }, + { + "epoch": 2.591034544159544, + "grad_norm": 0.5478686094284058, + "learning_rate": 1.8066910901772836e-05, + "loss": 0.4791, + "step": 116410 + }, + { + "epoch": 2.5912571225071224, + "grad_norm": 0.5975253582000732, + "learning_rate": 1.80475516842054e-05, + "loss": 0.4985, + "step": 116420 + }, + { + "epoch": 2.591479700854701, + "grad_norm": 0.592153787612915, + "learning_rate": 1.802820235408298e-05, + "loss": 0.5786, + "step": 116430 + }, + { + "epoch": 2.5917022792022792, + "grad_norm": 0.28754061460494995, + "learning_rate": 1.8008862912457046e-05, + "loss": 0.3352, + "step": 116440 + }, + { + "epoch": 2.5919248575498575, + "grad_norm": 0.4851224422454834, + "learning_rate": 1.7989533360378453e-05, + "loss": 0.443, + "step": 116450 + }, + { + "epoch": 2.592147435897436, + "grad_norm": 0.5925357341766357, + "learning_rate": 1.797021369889762e-05, + "loss": 0.5076, + "step": 116460 + }, + { + "epoch": 2.5923700142450143, + "grad_norm": 0.5017402172088623, + "learning_rate": 1.7950903929064378e-05, + "loss": 0.4064, + "step": 116470 + }, + { + "epoch": 2.5925925925925926, + "grad_norm": 0.5168089270591736, + "learning_rate": 1.7931604051928063e-05, + "loss": 0.4107, + "step": 116480 + }, + { + "epoch": 2.5928151709401708, + "grad_norm": 0.6962186098098755, + "learning_rate": 1.7912314068537416e-05, + "loss": 0.4251, + "step": 116490 + }, + { + "epoch": 2.5930377492877494, + "grad_norm": 0.3982653021812439, + "learning_rate": 1.789303397994073e-05, + "loss": 0.4421, + "step": 116500 + }, + { + "epoch": 2.5932603276353277, + "grad_norm": 0.4287591874599457, + "learning_rate": 1.7873763787185614e-05, + "loss": 0.4046, + "step": 116510 + }, + { + "epoch": 2.593482905982906, + "grad_norm": 0.4040102958679199, + "learning_rate": 1.7854503491319298e-05, + "loss": 0.5503, + "step": 116520 + }, + { + "epoch": 2.593705484330484, + "grad_norm": 0.46851760149002075, + "learning_rate": 1.7835253093388337e-05, + "loss": 0.4146, + "step": 116530 + }, + { + "epoch": 2.5939280626780628, + "grad_norm": 0.5532066822052002, + "learning_rate": 1.7816012594438903e-05, + "loss": 0.4898, + "step": 116540 + }, + { + "epoch": 2.594150641025641, + "grad_norm": 0.4478902518749237, + "learning_rate": 1.7796781995516464e-05, + "loss": 0.5938, + "step": 116550 + }, + { + "epoch": 2.594373219373219, + "grad_norm": 0.6147230267524719, + "learning_rate": 1.7777561297666033e-05, + "loss": 0.5559, + "step": 116560 + }, + { + "epoch": 2.594595797720798, + "grad_norm": 0.398787260055542, + "learning_rate": 1.7758350501932086e-05, + "loss": 0.424, + "step": 116570 + }, + { + "epoch": 2.594818376068376, + "grad_norm": 0.6829454898834229, + "learning_rate": 1.7739149609358607e-05, + "loss": 0.487, + "step": 116580 + }, + { + "epoch": 2.5950409544159543, + "grad_norm": 0.849534809589386, + "learning_rate": 1.771995862098892e-05, + "loss": 0.5313, + "step": 116590 + }, + { + "epoch": 2.595263532763533, + "grad_norm": 0.4778141677379608, + "learning_rate": 1.7700777537865897e-05, + "loss": 0.3923, + "step": 116600 + }, + { + "epoch": 2.595486111111111, + "grad_norm": 0.5346719026565552, + "learning_rate": 1.7681606361031866e-05, + "loss": 0.4585, + "step": 116610 + }, + { + "epoch": 2.5957086894586894, + "grad_norm": 0.42185789346694946, + "learning_rate": 1.766244509152859e-05, + "loss": 0.4035, + "step": 116620 + }, + { + "epoch": 2.595931267806268, + "grad_norm": 0.6329679489135742, + "learning_rate": 1.76432937303973e-05, + "loss": 0.4382, + "step": 116630 + }, + { + "epoch": 2.5961538461538463, + "grad_norm": 0.740755558013916, + "learning_rate": 1.7624152278678775e-05, + "loss": 0.5053, + "step": 116640 + }, + { + "epoch": 2.5963764245014245, + "grad_norm": 0.4598102569580078, + "learning_rate": 1.7605020737413057e-05, + "loss": 0.3666, + "step": 116650 + }, + { + "epoch": 2.5965990028490027, + "grad_norm": 0.370423287153244, + "learning_rate": 1.7585899107639837e-05, + "loss": 0.4285, + "step": 116660 + }, + { + "epoch": 2.5968215811965814, + "grad_norm": 0.5081289410591125, + "learning_rate": 1.756678739039823e-05, + "loss": 0.5261, + "step": 116670 + }, + { + "epoch": 2.5970441595441596, + "grad_norm": 0.550074577331543, + "learning_rate": 1.7547685586726726e-05, + "loss": 0.3716, + "step": 116680 + }, + { + "epoch": 2.597266737891738, + "grad_norm": 0.7760112881660461, + "learning_rate": 1.752859369766331e-05, + "loss": 0.4597, + "step": 116690 + }, + { + "epoch": 2.597489316239316, + "grad_norm": 0.5838767886161804, + "learning_rate": 1.750951172424551e-05, + "loss": 0.4031, + "step": 116700 + }, + { + "epoch": 2.5977118945868947, + "grad_norm": 0.5418928861618042, + "learning_rate": 1.749043966751025e-05, + "loss": 0.4342, + "step": 116710 + }, + { + "epoch": 2.597934472934473, + "grad_norm": 0.6233574748039246, + "learning_rate": 1.7471377528493926e-05, + "loss": 0.4812, + "step": 116720 + }, + { + "epoch": 2.598157051282051, + "grad_norm": 0.7421544790267944, + "learning_rate": 1.7452325308232332e-05, + "loss": 0.4897, + "step": 116730 + }, + { + "epoch": 2.5983796296296298, + "grad_norm": 0.4924778938293457, + "learning_rate": 1.7433283007760836e-05, + "loss": 0.2949, + "step": 116740 + }, + { + "epoch": 2.598602207977208, + "grad_norm": 0.6824201345443726, + "learning_rate": 1.7414250628114192e-05, + "loss": 0.5032, + "step": 116750 + }, + { + "epoch": 2.598824786324786, + "grad_norm": 0.49224621057510376, + "learning_rate": 1.739522817032664e-05, + "loss": 0.4255, + "step": 116760 + }, + { + "epoch": 2.599047364672365, + "grad_norm": 0.4551924467086792, + "learning_rate": 1.737621563543188e-05, + "loss": 0.4551, + "step": 116770 + }, + { + "epoch": 2.599269943019943, + "grad_norm": 0.6858125925064087, + "learning_rate": 1.7357213024463093e-05, + "loss": 0.5896, + "step": 116780 + }, + { + "epoch": 2.5994925213675213, + "grad_norm": 0.6564284563064575, + "learning_rate": 1.7338220338452825e-05, + "loss": 0.4219, + "step": 116790 + }, + { + "epoch": 2.5997150997151, + "grad_norm": 0.41564351320266724, + "learning_rate": 1.7319237578433256e-05, + "loss": 0.4131, + "step": 116800 + }, + { + "epoch": 2.599937678062678, + "grad_norm": 0.559563398361206, + "learning_rate": 1.7300264745435824e-05, + "loss": 0.3525, + "step": 116810 + }, + { + "epoch": 2.6001602564102564, + "grad_norm": 0.565727949142456, + "learning_rate": 1.7281301840491572e-05, + "loss": 0.4309, + "step": 116820 + }, + { + "epoch": 2.6003828347578346, + "grad_norm": 0.7862516045570374, + "learning_rate": 1.7262348864630938e-05, + "loss": 0.5738, + "step": 116830 + }, + { + "epoch": 2.6006054131054133, + "grad_norm": 0.583437442779541, + "learning_rate": 1.7243405818883883e-05, + "loss": 0.5902, + "step": 116840 + }, + { + "epoch": 2.6008279914529915, + "grad_norm": 0.5324410796165466, + "learning_rate": 1.7224472704279758e-05, + "loss": 0.3872, + "step": 116850 + }, + { + "epoch": 2.6010505698005697, + "grad_norm": 0.5166898965835571, + "learning_rate": 1.720554952184745e-05, + "loss": 0.4868, + "step": 116860 + }, + { + "epoch": 2.601273148148148, + "grad_norm": 0.4185858964920044, + "learning_rate": 1.7186636272615187e-05, + "loss": 0.4315, + "step": 116870 + }, + { + "epoch": 2.6014957264957266, + "grad_norm": 0.8711520433425903, + "learning_rate": 1.7167732957610784e-05, + "loss": 0.4937, + "step": 116880 + }, + { + "epoch": 2.601718304843305, + "grad_norm": 0.6560893058776855, + "learning_rate": 1.714883957786142e-05, + "loss": 0.4683, + "step": 116890 + }, + { + "epoch": 2.601940883190883, + "grad_norm": 0.38894492387771606, + "learning_rate": 1.7129956134393832e-05, + "loss": 0.3793, + "step": 116900 + }, + { + "epoch": 2.6021634615384617, + "grad_norm": 0.49460241198539734, + "learning_rate": 1.711108262823411e-05, + "loss": 0.5123, + "step": 116910 + }, + { + "epoch": 2.60238603988604, + "grad_norm": 0.6922730207443237, + "learning_rate": 1.709221906040792e-05, + "loss": 0.4265, + "step": 116920 + }, + { + "epoch": 2.602608618233618, + "grad_norm": 0.5051377415657043, + "learning_rate": 1.707336543194027e-05, + "loss": 0.4481, + "step": 116930 + }, + { + "epoch": 2.6028311965811968, + "grad_norm": 0.7844445109367371, + "learning_rate": 1.7054521743855666e-05, + "loss": 0.4749, + "step": 116940 + }, + { + "epoch": 2.603053774928775, + "grad_norm": 0.6359633803367615, + "learning_rate": 1.7035687997178117e-05, + "loss": 0.5296, + "step": 116950 + }, + { + "epoch": 2.603276353276353, + "grad_norm": 0.62961745262146, + "learning_rate": 1.701686419293107e-05, + "loss": 0.5121, + "step": 116960 + }, + { + "epoch": 2.603498931623932, + "grad_norm": 0.7465632557868958, + "learning_rate": 1.6998050332137416e-05, + "loss": 0.469, + "step": 116970 + }, + { + "epoch": 2.60372150997151, + "grad_norm": 0.4284519851207733, + "learning_rate": 1.6979246415819517e-05, + "loss": 0.4509, + "step": 116980 + }, + { + "epoch": 2.6039440883190883, + "grad_norm": 0.6444926261901855, + "learning_rate": 1.6960452444999198e-05, + "loss": 0.421, + "step": 116990 + }, + { + "epoch": 2.6041666666666665, + "grad_norm": 0.54566490650177, + "learning_rate": 1.6941668420697775e-05, + "loss": 0.3855, + "step": 117000 + }, + { + "epoch": 2.604389245014245, + "grad_norm": 0.4093930721282959, + "learning_rate": 1.6922894343935903e-05, + "loss": 0.4589, + "step": 117010 + }, + { + "epoch": 2.6046118233618234, + "grad_norm": 0.5715624690055847, + "learning_rate": 1.6904130215733825e-05, + "loss": 0.3965, + "step": 117020 + }, + { + "epoch": 2.6048344017094016, + "grad_norm": 0.5708182454109192, + "learning_rate": 1.68853760371112e-05, + "loss": 0.4052, + "step": 117030 + }, + { + "epoch": 2.60505698005698, + "grad_norm": 0.528289794921875, + "learning_rate": 1.6866631809087162e-05, + "loss": 0.4247, + "step": 117040 + }, + { + "epoch": 2.6052795584045585, + "grad_norm": 0.4165605902671814, + "learning_rate": 1.6847897532680257e-05, + "loss": 0.3851, + "step": 117050 + }, + { + "epoch": 2.6055021367521367, + "grad_norm": 0.76469886302948, + "learning_rate": 1.682917320890858e-05, + "loss": 0.4445, + "step": 117060 + }, + { + "epoch": 2.605724715099715, + "grad_norm": 0.6084073781967163, + "learning_rate": 1.6810458838789578e-05, + "loss": 0.5383, + "step": 117070 + }, + { + "epoch": 2.6059472934472936, + "grad_norm": 0.5931957960128784, + "learning_rate": 1.6791754423340177e-05, + "loss": 0.482, + "step": 117080 + }, + { + "epoch": 2.606169871794872, + "grad_norm": 0.5516034960746765, + "learning_rate": 1.677305996357681e-05, + "loss": 0.4666, + "step": 117090 + }, + { + "epoch": 2.60639245014245, + "grad_norm": 0.39802029728889465, + "learning_rate": 1.675437546051537e-05, + "loss": 0.3665, + "step": 117100 + }, + { + "epoch": 2.6066150284900287, + "grad_norm": 0.7053261399269104, + "learning_rate": 1.673570091517118e-05, + "loss": 0.4161, + "step": 117110 + }, + { + "epoch": 2.606837606837607, + "grad_norm": 0.5836366415023804, + "learning_rate": 1.671703632855901e-05, + "loss": 0.4877, + "step": 117120 + }, + { + "epoch": 2.607060185185185, + "grad_norm": 0.7553135752677917, + "learning_rate": 1.6698381701693156e-05, + "loss": 0.4377, + "step": 117130 + }, + { + "epoch": 2.607282763532764, + "grad_norm": 0.5414380431175232, + "learning_rate": 1.6679737035587317e-05, + "loss": 0.4461, + "step": 117140 + }, + { + "epoch": 2.607505341880342, + "grad_norm": 0.5662949681282043, + "learning_rate": 1.666110233125462e-05, + "loss": 0.406, + "step": 117150 + }, + { + "epoch": 2.60772792022792, + "grad_norm": 0.7300053834915161, + "learning_rate": 1.6642477589707695e-05, + "loss": 0.4483, + "step": 117160 + }, + { + "epoch": 2.6079504985754984, + "grad_norm": 0.5964715480804443, + "learning_rate": 1.6623862811958646e-05, + "loss": 0.4592, + "step": 117170 + }, + { + "epoch": 2.6081730769230766, + "grad_norm": 0.6311967968940735, + "learning_rate": 1.660525799901902e-05, + "loss": 0.4315, + "step": 117180 + }, + { + "epoch": 2.6083956552706553, + "grad_norm": 0.2812231183052063, + "learning_rate": 1.6586663151899784e-05, + "loss": 0.4067, + "step": 117190 + }, + { + "epoch": 2.6086182336182335, + "grad_norm": 0.5866037011146545, + "learning_rate": 1.6568078271611487e-05, + "loss": 0.5272, + "step": 117200 + }, + { + "epoch": 2.6088408119658117, + "grad_norm": 0.540464460849762, + "learning_rate": 1.6549503359163965e-05, + "loss": 0.4435, + "step": 117210 + }, + { + "epoch": 2.6090633903133904, + "grad_norm": 0.811968982219696, + "learning_rate": 1.6530938415566566e-05, + "loss": 0.5487, + "step": 117220 + }, + { + "epoch": 2.6092859686609686, + "grad_norm": 0.766233503818512, + "learning_rate": 1.6512383441828196e-05, + "loss": 0.6048, + "step": 117230 + }, + { + "epoch": 2.609508547008547, + "grad_norm": 0.9112457633018494, + "learning_rate": 1.6493838438957087e-05, + "loss": 0.5794, + "step": 117240 + }, + { + "epoch": 2.6097311253561255, + "grad_norm": 0.7300744652748108, + "learning_rate": 1.647530340796104e-05, + "loss": 0.4959, + "step": 117250 + }, + { + "epoch": 2.6099537037037037, + "grad_norm": 0.8856310248374939, + "learning_rate": 1.6456778349847245e-05, + "loss": 0.5404, + "step": 117260 + }, + { + "epoch": 2.610176282051282, + "grad_norm": 0.4345667362213135, + "learning_rate": 1.643826326562241e-05, + "loss": 0.4244, + "step": 117270 + }, + { + "epoch": 2.6103988603988606, + "grad_norm": 0.5131345391273499, + "learning_rate": 1.6419758156292575e-05, + "loss": 0.4662, + "step": 117280 + }, + { + "epoch": 2.610621438746439, + "grad_norm": 0.5059243440628052, + "learning_rate": 1.6401263022863378e-05, + "loss": 0.3407, + "step": 117290 + }, + { + "epoch": 2.610844017094017, + "grad_norm": 0.5901126265525818, + "learning_rate": 1.638277786633984e-05, + "loss": 0.5093, + "step": 117300 + }, + { + "epoch": 2.6110665954415957, + "grad_norm": 0.4596213400363922, + "learning_rate": 1.6364302687726464e-05, + "loss": 0.3714, + "step": 117310 + }, + { + "epoch": 2.611289173789174, + "grad_norm": 0.6311203241348267, + "learning_rate": 1.6345837488027228e-05, + "loss": 0.4221, + "step": 117320 + }, + { + "epoch": 2.611511752136752, + "grad_norm": 0.5315381288528442, + "learning_rate": 1.632738226824555e-05, + "loss": 0.405, + "step": 117330 + }, + { + "epoch": 2.6117343304843303, + "grad_norm": 0.5763476490974426, + "learning_rate": 1.6308937029384254e-05, + "loss": 0.3797, + "step": 117340 + }, + { + "epoch": 2.6119569088319086, + "grad_norm": 0.5535160899162292, + "learning_rate": 1.6290501772445732e-05, + "loss": 0.4283, + "step": 117350 + }, + { + "epoch": 2.6121794871794872, + "grad_norm": 0.7270311117172241, + "learning_rate": 1.6272076498431676e-05, + "loss": 0.4913, + "step": 117360 + }, + { + "epoch": 2.6124020655270654, + "grad_norm": 0.7906320095062256, + "learning_rate": 1.6253661208343417e-05, + "loss": 0.4636, + "step": 117370 + }, + { + "epoch": 2.6126246438746437, + "grad_norm": 0.7179734110832214, + "learning_rate": 1.6235255903181623e-05, + "loss": 0.4199, + "step": 117380 + }, + { + "epoch": 2.6128472222222223, + "grad_norm": 0.6319173574447632, + "learning_rate": 1.621686058394647e-05, + "loss": 0.6118, + "step": 117390 + }, + { + "epoch": 2.6130698005698005, + "grad_norm": 0.5031944513320923, + "learning_rate": 1.6198475251637557e-05, + "loss": 0.5795, + "step": 117400 + }, + { + "epoch": 2.6132923789173788, + "grad_norm": 0.7253941893577576, + "learning_rate": 1.6180099907254e-05, + "loss": 0.345, + "step": 117410 + }, + { + "epoch": 2.6135149572649574, + "grad_norm": 0.9181031584739685, + "learning_rate": 1.6161734551794238e-05, + "loss": 0.4828, + "step": 117420 + }, + { + "epoch": 2.6137375356125356, + "grad_norm": 0.4423508644104004, + "learning_rate": 1.6143379186256346e-05, + "loss": 0.5258, + "step": 117430 + }, + { + "epoch": 2.613960113960114, + "grad_norm": 0.43034684658050537, + "learning_rate": 1.6125033811637723e-05, + "loss": 0.4249, + "step": 117440 + }, + { + "epoch": 2.6141826923076925, + "grad_norm": 0.7726715803146362, + "learning_rate": 1.6106698428935307e-05, + "loss": 0.4994, + "step": 117450 + }, + { + "epoch": 2.6144052706552707, + "grad_norm": 0.42630285024642944, + "learning_rate": 1.6088373039145478e-05, + "loss": 0.3627, + "step": 117460 + }, + { + "epoch": 2.614627849002849, + "grad_norm": 0.576209306716919, + "learning_rate": 1.607005764326397e-05, + "loss": 0.3649, + "step": 117470 + }, + { + "epoch": 2.6148504273504276, + "grad_norm": 0.4724714457988739, + "learning_rate": 1.60517522422861e-05, + "loss": 0.5291, + "step": 117480 + }, + { + "epoch": 2.615073005698006, + "grad_norm": 0.5985315442085266, + "learning_rate": 1.6033456837206628e-05, + "loss": 0.4899, + "step": 117490 + }, + { + "epoch": 2.615295584045584, + "grad_norm": 0.47378864884376526, + "learning_rate": 1.6015171429019694e-05, + "loss": 0.5148, + "step": 117500 + }, + { + "epoch": 2.6155181623931623, + "grad_norm": 0.6309303641319275, + "learning_rate": 1.5996896018718966e-05, + "loss": 0.4023, + "step": 117510 + }, + { + "epoch": 2.6157407407407405, + "grad_norm": 0.6247247457504272, + "learning_rate": 1.597863060729752e-05, + "loss": 0.3988, + "step": 117520 + }, + { + "epoch": 2.615963319088319, + "grad_norm": 0.5093616247177124, + "learning_rate": 1.5960375195747958e-05, + "loss": 0.5054, + "step": 117530 + }, + { + "epoch": 2.6161858974358974, + "grad_norm": 0.6017331480979919, + "learning_rate": 1.5942129785062242e-05, + "loss": 0.4704, + "step": 117540 + }, + { + "epoch": 2.6164084757834756, + "grad_norm": 0.4164072573184967, + "learning_rate": 1.592389437623192e-05, + "loss": 0.5016, + "step": 117550 + }, + { + "epoch": 2.6166310541310542, + "grad_norm": 0.5958421230316162, + "learning_rate": 1.5905668970247833e-05, + "loss": 0.5233, + "step": 117560 + }, + { + "epoch": 2.6168536324786325, + "grad_norm": 0.742854118347168, + "learning_rate": 1.5887453568100398e-05, + "loss": 0.3935, + "step": 117570 + }, + { + "epoch": 2.6170762108262107, + "grad_norm": 0.42327070236206055, + "learning_rate": 1.586924817077946e-05, + "loss": 0.4621, + "step": 117580 + }, + { + "epoch": 2.6172987891737893, + "grad_norm": 0.7418227195739746, + "learning_rate": 1.5851052779274343e-05, + "loss": 0.434, + "step": 117590 + }, + { + "epoch": 2.6175213675213675, + "grad_norm": 0.4669528305530548, + "learning_rate": 1.5832867394573746e-05, + "loss": 0.4765, + "step": 117600 + }, + { + "epoch": 2.6177439458689458, + "grad_norm": 0.741690993309021, + "learning_rate": 1.58146920176659e-05, + "loss": 0.4918, + "step": 117610 + }, + { + "epoch": 2.6179665242165244, + "grad_norm": 0.5707095861434937, + "learning_rate": 1.5796526649538455e-05, + "loss": 0.4863, + "step": 117620 + }, + { + "epoch": 2.6181891025641026, + "grad_norm": 0.6767379641532898, + "learning_rate": 1.5778371291178606e-05, + "loss": 0.558, + "step": 117630 + }, + { + "epoch": 2.618411680911681, + "grad_norm": 0.5096617341041565, + "learning_rate": 1.5760225943572826e-05, + "loss": 0.4515, + "step": 117640 + }, + { + "epoch": 2.6186342592592595, + "grad_norm": 0.60428386926651, + "learning_rate": 1.5742090607707195e-05, + "loss": 0.5742, + "step": 117650 + }, + { + "epoch": 2.6188568376068377, + "grad_norm": 0.2778734862804413, + "learning_rate": 1.5723965284567188e-05, + "loss": 0.4612, + "step": 117660 + }, + { + "epoch": 2.619079415954416, + "grad_norm": 0.4761873483657837, + "learning_rate": 1.5705849975137775e-05, + "loss": 0.5468, + "step": 117670 + }, + { + "epoch": 2.619301994301994, + "grad_norm": 0.5155042409896851, + "learning_rate": 1.5687744680403348e-05, + "loss": 0.4406, + "step": 117680 + }, + { + "epoch": 2.6195245726495724, + "grad_norm": 0.5542302131652832, + "learning_rate": 1.5669649401347786e-05, + "loss": 0.4618, + "step": 117690 + }, + { + "epoch": 2.619747150997151, + "grad_norm": 0.7806945443153381, + "learning_rate": 1.5651564138954345e-05, + "loss": 0.4865, + "step": 117700 + }, + { + "epoch": 2.6199697293447293, + "grad_norm": 0.5195888876914978, + "learning_rate": 1.5633488894205817e-05, + "loss": 0.4072, + "step": 117710 + }, + { + "epoch": 2.6201923076923075, + "grad_norm": 0.5302631258964539, + "learning_rate": 1.5615423668084483e-05, + "loss": 0.4955, + "step": 117720 + }, + { + "epoch": 2.620414886039886, + "grad_norm": 0.4892368018627167, + "learning_rate": 1.5597368461571916e-05, + "loss": 0.4808, + "step": 117730 + }, + { + "epoch": 2.6206374643874644, + "grad_norm": 0.5570464134216309, + "learning_rate": 1.5579323275649327e-05, + "loss": 0.547, + "step": 117740 + }, + { + "epoch": 2.6208600427350426, + "grad_norm": 0.5267153978347778, + "learning_rate": 1.5561288111297266e-05, + "loss": 0.5734, + "step": 117750 + }, + { + "epoch": 2.6210826210826212, + "grad_norm": 0.41973552107810974, + "learning_rate": 1.55432629694958e-05, + "loss": 0.354, + "step": 117760 + }, + { + "epoch": 2.6213051994301995, + "grad_norm": 0.5145451426506042, + "learning_rate": 1.5525247851224466e-05, + "loss": 0.463, + "step": 117770 + }, + { + "epoch": 2.6215277777777777, + "grad_norm": 0.5300337672233582, + "learning_rate": 1.550724275746216e-05, + "loss": 0.4699, + "step": 117780 + }, + { + "epoch": 2.6217503561253563, + "grad_norm": 0.442150741815567, + "learning_rate": 1.5489247689187293e-05, + "loss": 0.4428, + "step": 117790 + }, + { + "epoch": 2.6219729344729346, + "grad_norm": 0.4735947251319885, + "learning_rate": 1.5471262647377773e-05, + "loss": 0.5018, + "step": 117800 + }, + { + "epoch": 2.6221955128205128, + "grad_norm": 0.34947624802589417, + "learning_rate": 1.5453287633010884e-05, + "loss": 0.3744, + "step": 117810 + }, + { + "epoch": 2.6224180911680914, + "grad_norm": 0.3847612738609314, + "learning_rate": 1.5435322647063445e-05, + "loss": 0.5047, + "step": 117820 + }, + { + "epoch": 2.6226406695156697, + "grad_norm": 0.5073989033699036, + "learning_rate": 1.5417367690511676e-05, + "loss": 0.4404, + "step": 117830 + }, + { + "epoch": 2.622863247863248, + "grad_norm": 0.6535903215408325, + "learning_rate": 1.539942276433124e-05, + "loss": 0.3612, + "step": 117840 + }, + { + "epoch": 2.623085826210826, + "grad_norm": 0.6124248504638672, + "learning_rate": 1.5381487869497314e-05, + "loss": 0.4105, + "step": 117850 + }, + { + "epoch": 2.6233084045584043, + "grad_norm": 0.623004674911499, + "learning_rate": 1.5363563006984426e-05, + "loss": 0.4834, + "step": 117860 + }, + { + "epoch": 2.623530982905983, + "grad_norm": 0.44189321994781494, + "learning_rate": 1.5345648177766692e-05, + "loss": 0.5217, + "step": 117870 + }, + { + "epoch": 2.623753561253561, + "grad_norm": 0.6079039573669434, + "learning_rate": 1.5327743382817594e-05, + "loss": 0.4193, + "step": 117880 + }, + { + "epoch": 2.6239761396011394, + "grad_norm": 0.8342279195785522, + "learning_rate": 1.5309848623110113e-05, + "loss": 0.5486, + "step": 117890 + }, + { + "epoch": 2.624198717948718, + "grad_norm": 0.5069679021835327, + "learning_rate": 1.5291963899616645e-05, + "loss": 0.3736, + "step": 117900 + }, + { + "epoch": 2.6244212962962963, + "grad_norm": 0.5442230701446533, + "learning_rate": 1.5274089213309107e-05, + "loss": 0.451, + "step": 117910 + }, + { + "epoch": 2.6246438746438745, + "grad_norm": 0.4876982569694519, + "learning_rate": 1.5256224565158738e-05, + "loss": 0.4097, + "step": 117920 + }, + { + "epoch": 2.624866452991453, + "grad_norm": 0.47937461733818054, + "learning_rate": 1.5238369956136368e-05, + "loss": 0.5374, + "step": 117930 + }, + { + "epoch": 2.6250890313390314, + "grad_norm": 0.6652742028236389, + "learning_rate": 1.5220525387212236e-05, + "loss": 0.5051, + "step": 117940 + }, + { + "epoch": 2.6253116096866096, + "grad_norm": 0.5420684218406677, + "learning_rate": 1.5202690859356017e-05, + "loss": 0.4197, + "step": 117950 + }, + { + "epoch": 2.6255341880341883, + "grad_norm": 0.5405448079109192, + "learning_rate": 1.5184866373536866e-05, + "loss": 0.5455, + "step": 117960 + }, + { + "epoch": 2.6257567663817665, + "grad_norm": 0.6384806632995605, + "learning_rate": 1.5167051930723386e-05, + "loss": 0.4563, + "step": 117970 + }, + { + "epoch": 2.6259793447293447, + "grad_norm": 0.5483444929122925, + "learning_rate": 1.5149247531883603e-05, + "loss": 0.4079, + "step": 117980 + }, + { + "epoch": 2.6262019230769234, + "grad_norm": 0.47904133796691895, + "learning_rate": 1.5131453177985055e-05, + "loss": 0.4591, + "step": 117990 + }, + { + "epoch": 2.6264245014245016, + "grad_norm": 0.5649526715278625, + "learning_rate": 1.5113668869994657e-05, + "loss": 0.457, + "step": 118000 + }, + { + "epoch": 2.62664707977208, + "grad_norm": 0.5389604568481445, + "learning_rate": 1.5095894608878835e-05, + "loss": 0.4295, + "step": 118010 + }, + { + "epoch": 2.626869658119658, + "grad_norm": 0.511400043964386, + "learning_rate": 1.5078130395603485e-05, + "loss": 0.4049, + "step": 118020 + }, + { + "epoch": 2.627092236467236, + "grad_norm": 0.7150872945785522, + "learning_rate": 1.5060376231133899e-05, + "loss": 0.4275, + "step": 118030 + }, + { + "epoch": 2.627314814814815, + "grad_norm": 0.4138595461845398, + "learning_rate": 1.5042632116434885e-05, + "loss": 0.4917, + "step": 118040 + }, + { + "epoch": 2.627537393162393, + "grad_norm": 0.40053409337997437, + "learning_rate": 1.5024898052470671e-05, + "loss": 0.5104, + "step": 118050 + }, + { + "epoch": 2.6277599715099713, + "grad_norm": 0.5890064239501953, + "learning_rate": 1.5007174040204908e-05, + "loss": 0.4719, + "step": 118060 + }, + { + "epoch": 2.62798254985755, + "grad_norm": 0.6815602779388428, + "learning_rate": 1.4989460080600736e-05, + "loss": 0.3667, + "step": 118070 + }, + { + "epoch": 2.628205128205128, + "grad_norm": 0.6920440196990967, + "learning_rate": 1.4971756174620766e-05, + "loss": 0.4127, + "step": 118080 + }, + { + "epoch": 2.6284277065527064, + "grad_norm": 0.6863391995429993, + "learning_rate": 1.4954062323227025e-05, + "loss": 0.455, + "step": 118090 + }, + { + "epoch": 2.628650284900285, + "grad_norm": 0.5050503611564636, + "learning_rate": 1.4936378527381034e-05, + "loss": 0.4967, + "step": 118100 + }, + { + "epoch": 2.6288728632478633, + "grad_norm": 0.6623749136924744, + "learning_rate": 1.491870478804378e-05, + "loss": 0.4824, + "step": 118110 + }, + { + "epoch": 2.6290954415954415, + "grad_norm": 0.49631795287132263, + "learning_rate": 1.4901041106175606e-05, + "loss": 0.3832, + "step": 118120 + }, + { + "epoch": 2.62931801994302, + "grad_norm": 0.46611085534095764, + "learning_rate": 1.4883387482736343e-05, + "loss": 0.471, + "step": 118130 + }, + { + "epoch": 2.6295405982905984, + "grad_norm": 0.37215563654899597, + "learning_rate": 1.4865743918685382e-05, + "loss": 0.5166, + "step": 118140 + }, + { + "epoch": 2.6297631766381766, + "grad_norm": 0.6707950234413147, + "learning_rate": 1.4848110414981419e-05, + "loss": 0.5848, + "step": 118150 + }, + { + "epoch": 2.629985754985755, + "grad_norm": 0.6057896614074707, + "learning_rate": 1.4830486972582735e-05, + "loss": 0.4721, + "step": 118160 + }, + { + "epoch": 2.6302083333333335, + "grad_norm": 0.5633417963981628, + "learning_rate": 1.4812873592446962e-05, + "loss": 0.4018, + "step": 118170 + }, + { + "epoch": 2.6304309116809117, + "grad_norm": 0.5686827301979065, + "learning_rate": 1.4795270275531225e-05, + "loss": 0.4613, + "step": 118180 + }, + { + "epoch": 2.63065349002849, + "grad_norm": 0.5282633304595947, + "learning_rate": 1.477767702279218e-05, + "loss": 0.3315, + "step": 118190 + }, + { + "epoch": 2.630876068376068, + "grad_norm": 0.4773818850517273, + "learning_rate": 1.476009383518575e-05, + "loss": 0.4479, + "step": 118200 + }, + { + "epoch": 2.631098646723647, + "grad_norm": 0.6412972807884216, + "learning_rate": 1.4742520713667462e-05, + "loss": 0.5118, + "step": 118210 + }, + { + "epoch": 2.631321225071225, + "grad_norm": 0.6847419142723083, + "learning_rate": 1.4724957659192262e-05, + "loss": 0.5176, + "step": 118220 + }, + { + "epoch": 2.6315438034188032, + "grad_norm": 0.41816556453704834, + "learning_rate": 1.470740467271452e-05, + "loss": 0.4863, + "step": 118230 + }, + { + "epoch": 2.631766381766382, + "grad_norm": 0.8198537230491638, + "learning_rate": 1.468986175518814e-05, + "loss": 0.4454, + "step": 118240 + }, + { + "epoch": 2.63198896011396, + "grad_norm": 0.41886764764785767, + "learning_rate": 1.4672328907566357e-05, + "loss": 0.3753, + "step": 118250 + }, + { + "epoch": 2.6322115384615383, + "grad_norm": 0.6245961785316467, + "learning_rate": 1.4654806130801945e-05, + "loss": 0.4448, + "step": 118260 + }, + { + "epoch": 2.632434116809117, + "grad_norm": 0.539581835269928, + "learning_rate": 1.4637293425847077e-05, + "loss": 0.5647, + "step": 118270 + }, + { + "epoch": 2.632656695156695, + "grad_norm": 0.5460532307624817, + "learning_rate": 1.4619790793653432e-05, + "loss": 0.3981, + "step": 118280 + }, + { + "epoch": 2.6328792735042734, + "grad_norm": 0.6665621399879456, + "learning_rate": 1.4602298235172118e-05, + "loss": 0.4673, + "step": 118290 + }, + { + "epoch": 2.633101851851852, + "grad_norm": 0.5957037210464478, + "learning_rate": 1.4584815751353687e-05, + "loss": 0.3986, + "step": 118300 + }, + { + "epoch": 2.6333244301994303, + "grad_norm": 0.6215893626213074, + "learning_rate": 1.4567343343148154e-05, + "loss": 0.5036, + "step": 118310 + }, + { + "epoch": 2.6335470085470085, + "grad_norm": 0.518753707408905, + "learning_rate": 1.4549881011504985e-05, + "loss": 0.3934, + "step": 118320 + }, + { + "epoch": 2.6337695868945867, + "grad_norm": 0.6107826232910156, + "learning_rate": 1.4532428757373129e-05, + "loss": 0.4698, + "step": 118330 + }, + { + "epoch": 2.6339921652421654, + "grad_norm": 0.6774536967277527, + "learning_rate": 1.4514986581700895e-05, + "loss": 0.4642, + "step": 118340 + }, + { + "epoch": 2.6342147435897436, + "grad_norm": 0.4905300438404083, + "learning_rate": 1.4497554485436148e-05, + "loss": 0.3734, + "step": 118350 + }, + { + "epoch": 2.634437321937322, + "grad_norm": 0.36812686920166016, + "learning_rate": 1.448013246952613e-05, + "loss": 0.3994, + "step": 118360 + }, + { + "epoch": 2.6346599002849, + "grad_norm": 0.502136766910553, + "learning_rate": 1.4462720534917596e-05, + "loss": 0.4834, + "step": 118370 + }, + { + "epoch": 2.6348824786324787, + "grad_norm": 0.646876335144043, + "learning_rate": 1.4445318682556741e-05, + "loss": 0.4909, + "step": 118380 + }, + { + "epoch": 2.635105056980057, + "grad_norm": 0.42079877853393555, + "learning_rate": 1.4427926913389145e-05, + "loss": 0.5332, + "step": 118390 + }, + { + "epoch": 2.635327635327635, + "grad_norm": 0.43850240111351013, + "learning_rate": 1.4410545228359962e-05, + "loss": 0.5189, + "step": 118400 + }, + { + "epoch": 2.635550213675214, + "grad_norm": 0.4544200003147125, + "learning_rate": 1.4393173628413636e-05, + "loss": 0.3596, + "step": 118410 + }, + { + "epoch": 2.635772792022792, + "grad_norm": 0.48858514428138733, + "learning_rate": 1.4375812114494192e-05, + "loss": 0.424, + "step": 118420 + }, + { + "epoch": 2.6359953703703702, + "grad_norm": 0.6129411458969116, + "learning_rate": 1.4358460687545094e-05, + "loss": 0.4825, + "step": 118430 + }, + { + "epoch": 2.636217948717949, + "grad_norm": 0.5403198599815369, + "learning_rate": 1.4341119348509191e-05, + "loss": 0.5829, + "step": 118440 + }, + { + "epoch": 2.636440527065527, + "grad_norm": 0.6376327276229858, + "learning_rate": 1.4323788098328882e-05, + "loss": 0.479, + "step": 118450 + }, + { + "epoch": 2.6366631054131053, + "grad_norm": 0.5461758971214294, + "learning_rate": 1.4306466937945906e-05, + "loss": 0.5078, + "step": 118460 + }, + { + "epoch": 2.636885683760684, + "grad_norm": 0.6305150389671326, + "learning_rate": 1.4289155868301574e-05, + "loss": 0.3263, + "step": 118470 + }, + { + "epoch": 2.637108262108262, + "grad_norm": 0.3602633476257324, + "learning_rate": 1.4271854890336511e-05, + "loss": 0.3087, + "step": 118480 + }, + { + "epoch": 2.6373308404558404, + "grad_norm": 0.4451531767845154, + "learning_rate": 1.42545640049909e-05, + "loss": 0.4953, + "step": 118490 + }, + { + "epoch": 2.6375534188034186, + "grad_norm": 0.4219418168067932, + "learning_rate": 1.4237283213204322e-05, + "loss": 0.3477, + "step": 118500 + }, + { + "epoch": 2.6377759971509973, + "grad_norm": 0.6085394024848938, + "learning_rate": 1.4220012515915893e-05, + "loss": 0.4057, + "step": 118510 + }, + { + "epoch": 2.6379985754985755, + "grad_norm": 0.42439699172973633, + "learning_rate": 1.4202751914064038e-05, + "loss": 0.4828, + "step": 118520 + }, + { + "epoch": 2.6382211538461537, + "grad_norm": 0.6015419363975525, + "learning_rate": 1.4185501408586743e-05, + "loss": 0.4617, + "step": 118530 + }, + { + "epoch": 2.638443732193732, + "grad_norm": 0.5368427038192749, + "learning_rate": 1.4168261000421434e-05, + "loss": 0.4131, + "step": 118540 + }, + { + "epoch": 2.6386663105413106, + "grad_norm": 0.6470973491668701, + "learning_rate": 1.415103069050494e-05, + "loss": 0.4663, + "step": 118550 + }, + { + "epoch": 2.638888888888889, + "grad_norm": 0.4165962040424347, + "learning_rate": 1.4133810479773579e-05, + "loss": 0.519, + "step": 118560 + }, + { + "epoch": 2.639111467236467, + "grad_norm": 0.6669519543647766, + "learning_rate": 1.4116600369163113e-05, + "loss": 0.4566, + "step": 118570 + }, + { + "epoch": 2.6393340455840457, + "grad_norm": 0.5125649571418762, + "learning_rate": 1.409940035960875e-05, + "loss": 0.3817, + "step": 118580 + }, + { + "epoch": 2.639556623931624, + "grad_norm": 0.6859048008918762, + "learning_rate": 1.408221045204514e-05, + "loss": 0.4928, + "step": 118590 + }, + { + "epoch": 2.639779202279202, + "grad_norm": 0.6684611439704895, + "learning_rate": 1.406503064740643e-05, + "loss": 0.5164, + "step": 118600 + }, + { + "epoch": 2.640001780626781, + "grad_norm": 0.9643305540084839, + "learning_rate": 1.40478609466262e-05, + "loss": 0.5385, + "step": 118610 + }, + { + "epoch": 2.640224358974359, + "grad_norm": 0.710292398929596, + "learning_rate": 1.403070135063742e-05, + "loss": 0.4999, + "step": 118620 + }, + { + "epoch": 2.6403133903133904, + "eval_loss": 0.5221165418624878, + "eval_runtime": 337.3697, + "eval_samples_per_second": 7.01, + "eval_steps_per_second": 7.01, + "step": 118624 + }, + { + "epoch": 2.6404469373219372, + "grad_norm": 0.5253585577011108, + "learning_rate": 1.4013551860372542e-05, + "loss": 0.453, + "step": 118630 + }, + { + "epoch": 2.640669515669516, + "grad_norm": 0.6748680472373962, + "learning_rate": 1.3996412476763555e-05, + "loss": 0.523, + "step": 118640 + }, + { + "epoch": 2.640892094017094, + "grad_norm": 0.4046613872051239, + "learning_rate": 1.3979283200741755e-05, + "loss": 0.4159, + "step": 118650 + }, + { + "epoch": 2.6411146723646723, + "grad_norm": 0.5051271319389343, + "learning_rate": 1.396216403323798e-05, + "loss": 0.5347, + "step": 118660 + }, + { + "epoch": 2.6413372507122506, + "grad_norm": 0.564983606338501, + "learning_rate": 1.3945054975182504e-05, + "loss": 0.4794, + "step": 118670 + }, + { + "epoch": 2.6415598290598292, + "grad_norm": 0.3454379141330719, + "learning_rate": 1.3927956027505095e-05, + "loss": 0.4743, + "step": 118680 + }, + { + "epoch": 2.6417824074074074, + "grad_norm": 0.4999922215938568, + "learning_rate": 1.3910867191134857e-05, + "loss": 0.4711, + "step": 118690 + }, + { + "epoch": 2.6420049857549857, + "grad_norm": 0.7926509380340576, + "learning_rate": 1.3893788467000424e-05, + "loss": 0.4454, + "step": 118700 + }, + { + "epoch": 2.642227564102564, + "grad_norm": 0.6050231456756592, + "learning_rate": 1.3876719856029875e-05, + "loss": 0.4505, + "step": 118710 + }, + { + "epoch": 2.6424501424501425, + "grad_norm": 0.4802277088165283, + "learning_rate": 1.3859661359150756e-05, + "loss": 0.3975, + "step": 118720 + }, + { + "epoch": 2.6426727207977208, + "grad_norm": 0.6203573942184448, + "learning_rate": 1.3842612977289992e-05, + "loss": 0.5449, + "step": 118730 + }, + { + "epoch": 2.642895299145299, + "grad_norm": 0.4395126700401306, + "learning_rate": 1.3825574711374067e-05, + "loss": 0.339, + "step": 118740 + }, + { + "epoch": 2.6431178774928776, + "grad_norm": 0.1692124754190445, + "learning_rate": 1.3808546562328839e-05, + "loss": 0.3725, + "step": 118750 + }, + { + "epoch": 2.643340455840456, + "grad_norm": 0.5445780158042908, + "learning_rate": 1.3791528531079568e-05, + "loss": 0.4446, + "step": 118760 + }, + { + "epoch": 2.643563034188034, + "grad_norm": 0.6528578400611877, + "learning_rate": 1.3774520618551112e-05, + "loss": 0.4721, + "step": 118770 + }, + { + "epoch": 2.6437856125356127, + "grad_norm": 0.6830431222915649, + "learning_rate": 1.3757522825667646e-05, + "loss": 0.3605, + "step": 118780 + }, + { + "epoch": 2.644008190883191, + "grad_norm": 0.5826247930526733, + "learning_rate": 1.3740535153352829e-05, + "loss": 0.5418, + "step": 118790 + }, + { + "epoch": 2.644230769230769, + "grad_norm": 0.5273993611335754, + "learning_rate": 1.3723557602529836e-05, + "loss": 0.4902, + "step": 118800 + }, + { + "epoch": 2.644453347578348, + "grad_norm": 0.3481096029281616, + "learning_rate": 1.3706590174121193e-05, + "loss": 0.4143, + "step": 118810 + }, + { + "epoch": 2.644675925925926, + "grad_norm": 0.5116106271743774, + "learning_rate": 1.3689632869048985e-05, + "loss": 0.4321, + "step": 118820 + }, + { + "epoch": 2.6448985042735043, + "grad_norm": 0.3914758563041687, + "learning_rate": 1.367268568823461e-05, + "loss": 0.4034, + "step": 118830 + }, + { + "epoch": 2.6451210826210825, + "grad_norm": 0.6352466940879822, + "learning_rate": 1.3655748632599042e-05, + "loss": 0.4712, + "step": 118840 + }, + { + "epoch": 2.6453436609686607, + "grad_norm": 0.4531133472919464, + "learning_rate": 1.3638821703062632e-05, + "loss": 0.3649, + "step": 118850 + }, + { + "epoch": 2.6455662393162394, + "grad_norm": 0.8250698447227478, + "learning_rate": 1.3621904900545224e-05, + "loss": 0.4037, + "step": 118860 + }, + { + "epoch": 2.6457888176638176, + "grad_norm": 0.6969398856163025, + "learning_rate": 1.3604998225966082e-05, + "loss": 0.3893, + "step": 118870 + }, + { + "epoch": 2.646011396011396, + "grad_norm": 0.45444926619529724, + "learning_rate": 1.3588101680243936e-05, + "loss": 0.4794, + "step": 118880 + }, + { + "epoch": 2.6462339743589745, + "grad_norm": 0.3588857650756836, + "learning_rate": 1.3571215264296944e-05, + "loss": 0.4206, + "step": 118890 + }, + { + "epoch": 2.6464565527065527, + "grad_norm": 0.5009655356407166, + "learning_rate": 1.3554338979042746e-05, + "loss": 0.5148, + "step": 118900 + }, + { + "epoch": 2.646679131054131, + "grad_norm": 0.27271145582199097, + "learning_rate": 1.3537472825398368e-05, + "loss": 0.4099, + "step": 118910 + }, + { + "epoch": 2.6469017094017095, + "grad_norm": 0.6450316309928894, + "learning_rate": 1.3520616804280383e-05, + "loss": 0.3745, + "step": 118920 + }, + { + "epoch": 2.6471242877492878, + "grad_norm": 0.47894465923309326, + "learning_rate": 1.3503770916604707e-05, + "loss": 0.4188, + "step": 118930 + }, + { + "epoch": 2.647346866096866, + "grad_norm": 0.47971856594085693, + "learning_rate": 1.3486935163286807e-05, + "loss": 0.4623, + "step": 118940 + }, + { + "epoch": 2.6475694444444446, + "grad_norm": 0.5302867293357849, + "learning_rate": 1.3470109545241549e-05, + "loss": 0.538, + "step": 118950 + }, + { + "epoch": 2.647792022792023, + "grad_norm": 0.8558449745178223, + "learning_rate": 1.3453294063383247e-05, + "loss": 0.4335, + "step": 118960 + }, + { + "epoch": 2.648014601139601, + "grad_norm": 0.7814370393753052, + "learning_rate": 1.3436488718625661e-05, + "loss": 0.4907, + "step": 118970 + }, + { + "epoch": 2.6482371794871797, + "grad_norm": 0.35361289978027344, + "learning_rate": 1.341969351188197e-05, + "loss": 0.4447, + "step": 118980 + }, + { + "epoch": 2.648459757834758, + "grad_norm": 0.47035202383995056, + "learning_rate": 1.340290844406491e-05, + "loss": 0.369, + "step": 118990 + }, + { + "epoch": 2.648682336182336, + "grad_norm": 0.5264750719070435, + "learning_rate": 1.338613351608653e-05, + "loss": 0.4621, + "step": 119000 + }, + { + "epoch": 2.6489049145299144, + "grad_norm": 0.656773030757904, + "learning_rate": 1.3369368728858434e-05, + "loss": 0.4295, + "step": 119010 + }, + { + "epoch": 2.6491274928774926, + "grad_norm": 0.6832476258277893, + "learning_rate": 1.335261408329167e-05, + "loss": 0.4045, + "step": 119020 + }, + { + "epoch": 2.6493500712250713, + "grad_norm": 0.5290317535400391, + "learning_rate": 1.3335869580296601e-05, + "loss": 0.4378, + "step": 119030 + }, + { + "epoch": 2.6495726495726495, + "grad_norm": 0.3806043565273285, + "learning_rate": 1.331913522078323e-05, + "loss": 0.36, + "step": 119040 + }, + { + "epoch": 2.6497952279202277, + "grad_norm": 0.3884319067001343, + "learning_rate": 1.3302411005660853e-05, + "loss": 0.4233, + "step": 119050 + }, + { + "epoch": 2.6500178062678064, + "grad_norm": 0.38250046968460083, + "learning_rate": 1.3285696935838276e-05, + "loss": 0.3661, + "step": 119060 + }, + { + "epoch": 2.6502403846153846, + "grad_norm": 0.5914880633354187, + "learning_rate": 1.3268993012223795e-05, + "loss": 0.3904, + "step": 119070 + }, + { + "epoch": 2.650462962962963, + "grad_norm": 0.5131521821022034, + "learning_rate": 1.3252299235725108e-05, + "loss": 0.4551, + "step": 119080 + }, + { + "epoch": 2.6506855413105415, + "grad_norm": 0.5576712489128113, + "learning_rate": 1.3235615607249352e-05, + "loss": 0.364, + "step": 119090 + }, + { + "epoch": 2.6509081196581197, + "grad_norm": 0.7232601046562195, + "learning_rate": 1.321894212770316e-05, + "loss": 0.4554, + "step": 119100 + }, + { + "epoch": 2.651130698005698, + "grad_norm": 0.31656399369239807, + "learning_rate": 1.3202278797992518e-05, + "loss": 0.472, + "step": 119110 + }, + { + "epoch": 2.6513532763532766, + "grad_norm": 0.574393093585968, + "learning_rate": 1.318562561902299e-05, + "loss": 0.4564, + "step": 119120 + }, + { + "epoch": 2.6515758547008548, + "grad_norm": 0.48993220925331116, + "learning_rate": 1.3168982591699496e-05, + "loss": 0.4432, + "step": 119130 + }, + { + "epoch": 2.651798433048433, + "grad_norm": 0.6290052533149719, + "learning_rate": 1.3152349716926427e-05, + "loss": 0.556, + "step": 119140 + }, + { + "epoch": 2.6520210113960117, + "grad_norm": 0.5945664048194885, + "learning_rate": 1.3135726995607634e-05, + "loss": 0.5677, + "step": 119150 + }, + { + "epoch": 2.65224358974359, + "grad_norm": 0.5955460071563721, + "learning_rate": 1.311911442864644e-05, + "loss": 0.4777, + "step": 119160 + }, + { + "epoch": 2.652466168091168, + "grad_norm": 0.4934992790222168, + "learning_rate": 1.3102512016945522e-05, + "loss": 0.466, + "step": 119170 + }, + { + "epoch": 2.6526887464387463, + "grad_norm": 0.48935940861701965, + "learning_rate": 1.3085919761407139e-05, + "loss": 0.4081, + "step": 119180 + }, + { + "epoch": 2.6529113247863245, + "grad_norm": 0.35348719358444214, + "learning_rate": 1.3069337662932857e-05, + "loss": 0.4904, + "step": 119190 + }, + { + "epoch": 2.653133903133903, + "grad_norm": 0.6942500472068787, + "learning_rate": 1.3052765722423798e-05, + "loss": 0.3155, + "step": 119200 + }, + { + "epoch": 2.6533564814814814, + "grad_norm": 0.6411790251731873, + "learning_rate": 1.303620394078049e-05, + "loss": 0.4522, + "step": 119210 + }, + { + "epoch": 2.6535790598290596, + "grad_norm": 0.560257613658905, + "learning_rate": 1.301965231890292e-05, + "loss": 0.498, + "step": 119220 + }, + { + "epoch": 2.6538016381766383, + "grad_norm": 0.5010212659835815, + "learning_rate": 1.3003110857690504e-05, + "loss": 0.3595, + "step": 119230 + }, + { + "epoch": 2.6540242165242165, + "grad_norm": 0.6719934940338135, + "learning_rate": 1.2986579558042166e-05, + "loss": 0.466, + "step": 119240 + }, + { + "epoch": 2.6542467948717947, + "grad_norm": 0.7256160974502563, + "learning_rate": 1.2970058420856168e-05, + "loss": 0.4716, + "step": 119250 + }, + { + "epoch": 2.6544693732193734, + "grad_norm": 0.6219938397407532, + "learning_rate": 1.29535474470303e-05, + "loss": 0.4953, + "step": 119260 + }, + { + "epoch": 2.6546919515669516, + "grad_norm": 0.7596551775932312, + "learning_rate": 1.293704663746178e-05, + "loss": 0.3799, + "step": 119270 + }, + { + "epoch": 2.65491452991453, + "grad_norm": 0.5391525030136108, + "learning_rate": 1.2920555993047268e-05, + "loss": 0.4662, + "step": 119280 + }, + { + "epoch": 2.6551371082621085, + "grad_norm": 0.6525141596794128, + "learning_rate": 1.2904075514682956e-05, + "loss": 0.3562, + "step": 119290 + }, + { + "epoch": 2.6553596866096867, + "grad_norm": 0.46071699261665344, + "learning_rate": 1.2887605203264286e-05, + "loss": 0.4846, + "step": 119300 + }, + { + "epoch": 2.655582264957265, + "grad_norm": 0.5057551860809326, + "learning_rate": 1.287114505968634e-05, + "loss": 0.3729, + "step": 119310 + }, + { + "epoch": 2.6558048433048436, + "grad_norm": 0.43811896443367004, + "learning_rate": 1.2854695084843604e-05, + "loss": 0.3254, + "step": 119320 + }, + { + "epoch": 2.656027421652422, + "grad_norm": 0.5691989064216614, + "learning_rate": 1.2838255279629897e-05, + "loss": 0.4391, + "step": 119330 + }, + { + "epoch": 2.65625, + "grad_norm": 0.41873759031295776, + "learning_rate": 1.2821825644938613e-05, + "loss": 0.4538, + "step": 119340 + }, + { + "epoch": 2.656472578347578, + "grad_norm": 0.6020520925521851, + "learning_rate": 1.2805406181662549e-05, + "loss": 0.4993, + "step": 119350 + }, + { + "epoch": 2.6566951566951564, + "grad_norm": 0.7073390483856201, + "learning_rate": 1.2788996890693972e-05, + "loss": 0.46, + "step": 119360 + }, + { + "epoch": 2.656917735042735, + "grad_norm": 0.4874557554721832, + "learning_rate": 1.2772597772924566e-05, + "loss": 0.3843, + "step": 119370 + }, + { + "epoch": 2.6571403133903133, + "grad_norm": 0.5705388784408569, + "learning_rate": 1.2756208829245486e-05, + "loss": 0.5698, + "step": 119380 + }, + { + "epoch": 2.6573628917378915, + "grad_norm": 0.6027542352676392, + "learning_rate": 1.2739830060547287e-05, + "loss": 0.4355, + "step": 119390 + }, + { + "epoch": 2.65758547008547, + "grad_norm": 0.4651222825050354, + "learning_rate": 1.272346146772001e-05, + "loss": 0.4487, + "step": 119400 + }, + { + "epoch": 2.6578080484330484, + "grad_norm": 0.5996605157852173, + "learning_rate": 1.2707103051653147e-05, + "loss": 0.4809, + "step": 119410 + }, + { + "epoch": 2.6580306267806266, + "grad_norm": 0.5189327001571655, + "learning_rate": 1.2690754813235628e-05, + "loss": 0.4816, + "step": 119420 + }, + { + "epoch": 2.6582532051282053, + "grad_norm": 0.5221971869468689, + "learning_rate": 1.2674416753355878e-05, + "loss": 0.5121, + "step": 119430 + }, + { + "epoch": 2.6584757834757835, + "grad_norm": 0.6243450045585632, + "learning_rate": 1.265808887290163e-05, + "loss": 0.5636, + "step": 119440 + }, + { + "epoch": 2.6586983618233617, + "grad_norm": 0.48516786098480225, + "learning_rate": 1.2641771172760197e-05, + "loss": 0.2967, + "step": 119450 + }, + { + "epoch": 2.6589209401709404, + "grad_norm": 0.5596334338188171, + "learning_rate": 1.2625463653818315e-05, + "loss": 0.5162, + "step": 119460 + }, + { + "epoch": 2.6591435185185186, + "grad_norm": 0.8100598454475403, + "learning_rate": 1.2609166316962117e-05, + "loss": 0.4156, + "step": 119470 + }, + { + "epoch": 2.659366096866097, + "grad_norm": 0.349343478679657, + "learning_rate": 1.259287916307723e-05, + "loss": 0.4248, + "step": 119480 + }, + { + "epoch": 2.6595886752136755, + "grad_norm": 0.39619576930999756, + "learning_rate": 1.2576602193048703e-05, + "loss": 0.5515, + "step": 119490 + }, + { + "epoch": 2.6598112535612537, + "grad_norm": 0.4198961555957794, + "learning_rate": 1.2560335407761047e-05, + "loss": 0.4431, + "step": 119500 + }, + { + "epoch": 2.660033831908832, + "grad_norm": 0.5467174053192139, + "learning_rate": 1.2544078808098203e-05, + "loss": 0.3629, + "step": 119510 + }, + { + "epoch": 2.66025641025641, + "grad_norm": 0.6161094307899475, + "learning_rate": 1.2527832394943596e-05, + "loss": 0.5464, + "step": 119520 + }, + { + "epoch": 2.6604789886039883, + "grad_norm": 0.5048872232437134, + "learning_rate": 1.2511596169180028e-05, + "loss": 0.4542, + "step": 119530 + }, + { + "epoch": 2.660701566951567, + "grad_norm": 0.48899000883102417, + "learning_rate": 1.249537013168982e-05, + "loss": 0.4037, + "step": 119540 + }, + { + "epoch": 2.6609241452991452, + "grad_norm": 0.722078263759613, + "learning_rate": 1.2479154283354688e-05, + "loss": 0.4348, + "step": 119550 + }, + { + "epoch": 2.6611467236467234, + "grad_norm": 0.44458404183387756, + "learning_rate": 1.246294862505586e-05, + "loss": 0.3578, + "step": 119560 + }, + { + "epoch": 2.661369301994302, + "grad_norm": 0.6896154284477234, + "learning_rate": 1.2446753157673896e-05, + "loss": 0.4801, + "step": 119570 + }, + { + "epoch": 2.6615918803418803, + "grad_norm": 0.4716028571128845, + "learning_rate": 1.2430567882088895e-05, + "loss": 0.6153, + "step": 119580 + }, + { + "epoch": 2.6618144586894585, + "grad_norm": 0.8424140810966492, + "learning_rate": 1.2414392799180396e-05, + "loss": 0.5301, + "step": 119590 + }, + { + "epoch": 2.662037037037037, + "grad_norm": 0.49317121505737305, + "learning_rate": 1.2398227909827387e-05, + "loss": 0.4794, + "step": 119600 + }, + { + "epoch": 2.6622596153846154, + "grad_norm": 0.3669206500053406, + "learning_rate": 1.2382073214908207e-05, + "loss": 0.5194, + "step": 119610 + }, + { + "epoch": 2.6624821937321936, + "grad_norm": 0.6917450428009033, + "learning_rate": 1.2365928715300779e-05, + "loss": 0.4911, + "step": 119620 + }, + { + "epoch": 2.6627047720797723, + "grad_norm": 0.6095952987670898, + "learning_rate": 1.2349794411882376e-05, + "loss": 0.4598, + "step": 119630 + }, + { + "epoch": 2.6629273504273505, + "grad_norm": 0.6148474812507629, + "learning_rate": 1.2333670305529788e-05, + "loss": 0.5021, + "step": 119640 + }, + { + "epoch": 2.6631499287749287, + "grad_norm": 0.679924488067627, + "learning_rate": 1.2317556397119156e-05, + "loss": 0.3886, + "step": 119650 + }, + { + "epoch": 2.6633725071225074, + "grad_norm": 0.3880091905593872, + "learning_rate": 1.2301452687526182e-05, + "loss": 0.4774, + "step": 119660 + }, + { + "epoch": 2.6635950854700856, + "grad_norm": 0.6816239953041077, + "learning_rate": 1.2285359177625922e-05, + "loss": 0.5377, + "step": 119670 + }, + { + "epoch": 2.663817663817664, + "grad_norm": 0.44697943329811096, + "learning_rate": 1.2269275868292896e-05, + "loss": 0.4146, + "step": 119680 + }, + { + "epoch": 2.664040242165242, + "grad_norm": 0.3887006938457489, + "learning_rate": 1.2253202760401138e-05, + "loss": 0.4018, + "step": 119690 + }, + { + "epoch": 2.6642628205128203, + "grad_norm": 0.8247577548027039, + "learning_rate": 1.2237139854823997e-05, + "loss": 0.4313, + "step": 119700 + }, + { + "epoch": 2.664485398860399, + "grad_norm": 0.6009088158607483, + "learning_rate": 1.2221087152434418e-05, + "loss": 0.3529, + "step": 119710 + }, + { + "epoch": 2.664707977207977, + "grad_norm": 0.7377880215644836, + "learning_rate": 1.2205044654104657e-05, + "loss": 0.4198, + "step": 119720 + }, + { + "epoch": 2.6649305555555554, + "grad_norm": 0.6384598016738892, + "learning_rate": 1.2189012360706508e-05, + "loss": 0.4098, + "step": 119730 + }, + { + "epoch": 2.665153133903134, + "grad_norm": 0.6163082122802734, + "learning_rate": 1.2172990273111206e-05, + "loss": 0.4649, + "step": 119740 + }, + { + "epoch": 2.6653757122507122, + "grad_norm": 0.7366446256637573, + "learning_rate": 1.2156978392189367e-05, + "loss": 0.4807, + "step": 119750 + }, + { + "epoch": 2.6655982905982905, + "grad_norm": 0.6339846253395081, + "learning_rate": 1.214097671881107e-05, + "loss": 0.4824, + "step": 119760 + }, + { + "epoch": 2.665820868945869, + "grad_norm": 0.48082682490348816, + "learning_rate": 1.212498525384591e-05, + "loss": 0.3937, + "step": 119770 + }, + { + "epoch": 2.6660434472934473, + "grad_norm": 0.8619961738586426, + "learning_rate": 1.2109003998162838e-05, + "loss": 0.5142, + "step": 119780 + }, + { + "epoch": 2.6662660256410255, + "grad_norm": 0.48731106519699097, + "learning_rate": 1.2093032952630312e-05, + "loss": 0.3976, + "step": 119790 + }, + { + "epoch": 2.666488603988604, + "grad_norm": 0.5623659491539001, + "learning_rate": 1.2077072118116217e-05, + "loss": 0.4175, + "step": 119800 + }, + { + "epoch": 2.6667111823361824, + "grad_norm": 0.592086672782898, + "learning_rate": 1.206112149548786e-05, + "loss": 0.5092, + "step": 119810 + }, + { + "epoch": 2.6669337606837606, + "grad_norm": 0.46243971586227417, + "learning_rate": 1.2045181085612011e-05, + "loss": 0.4151, + "step": 119820 + }, + { + "epoch": 2.6671563390313393, + "grad_norm": 0.44234499335289, + "learning_rate": 1.2029250889354893e-05, + "loss": 0.3803, + "step": 119830 + }, + { + "epoch": 2.6673789173789175, + "grad_norm": 0.34458523988723755, + "learning_rate": 1.2013330907582143e-05, + "loss": 0.3126, + "step": 119840 + }, + { + "epoch": 2.6676014957264957, + "grad_norm": 0.4783085286617279, + "learning_rate": 1.1997421141158893e-05, + "loss": 0.4291, + "step": 119850 + }, + { + "epoch": 2.667824074074074, + "grad_norm": 0.623274564743042, + "learning_rate": 1.1981521590949674e-05, + "loss": 0.4811, + "step": 119860 + }, + { + "epoch": 2.668046652421652, + "grad_norm": 0.7949987053871155, + "learning_rate": 1.1965632257818527e-05, + "loss": 0.566, + "step": 119870 + }, + { + "epoch": 2.668269230769231, + "grad_norm": 0.5925506949424744, + "learning_rate": 1.1949753142628827e-05, + "loss": 0.4659, + "step": 119880 + }, + { + "epoch": 2.668491809116809, + "grad_norm": 0.4603303074836731, + "learning_rate": 1.1933884246243464e-05, + "loss": 0.5669, + "step": 119890 + }, + { + "epoch": 2.6687143874643873, + "grad_norm": 0.5920960307121277, + "learning_rate": 1.1918025569524815e-05, + "loss": 0.4726, + "step": 119900 + }, + { + "epoch": 2.668936965811966, + "grad_norm": 0.3981236219406128, + "learning_rate": 1.1902177113334634e-05, + "loss": 0.3575, + "step": 119910 + }, + { + "epoch": 2.669159544159544, + "grad_norm": 0.6927867531776428, + "learning_rate": 1.188633887853412e-05, + "loss": 0.5089, + "step": 119920 + }, + { + "epoch": 2.6693821225071224, + "grad_norm": 0.43674182891845703, + "learning_rate": 1.1870510865983942e-05, + "loss": 0.3711, + "step": 119930 + }, + { + "epoch": 2.669604700854701, + "grad_norm": 0.6672431826591492, + "learning_rate": 1.1854693076544254e-05, + "loss": 0.5577, + "step": 119940 + }, + { + "epoch": 2.6698272792022792, + "grad_norm": 0.5630993247032166, + "learning_rate": 1.1838885511074549e-05, + "loss": 0.4317, + "step": 119950 + }, + { + "epoch": 2.6700498575498575, + "grad_norm": 0.5109251141548157, + "learning_rate": 1.1823088170433828e-05, + "loss": 0.5359, + "step": 119960 + }, + { + "epoch": 2.670272435897436, + "grad_norm": 0.6072834730148315, + "learning_rate": 1.1807301055480557e-05, + "loss": 0.4943, + "step": 119970 + }, + { + "epoch": 2.6704950142450143, + "grad_norm": 0.6109597682952881, + "learning_rate": 1.1791524167072588e-05, + "loss": 0.3994, + "step": 119980 + }, + { + "epoch": 2.6707175925925926, + "grad_norm": 0.7994768023490906, + "learning_rate": 1.1775757506067275e-05, + "loss": 0.4038, + "step": 119990 + }, + { + "epoch": 2.6709401709401708, + "grad_norm": 0.47031915187835693, + "learning_rate": 1.1760001073321381e-05, + "loss": 0.3227, + "step": 120000 + }, + { + "epoch": 2.6711627492877494, + "grad_norm": 0.8149853348731995, + "learning_rate": 1.1744254869691173e-05, + "loss": 0.464, + "step": 120010 + }, + { + "epoch": 2.6713853276353277, + "grad_norm": 1.0185492038726807, + "learning_rate": 1.1728518896032215e-05, + "loss": 0.4302, + "step": 120020 + }, + { + "epoch": 2.671607905982906, + "grad_norm": 0.38112345337867737, + "learning_rate": 1.1712793153199686e-05, + "loss": 0.412, + "step": 120030 + }, + { + "epoch": 2.671830484330484, + "grad_norm": 0.7294852137565613, + "learning_rate": 1.1697077642048127e-05, + "loss": 0.5178, + "step": 120040 + }, + { + "epoch": 2.6720530626780628, + "grad_norm": 0.7971472144126892, + "learning_rate": 1.1681372363431498e-05, + "loss": 0.5041, + "step": 120050 + }, + { + "epoch": 2.672275641025641, + "grad_norm": 0.5222724080085754, + "learning_rate": 1.1665677318203273e-05, + "loss": 0.4679, + "step": 120060 + }, + { + "epoch": 2.672498219373219, + "grad_norm": 0.3848299980163574, + "learning_rate": 1.16499925072163e-05, + "loss": 0.5168, + "step": 120070 + }, + { + "epoch": 2.672720797720798, + "grad_norm": 0.573528528213501, + "learning_rate": 1.1634317931322969e-05, + "loss": 0.4596, + "step": 120080 + }, + { + "epoch": 2.672943376068376, + "grad_norm": 0.7404229044914246, + "learning_rate": 1.1618653591375016e-05, + "loss": 0.5278, + "step": 120090 + }, + { + "epoch": 2.6731659544159543, + "grad_norm": 0.7535010576248169, + "learning_rate": 1.1602999488223609e-05, + "loss": 0.5029, + "step": 120100 + }, + { + "epoch": 2.673388532763533, + "grad_norm": 0.529496967792511, + "learning_rate": 1.1587355622719421e-05, + "loss": 0.4024, + "step": 120110 + }, + { + "epoch": 2.673611111111111, + "grad_norm": 0.44174572825431824, + "learning_rate": 1.1571721995712592e-05, + "loss": 0.3879, + "step": 120120 + }, + { + "epoch": 2.6738336894586894, + "grad_norm": 0.7702245116233826, + "learning_rate": 1.1556098608052624e-05, + "loss": 0.4502, + "step": 120130 + }, + { + "epoch": 2.674056267806268, + "grad_norm": 0.5648883581161499, + "learning_rate": 1.1540485460588546e-05, + "loss": 0.485, + "step": 120140 + }, + { + "epoch": 2.6742788461538463, + "grad_norm": 0.4519728422164917, + "learning_rate": 1.152488255416877e-05, + "loss": 0.4139, + "step": 120150 + }, + { + "epoch": 2.6745014245014245, + "grad_norm": 0.43161413073539734, + "learning_rate": 1.1509289889641173e-05, + "loss": 0.3445, + "step": 120160 + }, + { + "epoch": 2.6747240028490027, + "grad_norm": 0.5353063941001892, + "learning_rate": 1.1493707467853053e-05, + "loss": 0.5018, + "step": 120170 + }, + { + "epoch": 2.6749465811965814, + "grad_norm": 0.46572011709213257, + "learning_rate": 1.1478135289651182e-05, + "loss": 0.4157, + "step": 120180 + }, + { + "epoch": 2.6751691595441596, + "grad_norm": 0.45290809869766235, + "learning_rate": 1.1462573355881767e-05, + "loss": 0.4243, + "step": 120190 + }, + { + "epoch": 2.675391737891738, + "grad_norm": 0.5023192763328552, + "learning_rate": 1.1447021667390468e-05, + "loss": 0.4561, + "step": 120200 + }, + { + "epoch": 2.675614316239316, + "grad_norm": 0.4624986946582794, + "learning_rate": 1.1431480225022406e-05, + "loss": 0.3579, + "step": 120210 + }, + { + "epoch": 2.6758368945868947, + "grad_norm": 0.5679937601089478, + "learning_rate": 1.1415949029622042e-05, + "loss": 0.4312, + "step": 120220 + }, + { + "epoch": 2.676059472934473, + "grad_norm": 0.475293904542923, + "learning_rate": 1.140042808203341e-05, + "loss": 0.4307, + "step": 120230 + }, + { + "epoch": 2.676282051282051, + "grad_norm": 0.641948938369751, + "learning_rate": 1.1384917383099902e-05, + "loss": 0.4208, + "step": 120240 + }, + { + "epoch": 2.6765046296296298, + "grad_norm": 0.5572640299797058, + "learning_rate": 1.1369416933664379e-05, + "loss": 0.5328, + "step": 120250 + }, + { + "epoch": 2.676727207977208, + "grad_norm": 0.48038625717163086, + "learning_rate": 1.1353926734569143e-05, + "loss": 0.4274, + "step": 120260 + }, + { + "epoch": 2.676949786324786, + "grad_norm": 0.6205059885978699, + "learning_rate": 1.1338446786655987e-05, + "loss": 0.4661, + "step": 120270 + }, + { + "epoch": 2.677172364672365, + "grad_norm": 0.4956248700618744, + "learning_rate": 1.1322977090766062e-05, + "loss": 0.4248, + "step": 120280 + }, + { + "epoch": 2.677394943019943, + "grad_norm": 0.7365691661834717, + "learning_rate": 1.1307517647740052e-05, + "loss": 0.4274, + "step": 120290 + }, + { + "epoch": 2.6776175213675213, + "grad_norm": 0.520764946937561, + "learning_rate": 1.1292068458417993e-05, + "loss": 0.3888, + "step": 120300 + }, + { + "epoch": 2.6778400997151, + "grad_norm": 0.4983401298522949, + "learning_rate": 1.1276629523639392e-05, + "loss": 0.4346, + "step": 120310 + }, + { + "epoch": 2.678062678062678, + "grad_norm": 0.51678466796875, + "learning_rate": 1.1261200844243247e-05, + "loss": 0.4706, + "step": 120320 + }, + { + "epoch": 2.6782852564102564, + "grad_norm": 0.6123687028884888, + "learning_rate": 1.1245782421067951e-05, + "loss": 0.3839, + "step": 120330 + }, + { + "epoch": 2.6785078347578346, + "grad_norm": 0.6919697523117065, + "learning_rate": 1.1230374254951415e-05, + "loss": 0.5442, + "step": 120340 + }, + { + "epoch": 2.6787304131054133, + "grad_norm": 0.46390897035598755, + "learning_rate": 1.121497634673081e-05, + "loss": 0.5094, + "step": 120350 + }, + { + "epoch": 2.6789529914529915, + "grad_norm": 0.5452059507369995, + "learning_rate": 1.1199588697242957e-05, + "loss": 0.4072, + "step": 120360 + }, + { + "epoch": 2.6791755698005697, + "grad_norm": 0.4743301570415497, + "learning_rate": 1.1184211307324055e-05, + "loss": 0.3753, + "step": 120370 + }, + { + "epoch": 2.679398148148148, + "grad_norm": 0.6292110085487366, + "learning_rate": 1.1168844177809635e-05, + "loss": 0.5162, + "step": 120380 + }, + { + "epoch": 2.6796207264957266, + "grad_norm": 0.5056225061416626, + "learning_rate": 1.1153487309534804e-05, + "loss": 0.4024, + "step": 120390 + }, + { + "epoch": 2.679843304843305, + "grad_norm": 0.5967031121253967, + "learning_rate": 1.1138140703334078e-05, + "loss": 0.4496, + "step": 120400 + }, + { + "epoch": 2.680065883190883, + "grad_norm": 0.7600603699684143, + "learning_rate": 1.1122804360041406e-05, + "loss": 0.4953, + "step": 120410 + }, + { + "epoch": 2.6802884615384617, + "grad_norm": 0.5037348866462708, + "learning_rate": 1.1107478280490147e-05, + "loss": 0.4603, + "step": 120420 + }, + { + "epoch": 2.68051103988604, + "grad_norm": 0.46349087357521057, + "learning_rate": 1.109216246551319e-05, + "loss": 0.434, + "step": 120430 + }, + { + "epoch": 2.680733618233618, + "grad_norm": 0.38922572135925293, + "learning_rate": 1.1076856915942757e-05, + "loss": 0.3545, + "step": 120440 + }, + { + "epoch": 2.6809561965811968, + "grad_norm": 0.6287493705749512, + "learning_rate": 1.1061561632610562e-05, + "loss": 0.4892, + "step": 120450 + }, + { + "epoch": 2.681178774928775, + "grad_norm": 0.5363261699676514, + "learning_rate": 1.1046276616347784e-05, + "loss": 0.4329, + "step": 120460 + }, + { + "epoch": 2.681401353276353, + "grad_norm": 0.7095743417739868, + "learning_rate": 1.1031001867985003e-05, + "loss": 0.5604, + "step": 120470 + }, + { + "epoch": 2.681623931623932, + "grad_norm": 0.5759169459342957, + "learning_rate": 1.1015737388352333e-05, + "loss": 0.4218, + "step": 120480 + }, + { + "epoch": 2.68184650997151, + "grad_norm": 0.5130805969238281, + "learning_rate": 1.1000483178279152e-05, + "loss": 0.4957, + "step": 120490 + }, + { + "epoch": 2.6820690883190883, + "grad_norm": 0.5419637560844421, + "learning_rate": 1.0985239238594447e-05, + "loss": 0.3262, + "step": 120500 + }, + { + "epoch": 2.6822916666666665, + "grad_norm": 0.45975545048713684, + "learning_rate": 1.0970005570126618e-05, + "loss": 0.4493, + "step": 120510 + }, + { + "epoch": 2.682514245014245, + "grad_norm": 0.5595857501029968, + "learning_rate": 1.0954782173703404e-05, + "loss": 0.5724, + "step": 120520 + }, + { + "epoch": 2.6827368233618234, + "grad_norm": 0.5037173628807068, + "learning_rate": 1.0939569050152055e-05, + "loss": 0.4328, + "step": 120530 + }, + { + "epoch": 2.6829594017094016, + "grad_norm": 0.6380975246429443, + "learning_rate": 1.092436620029933e-05, + "loss": 0.4615, + "step": 120540 + }, + { + "epoch": 2.68318198005698, + "grad_norm": 0.6233899593353271, + "learning_rate": 1.0909173624971325e-05, + "loss": 0.4491, + "step": 120550 + }, + { + "epoch": 2.6834045584045585, + "grad_norm": 0.6080841422080994, + "learning_rate": 1.0893991324993602e-05, + "loss": 0.5008, + "step": 120560 + }, + { + "epoch": 2.6836271367521367, + "grad_norm": 0.5344551801681519, + "learning_rate": 1.0878819301191235e-05, + "loss": 0.4807, + "step": 120570 + }, + { + "epoch": 2.683849715099715, + "grad_norm": 0.39450156688690186, + "learning_rate": 1.0863657554388629e-05, + "loss": 0.3421, + "step": 120580 + }, + { + "epoch": 2.6840722934472936, + "grad_norm": 0.615376353263855, + "learning_rate": 1.0848506085409704e-05, + "loss": 0.5475, + "step": 120590 + }, + { + "epoch": 2.684294871794872, + "grad_norm": 0.4130651354789734, + "learning_rate": 1.0833364895077801e-05, + "loss": 0.3752, + "step": 120600 + }, + { + "epoch": 2.68451745014245, + "grad_norm": 0.6055234670639038, + "learning_rate": 1.0818233984215753e-05, + "loss": 0.5713, + "step": 120610 + }, + { + "epoch": 2.6847400284900287, + "grad_norm": 0.41650620102882385, + "learning_rate": 1.08031133536457e-05, + "loss": 0.3, + "step": 120620 + }, + { + "epoch": 2.684962606837607, + "grad_norm": 0.5312291979789734, + "learning_rate": 1.078800300418934e-05, + "loss": 0.4901, + "step": 120630 + }, + { + "epoch": 2.685185185185185, + "grad_norm": 0.5251030325889587, + "learning_rate": 1.0772902936667817e-05, + "loss": 0.311, + "step": 120640 + }, + { + "epoch": 2.685407763532764, + "grad_norm": 0.37884873151779175, + "learning_rate": 1.0757813151901652e-05, + "loss": 0.4589, + "step": 120650 + }, + { + "epoch": 2.685630341880342, + "grad_norm": 0.64932781457901, + "learning_rate": 1.0742733650710834e-05, + "loss": 0.4904, + "step": 120660 + }, + { + "epoch": 2.68585292022792, + "grad_norm": 0.5982466340065002, + "learning_rate": 1.0727664433914818e-05, + "loss": 0.4441, + "step": 120670 + }, + { + "epoch": 2.6860754985754984, + "grad_norm": 0.5319939255714417, + "learning_rate": 1.071260550233244e-05, + "loss": 0.3659, + "step": 120680 + }, + { + "epoch": 2.6862980769230766, + "grad_norm": 0.5466464757919312, + "learning_rate": 1.0697556856782043e-05, + "loss": 0.5018, + "step": 120690 + }, + { + "epoch": 2.6865206552706553, + "grad_norm": 0.5915857553482056, + "learning_rate": 1.0682518498081373e-05, + "loss": 0.4001, + "step": 120700 + }, + { + "epoch": 2.6867432336182335, + "grad_norm": 0.630190372467041, + "learning_rate": 1.0667490427047666e-05, + "loss": 0.4807, + "step": 120710 + }, + { + "epoch": 2.6869658119658117, + "grad_norm": 0.5770119428634644, + "learning_rate": 1.0652472644497492e-05, + "loss": 0.4412, + "step": 120720 + }, + { + "epoch": 2.6871883903133904, + "grad_norm": 0.4673517942428589, + "learning_rate": 1.063746515124695e-05, + "loss": 0.5254, + "step": 120730 + }, + { + "epoch": 2.6874109686609686, + "grad_norm": 0.6391720771789551, + "learning_rate": 1.0622467948111613e-05, + "loss": 0.5048, + "step": 120740 + }, + { + "epoch": 2.687633547008547, + "grad_norm": 0.49109727144241333, + "learning_rate": 1.0607481035906387e-05, + "loss": 0.4711, + "step": 120750 + }, + { + "epoch": 2.6878561253561255, + "grad_norm": 0.714725136756897, + "learning_rate": 1.0592504415445659e-05, + "loss": 0.4323, + "step": 120760 + }, + { + "epoch": 2.6880787037037037, + "grad_norm": 0.4830571413040161, + "learning_rate": 1.0577538087543293e-05, + "loss": 0.4638, + "step": 120770 + }, + { + "epoch": 2.688301282051282, + "grad_norm": 0.4310127794742584, + "learning_rate": 1.0562582053012592e-05, + "loss": 0.3241, + "step": 120780 + }, + { + "epoch": 2.6885238603988606, + "grad_norm": 0.557829737663269, + "learning_rate": 1.0547636312666287e-05, + "loss": 0.5086, + "step": 120790 + }, + { + "epoch": 2.688746438746439, + "grad_norm": 0.4715104401111603, + "learning_rate": 1.0532700867316503e-05, + "loss": 0.4818, + "step": 120800 + }, + { + "epoch": 2.688969017094017, + "grad_norm": 0.6122713685035706, + "learning_rate": 1.0517775717774836e-05, + "loss": 0.4238, + "step": 120810 + }, + { + "epoch": 2.6891915954415957, + "grad_norm": 0.3812810778617859, + "learning_rate": 1.050286086485237e-05, + "loss": 0.4984, + "step": 120820 + }, + { + "epoch": 2.689414173789174, + "grad_norm": 0.5915753841400146, + "learning_rate": 1.048795630935957e-05, + "loss": 0.4732, + "step": 120830 + }, + { + "epoch": 2.689636752136752, + "grad_norm": 0.49963366985321045, + "learning_rate": 1.0473062052106364e-05, + "loss": 0.5449, + "step": 120840 + }, + { + "epoch": 2.6898593304843303, + "grad_norm": 0.7948408126831055, + "learning_rate": 1.045817809390215e-05, + "loss": 0.4657, + "step": 120850 + }, + { + "epoch": 2.6900819088319086, + "grad_norm": 0.587867021560669, + "learning_rate": 1.0443304435555656e-05, + "loss": 0.5226, + "step": 120860 + }, + { + "epoch": 2.6903044871794872, + "grad_norm": 0.3600319027900696, + "learning_rate": 1.0428441077875239e-05, + "loss": 0.483, + "step": 120870 + }, + { + "epoch": 2.6905270655270654, + "grad_norm": 0.3752707540988922, + "learning_rate": 1.0413588021668475e-05, + "loss": 0.3287, + "step": 120880 + }, + { + "epoch": 2.6907496438746437, + "grad_norm": 0.5882196426391602, + "learning_rate": 1.0398745267742538e-05, + "loss": 0.4309, + "step": 120890 + }, + { + "epoch": 2.6909722222222223, + "grad_norm": 0.49909356236457825, + "learning_rate": 1.0383912816904007e-05, + "loss": 0.3353, + "step": 120900 + }, + { + "epoch": 2.6911948005698005, + "grad_norm": 0.6589367389678955, + "learning_rate": 1.0369090669958881e-05, + "loss": 0.5086, + "step": 120910 + }, + { + "epoch": 2.6914173789173788, + "grad_norm": 0.4251459538936615, + "learning_rate": 1.0354278827712605e-05, + "loss": 0.4085, + "step": 120920 + }, + { + "epoch": 2.6916399572649574, + "grad_norm": 0.6321707367897034, + "learning_rate": 1.0339477290970112e-05, + "loss": 0.4632, + "step": 120930 + }, + { + "epoch": 2.6918625356125356, + "grad_norm": 0.8624523878097534, + "learning_rate": 1.0324686060535649e-05, + "loss": 0.4292, + "step": 120940 + }, + { + "epoch": 2.692085113960114, + "grad_norm": 0.547972559928894, + "learning_rate": 1.0309905137213017e-05, + "loss": 0.3422, + "step": 120950 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 0.889786958694458, + "learning_rate": 1.0295134521805417e-05, + "loss": 0.4889, + "step": 120960 + }, + { + "epoch": 2.6925302706552707, + "grad_norm": 0.5785326361656189, + "learning_rate": 1.0280374215115518e-05, + "loss": 0.4494, + "step": 120970 + }, + { + "epoch": 2.692752849002849, + "grad_norm": 0.5202192664146423, + "learning_rate": 1.0265624217945392e-05, + "loss": 0.457, + "step": 120980 + }, + { + "epoch": 2.6929754273504276, + "grad_norm": 0.4780712425708771, + "learning_rate": 1.0250884531096593e-05, + "loss": 0.4192, + "step": 120990 + }, + { + "epoch": 2.693198005698006, + "grad_norm": 0.9215281009674072, + "learning_rate": 1.0236155155370087e-05, + "loss": 0.5269, + "step": 121000 + }, + { + "epoch": 2.693420584045584, + "grad_norm": 0.5440164804458618, + "learning_rate": 1.0221436091566205e-05, + "loss": 0.422, + "step": 121010 + }, + { + "epoch": 2.6936431623931623, + "grad_norm": 0.5080778002738953, + "learning_rate": 1.0206727340484845e-05, + "loss": 0.5341, + "step": 121020 + }, + { + "epoch": 2.6938657407407405, + "grad_norm": 0.45328041911125183, + "learning_rate": 1.019202890292532e-05, + "loss": 0.4234, + "step": 121030 + }, + { + "epoch": 2.694088319088319, + "grad_norm": 0.7312338948249817, + "learning_rate": 1.0177340779686306e-05, + "loss": 0.4812, + "step": 121040 + }, + { + "epoch": 2.6943108974358974, + "grad_norm": 0.6298159956932068, + "learning_rate": 1.0162662971565984e-05, + "loss": 0.538, + "step": 121050 + }, + { + "epoch": 2.6945334757834756, + "grad_norm": 0.520063579082489, + "learning_rate": 1.0147995479361982e-05, + "loss": 0.5705, + "step": 121060 + }, + { + "epoch": 2.6947560541310542, + "grad_norm": 0.42589840292930603, + "learning_rate": 1.0133338303871354e-05, + "loss": 0.4456, + "step": 121070 + }, + { + "epoch": 2.6949786324786325, + "grad_norm": 0.38978275656700134, + "learning_rate": 1.0118691445890504e-05, + "loss": 0.4925, + "step": 121080 + }, + { + "epoch": 2.6952012108262107, + "grad_norm": 0.31580010056495667, + "learning_rate": 1.0104054906215422e-05, + "loss": 0.3951, + "step": 121090 + }, + { + "epoch": 2.6954237891737893, + "grad_norm": 0.6156284213066101, + "learning_rate": 1.0089428685641467e-05, + "loss": 0.413, + "step": 121100 + }, + { + "epoch": 2.6956463675213675, + "grad_norm": 0.5855969786643982, + "learning_rate": 1.0074812784963406e-05, + "loss": 0.5138, + "step": 121110 + }, + { + "epoch": 2.6958689458689458, + "grad_norm": 0.6864350438117981, + "learning_rate": 1.0060207204975492e-05, + "loss": 0.5307, + "step": 121120 + }, + { + "epoch": 2.6960915242165244, + "grad_norm": 0.795219898223877, + "learning_rate": 1.0045611946471467e-05, + "loss": 0.4583, + "step": 121130 + }, + { + "epoch": 2.6963141025641026, + "grad_norm": 0.6658293008804321, + "learning_rate": 1.0031027010244365e-05, + "loss": 0.4645, + "step": 121140 + }, + { + "epoch": 2.696536680911681, + "grad_norm": 0.5443158745765686, + "learning_rate": 1.001645239708675e-05, + "loss": 0.4279, + "step": 121150 + }, + { + "epoch": 2.6967592592592595, + "grad_norm": 0.5901051163673401, + "learning_rate": 1.0001888107790635e-05, + "loss": 0.4095, + "step": 121160 + }, + { + "epoch": 2.6969818376068377, + "grad_norm": 0.4878370463848114, + "learning_rate": 9.987334143147475e-06, + "loss": 0.5041, + "step": 121170 + }, + { + "epoch": 2.697204415954416, + "grad_norm": 0.47627171874046326, + "learning_rate": 9.972790503948127e-06, + "loss": 0.4282, + "step": 121180 + }, + { + "epoch": 2.697426994301994, + "grad_norm": 0.6253020167350769, + "learning_rate": 9.958257190982889e-06, + "loss": 0.4362, + "step": 121190 + }, + { + "epoch": 2.6976495726495724, + "grad_norm": 0.6432648301124573, + "learning_rate": 9.943734205041554e-06, + "loss": 0.6053, + "step": 121200 + }, + { + "epoch": 2.697872150997151, + "grad_norm": 0.5386398434638977, + "learning_rate": 9.929221546913293e-06, + "loss": 0.4857, + "step": 121210 + }, + { + "epoch": 2.6980947293447293, + "grad_norm": 0.5033283829689026, + "learning_rate": 9.914719217386714e-06, + "loss": 0.3696, + "step": 121220 + }, + { + "epoch": 2.6983173076923075, + "grad_norm": 0.39687106013298035, + "learning_rate": 9.900227217249925e-06, + "loss": 0.3999, + "step": 121230 + }, + { + "epoch": 2.698539886039886, + "grad_norm": 0.6018307209014893, + "learning_rate": 9.885745547290382e-06, + "loss": 0.501, + "step": 121240 + }, + { + "epoch": 2.6987624643874644, + "grad_norm": 0.6582804918289185, + "learning_rate": 9.871274208295079e-06, + "loss": 0.4494, + "step": 121250 + }, + { + "epoch": 2.6989850427350426, + "grad_norm": 0.6227666735649109, + "learning_rate": 9.856813201050408e-06, + "loss": 0.4289, + "step": 121260 + }, + { + "epoch": 2.6992076210826212, + "grad_norm": 0.5360127687454224, + "learning_rate": 9.84236252634212e-06, + "loss": 0.3291, + "step": 121270 + }, + { + "epoch": 2.6994301994301995, + "grad_norm": 0.6764222383499146, + "learning_rate": 9.827922184955563e-06, + "loss": 0.5577, + "step": 121280 + }, + { + "epoch": 2.6996527777777777, + "grad_norm": 0.5015178918838501, + "learning_rate": 9.813492177675376e-06, + "loss": 0.5306, + "step": 121290 + }, + { + "epoch": 2.6998753561253563, + "grad_norm": 0.4428309202194214, + "learning_rate": 9.799072505285711e-06, + "loss": 0.4774, + "step": 121300 + }, + { + "epoch": 2.7000979344729346, + "grad_norm": 0.59559565782547, + "learning_rate": 9.784663168570163e-06, + "loss": 0.4743, + "step": 121310 + }, + { + "epoch": 2.7003205128205128, + "grad_norm": 0.6105912923812866, + "learning_rate": 9.770264168311705e-06, + "loss": 0.4392, + "step": 121320 + }, + { + "epoch": 2.7003205128205128, + "eval_loss": 0.5214730501174927, + "eval_runtime": 337.0066, + "eval_samples_per_second": 7.018, + "eval_steps_per_second": 7.018, + "step": 121320 + }, + { + "epoch": 2.7005430911680914, + "grad_norm": 0.5224626660346985, + "learning_rate": 9.755875505292843e-06, + "loss": 0.4719, + "step": 121330 + }, + { + "epoch": 2.7007656695156697, + "grad_norm": 0.546825110912323, + "learning_rate": 9.741497180295445e-06, + "loss": 0.4225, + "step": 121340 + }, + { + "epoch": 2.700988247863248, + "grad_norm": 0.7341123819351196, + "learning_rate": 9.727129194100881e-06, + "loss": 0.4578, + "step": 121350 + }, + { + "epoch": 2.701210826210826, + "grad_norm": 0.4602246880531311, + "learning_rate": 9.71277154748984e-06, + "loss": 0.365, + "step": 121360 + }, + { + "epoch": 2.7014334045584043, + "grad_norm": 0.5582806468009949, + "learning_rate": 9.698424241242587e-06, + "loss": 0.5116, + "step": 121370 + }, + { + "epoch": 2.701655982905983, + "grad_norm": 0.6341031789779663, + "learning_rate": 9.68408727613872e-06, + "loss": 0.4161, + "step": 121380 + }, + { + "epoch": 2.701878561253561, + "grad_norm": 0.6534328460693359, + "learning_rate": 9.669760652957393e-06, + "loss": 0.4649, + "step": 121390 + }, + { + "epoch": 2.7021011396011394, + "grad_norm": 0.4892594814300537, + "learning_rate": 9.655444372477073e-06, + "loss": 0.5891, + "step": 121400 + }, + { + "epoch": 2.702323717948718, + "grad_norm": 0.6224949359893799, + "learning_rate": 9.641138435475693e-06, + "loss": 0.4574, + "step": 121410 + }, + { + "epoch": 2.7025462962962963, + "grad_norm": 0.2790619432926178, + "learning_rate": 9.626842842730744e-06, + "loss": 0.3769, + "step": 121420 + }, + { + "epoch": 2.7027688746438745, + "grad_norm": 0.36917656660079956, + "learning_rate": 9.612557595018955e-06, + "loss": 0.4212, + "step": 121430 + }, + { + "epoch": 2.702991452991453, + "grad_norm": 0.4125138819217682, + "learning_rate": 9.598282693116668e-06, + "loss": 0.3707, + "step": 121440 + }, + { + "epoch": 2.7032140313390314, + "grad_norm": 0.5624163746833801, + "learning_rate": 9.584018137799544e-06, + "loss": 0.4378, + "step": 121450 + }, + { + "epoch": 2.7034366096866096, + "grad_norm": 0.6187195181846619, + "learning_rate": 9.569763929842767e-06, + "loss": 0.2982, + "step": 121460 + }, + { + "epoch": 2.7036591880341883, + "grad_norm": 0.6008877158164978, + "learning_rate": 9.555520070020919e-06, + "loss": 0.4625, + "step": 121470 + }, + { + "epoch": 2.7038817663817665, + "grad_norm": 0.5377522110939026, + "learning_rate": 9.54128655910802e-06, + "loss": 0.4653, + "step": 121480 + }, + { + "epoch": 2.7041043447293447, + "grad_norm": 0.5690780282020569, + "learning_rate": 9.527063397877523e-06, + "loss": 0.388, + "step": 121490 + }, + { + "epoch": 2.7043269230769234, + "grad_norm": 0.4713515043258667, + "learning_rate": 9.512850587102317e-06, + "loss": 0.4918, + "step": 121500 + }, + { + "epoch": 2.7045495014245016, + "grad_norm": 0.5271637439727783, + "learning_rate": 9.498648127554765e-06, + "loss": 0.3648, + "step": 121510 + }, + { + "epoch": 2.70477207977208, + "grad_norm": 0.604483962059021, + "learning_rate": 9.484456020006627e-06, + "loss": 0.5279, + "step": 121520 + }, + { + "epoch": 2.704994658119658, + "grad_norm": 0.7051653861999512, + "learning_rate": 9.470274265229106e-06, + "loss": 0.4251, + "step": 121530 + }, + { + "epoch": 2.705217236467236, + "grad_norm": 0.3371043801307678, + "learning_rate": 9.456102863992855e-06, + "loss": 0.428, + "step": 121540 + }, + { + "epoch": 2.705439814814815, + "grad_norm": 0.3929579257965088, + "learning_rate": 9.441941817067944e-06, + "loss": 0.3709, + "step": 121550 + }, + { + "epoch": 2.705662393162393, + "grad_norm": 0.5030765533447266, + "learning_rate": 9.427791125223962e-06, + "loss": 0.4252, + "step": 121560 + }, + { + "epoch": 2.7058849715099713, + "grad_norm": 0.5232594609260559, + "learning_rate": 9.413650789229778e-06, + "loss": 0.4406, + "step": 121570 + }, + { + "epoch": 2.70610754985755, + "grad_norm": 0.4874250888824463, + "learning_rate": 9.399520809853824e-06, + "loss": 0.5244, + "step": 121580 + }, + { + "epoch": 2.706330128205128, + "grad_norm": 0.7548015117645264, + "learning_rate": 9.385401187863952e-06, + "loss": 0.4662, + "step": 121590 + }, + { + "epoch": 2.7065527065527064, + "grad_norm": 0.6102350950241089, + "learning_rate": 9.371291924027437e-06, + "loss": 0.4242, + "step": 121600 + }, + { + "epoch": 2.706775284900285, + "grad_norm": 0.3742469847202301, + "learning_rate": 9.357193019110956e-06, + "loss": 0.3777, + "step": 121610 + }, + { + "epoch": 2.7069978632478633, + "grad_norm": 0.44158703088760376, + "learning_rate": 9.343104473880715e-06, + "loss": 0.3878, + "step": 121620 + }, + { + "epoch": 2.7072204415954415, + "grad_norm": 0.5411059260368347, + "learning_rate": 9.329026289102216e-06, + "loss": 0.5364, + "step": 121630 + }, + { + "epoch": 2.70744301994302, + "grad_norm": 0.6011901497840881, + "learning_rate": 9.314958465540514e-06, + "loss": 0.4326, + "step": 121640 + }, + { + "epoch": 2.7076655982905984, + "grad_norm": 0.814118504524231, + "learning_rate": 9.300901003960083e-06, + "loss": 0.4772, + "step": 121650 + }, + { + "epoch": 2.7078881766381766, + "grad_norm": 0.3137904703617096, + "learning_rate": 9.286853905124825e-06, + "loss": 0.4548, + "step": 121660 + }, + { + "epoch": 2.708110754985755, + "grad_norm": 0.3962802588939667, + "learning_rate": 9.272817169798043e-06, + "loss": 0.4031, + "step": 121670 + }, + { + "epoch": 2.7083333333333335, + "grad_norm": 0.6601922512054443, + "learning_rate": 9.2587907987425e-06, + "loss": 0.4737, + "step": 121680 + }, + { + "epoch": 2.7085559116809117, + "grad_norm": 0.536861777305603, + "learning_rate": 9.244774792720413e-06, + "loss": 0.5034, + "step": 121690 + }, + { + "epoch": 2.70877849002849, + "grad_norm": 0.6117525100708008, + "learning_rate": 9.230769152493435e-06, + "loss": 0.4652, + "step": 121700 + }, + { + "epoch": 2.709001068376068, + "grad_norm": 0.5301530361175537, + "learning_rate": 9.216773878822626e-06, + "loss": 0.4876, + "step": 121710 + }, + { + "epoch": 2.709223646723647, + "grad_norm": 0.6408038139343262, + "learning_rate": 9.202788972468491e-06, + "loss": 0.5645, + "step": 121720 + }, + { + "epoch": 2.709446225071225, + "grad_norm": 0.7939885258674622, + "learning_rate": 9.188814434191017e-06, + "loss": 0.5038, + "step": 121730 + }, + { + "epoch": 2.7096688034188032, + "grad_norm": 0.5108567476272583, + "learning_rate": 9.174850264749557e-06, + "loss": 0.4415, + "step": 121740 + }, + { + "epoch": 2.709891381766382, + "grad_norm": 0.6715599894523621, + "learning_rate": 9.160896464902945e-06, + "loss": 0.4724, + "step": 121750 + }, + { + "epoch": 2.71011396011396, + "grad_norm": 0.4106365740299225, + "learning_rate": 9.14695303540949e-06, + "loss": 0.2988, + "step": 121760 + }, + { + "epoch": 2.7103365384615383, + "grad_norm": 0.7415056824684143, + "learning_rate": 9.133019977026825e-06, + "loss": 0.5563, + "step": 121770 + }, + { + "epoch": 2.710559116809117, + "grad_norm": 0.4635258615016937, + "learning_rate": 9.11909729051208e-06, + "loss": 0.4499, + "step": 121780 + }, + { + "epoch": 2.710781695156695, + "grad_norm": 0.4406909942626953, + "learning_rate": 9.105184976621895e-06, + "loss": 0.4234, + "step": 121790 + }, + { + "epoch": 2.7110042735042734, + "grad_norm": 0.42780137062072754, + "learning_rate": 9.091283036112197e-06, + "loss": 0.4446, + "step": 121800 + }, + { + "epoch": 2.711226851851852, + "grad_norm": 0.7454397678375244, + "learning_rate": 9.077391469738471e-06, + "loss": 0.4549, + "step": 121810 + }, + { + "epoch": 2.7114494301994303, + "grad_norm": 0.39332252740859985, + "learning_rate": 9.063510278255583e-06, + "loss": 0.4143, + "step": 121820 + }, + { + "epoch": 2.7116720085470085, + "grad_norm": 0.49680182337760925, + "learning_rate": 9.049639462417858e-06, + "loss": 0.398, + "step": 121830 + }, + { + "epoch": 2.7118945868945867, + "grad_norm": 0.3992108404636383, + "learning_rate": 9.035779022979074e-06, + "loss": 0.4048, + "step": 121840 + }, + { + "epoch": 2.7121171652421654, + "grad_norm": 0.5084406733512878, + "learning_rate": 9.02192896069236e-06, + "loss": 0.4337, + "step": 121850 + }, + { + "epoch": 2.7123397435897436, + "grad_norm": 0.43044665455818176, + "learning_rate": 9.008089276310361e-06, + "loss": 0.4056, + "step": 121860 + }, + { + "epoch": 2.712562321937322, + "grad_norm": 0.5675276517868042, + "learning_rate": 8.99425997058514e-06, + "loss": 0.4977, + "step": 121870 + }, + { + "epoch": 2.7127849002849, + "grad_norm": 0.44733113050460815, + "learning_rate": 8.980441044268207e-06, + "loss": 0.4105, + "step": 121880 + }, + { + "epoch": 2.7130074786324787, + "grad_norm": 0.38512033224105835, + "learning_rate": 8.966632498110494e-06, + "loss": 0.3537, + "step": 121890 + }, + { + "epoch": 2.713230056980057, + "grad_norm": 0.48589053750038147, + "learning_rate": 8.952834332862381e-06, + "loss": 0.4738, + "step": 121900 + }, + { + "epoch": 2.713452635327635, + "grad_norm": 0.3985794186592102, + "learning_rate": 8.9390465492736e-06, + "loss": 0.3923, + "step": 121910 + }, + { + "epoch": 2.713675213675214, + "grad_norm": 0.7381534576416016, + "learning_rate": 8.925269148093485e-06, + "loss": 0.5137, + "step": 121920 + }, + { + "epoch": 2.713897792022792, + "grad_norm": 0.7137097120285034, + "learning_rate": 8.911502130070637e-06, + "loss": 0.5386, + "step": 121930 + }, + { + "epoch": 2.7141203703703702, + "grad_norm": 0.6199268698692322, + "learning_rate": 8.89774549595319e-06, + "loss": 0.5787, + "step": 121940 + }, + { + "epoch": 2.714342948717949, + "grad_norm": 0.6272393465042114, + "learning_rate": 8.883999246488706e-06, + "loss": 0.4451, + "step": 121950 + }, + { + "epoch": 2.714565527065527, + "grad_norm": 0.38012218475341797, + "learning_rate": 8.870263382424138e-06, + "loss": 0.4138, + "step": 121960 + }, + { + "epoch": 2.7147881054131053, + "grad_norm": 0.4783298969268799, + "learning_rate": 8.856537904505935e-06, + "loss": 0.5385, + "step": 121970 + }, + { + "epoch": 2.715010683760684, + "grad_norm": 0.8279988765716553, + "learning_rate": 8.842822813479968e-06, + "loss": 0.4848, + "step": 121980 + }, + { + "epoch": 2.715233262108262, + "grad_norm": 0.5451609492301941, + "learning_rate": 8.829118110091484e-06, + "loss": 0.4799, + "step": 121990 + }, + { + "epoch": 2.7154558404558404, + "grad_norm": 0.37781020998954773, + "learning_rate": 8.815423795085197e-06, + "loss": 0.4188, + "step": 122000 + }, + { + "epoch": 2.7156784188034186, + "grad_norm": 0.46710386872291565, + "learning_rate": 8.801739869205316e-06, + "loss": 0.5301, + "step": 122010 + }, + { + "epoch": 2.7159009971509973, + "grad_norm": 0.5526740550994873, + "learning_rate": 8.788066333195399e-06, + "loss": 0.446, + "step": 122020 + }, + { + "epoch": 2.7161235754985755, + "grad_norm": 0.5683900713920593, + "learning_rate": 8.774403187798497e-06, + "loss": 0.3673, + "step": 122030 + }, + { + "epoch": 2.7163461538461537, + "grad_norm": 0.6068999767303467, + "learning_rate": 8.76075043375708e-06, + "loss": 0.4733, + "step": 122040 + }, + { + "epoch": 2.716568732193732, + "grad_norm": 0.6357635855674744, + "learning_rate": 8.747108071813025e-06, + "loss": 0.4225, + "step": 122050 + }, + { + "epoch": 2.7167913105413106, + "grad_norm": 0.4891846179962158, + "learning_rate": 8.733476102707717e-06, + "loss": 0.5168, + "step": 122060 + }, + { + "epoch": 2.717013888888889, + "grad_norm": 0.6161572933197021, + "learning_rate": 8.71985452718187e-06, + "loss": 0.355, + "step": 122070 + }, + { + "epoch": 2.717236467236467, + "grad_norm": 0.5957919359207153, + "learning_rate": 8.706243345975695e-06, + "loss": 0.4717, + "step": 122080 + }, + { + "epoch": 2.7174590455840457, + "grad_norm": 0.6445571184158325, + "learning_rate": 8.69264255982889e-06, + "loss": 0.5461, + "step": 122090 + }, + { + "epoch": 2.717681623931624, + "grad_norm": 0.35034939646720886, + "learning_rate": 8.679052169480485e-06, + "loss": 0.431, + "step": 122100 + }, + { + "epoch": 2.717904202279202, + "grad_norm": 0.4366070330142975, + "learning_rate": 8.665472175668997e-06, + "loss": 0.4499, + "step": 122110 + }, + { + "epoch": 2.718126780626781, + "grad_norm": 0.5147125124931335, + "learning_rate": 8.651902579132421e-06, + "loss": 0.4197, + "step": 122120 + }, + { + "epoch": 2.718349358974359, + "grad_norm": 0.6924682855606079, + "learning_rate": 8.63834338060807e-06, + "loss": 0.4195, + "step": 122130 + }, + { + "epoch": 2.7185719373219372, + "grad_norm": 0.304049015045166, + "learning_rate": 8.624794580832807e-06, + "loss": 0.4128, + "step": 122140 + }, + { + "epoch": 2.718794515669516, + "grad_norm": 0.5142202377319336, + "learning_rate": 8.61125618054286e-06, + "loss": 0.4986, + "step": 122150 + }, + { + "epoch": 2.719017094017094, + "grad_norm": 0.4628162384033203, + "learning_rate": 8.597728180473951e-06, + "loss": 0.4266, + "step": 122160 + }, + { + "epoch": 2.7192396723646723, + "grad_norm": 0.6526134014129639, + "learning_rate": 8.584210581361162e-06, + "loss": 0.4388, + "step": 122170 + }, + { + "epoch": 2.7194622507122506, + "grad_norm": 0.650456428527832, + "learning_rate": 8.570703383939105e-06, + "loss": 0.4422, + "step": 122180 + }, + { + "epoch": 2.7196848290598292, + "grad_norm": 0.6294580698013306, + "learning_rate": 8.557206588941701e-06, + "loss": 0.4221, + "step": 122190 + }, + { + "epoch": 2.7199074074074074, + "grad_norm": 0.3640579283237457, + "learning_rate": 8.543720197102457e-06, + "loss": 0.4043, + "step": 122200 + }, + { + "epoch": 2.7201299857549857, + "grad_norm": 0.754707396030426, + "learning_rate": 8.53024420915416e-06, + "loss": 0.4449, + "step": 122210 + }, + { + "epoch": 2.720352564102564, + "grad_norm": 0.6340663433074951, + "learning_rate": 8.516778625829135e-06, + "loss": 0.4252, + "step": 122220 + }, + { + "epoch": 2.7205751424501425, + "grad_norm": 0.610907793045044, + "learning_rate": 8.503323447859113e-06, + "loss": 0.4931, + "step": 122230 + }, + { + "epoch": 2.7207977207977208, + "grad_norm": 0.7051807641983032, + "learning_rate": 8.48987867597526e-06, + "loss": 0.5408, + "step": 122240 + }, + { + "epoch": 2.721020299145299, + "grad_norm": 0.5701954364776611, + "learning_rate": 8.47644431090817e-06, + "loss": 0.4381, + "step": 122250 + }, + { + "epoch": 2.7212428774928776, + "grad_norm": 0.4436449110507965, + "learning_rate": 8.463020353387929e-06, + "loss": 0.4001, + "step": 122260 + }, + { + "epoch": 2.721465455840456, + "grad_norm": 0.5930556058883667, + "learning_rate": 8.449606804143928e-06, + "loss": 0.4776, + "step": 122270 + }, + { + "epoch": 2.721688034188034, + "grad_norm": 0.611627995967865, + "learning_rate": 8.436203663905095e-06, + "loss": 0.4434, + "step": 122280 + }, + { + "epoch": 2.7219106125356127, + "grad_norm": 0.5522210001945496, + "learning_rate": 8.422810933399783e-06, + "loss": 0.5635, + "step": 122290 + }, + { + "epoch": 2.722133190883191, + "grad_norm": 0.5032176375389099, + "learning_rate": 8.409428613355764e-06, + "loss": 0.4678, + "step": 122300 + }, + { + "epoch": 2.722355769230769, + "grad_norm": 0.47020357847213745, + "learning_rate": 8.396056704500254e-06, + "loss": 0.4779, + "step": 122310 + }, + { + "epoch": 2.722578347578348, + "grad_norm": 0.6725762486457825, + "learning_rate": 8.382695207559854e-06, + "loss": 0.4695, + "step": 122320 + }, + { + "epoch": 2.722800925925926, + "grad_norm": 0.7783358693122864, + "learning_rate": 8.36934412326067e-06, + "loss": 0.4296, + "step": 122330 + }, + { + "epoch": 2.7230235042735043, + "grad_norm": 0.519523024559021, + "learning_rate": 8.356003452328209e-06, + "loss": 0.5414, + "step": 122340 + }, + { + "epoch": 2.7232460826210825, + "grad_norm": 0.5169113278388977, + "learning_rate": 8.342673195487383e-06, + "loss": 0.4473, + "step": 122350 + }, + { + "epoch": 2.7234686609686607, + "grad_norm": 0.8073785305023193, + "learning_rate": 8.32935335346261e-06, + "loss": 0.4466, + "step": 122360 + }, + { + "epoch": 2.7236912393162394, + "grad_norm": 0.6161701083183289, + "learning_rate": 8.316043926977667e-06, + "loss": 0.4347, + "step": 122370 + }, + { + "epoch": 2.7239138176638176, + "grad_norm": 0.5894777774810791, + "learning_rate": 8.302744916755822e-06, + "loss": 0.4344, + "step": 122380 + }, + { + "epoch": 2.724136396011396, + "grad_norm": 0.5426009297370911, + "learning_rate": 8.289456323519762e-06, + "loss": 0.4598, + "step": 122390 + }, + { + "epoch": 2.7243589743589745, + "grad_norm": 0.5296372771263123, + "learning_rate": 8.276178147991598e-06, + "loss": 0.4751, + "step": 122400 + }, + { + "epoch": 2.7245815527065527, + "grad_norm": 0.643884003162384, + "learning_rate": 8.262910390892863e-06, + "loss": 0.4724, + "step": 122410 + }, + { + "epoch": 2.724804131054131, + "grad_norm": 0.42206260561943054, + "learning_rate": 8.249653052944517e-06, + "loss": 0.4427, + "step": 122420 + }, + { + "epoch": 2.7250267094017095, + "grad_norm": 0.5802809000015259, + "learning_rate": 8.236406134867003e-06, + "loss": 0.3634, + "step": 122430 + }, + { + "epoch": 2.7252492877492878, + "grad_norm": 0.45060160756111145, + "learning_rate": 8.22316963738019e-06, + "loss": 0.4823, + "step": 122440 + }, + { + "epoch": 2.725471866096866, + "grad_norm": 0.9987406134605408, + "learning_rate": 8.209943561203326e-06, + "loss": 0.491, + "step": 122450 + }, + { + "epoch": 2.7256944444444446, + "grad_norm": 0.5410594344139099, + "learning_rate": 8.196727907055124e-06, + "loss": 0.5319, + "step": 122460 + }, + { + "epoch": 2.725917022792023, + "grad_norm": 0.7260173559188843, + "learning_rate": 8.183522675653764e-06, + "loss": 0.5818, + "step": 122470 + }, + { + "epoch": 2.726139601139601, + "grad_norm": 0.5966166853904724, + "learning_rate": 8.170327867716788e-06, + "loss": 0.4377, + "step": 122480 + }, + { + "epoch": 2.7263621794871797, + "grad_norm": 0.5492916107177734, + "learning_rate": 8.157143483961239e-06, + "loss": 0.4088, + "step": 122490 + }, + { + "epoch": 2.726584757834758, + "grad_norm": 0.4295927584171295, + "learning_rate": 8.143969525103544e-06, + "loss": 0.47, + "step": 122500 + }, + { + "epoch": 2.726807336182336, + "grad_norm": 0.37031441926956177, + "learning_rate": 8.130805991859625e-06, + "loss": 0.4653, + "step": 122510 + }, + { + "epoch": 2.7270299145299144, + "grad_norm": 0.7110081315040588, + "learning_rate": 8.11765288494477e-06, + "loss": 0.4348, + "step": 122520 + }, + { + "epoch": 2.7272524928774926, + "grad_norm": 0.6578905582427979, + "learning_rate": 8.104510205073745e-06, + "loss": 0.4397, + "step": 122530 + }, + { + "epoch": 2.7274750712250713, + "grad_norm": 0.6855700016021729, + "learning_rate": 8.091377952960755e-06, + "loss": 0.4916, + "step": 122540 + }, + { + "epoch": 2.7276976495726495, + "grad_norm": 0.3897780179977417, + "learning_rate": 8.078256129319383e-06, + "loss": 0.4268, + "step": 122550 + }, + { + "epoch": 2.7279202279202277, + "grad_norm": 0.42931514978408813, + "learning_rate": 8.065144734862661e-06, + "loss": 0.3575, + "step": 122560 + }, + { + "epoch": 2.7281428062678064, + "grad_norm": 0.6458748579025269, + "learning_rate": 8.05204377030313e-06, + "loss": 0.4615, + "step": 122570 + }, + { + "epoch": 2.7283653846153846, + "grad_norm": 0.8001543879508972, + "learning_rate": 8.038953236352664e-06, + "loss": 0.565, + "step": 122580 + }, + { + "epoch": 2.728587962962963, + "grad_norm": 0.3280816972255707, + "learning_rate": 8.025873133722606e-06, + "loss": 0.4612, + "step": 122590 + }, + { + "epoch": 2.7288105413105415, + "grad_norm": 0.4333963394165039, + "learning_rate": 8.012803463123764e-06, + "loss": 0.4722, + "step": 122600 + }, + { + "epoch": 2.7290331196581197, + "grad_norm": 0.5036003589630127, + "learning_rate": 7.999744225266392e-06, + "loss": 0.3942, + "step": 122610 + }, + { + "epoch": 2.729255698005698, + "grad_norm": 0.7043850421905518, + "learning_rate": 7.986695420860057e-06, + "loss": 0.4804, + "step": 122620 + }, + { + "epoch": 2.7294782763532766, + "grad_norm": 0.5668345093727112, + "learning_rate": 7.973657050613881e-06, + "loss": 0.4459, + "step": 122630 + }, + { + "epoch": 2.7297008547008548, + "grad_norm": 0.5528721213340759, + "learning_rate": 7.960629115236384e-06, + "loss": 0.4946, + "step": 122640 + }, + { + "epoch": 2.729923433048433, + "grad_norm": 0.8780606985092163, + "learning_rate": 7.947611615435513e-06, + "loss": 0.4543, + "step": 122650 + }, + { + "epoch": 2.7301460113960117, + "grad_norm": 0.3899797201156616, + "learning_rate": 7.934604551918657e-06, + "loss": 0.4836, + "step": 122660 + }, + { + "epoch": 2.73036858974359, + "grad_norm": 0.5518772602081299, + "learning_rate": 7.921607925392605e-06, + "loss": 0.4694, + "step": 122670 + }, + { + "epoch": 2.730591168091168, + "grad_norm": 0.6687607169151306, + "learning_rate": 7.908621736563659e-06, + "loss": 0.4681, + "step": 122680 + }, + { + "epoch": 2.7308137464387463, + "grad_norm": 0.4973803162574768, + "learning_rate": 7.895645986137434e-06, + "loss": 0.5181, + "step": 122690 + }, + { + "epoch": 2.7310363247863245, + "grad_norm": 0.5222212076187134, + "learning_rate": 7.882680674819054e-06, + "loss": 0.4345, + "step": 122700 + }, + { + "epoch": 2.731258903133903, + "grad_norm": 1.0800514221191406, + "learning_rate": 7.86972580331311e-06, + "loss": 0.5682, + "step": 122710 + }, + { + "epoch": 2.7314814814814814, + "grad_norm": 0.4344554543495178, + "learning_rate": 7.856781372323551e-06, + "loss": 0.391, + "step": 122720 + }, + { + "epoch": 2.7317040598290596, + "grad_norm": 0.8073518872261047, + "learning_rate": 7.84384738255377e-06, + "loss": 0.45, + "step": 122730 + }, + { + "epoch": 2.7319266381766383, + "grad_norm": 0.5675332546234131, + "learning_rate": 7.830923834706627e-06, + "loss": 0.4147, + "step": 122740 + }, + { + "epoch": 2.7321492165242165, + "grad_norm": 0.4464460611343384, + "learning_rate": 7.818010729484426e-06, + "loss": 0.5413, + "step": 122750 + }, + { + "epoch": 2.7323717948717947, + "grad_norm": 0.5830329060554504, + "learning_rate": 7.805108067588829e-06, + "loss": 0.3051, + "step": 122760 + }, + { + "epoch": 2.7325943732193734, + "grad_norm": 0.6713405251502991, + "learning_rate": 7.792215849720985e-06, + "loss": 0.5106, + "step": 122770 + }, + { + "epoch": 2.7328169515669516, + "grad_norm": 0.6539282202720642, + "learning_rate": 7.77933407658149e-06, + "loss": 0.5021, + "step": 122780 + }, + { + "epoch": 2.73303952991453, + "grad_norm": 0.5176177620887756, + "learning_rate": 7.76646274887034e-06, + "loss": 0.3564, + "step": 122790 + }, + { + "epoch": 2.7332621082621085, + "grad_norm": 0.6762197613716125, + "learning_rate": 7.753601867286975e-06, + "loss": 0.3903, + "step": 122800 + }, + { + "epoch": 2.7334846866096867, + "grad_norm": 0.6328567862510681, + "learning_rate": 7.74075143253028e-06, + "loss": 0.4979, + "step": 122810 + }, + { + "epoch": 2.733707264957265, + "grad_norm": 0.5372480154037476, + "learning_rate": 7.727911445298542e-06, + "loss": 0.4814, + "step": 122820 + }, + { + "epoch": 2.7339298433048436, + "grad_norm": 0.5866361856460571, + "learning_rate": 7.71508190628949e-06, + "loss": 0.4676, + "step": 122830 + }, + { + "epoch": 2.734152421652422, + "grad_norm": 0.5748997330665588, + "learning_rate": 7.702262816200323e-06, + "loss": 0.4107, + "step": 122840 + }, + { + "epoch": 2.734375, + "grad_norm": 0.7153517603874207, + "learning_rate": 7.689454175727573e-06, + "loss": 0.455, + "step": 122850 + }, + { + "epoch": 2.734597578347578, + "grad_norm": 0.9207581281661987, + "learning_rate": 7.676655985567326e-06, + "loss": 0.4165, + "step": 122860 + }, + { + "epoch": 2.7348201566951564, + "grad_norm": 0.3741627037525177, + "learning_rate": 7.663868246415051e-06, + "loss": 0.4885, + "step": 122870 + }, + { + "epoch": 2.735042735042735, + "grad_norm": 0.5004806518554688, + "learning_rate": 7.651090958965612e-06, + "loss": 0.4983, + "step": 122880 + }, + { + "epoch": 2.7352653133903133, + "grad_norm": 0.7157542109489441, + "learning_rate": 7.638324123913387e-06, + "loss": 0.5098, + "step": 122890 + }, + { + "epoch": 2.7354878917378915, + "grad_norm": 0.5806912779808044, + "learning_rate": 7.625567741952067e-06, + "loss": 0.3866, + "step": 122900 + }, + { + "epoch": 2.73571047008547, + "grad_norm": 0.7059860229492188, + "learning_rate": 7.612821813774895e-06, + "loss": 0.444, + "step": 122910 + }, + { + "epoch": 2.7359330484330484, + "grad_norm": 0.6227195858955383, + "learning_rate": 7.600086340074475e-06, + "loss": 0.4417, + "step": 122920 + }, + { + "epoch": 2.7361556267806266, + "grad_norm": 0.5309995412826538, + "learning_rate": 7.587361321542874e-06, + "loss": 0.4916, + "step": 122930 + }, + { + "epoch": 2.7363782051282053, + "grad_norm": 0.3757227063179016, + "learning_rate": 7.574646758871562e-06, + "loss": 0.3326, + "step": 122940 + }, + { + "epoch": 2.7366007834757835, + "grad_norm": 0.5011971592903137, + "learning_rate": 7.561942652751475e-06, + "loss": 0.4943, + "step": 122950 + }, + { + "epoch": 2.7368233618233617, + "grad_norm": 0.5790525674819946, + "learning_rate": 7.549249003872993e-06, + "loss": 0.4616, + "step": 122960 + }, + { + "epoch": 2.7370459401709404, + "grad_norm": 0.502332329750061, + "learning_rate": 7.536565812925877e-06, + "loss": 0.5342, + "step": 122970 + }, + { + "epoch": 2.7372685185185186, + "grad_norm": 0.5204288363456726, + "learning_rate": 7.523893080599287e-06, + "loss": 0.3748, + "step": 122980 + }, + { + "epoch": 2.737491096866097, + "grad_norm": 0.717951238155365, + "learning_rate": 7.511230807581937e-06, + "loss": 0.4289, + "step": 122990 + }, + { + "epoch": 2.7377136752136755, + "grad_norm": 0.5042845606803894, + "learning_rate": 7.498578994561878e-06, + "loss": 0.4215, + "step": 123000 + }, + { + "epoch": 2.7379362535612537, + "grad_norm": 0.7990171313285828, + "learning_rate": 7.485937642226604e-06, + "loss": 0.5998, + "step": 123010 + }, + { + "epoch": 2.738158831908832, + "grad_norm": 1.1951872110366821, + "learning_rate": 7.473306751263098e-06, + "loss": 0.5114, + "step": 123020 + }, + { + "epoch": 2.73838141025641, + "grad_norm": 0.42899465560913086, + "learning_rate": 7.460686322357724e-06, + "loss": 0.3831, + "step": 123030 + }, + { + "epoch": 2.7386039886039883, + "grad_norm": 0.6046960353851318, + "learning_rate": 7.448076356196265e-06, + "loss": 0.4278, + "step": 123040 + }, + { + "epoch": 2.738826566951567, + "grad_norm": 0.42047208547592163, + "learning_rate": 7.435476853463974e-06, + "loss": 0.4256, + "step": 123050 + }, + { + "epoch": 2.7390491452991452, + "grad_norm": 0.46061971783638, + "learning_rate": 7.422887814845481e-06, + "loss": 0.4766, + "step": 123060 + }, + { + "epoch": 2.7392717236467234, + "grad_norm": 0.6004669070243835, + "learning_rate": 7.4103092410249485e-06, + "loss": 0.4347, + "step": 123070 + }, + { + "epoch": 2.739494301994302, + "grad_norm": 0.6010010242462158, + "learning_rate": 7.397741132685854e-06, + "loss": 0.5443, + "step": 123080 + }, + { + "epoch": 2.7397168803418803, + "grad_norm": 0.4315003454685211, + "learning_rate": 7.385183490511183e-06, + "loss": 0.4179, + "step": 123090 + }, + { + "epoch": 2.7399394586894585, + "grad_norm": 0.5793805122375488, + "learning_rate": 7.3726363151833454e-06, + "loss": 0.5219, + "step": 123100 + }, + { + "epoch": 2.740162037037037, + "grad_norm": 0.5742940902709961, + "learning_rate": 7.360099607384152e-06, + "loss": 0.3331, + "step": 123110 + }, + { + "epoch": 2.7403846153846154, + "grad_norm": 0.4294794201850891, + "learning_rate": 7.347573367794814e-06, + "loss": 0.4009, + "step": 123120 + }, + { + "epoch": 2.7406071937321936, + "grad_norm": 0.3603975772857666, + "learning_rate": 7.3350575970960515e-06, + "loss": 0.3148, + "step": 123130 + }, + { + "epoch": 2.7408297720797723, + "grad_norm": 0.6385037899017334, + "learning_rate": 7.322552295967966e-06, + "loss": 0.5181, + "step": 123140 + }, + { + "epoch": 2.7410523504273505, + "grad_norm": 0.44872191548347473, + "learning_rate": 7.310057465090148e-06, + "loss": 0.3141, + "step": 123150 + }, + { + "epoch": 2.7412749287749287, + "grad_norm": 0.6203681230545044, + "learning_rate": 7.297573105141542e-06, + "loss": 0.4231, + "step": 123160 + }, + { + "epoch": 2.7414975071225074, + "grad_norm": 0.4686487019062042, + "learning_rate": 7.285099216800584e-06, + "loss": 0.4527, + "step": 123170 + }, + { + "epoch": 2.7417200854700856, + "grad_norm": 0.5996116995811462, + "learning_rate": 7.2726358007450865e-06, + "loss": 0.4712, + "step": 123180 + }, + { + "epoch": 2.741942663817664, + "grad_norm": 0.512990415096283, + "learning_rate": 7.260182857652331e-06, + "loss": 0.5061, + "step": 123190 + }, + { + "epoch": 2.742165242165242, + "grad_norm": 0.454429566860199, + "learning_rate": 7.24774038819902e-06, + "loss": 0.4519, + "step": 123200 + }, + { + "epoch": 2.7423878205128203, + "grad_norm": 0.5068633556365967, + "learning_rate": 7.235308393061302e-06, + "loss": 0.5578, + "step": 123210 + }, + { + "epoch": 2.742610398860399, + "grad_norm": 0.41929909586906433, + "learning_rate": 7.222886872914703e-06, + "loss": 0.4626, + "step": 123220 + }, + { + "epoch": 2.742832977207977, + "grad_norm": 0.778678297996521, + "learning_rate": 7.210475828434304e-06, + "loss": 0.4833, + "step": 123230 + }, + { + "epoch": 2.7430555555555554, + "grad_norm": 0.5140986442565918, + "learning_rate": 7.198075260294413e-06, + "loss": 0.4896, + "step": 123240 + }, + { + "epoch": 2.743278133903134, + "grad_norm": 0.6002315282821655, + "learning_rate": 7.185685169168999e-06, + "loss": 0.4238, + "step": 123250 + }, + { + "epoch": 2.7435007122507122, + "grad_norm": 0.7444051504135132, + "learning_rate": 7.1733055557312574e-06, + "loss": 0.3522, + "step": 123260 + }, + { + "epoch": 2.7437232905982905, + "grad_norm": 0.5440506339073181, + "learning_rate": 7.160936420653963e-06, + "loss": 0.3381, + "step": 123270 + }, + { + "epoch": 2.743945868945869, + "grad_norm": 0.6836532354354858, + "learning_rate": 7.1485777646092435e-06, + "loss": 0.5332, + "step": 123280 + }, + { + "epoch": 2.7441684472934473, + "grad_norm": 0.4120360314846039, + "learning_rate": 7.136229588268673e-06, + "loss": 0.5638, + "step": 123290 + }, + { + "epoch": 2.7443910256410255, + "grad_norm": 0.5837830305099487, + "learning_rate": 7.1238918923032915e-06, + "loss": 0.5332, + "step": 123300 + }, + { + "epoch": 2.744613603988604, + "grad_norm": 0.5079546570777893, + "learning_rate": 7.11156467738352e-06, + "loss": 0.494, + "step": 123310 + }, + { + "epoch": 2.7448361823361824, + "grad_norm": 0.46274396777153015, + "learning_rate": 7.099247944179221e-06, + "loss": 0.4824, + "step": 123320 + }, + { + "epoch": 2.7450587606837606, + "grad_norm": 0.48522692918777466, + "learning_rate": 7.0869416933597055e-06, + "loss": 0.4459, + "step": 123330 + }, + { + "epoch": 2.7452813390313393, + "grad_norm": 0.5520329475402832, + "learning_rate": 7.074645925593704e-06, + "loss": 0.4302, + "step": 123340 + }, + { + "epoch": 2.7455039173789175, + "grad_norm": 0.7036489248275757, + "learning_rate": 7.0623606415493705e-06, + "loss": 0.4475, + "step": 123350 + }, + { + "epoch": 2.7457264957264957, + "grad_norm": 0.38540059328079224, + "learning_rate": 7.050085841894349e-06, + "loss": 0.3854, + "step": 123360 + }, + { + "epoch": 2.745949074074074, + "grad_norm": 0.49837902188301086, + "learning_rate": 7.0378215272955735e-06, + "loss": 0.3951, + "step": 123370 + }, + { + "epoch": 2.746171652421652, + "grad_norm": 0.7058712840080261, + "learning_rate": 7.025567698419555e-06, + "loss": 0.4343, + "step": 123380 + }, + { + "epoch": 2.746394230769231, + "grad_norm": 0.5129693746566772, + "learning_rate": 7.013324355932182e-06, + "loss": 0.4658, + "step": 123390 + }, + { + "epoch": 2.746616809116809, + "grad_norm": 0.4032246470451355, + "learning_rate": 7.001091500498724e-06, + "loss": 0.5098, + "step": 123400 + }, + { + "epoch": 2.7468393874643873, + "grad_norm": 0.5415821075439453, + "learning_rate": 6.98886913278396e-06, + "loss": 0.4953, + "step": 123410 + }, + { + "epoch": 2.747061965811966, + "grad_norm": 0.6676926612854004, + "learning_rate": 6.976657253452046e-06, + "loss": 0.5028, + "step": 123420 + }, + { + "epoch": 2.747284544159544, + "grad_norm": 0.49392446875572205, + "learning_rate": 6.964455863166586e-06, + "loss": 0.3837, + "step": 123430 + }, + { + "epoch": 2.7475071225071224, + "grad_norm": 0.7210075855255127, + "learning_rate": 6.952264962590649e-06, + "loss": 0.4951, + "step": 123440 + }, + { + "epoch": 2.747729700854701, + "grad_norm": 0.7871028780937195, + "learning_rate": 6.940084552386661e-06, + "loss": 0.5332, + "step": 123450 + }, + { + "epoch": 2.7479522792022792, + "grad_norm": 0.7726924419403076, + "learning_rate": 6.9279146332165146e-06, + "loss": 0.4536, + "step": 123460 + }, + { + "epoch": 2.7481748575498575, + "grad_norm": 0.8336293697357178, + "learning_rate": 6.915755205741548e-06, + "loss": 0.4544, + "step": 123470 + }, + { + "epoch": 2.748397435897436, + "grad_norm": 0.6057497262954712, + "learning_rate": 6.903606270622498e-06, + "loss": 0.6092, + "step": 123480 + }, + { + "epoch": 2.7486200142450143, + "grad_norm": 0.47200432419776917, + "learning_rate": 6.8914678285195935e-06, + "loss": 0.4223, + "step": 123490 + }, + { + "epoch": 2.7488425925925926, + "grad_norm": 0.46911531686782837, + "learning_rate": 6.8793398800923725e-06, + "loss": 0.4531, + "step": 123500 + }, + { + "epoch": 2.7490651709401708, + "grad_norm": 0.639143705368042, + "learning_rate": 6.8672224259999304e-06, + "loss": 0.333, + "step": 123510 + }, + { + "epoch": 2.7492877492877494, + "grad_norm": 0.5659869909286499, + "learning_rate": 6.855115466900741e-06, + "loss": 0.4955, + "step": 123520 + }, + { + "epoch": 2.7495103276353277, + "grad_norm": 0.6819111704826355, + "learning_rate": 6.8430190034527e-06, + "loss": 0.5726, + "step": 123530 + }, + { + "epoch": 2.749732905982906, + "grad_norm": 0.5177549123764038, + "learning_rate": 6.830933036313103e-06, + "loss": 0.4823, + "step": 123540 + }, + { + "epoch": 2.749955484330484, + "grad_norm": 0.511566698551178, + "learning_rate": 6.818857566138759e-06, + "loss": 0.4393, + "step": 123550 + }, + { + "epoch": 2.7501780626780628, + "grad_norm": 0.7504387497901917, + "learning_rate": 6.806792593585831e-06, + "loss": 0.4776, + "step": 123560 + }, + { + "epoch": 2.750400641025641, + "grad_norm": 0.6001155972480774, + "learning_rate": 6.794738119309952e-06, + "loss": 0.4133, + "step": 123570 + }, + { + "epoch": 2.750623219373219, + "grad_norm": 0.5214382410049438, + "learning_rate": 6.782694143966173e-06, + "loss": 0.3677, + "step": 123580 + }, + { + "epoch": 2.750845797720798, + "grad_norm": 0.32441210746765137, + "learning_rate": 6.770660668208973e-06, + "loss": 0.3922, + "step": 123590 + }, + { + "epoch": 2.751068376068376, + "grad_norm": 0.7337201237678528, + "learning_rate": 6.758637692692249e-06, + "loss": 0.4378, + "step": 123600 + }, + { + "epoch": 2.7512909544159543, + "grad_norm": 0.5562883615493774, + "learning_rate": 6.746625218069347e-06, + "loss": 0.4645, + "step": 123610 + }, + { + "epoch": 2.751513532763533, + "grad_norm": 0.5827807784080505, + "learning_rate": 6.7346232449930545e-06, + "loss": 0.3289, + "step": 123620 + }, + { + "epoch": 2.751736111111111, + "grad_norm": 0.44625237584114075, + "learning_rate": 6.722631774115518e-06, + "loss": 0.4546, + "step": 123630 + }, + { + "epoch": 2.7519586894586894, + "grad_norm": 0.5511494874954224, + "learning_rate": 6.710650806088415e-06, + "loss": 0.4813, + "step": 123640 + }, + { + "epoch": 2.752181267806268, + "grad_norm": 0.5771028995513916, + "learning_rate": 6.698680341562757e-06, + "loss": 0.452, + "step": 123650 + }, + { + "epoch": 2.7524038461538463, + "grad_norm": 0.4411100149154663, + "learning_rate": 6.686720381189071e-06, + "loss": 0.461, + "step": 123660 + }, + { + "epoch": 2.7526264245014245, + "grad_norm": 0.6261259913444519, + "learning_rate": 6.67477092561728e-06, + "loss": 0.4468, + "step": 123670 + }, + { + "epoch": 2.7528490028490027, + "grad_norm": 0.5436714291572571, + "learning_rate": 6.662831975496664e-06, + "loss": 0.5056, + "step": 123680 + }, + { + "epoch": 2.7530715811965814, + "grad_norm": 0.6230029463768005, + "learning_rate": 6.650903531476038e-06, + "loss": 0.4329, + "step": 123690 + }, + { + "epoch": 2.7532941595441596, + "grad_norm": 0.46451959013938904, + "learning_rate": 6.638985594203595e-06, + "loss": 0.4803, + "step": 123700 + }, + { + "epoch": 2.753516737891738, + "grad_norm": 0.33018794655799866, + "learning_rate": 6.627078164326972e-06, + "loss": 0.3603, + "step": 123710 + }, + { + "epoch": 2.753739316239316, + "grad_norm": 0.6304956078529358, + "learning_rate": 6.615181242493207e-06, + "loss": 0.4246, + "step": 123720 + }, + { + "epoch": 2.7539618945868947, + "grad_norm": 0.32348451018333435, + "learning_rate": 6.603294829348849e-06, + "loss": 0.4164, + "step": 123730 + }, + { + "epoch": 2.754184472934473, + "grad_norm": 0.49687933921813965, + "learning_rate": 6.591418925539739e-06, + "loss": 0.4356, + "step": 123740 + }, + { + "epoch": 2.754407051282051, + "grad_norm": 0.5507271885871887, + "learning_rate": 6.5795535317112686e-06, + "loss": 0.3441, + "step": 123750 + }, + { + "epoch": 2.7546296296296298, + "grad_norm": 0.313600093126297, + "learning_rate": 6.567698648508214e-06, + "loss": 0.4408, + "step": 123760 + }, + { + "epoch": 2.754852207977208, + "grad_norm": 0.7065009474754333, + "learning_rate": 6.5558542765747465e-06, + "loss": 0.5691, + "step": 123770 + }, + { + "epoch": 2.755074786324786, + "grad_norm": 0.8352875113487244, + "learning_rate": 6.544020416554508e-06, + "loss": 0.4224, + "step": 123780 + }, + { + "epoch": 2.755297364672365, + "grad_norm": 0.5489425659179688, + "learning_rate": 6.532197069090584e-06, + "loss": 0.5168, + "step": 123790 + }, + { + "epoch": 2.755519943019943, + "grad_norm": 0.8018131256103516, + "learning_rate": 6.520384234825438e-06, + "loss": 0.4949, + "step": 123800 + }, + { + "epoch": 2.7557425213675213, + "grad_norm": 0.7913144826889038, + "learning_rate": 6.5085819144010244e-06, + "loss": 0.424, + "step": 123810 + }, + { + "epoch": 2.7559650997151, + "grad_norm": 0.5006592869758606, + "learning_rate": 6.49679010845865e-06, + "loss": 0.4368, + "step": 123820 + }, + { + "epoch": 2.756187678062678, + "grad_norm": 0.5908501744270325, + "learning_rate": 6.485008817639116e-06, + "loss": 0.4645, + "step": 123830 + }, + { + "epoch": 2.7564102564102564, + "grad_norm": 0.6493932008743286, + "learning_rate": 6.473238042582619e-06, + "loss": 0.3913, + "step": 123840 + }, + { + "epoch": 2.7566328347578346, + "grad_norm": 0.6723030805587769, + "learning_rate": 6.461477783928804e-06, + "loss": 0.4737, + "step": 123850 + }, + { + "epoch": 2.7568554131054133, + "grad_norm": 0.3196379244327545, + "learning_rate": 6.449728042316716e-06, + "loss": 0.4074, + "step": 123860 + }, + { + "epoch": 2.7570779914529915, + "grad_norm": 0.42770472168922424, + "learning_rate": 6.437988818384865e-06, + "loss": 0.4899, + "step": 123870 + }, + { + "epoch": 2.7573005698005697, + "grad_norm": 0.8020942807197571, + "learning_rate": 6.426260112771165e-06, + "loss": 0.5244, + "step": 123880 + }, + { + "epoch": 2.757523148148148, + "grad_norm": 0.6081737279891968, + "learning_rate": 6.414541926112949e-06, + "loss": 0.3525, + "step": 123890 + }, + { + "epoch": 2.7577457264957266, + "grad_norm": 0.5558830499649048, + "learning_rate": 6.4028342590469966e-06, + "loss": 0.4875, + "step": 123900 + }, + { + "epoch": 2.757968304843305, + "grad_norm": 0.7631189823150635, + "learning_rate": 6.391137112209511e-06, + "loss": 0.4206, + "step": 123910 + }, + { + "epoch": 2.758190883190883, + "grad_norm": 0.5545490980148315, + "learning_rate": 6.379450486236138e-06, + "loss": 0.4263, + "step": 123920 + }, + { + "epoch": 2.7584134615384617, + "grad_norm": 0.39973217248916626, + "learning_rate": 6.367774381761926e-06, + "loss": 0.4443, + "step": 123930 + }, + { + "epoch": 2.75863603988604, + "grad_norm": 0.34016963839530945, + "learning_rate": 6.356108799421368e-06, + "loss": 0.4435, + "step": 123940 + }, + { + "epoch": 2.758858618233618, + "grad_norm": 0.5182162523269653, + "learning_rate": 6.344453739848399e-06, + "loss": 0.381, + "step": 123950 + }, + { + "epoch": 2.7590811965811968, + "grad_norm": 0.5975072979927063, + "learning_rate": 6.332809203676315e-06, + "loss": 0.4656, + "step": 123960 + }, + { + "epoch": 2.759303774928775, + "grad_norm": 0.5642514228820801, + "learning_rate": 6.321175191537942e-06, + "loss": 0.4357, + "step": 123970 + }, + { + "epoch": 2.759526353276353, + "grad_norm": 0.4777040183544159, + "learning_rate": 6.309551704065464e-06, + "loss": 0.4087, + "step": 123980 + }, + { + "epoch": 2.759748931623932, + "grad_norm": 0.5907963514328003, + "learning_rate": 6.297938741890486e-06, + "loss": 0.4484, + "step": 123990 + }, + { + "epoch": 2.75997150997151, + "grad_norm": 0.6896931529045105, + "learning_rate": 6.286336305644125e-06, + "loss": 0.4042, + "step": 124000 + }, + { + "epoch": 2.7601940883190883, + "grad_norm": 0.37854743003845215, + "learning_rate": 6.2747443959568106e-06, + "loss": 0.3652, + "step": 124010 + }, + { + "epoch": 2.760327635327635, + "eval_loss": 0.5210950970649719, + "eval_runtime": 337.1981, + "eval_samples_per_second": 7.014, + "eval_steps_per_second": 7.014, + "step": 124016 + }, + { + "epoch": 2.7604166666666665, + "grad_norm": 0.5546755194664001, + "learning_rate": 6.2631630134585066e-06, + "loss": 0.4984, + "step": 124020 + }, + { + "epoch": 2.760639245014245, + "grad_norm": 0.3803851902484894, + "learning_rate": 6.251592158778485e-06, + "loss": 0.3478, + "step": 124030 + }, + { + "epoch": 2.7608618233618234, + "grad_norm": 0.8168506622314453, + "learning_rate": 6.240031832545579e-06, + "loss": 0.5383, + "step": 124040 + }, + { + "epoch": 2.7610844017094016, + "grad_norm": 0.4964335560798645, + "learning_rate": 6.22848203538795e-06, + "loss": 0.4466, + "step": 124050 + }, + { + "epoch": 2.76130698005698, + "grad_norm": 0.4928797781467438, + "learning_rate": 6.216942767933232e-06, + "loss": 0.4293, + "step": 124060 + }, + { + "epoch": 2.7615295584045585, + "grad_norm": 0.5578243136405945, + "learning_rate": 6.205414030808476e-06, + "loss": 0.4597, + "step": 124070 + }, + { + "epoch": 2.7617521367521367, + "grad_norm": 0.9077683687210083, + "learning_rate": 6.1938958246402055e-06, + "loss": 0.4468, + "step": 124080 + }, + { + "epoch": 2.761974715099715, + "grad_norm": 0.40201684832572937, + "learning_rate": 6.182388150054252e-06, + "loss": 0.4142, + "step": 124090 + }, + { + "epoch": 2.7621972934472936, + "grad_norm": 0.38440248370170593, + "learning_rate": 6.170891007676005e-06, + "loss": 0.4585, + "step": 124100 + }, + { + "epoch": 2.762419871794872, + "grad_norm": 0.4817364811897278, + "learning_rate": 6.15940439813023e-06, + "loss": 0.4741, + "step": 124110 + }, + { + "epoch": 2.76264245014245, + "grad_norm": 0.5801695585250854, + "learning_rate": 6.147928322041096e-06, + "loss": 0.4417, + "step": 124120 + }, + { + "epoch": 2.7628650284900287, + "grad_norm": 0.5268610119819641, + "learning_rate": 6.136462780032215e-06, + "loss": 0.5868, + "step": 124130 + }, + { + "epoch": 2.763087606837607, + "grad_norm": 0.3276066482067108, + "learning_rate": 6.125007772726665e-06, + "loss": 0.4841, + "step": 124140 + }, + { + "epoch": 2.763310185185185, + "grad_norm": 0.8135481476783752, + "learning_rate": 6.113563300746928e-06, + "loss": 0.5494, + "step": 124150 + }, + { + "epoch": 2.763532763532764, + "grad_norm": 0.6948876976966858, + "learning_rate": 6.102129364714881e-06, + "loss": 0.384, + "step": 124160 + }, + { + "epoch": 2.763755341880342, + "grad_norm": 0.7384427189826965, + "learning_rate": 6.090705965251831e-06, + "loss": 0.4289, + "step": 124170 + }, + { + "epoch": 2.76397792022792, + "grad_norm": 0.6335643529891968, + "learning_rate": 6.079293102978567e-06, + "loss": 0.4731, + "step": 124180 + }, + { + "epoch": 2.7642004985754984, + "grad_norm": 0.566062867641449, + "learning_rate": 6.067890778515261e-06, + "loss": 0.5385, + "step": 124190 + }, + { + "epoch": 2.7644230769230766, + "grad_norm": 0.39977380633354187, + "learning_rate": 6.056498992481552e-06, + "loss": 0.4217, + "step": 124200 + }, + { + "epoch": 2.7646456552706553, + "grad_norm": 0.48577919602394104, + "learning_rate": 6.045117745496431e-06, + "loss": 0.4004, + "step": 124210 + }, + { + "epoch": 2.7648682336182335, + "grad_norm": 0.6411281824111938, + "learning_rate": 6.033747038178428e-06, + "loss": 0.3765, + "step": 124220 + }, + { + "epoch": 2.7650908119658117, + "grad_norm": 0.8137752413749695, + "learning_rate": 6.022386871145358e-06, + "loss": 0.5462, + "step": 124230 + }, + { + "epoch": 2.7653133903133904, + "grad_norm": 0.6225287914276123, + "learning_rate": 6.011037245014594e-06, + "loss": 0.461, + "step": 124240 + }, + { + "epoch": 2.7655359686609686, + "grad_norm": 0.3489503860473633, + "learning_rate": 5.999698160402889e-06, + "loss": 0.349, + "step": 124250 + }, + { + "epoch": 2.765758547008547, + "grad_norm": 0.5511321425437927, + "learning_rate": 5.988369617926371e-06, + "loss": 0.4547, + "step": 124260 + }, + { + "epoch": 2.7659811253561255, + "grad_norm": 0.5289965867996216, + "learning_rate": 5.977051618200702e-06, + "loss": 0.4483, + "step": 124270 + }, + { + "epoch": 2.7662037037037037, + "grad_norm": 0.686833381652832, + "learning_rate": 5.965744161840881e-06, + "loss": 0.4185, + "step": 124280 + }, + { + "epoch": 2.766426282051282, + "grad_norm": 0.5750662088394165, + "learning_rate": 5.95444724946137e-06, + "loss": 0.4877, + "step": 124290 + }, + { + "epoch": 2.7666488603988606, + "grad_norm": 0.5627291798591614, + "learning_rate": 5.943160881676036e-06, + "loss": 0.4081, + "step": 124300 + }, + { + "epoch": 2.766871438746439, + "grad_norm": 0.8569827079772949, + "learning_rate": 5.9318850590982076e-06, + "loss": 0.4886, + "step": 124310 + }, + { + "epoch": 2.767094017094017, + "grad_norm": 0.6511522531509399, + "learning_rate": 5.9206197823406196e-06, + "loss": 0.6063, + "step": 124320 + }, + { + "epoch": 2.7673165954415957, + "grad_norm": 0.6511116027832031, + "learning_rate": 5.909365052015403e-06, + "loss": 0.3868, + "step": 124330 + }, + { + "epoch": 2.767539173789174, + "grad_norm": 0.5759599804878235, + "learning_rate": 5.8981208687342024e-06, + "loss": 0.5286, + "step": 124340 + }, + { + "epoch": 2.767761752136752, + "grad_norm": 0.4088830351829529, + "learning_rate": 5.886887233108018e-06, + "loss": 0.5101, + "step": 124350 + }, + { + "epoch": 2.7679843304843303, + "grad_norm": 0.39041629433631897, + "learning_rate": 5.875664145747295e-06, + "loss": 0.3891, + "step": 124360 + }, + { + "epoch": 2.7682069088319086, + "grad_norm": 0.5441334843635559, + "learning_rate": 5.864451607261901e-06, + "loss": 0.4568, + "step": 124370 + }, + { + "epoch": 2.7684294871794872, + "grad_norm": 0.5711949467658997, + "learning_rate": 5.853249618261103e-06, + "loss": 0.4501, + "step": 124380 + }, + { + "epoch": 2.7686520655270654, + "grad_norm": 0.7167291045188904, + "learning_rate": 5.842058179353682e-06, + "loss": 0.476, + "step": 124390 + }, + { + "epoch": 2.7688746438746437, + "grad_norm": 0.373648077249527, + "learning_rate": 5.830877291147752e-06, + "loss": 0.3384, + "step": 124400 + }, + { + "epoch": 2.7690972222222223, + "grad_norm": 0.7929762005805969, + "learning_rate": 5.819706954250936e-06, + "loss": 0.3829, + "step": 124410 + }, + { + "epoch": 2.7693198005698005, + "grad_norm": 0.5502915978431702, + "learning_rate": 5.808547169270173e-06, + "loss": 0.4779, + "step": 124420 + }, + { + "epoch": 2.7695423789173788, + "grad_norm": 0.43421173095703125, + "learning_rate": 5.797397936811933e-06, + "loss": 0.3928, + "step": 124430 + }, + { + "epoch": 2.7697649572649574, + "grad_norm": 0.49342089891433716, + "learning_rate": 5.786259257482085e-06, + "loss": 0.3114, + "step": 124440 + }, + { + "epoch": 2.7699875356125356, + "grad_norm": 0.793108344078064, + "learning_rate": 5.775131131885903e-06, + "loss": 0.538, + "step": 124450 + }, + { + "epoch": 2.770210113960114, + "grad_norm": 0.6837170124053955, + "learning_rate": 5.764013560628079e-06, + "loss": 0.6071, + "step": 124460 + }, + { + "epoch": 2.7704326923076925, + "grad_norm": 0.7336123585700989, + "learning_rate": 5.752906544312753e-06, + "loss": 0.4091, + "step": 124470 + }, + { + "epoch": 2.7706552706552707, + "grad_norm": 0.567870557308197, + "learning_rate": 5.741810083543531e-06, + "loss": 0.4292, + "step": 124480 + }, + { + "epoch": 2.770877849002849, + "grad_norm": 0.667264997959137, + "learning_rate": 5.730724178923353e-06, + "loss": 0.5032, + "step": 124490 + }, + { + "epoch": 2.7711004273504276, + "grad_norm": 0.5979461073875427, + "learning_rate": 5.719648831054691e-06, + "loss": 0.3892, + "step": 124500 + }, + { + "epoch": 2.771323005698006, + "grad_norm": 0.7902460694313049, + "learning_rate": 5.708584040539333e-06, + "loss": 0.5792, + "step": 124510 + }, + { + "epoch": 2.771545584045584, + "grad_norm": 0.5519484877586365, + "learning_rate": 5.697529807978574e-06, + "loss": 0.4082, + "step": 124520 + }, + { + "epoch": 2.7717681623931623, + "grad_norm": 0.4593891203403473, + "learning_rate": 5.686486133973112e-06, + "loss": 0.4129, + "step": 124530 + }, + { + "epoch": 2.7719907407407405, + "grad_norm": 0.5422836542129517, + "learning_rate": 5.675453019123089e-06, + "loss": 0.555, + "step": 124540 + }, + { + "epoch": 2.772213319088319, + "grad_norm": 0.7721577286720276, + "learning_rate": 5.664430464028004e-06, + "loss": 0.5353, + "step": 124550 + }, + { + "epoch": 2.7724358974358974, + "grad_norm": 0.42286717891693115, + "learning_rate": 5.653418469286864e-06, + "loss": 0.3813, + "step": 124560 + }, + { + "epoch": 2.7726584757834756, + "grad_norm": 0.7902956604957581, + "learning_rate": 5.64241703549806e-06, + "loss": 0.4462, + "step": 124570 + }, + { + "epoch": 2.7728810541310542, + "grad_norm": 0.4630812108516693, + "learning_rate": 5.6314261632594455e-06, + "loss": 0.4752, + "step": 124580 + }, + { + "epoch": 2.7731036324786325, + "grad_norm": 0.702799379825592, + "learning_rate": 5.620445853168232e-06, + "loss": 0.4306, + "step": 124590 + }, + { + "epoch": 2.7733262108262107, + "grad_norm": 0.7227345705032349, + "learning_rate": 5.609476105821099e-06, + "loss": 0.3994, + "step": 124600 + }, + { + "epoch": 2.7735487891737893, + "grad_norm": 0.485037237405777, + "learning_rate": 5.59851692181419e-06, + "loss": 0.4176, + "step": 124610 + }, + { + "epoch": 2.7737713675213675, + "grad_norm": 0.5751772522926331, + "learning_rate": 5.587568301743007e-06, + "loss": 0.3802, + "step": 124620 + }, + { + "epoch": 2.7739939458689458, + "grad_norm": 0.5980727076530457, + "learning_rate": 5.576630246202519e-06, + "loss": 0.4938, + "step": 124630 + }, + { + "epoch": 2.7742165242165244, + "grad_norm": 0.7590274214744568, + "learning_rate": 5.565702755787116e-06, + "loss": 0.4358, + "step": 124640 + }, + { + "epoch": 2.7744391025641026, + "grad_norm": 0.48052945733070374, + "learning_rate": 5.554785831090592e-06, + "loss": 0.5187, + "step": 124650 + }, + { + "epoch": 2.774661680911681, + "grad_norm": 0.44331279397010803, + "learning_rate": 5.543879472706181e-06, + "loss": 0.387, + "step": 124660 + }, + { + "epoch": 2.7748842592592595, + "grad_norm": 0.47672250866889954, + "learning_rate": 5.532983681226567e-06, + "loss": 0.4659, + "step": 124670 + }, + { + "epoch": 2.7751068376068377, + "grad_norm": 0.5024214386940002, + "learning_rate": 5.522098457243807e-06, + "loss": 0.4339, + "step": 124680 + }, + { + "epoch": 2.775329415954416, + "grad_norm": 0.406444251537323, + "learning_rate": 5.511223801349408e-06, + "loss": 0.3837, + "step": 124690 + }, + { + "epoch": 2.775551994301994, + "grad_norm": 0.5859958529472351, + "learning_rate": 5.50035971413434e-06, + "loss": 0.4784, + "step": 124700 + }, + { + "epoch": 2.7757745726495724, + "grad_norm": 0.5342255234718323, + "learning_rate": 5.489506196188931e-06, + "loss": 0.4102, + "step": 124710 + }, + { + "epoch": 2.775997150997151, + "grad_norm": 0.516022801399231, + "learning_rate": 5.478663248103022e-06, + "loss": 0.5015, + "step": 124720 + }, + { + "epoch": 2.7762197293447293, + "grad_norm": 0.6034132242202759, + "learning_rate": 5.467830870465762e-06, + "loss": 0.5297, + "step": 124730 + }, + { + "epoch": 2.7764423076923075, + "grad_norm": 0.4431053698062897, + "learning_rate": 5.457009063865837e-06, + "loss": 0.425, + "step": 124740 + }, + { + "epoch": 2.776664886039886, + "grad_norm": 0.6430148482322693, + "learning_rate": 5.446197828891309e-06, + "loss": 0.4861, + "step": 124750 + }, + { + "epoch": 2.7768874643874644, + "grad_norm": 0.4936456084251404, + "learning_rate": 5.435397166129641e-06, + "loss": 0.4151, + "step": 124760 + }, + { + "epoch": 2.7771100427350426, + "grad_norm": 0.5719223022460938, + "learning_rate": 5.4246070761677646e-06, + "loss": 0.5108, + "step": 124770 + }, + { + "epoch": 2.7773326210826212, + "grad_norm": 0.6354074478149414, + "learning_rate": 5.413827559592055e-06, + "loss": 0.4843, + "step": 124780 + }, + { + "epoch": 2.7775551994301995, + "grad_norm": 0.6604524850845337, + "learning_rate": 5.403058616988243e-06, + "loss": 0.4567, + "step": 124790 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.666455864906311, + "learning_rate": 5.392300248941551e-06, + "loss": 0.5346, + "step": 124800 + }, + { + "epoch": 2.7780003561253563, + "grad_norm": 0.45406487584114075, + "learning_rate": 5.3815524560365314e-06, + "loss": 0.4984, + "step": 124810 + }, + { + "epoch": 2.7782229344729346, + "grad_norm": 0.5581161379814148, + "learning_rate": 5.370815238857296e-06, + "loss": 0.5025, + "step": 124820 + }, + { + "epoch": 2.7784455128205128, + "grad_norm": 0.48750677704811096, + "learning_rate": 5.3600885979872895e-06, + "loss": 0.4852, + "step": 124830 + }, + { + "epoch": 2.7786680911680914, + "grad_norm": 0.5511132478713989, + "learning_rate": 5.3493725340094e-06, + "loss": 0.5341, + "step": 124840 + }, + { + "epoch": 2.7788906695156697, + "grad_norm": 0.6001371741294861, + "learning_rate": 5.338667047505963e-06, + "loss": 0.5021, + "step": 124850 + }, + { + "epoch": 2.779113247863248, + "grad_norm": 0.6603468656539917, + "learning_rate": 5.327972139058712e-06, + "loss": 0.4132, + "step": 124860 + }, + { + "epoch": 2.779335826210826, + "grad_norm": 0.5646976232528687, + "learning_rate": 5.317287809248828e-06, + "loss": 0.481, + "step": 124870 + }, + { + "epoch": 2.7795584045584043, + "grad_norm": 0.7938178777694702, + "learning_rate": 5.306614058656889e-06, + "loss": 0.5433, + "step": 124880 + }, + { + "epoch": 2.779780982905983, + "grad_norm": 0.6014885306358337, + "learning_rate": 5.295950887862922e-06, + "loss": 0.4261, + "step": 124890 + }, + { + "epoch": 2.780003561253561, + "grad_norm": 1.25307035446167, + "learning_rate": 5.285298297446395e-06, + "loss": 0.482, + "step": 124900 + }, + { + "epoch": 2.7802261396011394, + "grad_norm": 0.49934521317481995, + "learning_rate": 5.274656287986135e-06, + "loss": 0.4825, + "step": 124910 + }, + { + "epoch": 2.780448717948718, + "grad_norm": 0.7475011944770813, + "learning_rate": 5.264024860060501e-06, + "loss": 0.4176, + "step": 124920 + }, + { + "epoch": 2.7806712962962963, + "grad_norm": 0.4853910207748413, + "learning_rate": 5.253404014247143e-06, + "loss": 0.3782, + "step": 124930 + }, + { + "epoch": 2.7808938746438745, + "grad_norm": 0.3415036201477051, + "learning_rate": 5.242793751123265e-06, + "loss": 0.4971, + "step": 124940 + }, + { + "epoch": 2.781116452991453, + "grad_norm": 0.4938315451145172, + "learning_rate": 5.232194071265384e-06, + "loss": 0.3815, + "step": 124950 + }, + { + "epoch": 2.7813390313390314, + "grad_norm": 0.6822963356971741, + "learning_rate": 5.2216049752495275e-06, + "loss": 0.465, + "step": 124960 + }, + { + "epoch": 2.7815616096866096, + "grad_norm": 0.44059741497039795, + "learning_rate": 5.211026463651103e-06, + "loss": 0.4366, + "step": 124970 + }, + { + "epoch": 2.7817841880341883, + "grad_norm": 0.5339191555976868, + "learning_rate": 5.200458537044961e-06, + "loss": 0.4865, + "step": 124980 + }, + { + "epoch": 2.7820067663817665, + "grad_norm": 0.5560976266860962, + "learning_rate": 5.189901196005376e-06, + "loss": 0.4412, + "step": 124990 + }, + { + "epoch": 2.7822293447293447, + "grad_norm": 0.725497841835022, + "learning_rate": 5.179354441106066e-06, + "loss": 0.3774, + "step": 125000 + }, + { + "epoch": 2.7824519230769234, + "grad_norm": 0.5456880927085876, + "learning_rate": 5.168818272920084e-06, + "loss": 0.4396, + "step": 125010 + }, + { + "epoch": 2.7826745014245016, + "grad_norm": 0.6306502819061279, + "learning_rate": 5.1582926920200395e-06, + "loss": 0.5471, + "step": 125020 + }, + { + "epoch": 2.78289707977208, + "grad_norm": 0.5285366773605347, + "learning_rate": 5.1477776989778515e-06, + "loss": 0.4517, + "step": 125030 + }, + { + "epoch": 2.783119658119658, + "grad_norm": 0.5569385886192322, + "learning_rate": 5.1372732943649524e-06, + "loss": 0.4586, + "step": 125040 + }, + { + "epoch": 2.783342236467236, + "grad_norm": 0.4127381145954132, + "learning_rate": 5.126779478752131e-06, + "loss": 0.503, + "step": 125050 + }, + { + "epoch": 2.783564814814815, + "grad_norm": 0.6115913987159729, + "learning_rate": 5.116296252709685e-06, + "loss": 0.4978, + "step": 125060 + }, + { + "epoch": 2.783787393162393, + "grad_norm": 0.6389346122741699, + "learning_rate": 5.105823616807226e-06, + "loss": 0.5112, + "step": 125070 + }, + { + "epoch": 2.7840099715099713, + "grad_norm": 0.7028676271438599, + "learning_rate": 5.095361571613833e-06, + "loss": 0.5135, + "step": 125080 + }, + { + "epoch": 2.78423254985755, + "grad_norm": 0.6609174609184265, + "learning_rate": 5.0849101176980496e-06, + "loss": 0.5216, + "step": 125090 + }, + { + "epoch": 2.784455128205128, + "grad_norm": 0.49764448404312134, + "learning_rate": 5.074469255627823e-06, + "loss": 0.4357, + "step": 125100 + }, + { + "epoch": 2.7846777065527064, + "grad_norm": 0.4654216766357422, + "learning_rate": 5.064038985970499e-06, + "loss": 0.3966, + "step": 125110 + }, + { + "epoch": 2.784900284900285, + "grad_norm": 0.7057489156723022, + "learning_rate": 5.0536193092928894e-06, + "loss": 0.4451, + "step": 125120 + }, + { + "epoch": 2.7851228632478633, + "grad_norm": 0.5709713101387024, + "learning_rate": 5.043210226161211e-06, + "loss": 0.4062, + "step": 125130 + }, + { + "epoch": 2.7853454415954415, + "grad_norm": 0.535394549369812, + "learning_rate": 5.032811737141074e-06, + "loss": 0.4252, + "step": 125140 + }, + { + "epoch": 2.78556801994302, + "grad_norm": 0.5492165684700012, + "learning_rate": 5.022423842797563e-06, + "loss": 0.3341, + "step": 125150 + }, + { + "epoch": 2.7857905982905984, + "grad_norm": 0.3868101239204407, + "learning_rate": 5.012046543695137e-06, + "loss": 0.3926, + "step": 125160 + }, + { + "epoch": 2.7860131766381766, + "grad_norm": 0.5796651840209961, + "learning_rate": 5.001679840397744e-06, + "loss": 0.4226, + "step": 125170 + }, + { + "epoch": 2.786235754985755, + "grad_norm": 0.4309646785259247, + "learning_rate": 4.991323733468689e-06, + "loss": 0.4942, + "step": 125180 + }, + { + "epoch": 2.7864583333333335, + "grad_norm": 0.430446594953537, + "learning_rate": 4.980978223470745e-06, + "loss": 0.3336, + "step": 125190 + }, + { + "epoch": 2.7866809116809117, + "grad_norm": 0.3631274104118347, + "learning_rate": 4.9706433109661276e-06, + "loss": 0.3256, + "step": 125200 + }, + { + "epoch": 2.78690349002849, + "grad_norm": 0.3939351737499237, + "learning_rate": 4.960318996516411e-06, + "loss": 0.4333, + "step": 125210 + }, + { + "epoch": 2.787126068376068, + "grad_norm": 0.6610859036445618, + "learning_rate": 4.9500052806825905e-06, + "loss": 0.3629, + "step": 125220 + }, + { + "epoch": 2.787348646723647, + "grad_norm": 0.8630814552307129, + "learning_rate": 4.939702164025173e-06, + "loss": 0.3412, + "step": 125230 + }, + { + "epoch": 2.787571225071225, + "grad_norm": 0.7305048704147339, + "learning_rate": 4.929409647103999e-06, + "loss": 0.5311, + "step": 125240 + }, + { + "epoch": 2.7877938034188032, + "grad_norm": 0.5739668607711792, + "learning_rate": 4.919127730478423e-06, + "loss": 0.4532, + "step": 125250 + }, + { + "epoch": 2.788016381766382, + "grad_norm": 0.7734736204147339, + "learning_rate": 4.90885641470713e-06, + "loss": 0.5096, + "step": 125260 + }, + { + "epoch": 2.78823896011396, + "grad_norm": 0.4135821759700775, + "learning_rate": 4.898595700348297e-06, + "loss": 0.4665, + "step": 125270 + }, + { + "epoch": 2.7884615384615383, + "grad_norm": 0.6077858209609985, + "learning_rate": 4.8883455879595195e-06, + "loss": 0.3647, + "step": 125280 + }, + { + "epoch": 2.788684116809117, + "grad_norm": 0.7480467557907104, + "learning_rate": 4.8781060780977325e-06, + "loss": 0.478, + "step": 125290 + }, + { + "epoch": 2.788906695156695, + "grad_norm": 0.49442699551582336, + "learning_rate": 4.867877171319402e-06, + "loss": 0.473, + "step": 125300 + }, + { + "epoch": 2.7891292735042734, + "grad_norm": 0.38600459694862366, + "learning_rate": 4.85765886818037e-06, + "loss": 0.365, + "step": 125310 + }, + { + "epoch": 2.789351851851852, + "grad_norm": 0.4062456786632538, + "learning_rate": 4.847451169235906e-06, + "loss": 0.4105, + "step": 125320 + }, + { + "epoch": 2.7895744301994303, + "grad_norm": 0.49349337816238403, + "learning_rate": 4.83725407504072e-06, + "loss": 0.5138, + "step": 125330 + }, + { + "epoch": 2.7897970085470085, + "grad_norm": 0.7541506886482239, + "learning_rate": 4.827067586148903e-06, + "loss": 0.4975, + "step": 125340 + }, + { + "epoch": 2.7900195868945867, + "grad_norm": 0.7903211116790771, + "learning_rate": 4.816891703114013e-06, + "loss": 0.4628, + "step": 125350 + }, + { + "epoch": 2.7902421652421654, + "grad_norm": 0.7652298808097839, + "learning_rate": 4.806726426489006e-06, + "loss": 0.5695, + "step": 125360 + }, + { + "epoch": 2.7904647435897436, + "grad_norm": 0.7200684547424316, + "learning_rate": 4.7965717568262844e-06, + "loss": 0.4031, + "step": 125370 + }, + { + "epoch": 2.790687321937322, + "grad_norm": 0.41406622529029846, + "learning_rate": 4.786427694677653e-06, + "loss": 0.5313, + "step": 125380 + }, + { + "epoch": 2.7909099002849, + "grad_norm": 0.546970009803772, + "learning_rate": 4.7762942405943365e-06, + "loss": 0.4762, + "step": 125390 + }, + { + "epoch": 2.7911324786324787, + "grad_norm": 0.5090673565864563, + "learning_rate": 4.766171395127006e-06, + "loss": 0.4604, + "step": 125400 + }, + { + "epoch": 2.791355056980057, + "grad_norm": 0.5817758440971375, + "learning_rate": 4.756059158825754e-06, + "loss": 0.4538, + "step": 125410 + }, + { + "epoch": 2.791577635327635, + "grad_norm": 0.4301309287548065, + "learning_rate": 4.74595753224012e-06, + "loss": 0.4941, + "step": 125420 + }, + { + "epoch": 2.791800213675214, + "grad_norm": 0.555213987827301, + "learning_rate": 4.7358665159189516e-06, + "loss": 0.447, + "step": 125430 + }, + { + "epoch": 2.792022792022792, + "grad_norm": 0.5680949687957764, + "learning_rate": 4.725786110410657e-06, + "loss": 0.5646, + "step": 125440 + }, + { + "epoch": 2.7922453703703702, + "grad_norm": 0.568378746509552, + "learning_rate": 4.715716316262997e-06, + "loss": 0.4267, + "step": 125450 + }, + { + "epoch": 2.792467948717949, + "grad_norm": 0.6673989295959473, + "learning_rate": 4.7056571340232225e-06, + "loss": 0.4868, + "step": 125460 + }, + { + "epoch": 2.792690527065527, + "grad_norm": 0.5990675687789917, + "learning_rate": 4.6956085642378745e-06, + "loss": 0.4915, + "step": 125470 + }, + { + "epoch": 2.7929131054131053, + "grad_norm": 0.4197758734226227, + "learning_rate": 4.685570607453027e-06, + "loss": 0.448, + "step": 125480 + }, + { + "epoch": 2.793135683760684, + "grad_norm": 0.6081216335296631, + "learning_rate": 4.675543264214222e-06, + "loss": 0.486, + "step": 125490 + }, + { + "epoch": 2.793358262108262, + "grad_norm": 0.5012257099151611, + "learning_rate": 4.665526535066245e-06, + "loss": 0.4603, + "step": 125500 + }, + { + "epoch": 2.7935808404558404, + "grad_norm": 0.5224705934524536, + "learning_rate": 4.655520420553483e-06, + "loss": 0.3978, + "step": 125510 + }, + { + "epoch": 2.7938034188034186, + "grad_norm": 0.5236812233924866, + "learning_rate": 4.645524921219635e-06, + "loss": 0.4805, + "step": 125520 + }, + { + "epoch": 2.7940259971509973, + "grad_norm": 0.6798905730247498, + "learning_rate": 4.635540037607911e-06, + "loss": 0.5028, + "step": 125530 + }, + { + "epoch": 2.7942485754985755, + "grad_norm": 0.3394434452056885, + "learning_rate": 4.6255657702608535e-06, + "loss": 0.4286, + "step": 125540 + }, + { + "epoch": 2.7944711538461537, + "grad_norm": 0.42189839482307434, + "learning_rate": 4.615602119720519e-06, + "loss": 0.4352, + "step": 125550 + }, + { + "epoch": 2.794693732193732, + "grad_norm": 0.5317621827125549, + "learning_rate": 4.605649086528319e-06, + "loss": 0.448, + "step": 125560 + }, + { + "epoch": 2.7949163105413106, + "grad_norm": 0.7038161158561707, + "learning_rate": 4.595706671225086e-06, + "loss": 0.5898, + "step": 125570 + }, + { + "epoch": 2.795138888888889, + "grad_norm": 0.44456297159194946, + "learning_rate": 4.5857748743511234e-06, + "loss": 0.403, + "step": 125580 + }, + { + "epoch": 2.795361467236467, + "grad_norm": 0.6659712791442871, + "learning_rate": 4.575853696446131e-06, + "loss": 0.3923, + "step": 125590 + }, + { + "epoch": 2.7955840455840457, + "grad_norm": 0.4893523156642914, + "learning_rate": 4.565943138049233e-06, + "loss": 0.3691, + "step": 125600 + }, + { + "epoch": 2.795806623931624, + "grad_norm": 0.6819437146186829, + "learning_rate": 4.556043199698956e-06, + "loss": 0.5384, + "step": 125610 + }, + { + "epoch": 2.796029202279202, + "grad_norm": 0.3867335319519043, + "learning_rate": 4.54615388193329e-06, + "loss": 0.423, + "step": 125620 + }, + { + "epoch": 2.796251780626781, + "grad_norm": 0.614936888217926, + "learning_rate": 4.5362751852896516e-06, + "loss": 0.4349, + "step": 125630 + }, + { + "epoch": 2.796474358974359, + "grad_norm": 0.513956606388092, + "learning_rate": 4.526407110304831e-06, + "loss": 0.3862, + "step": 125640 + }, + { + "epoch": 2.7966969373219372, + "grad_norm": 0.5959755778312683, + "learning_rate": 4.516549657515046e-06, + "loss": 0.4267, + "step": 125650 + }, + { + "epoch": 2.796919515669516, + "grad_norm": 0.6045657992362976, + "learning_rate": 4.506702827456e-06, + "loss": 0.3683, + "step": 125660 + }, + { + "epoch": 2.797142094017094, + "grad_norm": 0.5449283719062805, + "learning_rate": 4.496866620662754e-06, + "loss": 0.4474, + "step": 125670 + }, + { + "epoch": 2.7973646723646723, + "grad_norm": 0.5704584717750549, + "learning_rate": 4.487041037669814e-06, + "loss": 0.3552, + "step": 125680 + }, + { + "epoch": 2.7975872507122506, + "grad_norm": 0.8659281730651855, + "learning_rate": 4.477226079011154e-06, + "loss": 0.4936, + "step": 125690 + }, + { + "epoch": 2.7978098290598292, + "grad_norm": 0.4428424537181854, + "learning_rate": 4.467421745220058e-06, + "loss": 0.422, + "step": 125700 + }, + { + "epoch": 2.7980324074074074, + "grad_norm": 0.660078763961792, + "learning_rate": 4.457628036829342e-06, + "loss": 0.5, + "step": 125710 + }, + { + "epoch": 2.7982549857549857, + "grad_norm": 0.46205437183380127, + "learning_rate": 4.447844954371206e-06, + "loss": 0.468, + "step": 125720 + }, + { + "epoch": 2.798477564102564, + "grad_norm": 0.49835318326950073, + "learning_rate": 4.438072498377244e-06, + "loss": 0.3658, + "step": 125730 + }, + { + "epoch": 2.7987001424501425, + "grad_norm": 0.7425667643547058, + "learning_rate": 4.428310669378543e-06, + "loss": 0.5988, + "step": 125740 + }, + { + "epoch": 2.7989227207977208, + "grad_norm": 0.3822995126247406, + "learning_rate": 4.418559467905503e-06, + "loss": 0.5071, + "step": 125750 + }, + { + "epoch": 2.799145299145299, + "grad_norm": 0.5722664594650269, + "learning_rate": 4.408818894488076e-06, + "loss": 0.4471, + "step": 125760 + }, + { + "epoch": 2.7993678774928776, + "grad_norm": 0.29368162155151367, + "learning_rate": 4.399088949655572e-06, + "loss": 0.401, + "step": 125770 + }, + { + "epoch": 2.799590455840456, + "grad_norm": 0.33703193068504333, + "learning_rate": 4.38936963393668e-06, + "loss": 0.5182, + "step": 125780 + }, + { + "epoch": 2.799813034188034, + "grad_norm": 0.43243011832237244, + "learning_rate": 4.379660947859554e-06, + "loss": 0.4736, + "step": 125790 + }, + { + "epoch": 2.8000356125356127, + "grad_norm": 0.5151998400688171, + "learning_rate": 4.3699628919518175e-06, + "loss": 0.4869, + "step": 125800 + }, + { + "epoch": 2.800258190883191, + "grad_norm": 0.6595456600189209, + "learning_rate": 4.360275466740449e-06, + "loss": 0.4484, + "step": 125810 + }, + { + "epoch": 2.800480769230769, + "grad_norm": 0.796154797077179, + "learning_rate": 4.350598672751849e-06, + "loss": 0.4647, + "step": 125820 + }, + { + "epoch": 2.800703347578348, + "grad_norm": 0.4498961567878723, + "learning_rate": 4.340932510511908e-06, + "loss": 0.4975, + "step": 125830 + }, + { + "epoch": 2.800925925925926, + "grad_norm": 0.46631789207458496, + "learning_rate": 4.331276980545873e-06, + "loss": 0.4019, + "step": 125840 + }, + { + "epoch": 2.8011485042735043, + "grad_norm": 0.7194572687149048, + "learning_rate": 4.321632083378413e-06, + "loss": 0.4679, + "step": 125850 + }, + { + "epoch": 2.8013710826210825, + "grad_norm": 0.6665747761726379, + "learning_rate": 4.311997819533664e-06, + "loss": 0.4112, + "step": 125860 + }, + { + "epoch": 2.8015936609686607, + "grad_norm": 0.6622679233551025, + "learning_rate": 4.302374189535141e-06, + "loss": 0.53, + "step": 125870 + }, + { + "epoch": 2.8018162393162394, + "grad_norm": 0.38641971349716187, + "learning_rate": 4.2927611939058034e-06, + "loss": 0.4984, + "step": 125880 + }, + { + "epoch": 2.8020388176638176, + "grad_norm": 0.5284295082092285, + "learning_rate": 4.283158833168055e-06, + "loss": 0.3875, + "step": 125890 + }, + { + "epoch": 2.802261396011396, + "grad_norm": 0.7484142184257507, + "learning_rate": 4.273567107843657e-06, + "loss": 0.5029, + "step": 125900 + }, + { + "epoch": 2.8024839743589745, + "grad_norm": 0.5397800207138062, + "learning_rate": 4.26398601845388e-06, + "loss": 0.3654, + "step": 125910 + }, + { + "epoch": 2.8027065527065527, + "grad_norm": 0.5725739002227783, + "learning_rate": 4.25441556551931e-06, + "loss": 0.3719, + "step": 125920 + }, + { + "epoch": 2.802929131054131, + "grad_norm": 0.5289227962493896, + "learning_rate": 4.244855749560062e-06, + "loss": 0.4642, + "step": 125930 + }, + { + "epoch": 2.8031517094017095, + "grad_norm": 0.7622392177581787, + "learning_rate": 4.235306571095588e-06, + "loss": 0.4956, + "step": 125940 + }, + { + "epoch": 2.8033742877492878, + "grad_norm": 0.48069143295288086, + "learning_rate": 4.225768030644828e-06, + "loss": 0.3907, + "step": 125950 + }, + { + "epoch": 2.803596866096866, + "grad_norm": 0.5886992812156677, + "learning_rate": 4.2162401287261015e-06, + "loss": 0.4595, + "step": 125960 + }, + { + "epoch": 2.8038194444444446, + "grad_norm": 0.5862254500389099, + "learning_rate": 4.206722865857171e-06, + "loss": 0.4601, + "step": 125970 + }, + { + "epoch": 2.804042022792023, + "grad_norm": 0.37320825457572937, + "learning_rate": 4.19721624255518e-06, + "loss": 0.3773, + "step": 125980 + }, + { + "epoch": 2.804264601139601, + "grad_norm": 0.7246841192245483, + "learning_rate": 4.187720259336781e-06, + "loss": 0.4905, + "step": 125990 + }, + { + "epoch": 2.8044871794871797, + "grad_norm": 0.588402509689331, + "learning_rate": 4.178234916717938e-06, + "loss": 0.5129, + "step": 126000 + }, + { + "epoch": 2.804709757834758, + "grad_norm": 0.6401274800300598, + "learning_rate": 4.168760215214129e-06, + "loss": 0.5112, + "step": 126010 + }, + { + "epoch": 2.804932336182336, + "grad_norm": 0.683003306388855, + "learning_rate": 4.159296155340187e-06, + "loss": 0.4992, + "step": 126020 + }, + { + "epoch": 2.8051549145299144, + "grad_norm": 0.5846920609474182, + "learning_rate": 4.149842737610432e-06, + "loss": 0.5349, + "step": 126030 + }, + { + "epoch": 2.8053774928774926, + "grad_norm": 0.45215484499931335, + "learning_rate": 4.140399962538543e-06, + "loss": 0.4911, + "step": 126040 + }, + { + "epoch": 2.8056000712250713, + "grad_norm": 0.6347735524177551, + "learning_rate": 4.130967830637666e-06, + "loss": 0.5719, + "step": 126050 + }, + { + "epoch": 2.8058226495726495, + "grad_norm": 0.7858456969261169, + "learning_rate": 4.121546342420346e-06, + "loss": 0.4583, + "step": 126060 + }, + { + "epoch": 2.8060452279202277, + "grad_norm": 0.5836461186408997, + "learning_rate": 4.112135498398528e-06, + "loss": 0.4243, + "step": 126070 + }, + { + "epoch": 2.8062678062678064, + "grad_norm": 0.47294196486473083, + "learning_rate": 4.102735299083649e-06, + "loss": 0.4377, + "step": 126080 + }, + { + "epoch": 2.8064903846153846, + "grad_norm": 0.5034077763557434, + "learning_rate": 4.093345744986499e-06, + "loss": 0.4291, + "step": 126090 + }, + { + "epoch": 2.806712962962963, + "grad_norm": 0.38051632046699524, + "learning_rate": 4.083966836617315e-06, + "loss": 0.4075, + "step": 126100 + }, + { + "epoch": 2.8069355413105415, + "grad_norm": 0.7089518308639526, + "learning_rate": 4.074598574485778e-06, + "loss": 0.4402, + "step": 126110 + }, + { + "epoch": 2.8071581196581197, + "grad_norm": 0.5025211572647095, + "learning_rate": 4.0652409591009245e-06, + "loss": 0.4942, + "step": 126120 + }, + { + "epoch": 2.807380698005698, + "grad_norm": 0.43359848856925964, + "learning_rate": 4.0558939909712825e-06, + "loss": 0.5245, + "step": 126130 + }, + { + "epoch": 2.8076032763532766, + "grad_norm": 0.43336302042007446, + "learning_rate": 4.0465576706047785e-06, + "loss": 0.4345, + "step": 126140 + }, + { + "epoch": 2.8078258547008548, + "grad_norm": 0.6243006587028503, + "learning_rate": 4.037231998508717e-06, + "loss": 0.5161, + "step": 126150 + }, + { + "epoch": 2.808048433048433, + "grad_norm": 0.5328023433685303, + "learning_rate": 4.027916975189916e-06, + "loss": 0.4368, + "step": 126160 + }, + { + "epoch": 2.8082710113960117, + "grad_norm": 0.6565059423446655, + "learning_rate": 4.018612601154525e-06, + "loss": 0.4876, + "step": 126170 + }, + { + "epoch": 2.80849358974359, + "grad_norm": 0.4544715881347656, + "learning_rate": 4.0093188769081635e-06, + "loss": 0.4173, + "step": 126180 + }, + { + "epoch": 2.808716168091168, + "grad_norm": 0.38822659850120544, + "learning_rate": 4.000035802955871e-06, + "loss": 0.5016, + "step": 126190 + }, + { + "epoch": 2.8089387464387463, + "grad_norm": 0.5639353394508362, + "learning_rate": 3.990763379802087e-06, + "loss": 0.4352, + "step": 126200 + }, + { + "epoch": 2.8091613247863245, + "grad_norm": 0.38232603669166565, + "learning_rate": 3.981501607950655e-06, + "loss": 0.4434, + "step": 126210 + }, + { + "epoch": 2.809383903133903, + "grad_norm": 0.6549859642982483, + "learning_rate": 3.972250487904905e-06, + "loss": 0.4758, + "step": 126220 + }, + { + "epoch": 2.8096064814814814, + "grad_norm": 0.5064232349395752, + "learning_rate": 3.963010020167546e-06, + "loss": 0.4107, + "step": 126230 + }, + { + "epoch": 2.8098290598290596, + "grad_norm": 0.514802873134613, + "learning_rate": 3.953780205240709e-06, + "loss": 0.4204, + "step": 126240 + }, + { + "epoch": 2.8100516381766383, + "grad_norm": 0.48119041323661804, + "learning_rate": 3.9445610436259494e-06, + "loss": 0.3383, + "step": 126250 + }, + { + "epoch": 2.8102742165242165, + "grad_norm": 0.5888558030128479, + "learning_rate": 3.9353525358242434e-06, + "loss": 0.4728, + "step": 126260 + }, + { + "epoch": 2.8104967948717947, + "grad_norm": 0.4223555624485016, + "learning_rate": 3.926154682335992e-06, + "loss": 0.4459, + "step": 126270 + }, + { + "epoch": 2.8107193732193734, + "grad_norm": 0.5040433406829834, + "learning_rate": 3.916967483660994e-06, + "loss": 0.5122, + "step": 126280 + }, + { + "epoch": 2.8109419515669516, + "grad_norm": 0.643981397151947, + "learning_rate": 3.907790940298495e-06, + "loss": 0.3608, + "step": 126290 + }, + { + "epoch": 2.81116452991453, + "grad_norm": 0.5450314283370972, + "learning_rate": 3.898625052747185e-06, + "loss": 0.3875, + "step": 126300 + }, + { + "epoch": 2.8113871082621085, + "grad_norm": 0.9616496562957764, + "learning_rate": 3.88946982150511e-06, + "loss": 0.4265, + "step": 126310 + }, + { + "epoch": 2.8116096866096867, + "grad_norm": 0.6152632832527161, + "learning_rate": 3.8803252470697825e-06, + "loss": 0.4989, + "step": 126320 + }, + { + "epoch": 2.811832264957265, + "grad_norm": 0.8327240347862244, + "learning_rate": 3.8711913299381615e-06, + "loss": 0.4178, + "step": 126330 + }, + { + "epoch": 2.8120548433048436, + "grad_norm": 0.4893125593662262, + "learning_rate": 3.862068070606539e-06, + "loss": 0.4356, + "step": 126340 + }, + { + "epoch": 2.812277421652422, + "grad_norm": 0.3900088667869568, + "learning_rate": 3.8529554695706956e-06, + "loss": 0.4796, + "step": 126350 + }, + { + "epoch": 2.8125, + "grad_norm": 0.4981417655944824, + "learning_rate": 3.843853527325836e-06, + "loss": 0.4961, + "step": 126360 + }, + { + "epoch": 2.812722578347578, + "grad_norm": 0.5793061256408691, + "learning_rate": 3.834762244366563e-06, + "loss": 0.407, + "step": 126370 + }, + { + "epoch": 2.8129451566951564, + "grad_norm": 0.5003716349601746, + "learning_rate": 3.825681621186905e-06, + "loss": 0.5477, + "step": 126380 + }, + { + "epoch": 2.813167735042735, + "grad_norm": 0.6106581091880798, + "learning_rate": 3.816611658280289e-06, + "loss": 0.4214, + "step": 126390 + }, + { + "epoch": 2.8133903133903133, + "grad_norm": 0.526982843875885, + "learning_rate": 3.80755235613961e-06, + "loss": 0.5292, + "step": 126400 + }, + { + "epoch": 2.8136128917378915, + "grad_norm": 0.5547360777854919, + "learning_rate": 3.7985037152571403e-06, + "loss": 0.453, + "step": 126410 + }, + { + "epoch": 2.81383547008547, + "grad_norm": 0.6486905217170715, + "learning_rate": 3.789465736124598e-06, + "loss": 0.4772, + "step": 126420 + }, + { + "epoch": 2.8140580484330484, + "grad_norm": 0.5603926181793213, + "learning_rate": 3.780438419233101e-06, + "loss": 0.3599, + "step": 126430 + }, + { + "epoch": 2.8142806267806266, + "grad_norm": 0.5745129585266113, + "learning_rate": 3.7714217650732354e-06, + "loss": 0.4661, + "step": 126440 + }, + { + "epoch": 2.8145032051282053, + "grad_norm": 0.5068896412849426, + "learning_rate": 3.7624157741349197e-06, + "loss": 0.4888, + "step": 126450 + }, + { + "epoch": 2.8147257834757835, + "grad_norm": 0.5847540497779846, + "learning_rate": 3.7534204469076074e-06, + "loss": 0.5063, + "step": 126460 + }, + { + "epoch": 2.8149483618233617, + "grad_norm": 0.550115704536438, + "learning_rate": 3.7444357838800846e-06, + "loss": 0.4651, + "step": 126470 + }, + { + "epoch": 2.8151709401709404, + "grad_norm": 0.8214982748031616, + "learning_rate": 3.7354617855405614e-06, + "loss": 0.4216, + "step": 126480 + }, + { + "epoch": 2.8153935185185186, + "grad_norm": 0.6252423524856567, + "learning_rate": 3.7264984523767142e-06, + "loss": 0.5309, + "step": 126490 + }, + { + "epoch": 2.815616096866097, + "grad_norm": 0.5870510935783386, + "learning_rate": 3.7175457848756424e-06, + "loss": 0.4906, + "step": 126500 + }, + { + "epoch": 2.8158386752136755, + "grad_norm": 0.5593652129173279, + "learning_rate": 3.7086037835238007e-06, + "loss": 0.4567, + "step": 126510 + }, + { + "epoch": 2.8160612535612537, + "grad_norm": 0.3342607915401459, + "learning_rate": 3.6996724488071342e-06, + "loss": 0.5018, + "step": 126520 + }, + { + "epoch": 2.816283831908832, + "grad_norm": 0.510809600353241, + "learning_rate": 3.6907517812109436e-06, + "loss": 0.3862, + "step": 126530 + }, + { + "epoch": 2.81650641025641, + "grad_norm": 0.6610252261161804, + "learning_rate": 3.681841781220019e-06, + "loss": 0.4706, + "step": 126540 + }, + { + "epoch": 2.8167289886039883, + "grad_norm": 0.6215569376945496, + "learning_rate": 3.672942449318528e-06, + "loss": 0.4252, + "step": 126550 + }, + { + "epoch": 2.816951566951567, + "grad_norm": 0.5946906208992004, + "learning_rate": 3.6640537859900627e-06, + "loss": 0.4338, + "step": 126560 + }, + { + "epoch": 2.8171741452991452, + "grad_norm": 0.6003249883651733, + "learning_rate": 3.6551757917176357e-06, + "loss": 0.3925, + "step": 126570 + }, + { + "epoch": 2.8173967236467234, + "grad_norm": 0.6325357556343079, + "learning_rate": 3.6463084669837057e-06, + "loss": 0.4601, + "step": 126580 + }, + { + "epoch": 2.817619301994302, + "grad_norm": 0.6173017621040344, + "learning_rate": 3.6374518122701095e-06, + "loss": 0.4204, + "step": 126590 + }, + { + "epoch": 2.8178418803418803, + "grad_norm": 0.509817898273468, + "learning_rate": 3.628605828058129e-06, + "loss": 0.4388, + "step": 126600 + }, + { + "epoch": 2.8180644586894585, + "grad_norm": 0.6608301997184753, + "learning_rate": 3.619770514828469e-06, + "loss": 0.5231, + "step": 126610 + }, + { + "epoch": 2.818287037037037, + "grad_norm": 0.471277117729187, + "learning_rate": 3.610945873061233e-06, + "loss": 0.406, + "step": 126620 + }, + { + "epoch": 2.8185096153846154, + "grad_norm": 0.47932931780815125, + "learning_rate": 3.6021319032359724e-06, + "loss": 0.3938, + "step": 126630 + }, + { + "epoch": 2.8187321937321936, + "grad_norm": 0.5314711928367615, + "learning_rate": 3.5933286058316808e-06, + "loss": 0.3801, + "step": 126640 + }, + { + "epoch": 2.8189547720797723, + "grad_norm": 0.42228272557258606, + "learning_rate": 3.5845359813266643e-06, + "loss": 0.4342, + "step": 126650 + }, + { + "epoch": 2.8191773504273505, + "grad_norm": 0.6404412388801575, + "learning_rate": 3.5757540301987415e-06, + "loss": 0.4596, + "step": 126660 + }, + { + "epoch": 2.8193999287749287, + "grad_norm": 0.6327511072158813, + "learning_rate": 3.566982752925152e-06, + "loss": 0.5203, + "step": 126670 + }, + { + "epoch": 2.8196225071225074, + "grad_norm": 0.5190104246139526, + "learning_rate": 3.5582221499825598e-06, + "loss": 0.5352, + "step": 126680 + }, + { + "epoch": 2.8198450854700856, + "grad_norm": 0.5785208344459534, + "learning_rate": 3.5494722218469614e-06, + "loss": 0.4493, + "step": 126690 + }, + { + "epoch": 2.820067663817664, + "grad_norm": 0.447017639875412, + "learning_rate": 3.5407329689938653e-06, + "loss": 0.3606, + "step": 126700 + }, + { + "epoch": 2.820290242165242, + "grad_norm": 0.6020628809928894, + "learning_rate": 3.5320043918981804e-06, + "loss": 0.3456, + "step": 126710 + }, + { + "epoch": 2.820334757834758, + "eval_loss": 0.5204442739486694, + "eval_runtime": 337.1596, + "eval_samples_per_second": 7.014, + "eval_steps_per_second": 7.014, + "step": 126712 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 0.4100450575351715, + "learning_rate": 3.5232864910341943e-06, + "loss": 0.4373, + "step": 126720 + }, + { + "epoch": 2.820735398860399, + "grad_norm": 0.6168124079704285, + "learning_rate": 3.514579266875684e-06, + "loss": 0.4102, + "step": 126730 + }, + { + "epoch": 2.820957977207977, + "grad_norm": 0.5441197752952576, + "learning_rate": 3.5058827198957812e-06, + "loss": 0.5041, + "step": 126740 + }, + { + "epoch": 2.8211805555555554, + "grad_norm": 0.4099632799625397, + "learning_rate": 3.497196850567064e-06, + "loss": 0.4067, + "step": 126750 + }, + { + "epoch": 2.821403133903134, + "grad_norm": 0.5914316177368164, + "learning_rate": 3.4885216593615323e-06, + "loss": 0.5025, + "step": 126760 + }, + { + "epoch": 2.8216257122507122, + "grad_norm": 0.6284675598144531, + "learning_rate": 3.4798571467506314e-06, + "loss": 0.4211, + "step": 126770 + }, + { + "epoch": 2.8218482905982905, + "grad_norm": 0.47363045811653137, + "learning_rate": 3.4712033132051403e-06, + "loss": 0.5034, + "step": 126780 + }, + { + "epoch": 2.822070868945869, + "grad_norm": 0.4073488712310791, + "learning_rate": 3.4625601591953716e-06, + "loss": 0.452, + "step": 126790 + }, + { + "epoch": 2.8222934472934473, + "grad_norm": 0.45261600613594055, + "learning_rate": 3.4539276851909496e-06, + "loss": 0.4788, + "step": 126800 + }, + { + "epoch": 2.8225160256410255, + "grad_norm": 0.563221275806427, + "learning_rate": 3.4453058916610103e-06, + "loss": 0.4369, + "step": 126810 + }, + { + "epoch": 2.822738603988604, + "grad_norm": 0.6849488019943237, + "learning_rate": 3.436694779074068e-06, + "loss": 0.5502, + "step": 126820 + }, + { + "epoch": 2.8229611823361824, + "grad_norm": 0.5914343595504761, + "learning_rate": 3.4280943478980364e-06, + "loss": 0.4897, + "step": 126830 + }, + { + "epoch": 2.8231837606837606, + "grad_norm": 0.45933276414871216, + "learning_rate": 3.419504598600276e-06, + "loss": 0.4959, + "step": 126840 + }, + { + "epoch": 2.8234063390313393, + "grad_norm": 0.4319513440132141, + "learning_rate": 3.4109255316475463e-06, + "loss": 0.4694, + "step": 126850 + }, + { + "epoch": 2.8236289173789175, + "grad_norm": 0.4329933822154999, + "learning_rate": 3.4023571475060746e-06, + "loss": 0.3935, + "step": 126860 + }, + { + "epoch": 2.8238514957264957, + "grad_norm": 0.6808799505233765, + "learning_rate": 3.3937994466414656e-06, + "loss": 0.5708, + "step": 126870 + }, + { + "epoch": 2.824074074074074, + "grad_norm": 0.36101189255714417, + "learning_rate": 3.385252429518726e-06, + "loss": 0.4141, + "step": 126880 + }, + { + "epoch": 2.824296652421652, + "grad_norm": 0.6104711890220642, + "learning_rate": 3.3767160966023503e-06, + "loss": 0.5179, + "step": 126890 + }, + { + "epoch": 2.824519230769231, + "grad_norm": 0.6297375559806824, + "learning_rate": 3.36819044835619e-06, + "loss": 0.5327, + "step": 126900 + }, + { + "epoch": 2.824741809116809, + "grad_norm": 0.7188384532928467, + "learning_rate": 3.3596754852435187e-06, + "loss": 0.4808, + "step": 126910 + }, + { + "epoch": 2.8249643874643873, + "grad_norm": 0.429747998714447, + "learning_rate": 3.351171207727055e-06, + "loss": 0.4285, + "step": 126920 + }, + { + "epoch": 2.825186965811966, + "grad_norm": 0.5166102647781372, + "learning_rate": 3.342677616268919e-06, + "loss": 0.4007, + "step": 126930 + }, + { + "epoch": 2.825409544159544, + "grad_norm": 0.5530314445495605, + "learning_rate": 3.334194711330696e-06, + "loss": 0.4761, + "step": 126940 + }, + { + "epoch": 2.8256321225071224, + "grad_norm": 0.5777900815010071, + "learning_rate": 3.325722493373307e-06, + "loss": 0.4773, + "step": 126950 + }, + { + "epoch": 2.825854700854701, + "grad_norm": 0.4906090497970581, + "learning_rate": 3.3172609628571826e-06, + "loss": 0.4418, + "step": 126960 + }, + { + "epoch": 2.8260772792022792, + "grad_norm": 0.5480250716209412, + "learning_rate": 3.308810120242112e-06, + "loss": 0.483, + "step": 126970 + }, + { + "epoch": 2.8262998575498575, + "grad_norm": 0.5232762098312378, + "learning_rate": 3.3003699659873046e-06, + "loss": 0.4827, + "step": 126980 + }, + { + "epoch": 2.826522435897436, + "grad_norm": 0.5735209584236145, + "learning_rate": 3.291940500551416e-06, + "loss": 0.4374, + "step": 126990 + }, + { + "epoch": 2.8267450142450143, + "grad_norm": 0.5852814316749573, + "learning_rate": 3.2835217243925244e-06, + "loss": 0.4975, + "step": 127000 + }, + { + "epoch": 2.8269675925925926, + "grad_norm": 0.7762939929962158, + "learning_rate": 3.275113637968086e-06, + "loss": 0.4723, + "step": 127010 + }, + { + "epoch": 2.8271901709401708, + "grad_norm": 0.6684213280677795, + "learning_rate": 3.2667162417350238e-06, + "loss": 0.5022, + "step": 127020 + }, + { + "epoch": 2.8274127492877494, + "grad_norm": 0.6481077075004578, + "learning_rate": 3.2583295361496845e-06, + "loss": 0.442, + "step": 127030 + }, + { + "epoch": 2.8276353276353277, + "grad_norm": 0.6304945945739746, + "learning_rate": 3.24995352166777e-06, + "loss": 0.5479, + "step": 127040 + }, + { + "epoch": 2.827857905982906, + "grad_norm": 0.45239803194999695, + "learning_rate": 3.2415881987444274e-06, + "loss": 0.5192, + "step": 127050 + }, + { + "epoch": 2.828080484330484, + "grad_norm": 0.5424669981002808, + "learning_rate": 3.2332335678342484e-06, + "loss": 0.4948, + "step": 127060 + }, + { + "epoch": 2.8283030626780628, + "grad_norm": 0.5759827494621277, + "learning_rate": 3.2248896293912477e-06, + "loss": 0.4737, + "step": 127070 + }, + { + "epoch": 2.828525641025641, + "grad_norm": 0.6620804667472839, + "learning_rate": 3.2165563838688408e-06, + "loss": 0.556, + "step": 127080 + }, + { + "epoch": 2.828748219373219, + "grad_norm": 0.5302000641822815, + "learning_rate": 3.2082338317198425e-06, + "loss": 0.5109, + "step": 127090 + }, + { + "epoch": 2.828970797720798, + "grad_norm": 0.6622860431671143, + "learning_rate": 3.199921973396536e-06, + "loss": 0.4877, + "step": 127100 + }, + { + "epoch": 2.829193376068376, + "grad_norm": 0.707613468170166, + "learning_rate": 3.1916208093505595e-06, + "loss": 0.4645, + "step": 127110 + }, + { + "epoch": 2.8294159544159543, + "grad_norm": 0.3570280075073242, + "learning_rate": 3.1833303400330415e-06, + "loss": 0.4153, + "step": 127120 + }, + { + "epoch": 2.829638532763533, + "grad_norm": 0.5294520258903503, + "learning_rate": 3.1750505658944664e-06, + "loss": 0.4884, + "step": 127130 + }, + { + "epoch": 2.829861111111111, + "grad_norm": 0.5556965470314026, + "learning_rate": 3.1667814873847624e-06, + "loss": 0.4469, + "step": 127140 + }, + { + "epoch": 2.8300836894586894, + "grad_norm": 0.5460103750228882, + "learning_rate": 3.1585231049532817e-06, + "loss": 0.3631, + "step": 127150 + }, + { + "epoch": 2.830306267806268, + "grad_norm": 0.6050337553024292, + "learning_rate": 3.1502754190488205e-06, + "loss": 0.4802, + "step": 127160 + }, + { + "epoch": 2.8305288461538463, + "grad_norm": 0.4571405351161957, + "learning_rate": 3.1420384301195316e-06, + "loss": 0.4772, + "step": 127170 + }, + { + "epoch": 2.8307514245014245, + "grad_norm": 0.4969148635864258, + "learning_rate": 3.133812138613057e-06, + "loss": 0.3877, + "step": 127180 + }, + { + "epoch": 2.8309740028490027, + "grad_norm": 0.5661829710006714, + "learning_rate": 3.1255965449763503e-06, + "loss": 0.5466, + "step": 127190 + }, + { + "epoch": 2.8311965811965814, + "grad_norm": 0.3782590329647064, + "learning_rate": 3.117391649655921e-06, + "loss": 0.3612, + "step": 127200 + }, + { + "epoch": 2.8314191595441596, + "grad_norm": 0.42014560103416443, + "learning_rate": 3.10919745309759e-06, + "loss": 0.4288, + "step": 127210 + }, + { + "epoch": 2.831641737891738, + "grad_norm": 0.5029391050338745, + "learning_rate": 3.1010139557466455e-06, + "loss": 0.4194, + "step": 127220 + }, + { + "epoch": 2.831864316239316, + "grad_norm": 0.4752350151538849, + "learning_rate": 3.0928411580477988e-06, + "loss": 0.5007, + "step": 127230 + }, + { + "epoch": 2.8320868945868947, + "grad_norm": 0.5849156975746155, + "learning_rate": 3.0846790604451837e-06, + "loss": 0.4155, + "step": 127240 + }, + { + "epoch": 2.832309472934473, + "grad_norm": 0.6458626389503479, + "learning_rate": 3.0765276633822894e-06, + "loss": 0.492, + "step": 127250 + }, + { + "epoch": 2.832532051282051, + "grad_norm": 0.41135790944099426, + "learning_rate": 3.0683869673020947e-06, + "loss": 0.3612, + "step": 127260 + }, + { + "epoch": 2.8327546296296298, + "grad_norm": 0.5405125617980957, + "learning_rate": 3.0602569726469578e-06, + "loss": 0.4437, + "step": 127270 + }, + { + "epoch": 2.832977207977208, + "grad_norm": 0.8160549402236938, + "learning_rate": 3.05213767985868e-06, + "loss": 0.5167, + "step": 127280 + }, + { + "epoch": 2.833199786324786, + "grad_norm": 0.6462587118148804, + "learning_rate": 3.044029089378486e-06, + "loss": 0.4281, + "step": 127290 + }, + { + "epoch": 2.833422364672365, + "grad_norm": 0.5424714088439941, + "learning_rate": 3.03593120164698e-06, + "loss": 0.477, + "step": 127300 + }, + { + "epoch": 2.833644943019943, + "grad_norm": 0.6870628595352173, + "learning_rate": 3.0278440171042087e-06, + "loss": 0.5667, + "step": 127310 + }, + { + "epoch": 2.8338675213675213, + "grad_norm": 0.3487043082714081, + "learning_rate": 3.019767536189666e-06, + "loss": 0.3682, + "step": 127320 + }, + { + "epoch": 2.8340900997151, + "grad_norm": 0.6313628554344177, + "learning_rate": 3.011701759342178e-06, + "loss": 0.4717, + "step": 127330 + }, + { + "epoch": 2.834312678062678, + "grad_norm": 0.49724081158638, + "learning_rate": 3.003646687000106e-06, + "loss": 0.426, + "step": 127340 + }, + { + "epoch": 2.8345352564102564, + "grad_norm": 0.5283458232879639, + "learning_rate": 2.995602319601121e-06, + "loss": 0.4111, + "step": 127350 + }, + { + "epoch": 2.8347578347578346, + "grad_norm": 0.5319889783859253, + "learning_rate": 2.987568657582385e-06, + "loss": 0.6391, + "step": 127360 + }, + { + "epoch": 2.8349804131054133, + "grad_norm": 0.7493406534194946, + "learning_rate": 2.9795457013804593e-06, + "loss": 0.4456, + "step": 127370 + }, + { + "epoch": 2.8352029914529915, + "grad_norm": 0.5402820706367493, + "learning_rate": 2.9715334514313076e-06, + "loss": 0.5811, + "step": 127380 + }, + { + "epoch": 2.8354255698005697, + "grad_norm": 0.366567462682724, + "learning_rate": 2.963531908170314e-06, + "loss": 0.4182, + "step": 127390 + }, + { + "epoch": 2.835648148148148, + "grad_norm": 0.4801349341869354, + "learning_rate": 2.9555410720323086e-06, + "loss": 0.5394, + "step": 127400 + }, + { + "epoch": 2.8358707264957266, + "grad_norm": 0.5683795213699341, + "learning_rate": 2.9475609434514993e-06, + "loss": 0.4981, + "step": 127410 + }, + { + "epoch": 2.836093304843305, + "grad_norm": 0.67473304271698, + "learning_rate": 2.9395915228615402e-06, + "loss": 0.5979, + "step": 127420 + }, + { + "epoch": 2.836315883190883, + "grad_norm": 0.4936923682689667, + "learning_rate": 2.931632810695506e-06, + "loss": 0.4675, + "step": 127430 + }, + { + "epoch": 2.8365384615384617, + "grad_norm": 0.5283862948417664, + "learning_rate": 2.9236848073858736e-06, + "loss": 0.4716, + "step": 127440 + }, + { + "epoch": 2.83676103988604, + "grad_norm": 0.4947654604911804, + "learning_rate": 2.9157475133645416e-06, + "loss": 0.4225, + "step": 127450 + }, + { + "epoch": 2.836983618233618, + "grad_norm": 0.7152960896492004, + "learning_rate": 2.9078209290628324e-06, + "loss": 0.532, + "step": 127460 + }, + { + "epoch": 2.8372061965811968, + "grad_norm": 0.5929638147354126, + "learning_rate": 2.899905054911489e-06, + "loss": 0.4627, + "step": 127470 + }, + { + "epoch": 2.837428774928775, + "grad_norm": 0.7341625094413757, + "learning_rate": 2.8919998913406353e-06, + "loss": 0.5209, + "step": 127480 + }, + { + "epoch": 2.837651353276353, + "grad_norm": 0.5574802160263062, + "learning_rate": 2.884105438779883e-06, + "loss": 0.4576, + "step": 127490 + }, + { + "epoch": 2.837873931623932, + "grad_norm": 0.6063029766082764, + "learning_rate": 2.876221697658199e-06, + "loss": 0.4474, + "step": 127500 + }, + { + "epoch": 2.83809650997151, + "grad_norm": 0.7553101778030396, + "learning_rate": 2.8683486684040195e-06, + "loss": 0.6091, + "step": 127510 + }, + { + "epoch": 2.8383190883190883, + "grad_norm": 0.6518235206604004, + "learning_rate": 2.8604863514451354e-06, + "loss": 0.5291, + "step": 127520 + }, + { + "epoch": 2.8385416666666665, + "grad_norm": 0.7313382625579834, + "learning_rate": 2.8526347472088266e-06, + "loss": 0.518, + "step": 127530 + }, + { + "epoch": 2.838764245014245, + "grad_norm": 0.5088934302330017, + "learning_rate": 2.844793856121708e-06, + "loss": 0.3735, + "step": 127540 + }, + { + "epoch": 2.8389868233618234, + "grad_norm": 0.9321578741073608, + "learning_rate": 2.836963678609905e-06, + "loss": 0.5138, + "step": 127550 + }, + { + "epoch": 2.8392094017094016, + "grad_norm": 0.5136719346046448, + "learning_rate": 2.829144215098922e-06, + "loss": 0.4957, + "step": 127560 + }, + { + "epoch": 2.83943198005698, + "grad_norm": 0.6601418256759644, + "learning_rate": 2.8213354660136416e-06, + "loss": 0.524, + "step": 127570 + }, + { + "epoch": 2.8396545584045585, + "grad_norm": 0.5681608319282532, + "learning_rate": 2.8135374317783904e-06, + "loss": 0.4735, + "step": 127580 + }, + { + "epoch": 2.8398771367521367, + "grad_norm": 0.7215602993965149, + "learning_rate": 2.8057501128169626e-06, + "loss": 0.4704, + "step": 127590 + }, + { + "epoch": 2.840099715099715, + "grad_norm": 0.4726629853248596, + "learning_rate": 2.797973509552487e-06, + "loss": 0.4404, + "step": 127600 + }, + { + "epoch": 2.8403222934472936, + "grad_norm": 0.367449551820755, + "learning_rate": 2.790207622407581e-06, + "loss": 0.4292, + "step": 127610 + }, + { + "epoch": 2.840544871794872, + "grad_norm": 0.5491681694984436, + "learning_rate": 2.7824524518042405e-06, + "loss": 0.4613, + "step": 127620 + }, + { + "epoch": 2.84076745014245, + "grad_norm": 0.5599938631057739, + "learning_rate": 2.7747079981638614e-06, + "loss": 0.5249, + "step": 127630 + }, + { + "epoch": 2.8409900284900287, + "grad_norm": 0.36625319719314575, + "learning_rate": 2.766974261907307e-06, + "loss": 0.4111, + "step": 127640 + }, + { + "epoch": 2.841212606837607, + "grad_norm": 0.4233880043029785, + "learning_rate": 2.7592512434548413e-06, + "loss": 0.4154, + "step": 127650 + }, + { + "epoch": 2.841435185185185, + "grad_norm": 0.34068143367767334, + "learning_rate": 2.7515389432261508e-06, + "loss": 0.4196, + "step": 127660 + }, + { + "epoch": 2.841657763532764, + "grad_norm": 0.5412957668304443, + "learning_rate": 2.7438373616403e-06, + "loss": 0.436, + "step": 127670 + }, + { + "epoch": 2.841880341880342, + "grad_norm": 0.6452593207359314, + "learning_rate": 2.7361464991157993e-06, + "loss": 0.4758, + "step": 127680 + }, + { + "epoch": 2.84210292022792, + "grad_norm": 0.4374260902404785, + "learning_rate": 2.7284663560706025e-06, + "loss": 0.4615, + "step": 127690 + }, + { + "epoch": 2.8423254985754984, + "grad_norm": 0.43188244104385376, + "learning_rate": 2.7207969329220205e-06, + "loss": 0.4227, + "step": 127700 + }, + { + "epoch": 2.8425480769230766, + "grad_norm": 0.468960165977478, + "learning_rate": 2.7131382300868314e-06, + "loss": 0.3827, + "step": 127710 + }, + { + "epoch": 2.8427706552706553, + "grad_norm": 0.6046063899993896, + "learning_rate": 2.7054902479812128e-06, + "loss": 0.389, + "step": 127720 + }, + { + "epoch": 2.8429932336182335, + "grad_norm": 0.7549774646759033, + "learning_rate": 2.697852987020788e-06, + "loss": 0.5852, + "step": 127730 + }, + { + "epoch": 2.8432158119658117, + "grad_norm": 0.5519178509712219, + "learning_rate": 2.6902264476205363e-06, + "loss": 0.5385, + "step": 127740 + }, + { + "epoch": 2.8434383903133904, + "grad_norm": 0.7807755470275879, + "learning_rate": 2.6826106301949037e-06, + "loss": 0.5021, + "step": 127750 + }, + { + "epoch": 2.8436609686609686, + "grad_norm": 0.48430293798446655, + "learning_rate": 2.6750055351577595e-06, + "loss": 0.4454, + "step": 127760 + }, + { + "epoch": 2.843883547008547, + "grad_norm": 0.4661320447921753, + "learning_rate": 2.667411162922329e-06, + "loss": 0.3889, + "step": 127770 + }, + { + "epoch": 2.8441061253561255, + "grad_norm": 0.5276303291320801, + "learning_rate": 2.6598275139013253e-06, + "loss": 0.4635, + "step": 127780 + }, + { + "epoch": 2.8443287037037037, + "grad_norm": 0.6416008472442627, + "learning_rate": 2.6522545885068417e-06, + "loss": 0.455, + "step": 127790 + }, + { + "epoch": 2.844551282051282, + "grad_norm": 0.44830918312072754, + "learning_rate": 2.644692387150416e-06, + "loss": 0.3487, + "step": 127800 + }, + { + "epoch": 2.8447738603988606, + "grad_norm": 0.4769445061683655, + "learning_rate": 2.6371409102429634e-06, + "loss": 0.3851, + "step": 127810 + }, + { + "epoch": 2.844996438746439, + "grad_norm": 0.5918802618980408, + "learning_rate": 2.629600158194845e-06, + "loss": 0.4871, + "step": 127820 + }, + { + "epoch": 2.845219017094017, + "grad_norm": 0.36495670676231384, + "learning_rate": 2.622070131415821e-06, + "loss": 0.4913, + "step": 127830 + }, + { + "epoch": 2.8454415954415957, + "grad_norm": 0.4469436705112457, + "learning_rate": 2.614550830315099e-06, + "loss": 0.5122, + "step": 127840 + }, + { + "epoch": 2.845664173789174, + "grad_norm": 0.39218777418136597, + "learning_rate": 2.607042255301262e-06, + "loss": 0.3885, + "step": 127850 + }, + { + "epoch": 2.845886752136752, + "grad_norm": 0.513641357421875, + "learning_rate": 2.5995444067823393e-06, + "loss": 0.4105, + "step": 127860 + }, + { + "epoch": 2.8461093304843303, + "grad_norm": 0.4854564070701599, + "learning_rate": 2.592057285165761e-06, + "loss": 0.5354, + "step": 127870 + }, + { + "epoch": 2.8463319088319086, + "grad_norm": 0.5514839887619019, + "learning_rate": 2.5845808908584236e-06, + "loss": 0.4986, + "step": 127880 + }, + { + "epoch": 2.8465544871794872, + "grad_norm": 0.6030427813529968, + "learning_rate": 2.5771152242665575e-06, + "loss": 0.5883, + "step": 127890 + }, + { + "epoch": 2.8467770655270654, + "grad_norm": 0.47126203775405884, + "learning_rate": 2.5696602857958607e-06, + "loss": 0.4896, + "step": 127900 + }, + { + "epoch": 2.8469996438746437, + "grad_norm": 0.6155270338058472, + "learning_rate": 2.5622160758514534e-06, + "loss": 0.5152, + "step": 127910 + }, + { + "epoch": 2.8472222222222223, + "grad_norm": 0.9987238645553589, + "learning_rate": 2.554782594837857e-06, + "loss": 0.4822, + "step": 127920 + }, + { + "epoch": 2.8474448005698005, + "grad_norm": 0.6122363805770874, + "learning_rate": 2.547359843159014e-06, + "loss": 0.5391, + "step": 127930 + }, + { + "epoch": 2.8476673789173788, + "grad_norm": 0.47502201795578003, + "learning_rate": 2.5399478212182916e-06, + "loss": 0.3807, + "step": 127940 + }, + { + "epoch": 2.8478899572649574, + "grad_norm": 0.510317325592041, + "learning_rate": 2.532546529418456e-06, + "loss": 0.577, + "step": 127950 + }, + { + "epoch": 2.8481125356125356, + "grad_norm": 0.6082240343093872, + "learning_rate": 2.5251559681616744e-06, + "loss": 0.4693, + "step": 127960 + }, + { + "epoch": 2.848335113960114, + "grad_norm": 0.463861346244812, + "learning_rate": 2.5177761378495812e-06, + "loss": 0.4554, + "step": 127970 + }, + { + "epoch": 2.8485576923076925, + "grad_norm": 0.5722596645355225, + "learning_rate": 2.510407038883189e-06, + "loss": 0.4323, + "step": 127980 + }, + { + "epoch": 2.8487802706552707, + "grad_norm": 0.47360795736312866, + "learning_rate": 2.5030486716629554e-06, + "loss": 0.4379, + "step": 127990 + }, + { + "epoch": 2.849002849002849, + "grad_norm": 0.8248348236083984, + "learning_rate": 2.495701036588738e-06, + "loss": 0.5192, + "step": 128000 + }, + { + "epoch": 2.8492254273504276, + "grad_norm": 0.43759846687316895, + "learning_rate": 2.4883641340598174e-06, + "loss": 0.3721, + "step": 128010 + }, + { + "epoch": 2.849448005698006, + "grad_norm": 0.3280053436756134, + "learning_rate": 2.481037964474897e-06, + "loss": 0.4763, + "step": 128020 + }, + { + "epoch": 2.849670584045584, + "grad_norm": 0.5017495155334473, + "learning_rate": 2.4737225282320363e-06, + "loss": 0.4446, + "step": 128030 + }, + { + "epoch": 2.8498931623931623, + "grad_norm": 0.6453086137771606, + "learning_rate": 2.466417825728828e-06, + "loss": 0.349, + "step": 128040 + }, + { + "epoch": 2.8501157407407405, + "grad_norm": 0.6192348599433899, + "learning_rate": 2.4591238573621556e-06, + "loss": 0.439, + "step": 128050 + }, + { + "epoch": 2.850338319088319, + "grad_norm": 0.4385824203491211, + "learning_rate": 2.451840623528412e-06, + "loss": 0.4657, + "step": 128060 + }, + { + "epoch": 2.8505608974358974, + "grad_norm": 0.5602866411209106, + "learning_rate": 2.44456812462337e-06, + "loss": 0.4046, + "step": 128070 + }, + { + "epoch": 2.8507834757834756, + "grad_norm": 0.37369683384895325, + "learning_rate": 2.4373063610422462e-06, + "loss": 0.5432, + "step": 128080 + }, + { + "epoch": 2.8510060541310542, + "grad_norm": 0.5913586020469666, + "learning_rate": 2.430055333179615e-06, + "loss": 0.4079, + "step": 128090 + }, + { + "epoch": 2.8512286324786325, + "grad_norm": 0.6015235185623169, + "learning_rate": 2.4228150414295157e-06, + "loss": 0.4556, + "step": 128100 + }, + { + "epoch": 2.8514512108262107, + "grad_norm": 0.6328086256980896, + "learning_rate": 2.4155854861853675e-06, + "loss": 0.4006, + "step": 128110 + }, + { + "epoch": 2.8516737891737893, + "grad_norm": 0.594316840171814, + "learning_rate": 2.4083666678400784e-06, + "loss": 0.3517, + "step": 128120 + }, + { + "epoch": 2.8518963675213675, + "grad_norm": 0.4189058244228363, + "learning_rate": 2.4011585867858898e-06, + "loss": 0.4068, + "step": 128130 + }, + { + "epoch": 2.8521189458689458, + "grad_norm": 0.6700271964073181, + "learning_rate": 2.393961243414511e-06, + "loss": 0.4682, + "step": 128140 + }, + { + "epoch": 2.8523415242165244, + "grad_norm": 0.6584945321083069, + "learning_rate": 2.3867746381170285e-06, + "loss": 0.5559, + "step": 128150 + }, + { + "epoch": 2.8525641025641026, + "grad_norm": 1.0810476541519165, + "learning_rate": 2.3795987712840194e-06, + "loss": 0.5629, + "step": 128160 + }, + { + "epoch": 2.852786680911681, + "grad_norm": 0.5784086585044861, + "learning_rate": 2.3724336433053716e-06, + "loss": 0.3555, + "step": 128170 + }, + { + "epoch": 2.8530092592592595, + "grad_norm": 0.4716869592666626, + "learning_rate": 2.3652792545704627e-06, + "loss": 0.4536, + "step": 128180 + }, + { + "epoch": 2.8532318376068377, + "grad_norm": 0.5608788728713989, + "learning_rate": 2.35813560546807e-06, + "loss": 0.3744, + "step": 128190 + }, + { + "epoch": 2.853454415954416, + "grad_norm": 0.5514051914215088, + "learning_rate": 2.3510026963863953e-06, + "loss": 0.3658, + "step": 128200 + }, + { + "epoch": 2.853676994301994, + "grad_norm": 0.5831300020217896, + "learning_rate": 2.343880527713038e-06, + "loss": 0.4468, + "step": 128210 + }, + { + "epoch": 2.8538995726495724, + "grad_norm": 0.47793155908584595, + "learning_rate": 2.3367690998350233e-06, + "loss": 0.4621, + "step": 128220 + }, + { + "epoch": 2.854122150997151, + "grad_norm": 0.5891512036323547, + "learning_rate": 2.329668413138797e-06, + "loss": 0.4112, + "step": 128230 + }, + { + "epoch": 2.8543447293447293, + "grad_norm": 0.6231299042701721, + "learning_rate": 2.3225784680101837e-06, + "loss": 0.4276, + "step": 128240 + }, + { + "epoch": 2.8545673076923075, + "grad_norm": 0.4069092571735382, + "learning_rate": 2.3154992648344977e-06, + "loss": 0.4711, + "step": 128250 + }, + { + "epoch": 2.854789886039886, + "grad_norm": 0.3998798727989197, + "learning_rate": 2.3084308039964087e-06, + "loss": 0.3865, + "step": 128260 + }, + { + "epoch": 2.8550124643874644, + "grad_norm": 0.5938044786453247, + "learning_rate": 2.301373085880032e-06, + "loss": 0.3994, + "step": 128270 + }, + { + "epoch": 2.8552350427350426, + "grad_norm": 0.41773974895477295, + "learning_rate": 2.294326110868883e-06, + "loss": 0.484, + "step": 128280 + }, + { + "epoch": 2.8554576210826212, + "grad_norm": 0.6640601754188538, + "learning_rate": 2.287289879345922e-06, + "loss": 0.4097, + "step": 128290 + }, + { + "epoch": 2.8556801994301995, + "grad_norm": 0.4293261766433716, + "learning_rate": 2.280264391693465e-06, + "loss": 0.6003, + "step": 128300 + }, + { + "epoch": 2.8559027777777777, + "grad_norm": 0.5650845170021057, + "learning_rate": 2.2732496482932964e-06, + "loss": 0.4486, + "step": 128310 + }, + { + "epoch": 2.8561253561253563, + "grad_norm": 0.5928626656532288, + "learning_rate": 2.2662456495265994e-06, + "loss": 0.5542, + "step": 128320 + }, + { + "epoch": 2.8563479344729346, + "grad_norm": 0.40868303179740906, + "learning_rate": 2.259252395774003e-06, + "loss": 0.3978, + "step": 128330 + }, + { + "epoch": 2.8565705128205128, + "grad_norm": 0.5804070830345154, + "learning_rate": 2.2522698874155147e-06, + "loss": 0.4422, + "step": 128340 + }, + { + "epoch": 2.8567930911680914, + "grad_norm": 0.669746458530426, + "learning_rate": 2.245298124830564e-06, + "loss": 0.4555, + "step": 128350 + }, + { + "epoch": 2.8570156695156697, + "grad_norm": 0.3729129135608673, + "learning_rate": 2.238337108398003e-06, + "loss": 0.4551, + "step": 128360 + }, + { + "epoch": 2.857238247863248, + "grad_norm": 0.8856843709945679, + "learning_rate": 2.231386838496086e-06, + "loss": 0.4375, + "step": 128370 + }, + { + "epoch": 2.857460826210826, + "grad_norm": 0.4789460301399231, + "learning_rate": 2.224447315502509e-06, + "loss": 0.4658, + "step": 128380 + }, + { + "epoch": 2.8576834045584043, + "grad_norm": 0.6349992156028748, + "learning_rate": 2.2175185397943945e-06, + "loss": 0.4891, + "step": 128390 + }, + { + "epoch": 2.857905982905983, + "grad_norm": 0.6243805885314941, + "learning_rate": 2.2106005117482176e-06, + "loss": 0.4175, + "step": 128400 + }, + { + "epoch": 2.858128561253561, + "grad_norm": 0.7456267476081848, + "learning_rate": 2.203693231739923e-06, + "loss": 0.4172, + "step": 128410 + }, + { + "epoch": 2.8583511396011394, + "grad_norm": 0.47280463576316833, + "learning_rate": 2.1967967001448543e-06, + "loss": 0.4273, + "step": 128420 + }, + { + "epoch": 2.858573717948718, + "grad_norm": 0.7351090312004089, + "learning_rate": 2.1899109173378006e-06, + "loss": 0.5241, + "step": 128430 + }, + { + "epoch": 2.8587962962962963, + "grad_norm": 0.7132675051689148, + "learning_rate": 2.18303588369293e-06, + "loss": 0.4239, + "step": 128440 + }, + { + "epoch": 2.8590188746438745, + "grad_norm": 0.3661953806877136, + "learning_rate": 2.1761715995838094e-06, + "loss": 0.3363, + "step": 128450 + }, + { + "epoch": 2.859241452991453, + "grad_norm": 0.5877260565757751, + "learning_rate": 2.169318065383474e-06, + "loss": 0.3942, + "step": 128460 + }, + { + "epoch": 2.8594640313390314, + "grad_norm": 0.4182318150997162, + "learning_rate": 2.162475281464338e-06, + "loss": 0.3919, + "step": 128470 + }, + { + "epoch": 2.8596866096866096, + "grad_norm": 0.5911064743995667, + "learning_rate": 2.15564324819828e-06, + "loss": 0.451, + "step": 128480 + }, + { + "epoch": 2.8599091880341883, + "grad_norm": 0.3642343282699585, + "learning_rate": 2.148821965956516e-06, + "loss": 0.4713, + "step": 128490 + }, + { + "epoch": 2.8601317663817665, + "grad_norm": 0.6984941959381104, + "learning_rate": 2.142011435109725e-06, + "loss": 0.5226, + "step": 128500 + }, + { + "epoch": 2.8603543447293447, + "grad_norm": 0.578916609287262, + "learning_rate": 2.1352116560280354e-06, + "loss": 0.3807, + "step": 128510 + }, + { + "epoch": 2.8605769230769234, + "grad_norm": 0.7987368702888489, + "learning_rate": 2.128422629080884e-06, + "loss": 0.4601, + "step": 128520 + }, + { + "epoch": 2.8607995014245016, + "grad_norm": 0.6165527701377869, + "learning_rate": 2.1216443546372643e-06, + "loss": 0.525, + "step": 128530 + }, + { + "epoch": 2.86102207977208, + "grad_norm": 0.5393047332763672, + "learning_rate": 2.11487683306546e-06, + "loss": 0.4502, + "step": 128540 + }, + { + "epoch": 2.861244658119658, + "grad_norm": 0.658702552318573, + "learning_rate": 2.108120064733243e-06, + "loss": 0.2996, + "step": 128550 + }, + { + "epoch": 2.861467236467236, + "grad_norm": 0.45965611934661865, + "learning_rate": 2.1013740500078092e-06, + "loss": 0.42, + "step": 128560 + }, + { + "epoch": 2.861689814814815, + "grad_norm": 0.4500766694545746, + "learning_rate": 2.0946387892557097e-06, + "loss": 0.4174, + "step": 128570 + }, + { + "epoch": 2.861912393162393, + "grad_norm": 0.5137393474578857, + "learning_rate": 2.08791428284294e-06, + "loss": 0.4264, + "step": 128580 + }, + { + "epoch": 2.8621349715099713, + "grad_norm": 0.5398975610733032, + "learning_rate": 2.0812005311349192e-06, + "loss": 0.5062, + "step": 128590 + }, + { + "epoch": 2.86235754985755, + "grad_norm": 0.5769351124763489, + "learning_rate": 2.074497534496489e-06, + "loss": 0.5821, + "step": 128600 + }, + { + "epoch": 2.862580128205128, + "grad_norm": 0.42253240942955017, + "learning_rate": 2.0678052932919133e-06, + "loss": 0.4055, + "step": 128610 + }, + { + "epoch": 2.8628027065527064, + "grad_norm": 0.523838460445404, + "learning_rate": 2.061123807884813e-06, + "loss": 0.486, + "step": 128620 + }, + { + "epoch": 2.863025284900285, + "grad_norm": 0.8152051568031311, + "learning_rate": 2.0544530786382963e-06, + "loss": 0.4849, + "step": 128630 + }, + { + "epoch": 2.8632478632478633, + "grad_norm": 0.4388711750507355, + "learning_rate": 2.0477931059148303e-06, + "loss": 0.4611, + "step": 128640 + }, + { + "epoch": 2.8634704415954415, + "grad_norm": 0.6085971593856812, + "learning_rate": 2.041143890076369e-06, + "loss": 0.4708, + "step": 128650 + }, + { + "epoch": 2.86369301994302, + "grad_norm": 0.6009733080863953, + "learning_rate": 2.0345054314841794e-06, + "loss": 0.5275, + "step": 128660 + }, + { + "epoch": 2.8639155982905984, + "grad_norm": 0.4018401503562927, + "learning_rate": 2.027877730499039e-06, + "loss": 0.4101, + "step": 128670 + }, + { + "epoch": 2.8641381766381766, + "grad_norm": 0.46677204966545105, + "learning_rate": 2.021260787481083e-06, + "loss": 0.3558, + "step": 128680 + }, + { + "epoch": 2.864360754985755, + "grad_norm": 0.4653564691543579, + "learning_rate": 2.0146546027898674e-06, + "loss": 0.4384, + "step": 128690 + }, + { + "epoch": 2.8645833333333335, + "grad_norm": 0.4220982491970062, + "learning_rate": 2.008059176784438e-06, + "loss": 0.4263, + "step": 128700 + }, + { + "epoch": 2.8648059116809117, + "grad_norm": 0.6132118105888367, + "learning_rate": 2.0014745098231314e-06, + "loss": 0.5962, + "step": 128710 + }, + { + "epoch": 2.86502849002849, + "grad_norm": 0.7703640460968018, + "learning_rate": 1.994900602263794e-06, + "loss": 0.4711, + "step": 128720 + }, + { + "epoch": 2.865251068376068, + "grad_norm": 0.6671891808509827, + "learning_rate": 1.9883374544636512e-06, + "loss": 0.4536, + "step": 128730 + }, + { + "epoch": 2.865473646723647, + "grad_norm": 0.46717581152915955, + "learning_rate": 1.9817850667793737e-06, + "loss": 0.427, + "step": 128740 + }, + { + "epoch": 2.865696225071225, + "grad_norm": 0.4125966727733612, + "learning_rate": 1.975243439566987e-06, + "loss": 0.4134, + "step": 128750 + }, + { + "epoch": 2.8659188034188032, + "grad_norm": 0.400468111038208, + "learning_rate": 1.9687125731819635e-06, + "loss": 0.3723, + "step": 128760 + }, + { + "epoch": 2.866141381766382, + "grad_norm": 0.48118314146995544, + "learning_rate": 1.9621924679792403e-06, + "loss": 0.4294, + "step": 128770 + }, + { + "epoch": 2.86636396011396, + "grad_norm": 0.5905781388282776, + "learning_rate": 1.955683124313068e-06, + "loss": 0.4468, + "step": 128780 + }, + { + "epoch": 2.8665865384615383, + "grad_norm": 0.7723633646965027, + "learning_rate": 1.94918454253723e-06, + "loss": 0.3429, + "step": 128790 + }, + { + "epoch": 2.866809116809117, + "grad_norm": 0.43844008445739746, + "learning_rate": 1.9426967230048443e-06, + "loss": 0.3715, + "step": 128800 + }, + { + "epoch": 2.867031695156695, + "grad_norm": 0.6145934462547302, + "learning_rate": 1.9362196660684286e-06, + "loss": 0.4389, + "step": 128810 + }, + { + "epoch": 2.8672542735042734, + "grad_norm": 0.7906620502471924, + "learning_rate": 1.92975337207999e-06, + "loss": 0.43, + "step": 128820 + }, + { + "epoch": 2.867476851851852, + "grad_norm": 0.6965433359146118, + "learning_rate": 1.923297841390892e-06, + "loss": 0.3551, + "step": 128830 + }, + { + "epoch": 2.8676994301994303, + "grad_norm": 0.5444207191467285, + "learning_rate": 1.916853074351943e-06, + "loss": 0.474, + "step": 128840 + }, + { + "epoch": 2.8679220085470085, + "grad_norm": 0.5031320452690125, + "learning_rate": 1.910419071313374e-06, + "loss": 0.4394, + "step": 128850 + }, + { + "epoch": 2.8681445868945867, + "grad_norm": 0.5183652639389038, + "learning_rate": 1.9039958326247942e-06, + "loss": 0.4802, + "step": 128860 + }, + { + "epoch": 2.8683671652421654, + "grad_norm": 0.48607540130615234, + "learning_rate": 1.8975833586352576e-06, + "loss": 0.4347, + "step": 128870 + }, + { + "epoch": 2.8685897435897436, + "grad_norm": 0.5282498598098755, + "learning_rate": 1.8911816496931968e-06, + "loss": 0.4464, + "step": 128880 + }, + { + "epoch": 2.868812321937322, + "grad_norm": 0.4897420108318329, + "learning_rate": 1.8847907061464887e-06, + "loss": 0.4407, + "step": 128890 + }, + { + "epoch": 2.8690349002849, + "grad_norm": 0.4289381206035614, + "learning_rate": 1.8784105283424558e-06, + "loss": 0.4751, + "step": 128900 + }, + { + "epoch": 2.8692574786324787, + "grad_norm": 0.5549202561378479, + "learning_rate": 1.8720411166277985e-06, + "loss": 0.3789, + "step": 128910 + }, + { + "epoch": 2.869480056980057, + "grad_norm": 0.39817145466804504, + "learning_rate": 1.8656824713485954e-06, + "loss": 0.4434, + "step": 128920 + }, + { + "epoch": 2.869702635327635, + "grad_norm": 0.5653124451637268, + "learning_rate": 1.8593345928504368e-06, + "loss": 0.514, + "step": 128930 + }, + { + "epoch": 2.869925213675214, + "grad_norm": 0.4783931076526642, + "learning_rate": 1.8529974814782248e-06, + "loss": 0.5078, + "step": 128940 + }, + { + "epoch": 2.870147792022792, + "grad_norm": 0.6050539016723633, + "learning_rate": 1.8466711375763278e-06, + "loss": 0.4924, + "step": 128950 + }, + { + "epoch": 2.8703703703703702, + "grad_norm": 0.5471101403236389, + "learning_rate": 1.8403555614885604e-06, + "loss": 0.4408, + "step": 128960 + }, + { + "epoch": 2.870592948717949, + "grad_norm": 0.5318120121955872, + "learning_rate": 1.8340507535580697e-06, + "loss": 0.3713, + "step": 128970 + }, + { + "epoch": 2.870815527065527, + "grad_norm": 0.29087573289871216, + "learning_rate": 1.8277567141275153e-06, + "loss": 0.3768, + "step": 128980 + }, + { + "epoch": 2.8710381054131053, + "grad_norm": 0.6670485138893127, + "learning_rate": 1.8214734435388681e-06, + "loss": 0.6016, + "step": 128990 + }, + { + "epoch": 2.871260683760684, + "grad_norm": 0.6318653225898743, + "learning_rate": 1.8152009421336102e-06, + "loss": 0.3722, + "step": 129000 + }, + { + "epoch": 2.871483262108262, + "grad_norm": 0.506161630153656, + "learning_rate": 1.8089392102525805e-06, + "loss": 0.3665, + "step": 129010 + }, + { + "epoch": 2.8717058404558404, + "grad_norm": 0.5993919968605042, + "learning_rate": 1.8026882482360175e-06, + "loss": 0.4653, + "step": 129020 + }, + { + "epoch": 2.8719284188034186, + "grad_norm": 0.5019202828407288, + "learning_rate": 1.7964480564236276e-06, + "loss": 0.396, + "step": 129030 + }, + { + "epoch": 2.8721509971509973, + "grad_norm": 0.36323311924934387, + "learning_rate": 1.790218635154517e-06, + "loss": 0.3592, + "step": 129040 + }, + { + "epoch": 2.8723735754985755, + "grad_norm": 0.6010159850120544, + "learning_rate": 1.7839999847671928e-06, + "loss": 0.4068, + "step": 129050 + }, + { + "epoch": 2.8725961538461537, + "grad_norm": 0.3936340808868408, + "learning_rate": 1.7777921055995627e-06, + "loss": 0.4292, + "step": 129060 + }, + { + "epoch": 2.872818732193732, + "grad_norm": 0.5482344031333923, + "learning_rate": 1.7715949979890012e-06, + "loss": 0.4296, + "step": 129070 + }, + { + "epoch": 2.8730413105413106, + "grad_norm": 0.4646384119987488, + "learning_rate": 1.7654086622722166e-06, + "loss": 0.4541, + "step": 129080 + }, + { + "epoch": 2.873263888888889, + "grad_norm": 0.6732831001281738, + "learning_rate": 1.759233098785429e-06, + "loss": 0.4736, + "step": 129090 + }, + { + "epoch": 2.873486467236467, + "grad_norm": 0.5085716843605042, + "learning_rate": 1.7530683078641918e-06, + "loss": 0.3397, + "step": 129100 + }, + { + "epoch": 2.8737090455840457, + "grad_norm": 0.5749651789665222, + "learning_rate": 1.7469142898435042e-06, + "loss": 0.4264, + "step": 129110 + }, + { + "epoch": 2.873931623931624, + "grad_norm": 0.6635963916778564, + "learning_rate": 1.7407710450578096e-06, + "loss": 0.3785, + "step": 129120 + }, + { + "epoch": 2.874154202279202, + "grad_norm": 0.6351351141929626, + "learning_rate": 1.7346385738409298e-06, + "loss": 0.5607, + "step": 129130 + }, + { + "epoch": 2.874376780626781, + "grad_norm": 0.42307665944099426, + "learning_rate": 1.7285168765260874e-06, + "loss": 0.5037, + "step": 129140 + }, + { + "epoch": 2.874599358974359, + "grad_norm": 0.3375342786312103, + "learning_rate": 1.7224059534459492e-06, + "loss": 0.4869, + "step": 129150 + }, + { + "epoch": 2.8748219373219372, + "grad_norm": 0.8892328143119812, + "learning_rate": 1.7163058049325831e-06, + "loss": 0.5797, + "step": 129160 + }, + { + "epoch": 2.875044515669516, + "grad_norm": 0.5242050886154175, + "learning_rate": 1.7102164313174795e-06, + "loss": 0.4744, + "step": 129170 + }, + { + "epoch": 2.875267094017094, + "grad_norm": 0.5739059448242188, + "learning_rate": 1.7041378329315515e-06, + "loss": 0.3742, + "step": 129180 + }, + { + "epoch": 2.8754896723646723, + "grad_norm": 0.36049407720565796, + "learning_rate": 1.6980700101051127e-06, + "loss": 0.4889, + "step": 129190 + }, + { + "epoch": 2.8757122507122506, + "grad_norm": 0.519873321056366, + "learning_rate": 1.6920129631678772e-06, + "loss": 0.4065, + "step": 129200 + }, + { + "epoch": 2.8759348290598292, + "grad_norm": 0.3915044665336609, + "learning_rate": 1.6859666924490036e-06, + "loss": 0.4575, + "step": 129210 + }, + { + "epoch": 2.8761574074074074, + "grad_norm": 0.6183444857597351, + "learning_rate": 1.6799311982770517e-06, + "loss": 0.4922, + "step": 129220 + }, + { + "epoch": 2.8763799857549857, + "grad_norm": 0.4814912676811218, + "learning_rate": 1.6739064809799809e-06, + "loss": 0.2975, + "step": 129230 + }, + { + "epoch": 2.876602564102564, + "grad_norm": 0.5634822249412537, + "learning_rate": 1.6678925408851742e-06, + "loss": 0.4692, + "step": 129240 + }, + { + "epoch": 2.8768251424501425, + "grad_norm": 0.5901898741722107, + "learning_rate": 1.661889378319481e-06, + "loss": 0.4541, + "step": 129250 + }, + { + "epoch": 2.8770477207977208, + "grad_norm": 0.5291321873664856, + "learning_rate": 1.655896993609063e-06, + "loss": 0.3769, + "step": 129260 + }, + { + "epoch": 2.877270299145299, + "grad_norm": 0.5238466858863831, + "learning_rate": 1.6499153870795924e-06, + "loss": 0.4004, + "step": 129270 + }, + { + "epoch": 2.8774928774928776, + "grad_norm": 0.5467638969421387, + "learning_rate": 1.6439445590560986e-06, + "loss": 0.4994, + "step": 129280 + }, + { + "epoch": 2.877715455840456, + "grad_norm": 0.5712918043136597, + "learning_rate": 1.637984509863033e-06, + "loss": 0.4272, + "step": 129290 + }, + { + "epoch": 2.877938034188034, + "grad_norm": 0.5815404057502747, + "learning_rate": 1.6320352398242478e-06, + "loss": 0.5296, + "step": 129300 + }, + { + "epoch": 2.8781606125356127, + "grad_norm": 0.47931987047195435, + "learning_rate": 1.6260967492630841e-06, + "loss": 0.5084, + "step": 129310 + }, + { + "epoch": 2.878383190883191, + "grad_norm": 0.46624454855918884, + "learning_rate": 1.6201690385022171e-06, + "loss": 0.5071, + "step": 129320 + }, + { + "epoch": 2.878605769230769, + "grad_norm": 0.4582967460155487, + "learning_rate": 1.6142521078637673e-06, + "loss": 0.4559, + "step": 129330 + }, + { + "epoch": 2.878828347578348, + "grad_norm": 0.45148831605911255, + "learning_rate": 1.608345957669255e-06, + "loss": 0.4768, + "step": 129340 + }, + { + "epoch": 2.879050925925926, + "grad_norm": 0.4900384545326233, + "learning_rate": 1.6024505882396678e-06, + "loss": 0.4518, + "step": 129350 + }, + { + "epoch": 2.8792735042735043, + "grad_norm": 0.5455189943313599, + "learning_rate": 1.5965659998953052e-06, + "loss": 0.4209, + "step": 129360 + }, + { + "epoch": 2.8794960826210825, + "grad_norm": 0.4357905685901642, + "learning_rate": 1.590692192955956e-06, + "loss": 0.3717, + "step": 129370 + }, + { + "epoch": 2.8797186609686607, + "grad_norm": 0.5603689551353455, + "learning_rate": 1.5848291677408312e-06, + "loss": 0.4896, + "step": 129380 + }, + { + "epoch": 2.8799412393162394, + "grad_norm": 0.7707281708717346, + "learning_rate": 1.5789769245685204e-06, + "loss": 0.5265, + "step": 129390 + }, + { + "epoch": 2.8801638176638176, + "grad_norm": 0.5575656294822693, + "learning_rate": 1.5731354637570361e-06, + "loss": 0.408, + "step": 129400 + }, + { + "epoch": 2.8803418803418803, + "eval_loss": 0.5200754404067993, + "eval_runtime": 337.416, + "eval_samples_per_second": 7.009, + "eval_steps_per_second": 7.009, + "step": 129408 + } + ], + "logging_steps": 10, + "max_steps": 134784, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 2696, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.400469039544369e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}