{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8313847752663029, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 33.651363372802734, "learning_rate": 5e-06, "loss": 11.4242, "step": 5 }, { "epoch": 0.0, "grad_norm": 30.245094299316406, "learning_rate": 1e-05, "loss": 10.7984, "step": 10 }, { "epoch": 0.0, "grad_norm": 20.595664978027344, "learning_rate": 1.5e-05, "loss": 9.6784, "step": 15 }, { "epoch": 0.0, "grad_norm": 18.10934829711914, "learning_rate": 2e-05, "loss": 8.6549, "step": 20 }, { "epoch": 0.01, "grad_norm": 12.764612197875977, "learning_rate": 2.5e-05, "loss": 8.0527, "step": 25 }, { "epoch": 0.01, "grad_norm": 7.847819805145264, "learning_rate": 3e-05, "loss": 7.8584, "step": 30 }, { "epoch": 0.01, "grad_norm": 7.7772746086120605, "learning_rate": 3.5e-05, "loss": 7.6997, "step": 35 }, { "epoch": 0.01, "grad_norm": 6.401936054229736, "learning_rate": 4e-05, "loss": 7.5018, "step": 40 }, { "epoch": 0.01, "grad_norm": 5.469693660736084, "learning_rate": 4.5e-05, "loss": 7.2811, "step": 45 }, { "epoch": 0.01, "grad_norm": 4.165266036987305, "learning_rate": 5e-05, "loss": 7.047, "step": 50 }, { "epoch": 0.01, "grad_norm": 4.453402519226074, "learning_rate": 5.500000000000001e-05, "loss": 6.8175, "step": 55 }, { "epoch": 0.01, "grad_norm": 6.7616753578186035, "learning_rate": 6e-05, "loss": 6.6509, "step": 60 }, { "epoch": 0.01, "grad_norm": 6.575658798217773, "learning_rate": 6.500000000000001e-05, "loss": 6.4582, "step": 65 }, { "epoch": 0.01, "grad_norm": 6.6263885498046875, "learning_rate": 7e-05, "loss": 6.2432, "step": 70 }, { "epoch": 0.02, "grad_norm": 7.119388580322266, "learning_rate": 7.500000000000001e-05, "loss": 6.0652, "step": 75 }, { "epoch": 0.02, "grad_norm": 4.671682834625244, "learning_rate": 8e-05, "loss": 5.8913, "step": 80 }, { "epoch": 0.02, "grad_norm": 5.266168117523193, "learning_rate": 8.5e-05, "loss": 5.7924, "step": 85 }, { "epoch": 0.02, "grad_norm": 5.612922668457031, "learning_rate": 9e-05, "loss": 5.6806, "step": 90 }, { "epoch": 0.02, "grad_norm": 5.099239349365234, "learning_rate": 9.5e-05, "loss": 5.5746, "step": 95 }, { "epoch": 0.02, "grad_norm": 6.313576698303223, "learning_rate": 0.0001, "loss": 5.4404, "step": 100 }, { "epoch": 0.02, "grad_norm": 6.374126434326172, "learning_rate": 9.999972205865686e-05, "loss": 5.3661, "step": 105 }, { "epoch": 0.02, "grad_norm": 6.098925590515137, "learning_rate": 9.999888823771751e-05, "loss": 5.2502, "step": 110 }, { "epoch": 0.02, "grad_norm": 6.676019668579102, "learning_rate": 9.999749854645204e-05, "loss": 5.2054, "step": 115 }, { "epoch": 0.02, "grad_norm": 4.790042400360107, "learning_rate": 9.99955530003106e-05, "loss": 5.1922, "step": 120 }, { "epoch": 0.03, "grad_norm": 4.1792755126953125, "learning_rate": 9.99930516209231e-05, "loss": 5.0362, "step": 125 }, { "epoch": 0.03, "grad_norm": 3.925230026245117, "learning_rate": 9.998999443609897e-05, "loss": 4.8993, "step": 130 }, { "epoch": 0.03, "grad_norm": 3.8192741870880127, "learning_rate": 9.998638147982696e-05, "loss": 4.8546, "step": 135 }, { "epoch": 0.03, "grad_norm": 5.455966472625732, "learning_rate": 9.998221279227467e-05, "loss": 4.9245, "step": 140 }, { "epoch": 0.03, "grad_norm": 3.5653443336486816, "learning_rate": 9.997748841978812e-05, "loss": 4.794, "step": 145 }, { "epoch": 0.03, "grad_norm": 7.422428131103516, "learning_rate": 9.997220841489122e-05, "loss": 4.6852, "step": 150 }, { "epoch": 0.03, "grad_norm": 4.473473072052002, "learning_rate": 9.996637283628528e-05, "loss": 4.7194, "step": 155 }, { "epoch": 0.03, "grad_norm": 5.07218074798584, "learning_rate": 9.995998174884821e-05, "loss": 4.5755, "step": 160 }, { "epoch": 0.03, "grad_norm": 3.4391303062438965, "learning_rate": 9.995303522363394e-05, "loss": 4.6125, "step": 165 }, { "epoch": 0.04, "grad_norm": 3.6765010356903076, "learning_rate": 9.99455333378715e-05, "loss": 4.5488, "step": 170 }, { "epoch": 0.04, "grad_norm": 3.9314346313476562, "learning_rate": 9.993747617496428e-05, "loss": 4.5681, "step": 175 }, { "epoch": 0.04, "grad_norm": 2.8918819427490234, "learning_rate": 9.9928863824489e-05, "loss": 4.5552, "step": 180 }, { "epoch": 0.04, "grad_norm": 2.8918302059173584, "learning_rate": 9.99196963821948e-05, "loss": 4.5485, "step": 185 }, { "epoch": 0.04, "grad_norm": 2.52258563041687, "learning_rate": 9.990997395000217e-05, "loss": 4.3666, "step": 190 }, { "epoch": 0.04, "grad_norm": 4.107878684997559, "learning_rate": 9.989969663600169e-05, "loss": 4.4963, "step": 195 }, { "epoch": 0.04, "grad_norm": 3.0174808502197266, "learning_rate": 9.9888864554453e-05, "loss": 4.4471, "step": 200 }, { "epoch": 0.04, "grad_norm": 2.8166937828063965, "learning_rate": 9.987747782578342e-05, "loss": 4.3326, "step": 205 }, { "epoch": 0.04, "grad_norm": 3.0829238891601562, "learning_rate": 9.986553657658668e-05, "loss": 4.3306, "step": 210 }, { "epoch": 0.04, "grad_norm": 2.6593828201293945, "learning_rate": 9.985304093962145e-05, "loss": 4.2864, "step": 215 }, { "epoch": 0.05, "grad_norm": 2.727086067199707, "learning_rate": 9.983999105380988e-05, "loss": 4.2608, "step": 220 }, { "epoch": 0.05, "grad_norm": 2.664433240890503, "learning_rate": 9.982638706423608e-05, "loss": 4.3274, "step": 225 }, { "epoch": 0.05, "grad_norm": 4.2005085945129395, "learning_rate": 9.98122291221445e-05, "loss": 4.29, "step": 230 }, { "epoch": 0.05, "grad_norm": 2.2342798709869385, "learning_rate": 9.979751738493826e-05, "loss": 4.1954, "step": 235 }, { "epoch": 0.05, "grad_norm": 2.773253917694092, "learning_rate": 9.978225201617732e-05, "loss": 4.2651, "step": 240 }, { "epoch": 0.05, "grad_norm": 3.033334255218506, "learning_rate": 9.976643318557678e-05, "loss": 4.1532, "step": 245 }, { "epoch": 0.05, "grad_norm": 2.3765692710876465, "learning_rate": 9.975006106900495e-05, "loss": 4.1574, "step": 250 }, { "epoch": 0.05, "grad_norm": 2.4881410598754883, "learning_rate": 9.973313584848132e-05, "loss": 4.151, "step": 255 }, { "epoch": 0.05, "grad_norm": 2.6022536754608154, "learning_rate": 9.971565771217464e-05, "loss": 4.2039, "step": 260 }, { "epoch": 0.06, "grad_norm": 2.6714437007904053, "learning_rate": 9.969762685440076e-05, "loss": 4.1488, "step": 265 }, { "epoch": 0.06, "grad_norm": 2.723792552947998, "learning_rate": 9.967904347562054e-05, "loss": 4.0716, "step": 270 }, { "epoch": 0.06, "grad_norm": 2.407099723815918, "learning_rate": 9.965990778243755e-05, "loss": 4.0903, "step": 275 }, { "epoch": 0.06, "grad_norm": 2.4060311317443848, "learning_rate": 9.964021998759577e-05, "loss": 4.0282, "step": 280 }, { "epoch": 0.06, "grad_norm": 2.4491236209869385, "learning_rate": 9.961998030997733e-05, "loss": 4.0502, "step": 285 }, { "epoch": 0.06, "grad_norm": 2.302164316177368, "learning_rate": 9.95991889745999e-05, "loss": 4.0588, "step": 290 }, { "epoch": 0.06, "grad_norm": 2.6794662475585938, "learning_rate": 9.957784621261441e-05, "loss": 4.033, "step": 295 }, { "epoch": 0.06, "grad_norm": 2.262129306793213, "learning_rate": 9.955595226130226e-05, "loss": 3.9834, "step": 300 }, { "epoch": 0.06, "grad_norm": 2.1817383766174316, "learning_rate": 9.953350736407282e-05, "loss": 3.9371, "step": 305 }, { "epoch": 0.06, "grad_norm": 2.4953038692474365, "learning_rate": 9.951051177046069e-05, "loss": 4.0102, "step": 310 }, { "epoch": 0.07, "grad_norm": 2.575176477432251, "learning_rate": 9.948696573612292e-05, "loss": 4.0396, "step": 315 }, { "epoch": 0.07, "grad_norm": 2.594761610031128, "learning_rate": 9.946286952283618e-05, "loss": 3.9192, "step": 320 }, { "epoch": 0.07, "grad_norm": 2.27431058883667, "learning_rate": 9.943822339849381e-05, "loss": 4.0117, "step": 325 }, { "epoch": 0.07, "grad_norm": 2.3896944522857666, "learning_rate": 9.941302763710288e-05, "loss": 3.9799, "step": 330 }, { "epoch": 0.07, "grad_norm": 1.9870325326919556, "learning_rate": 9.938728251878116e-05, "loss": 3.8637, "step": 335 }, { "epoch": 0.07, "grad_norm": 2.1930291652679443, "learning_rate": 9.936098832975393e-05, "loss": 3.7962, "step": 340 }, { "epoch": 0.07, "grad_norm": 2.4338083267211914, "learning_rate": 9.933414536235091e-05, "loss": 3.9224, "step": 345 }, { "epoch": 0.07, "grad_norm": 2.0370845794677734, "learning_rate": 9.93067539150029e-05, "loss": 3.8586, "step": 350 }, { "epoch": 0.07, "grad_norm": 1.9598731994628906, "learning_rate": 9.927881429223853e-05, "loss": 3.8631, "step": 355 }, { "epoch": 0.07, "grad_norm": 1.95335853099823, "learning_rate": 9.925032680468085e-05, "loss": 3.7463, "step": 360 }, { "epoch": 0.08, "grad_norm": 2.5965161323547363, "learning_rate": 9.922129176904388e-05, "loss": 3.8511, "step": 365 }, { "epoch": 0.08, "grad_norm": 1.8875285387039185, "learning_rate": 9.919170950812911e-05, "loss": 3.7626, "step": 370 }, { "epoch": 0.08, "grad_norm": 2.4620866775512695, "learning_rate": 9.916158035082184e-05, "loss": 3.7785, "step": 375 }, { "epoch": 0.08, "grad_norm": 1.9820605516433716, "learning_rate": 9.913090463208763e-05, "loss": 3.8284, "step": 380 }, { "epoch": 0.08, "grad_norm": 1.7589184045791626, "learning_rate": 9.90996826929685e-05, "loss": 3.7592, "step": 385 }, { "epoch": 0.08, "grad_norm": 1.8934298753738403, "learning_rate": 9.906791488057916e-05, "loss": 3.7244, "step": 390 }, { "epoch": 0.08, "grad_norm": 2.0055394172668457, "learning_rate": 9.903560154810313e-05, "loss": 3.7211, "step": 395 }, { "epoch": 0.08, "grad_norm": 1.9094047546386719, "learning_rate": 9.900274305478887e-05, "loss": 3.7496, "step": 400 }, { "epoch": 0.08, "grad_norm": 1.7309576272964478, "learning_rate": 9.896933976594572e-05, "loss": 3.7648, "step": 405 }, { "epoch": 0.09, "grad_norm": 2.0965123176574707, "learning_rate": 9.893539205293989e-05, "loss": 3.6743, "step": 410 }, { "epoch": 0.09, "grad_norm": 1.9884498119354248, "learning_rate": 9.890090029319028e-05, "loss": 3.6636, "step": 415 }, { "epoch": 0.09, "grad_norm": 1.9006081819534302, "learning_rate": 9.886586487016433e-05, "loss": 3.6416, "step": 420 }, { "epoch": 0.09, "grad_norm": 1.8273619413375854, "learning_rate": 9.883028617337378e-05, "loss": 3.6083, "step": 425 }, { "epoch": 0.09, "grad_norm": 2.13328218460083, "learning_rate": 9.879416459837022e-05, "loss": 3.6626, "step": 430 }, { "epoch": 0.09, "grad_norm": 1.8524399995803833, "learning_rate": 9.875750054674082e-05, "loss": 3.6508, "step": 435 }, { "epoch": 0.09, "grad_norm": 1.8650786876678467, "learning_rate": 9.872029442610382e-05, "loss": 3.6299, "step": 440 }, { "epoch": 0.09, "grad_norm": 1.6650742292404175, "learning_rate": 9.8682546650104e-05, "loss": 3.5859, "step": 445 }, { "epoch": 0.09, "grad_norm": 1.5603471994400024, "learning_rate": 9.864425763840802e-05, "loss": 3.6087, "step": 450 }, { "epoch": 0.09, "grad_norm": 1.5650691986083984, "learning_rate": 9.860542781669988e-05, "loss": 3.6397, "step": 455 }, { "epoch": 0.1, "grad_norm": 1.7188957929611206, "learning_rate": 9.85660576166761e-05, "loss": 3.6476, "step": 460 }, { "epoch": 0.1, "grad_norm": 1.6306991577148438, "learning_rate": 9.852614747604093e-05, "loss": 3.6055, "step": 465 }, { "epoch": 0.1, "grad_norm": 2.1953701972961426, "learning_rate": 9.848569783850145e-05, "loss": 3.5826, "step": 470 }, { "epoch": 0.1, "grad_norm": 2.1616299152374268, "learning_rate": 9.844470915376278e-05, "loss": 3.6174, "step": 475 }, { "epoch": 0.1, "grad_norm": 1.7074434757232666, "learning_rate": 9.840318187752292e-05, "loss": 3.4243, "step": 480 }, { "epoch": 0.1, "grad_norm": 1.6088547706604004, "learning_rate": 9.836111647146771e-05, "loss": 3.4957, "step": 485 }, { "epoch": 0.1, "grad_norm": 1.8437894582748413, "learning_rate": 9.831851340326577e-05, "loss": 3.5407, "step": 490 }, { "epoch": 0.1, "grad_norm": 1.6059656143188477, "learning_rate": 9.82753731465633e-05, "loss": 3.4262, "step": 495 }, { "epoch": 0.1, "grad_norm": 1.987560510635376, "learning_rate": 9.823169618097871e-05, "loss": 3.5446, "step": 500 }, { "epoch": 0.1, "grad_norm": 1.5124210119247437, "learning_rate": 9.81874829920974e-05, "loss": 3.4832, "step": 505 }, { "epoch": 0.11, "grad_norm": 1.5314714908599854, "learning_rate": 9.814273407146623e-05, "loss": 3.4532, "step": 510 }, { "epoch": 0.11, "grad_norm": 1.6071770191192627, "learning_rate": 9.809744991658829e-05, "loss": 3.4115, "step": 515 }, { "epoch": 0.11, "grad_norm": 1.7526746988296509, "learning_rate": 9.805163103091708e-05, "loss": 3.5478, "step": 520 }, { "epoch": 0.11, "grad_norm": 1.570514440536499, "learning_rate": 9.800527792385112e-05, "loss": 3.4841, "step": 525 }, { "epoch": 0.11, "grad_norm": 1.8370064496994019, "learning_rate": 9.79583911107282e-05, "loss": 3.4776, "step": 530 }, { "epoch": 0.11, "grad_norm": 1.590645432472229, "learning_rate": 9.791097111281968e-05, "loss": 3.399, "step": 535 }, { "epoch": 0.11, "grad_norm": 1.5733847618103027, "learning_rate": 9.786301845732467e-05, "loss": 3.3824, "step": 540 }, { "epoch": 0.11, "grad_norm": 1.6162751913070679, "learning_rate": 9.781453367736418e-05, "loss": 3.3776, "step": 545 }, { "epoch": 0.11, "grad_norm": 1.5934056043624878, "learning_rate": 9.776551731197524e-05, "loss": 3.342, "step": 550 }, { "epoch": 0.12, "grad_norm": 1.523496150970459, "learning_rate": 9.771596990610478e-05, "loss": 3.3919, "step": 555 }, { "epoch": 0.12, "grad_norm": 1.703430414199829, "learning_rate": 9.766589201060372e-05, "loss": 3.3063, "step": 560 }, { "epoch": 0.12, "grad_norm": 1.5598702430725098, "learning_rate": 9.761528418222077e-05, "loss": 3.3171, "step": 565 }, { "epoch": 0.12, "grad_norm": 1.7453784942626953, "learning_rate": 9.756414698359624e-05, "loss": 3.379, "step": 570 }, { "epoch": 0.12, "grad_norm": 1.4527804851531982, "learning_rate": 9.75124809832558e-05, "loss": 3.397, "step": 575 }, { "epoch": 0.12, "grad_norm": 1.5606558322906494, "learning_rate": 9.746028675560413e-05, "loss": 3.3644, "step": 580 }, { "epoch": 0.12, "grad_norm": 1.6888071298599243, "learning_rate": 9.740756488091861e-05, "loss": 3.2505, "step": 585 }, { "epoch": 0.12, "grad_norm": 1.5528062582015991, "learning_rate": 9.735431594534277e-05, "loss": 3.3397, "step": 590 }, { "epoch": 0.12, "grad_norm": 1.6364246606826782, "learning_rate": 9.730054054087983e-05, "loss": 3.3461, "step": 595 }, { "epoch": 0.12, "grad_norm": 1.4486875534057617, "learning_rate": 9.724623926538612e-05, "loss": 3.2633, "step": 600 }, { "epoch": 0.13, "grad_norm": 1.5004665851593018, "learning_rate": 9.719141272256443e-05, "loss": 3.2701, "step": 605 }, { "epoch": 0.13, "grad_norm": 1.489261269569397, "learning_rate": 9.713606152195726e-05, "loss": 3.267, "step": 610 }, { "epoch": 0.13, "grad_norm": 1.4302242994308472, "learning_rate": 9.708018627894011e-05, "loss": 3.3144, "step": 615 }, { "epoch": 0.13, "grad_norm": 1.3415112495422363, "learning_rate": 9.702378761471456e-05, "loss": 3.2477, "step": 620 }, { "epoch": 0.13, "grad_norm": 1.5592623949050903, "learning_rate": 9.696686615630146e-05, "loss": 3.2573, "step": 625 }, { "epoch": 0.13, "grad_norm": 1.563595175743103, "learning_rate": 9.690942253653385e-05, "loss": 3.2603, "step": 630 }, { "epoch": 0.13, "grad_norm": 1.4028782844543457, "learning_rate": 9.685145739405002e-05, "loss": 3.3102, "step": 635 }, { "epoch": 0.13, "grad_norm": 1.8662109375, "learning_rate": 9.679297137328634e-05, "loss": 3.2572, "step": 640 }, { "epoch": 0.13, "grad_norm": 1.7463732957839966, "learning_rate": 9.673396512447013e-05, "loss": 3.2952, "step": 645 }, { "epoch": 0.14, "grad_norm": 1.4323670864105225, "learning_rate": 9.667443930361247e-05, "loss": 3.2638, "step": 650 }, { "epoch": 0.14, "grad_norm": 1.3809984922409058, "learning_rate": 9.661439457250076e-05, "loss": 3.2625, "step": 655 }, { "epoch": 0.14, "grad_norm": 1.5374635457992554, "learning_rate": 9.655383159869158e-05, "loss": 3.29, "step": 660 }, { "epoch": 0.14, "grad_norm": 1.4163179397583008, "learning_rate": 9.649275105550309e-05, "loss": 3.1998, "step": 665 }, { "epoch": 0.14, "grad_norm": 1.4354526996612549, "learning_rate": 9.643115362200762e-05, "loss": 3.2943, "step": 670 }, { "epoch": 0.14, "grad_norm": 1.4390583038330078, "learning_rate": 9.636903998302409e-05, "loss": 3.2955, "step": 675 }, { "epoch": 0.14, "grad_norm": 1.4252122640609741, "learning_rate": 9.630641082911045e-05, "loss": 3.2063, "step": 680 }, { "epoch": 0.14, "grad_norm": 1.3900178670883179, "learning_rate": 9.624326685655593e-05, "loss": 3.2456, "step": 685 }, { "epoch": 0.14, "grad_norm": 1.5450825691223145, "learning_rate": 9.617960876737337e-05, "loss": 3.2088, "step": 690 }, { "epoch": 0.14, "grad_norm": 1.4761152267456055, "learning_rate": 9.611543726929134e-05, "loss": 3.1838, "step": 695 }, { "epoch": 0.15, "grad_norm": 1.3841191530227661, "learning_rate": 9.605075307574635e-05, "loss": 3.147, "step": 700 }, { "epoch": 0.15, "grad_norm": 1.4704729318618774, "learning_rate": 9.598555690587487e-05, "loss": 3.1682, "step": 705 }, { "epoch": 0.15, "grad_norm": 1.2939262390136719, "learning_rate": 9.591984948450532e-05, "loss": 3.1114, "step": 710 }, { "epoch": 0.15, "grad_norm": 1.6519654989242554, "learning_rate": 9.585363154215008e-05, "loss": 3.2082, "step": 715 }, { "epoch": 0.15, "grad_norm": 1.2991009950637817, "learning_rate": 9.578690381499728e-05, "loss": 3.1081, "step": 720 }, { "epoch": 0.15, "grad_norm": 1.2765209674835205, "learning_rate": 9.571966704490271e-05, "loss": 3.1507, "step": 725 }, { "epoch": 0.15, "grad_norm": 1.2271565198898315, "learning_rate": 9.565192197938148e-05, "loss": 3.202, "step": 730 }, { "epoch": 0.15, "grad_norm": 1.3048691749572754, "learning_rate": 9.558366937159977e-05, "loss": 3.1312, "step": 735 }, { "epoch": 0.15, "grad_norm": 1.8889031410217285, "learning_rate": 9.551490998036646e-05, "loss": 3.1106, "step": 740 }, { "epoch": 0.15, "grad_norm": 1.3917661905288696, "learning_rate": 9.544564457012463e-05, "loss": 3.1339, "step": 745 }, { "epoch": 0.16, "grad_norm": 1.3188519477844238, "learning_rate": 9.537587391094314e-05, "loss": 3.1738, "step": 750 }, { "epoch": 0.16, "grad_norm": 1.403839111328125, "learning_rate": 9.5305598778508e-05, "loss": 3.1675, "step": 755 }, { "epoch": 0.16, "grad_norm": 1.42656409740448, "learning_rate": 9.52348199541138e-05, "loss": 3.1208, "step": 760 }, { "epoch": 0.16, "grad_norm": 1.6658726930618286, "learning_rate": 9.516353822465504e-05, "loss": 3.0526, "step": 765 }, { "epoch": 0.16, "grad_norm": 1.266996145248413, "learning_rate": 9.509175438261726e-05, "loss": 3.0473, "step": 770 }, { "epoch": 0.16, "grad_norm": 1.3163378238677979, "learning_rate": 9.501946922606838e-05, "loss": 3.0853, "step": 775 }, { "epoch": 0.16, "grad_norm": 1.3675296306610107, "learning_rate": 9.494668355864973e-05, "loss": 3.0912, "step": 780 }, { "epoch": 0.16, "grad_norm": 1.2658545970916748, "learning_rate": 9.487339818956716e-05, "loss": 3.1622, "step": 785 }, { "epoch": 0.16, "grad_norm": 1.3492169380187988, "learning_rate": 9.479961393358203e-05, "loss": 3.1041, "step": 790 }, { "epoch": 0.17, "grad_norm": 1.3128236532211304, "learning_rate": 9.472533161100215e-05, "loss": 3.0637, "step": 795 }, { "epoch": 0.17, "grad_norm": 1.3049265146255493, "learning_rate": 9.465055204767265e-05, "loss": 3.0836, "step": 800 }, { "epoch": 0.17, "grad_norm": 1.2383729219436646, "learning_rate": 9.457527607496685e-05, "loss": 3.0421, "step": 805 }, { "epoch": 0.17, "grad_norm": 1.2715247869491577, "learning_rate": 9.44995045297769e-05, "loss": 3.0051, "step": 810 }, { "epoch": 0.17, "grad_norm": 1.276710867881775, "learning_rate": 9.442323825450464e-05, "loss": 3.0368, "step": 815 }, { "epoch": 0.17, "grad_norm": 1.3114001750946045, "learning_rate": 9.43464780970521e-05, "loss": 3.06, "step": 820 }, { "epoch": 0.17, "grad_norm": 1.2944623231887817, "learning_rate": 9.426922491081212e-05, "loss": 3.084, "step": 825 }, { "epoch": 0.17, "grad_norm": 1.2431820631027222, "learning_rate": 9.419147955465888e-05, "loss": 3.0199, "step": 830 }, { "epoch": 0.17, "grad_norm": 1.3521535396575928, "learning_rate": 9.411324289293832e-05, "loss": 3.0438, "step": 835 }, { "epoch": 0.17, "grad_norm": 1.3010791540145874, "learning_rate": 9.403451579545859e-05, "loss": 3.0329, "step": 840 }, { "epoch": 0.18, "grad_norm": 1.430463194847107, "learning_rate": 9.395529913748025e-05, "loss": 3.003, "step": 845 }, { "epoch": 0.18, "grad_norm": 19.969221115112305, "learning_rate": 9.387559379970672e-05, "loss": 3.0045, "step": 850 }, { "epoch": 0.18, "grad_norm": 1.2505444288253784, "learning_rate": 9.379540066827431e-05, "loss": 3.023, "step": 855 }, { "epoch": 0.18, "grad_norm": 1.25497305393219, "learning_rate": 9.371472063474248e-05, "loss": 3.0441, "step": 860 }, { "epoch": 0.18, "grad_norm": 1.1969456672668457, "learning_rate": 9.363355459608394e-05, "loss": 2.9503, "step": 865 }, { "epoch": 0.18, "grad_norm": 1.1699817180633545, "learning_rate": 9.355190345467457e-05, "loss": 2.9636, "step": 870 }, { "epoch": 0.18, "grad_norm": 1.318147897720337, "learning_rate": 9.346976811828352e-05, "loss": 2.9839, "step": 875 }, { "epoch": 0.18, "grad_norm": 1.2377957105636597, "learning_rate": 9.338714950006297e-05, "loss": 3.065, "step": 880 }, { "epoch": 0.18, "grad_norm": 1.4408152103424072, "learning_rate": 9.330404851853817e-05, "loss": 2.9526, "step": 885 }, { "epoch": 0.18, "grad_norm": 1.2080788612365723, "learning_rate": 9.3220466097597e-05, "loss": 3.0015, "step": 890 }, { "epoch": 0.19, "grad_norm": 1.2468167543411255, "learning_rate": 9.313640316647991e-05, "loss": 3.0341, "step": 895 }, { "epoch": 0.19, "grad_norm": 1.2537471055984497, "learning_rate": 9.305186065976945e-05, "loss": 3.0205, "step": 900 }, { "epoch": 0.19, "grad_norm": 1.183099389076233, "learning_rate": 9.296683951737993e-05, "loss": 3.0408, "step": 905 }, { "epoch": 0.19, "grad_norm": 1.1875871419906616, "learning_rate": 9.288134068454697e-05, "loss": 2.9291, "step": 910 }, { "epoch": 0.19, "grad_norm": 1.2846369743347168, "learning_rate": 9.2795365111817e-05, "loss": 2.9141, "step": 915 }, { "epoch": 0.19, "grad_norm": 1.204579472541809, "learning_rate": 9.270891375503665e-05, "loss": 2.9797, "step": 920 }, { "epoch": 0.19, "grad_norm": 1.171580195426941, "learning_rate": 9.262198757534218e-05, "loss": 2.9523, "step": 925 }, { "epoch": 0.19, "grad_norm": 1.176835298538208, "learning_rate": 9.253458753914874e-05, "loss": 2.9909, "step": 930 }, { "epoch": 0.19, "grad_norm": 1.2722320556640625, "learning_rate": 9.244671461813969e-05, "loss": 2.9957, "step": 935 }, { "epoch": 0.2, "grad_norm": 1.291190505027771, "learning_rate": 9.235836978925572e-05, "loss": 3.0211, "step": 940 }, { "epoch": 0.2, "grad_norm": 1.1778265237808228, "learning_rate": 9.226955403468406e-05, "loss": 2.9882, "step": 945 }, { "epoch": 0.2, "grad_norm": 1.176194667816162, "learning_rate": 9.21802683418475e-05, "loss": 2.938, "step": 950 }, { "epoch": 0.2, "grad_norm": 1.1951394081115723, "learning_rate": 9.209051370339347e-05, "loss": 2.961, "step": 955 }, { "epoch": 0.2, "grad_norm": 1.1604056358337402, "learning_rate": 9.200029111718295e-05, "loss": 2.8819, "step": 960 }, { "epoch": 0.2, "grad_norm": 1.2189394235610962, "learning_rate": 9.190960158627941e-05, "loss": 2.9453, "step": 965 }, { "epoch": 0.2, "grad_norm": 1.1493133306503296, "learning_rate": 9.181844611893766e-05, "loss": 2.9281, "step": 970 }, { "epoch": 0.2, "grad_norm": 1.230989933013916, "learning_rate": 9.172682572859261e-05, "loss": 2.9844, "step": 975 }, { "epoch": 0.2, "grad_norm": 1.1456234455108643, "learning_rate": 9.163474143384806e-05, "loss": 3.028, "step": 980 }, { "epoch": 0.2, "grad_norm": 1.1728698015213013, "learning_rate": 9.154219425846528e-05, "loss": 2.9739, "step": 985 }, { "epoch": 0.21, "grad_norm": 1.1567491292953491, "learning_rate": 9.144918523135175e-05, "loss": 2.857, "step": 990 }, { "epoch": 0.21, "grad_norm": 1.1249184608459473, "learning_rate": 9.13557153865496e-05, "loss": 2.9362, "step": 995 }, { "epoch": 0.21, "grad_norm": 1.1881022453308105, "learning_rate": 9.12617857632242e-05, "loss": 2.9454, "step": 1000 }, { "epoch": 0.21, "grad_norm": 1.1303149461746216, "learning_rate": 9.116739740565259e-05, "loss": 2.8987, "step": 1005 }, { "epoch": 0.21, "grad_norm": 1.099196434020996, "learning_rate": 9.107255136321184e-05, "loss": 2.8858, "step": 1010 }, { "epoch": 0.21, "grad_norm": 1.1457173824310303, "learning_rate": 9.09772486903674e-05, "loss": 2.871, "step": 1015 }, { "epoch": 0.21, "grad_norm": 1.095299482345581, "learning_rate": 9.08814904466614e-05, "loss": 2.8584, "step": 1020 }, { "epoch": 0.21, "grad_norm": 1.2056212425231934, "learning_rate": 9.078527769670085e-05, "loss": 2.9021, "step": 1025 }, { "epoch": 0.21, "grad_norm": 1.2899084091186523, "learning_rate": 9.068861151014575e-05, "loss": 2.9313, "step": 1030 }, { "epoch": 0.22, "grad_norm": 1.143584132194519, "learning_rate": 9.05914929616973e-05, "loss": 2.8577, "step": 1035 }, { "epoch": 0.22, "grad_norm": 1.1004834175109863, "learning_rate": 9.04939231310859e-05, "loss": 2.9285, "step": 1040 }, { "epoch": 0.22, "grad_norm": 1.25986909866333, "learning_rate": 9.039590310305914e-05, "loss": 2.8859, "step": 1045 }, { "epoch": 0.22, "grad_norm": 1.3628671169281006, "learning_rate": 9.029743396736974e-05, "loss": 2.8753, "step": 1050 }, { "epoch": 0.22, "grad_norm": 1.0711766481399536, "learning_rate": 9.019851681876348e-05, "loss": 2.9071, "step": 1055 }, { "epoch": 0.22, "grad_norm": 1.0614064931869507, "learning_rate": 9.009915275696693e-05, "loss": 2.8429, "step": 1060 }, { "epoch": 0.22, "grad_norm": 1.2664223909378052, "learning_rate": 8.999934288667534e-05, "loss": 2.8943, "step": 1065 }, { "epoch": 0.22, "grad_norm": 1.1698609590530396, "learning_rate": 8.989908831754028e-05, "loss": 2.8691, "step": 1070 }, { "epoch": 0.22, "grad_norm": 1.1064506769180298, "learning_rate": 8.979839016415735e-05, "loss": 2.8696, "step": 1075 }, { "epoch": 0.22, "grad_norm": 1.049668788909912, "learning_rate": 8.969724954605373e-05, "loss": 2.8282, "step": 1080 }, { "epoch": 0.23, "grad_norm": 1.2249704599380493, "learning_rate": 8.959566758767581e-05, "loss": 2.7954, "step": 1085 }, { "epoch": 0.23, "grad_norm": 1.0685900449752808, "learning_rate": 8.949364541837661e-05, "loss": 2.8621, "step": 1090 }, { "epoch": 0.23, "grad_norm": 1.3482073545455933, "learning_rate": 8.939118417240329e-05, "loss": 2.8579, "step": 1095 }, { "epoch": 0.23, "grad_norm": 1.239876389503479, "learning_rate": 8.92882849888845e-05, "loss": 2.812, "step": 1100 }, { "epoch": 0.23, "grad_norm": 1.229067087173462, "learning_rate": 8.918494901181773e-05, "loss": 2.8575, "step": 1105 }, { "epoch": 0.23, "grad_norm": 1.3079419136047363, "learning_rate": 8.908117739005659e-05, "loss": 2.848, "step": 1110 }, { "epoch": 0.23, "grad_norm": 1.0963923931121826, "learning_rate": 8.897697127729805e-05, "loss": 2.8002, "step": 1115 }, { "epoch": 0.23, "grad_norm": 1.166383147239685, "learning_rate": 8.887233183206957e-05, "loss": 2.8857, "step": 1120 }, { "epoch": 0.23, "grad_norm": 1.1164528131484985, "learning_rate": 8.876726021771627e-05, "loss": 2.8646, "step": 1125 }, { "epoch": 0.23, "grad_norm": 1.123231291770935, "learning_rate": 8.866175760238798e-05, "loss": 2.8596, "step": 1130 }, { "epoch": 0.24, "grad_norm": 1.088853359222412, "learning_rate": 8.855582515902625e-05, "loss": 2.8162, "step": 1135 }, { "epoch": 0.24, "grad_norm": 1.1311272382736206, "learning_rate": 8.844946406535131e-05, "loss": 2.8188, "step": 1140 }, { "epoch": 0.24, "grad_norm": 1.2540462017059326, "learning_rate": 8.834267550384893e-05, "loss": 2.836, "step": 1145 }, { "epoch": 0.24, "grad_norm": 1.156281590461731, "learning_rate": 8.823546066175741e-05, "loss": 2.8382, "step": 1150 }, { "epoch": 0.24, "grad_norm": 1.1692214012145996, "learning_rate": 8.81278207310542e-05, "loss": 2.8032, "step": 1155 }, { "epoch": 0.24, "grad_norm": 1.1173639297485352, "learning_rate": 8.801975690844278e-05, "loss": 2.7956, "step": 1160 }, { "epoch": 0.24, "grad_norm": 1.0640108585357666, "learning_rate": 8.791127039533934e-05, "loss": 2.8234, "step": 1165 }, { "epoch": 0.24, "grad_norm": 1.5127233266830444, "learning_rate": 8.780236239785935e-05, "loss": 2.8631, "step": 1170 }, { "epoch": 0.24, "grad_norm": 1.0999369621276855, "learning_rate": 8.76930341268042e-05, "loss": 2.8476, "step": 1175 }, { "epoch": 0.25, "grad_norm": 1.0751299858093262, "learning_rate": 8.758328679764776e-05, "loss": 2.7966, "step": 1180 }, { "epoch": 0.25, "grad_norm": 1.1676278114318848, "learning_rate": 8.747312163052284e-05, "loss": 2.8598, "step": 1185 }, { "epoch": 0.25, "grad_norm": 1.0812705755233765, "learning_rate": 8.736253985020761e-05, "loss": 2.8554, "step": 1190 }, { "epoch": 0.25, "grad_norm": 1.0541287660598755, "learning_rate": 8.725154268611203e-05, "loss": 2.8371, "step": 1195 }, { "epoch": 0.25, "grad_norm": 1.0639691352844238, "learning_rate": 8.714013137226411e-05, "loss": 2.7449, "step": 1200 }, { "epoch": 0.25, "grad_norm": 1.0529950857162476, "learning_rate": 8.702830714729628e-05, "loss": 2.8088, "step": 1205 }, { "epoch": 0.25, "grad_norm": 1.1315858364105225, "learning_rate": 8.691607125443153e-05, "loss": 2.8446, "step": 1210 }, { "epoch": 0.25, "grad_norm": 1.0378223657608032, "learning_rate": 8.680342494146967e-05, "loss": 2.8384, "step": 1215 }, { "epoch": 0.25, "grad_norm": 1.0306965112686157, "learning_rate": 8.66903694607734e-05, "loss": 2.847, "step": 1220 }, { "epoch": 0.25, "grad_norm": 1.0947153568267822, "learning_rate": 8.65769060692544e-05, "loss": 2.8501, "step": 1225 }, { "epoch": 0.26, "grad_norm": 1.1506305932998657, "learning_rate": 8.646303602835936e-05, "loss": 2.8494, "step": 1230 }, { "epoch": 0.26, "grad_norm": 1.056742548942566, "learning_rate": 8.634876060405597e-05, "loss": 2.8271, "step": 1235 }, { "epoch": 0.26, "grad_norm": 1.2626043558120728, "learning_rate": 8.623408106681884e-05, "loss": 2.7732, "step": 1240 }, { "epoch": 0.26, "grad_norm": 1.2176762819290161, "learning_rate": 8.611899869161535e-05, "loss": 2.8197, "step": 1245 }, { "epoch": 0.26, "grad_norm": 1.0657603740692139, "learning_rate": 8.600351475789147e-05, "loss": 2.7657, "step": 1250 }, { "epoch": 0.26, "grad_norm": 1.0771147012710571, "learning_rate": 8.588763054955764e-05, "loss": 2.8071, "step": 1255 }, { "epoch": 0.26, "grad_norm": 1.0495407581329346, "learning_rate": 8.57713473549743e-05, "loss": 2.7559, "step": 1260 }, { "epoch": 0.26, "grad_norm": 1.0748529434204102, "learning_rate": 8.565466646693778e-05, "loss": 2.7885, "step": 1265 }, { "epoch": 0.26, "grad_norm": 1.1235424280166626, "learning_rate": 8.553758918266578e-05, "loss": 2.8258, "step": 1270 }, { "epoch": 0.27, "grad_norm": 1.035912036895752, "learning_rate": 8.5420116803783e-05, "loss": 2.7846, "step": 1275 }, { "epoch": 0.27, "grad_norm": 1.0870811939239502, "learning_rate": 8.530225063630668e-05, "loss": 2.8605, "step": 1280 }, { "epoch": 0.27, "grad_norm": 1.0321590900421143, "learning_rate": 8.518399199063205e-05, "loss": 2.7339, "step": 1285 }, { "epoch": 0.27, "grad_norm": 1.1152360439300537, "learning_rate": 8.50653421815178e-05, "loss": 2.8021, "step": 1290 }, { "epoch": 0.27, "grad_norm": 1.0527422428131104, "learning_rate": 8.494630252807138e-05, "loss": 2.8167, "step": 1295 }, { "epoch": 0.27, "grad_norm": 1.065908432006836, "learning_rate": 8.482687435373449e-05, "loss": 2.7533, "step": 1300 }, { "epoch": 0.27, "grad_norm": 1.055224895477295, "learning_rate": 8.470705898626817e-05, "loss": 2.746, "step": 1305 }, { "epoch": 0.27, "grad_norm": 1.0720412731170654, "learning_rate": 8.458685775773822e-05, "loss": 2.7908, "step": 1310 }, { "epoch": 0.27, "grad_norm": 1.1304291486740112, "learning_rate": 8.446627200450025e-05, "loss": 2.7551, "step": 1315 }, { "epoch": 0.27, "grad_norm": 1.0969698429107666, "learning_rate": 8.434530306718493e-05, "loss": 2.755, "step": 1320 }, { "epoch": 0.28, "grad_norm": 1.0863304138183594, "learning_rate": 8.4223952290683e-05, "loss": 2.8039, "step": 1325 }, { "epoch": 0.28, "grad_norm": 1.048060655593872, "learning_rate": 8.41022210241304e-05, "loss": 2.7631, "step": 1330 }, { "epoch": 0.28, "grad_norm": 1.062782645225525, "learning_rate": 8.398011062089316e-05, "loss": 2.8196, "step": 1335 }, { "epoch": 0.28, "grad_norm": 1.044792890548706, "learning_rate": 8.385762243855249e-05, "loss": 2.7331, "step": 1340 }, { "epoch": 0.28, "grad_norm": 1.0595911741256714, "learning_rate": 8.373475783888958e-05, "loss": 2.7563, "step": 1345 }, { "epoch": 0.28, "grad_norm": 1.037611961364746, "learning_rate": 8.36115181878705e-05, "loss": 2.7296, "step": 1350 }, { "epoch": 0.28, "grad_norm": 1.0513596534729004, "learning_rate": 8.348790485563101e-05, "loss": 2.7668, "step": 1355 }, { "epoch": 0.28, "grad_norm": 1.0393058061599731, "learning_rate": 8.336391921646134e-05, "loss": 2.7692, "step": 1360 }, { "epoch": 0.28, "grad_norm": 1.0555886030197144, "learning_rate": 8.323956264879089e-05, "loss": 2.7927, "step": 1365 }, { "epoch": 0.28, "grad_norm": 1.018600583076477, "learning_rate": 8.311483653517294e-05, "loss": 2.7877, "step": 1370 }, { "epoch": 0.29, "grad_norm": 1.0081210136413574, "learning_rate": 8.298974226226919e-05, "loss": 2.7696, "step": 1375 }, { "epoch": 0.29, "grad_norm": 1.0363234281539917, "learning_rate": 8.28642812208345e-05, "loss": 2.7195, "step": 1380 }, { "epoch": 0.29, "grad_norm": 1.0745787620544434, "learning_rate": 8.273845480570123e-05, "loss": 2.6861, "step": 1385 }, { "epoch": 0.29, "grad_norm": 1.073278784751892, "learning_rate": 8.26122644157639e-05, "loss": 2.719, "step": 1390 }, { "epoch": 0.29, "grad_norm": 1.0157283544540405, "learning_rate": 8.248571145396362e-05, "loss": 2.7545, "step": 1395 }, { "epoch": 0.29, "grad_norm": 1.1474837064743042, "learning_rate": 8.235879732727236e-05, "loss": 2.8279, "step": 1400 }, { "epoch": 0.29, "grad_norm": 1.0333293676376343, "learning_rate": 8.223152344667745e-05, "loss": 2.7702, "step": 1405 }, { "epoch": 0.29, "grad_norm": 0.987920880317688, "learning_rate": 8.21038912271658e-05, "loss": 2.7351, "step": 1410 }, { "epoch": 0.29, "grad_norm": 1.0156697034835815, "learning_rate": 8.197590208770824e-05, "loss": 2.7674, "step": 1415 }, { "epoch": 0.3, "grad_norm": 1.0461132526397705, "learning_rate": 8.184755745124371e-05, "loss": 2.745, "step": 1420 }, { "epoch": 0.3, "grad_norm": 1.0527997016906738, "learning_rate": 8.171885874466342e-05, "loss": 2.799, "step": 1425 }, { "epoch": 0.3, "grad_norm": 1.162582278251648, "learning_rate": 8.158980739879507e-05, "loss": 2.7565, "step": 1430 }, { "epoch": 0.3, "grad_norm": 1.0373525619506836, "learning_rate": 8.146040484838677e-05, "loss": 2.7538, "step": 1435 }, { "epoch": 0.3, "grad_norm": 1.0482909679412842, "learning_rate": 8.133065253209132e-05, "loss": 2.8521, "step": 1440 }, { "epoch": 0.3, "grad_norm": 1.0852783918380737, "learning_rate": 8.120055189245e-05, "loss": 2.7346, "step": 1445 }, { "epoch": 0.3, "grad_norm": 1.078517198562622, "learning_rate": 8.10701043758767e-05, "loss": 2.6777, "step": 1450 }, { "epoch": 0.3, "grad_norm": 1.0317991971969604, "learning_rate": 8.093931143264174e-05, "loss": 2.7016, "step": 1455 }, { "epoch": 0.3, "grad_norm": 0.9878939986228943, "learning_rate": 8.080817451685576e-05, "loss": 2.8025, "step": 1460 }, { "epoch": 0.3, "grad_norm": 0.9930393695831299, "learning_rate": 8.067669508645356e-05, "loss": 2.735, "step": 1465 }, { "epoch": 0.31, "grad_norm": 1.0263429880142212, "learning_rate": 8.054487460317797e-05, "loss": 2.7556, "step": 1470 }, { "epoch": 0.31, "grad_norm": 1.0251388549804688, "learning_rate": 8.041271453256345e-05, "loss": 2.7951, "step": 1475 }, { "epoch": 0.31, "grad_norm": 1.0688138008117676, "learning_rate": 8.02802163439199e-05, "loss": 2.6784, "step": 1480 }, { "epoch": 0.31, "grad_norm": 1.0590801239013672, "learning_rate": 8.01473815103163e-05, "loss": 2.7043, "step": 1485 }, { "epoch": 0.31, "grad_norm": 1.0488477945327759, "learning_rate": 8.001421150856434e-05, "loss": 2.7299, "step": 1490 }, { "epoch": 0.31, "grad_norm": 1.0498743057250977, "learning_rate": 7.988070781920197e-05, "loss": 2.7261, "step": 1495 }, { "epoch": 0.31, "grad_norm": 1.0502474308013916, "learning_rate": 7.9746871926477e-05, "loss": 2.756, "step": 1500 }, { "epoch": 0.31, "grad_norm": 1.0085452795028687, "learning_rate": 7.961270531833052e-05, "loss": 2.7687, "step": 1505 }, { "epoch": 0.31, "grad_norm": 1.0181148052215576, "learning_rate": 7.947820948638045e-05, "loss": 2.7631, "step": 1510 }, { "epoch": 0.31, "grad_norm": 0.9904955625534058, "learning_rate": 7.934338592590486e-05, "loss": 2.7268, "step": 1515 }, { "epoch": 0.32, "grad_norm": 1.027920126914978, "learning_rate": 7.92082361358254e-05, "loss": 2.7101, "step": 1520 }, { "epoch": 0.32, "grad_norm": 1.058929443359375, "learning_rate": 7.907276161869065e-05, "loss": 2.7147, "step": 1525 }, { "epoch": 0.32, "grad_norm": 1.049456238746643, "learning_rate": 7.893696388065936e-05, "loss": 2.7167, "step": 1530 }, { "epoch": 0.32, "grad_norm": 1.0867774486541748, "learning_rate": 7.88008444314838e-05, "loss": 2.6422, "step": 1535 }, { "epoch": 0.32, "grad_norm": 0.9677130579948425, "learning_rate": 7.866440478449283e-05, "loss": 2.6352, "step": 1540 }, { "epoch": 0.32, "grad_norm": 1.0104931592941284, "learning_rate": 7.852764645657522e-05, "loss": 2.8134, "step": 1545 }, { "epoch": 0.32, "grad_norm": 0.9881729483604431, "learning_rate": 7.839057096816271e-05, "loss": 2.7757, "step": 1550 }, { "epoch": 0.32, "grad_norm": 1.0977373123168945, "learning_rate": 7.82531798432131e-05, "loss": 2.7072, "step": 1555 }, { "epoch": 0.32, "grad_norm": 1.0084450244903564, "learning_rate": 7.811547460919333e-05, "loss": 2.6915, "step": 1560 }, { "epoch": 0.33, "grad_norm": 0.9954885840415955, "learning_rate": 7.797745679706254e-05, "loss": 2.7586, "step": 1565 }, { "epoch": 0.33, "grad_norm": 1.036213755607605, "learning_rate": 7.783912794125496e-05, "loss": 2.6901, "step": 1570 }, { "epoch": 0.33, "grad_norm": 1.0960286855697632, "learning_rate": 7.770048957966291e-05, "loss": 2.7356, "step": 1575 }, { "epoch": 0.33, "grad_norm": 1.081369161605835, "learning_rate": 7.756154325361967e-05, "loss": 2.6979, "step": 1580 }, { "epoch": 0.33, "grad_norm": 1.055607795715332, "learning_rate": 7.74222905078824e-05, "loss": 2.7447, "step": 1585 }, { "epoch": 0.33, "grad_norm": 0.969778299331665, "learning_rate": 7.728273289061489e-05, "loss": 2.6753, "step": 1590 }, { "epoch": 0.33, "grad_norm": 0.9912227392196655, "learning_rate": 7.714287195337044e-05, "loss": 2.7152, "step": 1595 }, { "epoch": 0.33, "grad_norm": 0.969498872756958, "learning_rate": 7.700270925107448e-05, "loss": 2.7074, "step": 1600 }, { "epoch": 0.33, "grad_norm": 0.9954387545585632, "learning_rate": 7.686224634200742e-05, "loss": 2.7113, "step": 1605 }, { "epoch": 0.33, "grad_norm": 1.036186933517456, "learning_rate": 7.672148478778722e-05, "loss": 2.6632, "step": 1610 }, { "epoch": 0.34, "grad_norm": 0.9863171577453613, "learning_rate": 7.658042615335212e-05, "loss": 2.7085, "step": 1615 }, { "epoch": 0.34, "grad_norm": 0.9429605603218079, "learning_rate": 7.643907200694318e-05, "loss": 2.6517, "step": 1620 }, { "epoch": 0.34, "grad_norm": 1.014510154724121, "learning_rate": 7.629742392008684e-05, "loss": 2.6171, "step": 1625 }, { "epoch": 0.34, "grad_norm": 0.9857035279273987, "learning_rate": 7.615548346757749e-05, "loss": 2.6967, "step": 1630 }, { "epoch": 0.34, "grad_norm": 1.0688436031341553, "learning_rate": 7.60132522274599e-05, "loss": 2.7761, "step": 1635 }, { "epoch": 0.34, "grad_norm": 0.9699895977973938, "learning_rate": 7.587073178101178e-05, "loss": 2.7205, "step": 1640 }, { "epoch": 0.34, "grad_norm": 1.1228586435317993, "learning_rate": 7.572792371272609e-05, "loss": 2.6913, "step": 1645 }, { "epoch": 0.34, "grad_norm": 1.030258297920227, "learning_rate": 7.55848296102935e-05, "loss": 2.6817, "step": 1650 }, { "epoch": 0.34, "grad_norm": 1.0188491344451904, "learning_rate": 7.544145106458465e-05, "loss": 2.7072, "step": 1655 }, { "epoch": 0.35, "grad_norm": 0.9991633892059326, "learning_rate": 7.529778966963259e-05, "loss": 2.6532, "step": 1660 }, { "epoch": 0.35, "grad_norm": 1.0799708366394043, "learning_rate": 7.515384702261496e-05, "loss": 2.6856, "step": 1665 }, { "epoch": 0.35, "grad_norm": 0.9435780644416809, "learning_rate": 7.500962472383627e-05, "loss": 2.6041, "step": 1670 }, { "epoch": 0.35, "grad_norm": 1.0160300731658936, "learning_rate": 7.486512437671011e-05, "loss": 2.7019, "step": 1675 }, { "epoch": 0.35, "grad_norm": 1.114219307899475, "learning_rate": 7.472034758774128e-05, "loss": 2.7862, "step": 1680 }, { "epoch": 0.35, "grad_norm": 0.9939204454421997, "learning_rate": 7.457529596650797e-05, "loss": 2.6545, "step": 1685 }, { "epoch": 0.35, "grad_norm": 0.975845456123352, "learning_rate": 7.442997112564392e-05, "loss": 2.7047, "step": 1690 }, { "epoch": 0.35, "grad_norm": 0.9775164723396301, "learning_rate": 7.428437468082037e-05, "loss": 2.7103, "step": 1695 }, { "epoch": 0.35, "grad_norm": 0.9839438796043396, "learning_rate": 7.413850825072817e-05, "loss": 2.7212, "step": 1700 }, { "epoch": 0.35, "grad_norm": 0.9818946123123169, "learning_rate": 7.39923734570598e-05, "loss": 2.71, "step": 1705 }, { "epoch": 0.36, "grad_norm": 0.9547445178031921, "learning_rate": 7.384597192449126e-05, "loss": 2.7184, "step": 1710 }, { "epoch": 0.36, "grad_norm": 1.0153919458389282, "learning_rate": 7.369930528066412e-05, "loss": 2.6053, "step": 1715 }, { "epoch": 0.36, "grad_norm": 0.9527406096458435, "learning_rate": 7.355237515616732e-05, "loss": 2.6794, "step": 1720 }, { "epoch": 0.36, "grad_norm": 0.9509900808334351, "learning_rate": 7.340518318451914e-05, "loss": 2.6821, "step": 1725 }, { "epoch": 0.36, "grad_norm": 1.0353055000305176, "learning_rate": 7.325773100214893e-05, "loss": 2.6917, "step": 1730 }, { "epoch": 0.36, "grad_norm": 0.9707476496696472, "learning_rate": 7.311002024837899e-05, "loss": 2.6346, "step": 1735 }, { "epoch": 0.36, "grad_norm": 0.999159574508667, "learning_rate": 7.296205256540633e-05, "loss": 2.6769, "step": 1740 }, { "epoch": 0.36, "grad_norm": 0.9343658089637756, "learning_rate": 7.281382959828443e-05, "loss": 2.6112, "step": 1745 }, { "epoch": 0.36, "grad_norm": 1.0174192190170288, "learning_rate": 7.26653529949049e-05, "loss": 2.7012, "step": 1750 }, { "epoch": 0.36, "grad_norm": 1.0045185089111328, "learning_rate": 7.25166244059792e-05, "loss": 2.6414, "step": 1755 }, { "epoch": 0.37, "grad_norm": 0.9818803668022156, "learning_rate": 7.236764548502029e-05, "loss": 2.6397, "step": 1760 }, { "epoch": 0.37, "grad_norm": 1.0778367519378662, "learning_rate": 7.221841788832421e-05, "loss": 2.635, "step": 1765 }, { "epoch": 0.37, "grad_norm": 1.0316153764724731, "learning_rate": 7.206894327495173e-05, "loss": 2.6314, "step": 1770 }, { "epoch": 0.37, "grad_norm": 0.9475739002227783, "learning_rate": 7.191922330670982e-05, "loss": 2.595, "step": 1775 }, { "epoch": 0.37, "grad_norm": 0.9840036034584045, "learning_rate": 7.176925964813326e-05, "loss": 2.6984, "step": 1780 }, { "epoch": 0.37, "grad_norm": 0.9781378507614136, "learning_rate": 7.161905396646607e-05, "loss": 2.6671, "step": 1785 }, { "epoch": 0.37, "grad_norm": 0.9534721970558167, "learning_rate": 7.146860793164299e-05, "loss": 2.6378, "step": 1790 }, { "epoch": 0.37, "grad_norm": 0.9708443880081177, "learning_rate": 7.131792321627098e-05, "loss": 2.6401, "step": 1795 }, { "epoch": 0.37, "grad_norm": 0.9724230766296387, "learning_rate": 7.116700149561048e-05, "loss": 2.6484, "step": 1800 }, { "epoch": 0.38, "grad_norm": 1.0079472064971924, "learning_rate": 7.101584444755696e-05, "loss": 2.6751, "step": 1805 }, { "epoch": 0.38, "grad_norm": 0.9262291789054871, "learning_rate": 7.086445375262212e-05, "loss": 2.5996, "step": 1810 }, { "epoch": 0.38, "grad_norm": 1.0142055749893188, "learning_rate": 7.071283109391528e-05, "loss": 2.7081, "step": 1815 }, { "epoch": 0.38, "grad_norm": 1.0082738399505615, "learning_rate": 7.056097815712466e-05, "loss": 2.6156, "step": 1820 }, { "epoch": 0.38, "grad_norm": 0.9950462579727173, "learning_rate": 7.040889663049862e-05, "loss": 2.6308, "step": 1825 }, { "epoch": 0.38, "grad_norm": 1.0031845569610596, "learning_rate": 7.025658820482693e-05, "loss": 2.6845, "step": 1830 }, { "epoch": 0.38, "grad_norm": 1.0086175203323364, "learning_rate": 7.010405457342192e-05, "loss": 2.6379, "step": 1835 }, { "epoch": 0.38, "grad_norm": 0.97476726770401, "learning_rate": 6.995129743209967e-05, "loss": 2.6356, "step": 1840 }, { "epoch": 0.38, "grad_norm": 0.9677056074142456, "learning_rate": 6.97983184791612e-05, "loss": 2.6214, "step": 1845 }, { "epoch": 0.38, "grad_norm": 1.002587914466858, "learning_rate": 6.964511941537355e-05, "loss": 2.6255, "step": 1850 }, { "epoch": 0.39, "grad_norm": 0.9613837003707886, "learning_rate": 6.949170194395083e-05, "loss": 2.6151, "step": 1855 }, { "epoch": 0.39, "grad_norm": 1.0320087671279907, "learning_rate": 6.933806777053536e-05, "loss": 2.6616, "step": 1860 }, { "epoch": 0.39, "grad_norm": 1.0032234191894531, "learning_rate": 6.918421860317872e-05, "loss": 2.6229, "step": 1865 }, { "epoch": 0.39, "grad_norm": 0.9621163606643677, "learning_rate": 6.903015615232263e-05, "loss": 2.6176, "step": 1870 }, { "epoch": 0.39, "grad_norm": 0.9764354825019836, "learning_rate": 6.887588213078012e-05, "loss": 2.6557, "step": 1875 }, { "epoch": 0.39, "grad_norm": 1.0155274868011475, "learning_rate": 6.87213982537163e-05, "loss": 2.6642, "step": 1880 }, { "epoch": 0.39, "grad_norm": 0.9893097281455994, "learning_rate": 6.856670623862943e-05, "loss": 2.6603, "step": 1885 }, { "epoch": 0.39, "grad_norm": 0.9779693484306335, "learning_rate": 6.841180780533179e-05, "loss": 2.6607, "step": 1890 }, { "epoch": 0.39, "grad_norm": 0.9833638072013855, "learning_rate": 6.82567046759305e-05, "loss": 2.6829, "step": 1895 }, { "epoch": 0.39, "grad_norm": 1.0706186294555664, "learning_rate": 6.810139857480844e-05, "loss": 2.6703, "step": 1900 }, { "epoch": 0.4, "grad_norm": 1.0078755617141724, "learning_rate": 6.794589122860509e-05, "loss": 2.6086, "step": 1905 }, { "epoch": 0.4, "grad_norm": 0.9727465510368347, "learning_rate": 6.779018436619725e-05, "loss": 2.6492, "step": 1910 }, { "epoch": 0.4, "grad_norm": 0.985817015171051, "learning_rate": 6.763427971867992e-05, "loss": 2.5795, "step": 1915 }, { "epoch": 0.4, "grad_norm": 1.0108627080917358, "learning_rate": 6.747817901934699e-05, "loss": 2.6732, "step": 1920 }, { "epoch": 0.4, "grad_norm": 0.9770353436470032, "learning_rate": 6.732188400367197e-05, "loss": 2.616, "step": 1925 }, { "epoch": 0.4, "grad_norm": 0.9645941853523254, "learning_rate": 6.716539640928871e-05, "loss": 2.6159, "step": 1930 }, { "epoch": 0.4, "grad_norm": 0.9376698136329651, "learning_rate": 6.70087179759721e-05, "loss": 2.6428, "step": 1935 }, { "epoch": 0.4, "grad_norm": 0.9465940594673157, "learning_rate": 6.685185044561874e-05, "loss": 2.6459, "step": 1940 }, { "epoch": 0.4, "grad_norm": 0.9306380748748779, "learning_rate": 6.669479556222747e-05, "loss": 2.633, "step": 1945 }, { "epoch": 0.41, "grad_norm": 0.9376333355903625, "learning_rate": 6.653755507188013e-05, "loss": 2.5994, "step": 1950 }, { "epoch": 0.41, "grad_norm": 0.9646631479263306, "learning_rate": 6.638013072272205e-05, "loss": 2.6609, "step": 1955 }, { "epoch": 0.41, "grad_norm": 0.9547147154808044, "learning_rate": 6.622252426494259e-05, "loss": 2.638, "step": 1960 }, { "epoch": 0.41, "grad_norm": 0.9426686763763428, "learning_rate": 6.606473745075581e-05, "loss": 2.5625, "step": 1965 }, { "epoch": 0.41, "grad_norm": 0.9773340821266174, "learning_rate": 6.590677203438084e-05, "loss": 2.6222, "step": 1970 }, { "epoch": 0.41, "grad_norm": 0.9521613121032715, "learning_rate": 6.574862977202252e-05, "loss": 2.6725, "step": 1975 }, { "epoch": 0.41, "grad_norm": 0.9960840940475464, "learning_rate": 6.559031242185174e-05, "loss": 2.629, "step": 1980 }, { "epoch": 0.41, "grad_norm": 0.9541398286819458, "learning_rate": 6.543182174398597e-05, "loss": 2.604, "step": 1985 }, { "epoch": 0.41, "grad_norm": 0.9481289386749268, "learning_rate": 6.52731595004697e-05, "loss": 2.5867, "step": 1990 }, { "epoch": 0.41, "grad_norm": 0.9873711466789246, "learning_rate": 6.51143274552548e-05, "loss": 2.6228, "step": 1995 }, { "epoch": 0.42, "grad_norm": 0.999569296836853, "learning_rate": 6.495532737418098e-05, "loss": 2.6684, "step": 2000 }, { "epoch": 0.42, "grad_norm": 1.019819736480713, "learning_rate": 6.479616102495605e-05, "loss": 2.571, "step": 2005 }, { "epoch": 0.42, "grad_norm": 0.9891926050186157, "learning_rate": 6.463683017713638e-05, "loss": 2.7003, "step": 2010 }, { "epoch": 0.42, "grad_norm": 0.9509793519973755, "learning_rate": 6.447733660210715e-05, "loss": 2.5304, "step": 2015 }, { "epoch": 0.42, "grad_norm": 12.407959938049316, "learning_rate": 6.431768207306272e-05, "loss": 2.6177, "step": 2020 }, { "epoch": 0.42, "grad_norm": 1.0190647840499878, "learning_rate": 6.415786836498684e-05, "loss": 2.5533, "step": 2025 }, { "epoch": 0.42, "grad_norm": 0.9944144487380981, "learning_rate": 6.399789725463298e-05, "loss": 2.5744, "step": 2030 }, { "epoch": 0.42, "grad_norm": 0.9825426340103149, "learning_rate": 6.383777052050458e-05, "loss": 2.6318, "step": 2035 }, { "epoch": 0.42, "grad_norm": 0.9628515839576721, "learning_rate": 6.367748994283518e-05, "loss": 2.6367, "step": 2040 }, { "epoch": 0.43, "grad_norm": 0.9694194197654724, "learning_rate": 6.351705730356877e-05, "loss": 2.5802, "step": 2045 }, { "epoch": 0.43, "grad_norm": 0.967638373374939, "learning_rate": 6.335647438633987e-05, "loss": 2.5357, "step": 2050 }, { "epoch": 0.43, "grad_norm": 0.9158982038497925, "learning_rate": 6.319574297645374e-05, "loss": 2.5934, "step": 2055 }, { "epoch": 0.43, "grad_norm": 0.9469335675239563, "learning_rate": 6.303486486086654e-05, "loss": 2.5442, "step": 2060 }, { "epoch": 0.43, "grad_norm": 0.9547492861747742, "learning_rate": 6.287384182816546e-05, "loss": 2.639, "step": 2065 }, { "epoch": 0.43, "grad_norm": 0.9499625563621521, "learning_rate": 6.271267566854883e-05, "loss": 2.5463, "step": 2070 }, { "epoch": 0.43, "grad_norm": 0.9230909943580627, "learning_rate": 6.255136817380618e-05, "loss": 2.5663, "step": 2075 }, { "epoch": 0.43, "grad_norm": 0.9761818647384644, "learning_rate": 6.23899211372984e-05, "loss": 2.5928, "step": 2080 }, { "epoch": 0.43, "grad_norm": 0.9458242654800415, "learning_rate": 6.222833635393772e-05, "loss": 2.5068, "step": 2085 }, { "epoch": 0.43, "grad_norm": 1.0186399221420288, "learning_rate": 6.206661562016782e-05, "loss": 2.5338, "step": 2090 }, { "epoch": 0.44, "grad_norm": 0.9645049571990967, "learning_rate": 6.190476073394382e-05, "loss": 2.5828, "step": 2095 }, { "epoch": 0.44, "grad_norm": 0.9071171283721924, "learning_rate": 6.17427734947123e-05, "loss": 2.5709, "step": 2100 }, { "epoch": 0.44, "grad_norm": 0.9457060694694519, "learning_rate": 6.158065570339127e-05, "loss": 2.569, "step": 2105 }, { "epoch": 0.44, "grad_norm": 1.0017690658569336, "learning_rate": 6.141840916235021e-05, "loss": 2.6378, "step": 2110 }, { "epoch": 0.44, "grad_norm": 0.9879325032234192, "learning_rate": 6.125603567539001e-05, "loss": 2.5915, "step": 2115 }, { "epoch": 0.44, "grad_norm": 0.9789354205131531, "learning_rate": 6.109353704772284e-05, "loss": 2.6959, "step": 2120 }, { "epoch": 0.44, "grad_norm": 0.9717521071434021, "learning_rate": 6.0930915085952164e-05, "loss": 2.6171, "step": 2125 }, { "epoch": 0.44, "grad_norm": 1.026318907737732, "learning_rate": 6.076817159805267e-05, "loss": 2.59, "step": 2130 }, { "epoch": 0.44, "grad_norm": 0.9704822301864624, "learning_rate": 6.06053083933501e-05, "loss": 2.6399, "step": 2135 }, { "epoch": 0.44, "grad_norm": 0.938707709312439, "learning_rate": 6.044232728250116e-05, "loss": 2.5744, "step": 2140 }, { "epoch": 0.45, "grad_norm": 0.9552039504051208, "learning_rate": 6.027923007747339e-05, "loss": 2.603, "step": 2145 }, { "epoch": 0.45, "grad_norm": 0.9553301334381104, "learning_rate": 6.011601859152506e-05, "loss": 2.6054, "step": 2150 }, { "epoch": 0.45, "grad_norm": 0.9410426616668701, "learning_rate": 5.995269463918495e-05, "loss": 2.6622, "step": 2155 }, { "epoch": 0.45, "grad_norm": 0.929993212223053, "learning_rate": 5.97892600362322e-05, "loss": 2.6446, "step": 2160 }, { "epoch": 0.45, "grad_norm": 0.9774343967437744, "learning_rate": 5.962571659967614e-05, "loss": 2.6214, "step": 2165 }, { "epoch": 0.45, "grad_norm": 0.9398266673088074, "learning_rate": 5.946206614773606e-05, "loss": 2.5305, "step": 2170 }, { "epoch": 0.45, "grad_norm": 1.005797266960144, "learning_rate": 5.929831049982103e-05, "loss": 2.5693, "step": 2175 }, { "epoch": 0.45, "grad_norm": 0.9824793338775635, "learning_rate": 5.9134451476509633e-05, "loss": 2.6274, "step": 2180 }, { "epoch": 0.45, "grad_norm": 0.9403955936431885, "learning_rate": 5.897049089952974e-05, "loss": 2.5396, "step": 2185 }, { "epoch": 0.46, "grad_norm": 0.966221034526825, "learning_rate": 5.880643059173826e-05, "loss": 2.555, "step": 2190 }, { "epoch": 0.46, "grad_norm": 0.9708110690116882, "learning_rate": 5.864227237710093e-05, "loss": 2.5647, "step": 2195 }, { "epoch": 0.46, "grad_norm": 0.9578514695167542, "learning_rate": 5.847801808067189e-05, "loss": 2.6166, "step": 2200 }, { "epoch": 0.46, "grad_norm": 0.9706119298934937, "learning_rate": 5.831366952857357e-05, "loss": 2.6013, "step": 2205 }, { "epoch": 0.46, "grad_norm": 0.961341917514801, "learning_rate": 5.814922854797622e-05, "loss": 2.6273, "step": 2210 }, { "epoch": 0.46, "grad_norm": 0.9282676577568054, "learning_rate": 5.798469696707775e-05, "loss": 2.5886, "step": 2215 }, { "epoch": 0.46, "grad_norm": 1.1473058462142944, "learning_rate": 5.782007661508331e-05, "loss": 2.5553, "step": 2220 }, { "epoch": 0.46, "grad_norm": 0.9114869832992554, "learning_rate": 5.765536932218495e-05, "loss": 2.53, "step": 2225 }, { "epoch": 0.46, "grad_norm": 0.9566113352775574, "learning_rate": 5.7490576919541315e-05, "loss": 2.5858, "step": 2230 }, { "epoch": 0.46, "grad_norm": 0.9558666944503784, "learning_rate": 5.732570123925729e-05, "loss": 2.5127, "step": 2235 }, { "epoch": 0.47, "grad_norm": 0.9059987664222717, "learning_rate": 5.7160744114363593e-05, "loss": 2.5475, "step": 2240 }, { "epoch": 0.47, "grad_norm": 0.9068989157676697, "learning_rate": 5.699570737879641e-05, "loss": 2.5572, "step": 2245 }, { "epoch": 0.47, "grad_norm": 0.9678176045417786, "learning_rate": 5.683059286737702e-05, "loss": 2.5725, "step": 2250 }, { "epoch": 0.47, "grad_norm": 0.9395660161972046, "learning_rate": 5.666540241579139e-05, "loss": 2.5859, "step": 2255 }, { "epoch": 0.47, "grad_norm": 0.9277108311653137, "learning_rate": 5.6500137860569766e-05, "loss": 2.541, "step": 2260 }, { "epoch": 0.47, "grad_norm": 0.9472150206565857, "learning_rate": 5.633480103906624e-05, "loss": 2.6044, "step": 2265 }, { "epoch": 0.47, "grad_norm": 0.9881107807159424, "learning_rate": 5.616939378943834e-05, "loss": 2.5783, "step": 2270 }, { "epoch": 0.47, "grad_norm": 0.9077834486961365, "learning_rate": 5.6003917950626595e-05, "loss": 2.5131, "step": 2275 }, { "epoch": 0.47, "grad_norm": 0.9303057193756104, "learning_rate": 5.583837536233407e-05, "loss": 2.5234, "step": 2280 }, { "epoch": 0.47, "grad_norm": 0.9464006423950195, "learning_rate": 5.567276786500596e-05, "loss": 2.6048, "step": 2285 }, { "epoch": 0.48, "grad_norm": 0.9303953051567078, "learning_rate": 5.5507097299809054e-05, "loss": 2.6002, "step": 2290 }, { "epoch": 0.48, "grad_norm": 0.9646936655044556, "learning_rate": 5.534136550861133e-05, "loss": 2.5923, "step": 2295 }, { "epoch": 0.48, "grad_norm": 0.9685410261154175, "learning_rate": 5.5175574333961465e-05, "loss": 2.5573, "step": 2300 }, { "epoch": 0.48, "grad_norm": 0.9414950609207153, "learning_rate": 5.500972561906832e-05, "loss": 2.6158, "step": 2305 }, { "epoch": 0.48, "grad_norm": 0.9068962931632996, "learning_rate": 5.484382120778048e-05, "loss": 2.6208, "step": 2310 }, { "epoch": 0.48, "grad_norm": 0.9170411229133606, "learning_rate": 5.467786294456575e-05, "loss": 2.523, "step": 2315 }, { "epoch": 0.48, "grad_norm": 0.9522758722305298, "learning_rate": 5.451185267449061e-05, "loss": 2.5167, "step": 2320 }, { "epoch": 0.48, "grad_norm": 0.9827879071235657, "learning_rate": 5.43457922431998e-05, "loss": 2.6251, "step": 2325 }, { "epoch": 0.48, "grad_norm": 0.9485974907875061, "learning_rate": 5.417968349689566e-05, "loss": 2.5533, "step": 2330 }, { "epoch": 0.49, "grad_norm": 0.9093174338340759, "learning_rate": 5.401352828231772e-05, "loss": 2.5801, "step": 2335 }, { "epoch": 0.49, "grad_norm": 0.9550360441207886, "learning_rate": 5.384732844672211e-05, "loss": 2.5791, "step": 2340 }, { "epoch": 0.49, "grad_norm": 1.01358962059021, "learning_rate": 5.368108583786107e-05, "loss": 2.5222, "step": 2345 }, { "epoch": 0.49, "grad_norm": 0.9926130175590515, "learning_rate": 5.3514802303962344e-05, "loss": 2.5862, "step": 2350 }, { "epoch": 0.49, "grad_norm": 0.913351833820343, "learning_rate": 5.334847969370868e-05, "loss": 2.4312, "step": 2355 }, { "epoch": 0.49, "grad_norm": 0.9434849619865417, "learning_rate": 5.3182119856217284e-05, "loss": 2.6087, "step": 2360 }, { "epoch": 0.49, "grad_norm": 0.9408784508705139, "learning_rate": 5.3015724641019214e-05, "loss": 2.5432, "step": 2365 }, { "epoch": 0.49, "grad_norm": 0.9512544870376587, "learning_rate": 5.284929589803884e-05, "loss": 2.5393, "step": 2370 }, { "epoch": 0.49, "grad_norm": 0.9654476642608643, "learning_rate": 5.2682835477573336e-05, "loss": 2.5588, "step": 2375 }, { "epoch": 0.49, "grad_norm": 0.9294528961181641, "learning_rate": 5.2516345230271965e-05, "loss": 2.5511, "step": 2380 }, { "epoch": 0.5, "grad_norm": 0.9224638342857361, "learning_rate": 5.234982700711569e-05, "loss": 2.5216, "step": 2385 }, { "epoch": 0.5, "grad_norm": 0.9053465127944946, "learning_rate": 5.218328265939643e-05, "loss": 2.5536, "step": 2390 }, { "epoch": 0.5, "grad_norm": 0.9090244174003601, "learning_rate": 5.201671403869657e-05, "loss": 2.5365, "step": 2395 }, { "epoch": 0.5, "grad_norm": 0.9613234400749207, "learning_rate": 5.1850122996868366e-05, "loss": 2.572, "step": 2400 }, { "epoch": 0.5, "grad_norm": 0.9468548893928528, "learning_rate": 5.168351138601334e-05, "loss": 2.509, "step": 2405 }, { "epoch": 0.5, "grad_norm": 1.098836898803711, "learning_rate": 5.1516881058461675e-05, "loss": 2.5212, "step": 2410 }, { "epoch": 0.5, "grad_norm": 0.9981148838996887, "learning_rate": 5.135023386675166e-05, "loss": 2.5296, "step": 2415 }, { "epoch": 0.5, "grad_norm": 0.9370298385620117, "learning_rate": 5.118357166360906e-05, "loss": 2.565, "step": 2420 }, { "epoch": 0.5, "grad_norm": 0.9485419988632202, "learning_rate": 5.101689630192655e-05, "loss": 2.4705, "step": 2425 }, { "epoch": 0.51, "grad_norm": 0.9375891089439392, "learning_rate": 5.085020963474307e-05, "loss": 2.4834, "step": 2430 }, { "epoch": 0.51, "grad_norm": 0.9355273246765137, "learning_rate": 5.068351351522329e-05, "loss": 2.6003, "step": 2435 }, { "epoch": 0.51, "grad_norm": 0.9307159781455994, "learning_rate": 5.0516809796636935e-05, "loss": 2.4923, "step": 2440 }, { "epoch": 0.51, "grad_norm": 0.951213002204895, "learning_rate": 5.035010033233821e-05, "loss": 2.5471, "step": 2445 }, { "epoch": 0.51, "grad_norm": 0.8841201663017273, "learning_rate": 5.018338697574523e-05, "loss": 2.4939, "step": 2450 }, { "epoch": 0.51, "grad_norm": 0.9634934067726135, "learning_rate": 5.0016671580319354e-05, "loss": 2.5294, "step": 2455 }, { "epoch": 0.51, "grad_norm": 0.9498656988143921, "learning_rate": 4.984995599954461e-05, "loss": 2.5306, "step": 2460 }, { "epoch": 0.51, "grad_norm": 0.9849341511726379, "learning_rate": 4.968324208690712e-05, "loss": 2.5498, "step": 2465 }, { "epoch": 0.51, "grad_norm": 0.9472889304161072, "learning_rate": 4.951653169587441e-05, "loss": 2.5471, "step": 2470 }, { "epoch": 0.51, "grad_norm": 0.919108510017395, "learning_rate": 4.93498266798749e-05, "loss": 2.5413, "step": 2475 }, { "epoch": 0.52, "grad_norm": 0.9519367814064026, "learning_rate": 4.918312889227722e-05, "loss": 2.5299, "step": 2480 }, { "epoch": 0.52, "grad_norm": 0.9095039367675781, "learning_rate": 4.901644018636966e-05, "loss": 2.5499, "step": 2485 }, { "epoch": 0.52, "grad_norm": 0.9608018398284912, "learning_rate": 4.8849762415339526e-05, "loss": 2.4624, "step": 2490 }, { "epoch": 0.52, "grad_norm": 0.9327159523963928, "learning_rate": 4.868309743225256e-05, "loss": 2.5808, "step": 2495 }, { "epoch": 0.52, "grad_norm": 0.9636819362640381, "learning_rate": 4.851644709003233e-05, "loss": 2.5551, "step": 2500 }, { "epoch": 0.52, "grad_norm": 0.9298324584960938, "learning_rate": 4.834981324143964e-05, "loss": 2.4952, "step": 2505 }, { "epoch": 0.52, "grad_norm": 0.9329404830932617, "learning_rate": 4.818319773905191e-05, "loss": 2.5115, "step": 2510 }, { "epoch": 0.52, "grad_norm": 0.9319917559623718, "learning_rate": 4.801660243524261e-05, "loss": 2.5493, "step": 2515 }, { "epoch": 0.52, "grad_norm": 0.9572451114654541, "learning_rate": 4.7850029182160626e-05, "loss": 2.5974, "step": 2520 }, { "epoch": 0.52, "grad_norm": 0.938522219657898, "learning_rate": 4.768347983170973e-05, "loss": 2.5079, "step": 2525 }, { "epoch": 0.53, "grad_norm": 0.9182351231575012, "learning_rate": 4.7516956235527884e-05, "loss": 2.4939, "step": 2530 }, { "epoch": 0.53, "grad_norm": 0.9615866541862488, "learning_rate": 4.735046024496682e-05, "loss": 2.5146, "step": 2535 }, { "epoch": 0.53, "grad_norm": 0.9523360729217529, "learning_rate": 4.7183993711071286e-05, "loss": 2.4768, "step": 2540 }, { "epoch": 0.53, "grad_norm": 0.9364380240440369, "learning_rate": 4.7017558484558554e-05, "loss": 2.55, "step": 2545 }, { "epoch": 0.53, "grad_norm": 0.9449701905250549, "learning_rate": 4.6851156415797844e-05, "loss": 2.4937, "step": 2550 }, { "epoch": 0.53, "grad_norm": 0.9214823246002197, "learning_rate": 4.6684789354789746e-05, "loss": 2.5316, "step": 2555 }, { "epoch": 0.53, "grad_norm": 0.8947305679321289, "learning_rate": 4.651845915114563e-05, "loss": 2.5681, "step": 2560 }, { "epoch": 0.53, "grad_norm": 0.9033908843994141, "learning_rate": 4.6352167654067095e-05, "loss": 2.5183, "step": 2565 }, { "epoch": 0.53, "grad_norm": 1.0538268089294434, "learning_rate": 4.618591671232544e-05, "loss": 2.5556, "step": 2570 }, { "epoch": 0.54, "grad_norm": 0.9056328535079956, "learning_rate": 4.601970817424106e-05, "loss": 2.5046, "step": 2575 }, { "epoch": 0.54, "grad_norm": 1.1014292240142822, "learning_rate": 4.585354388766292e-05, "loss": 2.5109, "step": 2580 }, { "epoch": 0.54, "grad_norm": 0.8942855596542358, "learning_rate": 4.568742569994802e-05, "loss": 2.4971, "step": 2585 }, { "epoch": 0.54, "grad_norm": 0.8742932677268982, "learning_rate": 4.552135545794086e-05, "loss": 2.5116, "step": 2590 }, { "epoch": 0.54, "grad_norm": 0.9332443475723267, "learning_rate": 4.535533500795288e-05, "loss": 2.511, "step": 2595 }, { "epoch": 0.54, "grad_norm": 0.9206485152244568, "learning_rate": 4.5189366195741953e-05, "loss": 2.5107, "step": 2600 }, { "epoch": 0.54, "grad_norm": 0.9191185832023621, "learning_rate": 4.502345086649186e-05, "loss": 2.4824, "step": 2605 }, { "epoch": 0.54, "grad_norm": 0.9247689247131348, "learning_rate": 4.485759086479179e-05, "loss": 2.4865, "step": 2610 }, { "epoch": 0.54, "grad_norm": 0.9270159006118774, "learning_rate": 4.469178803461579e-05, "loss": 2.5851, "step": 2615 }, { "epoch": 0.54, "grad_norm": 0.9007619023323059, "learning_rate": 4.4526044219302326e-05, "loss": 2.4677, "step": 2620 }, { "epoch": 0.55, "grad_norm": 0.9385355114936829, "learning_rate": 4.4360361261533745e-05, "loss": 2.4913, "step": 2625 }, { "epoch": 0.55, "grad_norm": 0.936024010181427, "learning_rate": 4.419474100331579e-05, "loss": 2.5147, "step": 2630 }, { "epoch": 0.55, "grad_norm": 0.9851381778717041, "learning_rate": 4.402918528595715e-05, "loss": 2.5625, "step": 2635 }, { "epoch": 0.55, "grad_norm": 0.9795789122581482, "learning_rate": 4.386369595004896e-05, "loss": 2.5276, "step": 2640 }, { "epoch": 0.55, "grad_norm": 0.9688675403594971, "learning_rate": 4.3698274835444354e-05, "loss": 2.5187, "step": 2645 }, { "epoch": 0.55, "grad_norm": 0.9564111828804016, "learning_rate": 4.3532923781238e-05, "loss": 2.5103, "step": 2650 }, { "epoch": 0.55, "grad_norm": 0.9104586243629456, "learning_rate": 4.336764462574566e-05, "loss": 2.5078, "step": 2655 }, { "epoch": 0.55, "grad_norm": 0.9217368364334106, "learning_rate": 4.320243920648376e-05, "loss": 2.5819, "step": 2660 }, { "epoch": 0.55, "grad_norm": 0.8980958461761475, "learning_rate": 4.303730936014894e-05, "loss": 2.5107, "step": 2665 }, { "epoch": 0.55, "grad_norm": 0.9054094552993774, "learning_rate": 4.287225692259765e-05, "loss": 2.4817, "step": 2670 }, { "epoch": 0.56, "grad_norm": 0.9648101925849915, "learning_rate": 4.270728372882575e-05, "loss": 2.4727, "step": 2675 }, { "epoch": 0.56, "grad_norm": 1.0163148641586304, "learning_rate": 4.254239161294804e-05, "loss": 2.5593, "step": 2680 }, { "epoch": 0.56, "grad_norm": 0.9777525067329407, "learning_rate": 4.237758240817802e-05, "loss": 2.5291, "step": 2685 }, { "epoch": 0.56, "grad_norm": 0.9313981533050537, "learning_rate": 4.2212857946807336e-05, "loss": 2.5546, "step": 2690 }, { "epoch": 0.56, "grad_norm": 0.9364113807678223, "learning_rate": 4.2048220060185516e-05, "loss": 2.4344, "step": 2695 }, { "epoch": 0.56, "grad_norm": 0.912443995475769, "learning_rate": 4.188367057869957e-05, "loss": 2.5345, "step": 2700 }, { "epoch": 0.56, "grad_norm": 0.9088121652603149, "learning_rate": 4.171921133175365e-05, "loss": 2.5508, "step": 2705 }, { "epoch": 0.56, "grad_norm": 0.9182174801826477, "learning_rate": 4.155484414774872e-05, "loss": 2.4666, "step": 2710 }, { "epoch": 0.56, "grad_norm": 0.9237582087516785, "learning_rate": 4.139057085406221e-05, "loss": 2.52, "step": 2715 }, { "epoch": 0.57, "grad_norm": 0.9367585778236389, "learning_rate": 4.1226393277027726e-05, "loss": 2.5637, "step": 2720 }, { "epoch": 0.57, "grad_norm": 0.9298025965690613, "learning_rate": 4.106231324191471e-05, "loss": 2.5733, "step": 2725 }, { "epoch": 0.57, "grad_norm": 0.921440064907074, "learning_rate": 4.089833257290817e-05, "loss": 2.509, "step": 2730 }, { "epoch": 0.57, "grad_norm": 0.9271067380905151, "learning_rate": 4.073445309308842e-05, "loss": 2.5282, "step": 2735 }, { "epoch": 0.57, "grad_norm": 0.9048083424568176, "learning_rate": 4.0570676624410756e-05, "loss": 2.5263, "step": 2740 }, { "epoch": 0.57, "grad_norm": 0.9049689173698425, "learning_rate": 4.040700498768525e-05, "loss": 2.5656, "step": 2745 }, { "epoch": 0.57, "grad_norm": 0.9169425368309021, "learning_rate": 4.024344000255648e-05, "loss": 2.5417, "step": 2750 }, { "epoch": 0.57, "grad_norm": 0.9237762093544006, "learning_rate": 4.0079983487483313e-05, "loss": 2.4893, "step": 2755 }, { "epoch": 0.57, "grad_norm": 0.9239086508750916, "learning_rate": 3.9916637259718683e-05, "loss": 2.5609, "step": 2760 }, { "epoch": 0.57, "grad_norm": 0.9628281593322754, "learning_rate": 3.9753403135289396e-05, "loss": 2.5858, "step": 2765 }, { "epoch": 0.58, "grad_norm": 0.9475969672203064, "learning_rate": 3.9590282928975914e-05, "loss": 2.5439, "step": 2770 }, { "epoch": 0.58, "grad_norm": 0.9971244931221008, "learning_rate": 3.942727845429221e-05, "loss": 2.5218, "step": 2775 }, { "epoch": 0.58, "grad_norm": 0.9252620935440063, "learning_rate": 3.926439152346558e-05, "loss": 2.5145, "step": 2780 }, { "epoch": 0.58, "grad_norm": 0.9607737064361572, "learning_rate": 3.910162394741653e-05, "loss": 2.4541, "step": 2785 }, { "epoch": 0.58, "grad_norm": 0.9223071932792664, "learning_rate": 3.893897753573861e-05, "loss": 2.4738, "step": 2790 }, { "epoch": 0.58, "grad_norm": 0.9555774927139282, "learning_rate": 3.877645409667829e-05, "loss": 2.431, "step": 2795 }, { "epoch": 0.58, "grad_norm": 0.9491199254989624, "learning_rate": 3.861405543711491e-05, "loss": 2.4294, "step": 2800 }, { "epoch": 0.58, "grad_norm": 0.9491716623306274, "learning_rate": 3.8451783362540507e-05, "loss": 2.4642, "step": 2805 }, { "epoch": 0.58, "grad_norm": 0.9213913679122925, "learning_rate": 3.828963967703983e-05, "loss": 2.4871, "step": 2810 }, { "epoch": 0.59, "grad_norm": 0.9365127682685852, "learning_rate": 3.8127626183270223e-05, "loss": 2.4805, "step": 2815 }, { "epoch": 0.59, "grad_norm": 0.908562958240509, "learning_rate": 3.796574468244161e-05, "loss": 2.4572, "step": 2820 }, { "epoch": 0.59, "grad_norm": 0.9270069599151611, "learning_rate": 3.7803996974296444e-05, "loss": 2.5138, "step": 2825 }, { "epoch": 0.59, "grad_norm": 0.9667962193489075, "learning_rate": 3.7642384857089776e-05, "loss": 2.5384, "step": 2830 }, { "epoch": 0.59, "grad_norm": 0.9173279404640198, "learning_rate": 3.748091012756915e-05, "loss": 2.5528, "step": 2835 }, { "epoch": 0.59, "grad_norm": 0.9413408041000366, "learning_rate": 3.731957458095467e-05, "loss": 2.4776, "step": 2840 }, { "epoch": 0.59, "grad_norm": 0.9140501618385315, "learning_rate": 3.71583800109191e-05, "loss": 2.5965, "step": 2845 }, { "epoch": 0.59, "grad_norm": 0.9697785377502441, "learning_rate": 3.699732820956784e-05, "loss": 2.4439, "step": 2850 }, { "epoch": 0.59, "grad_norm": 0.9393496513366699, "learning_rate": 3.6836420967419057e-05, "loss": 2.4883, "step": 2855 }, { "epoch": 0.59, "grad_norm": 0.9118742942810059, "learning_rate": 3.6675660073383745e-05, "loss": 2.5305, "step": 2860 }, { "epoch": 0.6, "grad_norm": 0.8981103301048279, "learning_rate": 3.6515047314745856e-05, "loss": 2.4762, "step": 2865 }, { "epoch": 0.6, "grad_norm": 0.9413728713989258, "learning_rate": 3.6354584477142437e-05, "loss": 2.5435, "step": 2870 }, { "epoch": 0.6, "grad_norm": 0.9207038879394531, "learning_rate": 3.6194273344543736e-05, "loss": 2.5114, "step": 2875 }, { "epoch": 0.6, "grad_norm": 0.9038093090057373, "learning_rate": 3.6034115699233425e-05, "loss": 2.4749, "step": 2880 }, { "epoch": 0.6, "grad_norm": 0.9292318224906921, "learning_rate": 3.5874113321788736e-05, "loss": 2.4526, "step": 2885 }, { "epoch": 0.6, "grad_norm": 0.8991361856460571, "learning_rate": 3.571426799106071e-05, "loss": 2.4961, "step": 2890 }, { "epoch": 0.6, "grad_norm": 0.9043452739715576, "learning_rate": 3.555458148415437e-05, "loss": 2.468, "step": 2895 }, { "epoch": 0.6, "grad_norm": 0.8990312218666077, "learning_rate": 3.539505557640901e-05, "loss": 2.4527, "step": 2900 }, { "epoch": 0.6, "grad_norm": 0.9166343808174133, "learning_rate": 3.523569204137843e-05, "loss": 2.5101, "step": 2905 }, { "epoch": 0.6, "grad_norm": 0.9148818850517273, "learning_rate": 3.5076492650811246e-05, "loss": 2.4654, "step": 2910 }, { "epoch": 0.61, "grad_norm": 0.8907680511474609, "learning_rate": 3.491745917463113e-05, "loss": 2.513, "step": 2915 }, { "epoch": 0.61, "grad_norm": 0.8982478380203247, "learning_rate": 3.475859338091721e-05, "loss": 2.4826, "step": 2920 }, { "epoch": 0.61, "grad_norm": 0.9068254828453064, "learning_rate": 3.4599897035884374e-05, "loss": 2.4805, "step": 2925 }, { "epoch": 0.61, "grad_norm": 0.9060400128364563, "learning_rate": 3.444137190386363e-05, "loss": 2.4454, "step": 2930 }, { "epoch": 0.61, "grad_norm": 0.9024885296821594, "learning_rate": 3.4283019747282514e-05, "loss": 2.4804, "step": 2935 }, { "epoch": 0.61, "grad_norm": 0.8852449655532837, "learning_rate": 3.412484232664545e-05, "loss": 2.4749, "step": 2940 }, { "epoch": 0.61, "grad_norm": 0.9173283576965332, "learning_rate": 3.396684140051424e-05, "loss": 2.4272, "step": 2945 }, { "epoch": 0.61, "grad_norm": 0.8929095268249512, "learning_rate": 3.3809018725488466e-05, "loss": 2.4746, "step": 2950 }, { "epoch": 0.61, "grad_norm": 0.9127289652824402, "learning_rate": 3.365137605618598e-05, "loss": 2.4833, "step": 2955 }, { "epoch": 0.62, "grad_norm": 0.9289007782936096, "learning_rate": 3.3493915145223395e-05, "loss": 2.4848, "step": 2960 }, { "epoch": 0.62, "grad_norm": 1.0071487426757812, "learning_rate": 3.3336637743196584e-05, "loss": 2.4652, "step": 2965 }, { "epoch": 0.62, "grad_norm": 0.938108503818512, "learning_rate": 3.317954559866126e-05, "loss": 2.4782, "step": 2970 }, { "epoch": 0.62, "grad_norm": 0.9593190550804138, "learning_rate": 3.302264045811344e-05, "loss": 2.4936, "step": 2975 }, { "epoch": 0.62, "grad_norm": 0.8893548846244812, "learning_rate": 3.286592406597021e-05, "loss": 2.4851, "step": 2980 }, { "epoch": 0.62, "grad_norm": 0.8929252624511719, "learning_rate": 3.270939816455012e-05, "loss": 2.4784, "step": 2985 }, { "epoch": 0.62, "grad_norm": 0.9195278286933899, "learning_rate": 3.255306449405395e-05, "loss": 2.5345, "step": 2990 }, { "epoch": 0.62, "grad_norm": 0.9475163221359253, "learning_rate": 3.2396924792545304e-05, "loss": 2.4952, "step": 2995 }, { "epoch": 0.62, "grad_norm": 0.9016882181167603, "learning_rate": 3.224098079593132e-05, "loss": 2.4655, "step": 3000 }, { "epoch": 0.62, "grad_norm": 0.9258350729942322, "learning_rate": 3.2085234237943354e-05, "loss": 2.4314, "step": 3005 }, { "epoch": 0.63, "grad_norm": 0.9156048893928528, "learning_rate": 3.19296868501177e-05, "loss": 2.5102, "step": 3010 }, { "epoch": 0.63, "grad_norm": 0.9461023807525635, "learning_rate": 3.177434036177636e-05, "loss": 2.4557, "step": 3015 }, { "epoch": 0.63, "grad_norm": 0.9364247918128967, "learning_rate": 3.1619196500007804e-05, "loss": 2.5106, "step": 3020 }, { "epoch": 0.63, "grad_norm": 0.9256249070167542, "learning_rate": 3.146425698964776e-05, "loss": 2.4673, "step": 3025 }, { "epoch": 0.63, "grad_norm": 0.9104235768318176, "learning_rate": 3.1309523553260046e-05, "loss": 2.5164, "step": 3030 }, { "epoch": 0.63, "grad_norm": 0.947726309299469, "learning_rate": 3.115499791111743e-05, "loss": 2.5692, "step": 3035 }, { "epoch": 0.63, "grad_norm": 0.9046145677566528, "learning_rate": 3.10006817811825e-05, "loss": 2.4769, "step": 3040 }, { "epoch": 0.63, "grad_norm": 0.9253242015838623, "learning_rate": 3.084657687908855e-05, "loss": 2.4542, "step": 3045 }, { "epoch": 0.63, "grad_norm": 0.9397364854812622, "learning_rate": 3.069268491812052e-05, "loss": 2.4806, "step": 3050 }, { "epoch": 0.63, "grad_norm": 0.9234438538551331, "learning_rate": 3.0539007609195934e-05, "loss": 2.5438, "step": 3055 }, { "epoch": 0.64, "grad_norm": 0.9175778031349182, "learning_rate": 3.0385546660845908e-05, "loss": 2.467, "step": 3060 }, { "epoch": 0.64, "grad_norm": 0.9659436941146851, "learning_rate": 3.0232303779196132e-05, "loss": 2.5125, "step": 3065 }, { "epoch": 0.64, "grad_norm": 0.9433670043945312, "learning_rate": 3.0079280667947885e-05, "loss": 2.4722, "step": 3070 }, { "epoch": 0.64, "grad_norm": 0.9202114343643188, "learning_rate": 2.9926479028359132e-05, "loss": 2.489, "step": 3075 }, { "epoch": 0.64, "grad_norm": 0.9274362325668335, "learning_rate": 2.97739005592256e-05, "loss": 2.4452, "step": 3080 }, { "epoch": 0.64, "grad_norm": 0.9218001961708069, "learning_rate": 2.962154695686187e-05, "loss": 2.4078, "step": 3085 }, { "epoch": 0.64, "grad_norm": 0.9582597017288208, "learning_rate": 2.9469419915082536e-05, "loss": 2.4814, "step": 3090 }, { "epoch": 0.64, "grad_norm": 0.9216330051422119, "learning_rate": 2.9317521125183368e-05, "loss": 2.4246, "step": 3095 }, { "epoch": 0.64, "grad_norm": 0.92444908618927, "learning_rate": 2.9165852275922524e-05, "loss": 2.513, "step": 3100 }, { "epoch": 0.65, "grad_norm": 0.9081312417984009, "learning_rate": 2.901441505350174e-05, "loss": 2.4436, "step": 3105 }, { "epoch": 0.65, "grad_norm": 0.9419301748275757, "learning_rate": 2.886321114154762e-05, "loss": 2.529, "step": 3110 }, { "epoch": 0.65, "grad_norm": 0.920991837978363, "learning_rate": 2.87122422210929e-05, "loss": 2.4549, "step": 3115 }, { "epoch": 0.65, "grad_norm": 0.9346429109573364, "learning_rate": 2.8561509970557736e-05, "loss": 2.471, "step": 3120 }, { "epoch": 0.65, "grad_norm": 0.9041022658348083, "learning_rate": 2.8411016065731146e-05, "loss": 2.4544, "step": 3125 }, { "epoch": 0.65, "grad_norm": 0.8967710137367249, "learning_rate": 2.826076217975222e-05, "loss": 2.4794, "step": 3130 }, { "epoch": 0.65, "grad_norm": 0.9126041531562805, "learning_rate": 2.8110749983091632e-05, "loss": 2.4372, "step": 3135 }, { "epoch": 0.65, "grad_norm": 0.9003017544746399, "learning_rate": 2.7960981143533053e-05, "loss": 2.4927, "step": 3140 }, { "epoch": 0.65, "grad_norm": 0.8914475440979004, "learning_rate": 2.781145732615457e-05, "loss": 2.4697, "step": 3145 }, { "epoch": 0.65, "grad_norm": 0.936227023601532, "learning_rate": 2.7662180193310218e-05, "loss": 2.461, "step": 3150 }, { "epoch": 0.66, "grad_norm": 0.8699093461036682, "learning_rate": 2.751315140461145e-05, "loss": 2.465, "step": 3155 }, { "epoch": 0.66, "grad_norm": 0.9162988662719727, "learning_rate": 2.7364372616908744e-05, "loss": 2.5081, "step": 3160 }, { "epoch": 0.66, "grad_norm": 0.9147549271583557, "learning_rate": 2.7215845484273152e-05, "loss": 2.4671, "step": 3165 }, { "epoch": 0.66, "grad_norm": 0.9275458455085754, "learning_rate": 2.7067571657977893e-05, "loss": 2.5228, "step": 3170 }, { "epoch": 0.66, "grad_norm": 0.9181550741195679, "learning_rate": 2.691955278648003e-05, "loss": 2.438, "step": 3175 }, { "epoch": 0.66, "grad_norm": 0.9136456847190857, "learning_rate": 2.6771790515402112e-05, "loss": 2.4914, "step": 3180 }, { "epoch": 0.66, "grad_norm": 0.9101700186729431, "learning_rate": 2.6624286487513916e-05, "loss": 2.4316, "step": 3185 }, { "epoch": 0.66, "grad_norm": 0.9077585339546204, "learning_rate": 2.6477042342714137e-05, "loss": 2.4393, "step": 3190 }, { "epoch": 0.66, "grad_norm": 0.911003053188324, "learning_rate": 2.633005971801219e-05, "loss": 2.5048, "step": 3195 }, { "epoch": 0.67, "grad_norm": 0.9087389707565308, "learning_rate": 2.6183340247510013e-05, "loss": 2.4693, "step": 3200 }, { "epoch": 0.67, "grad_norm": 0.9216850399971008, "learning_rate": 2.6036885562383856e-05, "loss": 2.5016, "step": 3205 }, { "epoch": 0.67, "grad_norm": 0.9231410026550293, "learning_rate": 2.5890697290866206e-05, "loss": 2.445, "step": 3210 }, { "epoch": 0.67, "grad_norm": 0.9174930453300476, "learning_rate": 2.5744777058227642e-05, "loss": 2.4903, "step": 3215 }, { "epoch": 0.67, "grad_norm": 0.9033377766609192, "learning_rate": 2.5599126486758777e-05, "loss": 2.4601, "step": 3220 }, { "epoch": 0.67, "grad_norm": 0.9198935031890869, "learning_rate": 2.5453747195752243e-05, "loss": 2.459, "step": 3225 }, { "epoch": 0.67, "grad_norm": 0.8865978717803955, "learning_rate": 2.530864080148464e-05, "loss": 2.4891, "step": 3230 }, { "epoch": 0.67, "grad_norm": 0.8864363431930542, "learning_rate": 2.5163808917198615e-05, "loss": 2.4203, "step": 3235 }, { "epoch": 0.67, "grad_norm": 0.9178940653800964, "learning_rate": 2.501925315308492e-05, "loss": 2.4455, "step": 3240 }, { "epoch": 0.67, "grad_norm": 0.9083361625671387, "learning_rate": 2.4874975116264477e-05, "loss": 2.372, "step": 3245 }, { "epoch": 0.68, "grad_norm": 0.8891680836677551, "learning_rate": 2.4730976410770534e-05, "loss": 2.4615, "step": 3250 }, { "epoch": 0.68, "grad_norm": 0.9084404706954956, "learning_rate": 2.458725863753084e-05, "loss": 2.5105, "step": 3255 }, { "epoch": 0.68, "grad_norm": 0.911205530166626, "learning_rate": 2.4443823394349834e-05, "loss": 2.4586, "step": 3260 }, { "epoch": 0.68, "grad_norm": 0.9256761074066162, "learning_rate": 2.430067227589088e-05, "loss": 2.4955, "step": 3265 }, { "epoch": 0.68, "grad_norm": 0.9415754675865173, "learning_rate": 2.4157806873658517e-05, "loss": 2.4188, "step": 3270 }, { "epoch": 0.68, "grad_norm": 0.9153307676315308, "learning_rate": 2.401522877598087e-05, "loss": 2.4196, "step": 3275 }, { "epoch": 0.68, "grad_norm": 0.8939723372459412, "learning_rate": 2.3872939567991827e-05, "loss": 2.421, "step": 3280 }, { "epoch": 0.68, "grad_norm": 0.9284604787826538, "learning_rate": 2.373094083161353e-05, "loss": 2.4798, "step": 3285 }, { "epoch": 0.68, "grad_norm": 0.8872914910316467, "learning_rate": 2.358923414553877e-05, "loss": 2.474, "step": 3290 }, { "epoch": 0.68, "grad_norm": 0.8954537510871887, "learning_rate": 2.3447821085213405e-05, "loss": 2.5011, "step": 3295 }, { "epoch": 0.69, "grad_norm": 0.8992528319358826, "learning_rate": 2.3306703222818878e-05, "loss": 2.4536, "step": 3300 }, { "epoch": 0.69, "grad_norm": 0.922997236251831, "learning_rate": 2.3165882127254705e-05, "loss": 2.4254, "step": 3305 }, { "epoch": 0.69, "grad_norm": 0.9199411869049072, "learning_rate": 2.302535936412108e-05, "loss": 2.4285, "step": 3310 }, { "epoch": 0.69, "grad_norm": 0.9020093679428101, "learning_rate": 2.2885136495701415e-05, "loss": 2.4685, "step": 3315 }, { "epoch": 0.69, "grad_norm": 0.894803524017334, "learning_rate": 2.274521508094501e-05, "loss": 2.3846, "step": 3320 }, { "epoch": 0.69, "grad_norm": 0.9010105133056641, "learning_rate": 2.2605596675449698e-05, "loss": 2.5001, "step": 3325 }, { "epoch": 0.69, "grad_norm": 0.8836818933486938, "learning_rate": 2.246628283144457e-05, "loss": 2.4844, "step": 3330 }, { "epoch": 0.69, "grad_norm": 0.957069456577301, "learning_rate": 2.232727509777269e-05, "loss": 2.4861, "step": 3335 }, { "epoch": 0.69, "grad_norm": 0.9847378730773926, "learning_rate": 2.2188575019873932e-05, "loss": 2.4528, "step": 3340 }, { "epoch": 0.7, "grad_norm": 0.9106289744377136, "learning_rate": 2.2050184139767704e-05, "loss": 2.4389, "step": 3345 }, { "epoch": 0.7, "grad_norm": 0.9224086403846741, "learning_rate": 2.191210399603591e-05, "loss": 2.4644, "step": 3350 }, { "epoch": 0.7, "grad_norm": 0.9473267197608948, "learning_rate": 2.1774336123805772e-05, "loss": 2.3514, "step": 3355 }, { "epoch": 0.7, "grad_norm": 0.9258838295936584, "learning_rate": 2.1636882054732776e-05, "loss": 2.461, "step": 3360 }, { "epoch": 0.7, "grad_norm": 0.9247808456420898, "learning_rate": 2.1499743316983684e-05, "loss": 2.3768, "step": 3365 }, { "epoch": 0.7, "grad_norm": 0.9250257015228271, "learning_rate": 2.1362921435219473e-05, "loss": 2.4103, "step": 3370 }, { "epoch": 0.7, "grad_norm": 0.917296290397644, "learning_rate": 2.1226417930578464e-05, "loss": 2.4581, "step": 3375 }, { "epoch": 0.7, "grad_norm": 0.9212003350257874, "learning_rate": 2.109023432065935e-05, "loss": 2.4713, "step": 3380 }, { "epoch": 0.7, "grad_norm": 0.9421318173408508, "learning_rate": 2.095437211950434e-05, "loss": 2.3969, "step": 3385 }, { "epoch": 0.7, "grad_norm": 0.927157461643219, "learning_rate": 2.0818832837582352e-05, "loss": 2.4147, "step": 3390 }, { "epoch": 0.71, "grad_norm": 0.9017496705055237, "learning_rate": 2.068361798177218e-05, "loss": 2.4576, "step": 3395 }, { "epoch": 0.71, "grad_norm": 0.9253119826316833, "learning_rate": 2.0548729055345778e-05, "loss": 2.4253, "step": 3400 }, { "epoch": 0.71, "grad_norm": 0.9048140645027161, "learning_rate": 2.0414167557951514e-05, "loss": 2.4676, "step": 3405 }, { "epoch": 0.71, "grad_norm": 0.9291346073150635, "learning_rate": 2.0279934985597527e-05, "loss": 2.4913, "step": 3410 }, { "epoch": 0.71, "grad_norm": 0.9075619578361511, "learning_rate": 2.0146032830635054e-05, "loss": 2.4127, "step": 3415 }, { "epoch": 0.71, "grad_norm": 0.9002858400344849, "learning_rate": 2.001246258174192e-05, "loss": 2.4856, "step": 3420 }, { "epoch": 0.71, "grad_norm": 0.8779779672622681, "learning_rate": 1.9879225723905886e-05, "loss": 2.4432, "step": 3425 }, { "epoch": 0.71, "grad_norm": 0.9209660291671753, "learning_rate": 1.9746323738408203e-05, "loss": 2.4792, "step": 3430 }, { "epoch": 0.71, "grad_norm": 0.9068074226379395, "learning_rate": 1.9613758102807117e-05, "loss": 2.467, "step": 3435 }, { "epoch": 0.71, "grad_norm": 0.8899487853050232, "learning_rate": 1.9481530290921474e-05, "loss": 2.3785, "step": 3440 }, { "epoch": 0.72, "grad_norm": 0.9144604802131653, "learning_rate": 1.934964177281428e-05, "loss": 2.434, "step": 3445 }, { "epoch": 0.72, "grad_norm": 0.9262046217918396, "learning_rate": 1.9218094014776434e-05, "loss": 2.4116, "step": 3450 }, { "epoch": 0.72, "grad_norm": 0.9320298433303833, "learning_rate": 1.9086888479310333e-05, "loss": 2.4139, "step": 3455 }, { "epoch": 0.72, "grad_norm": 0.9386606812477112, "learning_rate": 1.895602662511371e-05, "loss": 2.4334, "step": 3460 }, { "epoch": 0.72, "grad_norm": 0.9101786613464355, "learning_rate": 1.8825509907063327e-05, "loss": 2.4237, "step": 3465 }, { "epoch": 0.72, "grad_norm": 0.9514603614807129, "learning_rate": 1.8695339776198872e-05, "loss": 2.4838, "step": 3470 }, { "epoch": 0.72, "grad_norm": 0.9128882884979248, "learning_rate": 1.8565517679706783e-05, "loss": 2.4205, "step": 3475 }, { "epoch": 0.72, "grad_norm": 0.9061757922172546, "learning_rate": 1.8436045060904174e-05, "loss": 2.4656, "step": 3480 }, { "epoch": 0.72, "grad_norm": 0.9159146547317505, "learning_rate": 1.830692335922279e-05, "loss": 2.4476, "step": 3485 }, { "epoch": 0.73, "grad_norm": 0.9187010526657104, "learning_rate": 1.8178154010192994e-05, "loss": 2.4313, "step": 3490 }, { "epoch": 0.73, "grad_norm": 0.9203282594680786, "learning_rate": 1.8049738445427822e-05, "loss": 2.4357, "step": 3495 }, { "epoch": 0.73, "grad_norm": 0.8951599597930908, "learning_rate": 1.7921678092607052e-05, "loss": 2.493, "step": 3500 }, { "epoch": 0.73, "grad_norm": 0.9371747374534607, "learning_rate": 1.7793974375461352e-05, "loss": 2.4493, "step": 3505 }, { "epoch": 0.73, "grad_norm": 0.8991261124610901, "learning_rate": 1.7666628713756417e-05, "loss": 2.4306, "step": 3510 }, { "epoch": 0.73, "grad_norm": 0.9095786809921265, "learning_rate": 1.7539642523277228e-05, "loss": 2.4822, "step": 3515 }, { "epoch": 0.73, "grad_norm": 0.9121299386024475, "learning_rate": 1.7413017215812273e-05, "loss": 2.413, "step": 3520 }, { "epoch": 0.73, "grad_norm": 0.9428772926330566, "learning_rate": 1.728675419913788e-05, "loss": 2.4214, "step": 3525 }, { "epoch": 0.73, "grad_norm": 0.9310411214828491, "learning_rate": 1.716085487700253e-05, "loss": 2.4111, "step": 3530 }, { "epoch": 0.73, "grad_norm": 0.8934375047683716, "learning_rate": 1.703532064911131e-05, "loss": 2.4061, "step": 3535 }, { "epoch": 0.74, "grad_norm": 0.9108890891075134, "learning_rate": 1.6910152911110283e-05, "loss": 2.4731, "step": 3540 }, { "epoch": 0.74, "grad_norm": 0.9230520725250244, "learning_rate": 1.6785353054571024e-05, "loss": 2.3733, "step": 3545 }, { "epoch": 0.74, "grad_norm": 0.9198956489562988, "learning_rate": 1.666092246697512e-05, "loss": 2.5013, "step": 3550 }, { "epoch": 0.74, "grad_norm": 0.9111266136169434, "learning_rate": 1.6536862531698766e-05, "loss": 2.4381, "step": 3555 }, { "epoch": 0.74, "grad_norm": 0.9198375940322876, "learning_rate": 1.6413174627997328e-05, "loss": 2.5059, "step": 3560 }, { "epoch": 0.74, "grad_norm": 0.9285071492195129, "learning_rate": 1.6289860130990147e-05, "loss": 2.4141, "step": 3565 }, { "epoch": 0.74, "grad_norm": 0.9231954216957092, "learning_rate": 1.6166920411645064e-05, "loss": 2.4381, "step": 3570 }, { "epoch": 0.74, "grad_norm": 0.8866026401519775, "learning_rate": 1.6044356836763315e-05, "loss": 2.3406, "step": 3575 }, { "epoch": 0.74, "grad_norm": 0.9078811407089233, "learning_rate": 1.5922170768964285e-05, "loss": 2.4162, "step": 3580 }, { "epoch": 0.75, "grad_norm": 0.9162793755531311, "learning_rate": 1.5800363566670362e-05, "loss": 2.4726, "step": 3585 }, { "epoch": 0.75, "grad_norm": 0.8958753347396851, "learning_rate": 1.5678936584091852e-05, "loss": 2.4159, "step": 3590 }, { "epoch": 0.75, "grad_norm": 0.9262574315071106, "learning_rate": 1.5557891171211892e-05, "loss": 2.4923, "step": 3595 }, { "epoch": 0.75, "grad_norm": 0.9506419897079468, "learning_rate": 1.5437228673771465e-05, "loss": 2.4013, "step": 3600 }, { "epoch": 0.75, "grad_norm": 0.9091145992279053, "learning_rate": 1.5316950433254445e-05, "loss": 2.3992, "step": 3605 }, { "epoch": 0.75, "grad_norm": 0.9276121854782104, "learning_rate": 1.5197057786872649e-05, "loss": 2.4998, "step": 3610 }, { "epoch": 0.75, "grad_norm": 0.9171314239501953, "learning_rate": 1.5077552067551015e-05, "loss": 2.4324, "step": 3615 }, { "epoch": 0.75, "grad_norm": 0.9585117697715759, "learning_rate": 1.4958434603912747e-05, "loss": 2.5077, "step": 3620 }, { "epoch": 0.75, "grad_norm": 0.9010992050170898, "learning_rate": 1.4839706720264546e-05, "loss": 2.5017, "step": 3625 }, { "epoch": 0.75, "grad_norm": 0.9106521606445312, "learning_rate": 1.4721369736581924e-05, "loss": 2.3687, "step": 3630 }, { "epoch": 0.76, "grad_norm": 0.9210944771766663, "learning_rate": 1.4603424968494484e-05, "loss": 2.4641, "step": 3635 }, { "epoch": 0.76, "grad_norm": 0.9032136797904968, "learning_rate": 1.448587372727132e-05, "loss": 2.4258, "step": 3640 }, { "epoch": 0.76, "grad_norm": 0.9011669158935547, "learning_rate": 1.4368717319806419e-05, "loss": 2.4067, "step": 3645 }, { "epoch": 0.76, "grad_norm": 0.9185218214988708, "learning_rate": 1.4251957048604152e-05, "loss": 2.4289, "step": 3650 }, { "epoch": 0.76, "grad_norm": 0.8981814980506897, "learning_rate": 1.413559421176479e-05, "loss": 2.4876, "step": 3655 }, { "epoch": 0.76, "grad_norm": 0.9443618655204773, "learning_rate": 1.4019630102970056e-05, "loss": 2.5391, "step": 3660 }, { "epoch": 0.76, "grad_norm": 0.9296389818191528, "learning_rate": 1.3904066011468753e-05, "loss": 2.4167, "step": 3665 }, { "epoch": 0.76, "grad_norm": 0.9274140000343323, "learning_rate": 1.3788903222062433e-05, "loss": 2.4408, "step": 3670 }, { "epoch": 0.76, "grad_norm": 0.9020739793777466, "learning_rate": 1.3674143015091118e-05, "loss": 2.475, "step": 3675 }, { "epoch": 0.76, "grad_norm": 0.9183312058448792, "learning_rate": 1.355978666641905e-05, "loss": 2.4201, "step": 3680 }, { "epoch": 0.77, "grad_norm": 0.9057744145393372, "learning_rate": 1.3445835447420507e-05, "loss": 2.4336, "step": 3685 }, { "epoch": 0.77, "grad_norm": 0.8912809491157532, "learning_rate": 1.3332290624965688e-05, "loss": 2.4268, "step": 3690 }, { "epoch": 0.77, "grad_norm": 0.9013752937316895, "learning_rate": 1.3219153461406609e-05, "loss": 2.4332, "step": 3695 }, { "epoch": 0.77, "grad_norm": 0.9307184815406799, "learning_rate": 1.3106425214563078e-05, "loss": 2.3963, "step": 3700 }, { "epoch": 0.77, "grad_norm": 0.9287288784980774, "learning_rate": 1.2994107137708716e-05, "loss": 2.5023, "step": 3705 }, { "epoch": 0.77, "grad_norm": 0.9417144656181335, "learning_rate": 1.2882200479556988e-05, "loss": 2.4823, "step": 3710 }, { "epoch": 0.77, "grad_norm": 0.9298997521400452, "learning_rate": 1.2770706484247397e-05, "loss": 2.456, "step": 3715 }, { "epoch": 0.77, "grad_norm": 0.9269828796386719, "learning_rate": 1.2659626391331564e-05, "loss": 2.4669, "step": 3720 }, { "epoch": 0.77, "grad_norm": 0.9162706136703491, "learning_rate": 1.2548961435759493e-05, "loss": 2.4185, "step": 3725 }, { "epoch": 0.78, "grad_norm": 0.9016087055206299, "learning_rate": 1.2438712847865846e-05, "loss": 2.4509, "step": 3730 }, { "epoch": 0.78, "grad_norm": 0.9222176671028137, "learning_rate": 1.2328881853356244e-05, "loss": 2.4106, "step": 3735 }, { "epoch": 0.78, "grad_norm": 0.8969464898109436, "learning_rate": 1.221946967329365e-05, "loss": 2.4994, "step": 3740 }, { "epoch": 0.78, "grad_norm": 0.9177476763725281, "learning_rate": 1.2110477524084796e-05, "loss": 2.3859, "step": 3745 }, { "epoch": 0.78, "grad_norm": 0.9398115277290344, "learning_rate": 1.2001906617466657e-05, "loss": 2.3963, "step": 3750 }, { "epoch": 0.78, "grad_norm": 0.9646539092063904, "learning_rate": 1.1893758160492978e-05, "loss": 2.3666, "step": 3755 }, { "epoch": 0.78, "grad_norm": 0.9097631573677063, "learning_rate": 1.1786033355520859e-05, "loss": 2.3711, "step": 3760 }, { "epoch": 0.78, "grad_norm": 0.8921908140182495, "learning_rate": 1.1678733400197373e-05, "loss": 2.4005, "step": 3765 }, { "epoch": 0.78, "grad_norm": 0.913013756275177, "learning_rate": 1.1571859487446263e-05, "loss": 2.4578, "step": 3770 }, { "epoch": 0.78, "grad_norm": 0.9429560303688049, "learning_rate": 1.1465412805454695e-05, "loss": 2.413, "step": 3775 }, { "epoch": 0.79, "grad_norm": 0.9588865637779236, "learning_rate": 1.1359394537660011e-05, "loss": 2.4039, "step": 3780 }, { "epoch": 0.79, "grad_norm": 0.9314767122268677, "learning_rate": 1.125380586273661e-05, "loss": 2.4643, "step": 3785 }, { "epoch": 0.79, "grad_norm": 0.9301546216011047, "learning_rate": 1.1148647954582808e-05, "loss": 2.3927, "step": 3790 }, { "epoch": 0.79, "grad_norm": 0.9069781303405762, "learning_rate": 1.1043921982307819e-05, "loss": 2.4396, "step": 3795 }, { "epoch": 0.79, "grad_norm": 0.9067623615264893, "learning_rate": 1.0939629110218735e-05, "loss": 2.4054, "step": 3800 }, { "epoch": 0.79, "grad_norm": 0.9323053956031799, "learning_rate": 1.0835770497807596e-05, "loss": 2.4356, "step": 3805 }, { "epoch": 0.79, "grad_norm": 0.9316867589950562, "learning_rate": 1.0732347299738493e-05, "loss": 2.3777, "step": 3810 }, { "epoch": 0.79, "grad_norm": 0.9224940538406372, "learning_rate": 1.0629360665834732e-05, "loss": 2.3806, "step": 3815 }, { "epoch": 0.79, "grad_norm": 0.9173339605331421, "learning_rate": 1.052681174106604e-05, "loss": 2.405, "step": 3820 }, { "epoch": 0.8, "grad_norm": 0.8999264240264893, "learning_rate": 1.0424701665535852e-05, "loss": 2.4004, "step": 3825 }, { "epoch": 0.8, "grad_norm": 0.8964149355888367, "learning_rate": 1.0323031574468638e-05, "loss": 2.4523, "step": 3830 }, { "epoch": 0.8, "grad_norm": 0.9076893925666809, "learning_rate": 1.0221802598197261e-05, "loss": 2.3863, "step": 3835 }, { "epoch": 0.8, "grad_norm": 0.9315918684005737, "learning_rate": 1.0121015862150423e-05, "loss": 2.3959, "step": 3840 }, { "epoch": 0.8, "grad_norm": 0.938804030418396, "learning_rate": 1.0020672486840154e-05, "loss": 2.3878, "step": 3845 }, { "epoch": 0.8, "grad_norm": 0.908531665802002, "learning_rate": 9.920773587849364e-06, "loss": 2.4283, "step": 3850 }, { "epoch": 0.8, "grad_norm": 0.9469609260559082, "learning_rate": 9.821320275819401e-06, "loss": 2.4166, "step": 3855 }, { "epoch": 0.8, "grad_norm": 0.9202249646186829, "learning_rate": 9.72231365643777e-06, "loss": 2.4384, "step": 3860 }, { "epoch": 0.8, "grad_norm": 0.9061878323554993, "learning_rate": 9.623754830425779e-06, "loss": 2.4621, "step": 3865 }, { "epoch": 0.8, "grad_norm": 0.9435011744499207, "learning_rate": 9.52564489352632e-06, "loss": 2.4808, "step": 3870 }, { "epoch": 0.81, "grad_norm": 0.8984233736991882, "learning_rate": 9.427984936491702e-06, "loss": 2.4471, "step": 3875 }, { "epoch": 0.81, "grad_norm": 0.9315876364707947, "learning_rate": 9.330776045071509e-06, "loss": 2.3882, "step": 3880 }, { "epoch": 0.81, "grad_norm": 0.9606491327285767, "learning_rate": 9.23401930000054e-06, "loss": 2.4099, "step": 3885 }, { "epoch": 0.81, "grad_norm": 0.9043254852294922, "learning_rate": 9.137715776986772e-06, "loss": 2.3666, "step": 3890 }, { "epoch": 0.81, "grad_norm": 0.9284046292304993, "learning_rate": 9.041866546699434e-06, "loss": 2.4225, "step": 3895 }, { "epoch": 0.81, "grad_norm": 0.9200391173362732, "learning_rate": 8.946472674757078e-06, "loss": 2.384, "step": 3900 }, { "epoch": 0.81, "grad_norm": 0.9081897139549255, "learning_rate": 8.851535221715735e-06, "loss": 2.4135, "step": 3905 }, { "epoch": 0.81, "grad_norm": 0.8964848518371582, "learning_rate": 8.757055243057132e-06, "loss": 2.3975, "step": 3910 }, { "epoch": 0.81, "grad_norm": 0.9136962890625, "learning_rate": 8.663033789176967e-06, "loss": 2.4181, "step": 3915 }, { "epoch": 0.81, "grad_norm": 0.9067717790603638, "learning_rate": 8.5694719053732e-06, "loss": 2.4189, "step": 3920 }, { "epoch": 0.82, "grad_norm": 0.91167813539505, "learning_rate": 8.476370631834458e-06, "loss": 2.421, "step": 3925 }, { "epoch": 0.82, "grad_norm": 0.8846667408943176, "learning_rate": 8.383731003628452e-06, "loss": 2.4894, "step": 3930 }, { "epoch": 0.82, "grad_norm": 0.9231117963790894, "learning_rate": 8.291554050690508e-06, "loss": 2.5637, "step": 3935 }, { "epoch": 0.82, "grad_norm": 0.8884219527244568, "learning_rate": 8.199840797812058e-06, "loss": 2.411, "step": 3940 }, { "epoch": 0.82, "grad_norm": 0.9300855994224548, "learning_rate": 8.108592264629295e-06, "loss": 2.3647, "step": 3945 }, { "epoch": 0.82, "grad_norm": 0.9004824161529541, "learning_rate": 8.017809465611803e-06, "loss": 2.4171, "step": 3950 }, { "epoch": 0.82, "grad_norm": 0.9084776043891907, "learning_rate": 7.927493410051324e-06, "loss": 2.3586, "step": 3955 }, { "epoch": 0.82, "grad_norm": 0.9107391834259033, "learning_rate": 7.837645102050473e-06, "loss": 2.3917, "step": 3960 }, { "epoch": 0.82, "grad_norm": 0.9120099544525146, "learning_rate": 7.748265540511635e-06, "loss": 2.3377, "step": 3965 }, { "epoch": 0.83, "grad_norm": 0.9412475228309631, "learning_rate": 7.65935571912582e-06, "loss": 2.4913, "step": 3970 }, { "epoch": 0.83, "grad_norm": 0.9313330054283142, "learning_rate": 7.5709166263616405e-06, "loss": 2.435, "step": 3975 }, { "epoch": 0.83, "grad_norm": 0.9151592254638672, "learning_rate": 7.482949245454302e-06, "loss": 2.419, "step": 3980 }, { "epoch": 0.83, "grad_norm": 0.9209585189819336, "learning_rate": 7.3954545543946876e-06, "loss": 2.4214, "step": 3985 }, { "epoch": 0.83, "grad_norm": 0.8722422122955322, "learning_rate": 7.308433525918468e-06, "loss": 2.3465, "step": 3990 }, { "epoch": 0.83, "grad_norm": 0.9042191505432129, "learning_rate": 7.221887127495313e-06, "loss": 2.4304, "step": 3995 }, { "epoch": 0.83, "grad_norm": 0.9069880843162537, "learning_rate": 7.1358163213181114e-06, "loss": 2.3537, "step": 4000 } ], "logging_steps": 5, "max_steps": 4811, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 1.1658574908358656e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }